strat9_kernel/process/task.rs

//! Task Management
//!
//! Defines the Task structure and related types for the Strat9-OS scheduler.

use crate::memory::AddressSpace;
use alloc::sync::Arc;
use core::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicU8, AtomicUsize, Ordering};
use intrusive_collections::LinkedListLink;
use x86_64::{PhysAddr, VirtAddr};

/// POSIX process ID.
pub type Pid = u32;
/// POSIX thread ID.
pub type Tid = u32;

/// Allocate the next process ID (monotonically increasing, starting at 1).
#[inline]
fn next_pid() -> Pid {
    static NEXT_PID: AtomicU32 = AtomicU32::new(1);
    NEXT_PID.fetch_add(1, Ordering::SeqCst)
}

/// Allocate the next thread ID (monotonically increasing, starting at 1).
#[inline]
fn next_tid() -> Tid {
    static NEXT_TID: AtomicU32 = AtomicU32::new(1);
    NEXT_TID.fetch_add(1, Ordering::SeqCst)
}

/// Unique identifier for a task
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TaskId(u64);

impl TaskId {
    /// Generate a new unique task ID
    pub fn new() -> Self {
        static NEXT_ID: AtomicU64 = AtomicU64::new(0);
        TaskId(NEXT_ID.fetch_add(1, Ordering::SeqCst))
    }

    /// Get the raw u64 value
    pub fn as_u64(self) -> u64 {
        self.0
    }

    /// Create a TaskId from a raw u64 (for IPC reply routing).
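    ///
    /// Illustrative round-trip (a sketch, not tied to any specific IPC path):
    ///
    /// ```ignore
    /// let id = TaskId::new();
    /// assert_eq!(TaskId::from_u64(id.as_u64()), id);
    /// ```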
    pub fn from_u64(raw: u64) -> Self {
        TaskId(raw)
    }
}

impl core::fmt::Display for TaskId {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(f, "{}", self.0)
    }
}

/// Priority levels for tasks
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskPriority {
    Idle = 0,
    Low = 1,
    Normal = 2,
    High = 3,
    Realtime = 4,
}

/// State of a task in the scheduler
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskState {
    /// Task is ready to be scheduled
    Ready = 0,
    /// Task is currently running
    Running = 1,
    /// Task is blocked waiting for an event
    Blocked = 2,
    /// Task has exited
    Dead = 3,
}

/// How this task must be resumed the next time the scheduler selects it.
///
/// - `RetFrame`: legacy kernel-only context switch using `ret`
/// - `IretFrame`: interrupt/syscall-like frame restored with `iretq`
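///
/// Illustrative sketch (assuming a task preempted from an IRQ path):
///
/// ```ignore
/// task.set_resume_kind(ResumeKind::IretFrame); // resume via the saved iretq frame
/// ```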
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResumeKind {
    RetFrame,
    IretFrame,
}

use core::cell::UnsafeCell;

/// A wrapper around `UnsafeCell` that implements `Sync`, for per-task fields
/// whose access is externally synchronized by the scheduler.
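///
/// Illustrative use under an assumed external lock:
///
/// ```ignore
/// static SLOT: SyncUnsafeCell<u64> = SyncUnsafeCell::new(0);
/// // SAFETY: the (hypothetical) caller holds the lock guarding SLOT.
/// unsafe { *SLOT.get() = 42; }
/// ```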
pub struct SyncUnsafeCell<T> {
    inner: UnsafeCell<T>,
}

// SAFETY: all access to the inner value is externally synchronized; see the
// `Sync` impl on `Task` for the locking discipline protecting these fields.
unsafe impl<T> Sync for SyncUnsafeCell<T> {}

impl<T> SyncUnsafeCell<T> {
    /// Creates a new instance.
    pub const fn new(value: T) -> Self {
        Self {
            inner: UnsafeCell::new(value),
        }
    }

    /// Returns a raw pointer to the wrapped value.
    pub fn get(&self) -> *mut T {
        self.inner.get()
    }
}

/// FPU/SSE/AVX extended state, saved and restored on context switch.
///
/// When XSAVE is available, uses `xsave`/`xrstor` with a variable-size area.
/// Falls back to `fxsave`/`fxrstor` (512 bytes) on older CPUs.
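///
/// Sizing sketch (actual values depend on the host CPU's CPUID leaves):
///
/// ```ignore
/// // Hypothetical XCR0 of 0x7 = x87 | SSE | AVX.
/// let st = ExtendedState::for_xcr0(0x7);
/// assert!(st.size <= ExtendedState::MAX_XSAVE_SIZE);
/// ```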
#[repr(C, align(64))]
pub struct ExtendedState {
    pub data: [u8; Self::MAX_XSAVE_SIZE],
    pub size: usize,
    pub uses_xsave: bool,
    pub xcr0_mask: u64,
}

impl core::fmt::Debug for ExtendedState {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("ExtendedState")
            .field("size", &self.size)
            .field("uses_xsave", &self.uses_xsave)
            .field("xcr0_mask", &self.xcr0_mask)
            .finish()
    }
}

impl ExtendedState {
    /// Fixed size of the legacy FXSAVE area.
    pub const FXSAVE_SIZE: usize = 512;
    /// Upper bound on the XSAVE area; 2688 bytes covers AVX-512 state in the
    /// standard format on common CPUs.
    pub const MAX_XSAVE_SIZE: usize = 2688;

    /// Create a new default state using the host's maximum capabilities.
    pub fn new() -> Self {
        crate::serial_println!("[trace][fpu] ExtendedState::new enter");
        let (uses_xsave, size, default_xcr0) = if crate::arch::x86_64::cpuid::host_uses_xsave() {
            crate::serial_println!("[trace][fpu] ExtendedState::new host_uses_xsave=true");
            let xcr0 = crate::arch::x86_64::cpuid::host_default_xcr0();
            crate::serial_println!(
                "[trace][fpu] ExtendedState::new host_default_xcr0={:#x}",
                xcr0
            );
            let sz =
                crate::arch::x86_64::cpuid::xsave_size_for_xcr0(xcr0).min(Self::MAX_XSAVE_SIZE);
            crate::serial_println!("[trace][fpu] ExtendedState::new xsave_size={}", sz);
            (true, sz, xcr0)
        } else {
            crate::serial_println!("[trace][fpu] ExtendedState::new host_uses_xsave=false");
            (false, Self::FXSAVE_SIZE, 0x3)
        };

        crate::serial_println!(
            "[trace][fpu] ExtendedState::new build state uses_xsave={} size={} xcr0={:#x}",
            uses_xsave,
            size,
            default_xcr0
        );
        let mut state = Self {
            data: [0u8; Self::MAX_XSAVE_SIZE],
            size,
            uses_xsave,
            xcr0_mask: default_xcr0,
        };
        crate::serial_println!("[trace][fpu] ExtendedState::new state allocated");
        state.set_defaults();
        crate::serial_println!("[trace][fpu] ExtendedState::new defaults set");
        state
    }

    /// Create a state for a specific XCR0 mask (per-silo feature restriction).
    pub fn for_xcr0(xcr0: u64) -> Self {
        let uses_xsave = crate::arch::x86_64::cpuid::host_uses_xsave();
        let size = if uses_xsave {
            crate::arch::x86_64::cpuid::xsave_size_for_xcr0(xcr0).min(Self::MAX_XSAVE_SIZE)
        } else {
            Self::FXSAVE_SIZE
        };

        let mut state = Self {
            data: [0u8; Self::MAX_XSAVE_SIZE],
            size,
            uses_xsave,
            xcr0_mask: xcr0,
        };
        state.set_defaults();
        state
    }

    fn set_defaults(&mut self) {
        // Offsets follow the legacy FXSAVE layout (shared by XSAVE's legacy region):
        // x87 FCW at bytes 0..2, default 0x037F (all x87 exceptions masked).
        self.data[0] = 0x7F;
        self.data[1] = 0x03;
        // MXCSR at bytes 24..28, default 0x1F80 (all SSE exceptions masked).
        self.data[24] = 0x80;
        self.data[25] = 0x1F;
    }

    /// Copy the state from another `ExtendedState`.
    pub fn copy_from(&mut self, other: &ExtendedState) {
        let len = other.size.min(self.size);
        self.data[..len].copy_from_slice(&other.data[..len]);
    }
}

/// Represents a single task/thread in the system
pub struct Task {
    /// Unique identifier for this task
    pub id: TaskId,
    /// Process identifier visible to userspace.
    pub pid: Pid,
    /// Thread identifier visible to userspace.
    pub tid: Tid,
    /// Thread-group identifier (equals process leader PID).
    pub tgid: Pid,
    /// Process group id (job-control group).
    pub pgid: AtomicU32,
    /// Session id.
    pub sid: AtomicU32,
    /// Real user id.
    pub uid: AtomicU32,
    /// Effective user id.
    pub euid: AtomicU32,
    /// Real group id.
    pub gid: AtomicU32,
    /// Effective group id.
    pub egid: AtomicU32,
    /// Current state of the task. Stored as AtomicU8 for lock-free cross-CPU visibility.
    /// Use `get_state()` / `set_state()` for typed access.
    pub state: AtomicU8,
    /// Priority level of the task
    pub priority: TaskPriority,
    /// Saved CPU context for this task (just the stack pointer)
    pub context: SyncUnsafeCell<CpuContext>,
    /// Resume convention for this task's saved kernel stack frame.
    pub resume_kind: SyncUnsafeCell<ResumeKind>,
    /// Saved interrupt/syscall-compatible frame pointer for `iretq`-based resume.
    pub interrupt_rsp: AtomicU64,
    /// Kernel stack for this task
    pub kernel_stack: KernelStack,
    /// User stack for this task (if applicable)
    pub user_stack: Option<UserStack>,
    /// Task name for debugging purposes
    pub name: &'static str,
    /// Owning process: holds the address space (kernel tasks share the kernel
    /// AS), file descriptor table, capabilities and signal actions.
    pub process: Arc<crate::process::process::Process>,
    /// Pending signals for this task
    pub pending_signals: super::signal::SignalSet,
    /// Blocked signals mask for this task
    pub blocked_signals: super::signal::SignalSet,
    /// Suppress repeated IRQ-return delivery attempts until a normal delivery
    /// path runs again.
    pub irq_signal_delivery_blocked: AtomicBool,
    /// Signal alternate stack for this task (signal actions live in `Process`).
    pub signal_stack: SyncUnsafeCell<Option<super::signal::SigStack>>,
    /// Interval timers (ITIMER_REAL, ITIMER_VIRTUAL, ITIMER_PROF)
    pub itimers: super::timer::ITimers,
    /// Pending wakeup flag: set by `wake_task()` when the task is not yet
    /// in `blocked_tasks` (it is still transitioning to Blocked state).
    /// Checked by `block_current_task()`: if set, the task skips blocking
    /// and continues execution, preventing a lost-wakeup race.
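    ///
    /// Sketch of the handshake described above (illustrative only; the real
    /// scheduler code may use different orderings):
    ///
    /// ```ignore
    /// // waker side (task not yet in blocked_tasks):
    /// task.wake_pending.store(true, Ordering::Release);
    /// // sleeper side, inside block_current_task():
    /// if task.wake_pending.swap(false, Ordering::AcqRel) {
    ///     return; // wakeup already arrived; skip blocking
    /// }
    /// ```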
    pub wake_pending: AtomicBool,
    /// Sleep deadline in nanoseconds (monotonic). If non-zero, the task
    /// is sleeping until this time. Checked by the scheduler to auto-wake.
    pub wake_deadline_ns: AtomicU64,
    /// User-space entry point for ring3 trampoline (ELF tasks only, 0 otherwise).
    pub trampoline_entry: AtomicU64,
    /// User-space stack top for ring3 trampoline (ELF tasks only, 0 otherwise).
    pub trampoline_stack_top: AtomicU64,
    /// First argument (RDI) passed to the user process on entry (e.g. bootstrap cap handle).
    pub trampoline_arg0: AtomicU64,
    /// Total CPU ticks consumed by this task
    pub ticks: AtomicU64,
    /// Scheduling policy (Fair, RealTime, Idle)
    pub sched_policy: SyncUnsafeCell<crate::process::sched::SchedPolicy>,
    /// Home CPU index for this task. Set when the task is first scheduled
    /// or explicitly assigned. Used by `wake_task()` to route to the correct
    /// per-CPU runqueue without acquiring `GLOBAL_SCHED_STATE`.
    pub home_cpu: AtomicUsize,
    /// Virtual runtime for CFS
    pub vruntime: AtomicU64,
    /// Monotonic token identifying the currently valid FAIR runqueue entry.
    pub fair_rq_generation: AtomicU64,
    /// Whether this task is logically present in the FAIR runqueue.
    pub fair_on_rq: AtomicBool,
    /// TID address for futex-based thread join (set_tid_address).
    /// The kernel writes 0 here when the thread exits, then futex_wake.
    pub clear_child_tid: AtomicU64,
    /// User-space FS.base (TLS on x86_64, set via arch_prctl ARCH_SET_FS).
    /// Saved/restored across context switches.
    pub user_fs_base: AtomicU64,
    /// FPU/SSE/AVX extended state saved during context switch.
    pub fpu_state: SyncUnsafeCell<ExtendedState>,
    /// XCR0 mask for this task (inherited from its silo).
    pub xcr0_mask: AtomicU64,
    /// Intrusive linked-list link for the RT run queue.
    ///
    /// Only touched while holding the per-CPU scheduler spinlock.
    pub rt_link: LinkedListLink,
}

// SAFETY: `LinkedListLink` uses `UnsafeCell` internally and is therefore
// `!Sync` by default, but all mutations to `rt_link` are performed under the
// per-CPU scheduler spinlock. Every other non-atomic field in `Task` is
// similarly protected by the appropriate lock or by the task's own atomics.
unsafe impl Sync for Task {}

impl Task {
    /// Leave this much headroom above the synthetic `SyscallFrame`.
    ///
    /// The raw IRQ switch path does `mov rsp, next_rsp` and then `call
    /// finish_interrupt_switch`, so `next_rsp` must be close to the top of the
    /// kernel stack to preserve downward growth room for the call chain.
    const BOOTSTRAP_INTERRUPT_FRAME_TOP_HEADROOM: usize = 0x1000;

    /// Canary placed below the interrupt frame to detect stack underflow
    /// (interrupt handler overflowing downward past the frame).
    const STACK_UNDERFLOW_CANARY_OFFSET: usize = 0x100; // 256 bytes from base

    /// Map a coarse `TaskPriority` onto a concrete scheduling policy.
    pub fn default_sched_policy(priority: TaskPriority) -> crate::process::sched::SchedPolicy {
        use crate::process::sched::{nice::Nice, real_time::RealTimePriority, SchedPolicy};
        match priority {
            TaskPriority::Idle => SchedPolicy::Idle,
            TaskPriority::Realtime => SchedPolicy::RealTimeRR {
                prio: RealTimePriority::new(50),
            },
            TaskPriority::High => SchedPolicy::Fair(Nice::new(-10)),
            TaskPriority::Low => SchedPolicy::Fair(Nice::new(10)),
            TaskPriority::Normal => SchedPolicy::Fair(Nice::default()),
        }
    }

    /// Get the current scheduling policy of the task
    pub fn sched_policy(&self) -> crate::process::sched::SchedPolicy {
        unsafe { *self.sched_policy.get() }
    }

    /// Set the scheduling policy of the task
    pub fn set_sched_policy(&self, policy: crate::process::sched::SchedPolicy) {
        unsafe {
            *self.sched_policy.get() = policy;
        }
    }

    /// Returns the current resume convention for this task.
    pub fn resume_kind(&self) -> ResumeKind {
        unsafe { *self.resume_kind.get() }
    }

    /// Sets the resume convention for this task.
    pub fn set_resume_kind(&self, kind: ResumeKind) {
        unsafe {
            *self.resume_kind.get() = kind;
        }
    }

    /// Returns the saved `iretq`-compatible frame pointer for this task.
    pub fn interrupt_rsp(&self) -> u64 {
        self.interrupt_rsp.load(Ordering::Acquire)
    }

    /// Updates the saved `iretq`-compatible frame pointer for this task.
    pub fn set_interrupt_rsp(&self, rsp: u64) {
        self.interrupt_rsp.store(rsp, Ordering::Release);
    }

    /// Seed a synthetic interrupt frame for tasks that have not yet been
    /// preempted from an IRQ path but must still be resumable via `iretq`.
    pub fn seed_interrupt_frame(&self, frame: crate::syscall::SyscallFrame) {
        let stack_base = self.kernel_stack.virt_base.as_u64();
        let stack_top = stack_base + self.kernel_stack.size as u64;
        let frame_size = core::mem::size_of::<crate::syscall::SyscallFrame>() as u64;
        let raw_frame_addr = stack_top
            .saturating_sub(Self::BOOTSTRAP_INTERRUPT_FRAME_TOP_HEADROOM as u64)
            .saturating_sub(frame_size);
        let frame_addr = raw_frame_addr & !0xF;
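        // Worked example (illustrative): with a 64 KiB stack the frame starts
        // the 0x1000-byte headroom plus the frame size below the top, rounded
        // down to 16-byte alignment.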
        let frame_end = frame_addr + core::mem::size_of::<crate::syscall::SyscallFrame>() as u64;
        assert!(
            frame_addr >= stack_base && frame_end <= stack_top,
            "kernel stack too small for bootstrap interrupt frame"
        );
        unsafe {
            (frame_addr as *mut crate::syscall::SyscallFrame).write(frame);

            // Place underflow canary below the frame (at lower address)
            // This detects if interrupt handler overflows downward past expected range
            let canary_addr = stack_base + Self::STACK_UNDERFLOW_CANARY_OFFSET as u64;
            *(canary_addr as *mut u64) = 0xBAD57ACBAD57AC;
        }
        self.set_interrupt_rsp(frame_addr);
    }

    /// Seed an `iretq`-compatible frame from the legacy `CpuContext` bootstrap
    /// layout used by kernel tasks (`ret` into `task_entry_trampoline`).
    ///
    /// The synthesised frame always sets IF=1 so that IRQ-driven resumes keep
    /// receiving timer interrupts. First-launch tasks still enter through the
    /// legacy `ret` trampoline and must explicitly re-enable interrupts in
    /// `task_post_switch_enter`.
    pub fn seed_kernel_interrupt_frame_from_context(&self) {
        let stack_base = self.kernel_stack.virt_base.as_u64();
        let stack_top = stack_base + self.kernel_stack.size as u64;
        let saved_rsp = unsafe { (*self.context.get()).saved_rsp as *const u64 };
        let saved_rsp_val = saved_rsp as u64;
        debug_assert!(
            saved_rsp_val >= stack_base && saved_rsp_val.saturating_add(7 * 8) <= stack_top,
            "saved_rsp outside kernel stack while seeding interrupt frame"
        );
        let ret_target = unsafe { *saved_rsp.add(6) };
        // Always set IF=1 (bit 9) so IRQ-driven resumes keep interrupts enabled.
        // First-launch tasks still need an explicit sti() in
        // task_post_switch_enter because the legacy bootstrap path reaches the
        // entry point through a plain ret, not an iretq restoring RFLAGS.
        let rflags = 0x202u64; // bit 9 = IF, bit 1 = reserved (always 1)
        let frame = unsafe {
            crate::syscall::SyscallFrame {
                r15: *saved_rsp.add(0),
                r14: *saved_rsp.add(1),
                r13: *saved_rsp.add(2),
                r12: *saved_rsp.add(3),
                rbp: *saved_rsp.add(4),
                rbx: *saved_rsp.add(5),
                r11: 0,
                r10: 0,
                r9: 0,
                r8: 0,
                rsi: 0,
                rdi: 0,
                rdx: 0,
                rcx: 0,
                rax: 0,
                iret_rip: ret_target,
                iret_cs: crate::arch::x86_64::gdt::kernel_code_selector().0 as u64,
                iret_rflags: rflags,
                iret_rsp: self.kernel_stack.virt_base.as_u64() + self.kernel_stack.size as u64,
                iret_ss: crate::arch::x86_64::gdt::kernel_data_selector().0 as u64,
            }
        };
        self.seed_interrupt_frame(frame);
    }

    /// Get virtual runtime
    pub fn vruntime(&self) -> u64 {
        self.vruntime.load(Ordering::Relaxed)
    }

    /// Set virtual runtime
    pub fn set_vruntime(&self, vruntime: u64) {
        self.vruntime.store(vruntime, Ordering::Relaxed);
    }

    /// Prepare a new FAIR runqueue entry and return its generation token.
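    ///
    /// Illustrative use of the generation token (the `heap` name is a sketch,
    /// not the actual scheduler code): stale heap nodes are skipped lazily by
    /// comparing their recorded generation against `fair_generation()`.
    ///
    /// ```ignore
    /// let (generation, was_queued) = task.fair_prepare_enqueue();
    /// if !was_queued {
    ///     heap.push((task.clone(), generation));
    /// }
    /// // later, when popping:
    /// // if node.generation != task.fair_generation() { continue; /* stale */ }
    /// ```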
    pub fn fair_prepare_enqueue(&self) -> (u64, bool) {
        let was_queued = self.fair_on_rq.swap(true, Ordering::AcqRel);
        let generation = self.fair_rq_generation.fetch_add(1, Ordering::Relaxed) + 1;
        (generation, was_queued)
    }

    /// Returns the generation of the currently valid FAIR entry.
    pub fn fair_generation(&self) -> u64 {
        self.fair_rq_generation.load(Ordering::Relaxed)
    }

    /// Returns whether the task is logically queued in FAIR.
    pub fn fair_is_on_rq(&self) -> bool {
        self.fair_on_rq.load(Ordering::Acquire)
    }

    /// Marks the task as dequeued from FAIR.
    pub fn fair_mark_dequeued(&self) -> bool {
        self.fair_on_rq.swap(false, Ordering::AcqRel)
    }

    /// Invalidates the current FAIR entry so stale heap nodes can be skipped lazily.
    pub fn fair_invalidate_rq_entry(&self) -> bool {
        let was_queued = self.fair_on_rq.swap(false, Ordering::AcqRel);
        self.fair_rq_generation.fetch_add(1, Ordering::Relaxed);
        was_queued
    }

    /// Read the current task state atomically.
    #[inline]
    pub fn get_state(&self) -> TaskState {
        let raw = self.state.load(Ordering::Acquire);
        debug_assert!(
            raw <= TaskState::Dead as u8,
            "get_state: invalid TaskState discriminant {:#x}",
            raw
        );
        // SAFETY: `raw` is always one of the four valid `#[repr(u8)]`
        // discriminants (0..=3); the only writer is `set_state` which stores
        // a cast from the same enum.
        unsafe { core::mem::transmute(raw) }
    }

    /// Write the task state atomically. Uses Release ordering so the new state
    /// is visible to any CPU that subsequently does an Acquire load.
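    ///
    /// Illustrative pairing with `get_state` (sketch only):
    ///
    /// ```ignore
    /// task.set_state(TaskState::Blocked);               // Release store
    /// assert_eq!(task.get_state(), TaskState::Blocked); // Acquire load
    /// ```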
    #[inline]
    pub fn set_state(&self, new_state: TaskState) {
        self.state.store(new_state as u8, Ordering::Release);
    }
}

/// CPU context saved/restored during context switches.
///
/// Only stores the saved RSP. All callee-saved registers (rbx, rbp, r12-r15)
/// are pushed onto the task's kernel stack by `switch_context()`.
#[repr(C)]
pub struct CpuContext {
    /// Saved stack pointer (points into the task's kernel stack)
    pub saved_rsp: u64,
}

impl CpuContext {
    /// Create a new CPU context for a task starting at the given entry point.
    ///
    /// Sets up a fake stack frame on the kernel stack that looks like
    /// `switch_context()` just pushed callee-saved registers. When
    /// `switch_context()` or `restore_first_task()` pops them and does `ret`,
    /// it will jump to `task_entry_trampoline`, which enables interrupts
    /// and jumps to the real entry point (stored in r12).
    ///
    /// Stack layout (growing downward; matches the write order below):
    /// ```text
    /// [stack_top]
    ///   0xDEADBEEFCAFEBABE      <- stack canary
    ///   task_entry_trampoline   <- ret target
    ///   0  (rbx)
    ///   0  (rbp)
    ///   entry_point (r12)       <- trampoline reads this
    ///   0  (r13)
    ///   0  (r14)
    ///   0  (r15)
    ///   <- saved_rsp points here
    /// ```
    pub fn new(entry_point: u64, kernel_stack: &KernelStack) -> Self {
        let stack_top = kernel_stack.virt_base.as_u64() + kernel_stack.size as u64;

        // Reserve space for the stack canary before building the fake frame.
        const STACK_CANARY: u64 = 0xDEADBEEFCAFEBABE;
        let canary_addr = stack_top - 8;
        let initial_rsp = canary_addr - 7 * 8;

        // SAFETY: We own this stack memory and it's properly allocated and zeroed.
        // The stack region [virt_base, virt_base + size) is valid.
        unsafe {
            let stack = initial_rsp as *mut u64;
            // Push order must match switch_context pops (LIFO, but we write linearly from RSP up):
            // [RSP+0]  = r15
            // [RSP+8]  = r14
            // [RSP+16] = r13
            // [RSP+24] = r12 (entry point)
            // [RSP+32] = rbp
            // [RSP+40] = rbx
            // [RSP+48] = ret (trampoline)
            *stack.add(0) = 0; // r15
            *stack.add(1) = 0; // r14
            *stack.add(2) = 0; // r13
            *stack.add(3) = entry_point; // r12 (trampoline target)
            *stack.add(4) = 0; // rbp
            *stack.add(5) = 0; // rbx
            *stack.add(6) = task_entry_trampoline as *const () as u64; // ret address
        }

        // Add stack canary at the very top (leave the frame below it so `ret` still points
        // to `task_entry_trampoline`). The canary slot must be reserved before writing the
        // frame to avoid overwriting the trampoline address.
        unsafe {
            let canary_ptr = canary_addr as *mut u64;
            *canary_ptr = STACK_CANARY;
        }

        // Verify canary is still intact
        unsafe {
            let canary_ptr = canary_addr as *const u64;
            let canary = *canary_ptr;
            if canary != STACK_CANARY {
                crate::serial_force_println!(
                    "[PANIC] Stack canary corrupted at setup! entry_point={:#x} canary={:#x}",
                    entry_point,
                    canary
                );
            }
        }

        // Debug: verify entire stack frame
        unsafe {
            let stack = initial_rsp as *const u64;
            crate::serial_println!(
                "[CpuContext] frame verify: r15={:#x} r14={:#x} r13={:#x} r12={:#x} rbp={:#x} rbx={:#x} ret={:#x}",
                *stack.add(0),
                *stack.add(1),
                *stack.add(2),
                *stack.add(3),
                *stack.add(4),
                *stack.add(5),
                *stack.add(6)
            );
            // Verify canary one more time
            let canary_ptr = canary_addr as *const u64;
            let canary = *canary_ptr;
            if canary != STACK_CANARY {
                crate::serial_force_println!(
                    "[CpuContext] CANARY CORRUPTED AFTER FRAME SETUP! canary={:#x}",
                    canary
                );
            }

            // Debug: check if stack memory overlaps with another task
            crate::serial_println!(
                "[CpuContext] stack range: base={:#x} top={:#x} initial_rsp={:#x}",
                kernel_stack.virt_base.as_u64(),
                stack_top,
                initial_rsp
            );
        }

        CpuContext {
            saved_rsp: initial_rsp,
        }
    }
}

/// Trampoline for newly created tasks.
///
/// When a new task is first scheduled, `switch_context()` pops the fake
/// callee-saved registers and `ret`s here, then tail-jumps into the actual
/// post-switch entry helper.
#[unsafe(naked)]
pub unsafe extern "C" fn task_entry_trampoline() -> ! {
    core::arch::naked_asm!(
        "mov al, 'T'",
        "out 0xe9, al",
        "call {finish_switch}",
        "mov al, '1'",
        "out 0xe9, al",
        "mov rdi, r12", // entry_point
        "mov rsi, r13", // arg0
        "and rsp, -16",
        "sub rsp, 8",
        "jmp {post_switch_enter}",
        finish_switch = sym crate::process::scheduler::finish_switch,
        post_switch_enter = sym task_post_switch_enter,
    );
}

fn task_post_switch_enter(entry: u64, arg0: u64) -> ! {
    // E9 breadcrumb: 'P' = reached post_switch_enter (no serial lock needed).
    unsafe {
        core::arch::asm!("out 0xe9, al", in("al") b'P', options(nomem, nostack));
    }

    crate::arch::x86_64::percpu::mark_tlb_ready_current();

    let cpu = crate::arch::x86_64::percpu::current_cpu_index();

    let is_user_entry = crate::process::scheduler::current_task_clone_try()
        .map(|task| task.trampoline_entry.load(Ordering::Relaxed) != 0)
        .unwrap_or(false);

    // Single diagnostic print (IF may be 0 or 1 depending on RFLAGS seed; either
    // way E9 is IRQ-safe and this is the LAST trace call before entry_fn).
    if let Some(task) = crate::process::scheduler::current_task_clone_try() {
        crate::e9_println!(
            "[pse] cpu={} tid={} user={} entry={:#x}",
            cpu,
            task.id.as_u64(),
            is_user_entry,
            entry
        );
        if is_user_entry {
            crate::serial_println!(
                "[trace][task] post_switch_enter cpu={} tid={} entry={:#x}",
                cpu,
                task.id.as_u64(),
                entry
            );
        }
    }

    // First-launch tasks arrive here via the legacy `ret` bootstrap path, which
    // does not restore RFLAGS. Re-enable interrupts now that `finish_switch()`
    // has completed and the task is running on its own stack.
    crate::arch::x86_64::sti();

    // User tasks still transition to Ring 3 via iretq later and will restore
    // their own RFLAGS there.

    let entry_fn: extern "C" fn(u64) -> ! = unsafe { core::mem::transmute(entry as usize) };
    entry_fn(arg0)
}

/// Kernel stack for a task
pub struct KernelStack {
    /// Physical address of the stack
    pub base: PhysAddr,
    /// Virtual address of the stack
    pub virt_base: VirtAddr,
    /// Size of the stack
    pub size: usize,
}

impl KernelStack {
    /// Allocate a new kernel stack using the buddy allocator
    pub fn allocate(size: usize) -> Result<Self, &'static str> {
        // Calculate number of pages needed (round up)
        let pages = (size + 4095) / 4096;
        let order = pages.next_power_of_two().trailing_zeros() as u8;
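        // Worked example: a 64 KiB request → 16 pages → buddy order 4.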

        crate::serial_println!("[trace][task] kstack allocate begin size={}", size);
        crate::serial_println!(
            "[trace][task] kstack allocate pages={} order={}",
            pages,
            order
        );

        crate::serial_println!(
            "[trace][task] kstack allocate calling allocate_frames order={}",
            order
        );
        let frame = crate::sync::with_irqs_disabled(|token| {
            crate::memory::allocate_kernel_stack_frames(token, order)
        })
        .map_err(|_| "Failed to allocate kernel stack")?;
        crate::serial_println!(
            "[trace][task] kstack allocate frame phys={:#x}",
            frame.start_address.as_u64()
        );

        let phys_base = frame.start_address;
        let virt_base = VirtAddr::new(crate::memory::phys_to_virt(phys_base.as_u64()));
        crate::serial_println!(
            "[trace][task] kstack allocate virt_base={:#x}",
            virt_base.as_u64()
        );

        // Zero out the stack for safety
        unsafe {
            core::ptr::write_bytes(virt_base.as_mut_ptr::<u8>(), 0, size);
        }
        crate::serial_println!("[trace][task] kstack allocate memset done");

        // Debug: verify zeroing worked
        unsafe {
            let first_word = *(virt_base.as_ptr::<u64>());
            let mid_offset = size / 2;
            let mid_word = *((virt_base.as_u64() + mid_offset as u64) as *const u64);
            let last_offset = size - 8;
            let last_word = *((virt_base.as_u64() + last_offset as u64) as *const u64);
            if first_word != 0 || mid_word != 0 || last_word != 0 {
                crate::serial_force_println!(
                    "[WARN] kstack zeroing failed! first={:#x} mid={:#x} last={:#x}",
                    first_word,
                    mid_word,
                    last_word
                );
            }
        }

        Ok(KernelStack {
            base: phys_base,
            virt_base,
            size,
        })
    }

    /// Debug: check if this stack overlaps with another range
    pub fn overlaps(&self, other_base: u64, other_size: usize) -> bool {
        let self_end = self.virt_base.as_u64() + self.size as u64;
        let other_end = other_base + other_size as u64;
        !(self_end <= other_base || other_end <= self.virt_base.as_u64())
    }
}

impl Drop for KernelStack {
    /// Return the stack's physical frames to the buddy allocator.
    fn drop(&mut self) {
        use crate::memory::frame::PhysFrame;

        let pages = (self.size + 4095) / 4096;
        let order = pages.next_power_of_two().trailing_zeros() as u8;
        let frame = PhysFrame {
            start_address: self.base,
        };

        crate::sync::with_irqs_disabled(|token| {
            crate::memory::free_kernel_stack_frames(token, frame, order);
        });
    }
}

/// User stack for a task (when running in userspace)
pub struct UserStack {
    /// Virtual address of the user stack
    pub virt_base: VirtAddr,
    /// Size of the stack
    pub size: usize,
}

impl Task {
    /// Default kernel stack size (64 KiB; raised from 16 KiB after stack overflows).
    pub const DEFAULT_STACK_SIZE: usize = 65536;

    /// Create a new kernel task with a real allocated stack
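    ///
    /// Illustrative call (the worker function is hypothetical):
    ///
    /// ```ignore
    /// extern "C" fn worker() -> ! { loop {} }
    /// let task = Task::new_kernel_task(worker, "worker", TaskPriority::Normal)?;
    /// ```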
    pub fn new_kernel_task(
        entry_point: extern "C" fn() -> !,
        name: &'static str,
        priority: TaskPriority,
    ) -> Result<Arc<Self>, &'static str> {
        Self::new_kernel_task_with_stack(entry_point, name, priority, Self::DEFAULT_STACK_SIZE)
    }

    /// Create a new kernel task with a custom kernel stack size.
    pub fn new_kernel_task_with_stack(
        entry_point: extern "C" fn() -> !,
        name: &'static str,
        priority: TaskPriority,
        stack_size: usize,
    ) -> Result<Arc<Self>, &'static str> {
        crate::serial_println!(
            "[trace][task] new_kernel_task_with_stack begin name={} stack_size={}",
            name,
            stack_size
        );
        // Allocate a real kernel stack
        let kernel_stack = KernelStack::allocate(stack_size)?;
        crate::serial_println!("[trace][task] new_kernel_task_with_stack kstack done");

        // Create CPU context with the allocated stack
        let context = CpuContext::new(entry_point as *const () as u64, &kernel_stack);
        crate::serial_println!("[trace][task] new_kernel_task_with_stack context done");
        let id = TaskId::new();
        let (pid, tid, tgid) = Self::allocate_process_ids();
        crate::serial_println!(
            "[trace][task] new_kernel_task_with_stack ids done id={} pid={} tid={} tgid={}",
            id.as_u64(),
            pid,
            tid,
            tgid
        );
        let fpu_state = ExtendedState::new();
        let xcr0_mask = fpu_state.xcr0_mask;

        let process = Arc::new(crate::process::process::Process::new(
            pid,
            crate::memory::kernel_address_space().clone(),
        ));
        crate::serial_println!("[trace][task] new_kernel_task_with_stack process done");

        log::debug!(
            "[task][create] name={} id={} pid={} tid={} kstack={:?} kstack_kib={}",
            name,
            id.as_u64(),
            pid,
            tid,
            kernel_stack.virt_base,
            kernel_stack.size / 1024
        );

        let task = Arc::new(Task {
            id,
            pid,
            tid,
            tgid,
            pgid: AtomicU32::new(pid),
            sid: AtomicU32::new(pid),
            uid: AtomicU32::new(0),
            euid: AtomicU32::new(0),
            gid: AtomicU32::new(0),
            egid: AtomicU32::new(0),
            state: AtomicU8::new(TaskState::Ready as u8),
            priority,
            context: SyncUnsafeCell::new(context),
            resume_kind: SyncUnsafeCell::new(ResumeKind::RetFrame),
            interrupt_rsp: AtomicU64::new(0),
            kernel_stack,
            user_stack: None,
            name,
            process,
            pending_signals: super::signal::SignalSet::new(),
            blocked_signals: super::signal::SignalSet::new(),
            irq_signal_delivery_blocked: AtomicBool::new(false),
            signal_stack: SyncUnsafeCell::new(None),
            itimers: super::timer::ITimers::new(),
            wake_pending: AtomicBool::new(false),
            wake_deadline_ns: AtomicU64::new(0),
            trampoline_entry: AtomicU64::new(0),
            trampoline_stack_top: AtomicU64::new(0),
            trampoline_arg0: AtomicU64::new(0),
            ticks: AtomicU64::new(0),
            sched_policy: SyncUnsafeCell::new(Self::default_sched_policy(priority)),
            home_cpu: AtomicUsize::new(usize::MAX),
            vruntime: AtomicU64::new(0),
            fair_rq_generation: AtomicU64::new(0),
            fair_on_rq: AtomicBool::new(false),
            clear_child_tid: AtomicU64::new(0),
            user_fs_base: AtomicU64::new(0),
            fpu_state: SyncUnsafeCell::new(fpu_state),
            xcr0_mask: AtomicU64::new(xcr0_mask),
            rt_link: LinkedListLink::new(),
        });
        task.seed_kernel_interrupt_frame_from_context();
        Ok(task)
    }

    /// Create a new user task with its own address space (stub for future use).
    ///
    /// The entry point and user stack must already be mapped in the given address space.
    pub fn new_user_task(
        entry_point: u64,
        address_space: Arc<AddressSpace>,
        name: &'static str,
        priority: TaskPriority,
    ) -> Result<Arc<Self>, &'static str> {
        let kernel_stack = KernelStack::allocate(Self::DEFAULT_STACK_SIZE)?;
        let context = CpuContext::new(entry_point, &kernel_stack);
        let id = TaskId::new();
        let (pid, tid, tgid) = Self::allocate_process_ids();
        let fpu_state = ExtendedState::new();
        let xcr0_mask = fpu_state.xcr0_mask;

        log::debug!(
            "[task][create] name={} id={} pid={} tid={} user_as_cr3={:#x}",
            name,
            id.as_u64(),
            pid,
            tid,
            address_space.cr3().as_u64()
        );

        Ok(Arc::new(Task {
            id,
            pid,
            tid,
            tgid,
            pgid: AtomicU32::new(pid),
            sid: AtomicU32::new(pid),
            uid: AtomicU32::new(0),
            euid: AtomicU32::new(0),
            gid: AtomicU32::new(0),
            egid: AtomicU32::new(0),
            state: AtomicU8::new(TaskState::Ready as u8),
            priority,
            context: SyncUnsafeCell::new(context),
            resume_kind: SyncUnsafeCell::new(ResumeKind::RetFrame),
            interrupt_rsp: AtomicU64::new(0),
            kernel_stack,
            user_stack: None,
            name,
            process: Arc::new(crate::process::process::Process::new(pid, address_space)),
            pending_signals: super::signal::SignalSet::new(),
            blocked_signals: super::signal::SignalSet::new(),
            irq_signal_delivery_blocked: AtomicBool::new(false),
            signal_stack: SyncUnsafeCell::new(None),
            itimers: super::timer::ITimers::new(),
            wake_pending: AtomicBool::new(false),
            wake_deadline_ns: AtomicU64::new(0),
            trampoline_entry: AtomicU64::new(0),
            trampoline_stack_top: AtomicU64::new(0),
            trampoline_arg0: AtomicU64::new(0),
            ticks: AtomicU64::new(0),
            sched_policy: SyncUnsafeCell::new(Self::default_sched_policy(priority)),
            home_cpu: AtomicUsize::new(usize::MAX),
            vruntime: AtomicU64::new(0),
            fair_rq_generation: AtomicU64::new(0),
            fair_on_rq: AtomicBool::new(false),
            clear_child_tid: AtomicU64::new(0),
            user_fs_base: AtomicU64::new(0),
            fpu_state: SyncUnsafeCell::new(fpu_state),
            xcr0_mask: AtomicU64::new(xcr0_mask),
            rt_link: LinkedListLink::new(),
        }))
    }

    /// Reset signal handlers during execve.
    ///
    /// POSIX requires handlers installed by userspace to revert to SIG_DFL on
    /// exec, while dispositions already set to SIG_IGN remain ignored.
    pub fn reset_signals(&self) {
        // SAFETY: We have a valid reference to the task.
        unsafe {
            let actions = &mut *self.process.signal_actions.get();
            for action in actions.iter_mut() {
                if !action.is_ignore() {
                    *action = super::signal::SigActionData::default();
                }
            }
        }
    }

    /// Returns true if this is a kernel task (shares the kernel address space).
    pub fn is_kernel(&self) -> bool {
        self.process.address_space_arc().is_kernel()
    }

    /// Allocate POSIX identifiers for a new process leader.
    pub fn allocate_process_ids() -> (Pid, Tid, Pid) {
        let pid = next_pid();
        let tid = next_tid();
        (pid, tid, pid)
    }

    /// Print the memory layout of Task and Process structs for debugging.
    ///
    /// Computes field offsets at runtime using addr_of! so the output is
    /// accurate regardless of Rust's struct reordering decisions.
    /// Call this early in kernel init to validate the crash-site offset analysis.
    pub fn debug_print_layout() {
        use core::mem;
        crate::serial_println!("[layout] === Struct Layout Debug ===");
        crate::serial_println!(
            "[layout] sizeof(Task)          = {}",
            mem::size_of::<Task>()
        );
        crate::serial_println!(
            "[layout] sizeof(ExtendedState) = {}",
            mem::size_of::<ExtendedState>()
        );
        crate::serial_println!(
            "[layout] alignof(ExtendedState)= {}",
            mem::align_of::<ExtendedState>()
        );
        crate::serial_println!(
            "[layout] sizeof(CpuContext)    = {}",
            mem::size_of::<CpuContext>()
        );
        crate::serial_println!(
            "[layout] sizeof(KernelStack)   = {}",
            mem::size_of::<KernelStack>()
        );
        crate::serial_println!(
            "[layout] sizeof(Process)       = {}",
            mem::size_of::<crate::process::process::Process>()
        );
        crate::serial_println!(
            "[layout] sizeof(FileDescriptorTable) = {}",
            mem::size_of::<crate::vfs::fd::FileDescriptorTable>()
        );
        crate::serial_println!(
            "[layout] sizeof(CapabilityTable)     = {}",
            mem::size_of::<crate::capability::CapabilityTable>()
        );
        crate::serial_println!(
            "[layout] sizeof(SigActionData)       = {}",
            mem::size_of::<crate::process::signal::SigActionData>()
        );

        // Use heap-allocated MaybeUninit to avoid stack overflow from the ~3 KiB
        // ExtendedState embedded in Task. We only take *addresses* (addr_of!),
        // never read the uninitialized data itself, so this is sound.
        let task_box: alloc::boxed::Box<core::mem::MaybeUninit<Task>> =
            alloc::boxed::Box::new_uninit();
        // Cast to *const Task: we never read Task data, only compute field addresses.
        let task_ptr = task_box.as_ptr() as *const Task;
        let base = task_ptr as u64;
        // SAFETY: We only take addresses via addr_of!, no uninitialized reads.
        unsafe {
            let off_id = core::ptr::addr_of!((*task_ptr).id) as u64 - base;
            let off_pid = core::ptr::addr_of!((*task_ptr).pid) as u64 - base;
            let off_context = core::ptr::addr_of!((*task_ptr).context) as u64 - base;
            let off_kstack = core::ptr::addr_of!((*task_ptr).kernel_stack) as u64 - base;
            let off_process = core::ptr::addr_of!((*task_ptr).process) as u64 - base;
            let off_fpu = core::ptr::addr_of!((*task_ptr).fpu_state) as u64 - base;
            let off_xcr0 = core::ptr::addr_of!((*task_ptr).xcr0_mask) as u64 - base;
            let off_ticks = core::ptr::addr_of!((*task_ptr).ticks) as u64 - base;
            let off_name = core::ptr::addr_of!((*task_ptr).name) as u64 - base;
            let off_vruntime = core::ptr::addr_of!((*task_ptr).vruntime) as u64 - base;
            crate::serial_println!("[layout] Task field offsets (byte offset from Task data ptr):");
            crate::serial_println!("[layout]   id           @ +{:#x}", off_id);
            crate::serial_println!("[layout]   pid          @ +{:#x}", off_pid);
            crate::serial_println!("[layout]   context      @ +{:#x}", off_context);
            crate::serial_println!("[layout]   kernel_stack @ +{:#x}", off_kstack);
            crate::serial_println!("[layout]   process      @ +{:#x}", off_process);
            crate::serial_println!("[layout]   fpu_state    @ +{:#x}", off_fpu);
            crate::serial_println!("[layout]   xcr0_mask    @ +{:#x}", off_xcr0);
            crate::serial_println!("[layout]   ticks        @ +{:#x}", off_ticks);
            crate::serial_println!("[layout]   name         @ +{:#x}", off_name);
            crate::serial_println!("[layout]   vruntime     @ +{:#x}", off_vruntime);
        }
        // Arc<T> ArcInner overhead: strong(8)+weak(8)+data = data at offset 16.
        // So the crash at [ArcInner<Task>+0xbf8] means Task.process is at offset
        // 0xbf8 - 16 = 0xbe8 inside Task data. Check against off_process above.
        crate::serial_println!(
            "[layout] Expected task.process crash offset from Task data: {:#x}",
            0xbf8u64.saturating_sub(16)
        );

        // Process field offsets
        let proc_box: alloc::boxed::Box<core::mem::MaybeUninit<crate::process::process::Process>> =
            alloc::boxed::Box::new_uninit();
        #[allow(unused_variables)]
        let proc_ptr = proc_box.as_ptr() as *const crate::process::process::Process;
        let proc_base = proc_ptr as u64;
        unsafe {
            let off_pid = core::ptr::addr_of!((*proc_ptr).pid) as u64 - proc_base;
            let off_as = core::ptr::addr_of!((*proc_ptr).address_space) as u64 - proc_base;
            let off_fd = core::ptr::addr_of!((*proc_ptr).fd_table) as u64 - proc_base;
            let off_caps = core::ptr::addr_of!((*proc_ptr).capabilities) as u64 - proc_base;
            let off_sigs = core::ptr::addr_of!((*proc_ptr).signal_actions) as u64 - proc_base;
            let off_brk = core::ptr::addr_of!((*proc_ptr).brk) as u64 - proc_base;
            crate::serial_println!(
                "[layout] Process field offsets (byte offset from Process data ptr):"
            );
            crate::serial_println!("[layout]   pid            @ +{:#x}", off_pid);
            crate::serial_println!("[layout]   address_space  @ +{:#x}", off_as);
            crate::serial_println!("[layout]   fd_table       @ +{:#x}", off_fd);
            crate::serial_println!("[layout]   capabilities   @ +{:#x}", off_caps);
            crate::serial_println!("[layout]   signal_actions @ +{:#x}", off_sigs);
            crate::serial_println!("[layout]   brk            @ +{:#x}", off_brk);
        }
        // The crash reads [ArcInner<Process>+0x830].
        // ArcInner<Process>.data is at ArcInner+16, so Process offset is 0x830-16 = 0x820.
        crate::serial_println!(
            "[layout] Expected process field crash offset from Process data: {:#x}",
            0x830u64.saturating_sub(16)
        );
        crate::serial_println!("[layout] ===========================");
    }
}

/// Context switch dispatcher. Picks the xsave or fxsave path based on host
/// capabilities, then performs the full save/swap/restore sequence.
///
/// # Safety
/// Caller must ensure all pointers in `target` are valid and interrupts are disabled.
pub(super) unsafe fn do_switch_context(target: &super::scheduler::SwitchTarget) {
    // Temporary safety mode: force the legacy FXSAVE/FXRSTOR path.
    // This avoids XSAVE/XRSTOR state-size mismatches that can corrupt task memory.
    //
    // TODO: re-enable XSAVE only after the kernel has a proven-stable end-to-end path
    // for:
    //     (1) xsave area sizing/allocation,
    //     (2) XCR0 transitions per task,
    //     (3) save/restore across scheduler, syscall, and interrupt returns.
    //
    // Until then old_xcr0/new_xcr0 stay intentionally unused in this path.
    let _ = target.old_xcr0;
    let _ = target.new_xcr0;
    switch_context_fxsave(
        target.old_rsp_ptr,
        target.new_rsp_ptr,
        target.old_fpu_ptr,
        target.new_fpu_ptr,
    );
}

/// First-task restore dispatcher. Like `do_switch_context` but without
/// saving old state (there is no previous task).
///
/// # Safety
/// Caller must ensure pointers are valid and interrupts are disabled. Never returns.
pub(super) unsafe fn do_restore_first_task(
    frame_ptr: *const u64, // Points to the stack frame (r15, r14, r13, r12, rbp, rbx, ret)
    fpu_ptr: *const u8,
    xcr0: u64,
) -> ! {
    // Debug: verify frame pointer
    crate::serial_force_println!(
        "[task] do_restore_first_task frame_ptr={:#x} fpu_ptr={:#x}",
        frame_ptr as u64,
        fpu_ptr as u64
    );

    // Verify the stack frame contains expected values
    crate::serial_force_println!(
        "[task] do_restore_first_task stack frame: r15={:#x} r14={:#x} r13={:#x} r12={:#x} rbp={:#x} rbx={:#x} ret={:#x}",
        *frame_ptr.add(0),
        *frame_ptr.add(1),
        *frame_ptr.add(2),
        *frame_ptr.add(3),
        *frame_ptr.add(4),
        *frame_ptr.add(5),
        *frame_ptr.add(6)
    );

    // Verify canary immediately above the fake frame (frame is 7 words long).
    let canary_addr = frame_ptr as u64 + 56;
    let canary = *(canary_addr as *const u64);
    crate::serial_force_println!(
        "[task] do_restore_first_task canary at {:#x} = {:#x} (expected 0xdeadbeefcafebabe)",
        canary_addr,
        canary
    );

    let _ = xcr0;
    restore_first_task_fxsave(frame_ptr, fpu_ptr);
}

// ===== FXSAVE path (legacy, no XSAVE support) =====

/// rdi=old_rsp, rsi=new_rsp, rdx=old_fpu, rcx=new_fpu
#[unsafe(naked)]
unsafe extern "C" fn switch_context_fxsave(
    _old_rsp_ptr: *mut u64,
    _new_rsp_ptr: *const u64,
    _old_fpu_ptr: *mut u8,
    _new_fpu_ptr: *const u8,
) {
    core::arch::naked_asm!(
        "fxsave [rdx]",
        "push rbx",
        "push rbp",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        "mov [rdi], rsp",
        "mov rsp, [rsi]",
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        "fxrstor [rcx]",
        "ret",
    );
}

/// rdi=frame_ptr, rsi=fpu_ptr
#[unsafe(naked)]
unsafe extern "C" fn restore_first_task_fxsave(_rsp_ptr: *const u64, _fpu_ptr: *const u8) -> ! {
    // Naked functions cannot call serial_println!, so no tracing here; the
    // actual debug output happens in `do_restore_first_task` before the call.
    core::arch::naked_asm!(
        // `do_restore_first_task` passes the frame address directly.
        "mov rsp, rdi",
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        "fxrstor [rsi]",
        "ret",
    );
}

// ===== XSAVE path (with XCR0 switching per-silo) =====

/// rdi=old_rsp, rsi=new_rsp, rdx=old_fpu, rcx=new_fpu, r8=new_xcr0, r9=old_xcr0
#[allow(dead_code)] // Unused while `do_switch_context` forces the FXSAVE path.
#[unsafe(naked)]
unsafe extern "C" fn switch_context_xsave(
    _old_rsp_ptr: *mut u64,
    _new_rsp_ptr: *const u64,
    _old_fpu_ptr: *mut u8,
    _new_fpu_ptr: *const u8,
    _new_xcr0: u64,
    _old_xcr0: u64,
) {
    core::arch::naked_asm!(
        "mov r10, rdx",
        "mov r11, r8",
        "test r11, r11",
        "jnz 10f",
        "mov r11, 3",
        "10:",
        "test r9, r9",
        "jnz 11f",
        "mov r9, 3",
        "11:",
        "mov eax, r9d",
        "shr r9, 32",
        "mov edx, r9d",
        "xsave [r10]",
        "push rbx",
        "push rbp",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        "mov [rdi], rsp",
        "mov rsp, [rsi]",
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        "push rcx",
        "mov ecx, 0",
        "mov eax, r11d",
        "mov r8, r11",
        "shr r8, 32",
        "mov edx, r8d",
        "xsetbv",
        "pop rcx",
        "mov eax, r11d",
        "mov r8, r11",
        "shr r8, 32",
        "mov edx, r8d",
        "xrstor [rcx]",
        "ret",
    );
}

/// rdi=frame_ptr, rsi=fpu_ptr, rdx=xcr0
#[allow(dead_code)] // Unused while `do_restore_first_task` forces the FXSAVE path.
#[unsafe(naked)]
unsafe extern "C" fn restore_first_task_xsave(
    _rsp_ptr: *const u64,
    _fpu_ptr: *const u8,
    _xcr0: u64,
) -> ! {
    core::arch::naked_asm!(
        // `do_restore_first_task` passes the frame address directly.
        "mov rsp, rdi",
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        "mov r8, rdx",
        "test r8, r8",
        "jnz 10f",
        "mov r8, 3",
        "10:",
        "mov r9, r8",
        "push rsi",
        "mov ecx, 0",
        "mov eax, r8d",
        "shr r8, 32",
        "mov edx, r8d",
        "xsetbv",
        "pop rsi",
        "mov eax, r9d",
        "shr r9, 32",
        "mov edx, r9d",
        "xrstor [rsi]",
        "ret",
    );
}