// strat9_kernel/process/task.rs
1//! Task Management
2//!
3//! Defines the Task structure and related types for the Strat9-OS scheduler.
4
5use crate::memory::AddressSpace;
6use alloc::sync::Arc;
7use core::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
8use x86_64::{PhysAddr, VirtAddr};
9
/// POSIX process ID (allocated from 1 upward by `next_pid`).
pub type Pid = u32;
/// POSIX thread ID (allocated from 1 upward by `next_tid`).
pub type Tid = u32;
14
15/// Performs the next pid operation.
16#[inline]
17fn next_pid() -> Pid {
18    static NEXT_PID: AtomicU32 = AtomicU32::new(1);
19    NEXT_PID.fetch_add(1, Ordering::SeqCst)
20}
21
22/// Performs the next tid operation.
23#[inline]
24fn next_tid() -> Tid {
25    static NEXT_TID: AtomicU32 = AtomicU32::new(1);
26    NEXT_TID.fetch_add(1, Ordering::SeqCst)
27}
28
/// Unique identifier for a task
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TaskId(u64);

impl TaskId {
    /// Generate a new unique task ID.
    ///
    /// Backed by a monotonically increasing atomic counter starting at 0.
    pub fn new() -> Self {
        static NEXT_ID: AtomicU64 = AtomicU64::new(0);
        Self(NEXT_ID.fetch_add(1, Ordering::SeqCst))
    }

    /// Get the raw u64 value
    pub fn as_u64(self) -> u64 {
        let Self(raw) = self;
        raw
    }

    /// Create a TaskId from a raw u64 (for IPC reply routing).
    pub fn from_u64(raw: u64) -> Self {
        Self(raw)
    }
}

impl core::fmt::Display for TaskId {
    /// Renders the ID exactly like the inner integer.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Display::fmt(&self.0, f)
    }
}
57
/// Priority levels for tasks.
///
/// Higher discriminant = more urgent; `default_sched_policy()` maps each
/// level onto a concrete scheduler policy (Idle / CFS nice / real-time RR).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskPriority {
    /// Runs only when nothing else is runnable (maps to `SchedPolicy::Idle`).
    Idle = 0,
    /// Background work (CFS, nice +10).
    Low = 1,
    /// Default priority (CFS, default nice).
    Normal = 2,
    /// Latency-sensitive work (CFS, nice -10).
    High = 3,
    /// Real-time tasks (round-robin real-time policy).
    Realtime = 4,
}
67
/// State of a task in the scheduler.
///
/// Stored in `Task::state` inside a `SyncUnsafeCell`; transitions are driven
/// by the scheduler (Ready <-> Running, block/wake, exit).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskState {
    /// Task is ready to be scheduled
    Ready,
    /// Task is currently running
    Running,
    /// Task is blocked waiting for an event
    Blocked,
    /// Task has exited
    Dead,
}
80
81use core::cell::UnsafeCell;
82
/// A wrapper around UnsafeCell that implements Sync, used for per-task
/// mutable slots (state, context, sched policy, FPU state, ...).
///
/// NOTE(review): the `Sync` impl below is unconditional — there is no
/// `T: Send`/`T: Sync` bound. Soundness therefore rests entirely on the
/// scheduler's discipline: each cell must only be accessed from contexts
/// that are externally serialized (e.g. with interrupts disabled or while
/// the task is not runnable elsewhere). Confirm this at every call site.
pub struct SyncUnsafeCell<T> {
    inner: UnsafeCell<T>,
}

// SAFETY: see the type-level note — access must be externally serialized.
unsafe impl<T> Sync for SyncUnsafeCell<T> {}

impl<T> SyncUnsafeCell<T> {
    /// Creates a new cell wrapping `value`.
    pub const fn new(value: T) -> Self {
        Self {
            inner: UnsafeCell::new(value),
        }
    }

    /// Returns the raw pointer to the contents. No locking or aliasing
    /// checks are performed; dereferencing is the caller's responsibility.
    pub fn get(&self) -> *mut T {
        self.inner.get()
    }
}
103
/// FPU/SSE/AVX extended state, saved and restored on context switch.
///
/// When XSAVE is available, uses `xsave`/`xrstor` with a variable-size area.
/// Falls back to `fxsave`/`fxrstor` (512 bytes) on older CPUs.
#[repr(C, align(64))]
pub struct ExtendedState {
    /// Raw save area. `xsave`/`fxsave` require 64-byte alignment (hence the
    /// `repr(align(64))`); only the first `size` bytes are meaningful.
    pub data: [u8; Self::MAX_XSAVE_SIZE],
    /// Number of bytes of `data` actually used for this task's state.
    pub size: usize,
    /// True when the XSAVE family is used; false means legacy FXSAVE.
    pub uses_xsave: bool,
    /// XCR0 feature mask this save area was sized for.
    pub xcr0_mask: u64,
}
115
116impl core::fmt::Debug for ExtendedState {
117    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
118        f.debug_struct("ExtendedState")
119            .field("size", &self.size)
120            .field("uses_xsave", &self.uses_xsave)
121            .field("xcr0_mask", &self.xcr0_mask)
122            .finish()
123    }
124}
125
126impl ExtendedState {
127    pub const FXSAVE_SIZE: usize = 512;
128    pub const MAX_XSAVE_SIZE: usize = 2688;
129
130    /// Create a new default state using the host's maximum capabilities.
131    pub fn new() -> Self {
132        crate::serial_println!("[trace][fpu] ExtendedState::new enter");
133        let (uses_xsave, size, default_xcr0) = if crate::arch::x86_64::cpuid::host_uses_xsave() {
134            crate::serial_println!("[trace][fpu] ExtendedState::new host_uses_xsave=true");
135            let xcr0 = crate::arch::x86_64::cpuid::host_default_xcr0();
136            crate::serial_println!(
137                "[trace][fpu] ExtendedState::new host_default_xcr0={:#x}",
138                xcr0
139            );
140            let sz =
141                crate::arch::x86_64::cpuid::xsave_size_for_xcr0(xcr0).min(Self::MAX_XSAVE_SIZE);
142            crate::serial_println!("[trace][fpu] ExtendedState::new xsave_size={}", sz);
143            (true, sz, xcr0)
144        } else {
145            crate::serial_println!("[trace][fpu] ExtendedState::new host_uses_xsave=false");
146            (false, Self::FXSAVE_SIZE, 0x3)
147        };
148
149        crate::serial_println!(
150            "[trace][fpu] ExtendedState::new build state uses_xsave={} size={} xcr0={:#x}",
151            uses_xsave,
152            size,
153            default_xcr0
154        );
155        let mut state = Self {
156            data: [0u8; Self::MAX_XSAVE_SIZE],
157            size,
158            uses_xsave,
159            xcr0_mask: default_xcr0,
160        };
161        crate::serial_println!("[trace][fpu] ExtendedState::new state allocated");
162        state.set_defaults();
163        crate::serial_println!("[trace][fpu] ExtendedState::new defaults set");
164        state
165    }
166
167    /// Create a state for a specific XCR0 mask (per-silo feature restriction).
168    pub fn for_xcr0(xcr0: u64) -> Self {
169        let uses_xsave = crate::arch::x86_64::cpuid::host_uses_xsave();
170        let size = if uses_xsave {
171            crate::arch::x86_64::cpuid::xsave_size_for_xcr0(xcr0).min(Self::MAX_XSAVE_SIZE)
172        } else {
173            Self::FXSAVE_SIZE
174        };
175
176        let mut state = Self {
177            data: [0u8; Self::MAX_XSAVE_SIZE],
178            size,
179            uses_xsave,
180            xcr0_mask: xcr0,
181        };
182        state.set_defaults();
183        state
184    }
185
186    fn set_defaults(&mut self) {
187        // x87 FCW = 0x037F
188        self.data[0] = 0x7F;
189        self.data[1] = 0x03;
190        // MXCSR = 0x1F80
191        self.data[24] = 0x80;
192        self.data[25] = 0x1F;
193    }
194
195    /// Copy the state from another `ExtendedState`.
196    pub fn copy_from(&mut self, other: &ExtendedState) {
197        let len = other.size.min(self.size);
198        self.data[..len].copy_from_slice(&other.data[..len]);
199    }
200}
201
/// Represents a single task/thread in the system.
///
/// Per-thread state lives directly in this struct; process-wide state
/// (address space, FD table, capabilities, signal actions, brk) lives in
/// the shared [`Process`](crate::process::process::Process) behind `process`.
pub struct Task {
    /// Unique identifier for this task
    pub id: TaskId,
    /// Process identifier visible to userspace.
    pub pid: Pid,
    /// Thread identifier visible to userspace.
    pub tid: Tid,
    /// Thread-group identifier (equals process leader PID).
    pub tgid: Pid,
    /// Process group id (job-control group).
    pub pgid: AtomicU32,
    /// Session id.
    pub sid: AtomicU32,
    /// real user id.
    pub uid: AtomicU32,
    /// effective user id.
    pub euid: AtomicU32,
    /// real group id.
    pub gid: AtomicU32,
    /// effective group id.
    pub egid: AtomicU32,
    /// Current state of the task
    pub state: SyncUnsafeCell<TaskState>,
    /// Priority level of the task
    pub priority: TaskPriority,
    /// Saved CPU context for this task (just the stack pointer)
    pub context: SyncUnsafeCell<CpuContext>,
    /// Kernel stack for this task
    pub kernel_stack: KernelStack,
    /// User stack for this task (if applicable)
    pub user_stack: Option<UserStack>,
    /// Task name for debugging purposes
    pub name: &'static str,
    /// Owning process: address space, file descriptors, capabilities,
    /// signal actions and brk are shared here by all threads of the process.
    pub process: Arc<crate::process::process::Process>,
    /// Pending signals for this task
    pub pending_signals: super::signal::SignalSet,
    /// Blocked signals mask for this task
    pub blocked_signals: super::signal::SignalSet,
    /// Signal alternate stack for this task. (Signal *actions* are
    /// per-process — see `process.signal_actions`.)
    pub signal_stack: SyncUnsafeCell<Option<super::signal::SigStack>>,
    /// Interval timers (ITIMER_REAL, ITIMER_VIRTUAL, ITIMER_PROF)
    pub itimers: super::timer::ITimers,
    /// Pending wakeup flag: set by `wake_task()` when the task is not yet
    /// in `blocked_tasks` (it is still transitioning to Blocked state).
    /// Checked by `block_current_task()` — if set, the task skips blocking
    /// and continues execution, preventing a lost-wakeup race.
    pub wake_pending: AtomicBool,
    /// Sleep deadline in nanoseconds (monotonic). If non-zero, the task
    /// is sleeping until this time. Checked by the scheduler to auto-wake.
    pub wake_deadline_ns: AtomicU64,
    /// User-space entry point for ring3 trampoline (ELF tasks only, 0 otherwise).
    pub trampoline_entry: AtomicU64,
    /// User-space stack top for ring3 trampoline (ELF tasks only, 0 otherwise).
    pub trampoline_stack_top: AtomicU64,
    /// First argument (RDI) passed to the user process on entry (e.g. bootstrap cap handle).
    pub trampoline_arg0: AtomicU64,
    /// Total CPU ticks consumed by this task
    pub ticks: AtomicU64,
    /// Scheduling policy (Fair, RealTime, Idle)
    pub sched_policy: SyncUnsafeCell<crate::process::sched::SchedPolicy>,
    /// Virtual runtime for CFS
    pub vruntime: AtomicU64,
    /// TID address for futex-based thread join (set_tid_address).
    /// The kernel writes 0 here when the thread exits, then futex_wake.
    pub clear_child_tid: AtomicU64,
    /// User-space FS.base (TLS on x86_64, set via arch_prctl ARCH_SET_FS).
    /// Saved/restored across context switches.
    pub user_fs_base: AtomicU64,
    /// FPU/SSE/AVX extended state saved during context switch.
    pub fpu_state: SyncUnsafeCell<ExtendedState>,
    /// XCR0 mask for this task (inherited from its silo).
    pub xcr0_mask: AtomicU64,
}
285
impl Task {
    /// Map a coarse [`TaskPriority`] onto a concrete scheduler policy.
    pub fn default_sched_policy(priority: TaskPriority) -> crate::process::sched::SchedPolicy {
        use crate::process::sched::{nice::Nice, real_time::RealTimePriority, SchedPolicy};
        match priority {
            TaskPriority::Idle => SchedPolicy::Idle,
            // Real-time tasks get round-robin at a mid-range RT priority.
            TaskPriority::Realtime => SchedPolicy::RealTimeRR {
                prio: RealTimePriority::new(50),
            },
            // High/Low map onto CFS nice values; Normal uses the default nice.
            TaskPriority::High => SchedPolicy::Fair(Nice::new(-10)),
            TaskPriority::Low => SchedPolicy::Fair(Nice::new(10)),
            TaskPriority::Normal => SchedPolicy::Fair(Nice::default()),
        }
    }

    /// Get the current scheduling policy of the task.
    pub fn sched_policy(&self) -> crate::process::sched::SchedPolicy {
        // SAFETY: plain copy-read through the SyncUnsafeCell.
        // NOTE(review): assumes writers (set_sched_policy) are externally
        // serialized with readers — confirm against scheduler locking.
        unsafe { *self.sched_policy.get() }
    }

    /// Set the scheduling policy of the task.
    pub fn set_sched_policy(&self, policy: crate::process::sched::SchedPolicy) {
        // SAFETY: see sched_policy() — relies on external serialization.
        unsafe {
            *self.sched_policy.get() = policy;
        }
    }

    /// Get virtual runtime (CFS accounting).
    pub fn vruntime(&self) -> u64 {
        self.vruntime.load(Ordering::Relaxed)
    }

    /// Set virtual runtime (CFS accounting).
    pub fn set_vruntime(&self, vruntime: u64) {
        self.vruntime.store(vruntime, Ordering::Relaxed);
    }
}
323
/// CPU context saved/restored during context switches.
///
/// Only stores the saved RSP. All callee-saved registers (rbx, rbp, r12-r15)
/// are pushed onto the task's kernel stack by `switch_context()`, so the
/// stack pointer alone is enough to resume the task.
#[repr(C)]
pub struct CpuContext {
    /// Saved stack pointer (points into the task's kernel stack)
    pub saved_rsp: u64,
}
333
334impl CpuContext {
335    /// Create a new CPU context for a task starting at the given entry point.
336    ///
337    /// Sets up a fake stack frame on the kernel stack that looks like
338    /// `switch_context()` just pushed callee-saved registers. When
339    /// `switch_context()` or `restore_first_task()` pops them and does `ret`,
340    /// it will jump to `task_entry_trampoline`, which enables interrupts
341    /// and jumps to the real entry point (stored in r12).
342    ///
343    /// Stack layout (growing downward):
344    /// ```text
345    /// [stack_top]
346    ///   task_entry_trampoline   <- ret target
347    ///   0  (r15)
348    ///   0  (r14)
349    ///   0  (r13)
350    ///   entry_point (r12)      <- trampoline reads this
351    ///   0  (rbp)
352    ///   0  (rbx)
353    ///   <- saved_rsp points here
354    /// ```
355    pub fn new(entry_point: u64, kernel_stack: &KernelStack) -> Self {
356        let stack_top = kernel_stack.virt_base.as_u64() + kernel_stack.size as u64;
357
358        // We need to push 7 values (each 8 bytes) onto the stack
359        let initial_rsp = stack_top - 7 * 8;
360
361        // SAFETY: We own this stack memory and it's properly allocated and zeroed.
362        // The stack region [virt_base, virt_base + size) is valid.
363        unsafe {
364            let stack = initial_rsp as *mut u64;
365            // Push order must match switch_context pops (LIFO, but we write linearly from RSP up):
366            // [RSP+0]  = r15
367            // [RSP+8]  = r14
368            // [RSP+16] = r13
369            // [RSP+24] = r12 (entry point)
370            // [RSP+32] = rbp
371            // [RSP+40] = rbx
372            // [RSP+48] = ret (trampoline)
373            *stack.add(0) = 0; // r15
374            *stack.add(1) = 0; // r14
375            *stack.add(2) = 0; // r13
376            *stack.add(3) = entry_point; // r12 (trampoline target)
377            *stack.add(4) = 0; // rbp
378            *stack.add(5) = 0; // rbx
379            *stack.add(6) = task_entry_trampoline as *const () as u64; // ret address
380        }
381
382        CpuContext {
383            saved_rsp: initial_rsp,
384        }
385    }
386}
387
/// Trampoline for newly created tasks.
///
/// When a new task is first scheduled, `switch_context()` pops the fake
/// callee-saved registers (set up by `CpuContext::new`) and `ret`s here,
/// then tail-jumps into the actual post-switch entry helper.
#[unsafe(naked)]
unsafe extern "C" fn task_entry_trampoline() {
    core::arch::naked_asm!(
        // Let the scheduler finish its switch bookkeeping before task code runs.
        "call {finish_switch}",
        // r12/r13 were seeded by CpuContext::new: entry point and arg0.
        // Move them into the SysV argument registers for the helper below.
        "mov rdi, r12",
        "mov rsi, r13",
        "jmp {post_switch_enter}",
        finish_switch = sym crate::process::scheduler::finish_switch,
        post_switch_enter = sym task_post_switch_enter,
    );
}
404
405fn task_post_switch_enter(entry: u64, arg0: u64) -> ! {
406    crate::arch::x86_64::percpu::mark_tlb_ready_current();
407
408    let is_user_entry = crate::process::scheduler::current_task_clone_try()
409        .map(|task| task.trampoline_entry.load(Ordering::Relaxed) != 0)
410        .unwrap_or(false);
411
412    if !is_user_entry {
413        crate::arch::x86_64::sti();
414    }
415
416    let entry_fn: extern "C" fn(u64) -> ! = unsafe { core::mem::transmute(entry as usize) };
417    entry_fn(arg0)
418}
419
/// Kernel stack for a task.
///
/// Backed by physically contiguous frames from the buddy allocator and
/// accessed through the kernel's phys-to-virt mapping (see `allocate()`).
pub struct KernelStack {
    /// Physical address of the stack's first frame
    pub base: PhysAddr,
    /// Virtual address of the stack (phys_to_virt of `base`)
    pub virt_base: VirtAddr,
    /// Size of the stack in bytes
    pub size: usize,
}
429
430impl KernelStack {
431    /// Allocate a new kernel stack using the buddy allocator
432    pub fn allocate(size: usize) -> Result<Self, &'static str> {
433        // Calculate number of pages needed (round up)
434        let pages = (size + 4095) / 4096;
435        let order = pages.next_power_of_two().trailing_zeros() as u8;
436
437        crate::serial_println!("[trace][task] kstack allocate begin size={}", size);
438        crate::serial_println!(
439            "[trace][task] kstack allocate pages={} order={}",
440            pages,
441            order
442        );
443
444        crate::serial_println!(
445            "[trace][task] kstack allocate calling allocate_frames order={}",
446            order
447        );
448        let frame = crate::sync::with_irqs_disabled(|token| {
449            crate::memory::allocate_frames(token, order)
450        })
451        .map_err(|_| "Failed to allocate kernel stack")?;
452        crate::serial_println!(
453            "[trace][task] kstack allocate frame phys={:#x}",
454            frame.start_address.as_u64()
455        );
456
457        let phys_base = frame.start_address;
458        let virt_base = VirtAddr::new(crate::memory::phys_to_virt(phys_base.as_u64()));
459        crate::serial_println!(
460            "[trace][task] kstack allocate virt_base={:#x}",
461            virt_base.as_u64()
462        );
463
464        // Zero out the stack for safety
465        unsafe {
466            core::ptr::write_bytes(virt_base.as_mut_ptr::<u8>(), 0, size);
467        }
468        crate::serial_println!("[trace][task] kstack allocate memset done");
469
470        Ok(KernelStack {
471            base: phys_base,
472            virt_base,
473            size,
474        })
475    }
476}
477
478impl Drop for KernelStack {
479    /// Performs the drop operation.
480    fn drop(&mut self) {
481        use crate::memory::frame::PhysFrame;
482
483        let pages = (self.size + 4095) / 4096;
484        let order = pages.next_power_of_two().trailing_zeros() as u8;
485        let frame = PhysFrame {
486            start_address: self.base,
487        };
488
489        crate::sync::with_irqs_disabled(|token| {
490            crate::memory::free_frames(token, frame, order);
491        });
492    }
493}
494
/// User stack for a task (when running in userspace).
///
/// Only tracks the mapping's location and size; the pages themselves live
/// in the task's user address space.
pub struct UserStack {
    /// Virtual address of the user stack
    pub virt_base: VirtAddr,
    /// Size of the stack in bytes
    pub size: usize,
}
502
503impl Task {
    /// Default kernel stack size (16 KB)
    pub const DEFAULT_STACK_SIZE: usize = 16384;

    /// Create a new kernel task with a real allocated stack.
    ///
    /// Convenience wrapper around [`Self::new_kernel_task_with_stack`] using
    /// [`Self::DEFAULT_STACK_SIZE`].
    pub fn new_kernel_task(
        entry_point: extern "C" fn() -> !,
        name: &'static str,
        priority: TaskPriority,
    ) -> Result<Arc<Self>, &'static str> {
        Self::new_kernel_task_with_stack(entry_point, name, priority, Self::DEFAULT_STACK_SIZE)
    }
515
    /// Create a new kernel task with a custom kernel stack size.
    ///
    /// Allocates the kernel stack, fabricates the initial CPU context,
    /// assigns fresh task/POSIX IDs and wraps everything in a new process
    /// sharing the kernel address space.
    ///
    /// # Errors
    /// Returns an error if the kernel stack cannot be allocated.
    pub fn new_kernel_task_with_stack(
        entry_point: extern "C" fn() -> !,
        name: &'static str,
        priority: TaskPriority,
        stack_size: usize,
    ) -> Result<Arc<Self>, &'static str> {
        crate::serial_println!(
            "[trace][task] new_kernel_task_with_stack begin name={} stack_size={}",
            name,
            stack_size
        );
        // Allocate a real kernel stack
        let kernel_stack = KernelStack::allocate(stack_size)?;
        crate::serial_println!("[trace][task] new_kernel_task_with_stack kstack done");

        // Create CPU context with the allocated stack
        let context = CpuContext::new(entry_point as *const () as u64, &kernel_stack);
        crate::serial_println!("[trace][task] new_kernel_task_with_stack context done");
        let id = TaskId::new();
        let (pid, tid, tgid) = Self::allocate_process_ids();
        crate::serial_println!(
            "[trace][task] new_kernel_task_with_stack ids done id={} pid={} tid={} tgid={}",
            id.as_u64(),
            pid,
            tid,
            tgid
        );
        let fpu_state = ExtendedState::new();
        let xcr0_mask = fpu_state.xcr0_mask;

        // Kernel tasks share the single kernel address space.
        let process = Arc::new(crate::process::process::Process::new(
            pid,
            crate::memory::kernel_address_space().clone(),
        ));
        crate::serial_println!("[trace][task] new_kernel_task_with_stack process done");

        log::debug!(
            "[task][create] name={} id={} pid={} tid={} kstack={:?} kstack_kib={}",
            name,
            id.as_u64(),
            pid,
            tid,
            kernel_stack.virt_base,
            kernel_stack.size / 1024
        );

        Ok(Arc::new(Task {
            id,
            pid,
            tid,
            tgid,
            // New leader: its own process group and session.
            pgid: AtomicU32::new(pid),
            sid: AtomicU32::new(pid),
            // Root credentials by default.
            uid: AtomicU32::new(0),
            euid: AtomicU32::new(0),
            gid: AtomicU32::new(0),
            egid: AtomicU32::new(0),
            state: SyncUnsafeCell::new(TaskState::Ready),
            priority,
            context: SyncUnsafeCell::new(context),
            kernel_stack,
            user_stack: None,
            name,
            process,
            pending_signals: super::signal::SignalSet::new(),
            blocked_signals: super::signal::SignalSet::new(),
            signal_stack: SyncUnsafeCell::new(None),
            itimers: super::timer::ITimers::new(),
            wake_pending: AtomicBool::new(false),
            wake_deadline_ns: AtomicU64::new(0),
            // Kernel tasks never enter ring 3: trampoline fields stay 0.
            trampoline_entry: AtomicU64::new(0),
            trampoline_stack_top: AtomicU64::new(0),
            trampoline_arg0: AtomicU64::new(0),
            ticks: AtomicU64::new(0),
            sched_policy: SyncUnsafeCell::new(Self::default_sched_policy(priority)),
            vruntime: AtomicU64::new(0),
            clear_child_tid: AtomicU64::new(0),
            user_fs_base: AtomicU64::new(0),
            fpu_state: SyncUnsafeCell::new(fpu_state),
            xcr0_mask: AtomicU64::new(xcr0_mask),
        }))
    }
599
    /// Create a new user task with its own address space (stub for future use).
    ///
    /// The entry point and user stack must already be mapped in the given address space.
    ///
    /// # Errors
    /// Returns an error if the kernel stack cannot be allocated.
    pub fn new_user_task(
        entry_point: u64,
        address_space: Arc<AddressSpace>,
        name: &'static str,
        priority: TaskPriority,
    ) -> Result<Arc<Self>, &'static str> {
        let kernel_stack = KernelStack::allocate(Self::DEFAULT_STACK_SIZE)?;
        let context = CpuContext::new(entry_point, &kernel_stack);
        let id = TaskId::new();
        let (pid, tid, tgid) = Self::allocate_process_ids();
        let fpu_state = ExtendedState::new();
        let xcr0_mask = fpu_state.xcr0_mask;

        log::debug!(
            "[task][create] name={} id={} pid={} tid={} user_as_cr3={:#x}",
            name,
            id.as_u64(),
            pid,
            tid,
            address_space.cr3().as_u64()
        );

        Ok(Arc::new(Task {
            id,
            pid,
            tid,
            tgid,
            // New leader: its own process group and session.
            pgid: AtomicU32::new(pid),
            sid: AtomicU32::new(pid),
            // Root credentials by default.
            uid: AtomicU32::new(0),
            euid: AtomicU32::new(0),
            gid: AtomicU32::new(0),
            egid: AtomicU32::new(0),
            state: SyncUnsafeCell::new(TaskState::Ready),
            priority,
            context: SyncUnsafeCell::new(context),
            kernel_stack,
            // NOTE(review): no UserStack is recorded here — presumably the
            // loader tracks/maps the user stack separately; confirm callers.
            user_stack: None,
            name,
            process: Arc::new(crate::process::process::Process::new(pid, address_space)),
            pending_signals: super::signal::SignalSet::new(),
            blocked_signals: super::signal::SignalSet::new(),
            signal_stack: SyncUnsafeCell::new(None),
            itimers: super::timer::ITimers::new(),
            wake_pending: AtomicBool::new(false),
            wake_deadline_ns: AtomicU64::new(0),
            // NOTE(review): trampoline fields start at 0; the ring-3 entry
            // path expects the ELF loader to fill them in — confirm.
            trampoline_entry: AtomicU64::new(0),
            trampoline_stack_top: AtomicU64::new(0),
            trampoline_arg0: AtomicU64::new(0),
            ticks: AtomicU64::new(0),
            sched_policy: SyncUnsafeCell::new(Self::default_sched_policy(priority)),
            vruntime: AtomicU64::new(0),
            clear_child_tid: AtomicU64::new(0),
            user_fs_base: AtomicU64::new(0),
            fpu_state: SyncUnsafeCell::new(fpu_state),
            xcr0_mask: AtomicU64::new(xcr0_mask),
        }))
    }
661
662    /// Reset all signal handlers to SIG_DFL (default).
663    ///
664    /// Called during execve to reset signal handlers as per POSIX:
665    /// handlers set to catch signals are reset to SIG_DFL, but SIG_IGN
666    /// remains ignored (implementation simplification: we reset all).
667    pub fn reset_signals(&self) {
668        // SAFETY: We have a valid reference to the task.
669        unsafe {
670            let actions = &mut *self.process.signal_actions.get();
671            for action in actions.iter_mut() {
672                *action = super::signal::SigActionData::default();
673            }
674        }
675    }
676
    /// Returns true if this is a kernel task (shares the kernel address space).
    pub fn is_kernel(&self) -> bool {
        // SAFETY: read-only access through the process's SyncUnsafeCell.
        // NOTE(review): assumes no concurrent writer swaps the address space
        // mid-read — confirm against execve/exit paths.
        unsafe { (*self.process.address_space.get()).is_kernel() }
    }
683
684    /// Allocate POSIX identifiers for a new process leader.
685    pub fn allocate_process_ids() -> (Pid, Tid, Pid) {
686        let pid = next_pid();
687        let tid = next_tid();
688        (pid, tid, pid)
689    }
690
    /// Print the memory layout of Task and Process structs for debugging.
    ///
    /// Computes field offsets at runtime using addr_of! so the output is
    /// accurate regardless of Rust's struct reordering decisions.
    /// Call this early in kernel init to validate the crash-site offset analysis.
    pub fn debug_print_layout() {
        use core::mem;
        crate::serial_println!("[layout] === Struct Layout Debug ===");
        crate::serial_println!(
            "[layout] sizeof(Task)          = {}",
            mem::size_of::<Task>()
        );
        crate::serial_println!(
            "[layout] sizeof(ExtendedState) = {}",
            mem::size_of::<ExtendedState>()
        );
        crate::serial_println!(
            "[layout] alignof(ExtendedState)= {}",
            mem::align_of::<ExtendedState>()
        );
        crate::serial_println!(
            "[layout] sizeof(CpuContext)    = {}",
            mem::size_of::<CpuContext>()
        );
        crate::serial_println!(
            "[layout] sizeof(KernelStack)   = {}",
            mem::size_of::<KernelStack>()
        );
        crate::serial_println!(
            "[layout] sizeof(Process)       = {}",
            mem::size_of::<crate::process::process::Process>()
        );
        crate::serial_println!(
            "[layout] sizeof(FileDescriptorTable) = {}",
            mem::size_of::<crate::vfs::fd::FileDescriptorTable>()
        );
        crate::serial_println!(
            "[layout] sizeof(CapabilityTable)     = {}",
            mem::size_of::<crate::capability::CapabilityTable>()
        );
        crate::serial_println!(
            "[layout] sizeof(SigActionData)       = {}",
            mem::size_of::<crate::process::signal::SigActionData>()
        );

        // Use heap-allocated MaybeUninit to avoid stack overflow from the ~3 KiB
        // ExtendedState embedded in Task. We only take *addresses* (addr_of!),
        // never read the uninitialized data itself, so this is sound.
        let task_box: alloc::boxed::Box<core::mem::MaybeUninit<Task>> =
            alloc::boxed::Box::new_uninit();
        // Cast to *const Task — we never read Task data, only compute field addresses.
        let task_ptr = task_box.as_ptr() as *const Task;
        let base = task_ptr as u64;
        // SAFETY: We only take addresses via addr_of!, no uninitialized reads.
        unsafe {
            let off_id = core::ptr::addr_of!((*task_ptr).id) as u64 - base;
            let off_pid = core::ptr::addr_of!((*task_ptr).pid) as u64 - base;
            let off_context = core::ptr::addr_of!((*task_ptr).context) as u64 - base;
            let off_kstack = core::ptr::addr_of!((*task_ptr).kernel_stack) as u64 - base;
            let off_process = core::ptr::addr_of!((*task_ptr).process) as u64 - base;
            let off_fpu = core::ptr::addr_of!((*task_ptr).fpu_state) as u64 - base;
            let off_xcr0 = core::ptr::addr_of!((*task_ptr).xcr0_mask) as u64 - base;
            let off_ticks = core::ptr::addr_of!((*task_ptr).ticks) as u64 - base;
            let off_name = core::ptr::addr_of!((*task_ptr).name) as u64 - base;
            let off_vruntime = core::ptr::addr_of!((*task_ptr).vruntime) as u64 - base;
            crate::serial_println!("[layout] Task field offsets (byte offset from Task data ptr):");
            crate::serial_println!("[layout]   id           @ +{:#x}", off_id);
            crate::serial_println!("[layout]   pid          @ +{:#x}", off_pid);
            crate::serial_println!("[layout]   context      @ +{:#x}", off_context);
            crate::serial_println!("[layout]   kernel_stack @ +{:#x}", off_kstack);
            crate::serial_println!("[layout]   process      @ +{:#x}", off_process);
            crate::serial_println!("[layout]   fpu_state    @ +{:#x}", off_fpu);
            crate::serial_println!("[layout]   xcr0_mask    @ +{:#x}", off_xcr0);
            crate::serial_println!("[layout]   ticks        @ +{:#x}", off_ticks);
            crate::serial_println!("[layout]   name         @ +{:#x}", off_name);
            crate::serial_println!("[layout]   vruntime     @ +{:#x}", off_vruntime);
        }
        // Arc<T> ArcInner overhead: strong(8)+weak(8)+data = data at offset 16.
        // So the crash at [ArcInner<Task>+0xbf8] means Task.process is at offset
        // 0xbf8 - 16 = 0xbe8 inside Task data. Check against off_process above.
        crate::serial_println!(
            "[layout] Expected task.process crash offset from Task data: {:#x}",
            0xbf8u64.saturating_sub(16)
        );

        // Process field offsets — same heap-allocated MaybeUninit technique.
        let proc_box: alloc::boxed::Box<core::mem::MaybeUninit<crate::process::process::Process>> =
            alloc::boxed::Box::new_uninit();
        #[allow(unused_variables)]
        let proc_ptr = proc_box.as_ptr() as *const crate::process::process::Process;
        let proc_base = proc_ptr as u64;
        // SAFETY: addresses only, never a read of uninitialized Process data.
        unsafe {
            let off_pid = core::ptr::addr_of!((*proc_ptr).pid) as u64 - proc_base;
            let off_as = core::ptr::addr_of!((*proc_ptr).address_space) as u64 - proc_base;
            let off_fd = core::ptr::addr_of!((*proc_ptr).fd_table) as u64 - proc_base;
            let off_caps = core::ptr::addr_of!((*proc_ptr).capabilities) as u64 - proc_base;
            let off_sigs = core::ptr::addr_of!((*proc_ptr).signal_actions) as u64 - proc_base;
            let off_brk = core::ptr::addr_of!((*proc_ptr).brk) as u64 - proc_base;
            crate::serial_println!(
                "[layout] Process field offsets (byte offset from Process data ptr):"
            );
            crate::serial_println!("[layout]   pid            @ +{:#x}", off_pid);
            crate::serial_println!("[layout]   address_space  @ +{:#x}", off_as);
            crate::serial_println!("[layout]   fd_table       @ +{:#x}", off_fd);
            crate::serial_println!("[layout]   capabilities   @ +{:#x}", off_caps);
            crate::serial_println!("[layout]   signal_actions @ +{:#x}", off_sigs);
            crate::serial_println!("[layout]   brk            @ +{:#x}", off_brk);
        }
        // The crash reads [ArcInner<Process>+0x830].
        // ArcInner<Process>.data is at ArcInner+16, so Process offset is 0x830-16 = 0x820.
        crate::serial_println!(
            "[layout] Expected process field crash offset from Process data: {:#x}",
            0x830u64.saturating_sub(16)
        );
        crate::serial_println!("[layout] ===========================");
    }
807}
808
809/// Context switch dispatcher. Picks the xsave or fxsave path based on host
810/// capabilities, then performs the full save/swap/restore sequence.
811///
812/// # Safety
813/// Caller must ensure all pointers in `target` are valid and interrupts are disabled.
814pub(super) unsafe fn do_switch_context(target: &super::scheduler::SwitchTarget) {
815    // Temporary safety mode: force legacy FXSAVE/FXRSTOR path.
816    // This avoids XSAVE/XRSTOR state-size mismatches that can corrupt task memory.
817    let _ = target.old_xcr0;
818    let _ = target.new_xcr0;
819    switch_context_fxsave(
820        target.old_rsp_ptr,
821        target.new_rsp_ptr,
822        target.old_fpu_ptr,
823        target.new_fpu_ptr,
824    );
825}
826
827/// First-task restore dispatcher. Like `do_switch_context` but without
828/// saving old state (there is no previous task).
829///
830/// # Safety
831/// Caller must ensure pointers are valid and interrupts are disabled. Never returns.
832pub(super) unsafe fn do_restore_first_task(
833    rsp_ptr: *const u64,
834    fpu_ptr: *const u8,
835    xcr0: u64,
836) -> ! {
837    let _ = xcr0;
838    restore_first_task_fxsave(rsp_ptr, fpu_ptr);
839}
840
841// ── FXSAVE path (legacy, no XSAVE support) ──
842
/// Legacy context switch using FXSAVE/FXRSTOR for the FPU/SSE state.
///
/// System V argument registers: rdi=old_rsp, rsi=new_rsp, rdx=old_fpu, rcx=new_fpu
///
/// NOTE(review): fxsave/fxrstor require a 16-byte-aligned 512-byte save area
/// per the ISA — upholding that is the caller's responsibility; not checked here.
#[unsafe(naked)]
unsafe extern "C" fn switch_context_fxsave(
    _old_rsp_ptr: *mut u64,
    _new_rsp_ptr: *const u64,
    _old_fpu_ptr: *mut u8,
    _new_fpu_ptr: *const u8,
) {
    core::arch::naked_asm!(
        // Save the outgoing task's FPU/SSE state into its save area.
        "fxsave [rdx]",
        // Persist the callee-saved GPRs on the outgoing task's stack
        // (caller-saved registers are handled by the extern "C" call itself).
        "push rbx",
        "push rbp",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        // Publish the outgoing stack pointer, then adopt the incoming one.
        "mov [rdi], rsp",
        "mov rsp, [rsi]",
        // Restore the incoming task's callee-saved GPRs in reverse order.
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        // Restore the incoming task's FPU/SSE state.
        "fxrstor [rcx]",
        // Return address comes from the incoming task's stack, resuming it.
        "ret",
    );
}
871
/// Legacy first-task restore: adopts a prepared stack and FXRSTORs the FPU
/// state, with nothing to save (there is no previous task). Never returns.
///
/// System V argument registers: rdi=rsp_ptr, rsi=fpu_ptr
#[unsafe(naked)]
unsafe extern "C" fn restore_first_task_fxsave(_rsp_ptr: *const u64, _fpu_ptr: *const u8) -> ! {
    core::arch::naked_asm!(
        // Adopt the task's prepared stack.
        "mov rsp, [rdi]",
        // Pop the callee-saved GPRs that the task's stack frame was seeded with.
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        // Load the task's initial FPU/SSE state.
        "fxrstor [rsi]",
        // `ret` consumes the entry address seeded on the task's stack; control
        // never comes back here (fn is `-> !`).
        "ret",
    );
}
887
888// ── XSAVE path (with XCR0 switching per-silo) ──
889
/// XSAVE-based context switch that also swaps XCR0 per silo.
///
/// System V argument registers: rdi=old_rsp, rsi=new_rsp, rdx=old_fpu,
/// rcx=new_fpu, r8=new_xcr0, r9=old_xcr0
///
/// NOTE(review): currently dead code — `do_switch_context` forces the FXSAVE
/// path until the XSAVE state-size handling is validated.
#[unsafe(naked)]
unsafe extern "C" fn switch_context_xsave(
    _old_rsp_ptr: *mut u64,
    _new_rsp_ptr: *const u64,
    _old_fpu_ptr: *mut u8,
    _new_fpu_ptr: *const u8,
    _new_xcr0: u64,
    _old_xcr0: u64,
) {
    core::arch::naked_asm!(
        // Move old_fpu (rdx) and new_xcr0 (r8) out of the way: rdx and r8 are
        // clobbered below while building the EDX:EAX masks.
        "mov r10, rdx",
        "mov r11, r8",
        // Treat a zero XCR0 as "unset" and fall back to x87|SSE (mask = 3)
        // for both the new (r11) and old (r9) masks.
        "test r11, r11",
        "jnz 10f",
        "mov r11, 3",
        "10:",
        "test r9, r9",
        "jnz 11f",
        "mov r9, 3",
        "11:",
        // xsave takes its component mask in EDX:EAX — use the OLD task's XCR0
        // to save the outgoing state.
        "mov eax, r9d",
        "shr r9, 32",
        "mov edx, r9d",
        "xsave [r10]",
        // Persist callee-saved GPRs, swap stack pointers, restore the
        // incoming task's callee-saved GPRs.
        "push rbx",
        "push rbp",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        "mov [rdi], rsp",
        "mov rsp, [rsi]",
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        // xsetbv writes XCR[ecx] from EDX:EAX; ecx must be 0 for XCR0, but
        // rcx still holds new_fpu — preserve it across the clobber.
        "push rcx",
        "mov ecx, 0",
        "mov eax, r11d",
        "mov r8, r11",
        "shr r8, 32",
        "mov edx, r8d",
        "xsetbv",
        "pop rcx",
        // xrstor mask in EDX:EAX (the NEW task's XCR0), state from new_fpu.
        "mov eax, r11d",
        "mov r8, r11",
        "shr r8, 32",
        "mov edx, r8d",
        "xrstor [rcx]",
        // Resume the incoming task via the return address on its stack.
        "ret",
    );
}
945
/// XSAVE-based first-task restore: sets XCR0 for the task's silo, then
/// XRSTORs its initial FPU state. Nothing is saved (no previous task).
/// Never returns.
///
/// System V argument registers: rdi=rsp_ptr, rsi=fpu_ptr, rdx=xcr0
///
/// NOTE(review): currently dead code — `do_restore_first_task` forces the
/// FXSAVE path until the XSAVE state-size handling is validated.
#[unsafe(naked)]
unsafe extern "C" fn restore_first_task_xsave(
    _rsp_ptr: *const u64,
    _fpu_ptr: *const u8,
    _xcr0: u64,
) -> ! {
    core::arch::naked_asm!(
        // Adopt the task's prepared stack and pop its seeded callee-saved GPRs.
        "mov rsp, [rdi]",
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        // Copy xcr0 out of rdx before rdx is clobbered by the EDX:EAX masks;
        // a zero value means "unset" and falls back to x87|SSE (mask = 3).
        "mov r8, rdx",
        "test r8, r8",
        "jnz 10f",
        "mov r8, 3",
        "10:",
        // Keep a second copy (r9) for the xrstor mask; r8 is shifted below.
        "mov r9, r8",
        // rsi (fpu_ptr) is preserved around the xsetbv sequence defensively.
        "push rsi",
        // xsetbv: select XCR0 via ecx=0, value in EDX:EAX.
        "mov ecx, 0",
        "mov eax, r8d",
        "shr r8, 32",
        "mov edx, r8d",
        "xsetbv",
        "pop rsi",
        // xrstor component mask in EDX:EAX, state loaded from fpu_ptr.
        "mov eax, r9d",
        "shr r9, 32",
        "mov edx, r9d",
        "xrstor [rsi]",
        // `ret` consumes the entry address seeded on the task's stack; control
        // never comes back here (fn is `-> !`).
        "ret",
    );
}