// strat9_kernel/arch/x86_64/idt.rs
//! Interrupt Descriptor Table (IDT) for Strat9-OS
//!
//! Handles CPU exceptions and hardware IRQs.
//! Inspired by MaestroOS `idt.rs` and Redox-OS kernel.

6use super::{pic, tss};
7use core::sync::atomic::{AtomicBool, AtomicU32, Ordering};
8use x86_64::{
9    structures::{
10        gdt::SegmentSelector,
11        idt::{InterruptDescriptorTable, InterruptStackFrame, PageFaultErrorCode},
12    },
13    VirtAddr,
14};
15
/// GDT selector for the kernel code segment: 0x08 = index 1, TI=0, RPL=0.
/// Every gate installed in this module uses it so handlers run in Ring 0.
const KERNEL_CODE_SELECTOR: SegmentSelector = SegmentSelector(0x08);
17
/// In-memory image of the IDTR register as written by `sidt`:
/// a 16-bit limit followed by the 64-bit linear base address.
/// `repr(C, packed)` matches the 10-byte hardware layout exactly.
#[repr(C, packed)]
struct Idtr {
    // Size of the IDT in bytes, minus one.
    limit: u16,
    // Linear base address of the IDT.
    base: u64,
}
23
/// Decoded view of one 16-byte gate from the *live* (CPU-loaded) IDT,
/// as reassembled by [`live_gate_info`].
#[derive(Clone, Copy, Debug)]
pub struct LiveIdtGateInfo {
    /// Interrupt vector the gate belongs to.
    pub vector: u8,
    /// Code segment selector stored in the gate.
    pub selector: u16,
    /// Raw gate options word (bits 32..48 of the low qword).
    pub options: u16,
    /// Full 64-bit handler address reassembled from the three offset fields.
    pub offset: u64,
}
31
32/// IRQ interrupt vector numbers (PIC1_OFFSET + IRQ number)
33#[allow(dead_code)]
34pub mod irq {
35    pub const TIMER: u8 = super::pic::PIC1_OFFSET; // IRQ0 = 0x20
36    pub const KEYBOARD: u8 = super::pic::PIC1_OFFSET + 1; // IRQ1 = 0x21
37    pub const CASCADE: u8 = super::pic::PIC1_OFFSET + 2; // IRQ2 = 0x22
38    pub const MOUSE: u8 = super::pic::PIC1_OFFSET + 12; // IRQ12 = 0x2C
39    pub const COM2: u8 = super::pic::PIC1_OFFSET + 3; // IRQ3 = 0x23
40    pub const COM1: u8 = super::pic::PIC1_OFFSET + 4; // IRQ4 = 0x24
41    pub const FLOPPY: u8 = super::pic::PIC1_OFFSET + 6; // IRQ6 = 0x26
42    pub const ATA_PRIMARY: u8 = super::pic::PIC1_OFFSET + 14; // IRQ14 = 0x2E
43    pub const ATA_SECONDARY: u8 = super::pic::PIC1_OFFSET + 15; // IRQ15 = 0x2F
44}
45
/// RAII guard: swap GS to kernel on entry if we came from Ring 3, and restore
/// user GS automatically on drop (covers every exit path including early returns).
///
/// # Why this is needed
/// After `swapgs ; iretq` in the Ring-3 trampoline:
///   - `IA32_GS_BASE`        = 0  (user GS base, inactive)
///   - `IA32_KERNEL_GS_BASE` = kernel per-CPU pointer
/// When an interrupt fires from Ring 3, the CPU does NOT automatically call
/// swapgs.  The first `gs:[0]` access (e.g. in `current_cpu_index`) would
/// dereference virtual address 0 → page fault → double fault → triple fault.
///
/// # Safety
/// Must be constructed **before** any `gs:[…]` access in the handler.
/// `InterruptStackFrame::code_segment` is a plain memory read from the
/// interrupt stack — it does not access GS.
struct SwapGsGuard {
    // True when `new()` executed swapgs; `drop()` mirrors it so the
    // user GS base is back in place before iretq returns to Ring 3.
    from_ring3: bool,
}
64
impl SwapGsGuard {
    /// Construct.  If `from_ring3` is true, executes `swapgs` immediately to
    /// restore the kernel per-CPU GS base.  The flag is remembered so that
    /// `Drop` performs the symmetric swap on every exit path.
    #[inline(always)]
    fn new(from_ring3: bool) -> Self {
        if from_ring3 {
            // SAFETY: We are in Ring 0 with interrupts disabled (standard for
            // interrupt handlers).  GS_BASE currently points at user space (0);
            // swapgs gives us the kernel per-CPU block via KERNEL_GS_BASE.
            unsafe { core::arch::asm!("swapgs", options(nostack, preserves_flags)) };
        }
        Self { from_ring3 }
    }
}
79
impl Drop for SwapGsGuard {
    // Runs on every exit path of the guarded handler, including early
    // returns, so the user/kernel GS swap can never be left unbalanced.
    #[inline(always)]
    fn drop(&mut self) {
        if self.from_ring3 {
            // SAFETY: Symmetric to the constructor.  Restores user GS_BASE so
            // that iretq returns to Ring 3 with the correct GS state.
            unsafe { core::arch::asm!("swapgs", options(nostack, preserves_flags)) };
        }
    }
}
90
/// Determine whether `swapgs` is needed at interrupt/exception entry.
///
/// In the normal case, `code_segment & 3 == 3` (Ring 3) means we need
/// swapgs.  However, between `swapgs` and `iretq` in
/// `elf_ring3_trampoline`, CS is still Ring 0 but `IA32_GS_BASE` is
/// already the user value (0).  If `iretq` itself faults, the exception
/// handler sees CS=Ring 0 but GS=user — the simple ring check misses
/// this.  Reading `IA32_GS_BASE` via `rdmsr` catches both cases.
///
/// Cost: ~20-30 cycles for the `rdmsr` — acceptable in exception paths
/// (not used for high-frequency IRQ handlers where IF=0 prevents
/// firing in the swapgs→iretq window).
#[inline(always)]
fn needs_swapgs(cs: u16) -> bool {
    // Fast path: Ring 3 → always need swapgs.
    if (cs & 3) == 3 {
        return true;
    }
    // Slow path: check if GS_BASE unexpectedly points to user space.
    // SAFETY: rdmsr is privileged but we are in Ring 0 (exception handler).
    let gs_base: u64 = unsafe {
        let lo: u32;
        let hi: u32;
        // rdmsr returns the 64-bit MSR value split across EDX:EAX.
        core::arch::asm!(
            "rdmsr",
            in("ecx") 0xC000_0101u32,  // IA32_GS_BASE
            out("eax") lo,
            out("edx") hi,
            options(nostack, preserves_flags),
        );
        (lo as u64) | ((hi as u64) << 32)
    };
    // Kernel per-CPU data lives in the higher half; anything below the
    // canonical higher-half boundary is treated as a user GS base.
    gs_base < 0xFFFF_8000_0000_0000
}
125
/// Static IDT storage (must be 'static for load())
static mut IDT_STORAGE: InterruptDescriptorTable = InterruptDescriptorTable::new();
/// Hand-rolled spinlock flag serializing mutation and reload of `IDT_STORAGE`.
static IDT_STORAGE_LOCK: AtomicBool = AtomicBool::new(false);
/// Remaining user page-fault trace messages; counted down so boot-time
/// demand-paging storms cannot flood the serial/trace output.
static USER_PF_TRACE_BUDGET: AtomicU32 = AtomicU32::new(64);
/// Remaining reschedule-IPI diagnostic dumps (same bounding idea).
static RESCHED_IPI_TRACE_BUDGET: AtomicU32 = AtomicU32::new(32);
131
/// Reads gate `vector` from the IDT that is *currently loaded in the CPU*
/// (via `sidt`), and decodes it into a [`LiveIdtGateInfo`].
///
/// Returns `None` when the vector lies beyond the table's limit.
/// Useful for diagnosing mismatches between `IDT_STORAGE` and the live table.
pub fn live_gate_info(vector: u8) -> Option<LiveIdtGateInfo> {
    let mut idtr = Idtr { limit: 0, base: 0 };
    // SAFETY: `sidt` is a privileged register read with no side effect.
    unsafe {
        core::arch::asm!(
            "sidt [{}]",
            in(reg) &mut idtr,
            options(nostack, preserves_flags),
        );
    }

    // Each 64-bit gate descriptor is 16 bytes; `limit` is inclusive.
    let entry_offset = vector as usize * 16;
    if entry_offset + 16 > idtr.limit as usize + 1 {
        return None;
    }

    // SAFETY: The IDTR base/limit were read from the CPU and bounds-checked above.
    let (low, high) = unsafe {
        let entry_ptr = (idtr.base + entry_offset as u64) as *const u64;
        (
            core::ptr::read_unaligned(entry_ptr),
            core::ptr::read_unaligned(entry_ptr.add(1)),
        )
    };

    // Handler offset is split across three fields: bits 0..16 and 48..64 of
    // the low qword, plus bits 0..32 of the high qword.
    let offset = (low & 0xFFFF) | (((low >> 48) & 0xFFFF) << 16) | ((high & 0xFFFF_FFFF) << 32);
    let selector = ((low >> 16) & 0xFFFF) as u16;
    let options = ((low >> 32) & 0xFFFF) as u16;

    Some(LiveIdtGateInfo {
        vector,
        selector,
        options,
        offset,
    })
}
168
/// Decision returned by the raw interrupt trampolines.
///
/// Phase 1 of the preemptive scheduler refactor only wires the raw timer/IPI
/// stubs and returns `next_rsp = 0`, which means "restore the current
/// interrupt frame and return with iretq". The future interrupt-aware scheduler
/// path will return a non-zero `next_rsp` and matching FPU buffers.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
pub struct InterruptReturnDecision {
    /// Stack pointer to switch to, or 0 for "no context switch".
    pub next_rsp: u64,
    /// Buffer receiving the outgoing task's FPU state (`fxsave` target).
    pub old_fpu: *mut u8,
    /// Buffer holding the incoming task's FPU state (`fxrstor` source).
    pub new_fpu: *const u8,
}

// `Default` cannot be derived: raw pointers (`*mut u8` / `*const u8`) do not
// implement `Default` in core, so the derive would fail to compile.  Provide
// the "no context switch" value (next_rsp = 0, null FPU buffers) by hand.
impl Default for InterruptReturnDecision {
    fn default() -> Self {
        Self {
            next_rsp: 0,
            old_fpu: core::ptr::null_mut(),
            new_fpu: core::ptr::null(),
        }
    }
}
182
/// Raw Local APIC timer interrupt entry.
///
/// Saves registers in exactly the same order as `SyscallFrame`, calls the Rust
/// inner handler, then restores the interrupted context and returns with
/// `iretq`. This avoids the `extern "x86-interrupt"` ABI mismatch with the
/// legacy `ret`-based scheduler switch path.
#[unsafe(naked)]
unsafe extern "C" fn lapic_timer_entry() -> ! {
    core::arch::naked_asm!(
        "cld",
        // Hardware IRQ stack frame at entry:
        //   [rsp+0]  = RIP
        //   [rsp+8]  = CS
        //   [rsp+16] = RFLAGS
        //   [rsp+24] = RSP
        //   [rsp+32] = SS
        // If interrupted from Ring 3, restore kernel GS before any percpu use.
        // (CS.RPL != 0 at [rsp+8] is the Ring-3 test.)
        "test qword ptr [rsp + 8], 0x3",
        "jz 2f",
        "swapgs",
        "2:",
        // Save GPRs in reverse order so that final RSP points at a SyscallFrame.
        "push rax",
        "push rcx",
        "push rdx",
        "push rdi",
        "push rsi",
        "push r8",
        "push r9",
        "push r10",
        "push r11",
        "push rbx",
        "push rbp",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        // SysV large-struct return uses an implicit out-pointer in RDI.
        // Reserve 32 bytes to keep 16-byte alignment before `call`.
        // RDI = return-area pointer, RSI = &SyscallFrame (just above the area).
        "sub rsp, 32",
        "mov rdi, rsp",
        "lea rsi, [rsp + 32]",
        "call {inner}",
        // Load returned InterruptReturnDecision fields:
        //   rax = next_rsp, rdx = old_fpu, rcx = new_fpu.
        "mov rax, [rsp + 0]",
        "mov rdx, [rsp + 8]",
        "mov rcx, [rsp + 16]",
        "add rsp, 32",
        "test rax, rax",
        "jz 3f",
        // Context switch path: save old task's FPU, switch stack, restore new task's FPU.
        "fxsave [rdx]",
        "mov rsp, rax",
        "fxrstor [rcx]",
        "call {switch_finish}",
        "3:",
        // No context switch (rax == 0): skip FPU save/restore entirely.
        // The interrupted task's FPU state remains unchanged.
        // Restore current SyscallFrame (mirror of the pushes above).
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        "pop r11",
        "pop r10",
        "pop r9",
        "pop r8",
        "pop rsi",
        "pop rdi",
        "pop rdx",
        "pop rcx",
        "pop rax",
        // Restore user GS iff we are returning to Ring 3.
        "test qword ptr [rsp + 8], 0x3",
        "jz 4f",
        "swapgs",
        "4:",
        "iretq",
        inner = sym lapic_timer_inner,
        switch_finish = sym crate::process::scheduler::finish_interrupt_switch,
    );
}
267
/// Raw reschedule IPI entry.
///
/// Uses the same `SyscallFrame` layout as the timer entry. Phase 1 only marks
/// a reschedule hint and returns to the interrupted context.
///
/// The two `out 0xe9` writes emit single debug bytes ('e' at entry, 'E'
/// before the inner call) to the QEMU debugcon port for tracing.
#[unsafe(naked)]
unsafe extern "C" fn resched_ipi_entry() -> ! {
    core::arch::naked_asm!(
        "cld",
        // swapgs iff the IPI arrived from Ring 3 (CS.RPL != 0).
        "test qword ptr [rsp + 8], 0x3",
        "jz 2f",
        "swapgs",
        "2:",
        "push rax",
        // Debug heartbeat: RAX is already saved, so AL may be clobbered.
        "mov al, 0x65",
        "out 0xe9, al",
        "push rcx",
        "push rdx",
        "push rdi",
        "push rsi",
        "push r8",
        "push r9",
        "push r10",
        "push r11",
        "push rbx",
        "push rbp",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        // Out-pointer area for the returned InterruptReturnDecision.
        "sub rsp, 32",
        "mov al, 0x45",
        "out 0xe9, al",
        "mov rdi, rsp",
        "lea rsi, [rsp + 32]",
        "call {inner}",
        "mov rax, [rsp + 0]",
        "mov rdx, [rsp + 8]",
        "mov rcx, [rsp + 16]",
        "add rsp, 32",
        "test rax, rax",
        "jz 3f",
        // Phase 1: the IPI inner handler never requests a switch; trap loudly
        // if it ever returns a non-zero next_rsp.
        "ud2",
        "3:",
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbp",
        "pop rbx",
        "pop r11",
        "pop r10",
        "pop r9",
        "pop r8",
        "pop rsi",
        "pop rdi",
        "pop rdx",
        "pop rcx",
        "pop rax",
        // Restore user GS iff returning to Ring 3.
        "test qword ptr [rsp + 8], 0x3",
        "jz 4f",
        "swapgs",
        "4:",
        "iretq",
        inner = sym resched_ipi_inner,
    );
}
334
/// Rust half of the LAPIC timer interrupt: ticks the scheduler, EOIs the
/// APIC, optionally delivers signals, and (in phase 1) always returns the
/// "no switch" decision so the trampoline resumes the interrupted context.
extern "C" fn lapic_timer_inner(
    frame: &mut crate::syscall::SyscallFrame,
) -> InterruptReturnDecision {
    let cpu = crate::arch::x86_64::percpu::current_cpu_index();
    let ticks = crate::process::scheduler::ticks();
    // Heartbeat: single byte only. e9_println!/format_args in IRQ can cause issues.
    let from_ring3 = (frame.iret_cs & 3) == 3;
    if from_ring3 && (ticks < 5 || ticks % 100 == 0) {
        unsafe { core::arch::asm!("mov al, 0x48; out 0xe9, al", out("al") _) } // 'H'
    }
    crate::process::scheduler::timer_tick();
    super::apic::eoi();

    // Deliver pending POSIX signals before returning to Ring 3 via iretq.
    // On this IRQ-return path we only perform deliveries that are safe from
    // timer interrupt context; fatal/default actions remain deferred to the
    // normal syscall-side delivery path, which may kill/switch the current
    // task and is not yet validated on the raw timer-iret path.
    if from_ring3 {
        crate::process::signal::deliver_pending_signal_on_interrupt_return(frame);
    }

    // Temporarily keep timer IRQs side-effect free with respect to stack
    // switching. The raw `iretq`-based resume path is not yet correct for all
    // contexts:
    // - Ring 3 resumes can return with a shifted IRET frame under SMP load.
    // - Ring 0 resumes are fundamentally different because same-CPL `iretq`
    //   does not restore RSP/SS, so synthetic `SyscallFrame` resumes of kernel
    //   tasks can continue with a bogus stack pointer and RIP=0.
    // Keep only the reschedule hint here and let tasks switch on safer paths
    // (blocking syscalls, explicit yields, future validated return path).
    crate::process::scheduler::request_force_resched_hint(cpu);
    InterruptReturnDecision::default()
}
369
/// Rust half of the reschedule IPI: optionally dumps diagnostic state for the
/// first few IPIs (bounded by `RESCHED_IPI_TRACE_BUDGET`), EOIs the APIC and
/// records a reschedule hint. Always returns the "no switch" decision.
extern "C" fn resched_ipi_inner(
    frame: &mut crate::syscall::SyscallFrame,
) -> InterruptReturnDecision {
    let cpu = crate::arch::x86_64::percpu::current_cpu_index();
    // Atomically consume one unit of trace budget; once it hits zero,
    // checked_sub returns None and tracing stays off forever.
    let should_trace = RESCHED_IPI_TRACE_BUDGET
        .fetch_update(Ordering::AcqRel, Ordering::Relaxed, |budget| {
            budget.checked_sub(1)
        })
        .is_ok();
    if should_trace {
        let rsp0 = crate::arch::x86_64::tss::kernel_stack_for(cpu)
            .map(|addr| addr.as_u64())
            .unwrap_or(0);
        let (slot_rip, slot_cs, slot_rsp, slot_ss) = if rsp0 >= 40 {
            // SAFETY: rsp0 points at the top of the current CPU's kernel stack.
            // During a Ring3->Ring0 interrupt, the CPU-saved IRET frame lives at
            // [rsp0-40 .. rsp0-8]. We only read those 5 u64 words for diagnosis.
            unsafe {
                let frame_base = (rsp0 - 40) as *const u64;
                (
                    *frame_base.add(0),
                    *frame_base.add(1),
                    *frame_base.add(3),
                    *frame_base.add(4),
                )
            }
        } else {
            (0, 0, 0, 0)
        };
        crate::e9_println!(
            "[ipi-rsp0] cpu={} rsp0={:#x} slot_rip={:#x} slot_cs={:#x} slot_rsp={:#x} slot_ss={:#x} frame_rip={:#x} frame_cs={:#x} frame_rsp={:#x} frame_ss={:#x}",
            cpu,
            rsp0,
            slot_rip,
            slot_cs,
            slot_rsp,
            slot_ss,
            frame.iret_rip,
            frame.iret_cs,
            frame.iret_rsp,
            frame.iret_ss,
        );
    }
    super::apic::eoi();
    crate::process::scheduler::request_force_resched_hint(cpu);
    InterruptReturnDecision::default()
}
417
418#[inline]
419fn lock_idt_storage() {
420    while IDT_STORAGE_LOCK
421        .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
422        .is_err()
423    {
424        core::hint::spin_loop();
425    }
426}
427
/// Releases the IDT storage spinlock. The `Release` store pairs with the
/// `Acquire` compare-exchange in `lock_idt_storage`.
#[inline]
fn unlock_idt_storage() {
    IDT_STORAGE_LOCK.store(false, Ordering::Release);
}
432
/// Populates `IDT_STORAGE` with the exception and IRQ handlers and loads it
/// on the calling CPU. The double fault handler runs on a dedicated IST
/// stack so that kernel-stack corruption cannot escalate to a triple fault.
///
/// NOTE(review): only a subset of CPU exceptions is installed here —
/// presumably the remaining vectors keep the crate defaults; confirm.
pub fn init() {
    lock_idt_storage();
    unsafe {
        let idt = &raw mut IDT_STORAGE;

        // CPU exceptions
        (*idt)
            .breakpoint
            .set_handler_fn(breakpoint_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        (*idt)
            .page_fault
            .set_handler_fn(page_fault_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        (*idt)
            .general_protection_fault
            .set_handler_fn(general_protection_fault_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        (*idt)
            .stack_segment_fault
            .set_handler_fn(stack_segment_fault_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        (*idt)
            .non_maskable_interrupt
            .set_handler_fn(non_maskable_interrupt_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        (*idt)
            .invalid_opcode
            .set_handler_fn(invalid_opcode_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        // Double fault gets its own IST stack: if the kernel stack is the
        // problem, handling it on that same stack would triple-fault.
        (*idt)
            .double_fault
            .set_handler_fn(double_fault_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR)
            .set_stack_index(tss::DOUBLE_FAULT_IST_INDEX);

        // Hardware IRQs (PIC remapped to 0x20+)
        let idt_ref = &mut *idt;
        idt_ref[irq::TIMER as u8]
            .set_handler_fn(legacy_timer_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        idt_ref[irq::KEYBOARD as u8]
            .set_handler_fn(keyboard_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        idt_ref[irq::MOUSE as u8]
            .set_handler_fn(mouse_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);

        // Spurious interrupt handler at vector 0xFF (APIC spurious vector)
        idt_ref[0xFF_u8]
            .set_handler_fn(spurious_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);

        // Cross-CPU reschedule IPI (vector 0xE0)

        // Raw entry stub (naked fn), hence set_handler_addr instead of
        // the typed set_handler_fn.
        idt_ref[super::apic::IPI_RESCHED_VECTOR as u8]
            .set_handler_addr(VirtAddr::from_ptr(resched_ipi_entry as *const ()))
            .set_code_selector(KERNEL_CODE_SELECTOR);

        // Cross-CPU TLB shootdown IPI (vector 0xF0)
        idt_ref[super::apic::IPI_TLB_SHOOTDOWN_VECTOR as u8]
            .set_handler_fn(tlb_shootdown_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);

        (*idt).load_unsafe();
    }
    unlock_idt_storage();

    log::debug!("IDT initialized with {} entries", 256);
}
503
/// Loads the already-populated `IDT_STORAGE` on the calling CPU
/// (used by secondary CPUs after `init()` ran once on the BSP).
pub fn load() {
    lock_idt_storage();
    unsafe {
        let idt = &raw const IDT_STORAGE;
        // SAFETY: IDT_STORAGE is 'static, so the table outlives the load.
        (*idt).load_unsafe();
    }
    unlock_idt_storage();
}
512
/// Register the Local APIC timer IRQ vector to use the timer handler.
///
/// Installs the raw naked-fn entry stub (`lapic_timer_entry`) at `vector`
/// and reloads the IDT so the change takes effect on this CPU.
pub fn register_lapic_timer_vector(vector: u8) {
    lock_idt_storage();
    unsafe {
        let idt = &raw mut IDT_STORAGE;
        (&mut *idt)[vector]
            .set_handler_addr(VirtAddr::from_ptr(lapic_timer_entry as *const ()))
            .set_code_selector(KERNEL_CODE_SELECTOR);
        (*idt).load_unsafe();
    }
    unlock_idt_storage();
}
525
/// Register the AHCI storage controller IRQ handler.
///
/// Called after AHCI initialisation once the PCI interrupt line is known.
/// Legacy IRQ lines (< 16) are remapped to `PIC1_OFFSET + irq`; values
/// ≥ 16 are taken as already being vector numbers.
pub fn register_ahci_irq(irq: u8) {
    let vector = if irq < 16 {
        super::pic::PIC1_OFFSET + irq
    } else {
        irq
    };

    lock_idt_storage();
    unsafe {
        let idt = &raw mut IDT_STORAGE;
        (&mut *idt)[vector]
            .set_handler_fn(ahci_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        (*idt).load_unsafe();
    }
    unlock_idt_storage();
    log::info!("AHCI IRQ {} registered on vector {:#x}", irq, vector);
}
547
/// Register the VirtIO block device IRQ handler
///
/// Called after VirtIO block device initialization to route the device's
/// IRQ to the correct handler.
pub fn register_virtio_block_irq(irq: u8) {
    // PCI INTx gives an IRQ line number (typically 0..15), while IDT expects
    // a vector number. Map legacy IRQ lines to the remapped interrupt vectors.
    let vector = if irq < 16 {
        super::pic::PIC1_OFFSET + irq
    } else {
        irq
    };

    lock_idt_storage();
    unsafe {
        let idt = &raw mut IDT_STORAGE;
        (&mut *idt)[vector]
            .set_handler_fn(virtio_block_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        // Reload so the updated gate is active on this CPU.
        (*idt).load_unsafe();
    }
    unlock_idt_storage();
    log::info!("VirtIO-blk IRQ {} registered on vector {:#x}", irq, vector);
}
572
/// Register the xHCI USB controller IRQ handler.
///
/// Called after xHCI initialization once the PCI interrupt line is known.
/// Legacy IRQ lines (< 16) are remapped to `PIC1_OFFSET + irq`; values
/// ≥ 16 are taken as already being vector numbers.
pub fn register_xhci_irq(irq: u8) {
    let vector = if irq < 16 {
        super::pic::PIC1_OFFSET + irq
    } else {
        irq
    };

    lock_idt_storage();
    unsafe {
        let idt = &raw mut IDT_STORAGE;
        (&mut *idt)[vector]
            .set_handler_fn(xhci_handler)
            .set_code_selector(KERNEL_CODE_SELECTOR);
        (*idt).load_unsafe();
    }
    unlock_idt_storage();
    log::info!("xHCI IRQ {} registered on vector {:#x}", irq, vector);
}
594
595// =============================================
596// CPU Exception Handlers
597// =============================================
598
599/// Performs the breakpoint handler operation.
600extern "x86-interrupt" fn breakpoint_handler(stack_frame: InterruptStackFrame) {
601    let _gs = SwapGsGuard::new(needs_swapgs(stack_frame.code_segment.0));
602    log::warn!("EXCEPTION: BREAKPOINT\n{:#?}", stack_frame);
603}
604
/// Invalid opcode (#UD, vector 6) handler.
///
/// Ring 3 faults are routed to the silo fault handler (which deals with the
/// offending user task); any other #UD is fatal and panics the kernel.
extern "x86-interrupt" fn invalid_opcode_handler(stack_frame: InterruptStackFrame) {
    // Read CS before touching GS; see SwapGsGuard docs.
    let cs = stack_frame.code_segment.0;
    let is_user = (cs & 3) == 3;
    let _gs = SwapGsGuard::new(needs_swapgs(cs));
    if is_user {
        if let Some(tid) = crate::process::current_task_id() {
            crate::silo::handle_user_fault(
                tid,
                crate::silo::SiloFaultReason::InvalidOpcode,
                stack_frame.instruction_pointer.as_u64(),
                0,
                stack_frame.instruction_pointer.as_u64(),
            );
            return;
        }
    }
    log::error!("EXCEPTION: INVALID OPCODE\n{:#?}", stack_frame);
    panic!("Invalid opcode");
}
625
/// NMI (vector 2) handler: prints a one-line report and halts this CPU.
/// If a panic is already in progress, halts silently so the panic output
/// is not interleaved with NMI noise.
extern "x86-interrupt" fn non_maskable_interrupt_handler(stack_frame: InterruptStackFrame) {
    // NMI can fire at any point — including the swapgs→iretq window.
    // Use rdmsr to safely restore kernel GS if needed.
    let _gs = SwapGsGuard::new(needs_swapgs(stack_frame.code_segment.0));
    if crate::boot::panic::panic_in_progress() {
        crate::arch::x86_64::cli();
        loop {
            crate::arch::x86_64::hlt();
        }
    }
    crate::serial_force_println!(
        "[NMI] rip={:#x} cs={:#x}",
        stack_frame.instruction_pointer.as_u64(),
        stack_frame.code_segment.0
    );
    // Halt forever: NMIs here indicate hardware/watchdog trouble.
    crate::arch::x86_64::cli();
    loop {
        crate::arch::x86_64::hlt();
    }
}
646
/// Page fault (#PF, vector 14) handler.
///
/// Resolution order for user faults:
///   1. COW resolution (write to a present, write-protected page),
///   2. demand paging via `AddressSpace::handle_fault`,
///   3. silo user-fault handling (kills/handles the offending task).
/// Kernel faults (and the swapgs→iretq window case) fall through to a full
/// diagnostic dump. Serial tracing is budget-limited to avoid flooding.
extern "x86-interrupt" fn page_fault_handler(
    stack_frame: InterruptStackFrame,
    error_code: PageFaultErrorCode,
) {
    use x86_64::registers::control::Cr2;
    let cs = stack_frame.code_segment.0;
    let is_user = (cs & 3) == 3;
    // SAFETY: must be before any gs:[...] access – GS may point to user memory
    // if the fault fired from Ring 3 (after swapgs in elf_ring3_trampoline),
    // OR during the swapgs→iretq window (CS=Ring0 but GS=user).
    // needs_swapgs() uses rdmsr to catch both cases.
    let swapgs_needed = needs_swapgs(cs);
    let _gs = SwapGsGuard::new(swapgs_needed);

    // Detect the swapgs→iretq window: CS=Ring0 but GS was user (0).
    if swapgs_needed && !is_user {
        let fault_addr = x86_64::registers::control::Cr2::read()
            .as_ref()
            .map(|v| v.as_u64())
            .unwrap_or(0);
        crate::serial_force_println!(
            "\x1b[31;1m[pagefault]\x1b[0m SWAPGS-WINDOW: CS={:#x} (Ring0) but GS was user! rip={:#x} addr={:#x} err={:#x}",
            cs,
            stack_frame.instruction_pointer.as_u64(),
            fault_addr,
            error_code.bits()
        );
        panic!("#PF in swapgs→iretq window");
    }

    // Get the faulting address
    let fault_addr = Cr2::read();
    let fault_vaddr = fault_addr.as_ref().map(|v| v.as_u64()).unwrap_or(0);
    let rip = stack_frame.instruction_pointer.as_u64();
    let user_rsp = stack_frame.stack_pointer.as_u64();

    // Snapshot task identity (id/pid/tid/cr3) for trace records.
    let mut trace_ctx = crate::trace::TraceTaskCtx::empty();
    if is_user {
        if let Some(task) = crate::process::current_task_clone() {
            let as_ref = task.process.address_space_arc();
            trace_ctx = crate::trace::TraceTaskCtx {
                task_id: task.id.as_u64(),
                pid: task.pid,
                tid: task.tid,
                cr3: as_ref.cr3().as_u64(),
            };
        }
    }

    // User faults draw from a finite trace budget; kernel faults always trace.
    let do_pf_trace = if is_user {
        USER_PF_TRACE_BUDGET
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| {
                if v > 0 {
                    Some(v - 1)
                } else {
                    None
                }
            })
            .is_ok()
    } else {
        true
    };
    if do_pf_trace {
        crate::trace_mem!(
            crate::trace::category::MEM_PF,
            crate::trace::TraceKind::MemPageFault,
            error_code.bits() as u64,
            trace_ctx,
            rip,
            fault_vaddr,
            user_rsp,
            0
        );
    }

    // Try COW only for write-protection faults on already-present pages.
    // For not-present faults, demand paging should run first.
    if error_code.contains(PageFaultErrorCode::PROTECTION_VIOLATION)
        && error_code.contains(PageFaultErrorCode::CAUSED_BY_WRITE)
        && is_user
    {
        if let Some(task) = crate::process::current_task_clone() {
            let address_space = task.process.address_space_arc();
            if let Ok(vaddr) = fault_addr {
                match crate::syscall::fork::handle_cow_fault(vaddr.as_u64(), &address_space) {
                    Ok(()) => {
                        // COW resolved — trace success and resume the task.
                        crate::trace_mem!(
                            crate::trace::category::MEM_COW,
                            crate::trace::TraceKind::MemCow,
                            1,
                            trace_ctx,
                            rip,
                            vaddr.as_u64(),
                            0,
                            0
                        );
                        return;
                    }
                    Err(reason) => {
                        // COW failed — trace, log, then fall through to
                        // demand paging / silo handling below.
                        crate::trace_mem!(
                            crate::trace::category::MEM_COW,
                            crate::trace::TraceKind::MemCow,
                            0,
                            trace_ctx,
                            rip,
                            vaddr.as_u64(),
                            0,
                            0
                        );
                        crate::serial_println!(
                            "\x1b[31m[pagefault] COW resolve failed\x1b[0m: task={} \x1b[36mpid={}\x1b[0m tid={} \x1b[35maddr={:#x}\x1b[0m \x1b[35mrip={:#x}\x1b[0m err={}",
                            task.id.as_u64(),
                            task.pid,
                            task.tid,
                            vaddr.as_u64(),
                            stack_frame.instruction_pointer.as_u64(),
                            reason
                        );
                    }
                }
            }
        }
    }

    // Demand paging: ask the task's address space to materialize the page.
    if is_user {
        if let Some(task) = crate::process::current_task_clone() {
            let address_space = task.process.address_space_arc();
            if let Ok(vaddr) = fault_addr {
                if do_pf_trace {
                    // FORCE OUTPUT for the first user faults only; lazy demand paging can
                    // legitimately fault thousands of times during boot and flood serial.
                    crate::serial_force_println!(
                        "\x1b[33m[pagefault] USER fault\x1b[0m: tid={} rip={:#x} addr={:#x} err={:#x}",
                        task.tid,
                        rip,
                        vaddr.as_u64(),
                        error_code.bits()
                    );
                    // Mirror to e9 so the first handled faults stay visible in e9_debug.log.
                    crate::e9_println!(
                        "[PF] tid={} rip={:#x} addr={:#x} err={:#x}",
                        task.tid,
                        rip,
                        vaddr.as_u64(),
                        error_code.bits()
                    );
                }

                match address_space.handle_fault(vaddr.as_u64()) {
                    Ok(()) => {
                        if do_pf_trace {
                            crate::serial_force_println!(
                                "\x1b[32m[pagefault] USER fault resolved\x1b[0m: tid={} addr={:#x}",
                                task.tid,
                                vaddr.as_u64()
                            );
                        }
                        return;
                    }
                    Err(e) => {
                        crate::serial_force_println!(
                            "\x1b[31m[pagefault] USER fault resolution FAILED\x1b[0m: tid={} addr={:#x} err={:?}",
                            task.tid,
                            vaddr.as_u64(),
                            e
                        );
                        crate::e9_println!(
                            "[PF-FAIL] tid={} rip={:#x} addr={:#x}",
                            task.tid,
                            rip,
                            vaddr.as_u64()
                        );
                        dump_user_pf_context(&address_space, rip, user_rsp);
                    }
                }
            }
        }
    }

    // Unresolvable user fault → silo (task-level) handling; kernel fault → log.
    if is_user {
        if let Some(tid) = crate::process::current_task_id() {
            crate::silo::handle_user_fault(
                tid,
                crate::silo::SiloFaultReason::PageFault,
                fault_addr.as_ref().map(|v| v.as_u64()).unwrap_or(0),
                error_code.bits() as u64,
                stack_frame.instruction_pointer.as_u64(),
            );
            return;
        }
    } else {
        // FORCE OUTPUT for kernel fault
        crate::serial_force_println!(
            "\x1b[31;1m[pagefault] KERNEL fault\x1b[0m: rip={:#x} addr={:#x} err={:#x}",
            rip,
            fault_addr.as_ref().map(|v| v.as_u64()).unwrap_or(0),
            error_code.bits()
        );
    }

    // Capture current task (non-blocking, safe from IRQ context) for the diagnostic dump.
    let task_snap = crate::process::scheduler::current_task_clone_try();
    dump_page_fault_full(&stack_frame, error_code, fault_addr, &task_snap);
}
852
853// =============================================================================
854// CRITICAL: Full page fault diagnostic dump
855//
856// Invoked for every non-recoverable page fault (kernel or unhandled user).
857// Designed to be deadlock-safe:
858//   - Uses serial_println! (direct UART) instead of the log framework, which
859//     may itself allocate or acquire locks.
860//   - All memory reads go through translate_via_raw_pt so no unmapped address
861//     is ever dereferenced.
862//   - The buddy allocator lock is acquired with try_lock (non-blocking) for
863//     memory statistics.
864//   - Uses current_task_clone_try (non-blocking) instead of current_task_clone.
865// =============================================================================
866
867/// Decodes `PageFaultErrorCode` bits into a human-readable string.
868fn decode_error_code(ec: PageFaultErrorCode) -> &'static str {
869    let p = ec.contains(PageFaultErrorCode::PROTECTION_VIOLATION);
870    let w = ec.contains(PageFaultErrorCode::CAUSED_BY_WRITE);
871    let u = ec.contains(PageFaultErrorCode::USER_MODE);
872    match (p, w, u) {
873        (false, false, false) => "kernel read of non-present page",
874        (false, true, false) => "kernel write to non-present page",
875        (false, false, true) => "user read of non-present page",
876        (false, true, true) => "user write to non-present page",
877        (true, false, false) => "kernel read protection violation",
878        (true, true, false) => "kernel write protection violation (COW / RO page)",
879        (true, false, true) => "user read protection violation (NX / supervisor-only)",
880        (true, true, true) => "user write protection violation (COW / RO page)",
881    }
882}
883
/// Formats page table entry flags into a short human-readable byte string.
///
/// Produces a pipe-separated list of the set flags (e.g. `P|RW|US`) in a
/// fixed 32-byte buffer padded with spaces; the trailing separator is
/// replaced by a space. Fixed buffer, no allocation: safe in fault context.
fn format_pte_flags(entry: u64) -> [u8; 32] {
    const FLAG_BITS: [(&str, u64); 10] = [
        ("P", 1 << 0),
        ("RW", 1 << 1),
        ("US", 1 << 2),
        ("PWT", 1 << 3),
        ("PCD", 1 << 4),
        ("A", 1 << 5),
        ("D", 1 << 6),
        ("PS", 1 << 7),
        ("G", 1 << 8),
        ("NX", 1 << 63),
    ];
    let mut out = [b' '; 32];
    let mut len = 0usize;
    for (name, mask) in FLAG_BITS {
        if entry & mask == 0 {
            continue;
        }
        // Append the flag name followed by a '|' separator, dropping
        // bytes silently if the buffer is full.
        for &ch in name.as_bytes().iter().chain(b"|") {
            if len < out.len() {
                out[len] = ch;
                len += 1;
            }
        }
    }
    // Turn the final '|' back into padding.
    if len > 0 && out[len - 1] == b'|' {
        out[len - 1] = b' ';
    }
    out
}
919
/// Translates a virtual address to a physical address via a manual 4-level
/// page table walk.  Returns `Some(phys)` or `None` if any level is absent.
///
/// Handles 1 GiB (PDPT.PS) and 2 MiB (PD.PS) huge pages as well as 4 KiB
/// leaf pages.
///
/// # Parameters
/// - `vaddr`: virtual address to translate (assumed canonical — TODO confirm
///   callers never pass non-canonical values).
/// - `cr3_phys`: physical base of the PML4 to walk (need not be the live CR3).
/// - `hhdm`: higher-half direct-map offset used to read physical memory.
///
/// # SAFETY
/// Read-only access to page tables through the HHDM mapping.
/// All intermediate addresses are derived from table entries : no pointer
/// originating from user-controlled data is ever dereferenced.
fn translate_via_raw_pt(vaddr: u64, cr3_phys: u64, hhdm: u64) -> Option<u64> {
    unsafe {
        // PML4: index from vaddr bits 47:39; entry bit 0 = Present.
        let l4_ptr = (cr3_phys + hhdm) as *const u64;
        let l4e = *l4_ptr.add(((vaddr >> 39) & 0x1FF) as usize);
        if l4e & 1 == 0 {
            return None;
        }

        // PDPT: index from bits 38:30. The 0x000F_FFFF_FFFF_F000 mask
        // extracts the next-level physical frame from the entry.
        let l3_ptr = ((l4e & 0x000F_FFFF_FFFF_F000) + hhdm) as *const u64;
        let l3e = *l3_ptr.add(((vaddr >> 30) & 0x1FF) as usize);
        if l3e & 1 == 0 {
            return None;
        }
        // Bit 7 (PS) at this level => 1 GiB huge page leaf.
        if l3e & 0x80 != 0 {
            return Some((l3e & 0x000F_FFFF_C000_0000) + (vaddr & 0x3FFF_FFFF));
        }

        // PD: index from bits 29:21.
        let l2_ptr = ((l3e & 0x000F_FFFF_FFFF_F000) + hhdm) as *const u64;
        let l2e = *l2_ptr.add(((vaddr >> 21) & 0x1FF) as usize);
        if l2e & 1 == 0 {
            return None;
        }
        // Bit 7 (PS) at this level => 2 MiB huge page leaf.
        if l2e & 0x80 != 0 {
            return Some((l2e & 0x000F_FFFF_FFE0_0000) + (vaddr & 0x1F_FFFF));
        }

        // PT: index from bits 20:12; 4 KiB leaf page.
        let l1_ptr = ((l2e & 0x000F_FFFF_FFFF_F000) + hhdm) as *const u64;
        let l1e = *l1_ptr.add(((vaddr >> 12) & 0x1FF) as usize);
        if l1e & 1 == 0 {
            return None;
        }
        Some((l1e & 0x000F_FFFF_FFFF_F000) + (vaddr & 0xFFF))
    }
}
961
/// Hex + ASCII dump of `count` bytes at virtual address `vaddr`.
/// Each page boundary is translated through the raw page tables.
///
/// Unmapped pages are reported (never dereferenced) and skipped.  Output
/// goes to the serial port, 16 bytes per line, each line starting with
/// `prefix`.  `cr3_phys` selects the address space to read through.
fn dump_memory_bytes(vaddr: u64, cr3_phys: u64, count: usize, prefix: &str) {
    let hhdm = crate::memory::hhdm_offset();
    let mut offset = 0usize;
    while offset < count {
        let cur_va = vaddr.wrapping_add(offset as u64);
        let page_off = (cur_va & 0xFFF) as usize;
        // Never read across a 4 KiB boundary: each page is translated and
        // validated individually, so `page_off + chunk <= 0x1000`.
        let chunk = core::cmp::min(count - offset, 0x1000 - page_off);
        let Some(phys) = translate_via_raw_pt(cur_va, cr3_phys, hhdm) else {
            crate::serial_println!("{}(page {:#x} not mapped)", prefix, cur_va);
            offset += chunk;
            continue;
        };
        // SAFETY: read-only access to a valid physical page through the HHDM mapping.
        // `src` points at the page base; all reads below stay within the page.
        let src = (phys - (cur_va & 0xFFF) + hhdm) as *const u8;
        let mut line_off = 0usize;
        while line_off < chunk {
            let ll = core::cmp::min(16, chunk - line_off);
            let line_va = cur_va.wrapping_add(line_off as u64);
            let mut hex = [0u8; 48]; // 16 bytes x "xx " per output line
            let mut asc = [b'.'; 16]; // printable-ASCII column
            for i in 0..ll {
                let byte = unsafe { *src.add(page_off + line_off + i) };
                let hi = byte >> 4;
                let lo = byte & 0xF;
                // Manual nibble-to-hex conversion: no allocation in fault context.
                hex[i * 3] = if hi < 10 { b'0' + hi } else { b'a' + hi - 10 };
                hex[i * 3 + 1] = if lo < 10 { b'0' + lo } else { b'a' + lo - 10 };
                hex[i * 3 + 2] = b' ';
                if byte >= 0x20 && byte < 0x7F {
                    asc[i] = byte;
                }
            }
            // Pad a short final line so the ASCII column stays aligned.
            for i in ll..16 {
                hex[i * 3] = b' ';
                hex[i * 3 + 1] = b' ';
                hex[i * 3 + 2] = b' ';
            }
            crate::serial_println!(
                "{}{:#018x}: {} |{}|",
                prefix,
                line_va,
                core::str::from_utf8(&hex[..48]).unwrap_or("???"),
                core::str::from_utf8(&asc[..ll]).unwrap_or("???")
            );
            line_off += ll;
        }
        offset += chunk;
    }
}
1012
/// Detailed page table walk with flag decoding at every level.
///
/// Walks PML4 → PDPT → PD → PT for `vaddr` under the page tables rooted at
/// `cr3_phys`, printing every entry with decoded flags, and stops at the
/// first non-present level or at a 1 GiB / 2 MiB huge-page leaf.  For a
/// 4 KiB leaf it also prints the neighbouring PT entries for context.
fn dump_page_table_walk(vaddr: u64, cr3_phys: u64) {
    let hhdm = crate::memory::hhdm_offset();
    // Split the virtual address into the four 9-bit table indices.
    let l4_idx = ((vaddr >> 39) & 0x1FF) as usize;
    let l3_idx = ((vaddr >> 30) & 0x1FF) as usize;
    let l2_idx = ((vaddr >> 21) & 0x1FF) as usize;
    let l1_idx = ((vaddr >> 12) & 0x1FF) as usize;

    // SAFETY: read-only access through the HHDM mapping for diagnostic purposes.
    unsafe {
        let l4_ptr = (cr3_phys + hhdm) as *const u64;
        let l4e = *l4_ptr.add(l4_idx);
        let f = format_pte_flags(l4e);
        crate::serial_println!(
            "  PML4[{:>3}] = {:#018x}  phys={:#014x}  [{}]",
            l4_idx,
            l4e,
            l4e & 0x000F_FFFF_FFFF_F000,
            core::str::from_utf8(&f).unwrap_or("?").trim()
        );
        if l4e & 1 == 0 {
            crate::serial_println!("  \x1b[1;31m╰→ STOP: PML4 not present\x1b[0m");
            return;
        }

        let l3_ptr = ((l4e & 0x000F_FFFF_FFFF_F000) + hhdm) as *const u64;
        let l3e = *l3_ptr.add(l3_idx);
        let f = format_pte_flags(l3e);
        crate::serial_println!(
            "  PDPT[{:>3}] = {:#018x}  phys={:#014x}  [{}]",
            l3_idx,
            l3e,
            l3e & 0x000F_FFFF_FFFF_F000,
            core::str::from_utf8(&f).unwrap_or("?").trim()
        );
        if l3e & 1 == 0 {
            crate::serial_println!("  \x1b[1;31m╰→ STOP: PDPT not present\x1b[0m");
            return;
        }
        // PS bit set at PDPT level: translation ends at a 1 GiB page.
        if l3e & 0x80 != 0 {
            crate::serial_println!(
                "  ╰→ 1 GiB huge page → phys {:#x}",
                l3e & 0x000F_FFFF_C000_0000
            );
            return;
        } // 1 GiB

        let l2_ptr = ((l3e & 0x000F_FFFF_FFFF_F000) + hhdm) as *const u64;
        let l2e = *l2_ptr.add(l2_idx);
        let f = format_pte_flags(l2e);
        crate::serial_println!(
            "  PD  [{:>3}] = {:#018x}  phys={:#014x}  [{}]",
            l2_idx,
            l2e,
            l2e & 0x000F_FFFF_FFFF_F000,
            core::str::from_utf8(&f).unwrap_or("?").trim()
        );
        if l2e & 1 == 0 {
            crate::serial_println!("  \x1b[1;31m╰→ STOP: PD not present\x1b[0m");
            return;
        }
        // PS bit set at PD level: translation ends at a 2 MiB page.
        if l2e & 0x80 != 0 {
            crate::serial_println!(
                "  ╰→ 2 MiB huge page → phys {:#x}",
                l2e & 0x000F_FFFF_FFE0_0000
            );
            return;
        } // 2 MiB

        let l1_ptr = ((l2e & 0x000F_FFFF_FFFF_F000) + hhdm) as *const u64;
        let l1e = *l1_ptr.add(l1_idx);
        let f = format_pte_flags(l1e);
        crate::serial_println!(
            "  PT  [{:>3}] = {:#018x}  phys={:#014x}  [{}]",
            l1_idx,
            l1e,
            l1e & 0x000F_FFFF_FFFF_F000,
            core::str::from_utf8(&f).unwrap_or("?").trim()
        );
        if l1e & 1 == 0 {
            crate::serial_println!("  \x1b[1;31m╰→ STOP: PT not present\x1b[0m");
        } else {
            // A present leaf means the fault was a protection violation,
            // not a missing mapping — point the reader at the flags.
            crate::serial_println!(
                "  \x1b[1;32m╰→ PAGE PRESENT\x1b[0m → phys {:#x} (check RW/US/NX flags)",
                l1e & 0x000F_FFFF_FFFF_F000
            );
        }
        // Neighbouring PT entries for context
        crate::serial_println!("  --- Neighbouring PT entries ---");
        // Window of up to two entries on either side, clamped to [0, 512).
        let start = if l1_idx >= 2 { l1_idx - 2 } else { 0 };
        for i in start..core::cmp::min(l1_idx + 3, 512) {
            let e = *l1_ptr.add(i);
            if e != 0 {
                let f = format_pte_flags(e);
                crate::serial_println!(
                    "    PT[{:>3}] = {:#018x}  [{}]{}",
                    i,
                    e,
                    core::str::from_utf8(&f).unwrap_or("?").trim(),
                    if i == l1_idx { " <<<" } else { "" }
                );
            }
        }
    }
}
1118
1119/// Dumps VMA regions near the faulting address.
1120fn dump_nearby_vma_regions(as_ref: &crate::memory::AddressSpace, fault_vaddr: u64) {
1121    let page_start = fault_vaddr & !0xFFF;
1122    let probes = [
1123        page_start,
1124        fault_vaddr & !0x1F_FFFF,
1125        fault_vaddr & !0x3FFF_FFFF,
1126        0x0000_0001_0000_0000,
1127        0x0000_0000_0040_0000,
1128        0x0000_7FFF_F000_0000,
1129    ];
1130    let mut found_any = false;
1131    for &p in &probes {
1132        if let Some(vma) = as_ref.region_by_start(p) {
1133            let end = vma.start + (vma.page_count as u64) * vma.page_size.bytes();
1134            let hit = fault_vaddr >= vma.start && fault_vaddr < end;
1135            crate::serial_println!(
1136                "  VMA {:#014x}..{:#014x}  pages={:<5}  type={:?}  flags={:?}  pgsz={:?}{}",
1137                vma.start,
1138                end,
1139                vma.page_count,
1140                vma.vma_type,
1141                vma.flags,
1142                vma.page_size,
1143                if hit {
1144                    "  \x1b[1;32m<<< FAULT\x1b[0m"
1145                } else {
1146                    ""
1147                }
1148            );
1149            found_any = true;
1150        }
1151    }
1152    if as_ref.has_mapping_in_range(page_start, 0x1000) {
1153        crate::serial_println!(
1154            "  Note: fault page {:#x} IS within a tracked mapping range",
1155            page_start
1156        );
1157    } else {
1158        crate::serial_println!(
1159            "  Note: fault page {:#x} is NOT within any tracked mapping range",
1160            page_start
1161        );
1162    }
1163    if !found_any {
1164        crate::serial_println!("  (no VMA regions found at probed addresses)");
1165    }
1166}
1167
/// Full diagnostic dump for a non-recoverable page fault.
///
/// Uses `serial_println!` directly (lock-free UART) to avoid any deadlock
/// with the log framework or the heap allocator.
///
/// Prints, in order: decoded error code, faulting context (CR2/RIP/RSP/
/// segments/RFLAGS), control registers, CPU and task context, memory
/// statistics (best-effort via `try_lock`), code and stack hex dumps, a raw
/// page-table walk of CR2, and nearby VMA regions — then panics, so this
/// function never returns (`-> !`).
///
/// # Parameters
/// - `stack_frame`: CPU-pushed interrupt frame of the faulting context.
/// - `error_code`: raw page-fault error code from the CPU.
/// - `fault_addr`: CR2 as read by the caller; `Err` if non-canonical.
/// - `task`: snapshot of the current task, if one could be captured
///   non-blockingly (see `current_task_clone_try`).
fn dump_page_fault_full(
    stack_frame: &InterruptStackFrame,
    error_code: PageFaultErrorCode,
    fault_addr: Result<x86_64::VirtAddr, x86_64::addr::VirtAddrNotValid>,
    task: &Option<alloc::sync::Arc<crate::process::task::Task>>,
) -> ! {
    use x86_64::registers::control::{Cr0, Cr3, Cr4};

    let rip = stack_frame.instruction_pointer.as_u64();
    let rsp = stack_frame.stack_pointer.as_u64();
    let cs = stack_frame.code_segment.0;
    let ss = stack_frame.stack_segment.0;
    let rflags = stack_frame.cpu_flags.bits();
    // CR2 may be non-canonical; display 0 in that case (the walk below is
    // skipped for non-canonical addresses).
    let fault_vaddr = fault_addr.as_ref().map(|v| v.as_u64()).unwrap_or(0);
    let is_user = (cs & 3) == 3;

    crate::serial_println!("\x1b[1;31m");
    crate::serial_println!("╔══════════════════════════════════════════════════════════════════╗");
    crate::serial_println!("║                  KERNEL PAGE FAULT EXCEPTION                    ║");
    crate::serial_println!(
        "╚══════════════════════════════════════════════════════════════════╝\x1b[0m"
    );

    // --- Error code ---
    crate::serial_println!("\x1b[1;33m--- Error Code ---\x1b[0m");
    crate::serial_println!("  Raw         : {:#06x}", error_code.bits());
    crate::serial_println!(
        "  Diagnostic  : \x1b[1;31m{}\x1b[0m",
        decode_error_code(error_code)
    );
    // Bits 3 (RSVD) and 4 (instruction fetch) are extracted from the raw
    // value directly rather than through `PageFaultErrorCode` flags.
    crate::serial_println!(
        "  PRESENT     : {} | WRITE : {} | USER : {} | RSVD : {} | FETCH : {}",
        error_code.contains(PageFaultErrorCode::PROTECTION_VIOLATION) as u8,
        error_code.contains(PageFaultErrorCode::CAUSED_BY_WRITE) as u8,
        error_code.contains(PageFaultErrorCode::USER_MODE) as u8,
        (error_code.bits() >> 3) & 1,
        (error_code.bits() >> 4) & 1
    );

    // --- Faulting context ---
    crate::serial_println!("\x1b[1;33m--- Faulting Context ---\x1b[0m");
    crate::serial_println!("  CR2 (addr)  : \x1b[1;35m{:#018x}\x1b[0m", fault_vaddr);
    crate::serial_println!("  RIP         : \x1b[1;36m{:#018x}\x1b[0m", rip);
    crate::serial_println!("  RSP         : {:#018x}", rsp);
    crate::serial_println!(
        "  CS          : {:#06x}  (ring={}{}) | SS : {:#06x}",
        cs,
        cs & 3,
        if is_user { " USER" } else { " KERNEL" },
        ss
    );

    // Decoded RFLAGS: bit values follow the x86 FLAGS register layout
    // (CF=0, PF=2, AF=4, ZF=6, SF=7, TF=8, IF=9, DF=10, OF=11).
    let mut rf_str = [0u8; 64];
    let mut rfp = 0usize;
    for &(name, bit) in &[
        ("CF", 1u64),
        ("PF", 4),
        ("AF", 16),
        ("ZF", 64),
        ("SF", 128),
        ("TF", 256),
        ("IF", 512),
        ("DF", 1024),
        ("OF", 2048),
    ] {
        if rflags & bit != 0 {
            for &b in name.as_bytes() {
                if rfp < rf_str.len() {
                    rf_str[rfp] = b;
                    rfp += 1;
                }
            }
            if rfp < rf_str.len() {
                rf_str[rfp] = b' ';
                rfp += 1;
            }
        }
    }
    crate::serial_println!(
        "  RFLAGS      : {:#018x}  [{}]",
        rflags,
        core::str::from_utf8(&rf_str[..rfp]).unwrap_or("?")
    );

    // --- Control registers ---
    crate::serial_println!("\x1b[1;33m--- Control Registers ---\x1b[0m");
    let cr0 = Cr0::read_raw();
    let (cr3_frame, cr3_flags) = Cr3::read();
    let cr3_phys = cr3_frame.start_address().as_u64();
    let cr4 = Cr4::read_raw();
    let efer: u64 = x86_64::registers::model_specific::Efer::read_raw();
    crate::serial_println!("  CR0         : {:#018x}", cr0);
    crate::serial_println!(
        "  CR3         : {:#018x}  (flags={:#x})",
        cr3_phys,
        cr3_flags.bits()
    );
    crate::serial_println!("  CR4         : {:#018x}", cr4);
    crate::serial_println!(
        "  EFER        : {:#018x}  [{}{}{}]",
        efer,
        if efer & 1 != 0 { "SCE " } else { "" },
        if efer & (1 << 8) != 0 { "LME " } else { "" },
        if efer & (1 << 11) != 0 { "NXE" } else { "" }
    );

    // --- CPU context ---
    crate::serial_println!("\x1b[1;33m--- CPU Context ---\x1b[0m");
    crate::serial_println!("  LAPIC ID    : {}", super::apic::lapic_id());
    crate::serial_println!("  Ticks sched : {}", crate::process::scheduler::ticks());
    crate::serial_println!("  HHDM offset : {:#x}", crate::memory::hhdm_offset());

    // --- Task context ---
    crate::serial_println!("\x1b[1;33m--- Task Context ---\x1b[0m");
    if let Some(ref t) = *task {
        crate::serial_println!(
            "  ID={} PID={} TID={} TGID={} name=\"{}\" prio={:?} ticks={}",
            t.id.as_u64(),
            t.pid,
            t.tid,
            t.tgid,
            t.name,
            t.priority,
            t.ticks.load(core::sync::atomic::Ordering::Relaxed)
        );
        // SAFETY: Read task CR3 safely using the hardware page-table walker
        // (translate_via_raw_pt) to prevent recursive page faults if the
        // process's Arc<AddressSpace> is partially initialized or corrupted.
        //
        // Chain: &t.process → Arc<Process> data ptr (Arc::as_ptr)
        //      → (*process).address_space.get() → *mut Arc<AddressSpace>
        //      → Arc::as_ptr(arc_as) → *const AddressSpace
        //      → (*addr_space).cr3_phys
        //
        // Each step uses translate_via_raw_pt to verify the pointer is mapped
        // before dereferencing, using the hardware CR3 (cr3_phys) which always
        // maps the kernel's HHDM region.
        //
        // NOTE(review): the "+ 16" data offset below relies on alloc's
        // ArcInner layout (strong, weak, data) — an unstable implementation
        // detail of the Rust standard library; re-verify on toolchain upgrades.
        let task_cr3: u64 = {
            let hhdm = crate::memory::hhdm_offset();
            // Step 1: Arc<Process> data (Arc::as_ptr is always valid for a live Arc)
            let _proc_ptr: u64 = alloc::sync::Arc::as_ptr(&t.process) as u64;
            // Step 2: address_space field in Process = SyncUnsafeCell whose .get()
            // returns a raw ptr into the Process data : always valid for a live Process.
            // However, reading the Arc<AddressSpace> *value* from that pointer may
            // fault if the memory is unmapped, so we use translate_via_raw_pt.
            let as_cell_addr: u64 =
                unsafe { (*alloc::sync::Arc::as_ptr(&t.process)).address_space.get() as u64 };
            // Step 3: read the 8-byte Arc<AddressSpace> inner pointer from as_cell_addr
            // via raw page table walk with current hardware CR3.
            let as_inner_u64: u64 = match translate_via_raw_pt(as_cell_addr, cr3_phys, hhdm) {
                Some(phys) => unsafe { *((phys + hhdm) as *const u64) },
                None => 0,
            };
            if as_inner_u64 == 0 {
                0u64
            } else {
                // as_inner_u64 is the NonNull ptr inside Arc<AddressSpace>
                // = pointer to ArcInner<AddressSpace>.
                // ArcInner = strong(8) + weak(8) + data(AddressSpace).
                // So AddressSpace data is at as_inner_u64 + 16.
                let as_data_ptr: u64 = as_inner_u64 + 2 * core::mem::size_of::<usize>() as u64;
                // cr3_phys is the first field of AddressSpace (PhysAddr = u64, 8 bytes).
                match translate_via_raw_pt(as_data_ptr, cr3_phys, hhdm) {
                    Some(phys) => unsafe { *((phys + hhdm) as *const u64) },
                    None => 0,
                }
            }
        };
        if task_cr3 == 0 {
            crate::serial_println!(
                "  Task CR3    : <unreadable : null/unmapped Arc<AddressSpace>>"
            );
        } else {
            // A mismatch here usually means the fault hit during an address
            // space switch or the task snapshot is stale.
            crate::serial_println!(
                "  Task CR3    : {:#018x}{}",
                task_cr3,
                if task_cr3 != cr3_phys {
                    " *** DIFFERS from hardware CR3! ***"
                } else {
                    " (matches hardware CR3)"
                }
            );
        }
    } else {
        crate::serial_println!("  (no current task : scheduler idle or unavailable)");
    }

    // --- Memory statistics ---
    // try_lock only: never block on the allocator lock from fault context.
    crate::serial_println!("\x1b[1;33m--- Memory Stats ---\x1b[0m");
    if let Some(guard) = crate::memory::get_allocator().try_lock() {
        if let Some(ref alloc) = *guard {
            let (total, allocated) = alloc.page_totals();
            let free = total.saturating_sub(allocated);
            crate::serial_println!(
                "  Total={} pages ({} MiB)  Alloc={} ({} MiB)  Free={} ({} MiB)",
                total,
                total * 4 / 1024,
                allocated,
                allocated * 4 / 1024,
                free,
                free * 4 / 1024
            );
            let mut zones =
                [crate::memory::buddy::ZoneStats::empty(); crate::memory::zone::ZoneType::COUNT];
            let n = alloc.zone_snapshot(&mut zones);
            for i in 0..n {
                let zone = zones[i];
                let zone_ref = alloc.get_zone(i);
                crate::serial_println!(
                    "    Zone {} ({}): base={:#x} managed={} present={} spanned={} reserved={} alloc={} free={} state={:?} seg={}/{} largest={:?}",
                    i,
                    match zone.zone_type {
                        crate::memory::zone::ZoneType::DMA => "DMA",
                        crate::memory::zone::ZoneType::Normal => "Normal",
                        crate::memory::zone::ZoneType::HighMem => "High",
                    },
                    zone.base,
                    zone.managed_pages,
                    zone.present_pages,
                    zone.spanned_pages,
                    zone.reserved_pages,
                    zone.allocated_pages,
                    zone.free_pages,
                    zone.pressure(),
                    zone.segment_count,
                    zone.segment_capacity,
                    zone.largest_free_order
                );
                crate::serial_println!(
                    "      reserve={} avail={} holes={} cached[u/m]={}/{} free[u/m]={}/{} pageblocks[u/m]={}/{} total={} order={}",
                    zone.reserve_floor_pages(),
                    zone.available_after_reserve_pages(),
                    zone.hole_pages(),
                    zone.cached_unmovable_pages,
                    zone.cached_movable_pages,
                    zone.unmovable_free_pages,
                    zone.movable_free_pages,
                    zone.unmovable_pageblocks,
                    zone.movable_pageblocks,
                    zone.pageblock_count,
                    crate::memory::zone::PAGEBLOCK_ORDER
                );
                crate::serial_println!(
                    "      frag/order: o1={}%% o4={}%% o{}={}%%",
                    zone_ref.fragmentation_score(1, zone.cached_pages),
                    zone_ref.fragmentation_score(4, zone.cached_pages),
                    crate::memory::zone::PAGEBLOCK_ORDER,
                    zone_ref.fragmentation_score(
                        crate::memory::zone::PAGEBLOCK_ORDER as u8,
                        zone.cached_pages,
                    )
                );
            }
        } else {
            crate::serial_println!("  (allocator not initialized)");
        }
    } else {
        crate::serial_println!("  (allocator lock contended : skipping)");
    }

    // Lock-free buddy-allocator counters (snapshots, safe from fault context).
    let quarantine = crate::memory::buddy::poison_quarantine_pages_snapshot();
    let fail_counts = crate::memory::buddy::buddy_alloc_fail_counts_snapshot();
    let compaction = crate::memory::buddy::compaction_stats_snapshot();
    crate::serial_println!("  Poison quarantine : {} pages", quarantine);

    let mut printed_fail = false;
    for (order, count) in fail_counts.iter().enumerate() {
        if *count == 0 {
            continue;
        }
        printed_fail = true;
        crate::serial_println!("  Buddy alloc fail  : order={} count={}", order, count);
    }
    if !printed_fail {
        crate::serial_println!("  Buddy alloc fail  : none");
    }

    if compaction.attempts == 0 {
        crate::serial_println!("  Compaction assist : none");
    } else {
        crate::serial_println!(
            "  Compaction assist : attempts={} success={} last_order={:?} migratetype={:?} zone={:?} pressure={:?}",
            compaction.attempts,
            compaction.successes,
            compaction.last_order,
            compaction.last_migratetype,
            compaction.last_zone,
            compaction.last_pressure
        );
        crate::serial_println!(
            "                      frag={}%% req={} avail={} usable={} cached={} drained={} pageblocks={}/{}",
            compaction.last_fragmentation_score,
            compaction.last_requested_pages,
            compaction.last_available_pages,
            compaction.last_usable_pages,
            compaction.last_cached_pages,
            compaction.last_drained_pages,
            compaction.last_matching_pageblocks,
            compaction.last_pageblock_count
        );
    }

    // --- Code bytes at RIP ---
    crate::serial_println!("\x1b[1;33m--- Code at RIP ({:#x}) ---\x1b[0m", rip);
    dump_memory_bytes(rip, cr3_phys, 32, "  ");

    // --- Stack dump ---
    crate::serial_println!("\x1b[1;33m--- Stack Dump (RSP={:#x}) ---\x1b[0m", rsp);
    dump_memory_bytes(rsp, cr3_phys, 128, "  ");

    // --- Page table walk ---
    crate::serial_println!(
        "\x1b[1;33m--- Page Table Walk (CR2={:#x}, CR3={:#x}) ---\x1b[0m",
        fault_vaddr,
        cr3_phys
    );
    if fault_addr.is_ok() {
        dump_page_table_walk(fault_vaddr, cr3_phys);
    } else {
        crate::serial_println!("  (CR2 is a non-canonical address: {:#x})", fault_vaddr);
    }

    // --- VMA regions near fault ---
    if let Some(ref t) = *task {
        crate::serial_println!("\x1b[1;33m--- VMA Regions Near Fault ---\x1b[0m");
        // SAFETY: Use the same safe ptr-chain read strategy as the Task CR3 section above:
        // Arc::as_ptr gives a valid *const AddressSpace if the Arc is alive, but the
        // Arc<AddressSpace> stored inside the SyncUnsafeCell might be corrupted.
        // We validate via translate_via_raw_pt before reading the inner ptr.
        let hhdm_vma = crate::memory::hhdm_offset();
        let safe_as: Option<*const crate::memory::AddressSpace> = unsafe {
            let as_cell_addr: u64 =
                (*alloc::sync::Arc::as_ptr(&t.process)).address_space.get() as u64;
            match translate_via_raw_pt(as_cell_addr, cr3_phys, hhdm_vma) {
                Some(phys) => {
                    // Read the Arc<AddressSpace> inner pointer (a NonNull ptr stored at this phys)
                    let as_inner_u64 = *((phys + hhdm_vma) as *const u64);
                    if as_inner_u64 == 0 {
                        None
                    } else {
                        // ArcInner<AddressSpace>.data at +16
                        // (same unstable-layout caveat as the Task CR3 section).
                        let as_data_ptr = (as_inner_u64 + 2 * core::mem::size_of::<usize>() as u64)
                            as *const crate::memory::AddressSpace;
                        // Validate the AddressSpace pointer is mapped before returning it
                        if translate_via_raw_pt(as_data_ptr as u64, cr3_phys, hhdm_vma).is_some() {
                            Some(as_data_ptr)
                        } else {
                            None
                        }
                    }
                }
                None => None,
            }
        };
        if let Some(as_ptr) = safe_as {
            // SAFETY: We verified above that as_ptr is mapped and readable.
            let as_ref = unsafe { &*as_ptr };
            dump_nearby_vma_regions(as_ref, fault_vaddr);
        } else {
            crate::serial_println!("  (AddressSpace unreadable : skipping VMA dump)");
        }
    }

    crate::serial_println!(
        "\x1b[1;31m╔══════════════════════════════════════════════════════════════════╗"
    );
    crate::serial_println!("║                     END OF PAGE FAULT DUMP                      ║");
    crate::serial_println!(
        "╚══════════════════════════════════════════════════════════════════╝\x1b[0m"
    );

    // Diverge: a non-recoverable page fault always ends in a kernel panic.
    panic!(
        "PAGE FAULT: {} at {:#x}, RIP={:#x}, CR3={:#x}, err={:#x}",
        decode_error_code(error_code),
        fault_vaddr,
        rip,
        cr3_phys,
        error_code.bits()
    );
}
1553
1554/// Performs the dump user pf context operation.
1555fn dump_user_pf_context(as_ref: &crate::memory::AddressSpace, rip: u64, rsp: u64) {
1556    use x86_64::VirtAddr;
1557
1558    let hhdm = crate::memory::hhdm_offset();
1559
1560    if let Some(phys) = as_ref.translate(VirtAddr::new(rip)) {
1561        let off = (rip & 0xfff) as usize;
1562        let mut bytes = [0u8; 8];
1563        // SAFETY: We read at most 8 bytes from a mapped user instruction page via HHDM.
1564        unsafe {
1565            let src = (phys.as_u64() - (rip & 0xfff) + hhdm + off as u64) as *const u8;
1566            core::ptr::copy_nonoverlapping(src, bytes.as_mut_ptr(), bytes.len());
1567        }
1568        crate::serial_println!(
1569            "[pagefault] ctx: rsp={:#x} rip-bytes={:02x} {:02x} {:02x} {:02x} {:02x} {:02x} {:02x} {:02x}",
1570            rsp,
1571            bytes[0],
1572            bytes[1],
1573            bytes[2],
1574            bytes[3],
1575            bytes[4],
1576            bytes[5],
1577            bytes[6],
1578            bytes[7],
1579        );
1580    } else {
1581        crate::serial_println!("[pagefault] ctx: rsp={:#x} rip page unmapped", rsp);
1582    }
1583
1584    if let Some(phys) = as_ref.translate(VirtAddr::new(rsp)) {
1585        crate::serial_println!(
1586            "[pagefault] stack-top: rsp mapped (phys={:#x})",
1587            phys.as_u64()
1588        );
1589    } else {
1590        crate::serial_println!("[pagefault] stack-top: rsp unmapped");
1591    }
1592}
1593
/// #GP (General Protection Fault, vector 13) handler.
///
/// Dispatches on the fault origin:
/// 1. `swapgs`→`iretq` window (CS reads Ring 0 but GS base was the user
///    value): the iretq frame is presumed invalid — log and panic.
/// 2. Ring 3 with a known current task: report the fault to the silo
///    layer (which decides the task's fate) and return.
/// 3. Any other (kernel-mode) #GP: log context and panic.
extern "x86-interrupt" fn general_protection_fault_handler(
    stack_frame: InterruptStackFrame,
    error_code: u64,
) {
    let cs = stack_frame.code_segment.0;
    let is_user = (cs & 3) == 3;
    // Use rdmsr-based check: catches the swapgs→iretq window where
    // CS=Ring0 but GS=user (0).  Without this, #GP from a bad iretq
    // would escalate to double fault → triple fault.
    let swapgs_needed = needs_swapgs(cs);
    let _gs = SwapGsGuard::new(swapgs_needed);
    // Detect the swapgs→iretq window case: CS says Ring 0 but GS was user.
    if swapgs_needed && !is_user {
        crate::serial_force_println!(
            "\x1b[31;1m[GPF]\x1b[0m SWAPGS-WINDOW: CS={:#x} (Ring0) but GS was user! rip={:#x} err={:#x} rsp={:#x}",
            cs,
            stack_frame.instruction_pointer.as_u64(),
            error_code,
            stack_frame.stack_pointer.as_u64()
        );
        panic!("#GP in swapgs→iretq window (iretq frame invalid?)");
    }
    // Ring-3 fault: hand off to the silo fault path and resume normally.
    // If no current task id is available we fall through to the kernel
    // panic below, since the fault cannot be attributed to a task.
    if is_user {
        if let Some(tid) = crate::process::current_task_id() {
            crate::serial_force_println!(
                "\x1b[31;1m[GPF]\x1b[0m USER tid={} rip={:#x} err={:#x}",
                tid,
                stack_frame.instruction_pointer.as_u64(),
                error_code
            );
            crate::silo::handle_user_fault(
                tid,
                crate::silo::SiloFaultReason::GeneralProtection,
                stack_frame.instruction_pointer.as_u64(),
                error_code,
                stack_frame.instruction_pointer.as_u64(),
            );
            return;
        }
    }
    crate::serial_force_println!(
        "\x1b[31;1m[GPF]\x1b[0m KERNEL rip={:#x} err={:#x} cs={:#x} rsp={:#x}",
        stack_frame.instruction_pointer.as_u64(),
        error_code,
        stack_frame.code_segment.0,
        stack_frame.stack_pointer.as_u64()
    );
    panic!("General protection fault");
}
1644
1645/// Performs the stack segment fault handler operation.
1646extern "x86-interrupt" fn stack_segment_fault_handler(
1647    stack_frame: InterruptStackFrame,
1648    error_code: u64,
1649) {
1650    // Use rdmsr-based check: iretq can trigger #SS if the user SS is bad,
1651    // and at that point GS is already swapped to user.
1652    let _gs = SwapGsGuard::new(needs_swapgs(stack_frame.code_segment.0));
1653    crate::serial_force_println!(
1654        "\x1b[31;1m[STACK_FAULT]\x1b[0m rip={:#x} err={:#x} cs={:#x} rsp={:#x}",
1655        stack_frame.instruction_pointer.as_u64(),
1656        error_code,
1657        stack_frame.code_segment.0,
1658        stack_frame.stack_pointer.as_u64()
1659    );
1660    panic!("Stack segment fault");
1661}
1662
/// #DF (Double Fault, vector 8) handler — always fatal.
///
/// Uses IST stack so the handler always runs on a known-good stack, even
/// when RSP0 is corrupt.  We must still do `swapgs` if the fault originated
/// from Ring 3 (or from Ring 0 code that already did `swapgs`, e.g. the
/// `iretq` path in `elf_ring3_trampoline`).
///
/// # Note on divergent handler
/// This handler is `-> !`, so `SwapGsGuard::drop` will never run.  That is
/// fine because we never return to the interrupted context.
extern "x86-interrupt" fn double_fault_handler(
    stack_frame: InterruptStackFrame,
    error_code: u64,
) -> ! {
    // Best-effort swapgs: if GS currently points at user space (address 0)
    // we need to swap to kernel GS so that any code below that touches
    // `gs:[0]` (e.g. via `current_cpu_index`) does not page-fault again.
    // We use a raw read of IA32_GS_BASE via rdmsr to decide.
    //
    // During `elf_ring3_trampoline`, `swapgs` is executed *before* `iretq`.
    // If `iretq` itself faults, `code_segment` is still Ring 0 (0x08) but
    // GS_BASE is already the user value (0).  The normal `cs & 3 == 3` test
    // would miss this case.  Reading the MSR catches it.
    unsafe {
        let lo: u32;
        let hi: u32;
        // rdmsr: ECX selects the MSR, result comes back in EDX:EAX.
        core::arch::asm!(
            "rdmsr",
            in("ecx") 0xC000_0101u32,  // IA32_GS_BASE
            out("eax") lo,
            out("edx") hi,
            options(nostack, preserves_flags),
        );
        let gs_base = (lo as u64) | ((hi as u64) << 32);
        // If GS_BASE is in the low half (user space) or zero, swap to kernel.
        // Kernel per-CPU data always lives in the canonical higher half.
        if gs_base < 0xFFFF_8000_0000_0000 {
            core::arch::asm!("swapgs", options(nostack, preserves_flags));
        }
    }
    crate::serial_force_println!(
        "\x1b[31;1m[DOUBLE_FAULT]\x1b[0m rip={:#x} err={:#x} cs={:#x} rsp={:#x}",
        stack_frame.instruction_pointer.as_u64(),
        error_code,
        stack_frame.code_segment.0,
        stack_frame.stack_pointer.as_u64()
    );
    panic!(
        "EXCEPTION: DOUBLE FAULT (error code: {:#x})\n{:#?}",
        error_code, stack_frame
    );
}
1714
1715// =============================================
1716// Hardware IRQ handlers
1717// =============================================
1718
/// Legacy external timer IRQ handler (PIC/IOAPIC IRQ0 path, vector 0x20).
///
/// When the LAPIC timer is active, we ignore this source to avoid
/// double-ticking, but still acknowledge it so it does not stay in service.
extern "x86-interrupt" fn legacy_timer_handler(stack_frame: InterruptStackFrame) {
    // Restore kernel GS if the timer fired while Ring 3 was running.
    let _gs = SwapGsGuard::new((stack_frame.code_segment.0 & 3) == 3);
    if crate::arch::x86_64::timer::is_apic_timer_active() {
        // Ignore legacy timer source once LAPIC timer is running.
        // Acknowledge on whichever interrupt controller is active.
        if super::apic::is_initialized() {
            super::apic::eoi();
        } else {
            pic::end_of_interrupt(0);
        }
        return;
    }

    // FORCE OUTPUT for heartbeat (every 100 ticks to avoid flooding,
    // plus first 10 ticks to confirm timer fires after Ring-3 entry)
    let ticks = crate::process::scheduler::ticks();
    if ticks < 10 || ticks % 100 == 0 {
        crate::serial_force_println!("[heartbeat] PIC timer tick={}", ticks);
    }

    // Increment tick counter
    crate::process::scheduler::timer_tick();
    // NOTE: avoid complex rendering/allocation work in IRQ context.
    // Status bar refresh is currently done from non-IRQ paths.

    // Send EOI first so the timer can fire again on the new task
    if super::apic::is_initialized() {
        super::apic::eoi();
    } else {
        pic::end_of_interrupt(0);
    }

    // Try to preempt the current task (no-op if scheduler lock is held
    // or no task is running yet)
    crate::process::scheduler::maybe_preempt();
    if ticks < 10 {
        crate::serial_force_println!("[heartbeat] PIC timer tick={} preempt_done", ticks);
    }
}
1761
/// Local APIC timer handler (dedicated vector, e.g. 0xD2).
///
/// Drives the scheduler tick. Preemption from Ring-3-origin ticks is
/// deferred via a resched hint rather than performed inline (see the
/// IMPORTANT note below).
extern "x86-interrupt" fn lapic_timer_handler(stack_frame: InterruptStackFrame) {
    // Restore kernel GS if the timer fired while Ring 3 was running.
    let cs = stack_frame.code_segment.0;
    let _gs = SwapGsGuard::new((cs & 3) == 3);
    let cpu = crate::arch::x86_64::percpu::current_cpu_index();
    let ticks = crate::process::scheduler::ticks();
    // Trace first 10 ticks per CPU unconditionally to confirm timer fires
    // after Ring-3 entry, then one-per-100 heartbeat to avoid flooding.
    if ticks < 10 || ticks % 100 == 0 {
        crate::serial_force_println!(
            "[heartbeat] APIC timer tick={} cpu={} cs={:#x} rip={:#x}",
            ticks,
            cpu,
            cs,
            stack_frame.instruction_pointer.as_u64()
        );
    }

    // serial_force_println holds FORCE_LOCK (IRQ-disabled spinlock) while writing
    // to the UART. At 115200 baud each byte takes ~87 µs; a 60-char message is
    // ~5 ms of IRQs-off time : long enough to miss ticks and corrupt scheduling.
    // Keep serial output out of the hot IRQ path; use e9 port (µs-range) instead.
    // The '0'/'1' markers bracket timer_tick() so a hang inside it is visible.
    unsafe { core::arch::asm!("mov al, '0'; out 0xe9, al", out("al") _) };
    crate::process::scheduler::timer_tick();
    unsafe { core::arch::asm!("mov al, '1'; out 0xe9, al", out("al") _) };
    super::apic::eoi();
    // IMPORTANT:
    // Do not run `maybe_preempt()` directly from a Ring-3-origin timer IRQ.
    //
    // Current scheduler switch path (`do_switch_context` + `ret`) is built for
    // task context frames, while this function is an `extern "x86-interrupt"`
    // frame that the compiler expects to unwind with iretq.
    //
    // On first user-mode preemption (CPU1), switching away from this frame can
    // corrupt the interrupt return state and trigger #DF/#TF. Instead, mark a
    // lock-free resched hint and return through the normal interrupt epilogue.
    // The scheduler will consume the hint on a safe path.
    if (cs & 3) == 3 {
        crate::process::scheduler::request_force_resched_hint(cpu);
        unsafe { core::arch::asm!("mov al, 'P'; out 0xe9, al", out("al") _) };
    } else {
        crate::process::scheduler::maybe_preempt();
    }
}
1807
1808/// PS/2 Mouse IRQ12 handler.
1809extern "x86-interrupt" fn mouse_handler(_stack_frame: InterruptStackFrame) {
1810    crate::arch::x86_64::mouse::handle_irq();
1811    // PS/2 mouse IRQ12 is intentionally kept on the remapped legacy PIC path.
1812    // Even when LAPIC/IOAPIC are active for timer/IPI traffic, this source must
1813    // still be acknowledged via the 8259 PIC.
1814    pic::end_of_interrupt(12);
1815}
1816
1817/// Performs the keyboard handler operation.
1818extern "x86-interrupt" fn keyboard_handler(_stack_frame: InterruptStackFrame) {
1819    let raw = unsafe { super::io::inb(0x60) };
1820    // Port 0x60 is consumed on read: feed the raw scancode directly.
1821    if let Some(ch) = super::keyboard_layout::handle_scancode_raw(raw) {
1822        crate::arch::x86_64::keyboard::add_to_buffer(ch);
1823    }
1824
1825    // PS/2 keyboard IRQ1 is intentionally kept on the remapped legacy PIC path.
1826    // A LAPIC EOI here leaves the PIC request in service and stalls keyboard
1827    // delivery after the first edge.
1828    pic::end_of_interrupt(1);
1829}
1830
/// Spurious interrupt handler (APIC vector 0xFF).
/// Per Intel SDM: do NOT send EOI for spurious interrupts — the LAPIC does
/// not latch a spurious vector into the in-service register.
extern "x86-interrupt" fn spurious_handler(_stack_frame: InterruptStackFrame) {
    // Intentionally empty : no EOI per Intel SDM
}
1836
1837/// AHCI storage controller IRQ handler.
1838///
1839/// Reads `HBA_IS`, processes per-port completions, wakes waiting tasks, then
1840/// sends EOI.  Must not call any function that may block or allocate.
1841extern "x86-interrupt" fn ahci_handler(_stack_frame: InterruptStackFrame) {
1842    crate::hardware::storage::ahci::handle_interrupt();
1843
1844    if super::apic::is_initialized() {
1845        super::apic::eoi();
1846    } else {
1847        let irq = crate::hardware::storage::ahci::AHCI_IRQ_LINE
1848            .load(core::sync::atomic::Ordering::Relaxed);
1849        pic::end_of_interrupt(irq);
1850    }
1851}
1852
1853/// VirtIO Block device IRQ handler
1854///
1855/// Handles interrupts from the VirtIO block device.
1856/// The IRQ line is determined at runtime from PCI config.
1857extern "x86-interrupt" fn virtio_block_handler(_stack_frame: InterruptStackFrame) {
1858    // Handle the VirtIO block interrupt
1859    crate::hardware::storage::virtio_block::handle_interrupt();
1860
1861    // Send EOI
1862    if super::apic::is_initialized() {
1863        super::apic::eoi();
1864    } else {
1865        // Get the IRQ number from the device
1866        let irq = crate::hardware::storage::virtio_block::get_irq();
1867        pic::end_of_interrupt(irq);
1868    }
1869}
1870
1871/// xHCI USB controller IRQ handler
1872///
1873/// Handles interrupts from the xHCI host controller.
1874/// Processes event ring completions for control transfers and HID reports.
1875extern "x86-interrupt" fn xhci_handler(_stack_frame: InterruptStackFrame) {
1876    crate::hardware::usb::xhci::handle_interrupt();
1877
1878    if super::apic::is_initialized() {
1879        super::apic::eoi();
1880    } else {
1881        let irq =
1882            crate::hardware::usb::xhci::XHCI_IRQ_LINE.load(core::sync::atomic::Ordering::Relaxed);
1883        pic::end_of_interrupt(irq);
1884    }
1885}
1886
1887/// Cross-CPU reschedule IPI handler (vector 0xF0).
1888///
1889/// Sent by another CPU (via `apic::send_resched_ipi`) to request that this
1890/// CPU preempts its current task immediately rather than waiting for the next
1891/// timer tick. This is used when a task running on this CPU is killed or
1892/// suspended by a different CPU.
1893///
1894/// EOI is sent ***before*** ` maybe_preempt()` so the APIC can accept further
1895/// IPIs before the potentially long context-switch path runs.
1896extern "x86-interrupt" fn resched_ipi_handler(stack_frame: InterruptStackFrame) {
1897    // Restore kernel GS if the IPI arrived while Ring 3 was running.
1898    let _gs = SwapGsGuard::new((stack_frame.code_segment.0 & 3) == 3);
1899    super::apic::eoi();
1900    crate::process::scheduler::maybe_preempt();
1901}
1902
/// Cross-CPU TLB shootdown IPI handler.
///
/// NOTE(review): this doc previously claimed "vector 0xF0", which collides
/// with `resched_ipi_handler`'s documented vector — one of the two is
/// stale; confirm against the IDT registration code.
extern "x86-interrupt" fn tlb_shootdown_handler(stack_frame: InterruptStackFrame) {
    // Restore kernel GS if the IPI arrived while Ring 3 was running.
    let _gs = SwapGsGuard::new((stack_frame.code_segment.0 & 3) == 3);
    // Note: EOI is sent by the architecture-independent handler.
    super::tlb::tlb_shootdown_ipi_handler();
}