// strat9_kernel/arch/x86_64/ring3_diag.rs
//! Pre-IRETQ diagnostics for transitioning from Ring 0 to Ring 3
//!
//! [`validate_ring3_state`] should be invoked **immediately before** the IRETQ
//! trampoline. It checks the four prerequisites required for a safe switch to
//! Ring 3:
//!
//! 1. **GDT** – CS/SS descriptors have DPL=3, P=1 and the code segment has L=1
//!    (64‑bit).
//! 2. **Paging** – every level (PML4 → PDPT → PD → PT) for `target_rip` and
//!    `target_rsp` has the `USER_ACCESSIBLE` flag.
//! 3. **Alignment** – `target_rsp` is 16‑byte aligned (System V ABI requirement).
//! 4. **TSS** – a TSS is loaded (`TR ≠ 0`) and `rsp0` points into kernel space
//!    (≥ 0xffff_8000_0000_0000), ensuring the CPU can switch back on exception.
//!
//! If any check fails the function `panic!`s with a detailed description.
use x86_64::{
    instructions::tables::sgdt, registers::control::Cr3, structures::paging::PageTableFlags,
};
//  GDT descriptor decoding constants

/// Bit 47 : Present.
const DESC_PRESENT_BIT: u64 = 1 << 47;
/// Bits [46:45] : DPL.
const DESC_DPL_SHIFT: u32 = 45;
const DESC_DPL_MASK: u64 = 0x3 << DESC_DPL_SHIFT;
/// Bit 53 : L  Long Mode code segment (64-bit).
const DESC_L_BIT: u64 = 1 << 53;
/// Bit 44 : S  1 = segment code/data, 0 = system descriptor.
const DESC_S_BIT: u64 = 1 << 44;
/// Bit 43 : Executable (type bit for code segments).
const DESC_EXEC_BIT: u64 = 1 << 43;

//  Raw GDT descriptor access

/// Reads an 8-byte GDT descriptor at the given `index` (base 0).
///
/// # Safety
/// `gdt_base` must be the address returned by `sgdt` and `index` must remain
/// within the `limit` of the GDTR.
#[inline]
unsafe fn read_gdt_raw(gdt_base: *const u64, index: usize) -> u64 {
    // SAFETY: bounds check performed by the caller
    unsafe { *gdt_base.add(index) }
}

//  Descriptor field accessors

/// True when the descriptor's Present bit (bit 47) is set.
#[inline]
fn desc_present(raw: u64) -> bool {
    raw & DESC_PRESENT_BIT != 0
}

/// Descriptor Privilege Level, bits [46:45] (0 = kernel … 3 = user).
#[inline]
fn desc_dpl(raw: u64) -> u8 {
    ((raw & DESC_DPL_MASK) >> DESC_DPL_SHIFT) as u8
}

/// True for a code segment: S=1 (code/data) AND E=1 (executable).
#[inline]
fn desc_is_code(raw: u64) -> bool {
    raw & DESC_S_BIT != 0 && raw & DESC_EXEC_BIT != 0
}

/// True when the L bit (64-bit Long Mode code segment) is set.
#[inline]
fn desc_long_mode(raw: u64) -> bool {
    raw & DESC_L_BIT != 0
}
70
71// ===  Recursive page-table walk helpers =======================================
72
73/// Physical address mask inside a page-table entry (bits [51:12]).
74const PHYS_ADDR_MASK: u64 = 0x000F_FFFF_FFFF_F000;
75
76/// Checks that `vaddr` is mapped and USER_ACCESSIBLE at every level of the
77/// page hierarchy (PML4 → PDPT → PD → PT).
78///
79/// Returns `Ok(())` if all levels have `PRESENT | USER_ACCESSIBLE` set.
80/// Otherwise returns `Err(&str)` naming the problematic level.
81///
82/// Handles 1 GiB (PDPT) and 2 MiB (PD) huge pages: stop descending once a
83/// huge-page entry is found with USER_ACCESSIBLE.
84fn check_user_mapping(vaddr: u64) -> Result<(), &'static str> {
85    let hhdm = crate::memory::hhdm_offset();
86
87    // == PML4 ==================================================================
88    let (pml4_frame, _) = Cr3::read();
89    let pml4_phys = pml4_frame.start_address().as_u64();
90    let pml4_ptr = (pml4_phys + hhdm) as *const u64;
91    let pml4_idx = ((vaddr >> 39) & 0x1FF) as usize;
92
93    // SAFETY: pml4_ptr is an HHDM address pointing to the active PML4 (CR3)
94    let pml4_e = unsafe { *pml4_ptr.add(pml4_idx) };
95    let f4 = PageTableFlags::from_bits_truncate(pml4_e);
96
97    if !f4.contains(PageTableFlags::PRESENT) {
98        return Err("PML4: entry not present (PRESENT=0)");
99    }
100    if !f4.contains(PageTableFlags::USER_ACCESSIBLE) {
101        return Err("PML4: bit USER_ACCESSIBLE missing");
102    }
103
104    // == PDPT ==================================================================
105    let pdpt_ptr = ((pml4_e & PHYS_ADDR_MASK) + hhdm) as *const u64;
106    let pdpt_idx = ((vaddr >> 30) & 0x1FF) as usize;
107
108    // SAFETY: physical address extracted from a valid PML4 entry + HHDM offset
109    let pdpt_e = unsafe { *pdpt_ptr.add(pdpt_idx) };
110    let fp = PageTableFlags::from_bits_truncate(pdpt_e);
111
112    if !fp.contains(PageTableFlags::PRESENT) {
113        return Err("PDPT: entrée non présente (PRESENT=0)");
114    }
115    if !fp.contains(PageTableFlags::USER_ACCESSIBLE) {
116        return Err("PDPT: bit USER_ACCESSIBLE missing");
117    }
118    // Huge page 1 GiB no need to descend further
119    if fp.contains(PageTableFlags::HUGE_PAGE) {
120        return Ok(());
121    }
122
123    // == PD ==================================================================
124    let pd_ptr = ((pdpt_e & PHYS_ADDR_MASK) + hhdm) as *const u64;
125    let pd_idx = ((vaddr >> 21) & 0x1FF) as usize;
126
127    // SAFETY: physical address extracted from a valid PDPT entry + HHDM offset
128    let pd_e = unsafe { *pd_ptr.add(pd_idx) };
129    let fd = PageTableFlags::from_bits_truncate(pd_e);
130
131    if !fd.contains(PageTableFlags::PRESENT) {
132        return Err("PD: entry not preset (PRESENT=0)");
133    }
134    if !fd.contains(PageTableFlags::USER_ACCESSIBLE) {
135        return Err("PD: bit USER_ACCESSIBLE missing");
136    }
137    // Huge page 2 Mo pas besoin de descendre plus loin
138    if fd.contains(PageTableFlags::HUGE_PAGE) {
139        return Ok(());
140    }
141
142    // == PT ==================================================================
143    let pt_ptr = ((pd_e & PHYS_ADDR_MASK) + hhdm) as *const u64;
144    let pt_idx = ((vaddr >> 12) & 0x1FF) as usize;
145
146    // SAFETY: physical address extracted from a valid PD entry + HHDM offset
147    let pt_e = unsafe { *pt_ptr.add(pt_idx) };
148    let ft = PageTableFlags::from_bits_truncate(pt_e);
149
150    if !ft.contains(PageTableFlags::PRESENT) {
151        return Err("PT (page 4 KiB): entry not here (PRESENT=0)");
152    }
153    if !ft.contains(PageTableFlags::USER_ACCESSIBLE) {
154        return Err("PT (page 4 KiB): bit USER_ACCESSIBLE missing");
155    }
156
157    Ok(())
158}
159
160//  Main validation routine ===============================================================
161
162/// Validates all CPU preconditions for a safe Ring 3 transition via `iretq`.
163///
164/// # When to call
165/// Immediately **before** the `iretq` trampoline. If this function returns,
166/// all prerequisites are satisfied. If a check fails it `panic!`s with a
167/// detailed diagnosis (bad GDT/paging/TSS, raw descriptor values, addresses
168/// involved).
169///
170/// # Arguments
171/// * `target_rip` – Ring 3 instruction pointer (ELF entry point)
172/// * `target_rsp` – User-stack top (must be 16‑byte aligned)
173/// * `cs`         – Code-segment selector in the IRETQ frame (e.g. `0x2B`)
174/// * `ss`         – Stack-segment selector in the IRETQ frame (e.g. `0x23`)
175///
176/// # Panics
177/// Panics with an explicit message if any of the following is false:
178/// - CS/SS present in GDT with DPL=3, P=1; CS has L=1
179/// - `target_rip` and `target_rsp` are USER_ACCESSIBLE at every page level
180/// - `target_rsp` is 16‑byte aligned
181/// - TSS is loaded and `rsp0` resides in kernel space
182pub fn validate_ring3_state(target_rip: u64, target_rsp: u64, cs: u16, ss: u16) {
183    crate::serial_force_println!(
184        "[validate_ring3] === Begin validation Ring 3 === \
185         RIP={:#x} RSP={:#x} CS={:#x} SS={:#x}",
186        target_rip,
187        target_rsp,
188        cs,
189        ss,
190    );
191
192    // ==================================================================
193    // 1. Verification GDT
194    // ==================================================================
195
196    // SAFETY: `sgdt` only reads the GDTR register : no side effects.
197    let gdtr = sgdt();
198    let gdt_base = gdtr.base.as_u64() as *const u64;
199    let gdt_limit = gdtr.limit as usize; // in bytes, inclusive
200
201    crate::serial_force_println!(
202        "[validate_ring3] GDTR base={:#x} limit={:#x}",
203        gdtr.base.as_u64(),
204        gdt_limit,
205    );
206
207    // Conversion selector → index in the qword table (RPL = bits [1:0],
208    // TI = bit 2 ; the byte index is in bits [15:3]).
209    let cs_byte_offset = (cs & !0x7) as usize; // align to 8 bytes by masking RPL+TI
210    let ss_byte_offset = (ss & !0x7) as usize;
211    let cs_index = cs_byte_offset / 8;
212    let ss_index = ss_byte_offset / 8;
213
214    // Verify that the offsets are within the GDT limits
215    if cs_byte_offset + 7 > gdt_limit {
216        panic!(
217            "[validate_ring3] GDT: CS selector {:#x} (byte offset {}) \
218             exceeds GDTR limit ({:#x}). GDT too small or invalid selector.",
219            cs, cs_byte_offset, gdt_limit,
220        );
221    }
222    if ss_byte_offset + 7 > gdt_limit {
223        panic!(
224            "[validate_ring3] GDT: selector SS {:#x} (byte offset {}) \
225             exceeds GDTR limit ({:#x}). GDT too small or invalid selector.",
226            ss, ss_byte_offset, gdt_limit,
227        );
228    }
229
230    // SAFETY: bounds checked just above; gdtr.base points to the active GDT.
231    let cs_raw = unsafe { read_gdt_raw(gdt_base, cs_index) };
232    let ss_raw = unsafe { read_gdt_raw(gdt_base, ss_index) };
233
234    crate::serial_force_println!(
235        "[validate_ring3] GDT[{}] CS raw={:#018x}  GDT[{}] SS raw={:#018x}",
236        cs_index,
237        cs_raw,
238        ss_index,
239        ss_raw,
240    );
241
242    //  CS : Present ======================================
243    if !desc_present(cs_raw) {
244        panic!(
245            "[validate_ring3] GDT CS {:#x} (index {}): bit P (Present) = 0 ! \
246             The descriptor is marked as absent. raw={:#018x}",
247            cs, cs_index, cs_raw,
248        );
249    }
250
251    //  CS : DPL = 3 =====================================
252    let cs_dpl = desc_dpl(cs_raw);
253    if cs_dpl != 3 {
254        panic!(
255            "[validate_ring3] GDT CS {:#x} (index {}): DPL={} (3 expected). \
256             The descriptor will not allow execution in Ring 3. raw={:#018x}",
257            cs, cs_index, cs_dpl, cs_raw,
258        );
259    }
260
261    //  CS : segment de code ==============================
262    if !desc_is_code(cs_raw) {
263        panic!(
264            "[validate_ring3] GDT CS {:#x} (index {}): this is not a code segment \
265             (S={} E={}). IRETQ with a data selector in CS will cause a #GP. \
266             raw={:#018x}",
267            cs,
268            cs_index,
269            (cs_raw >> 44) & 1,
270            (cs_raw >> 43) & 1,
271            cs_raw,
272        );
273    }
274
275    // CS : Long Mode (L=1 requis in 64-bit) ==================================
276    if !desc_long_mode(cs_raw) {
277        panic!(
278            "[validate_ring3] GDT CS {:#x} (index {}): bit L (Long Mode 64-bit) = 0 ! \
279             In 64-bit mode, all Ring 3 code segments must have L=1. \
280             Without L=1, the CPU switches to 32-bit compatibility mode -> TRIPLE FAULT guaranteed. \
281             raw={:#018x}",
282            cs, cs_index, cs_raw,
283        );
284    }
285
286    //  SS : Present ==========================================
287    if !desc_present(ss_raw) {
288        panic!(
289            "[validate_ring3] GDT SS {:#x} (index {}): bit P (Present) = 0 ! \
290             raw={:#018x}",
291            ss, ss_index, ss_raw,
292        );
293    }
294
295    //  SS : DPL = 3 ==========================
296    let ss_dpl = desc_dpl(ss_raw);
297    if ss_dpl != 3 {
298        panic!(
299            "[validate_ring3] GDT SS {:#x} (index {}): DPL={} (3 attendu). \
300             IRETQ requiert DPL(SS) == RPL(CS) == 3. raw={:#018x}",
301            ss, ss_index, ss_dpl, ss_raw,
302        );
303    }
304
305    crate::serial_force_println!(
306        "[validate_ring3] [1/4] GDT OK : \
307         CS={:#x} P=1 DPL={} L=1 | SS={:#x} P=1 DPL={}",
308        cs,
309        cs_dpl,
310        ss,
311        ss_dpl,
312    );
313
314    // ====================================================
315    // 2. RSP ALIGNMENT CHECK (System V ABI §3.2.2)
316    // ====================================================
317    //
318    // Before a CALL, RSP must be 16‑byte aligned. CALL itself pushes an 8-byte
319    // return address, so the ABI requires RSP ≡ 0 (mod 16) *before* the CALL,
320    // i.e. at the entry point of the callee. In our case we use `iretq` with
321    // no preceding CALL, hence `target_rsp` must already satisfy the alignment
322    // for the libc/crt0 startup code to work.
323    if target_rsp & 0xF != 0 {
324        panic!(
325            "[validate_ring3] RSP={:#x} not 16-byte aligned \
326             (RSP & 0xF = {:#x}). The System V ABI requires RSP ≡ 0 (mod 16) \
327             before calling _start. Align the stack by subtracting {}.",
328            target_rsp,
329            target_rsp & 0xF,
330            target_rsp & 0xF,
331        );
332    }
333
334    crate::serial_force_println!(
335        "[validate_ring3] [2/4] Alignement RSP OK : RSP={:#x} ≡ 0 (mod 16)",
336        target_rsp,
337    );
338
339    // ====================================================
340    // 3. PAGE TABLES VERIFICATION
341    // ====================================================
342    //
343    // We check that RIP and RSP (as well as the page just below RSP,
344    // which will be the first used during a push) are accessible from
345    // Ring 3 at all levels of the paged hierarchy.
346
347    // -- RIP ------------------------------------------------------------------
348    match check_user_mapping(target_rip) {
349        Ok(()) => {
350            crate::serial_force_println!(
351                "[validate_ring3] Pagination RIP OK : {:#x} USER_ACCESSIBLE \
352                 on 4 levels (PML4 -> PDPT -> PD -> PT)",
353                target_rip,
354            );
355        }
356        Err(reason) => {
357            panic!(
358                "[validate_ring3] Pagination RIP {:#x} INVALID : {}. \
359                 The CPU will trigger a #PF immediately after iretq, \
360                 then a TRIPLE FAULT (as rsp0 might also be invalid).",
361                target_rip, reason,
362            );
363        }
364    }
365
366    // -- RSP : current page + previous page (first push crosses the boundary) -
367    //
368    // target_rsp points to the top of the stack; the first instruction
369    // from the user will likely perform a PUSH that subtracts 8 from RSP.
370    // If RSP is exactly at the start of a page (RSP & 0xFFF == 0), this first
371    // PUSH will access the previous page (RSP - 8). We check both.
372    for &probe in &[target_rsp, target_rsp.wrapping_sub(8)] {
373        match check_user_mapping(probe) {
374            Ok(()) => {
375                crate::serial_force_println!(
376                    "[validate_ring3] Pagination RSP OK : page de {:#x} USER_ACCESSIBLE",
377                    probe,
378                );
379            }
380            Err(reason) => {
381                panic!(
382                    "[validate_ring3] Pagination RSP probe {:#x} INVALID : {}. \
383                     The user stack is not accessible from Ring 3.",
384                    probe, reason,
385                );
386            }
387        }
388    }
389
390    crate::serial_force_println!(
391        "[validate_ring3] [3/4] Pagination OK : RIP={:#x} RSP={:#x} USER_ACCESSIBLE",
392        target_rip,
393        target_rsp,
394    );
395
396    // ====================================================
397    // 4. VeRIFICATION TSS
398    // ====================================================
399    //
400    // The CPU automatically loads rsp0 from the TSS when an exception occurs
401    // from Ring 3. If the TSS is not loaded (TR=0) or if rsp0 is invalid,
402    // the CPU will generate a Triple Fault on the first exception.
403
404    // Read the Task Register (TR) via the STR instruction, which gives us the selector of the currently loaded TSS. If TR=0, no TSS is loaded.
405    let tr_sel: u16;
406    // SAFETY: STR is a system register read instruction,
407    // with no side effects and always available in Ring 0.
408    unsafe {
409        core::arch::asm!(
410            "str {0:x}",
411            out(reg) tr_sel,
412            options(nostack, nomem),
413        );
414    }
415
416    if tr_sel == 0 {
417        panic!(
418            "[validate_ring3] TR=0 : no TSS loaded (instruction `ltr` never executed). \
419             Without a TSS, the CPU cannot recover the kernel stack during an exception \
420             from Ring 3 → immediate Triple Fault. Call gdt::init() before this trampoline.",
421        );
422    }
423
424    // Verify rsp0 via our TSS abstraction (TSS::privilege_stack_table[0]) since the CPU would load rsp0 from there on exception.
425    let cpu_index = crate::arch::x86_64::percpu::current_cpu_index();
426    let tss = crate::arch::x86_64::tss::tss_for(cpu_index);
427    let rsp0 = tss.privilege_stack_table[0].as_u64();
428    let loaded_tss = crate::arch::x86_64::tss::loaded_tss_info();
429
430    if rsp0 == 0 {
431        panic!(
432            "[validate_ring3] TSS.rsp0=0 : kernel stack (Ring 0) not configured. \
433             Call tss::set_kernel_stack() with the current thread's stack \
434             before entering Ring 3.",
435        );
436    }
437
438    // The kernel space starts at 0xffff_8000_0000_0000 on x86_64 with the
439    // "high half" canonical address scheme. Any address below this is in
440    // user space and unacceptable as a kernel stack.
441    const KERNEL_VADDR_START: u64 = 0xffff_8000_0000_0000;
442
443    if rsp0 < KERNEL_VADDR_START {
444        panic!(
445            "[validate_ring3] TSS.rsp0={:#x} is a user address (< {:#x}). \
446             The CPU would load a user stack in Ring 0 during an exception, \
447             which allows trivial privilege escalation. \
448             Call tss::set_kernel_stack() with a valid kernel address.",
449            rsp0, KERNEL_VADDR_START,
450        );
451    }
452
453    crate::serial_force_println!(
454        "[validate_ring3] [4/4] TSS OK : TR={:#x} rsp0={:#x} (kernel space, CPU {})",
455        tr_sel,
456        rsp0,
457        cpu_index,
458    );
459    crate::e9_println!(
460        "[validate_ring3] [4/4] TSS OK TR={:#x} rsp0={:#x} cpu={}",
461        tr_sel,
462        rsp0,
463        cpu_index,
464    );
465    if let Some(info) = loaded_tss {
466        crate::serial_force_println!(
467            "[validate_ring3] TSS live : TR={:#x} base={:#x} rsp0={:#x}",
468            info.tr_selector,
469            info.tss_base,
470            info.rsp0,
471        );
472        crate::e9_println!(
473            "[validate_ring3] TSS live TR={:#x} base={:#x} rsp0={:#x}",
474            info.tr_selector,
475            info.tss_base,
476            info.rsp0,
477        );
478        if info.rsp0 != rsp0 {
479            crate::serial_force_println!(
480                "[validate_ring3] TSS MISMATCH : software_rsp0={:#x} live_rsp0={:#x} cpu={}",
481                rsp0,
482                info.rsp0,
483                cpu_index,
484            );
485            crate::e9_println!(
486                "[validate_ring3] TSS MISMATCH sw={:#x} live={:#x} cpu={}",
487                rsp0,
488                info.rsp0,
489                cpu_index,
490            );
491        }
492    }
493
494    for vector in [
495        crate::arch::x86_64::apic::LVT_TIMER_VECTOR,
496        crate::arch::x86_64::apic::IPI_RESCHED_VECTOR,
497        crate::arch::x86_64::idt::irq::MOUSE,
498        0x0e,
499    ] {
500        if let Some(gate) = crate::arch::x86_64::idt::live_gate_info(vector) {
501            crate::serial_force_println!(
502                "[validate_ring3] IDT live vec={:#x} sel={:#x} opts={:#x} off={:#x}",
503                gate.vector,
504                gate.selector,
505                gate.options,
506                gate.offset,
507            );
508            crate::e9_println!(
509                "[validate_ring3] IDT live vec={:#x} sel={:#x} opts={:#x} off={:#x}",
510                gate.vector,
511                gate.selector,
512                gate.options,
513                gate.offset,
514            );
515        }
516    }
517
518    // ====================================================
519    // End of checks
520    // ====================================================
521    crate::serial_force_println!(
522        "[validate_ring3] === ALL RING 3 PREREQUISiTES VALIDATED === \
523         RIP={:#x} RSP={:#x} CS={:#x} SS={:#x} → iretq",
524        target_rip,
525        target_rsp,
526        cs,
527        ss,
528    );
529}