// strat9_kernel/arch/x86_64/smp.rs
//! SMP (Symmetric Multi-Processing) boot for x86_64.
//!
//! Boots Application Processors (APs) using the legacy INIT+SIPI sequence
//! and parks them in an idle loop. Per-CPU data is initialized but no
//! per-CPU scheduler is active yet.

use core::{
    arch::global_asm,
    sync::atomic::{AtomicBool, AtomicUsize, Ordering},
};

use alloc::{vec, vec::Vec};
use x86_64::{
    structures::paging::{Page, PageTableFlags, PhysFrame, Size4KiB},
    PhysAddr, VirtAddr,
};

use crate::{
    acpi::madt,
    arch::x86_64::{apic, idt, io::io_wait, percpu, timer},
    memory,
};
23
/// Physical address where the SMP trampoline is copied.
/// Must stay in sync with the absolute addresses hard-coded in the
/// trampoline assembly (0x8010, 0x8030, 0x8040, 0x8060, 0x80c0) and with
/// the SIPI vector 0x08 used in `send_init_sipi` (0x8000 >> 12).
pub const TRAMPOLINE_PHYS_ADDR: u64 = 0x8000;

/// Number of booted cores (starts at 1 for BSP).
/// Each AP increments this in `smp_main` after its per-CPU init completes.
static BOOTED_CORES: AtomicUsize = AtomicUsize::new(1);
/// Counter for synchronization barriers.
/// Incremented once per CPU inside `rendezvous_barrier`.
static SYNC_BARRIER: AtomicUsize = AtomicUsize::new(0);
/// Target count for the rendezvous barrier (set by BSP before barrier).
/// Zero doubles as "not yet published": APs spin until it becomes non-zero.
static BARRIER_TARGET: AtomicUsize = AtomicUsize::new(0);
/// Gate used by BSP to release APs into scheduler/timer start.
static AP_SCHED_GATE_OPEN: AtomicBool = AtomicBool::new(false);
35
// Real-mode -> long-mode trampoline executed by each AP after it receives
// a Startup IPI. `copy_trampoline` copies this blob verbatim to
// TRAMPOLINE_PHYS_ADDR (0x8000); every ljmp/lgdt target below is an
// absolute address computed for that base, so the blob is NOT relocatable.
//
// Two u64 parameters are appended immediately after `smp_trampoline_end`
// (their address is SMP_VAR_ADDR):
//   [SMP_VAR_ADDR + 0] = kernel PML4 physical address (loaded into CR3)
//   [SMP_VAR_ADDR + 8] = pointer to the per-APIC-id stack-top array
//     (indexed with the LAPIC id read via CPUID leaf 1, EBX[31:24])
//
// NOTE(review): the INIT+SIPI boot sequence and return-value contract
// described here previously belong to `init()`, not to this asm block.
#[cfg(target_arch = "x86_64")]
global_asm!(
    r#"
.section .text
.code16

.global smp_trampoline
.global smp_trampoline_end

.set SMP_VAR_ADDR, 0x8000 + (smp_trampoline_end - smp_trampoline)

smp_trampoline:
    cli
    cld
    ljmp 0, 0x8040

.align 16
_gdt_table:
    .long 0, 0
    .long 0x0000ffff, 0x00af9a00 # code 64
    .long 0x0000ffff, 0x00cf9200 # data
    .long 0x0000ffff, 0x00cf9a00 # code 32
_gdt:
    .word _gdt - _gdt_table - 1
    .long 0x8010
    .long 0, 0
.align 64

    xor ax, ax
    mov ds, ax
    lgdt [0x8030]
    mov eax, cr0
    or eax, 1
    mov cr0, eax
    ljmp 24, 0x8060

.align 32
.code32
    mov ax, 16
    mov ds, ax
    mov ss, ax

    # Get Local APIC ID
    mov eax, 1
    cpuid
    shr ebx, 24

    # Set PML4 physical address
    mov eax, [SMP_VAR_ADDR]
    mov cr3, eax

    # Enable PSE, PAE, OSFXSR, OSXMMEXCPT.
    # Do not force OSXSAVE here: some VMs/CPUs may fault before Rust-side
    # feature probing. init_cpu_extensions() enables OSXSAVE conditionally.
    mov eax, cr4
    or eax, 0x630
    mov cr4, eax

    # Enable LME
    mov ecx, 0xc0000080 # EFER
    xor edx, edx
    rdmsr
    or eax, 0x901
    wrmsr

    # Enable paging, write protect, and FPU/SSE
    mov eax, cr0
    and eax, 0xFFFFFFFB # Clear EM (bit 2)
    or eax, 0x80010002  # Set PG, WP, MP (bit 1)
    mov cr0, eax

    ljmp 8, 0x80c0

.align 32
.code64
    # Setup local stack
    mov rsp, [SMP_VAR_ADDR + 8]
    shl rbx, 3
    add rsp, rbx
    mov rsp, [rsp]

    push 0
    popfq

    movabs rax, offset smp_main
    jmp rax

.align 8
smp_trampoline_end:
"#
);
132
unsafe extern "C" {
    /// Start label of the AP trampoline blob. Used only for its address
    /// (the copy source in `copy_trampoline`); never called as a function.
    fn smp_trampoline();
    /// End label of the trampoline blob; `end - start` gives the copy length.
    /// Like `smp_trampoline`, only the address is meaningful.
    fn smp_trampoline_end();
}
139
140/// Busy-wait for the given number of microseconds (very rough).
141fn udelay(us: u32) {
142    for _ in 0..us {
143        io_wait();
144    }
145}
146
147/// Performs the ensure identity mapping operation.
148fn ensure_identity_mapping(phys_start: u64, length: usize) {
149    let start = phys_start & !0xFFFu64;
150    let end = (phys_start + length as u64 + 0xFFF) & !0xFFFu64;
151    let flags = PageTableFlags::PRESENT | PageTableFlags::WRITABLE;
152
153    let mut addr = start;
154    while addr < end {
155        let virt = VirtAddr::new(addr);
156        if let Some(mapped) = crate::memory::paging::translate(virt) {
157            if mapped.as_u64() != addr {
158                log::warn!(
159                    "SMP: identity map collision at {:#x} -> {:#x}",
160                    addr,
161                    mapped.as_u64()
162                );
163            }
164        } else {
165            let page = Page::<Size4KiB>::containing_address(virt);
166            let frame = PhysFrame::<Size4KiB>::containing_address(PhysAddr::new(addr));
167            if let Err(e) = crate::memory::paging::map_page(page, frame, flags) {
168                log::error!("SMP: failed to identity map {:#x}: {}", addr, e);
169            }
170        }
171        addr += 0x1000;
172    }
173}
174
/// Copy the trampoline blob to `TRAMPOLINE_PHYS_ADDR` and append its two
/// u64 parameters (read by the assembly at `SMP_VAR_ADDR`):
/// the kernel PML4 physical address and a pointer to the per-APIC-id
/// stack-top array.
fn copy_trampoline(cr3_phys: u64, stacks_ptr: *const u64) {
    // Blob length from the start/end linker labels.
    let tramp_len = (smp_trampoline_end as *const u8 as usize)
        .saturating_sub(smp_trampoline as *const u8 as usize);

    // +16 covers the two u64 parameter slots written after the code.
    ensure_identity_mapping(TRAMPOLINE_PHYS_ADDR, tramp_len + 16);

    let tramp_virt = memory::phys_to_virt(TRAMPOLINE_PHYS_ADDR) as *mut u8;

    // SAFETY: trampoline destination is mapped and writable in HHDM.
    unsafe {
        core::ptr::copy_nonoverlapping(smp_trampoline as *const u8, tramp_virt, tramp_len);
        let ptrs = tramp_virt.add(tramp_len) as *mut u64;
        // Volatile so the compiler cannot elide or reorder these stores,
        // which the APs read from the physical address side.
        core::ptr::write_volatile(ptrs, cr3_phys);
        core::ptr::write_volatile(ptrs.add(1), stacks_ptr as u64);
    }
}
192
193/// Performs the wait delivery operation.
194fn wait_delivery() {
195    const DELIVERY_STATUS: u32 = 1 << 12;
196    for _ in 0..1_000_000 {
197        // SAFETY: APIC initialized, ICR low is readable.
198        let val = unsafe { apic::read_reg(apic::REG_ICR_LOW) };
199        if val & DELIVERY_STATUS == 0 {
200            return;
201        }
202        core::hint::spin_loop();
203    }
204    log::warn!("SMP: IPI delivery timeout");
205}
206
/// Write an ICR command targeting `apic_id`, then wait for the APIC to
/// report delivery before returning.
fn send_ipi(apic_id: u32, value: u32) {
    apic::send_ipi_raw(apic_id, value);
    wait_delivery();
}
212
213/// Performs the send init sipi operation.
214fn send_init_sipi(apic_id: u32) {
215    // INIT IPI (assert)
216    send_ipi(apic_id, 0x0000_c500);
217    udelay(10_000);
218
219    // INIT de-assert
220    send_ipi(apic_id, 0x0000_8500);
221    udelay(200);
222
223    // SIPI twice, vector = 0x8 (0x8000 >> 12)
224    for _ in 0..2 {
225        send_ipi(apic_id, 0x0000_0608);
226        udelay(200);
227    }
228}
229
230/// Broadcast a halt command to all other CPUs.
231///
232/// Used during panic to stop the system and prevent log corruption.
233pub fn broadcast_panic_halt() {
234    if !apic::is_initialized() {
235        return;
236    }
237    // Destination Shorthand: 0b11 (All excluding self)
238    // Delivery Mode: 0b100 (NMI)
239    // Level: 1 (Assert)
240    let icr_low = (0b11 << 18) | (0b100 << 8) | (1 << 14);
241    apic::send_ipi_raw(0, icr_low);
242}
243
/// Wait at a synchronization barrier until all expected CPUs arrive.
///
/// Every CPU (BSP + APs) calls this once.  BSP must store the target count
/// in `BARRIER_TARGET` before any CPU enters the barrier (APs additionally
/// spin in `smp_main` until the target is non-zero before calling in).
fn rendezvous_barrier() {
    // Acquire pairs with the BSP's Release store of the target.
    let expected = BARRIER_TARGET.load(Ordering::Acquire);
    // Announce arrival; AcqRel publishes this CPU's prior initialization.
    SYNC_BARRIER.fetch_add(1, Ordering::AcqRel);
    // Spin until every expected CPU has announced itself.
    while SYNC_BARRIER.load(Ordering::Acquire) < expected {
        core::hint::spin_loop();
    }
}
255
256/// Boot Application Processors.
257pub fn init() -> Result<usize, &'static str> {
258    if !apic::is_initialized() {
259        return Err("APIC not initialized");
260    }
261
262    BOOTED_CORES.store(1, Ordering::Release);
263    SYNC_BARRIER.store(0, Ordering::Release);
264    BARRIER_TARGET.store(0, Ordering::Release);
265
266    let madt_info = madt::parse_madt().ok_or("MADT not available")?;
267    let bsp_apic_id = apic::lapic_id();
268
269    if madt_info.local_apic_count <= 1 {
270        log::info!("SMP: single CPU system");
271        return Ok(1);
272    }
273
274    let mut max_apic_id: usize = 0;
275    for i in 0..madt_info.local_apic_count {
276        if let Some(ref entry) = madt_info.local_apics[i] {
277            max_apic_id = max_apic_id.max(entry.apic_id as usize);
278        }
279    }
280
281    let mut stacks: Vec<u64> = vec![0; max_apic_id + 1];
282    let cr3_phys = crate::memory::paging::kernel_l4_phys().as_u64();
283    let mut targets: Vec<u32> = Vec::new();
284    let mut expected: usize = 1;
285
286    for i in 0..madt_info.local_apic_count {
287        let Some(ref entry) = madt_info.local_apics[i] else {
288            continue;
289        };
290
291        let apic_id = entry.apic_id as u32;
292        if apic_id == bsp_apic_id {
293            continue;
294        }
295
296        // Allocate AP kernel stack from the buddy allocator.
297        // boot_alloc is sealed after buddy init (to prevent double-allocation),
298        // so AP stacks must come from buddy.
299        // AP stacks are permanent kernel allocations: we intentionally leak the frame so buddy never reclaims it.
300
301        let stack_size = crate::process::task::Task::DEFAULT_STACK_SIZE;
302        let pages = (stack_size + 4095) / 4096;
303        let order = pages.next_power_of_two().trailing_zeros() as u8;
304        let frame = crate::sync::with_irqs_disabled(|token| {
305            crate::memory::allocate_phys_contiguous(token, order)
306        })
307        .map_err(|_| "SMP: failed to allocate AP stack from buddy")?;
308        let stack_phys = frame.start_address.as_u64();
309        let stack_virt = crate::memory::phys_to_virt(stack_phys);
310
311        // SAFETY: buddy gave us a valid, exclusive physical frame; phys_to_virt maps it.
312        unsafe { core::ptr::write_bytes(stack_virt as *mut u8, 0, stack_size) };
313
314        // Stack grows downward: top = base_virt + size.
315        // The PhysFrame is Copy (no Drop): buddy keeps it marked as allocated
316        // since we never call free_frames : permanent kernel allocation.
317        let stack_top = stack_virt.saturating_add(stack_size as u64);
318
319        if apic_id as usize >= stacks.len() {
320            log::warn!("SMP: APIC id {} out of stack array range", apic_id);
321            continue;
322        }
323
324        stacks[apic_id as usize] = stack_top;
325
326        let cpu_index =
327            percpu::register_cpu(apic_id).ok_or("SMP: exceeded MAX_CPUS for per-CPU data")?;
328        percpu::set_kernel_stack_top(cpu_index, stack_top);
329
330        // We no longer need to push to AP_KERNEL_STACKS as these are static.
331        targets.push(apic_id);
332        expected += 1;
333    }
334
335    copy_trampoline(cr3_phys, stacks.as_ptr());
336
337    for apic_id in targets {
338        send_init_sipi(apic_id);
339    }
340
341    // Do not spin forever if one AP fails very early (e.g. trampoline fault).
342    // Keep booting with available CPUs and report partial bring-up.
343    let mut spins: u64 = 0;
344    const MAX_SPINS: u64 = 200_000_000;
345    crate::e9_println!("BE SMP wait APs expected={}", expected);
346    while BOOTED_CORES.load(Ordering::Acquire) < expected && spins < MAX_SPINS {
347        core::hint::spin_loop();
348        spins = spins.saturating_add(1);
349    }
350    crate::e9_println!(
351        "BF SMP done online={}",
352        BOOTED_CORES.load(Ordering::Acquire)
353    );
354    if BOOTED_CORES.load(Ordering::Acquire) < expected {
355        log::warn!(
356            "SMP: timeout waiting APs (online={} expected={}), continuing",
357            BOOTED_CORES.load(Ordering::Acquire),
358            expected
359        );
360    }
361
362    let online = BOOTED_CORES.load(Ordering::Acquire);
363    log::info!("SMP: {} cores online (expected {})", online, expected);
364
365    // Publish the barrier target so every CPU (BSP + APs) uses the same value.
366    BARRIER_TARGET.store(online, Ordering::Release);
367    // BSP reaches the rendezvous point.
368    rendezvous_barrier();
369
370    Ok(online)
371}
372
/// First Rust function executed on APs after the trampoline.
///
/// Runs on the stack the BSP assigned to this APIC id. Brings up per-core
/// state (IDT, LAPIC, GS base, TSS/GDT, syscall MSRs, CPU extensions),
/// checks in via `BOOTED_CORES`, rendezvouses with the BSP, then waits for
/// the scheduler gate before starting the local timer and entering the
/// per-CPU scheduler. Never returns.
#[unsafe(no_mangle)]
pub extern "C" fn smp_main() -> ! {
    idt::load();

    // Re-initialize Local APIC for this core (per-core registers).
    apic::init_ap();

    let apic_id = apic::lapic_id();
    let cpu_index = match percpu::cpu_index_by_apic(apic_id) {
        Some(idx) => idx,
        None => {
            // If we fall back to 0, this AP would share the BSP's
            // CPU index and double-increment TICK_COUNT 2× timer speed.
            // Parking the core is safer than corrupting shared state.
            log::error!("SMP AP: APIC id {} not registered : halting core", apic_id);
            loop {
                core::hint::spin_loop();
            }
        }
    };

    // Initialize per-CPU GS base.
    crate::arch::x86_64::percpu::init_gs_base(cpu_index);

    // Initialize per-CPU TSS/GDT (now uses O(1) current_cpu_index).
    crate::arch::x86_64::tss::init_cpu(cpu_index);
    crate::arch::x86_64::gdt::init_cpu(cpu_index);

    crate::arch::x86_64::syscall::init();
    crate::arch::x86_64::init_cpu_extensions();

    // Point the TSS at this AP's kernel stack for ring transitions.
    if let Some(stack_top) = percpu::kernel_stack_top(cpu_index) {
        crate::arch::x86_64::tss::set_kernel_stack_for(cpu_index, x86_64::VirtAddr::new(stack_top));
    }

    // Check in: mark this CPU online and bump the global count the BSP polls.
    let _ = percpu::mark_online_by_apic(apic_id);
    BOOTED_CORES.fetch_add(1, Ordering::Release);

    // AP spins until BSP publishes the barrier target, then enters barrier.
    while BARRIER_TARGET.load(Ordering::Acquire) == 0 {
        core::hint::spin_loop();
    }
    rendezvous_barrier();

    crate::serial_println!(
        "[trace][ap] online cpu_index={} entering ap scheduler",
        cpu_index
    );

    // Wait until BSP has finished scheduler initialization.
    while !AP_SCHED_GATE_OPEN.load(Ordering::Acquire) {
        core::hint::spin_loop();
    }

    // Start APIC timer on this CPU (uses cached calibration from BSP).
    timer::start_apic_timer_cached();

    // Start per-CPU scheduler (never returns).
    crate::process::scheduler::schedule_on_cpu(cpu_index)
}
433
/// Return the number of online CPUs (BSP plus successfully booted APs).
pub fn cpu_count() -> usize {
    BOOTED_CORES.load(Ordering::Acquire)
}
438
/// Allow APs to start their local timer and enter the scheduler.
///
/// Called by the BSP once scheduler init is done; the Release store pairs
/// with the Acquire spin on `AP_SCHED_GATE_OPEN` in `smp_main`.
pub fn open_ap_scheduler_gate() {
    AP_SCHED_GATE_OPEN.store(true, Ordering::Release);
}