// strat9_kernel/arch/x86_64/smp.rs

1//! SMP (Symmetric Multi-Processing) boot for x86_64.
2//!
3//! Boots Application Processors (APs) using the legacy INIT+SIPI sequence
4//! and parks them in an idle loop. Per-CPU data is initialized but no
5//! per-CPU scheduler is active yet.
6
7use core::{
8    arch::global_asm,
9    sync::atomic::{AtomicBool, AtomicUsize, Ordering},
10};
11
12use alloc::{vec, vec::Vec};
13use x86_64::{
14    structures::paging::{Page, PageTableFlags, PhysFrame, Size4KiB},
15    PhysAddr, VirtAddr,
16};
17
18use crate::{
19    acpi::madt,
20    arch::x86_64::{apic, idt, io::io_wait, percpu, timer},
21    memory,
22    process::task::KernelStack,
23    sync::SpinLock,
24};
25
/// Physical address where the SMP trampoline is copied.
///
/// Must stay in sync with the SIPI vector (0x08 => 0x8000, see
/// `send_init_sipi`) and the absolute addresses hard-coded in the
/// trampoline assembly.
pub const TRAMPOLINE_PHYS_ADDR: u64 = 0x8000;

/// Number of booted cores (starts at 1 for BSP).
/// Incremented once by each AP in `smp_main`; polled by `init()`.
static BOOTED_CORES: AtomicUsize = AtomicUsize::new(1);
/// Counter for synchronization barriers; each CPU increments it exactly once
/// inside `rendezvous_barrier`.
static SYNC_BARRIER: AtomicUsize = AtomicUsize::new(0);
/// Target count for the rendezvous barrier (set by BSP before barrier).
/// Zero means "not yet published"; APs spin until it becomes non-zero.
static BARRIER_TARGET: AtomicUsize = AtomicUsize::new(0);
/// Gate used by BSP to release APs into scheduler/timer start.
static AP_SCHED_GATE_OPEN: AtomicBool = AtomicBool::new(false);

/// Keep AP kernel stacks alive.
// NOTE(review): nothing in this file pushes into this anymore — `init()`
// deliberately leaks buddy-allocated AP stacks instead. Appears unused;
// confirm no other module touches it before removing.
static AP_KERNEL_STACKS: SpinLock<Vec<KernelStack>> = SpinLock::new(Vec::new());
40
// Real-mode trampoline executed by each AP after receiving a SIPI.
// It walks the AP from 16-bit real mode through protected mode into long
// mode, loads the per-AP stack prepared beforehand, and jumps to `smp_main`.
// (The INIT+SIPI bring-up sequence itself is driven by `init()` below.)
// AP bring-up trampoline blob. It is copied verbatim to physical 0x8000
// (`TRAMPOLINE_PHYS_ADDR`) by `copy_trampoline`, so every absolute address
// below is really an offset into that copy:
//   0x8010 = `_gdt_table`, 0x8030 = `_gdt` descriptor (lgdt operand),
//   0x8040 = 16-bit entry after `.align 64`, 0x8060 = 32-bit entry,
//   0x80c0 = 64-bit entry.
// NOTE(review): these constants are hand-computed from the `.align`
// directives; re-verify all of them if any instruction or directive changes.
//
// Flow: real mode -> lgdt -> protected mode (CR0.PE) -> read CPUID.1 EBX for
// the local APIC id -> load CR3 from SMP_VAR_ADDR -> set CR4/EFER bits ->
// enable paging (long mode) -> index the stack-top array at SMP_VAR_ADDR+8
// by APIC id -> clear RFLAGS -> jump to `smp_main`.
#[cfg(target_arch = "x86_64")]
global_asm!(
    r#"
.section .text
.code16

.global smp_trampoline
.global smp_trampoline_end

.set SMP_VAR_ADDR, 0x8000 + (smp_trampoline_end - smp_trampoline)

smp_trampoline:
    cli
    cld
    ljmp 0, 0x8040

.align 16
_gdt_table:
    .long 0, 0
    .long 0x0000ffff, 0x00af9a00 # code 64
    .long 0x0000ffff, 0x00cf9200 # data
    .long 0x0000ffff, 0x00cf9a00 # code 32
_gdt:
    .word _gdt - _gdt_table - 1
    .long 0x8010
    .long 0, 0
.align 64

    xor ax, ax
    mov ds, ax
    lgdt [0x8030]
    mov eax, cr0
    or eax, 1
    mov cr0, eax
    ljmp 24, 0x8060

.align 32
.code32
    mov ax, 16
    mov ds, ax
    mov ss, ax

    # Get Local APIC ID
    mov eax, 1
    cpuid
    shr ebx, 24

    # Set PML4 physical address
    mov eax, [SMP_VAR_ADDR]
    mov cr3, eax

    # Enable PSE, PAE, OSFXSR, OSXMMEXCPT.
    # Do not force OSXSAVE here: some VMs/CPUs may fault before Rust-side
    # feature probing. init_cpu_extensions() enables OSXSAVE conditionally.
    mov eax, cr4
    or eax, 0x630
    mov cr4, eax

    # Enable LME
    mov ecx, 0xc0000080 # EFER
    xor edx, edx
    rdmsr
    or eax, 0x901
    wrmsr

    # Enable paging, write protect, and FPU/SSE
    mov eax, cr0
    and eax, 0xFFFFFFFB # Clear EM (bit 2)
    or eax, 0x80010002  # Set PG, WP, MP (bit 1)
    mov cr0, eax

    ljmp 8, 0x80c0

.align 32
.code64
    # Setup local stack
    mov rsp, [SMP_VAR_ADDR + 8]
    shl rbx, 3
    add rsp, rbx
    mov rsp, [rsp]

    push 0
    popfq

    movabs rax, offset smp_main
    jmp rax

.align 8
smp_trampoline_end:
"#
);
137
unsafe extern "C" {
    /// Start label of the trampoline blob defined in `global_asm!` above.
    /// Never actually called; only its address is taken (copy source).
    fn smp_trampoline();
    /// End label of the trampoline blob; `end - start` gives the copy length.
    fn smp_trampoline_end();
}
144
145/// Busy-wait for the given number of microseconds (very rough).
146fn udelay(us: u32) {
147    for _ in 0..us {
148        io_wait();
149    }
150}
151
152/// Performs the ensure identity mapping operation.
153fn ensure_identity_mapping(phys_start: u64, length: usize) {
154    let start = phys_start & !0xFFFu64;
155    let end = (phys_start + length as u64 + 0xFFF) & !0xFFFu64;
156    let flags = PageTableFlags::PRESENT | PageTableFlags::WRITABLE;
157
158    let mut addr = start;
159    while addr < end {
160        let virt = VirtAddr::new(addr);
161        if let Some(mapped) = crate::memory::paging::translate(virt) {
162            if mapped.as_u64() != addr {
163                log::warn!(
164                    "SMP: identity map collision at {:#x} -> {:#x}",
165                    addr,
166                    mapped.as_u64()
167                );
168            }
169        } else {
170            let page = Page::<Size4KiB>::containing_address(virt);
171            let frame = PhysFrame::<Size4KiB>::containing_address(PhysAddr::new(addr));
172            if let Err(e) = crate::memory::paging::map_page(page, frame, flags) {
173                log::error!("SMP: failed to identity map {:#x}: {}", addr, e);
174            }
175        }
176        addr += 0x1000;
177    }
178}
179
180/// Performs the copy trampoline operation.
181fn copy_trampoline(cr3_phys: u64, stacks_ptr: *const u64) {
182    let tramp_len = (smp_trampoline_end as *const u8 as usize)
183        .saturating_sub(smp_trampoline as *const u8 as usize);
184
185    ensure_identity_mapping(TRAMPOLINE_PHYS_ADDR, tramp_len + 16);
186
187    let tramp_virt = memory::phys_to_virt(TRAMPOLINE_PHYS_ADDR) as *mut u8;
188
189    // SAFETY: trampoline destination is mapped and writable in HHDM.
190    unsafe {
191        core::ptr::copy_nonoverlapping(smp_trampoline as *const u8, tramp_virt, tramp_len);
192        let ptrs = tramp_virt.add(tramp_len) as *mut u64;
193        core::ptr::write_volatile(ptrs, cr3_phys);
194        core::ptr::write_volatile(ptrs.add(1), stacks_ptr as u64);
195    }
196}
197
198/// Performs the wait delivery operation.
199fn wait_delivery() {
200    const DELIVERY_STATUS: u32 = 1 << 12;
201    for _ in 0..1_000_000 {
202        // SAFETY: APIC initialized, ICR low is readable.
203        let val = unsafe { apic::read_reg(apic::REG_ICR_LOW) };
204        if val & DELIVERY_STATUS == 0 {
205            return;
206        }
207        core::hint::spin_loop();
208    }
209    log::warn!("SMP: IPI delivery timeout");
210}
211
/// Write one raw IPI command for `apic_id` to the ICR, then block until the
/// APIC reports delivery (or `wait_delivery` times out with a warning).
fn send_ipi(apic_id: u32, value: u32) {
    apic::send_ipi_raw(apic_id, value);
    wait_delivery();
}
217
218/// Performs the send init sipi operation.
219fn send_init_sipi(apic_id: u32) {
220    // INIT IPI (assert)
221    send_ipi(apic_id, 0x0000_c500);
222    udelay(10_000);
223
224    // INIT de-assert
225    send_ipi(apic_id, 0x0000_8500);
226    udelay(200);
227
228    // SIPI twice, vector = 0x8 (0x8000 >> 12)
229    for _ in 0..2 {
230        send_ipi(apic_id, 0x0000_0608);
231        udelay(200);
232    }
233}
234
235/// Broadcast a halt command to all other CPUs.
236///
237/// Used during panic to stop the system and prevent log corruption.
238pub fn broadcast_panic_halt() {
239    if !apic::is_initialized() {
240        return;
241    }
242    // Destination Shorthand: 0b11 (All excluding self)
243    // Delivery Mode: 0b100 (NMI)
244    // Level: 1 (Assert)
245    let icr_low = (0b11 << 18) | (0b100 << 8) | (1 << 14);
246    apic::send_ipi_raw(0, icr_low);
247}
248
/// Wait at a synchronization barrier until all expected CPUs arrive.
///
/// Every CPU (BSP + APs) calls this once.  BSP must store the target count
/// in `BARRIER_TARGET` before any CPU enters the barrier.
fn rendezvous_barrier() {
    // Snapshot the target first; Acquire pairs with the BSP's Release store.
    let expected = BARRIER_TARGET.load(Ordering::Acquire);
    // Announce our arrival, then spin until everyone else has arrived too.
    SYNC_BARRIER.fetch_add(1, Ordering::AcqRel);
    while SYNC_BARRIER.load(Ordering::Acquire) < expected {
        core::hint::spin_loop();
    }
}
260
/// Boot Application Processors.
///
/// Copies the real-mode trampoline to `TRAMPOLINE_PHYS_ADDR`, allocates a
/// permanent kernel stack per AP, sends INIT+SIPI to every non-BSP local
/// APIC listed in the MADT, waits (with a spin timeout) for APs to check in,
/// and finally joins them at the rendezvous barrier.
///
/// Returns the number of online CPUs (including the BSP), or an error string
/// when the APIC/MADT prerequisites are missing or an allocation fails.
pub fn init() -> Result<usize, &'static str> {
    if !apic::is_initialized() {
        return Err("APIC not initialized");
    }

    // Reset bring-up state; the BSP counts as the first booted core.
    BOOTED_CORES.store(1, Ordering::Release);
    SYNC_BARRIER.store(0, Ordering::Release);
    BARRIER_TARGET.store(0, Ordering::Release);

    let madt_info = madt::parse_madt().ok_or("MADT not available")?;
    let bsp_apic_id = apic::lapic_id();

    if madt_info.local_apic_count <= 1 {
        log::info!("SMP: single CPU system");
        return Ok(1);
    }

    // The trampoline indexes the stack array by APIC id, so the array must
    // be sized to the highest id present, not to the CPU count.
    let mut max_apic_id: usize = 0;
    for i in 0..madt_info.local_apic_count {
        if let Some(ref entry) = madt_info.local_apics[i] {
            max_apic_id = max_apic_id.max(entry.apic_id as usize);
        }
    }

    // stacks[apic_id] = stack top virtual address (0 = no stack / BSP).
    let mut stacks: Vec<u64> = vec![0; max_apic_id + 1];
    let cr3_phys = crate::memory::paging::kernel_l4_phys().as_u64();
    let mut targets: Vec<u32> = Vec::new();
    // Expected online count starts at 1 for the BSP.
    let mut expected: usize = 1;

    for i in 0..madt_info.local_apic_count {
        let Some(ref entry) = madt_info.local_apics[i] else {
            continue;
        };

        let apic_id = entry.apic_id as u32;
        if apic_id == bsp_apic_id {
            continue;
        }

        // Allocate AP kernel stack from the buddy allocator.
        // boot_alloc is sealed after buddy init (to prevent double-allocation),
        // so AP stacks must come from buddy.
        // AP stacks are permanent kernel allocations: we intentionally leak
        // the frame so buddy never reclaims it.

        let stack_size = crate::process::task::Task::DEFAULT_STACK_SIZE;
        // Round up to whole pages, then to the buddy order that covers them.
        let pages = (stack_size + 4095) / 4096;
        let order = pages.next_power_of_two().trailing_zeros() as u8;
        let frame = crate::sync::with_irqs_disabled(|token| {
            crate::memory::allocate_frames(token, order)
        })
        .map_err(|_| "SMP: failed to allocate AP stack from buddy")?;
        let stack_phys = frame.start_address.as_u64();
        let stack_virt = crate::memory::phys_to_virt(stack_phys);

        // SAFETY: buddy gave us a valid, exclusive physical frame; phys_to_virt maps it.
        unsafe { core::ptr::write_bytes(stack_virt as *mut u8, 0, stack_size) };

        // Stack grows downward: top = base_virt + size.
        // The PhysFrame is Copy (no Drop): buddy keeps it marked as allocated
        // since we never call free_frames — a permanent kernel allocation.
        let stack_top = stack_virt.saturating_add(stack_size as u64);

        // Defensive: should be impossible given max_apic_id sizing above.
        if apic_id as usize >= stacks.len() {
            log::warn!("SMP: APIC id {} out of stack array range", apic_id);
            continue;
        }

        stacks[apic_id as usize] = stack_top;

        let cpu_index =
            percpu::register_cpu(apic_id).ok_or("SMP: exceeded MAX_CPUS for per-CPU data")?;
        percpu::set_kernel_stack_top(cpu_index, stack_top);

        // AP stacks are intentionally leaked, so nothing needs retaining here.
        targets.push(apic_id);
        expected += 1;
    }

    copy_trampoline(cr3_phys, stacks.as_ptr());

    for apic_id in targets {
        send_init_sipi(apic_id);
    }

    // Do not spin forever if one AP fails very early (e.g. trampoline fault).
    // Keep booting with available CPUs and report partial bring-up.
    let mut spins: u64 = 0;
    const MAX_SPINS: u64 = 200_000_000;
    while BOOTED_CORES.load(Ordering::Acquire) < expected && spins < MAX_SPINS {
        core::hint::spin_loop();
        spins = spins.saturating_add(1);
    }
    if BOOTED_CORES.load(Ordering::Acquire) < expected {
        log::warn!(
            "SMP: timeout waiting APs (online={} expected={}), continuing",
            BOOTED_CORES.load(Ordering::Acquire),
            expected
        );
    }

    let online = BOOTED_CORES.load(Ordering::Acquire);
    log::info!("SMP: {} cores online (expected {})", online, expected);

    // Publish the barrier target so every CPU (BSP + APs) uses the same value.
    BARRIER_TARGET.store(online, Ordering::Release);
    // BSP reaches the rendezvous point.
    rendezvous_barrier();

    Ok(online)
}
372
/// First Rust function executed on APs after the trampoline.
///
/// Brings the calling AP fully online in a fixed order: IDT, per-core APIC,
/// per-CPU GS/TSS/GDT/syscall state and CPU extensions; then reports itself
/// booted, rendezvouses with the BSP at the barrier, waits for the scheduler
/// gate, starts the local APIC timer, and enters the per-CPU scheduler.
/// Never returns.
#[unsafe(no_mangle)]
pub extern "C" fn smp_main() -> ! {
    idt::load();

    // Re-initialize Local APIC for this core (per-core registers).
    apic::init_ap();

    let apic_id = apic::lapic_id();
    let cpu_index = match percpu::cpu_index_by_apic(apic_id) {
        Some(idx) => idx,
        None => {
            // If we fell back to 0 here, this AP would share the BSP's
            // CPU index and double-increment TICK_COUNT (2x timer speed),
            // so an unregistered APIC id parks the core instead.
            log::error!("SMP AP: APIC id {} not registered — halting core", apic_id);
            loop {
                core::hint::spin_loop();
            }
        }
    };

    // Initialize per-CPU GS base.
    crate::arch::x86_64::percpu::init_gs_base(cpu_index);

    // Initialize per-CPU TSS/GDT (now uses O(1) current_cpu_index).
    crate::arch::x86_64::tss::init_cpu(cpu_index);
    crate::arch::x86_64::gdt::init_cpu(cpu_index);

    crate::arch::x86_64::syscall::init();
    crate::arch::x86_64::init_cpu_extensions();

    // Record this AP's kernel stack top in its TSS (set up by `init()`).
    if let Some(stack_top) = percpu::kernel_stack_top(cpu_index) {
        crate::arch::x86_64::tss::set_kernel_stack_for(cpu_index, x86_64::VirtAddr::new(stack_top));
    }

    let _ = percpu::mark_online_by_apic(apic_id);
    // Release pairs with the BSP's Acquire polling loop in `init()`.
    BOOTED_CORES.fetch_add(1, Ordering::Release);

    // AP spins until BSP publishes the barrier target, then enters barrier.
    while BARRIER_TARGET.load(Ordering::Acquire) == 0 {
        core::hint::spin_loop();
    }
    rendezvous_barrier();

    crate::serial_println!(
        "[trace][ap] online cpu_index={} entering ap scheduler",
        cpu_index
    );

    // Wait until BSP has finished scheduler initialization.
    while !AP_SCHED_GATE_OPEN.load(Ordering::Acquire) {
        core::hint::spin_loop();
    }

    // Start APIC timer on this CPU (uses cached calibration from BSP).
    timer::start_apic_timer_cached();

    // Start per-CPU scheduler (never returns).
    crate::process::scheduler::schedule_on_cpu(cpu_index)
}
433
/// Return the number of online CPUs.
///
/// Starts at 1 (the BSP) and is incremented by each AP in `smp_main`, so
/// this reflects completed bring-up, not the MADT's CPU count.
pub fn cpu_count() -> usize {
    BOOTED_CORES.load(Ordering::Acquire)
}
438
/// Allow APs to start their local timer and enter the scheduler.
///
/// Called by the BSP once scheduler initialization is complete; releases
/// every AP spinning on `AP_SCHED_GATE_OPEN` in `smp_main`.
pub fn open_ap_scheduler_gate() {
    AP_SCHED_GATE_OPEN.store(true, Ordering::Release);
}