
strat9_kernel/arch/x86_64/tlb.rs

//! TLB (Translation Lookaside Buffer) shootdown for SMP.
//!
//! When a page table entry is modified on one CPU, all other CPUs that might
//! have cached that entry in their TLB must be notified to invalidate it.
//!
//! This implementation uses a per-CPU mailbox system inspired by Asterinas:
//! 1. Each CPU has its own queue of pending TLB operations.
//! 2. The initiator pushes an operation into each target's queue.
//! 3. The initiator sends a TLB shootdown IPI to all targets.
//! 4. The targets process their own queue and set an ACK flag.
//! 5. The initiator waits for all ACK flags to become true.
//!
//! This avoids global lock contention and race conditions on global counters.
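//!
//! A minimal sketch of the intended call pattern (`unmap_page` and the `pte`
//! handle are hypothetical callers, not part of this module):
//!
//! ```ignore
//! // In some unmap_page(vaddr) path, after clearing the PTE locally:
//! pte.set_unused();
//! tlb::shootdown_page(vaddr); // local invlpg + IPI/ACK round-trip to all other CPUs
//! ```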

use core::sync::atomic::{AtomicBool, Ordering};
use x86_64::VirtAddr;

use crate::sync::SpinLock;

/// Type of TLB shootdown operation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TlbShootdownKind {
    /// No pending shootdown.
    None,
    /// Invalidate a single page.
    SinglePage,
    /// Invalidate a range of pages.
    Range,
    /// Flush all TLB entries (full CR3 reload).
    Full,
}

/// A single TLB operation.
#[derive(Debug, Clone, Copy)]
struct TlbOp {
    kind: TlbShootdownKind,
    vaddr_start: u64,
    vaddr_end: u64,
}

impl TlbOp {
    const NONE: Self = Self {
        kind: TlbShootdownKind::None,
        vaddr_start: 0,
        vaddr_end: 0,
    };
}

/// Capacity of each per-CPU mailbox; kept in one place so the queue and the
/// IPI handler's local drain buffer cannot fall out of sync.
const TLB_QUEUE_CAPACITY: usize = 16;

/// Per-CPU queue of pending TLB operations.
struct TlbQueue {
    ops: [TlbOp; TLB_QUEUE_CAPACITY],
    count: usize,
}

impl TlbQueue {
    const fn new() -> Self {
        Self {
            ops: [TlbOp::NONE; TLB_QUEUE_CAPACITY],
            count: 0,
        }
    }

    fn push(&mut self, op: TlbOp) {
        if self.count < TLB_QUEUE_CAPACITY {
            self.ops[self.count] = op;
            self.count += 1;
        } else {
            // Queue full: upgrade to a full flush, which subsumes every
            // queued operation.
            self.ops[0] = TlbOp {
                kind: TlbShootdownKind::Full,
                vaddr_start: 0,
                vaddr_end: 0,
            };
            self.count = 1;
        }
    }

    fn clear(&mut self) {
        self.count = 0;
    }
}

/// Page-count threshold above which a ranged shootdown falls back to a full
/// CR3 reload (`shootdown_all`) rather than issuing one `invlpg` per page.
///
/// Rationale: on x86, a single serializing CR3 write flushes all non-global
/// TLB entries at once, while individual `invlpg` instructions have a
/// per-entry cost that grows linearly with the number of pages. Empirically,
/// the crossover point is around 32–64 pages (µarch-dependent); 64 is a
/// conservative upper bound that keeps the fast path below ~256 ns on modern
/// hardware while avoiding spurious full flushes on small vmalloc frees.
///
/// The same constant governs both the sender side (`shootdown_range`,
/// `local_range`) and the receiver side (the IPI handler's `Range` arm) so
/// the two never disagree on what "small range" means.
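///
/// Worked example: with 4 KiB pages, 64 pages is a 256 KiB range
/// (64 × 4096 B = 262144 B), so freeing a 256 KiB region still takes the
/// per-page `invlpg` path, while a 260 KiB (65-page) region gets a single
/// full CR3 reload instead.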
pub const TLB_RANGE_THRESHOLD_PAGES: usize = 64;

/// Global array of per-CPU TLB queues.
static TLB_QUEUES: [SpinLock<TlbQueue>; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { SpinLock::new(TlbQueue::new()) }; crate::arch::x86_64::percpu::MAX_CPUS];

/// Global array of per-CPU acknowledgement flags.
///
/// Initialized to `true` ("nothing pending") so that waiting on a CPU that
/// was never targeted returns immediately.
static TLB_ACKS: [AtomicBool; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicBool::new(true) }; crate::arch::x86_64::percpu::MAX_CPUS];

/// Initialize the TLB shootdown system.
pub fn init() {
    log::debug!(
        "TLB shootdown initialized (vector {:#x})",
        crate::arch::x86_64::apic::IPI_TLB_SHOOTDOWN_VECTOR
    );
}

/// Invalidate a single page on all CPUs.
pub fn shootdown_page(vaddr: VirtAddr) {
    let op = TlbOp {
        kind: TlbShootdownKind::SinglePage,
        vaddr_start: vaddr.as_u64(),
        vaddr_end: vaddr.as_u64() + 4096,
    };

    // Flush the local TLB first; remote CPUs are handled via the mailbox.
    unsafe { invlpg(vaddr) };

    dispatch_op(op);
}

/// Invalidate a single page on the current CPU only.
#[inline]
pub fn local_page(vaddr: VirtAddr) {
    unsafe { invlpg(vaddr) };
}

/// Invalidate a range on the current CPU only.
pub fn local_range(start: VirtAddr, end: VirtAddr) {
    // Defensive: an empty or inverted range gets a full flush rather than
    // underflowing in the u64 arithmetic below.
    if end.as_u64() <= start.as_u64() {
        unsafe { flush_tlb_all() };
        return;
    }

    // Round up so a partially covered trailing page is still invalidated.
    let page_count = (end.as_u64() - start.as_u64()).div_ceil(4096);
    if page_count > TLB_RANGE_THRESHOLD_PAGES as u64 {
        unsafe { flush_tlb_all() };
        return;
    }

    for i in 0..page_count {
        let addr = start + (i * 4096);
        unsafe { invlpg(addr) };
    }
}

/// Invalidate a range of pages on all CPUs.
///
/// Falls back to [`shootdown_all`] when the range exceeds
/// [`TLB_RANGE_THRESHOLD_PAGES`] pages, because a full CR3 reload is cheaper
/// than that many `invlpg` instructions on both the initiating and receiving
/// CPUs.
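///
/// A sketch of typical use from an unmap path (`mapper` and `pages` are
/// hypothetical, assuming page-aligned bounds):
///
/// ```ignore
/// for page in pages {
///     mapper.unmap(page);
/// }
/// // One mailbox/IPI round covers the whole region:
/// tlb::shootdown_range(region_start, region_end);
/// ```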
pub fn shootdown_range(start: VirtAddr, end: VirtAddr) {
    // Guard: end must be strictly after start; promote an invalid range to a
    // full flush rather than underflowing in u64 arithmetic.
    if end.as_u64() <= start.as_u64() {
        log::warn!(
            "TLB shootdown_range: invalid range [{:#x}, {:#x}), using full flush",
            start.as_u64(),
            end.as_u64(),
        );
        shootdown_all();
        return;
    }

    // Round up so a partially covered trailing page is still invalidated.
    let page_count = (end.as_u64() - start.as_u64()).div_ceil(4096);
    if page_count > TLB_RANGE_THRESHOLD_PAGES as u64 {
        shootdown_all();
        return;
    }

    let op = TlbOp {
        kind: TlbShootdownKind::Range,
        vaddr_start: start.as_u64(),
        vaddr_end: end.as_u64(),
    };

    // Flush the local TLB first; remote CPUs are handled via the mailbox.
    for i in 0..page_count {
        let addr = start + (i * 4096);
        unsafe { invlpg(addr) };
    }

    dispatch_op(op);
}

/// Flush all TLB entries on all CPUs.
pub fn shootdown_all() {
    let op = TlbOp {
        kind: TlbShootdownKind::Full,
        vaddr_start: 0,
        vaddr_end: 0,
    };

    // Flush the local TLB first; remote CPUs are handled via the mailbox.
    unsafe { flush_tlb_all() };

    dispatch_op(op);
}

/// Internal helper to dispatch an operation to all other TLB-ready CPUs.
fn dispatch_op(op: TlbOp) {
    // Cannot send IPIs before the APIC is set up.
    if !crate::arch::x86_64::apic::is_initialized() {
        return;
    }

    let mut targets = [0u32; crate::arch::x86_64::percpu::MAX_CPUS];
    let count = collect_tlb_targets(&mut targets);
    if count == 0 {
        return;
    }

    // `queued` tracks only the APIC IDs that were successfully pushed to a
    // mailbox queue. We must not send an IPI to, or wait for an ACK from,
    // an AP whose queue was skipped: doing so would either waste an IPI or
    // wait on an ACK flag that this call never armed.
    let mut queued = [0u32; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut queued_count = 0usize;

    // 1. Push the op to each target's mailbox and clear its ACK.
    for i in 0..count {
        let apic_id = targets[i];
        // `cpu_index_by_apic` can return None if the AP went offline between
        // `collect_tlb_targets` and here; skip silently rather than panicking
        // in an IPI-send path.
        let cpu_idx = match crate::arch::x86_64::percpu::cpu_index_by_apic(apic_id) {
            Some(idx) => idx,
            None => {
                log::warn!(
                    "TLB dispatch: APIC {} not in per-CPU table, skipping",
                    apic_id
                );
                continue;
            }
        };
        let mut queue = TLB_QUEUES[cpu_idx].lock();
        queue.push(op);
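        // Clear the ACK while still holding this queue's lock. That orders
        // the clear before the target can possibly drain the op we just
        // pushed, so the target's eventual `store(true)` always comes after
        // our `store(false)`. Clearing after dropping the lock could race
        // with a concurrent drain and leave the flag false forever, wedging
        // `wait_for_acks` until its timeout.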
        TLB_ACKS[cpu_idx].store(false, Ordering::Release);
        drop(queue);
        // Record as a successfully queued target.
        queued[queued_count] = apic_id;
        queued_count += 1;
    }

    if queued_count == 0 {
        return;
    }

    // 2. Send an IPI only to targets that actually received a queued op.
    for i in 0..queued_count {
        send_tlb_ipi(queued[i]);
    }

    // 3. Wait for ACKs from the same set.
    wait_for_acks(&queued[..queued_count]);
}

/// IPI handler for TLB shootdown (called on the receiving CPU).
pub extern "C" fn tlb_shootdown_ipi_handler() {
    let cpu_idx = current_cpu_index();

    // 1. Take all pending ops from our mailbox, holding the lock only long
    //    enough to copy them out.
    let mut local_ops = [TlbOp::NONE; TLB_QUEUE_CAPACITY];
    let count = {
        let mut queue = TLB_QUEUES[cpu_idx].lock();
        let c = queue.count;
        for i in 0..c {
            local_ops[i] = queue.ops[i];
        }
        queue.clear();
        c
    };

    // 2. Perform the operations.
    for i in 0..count {
        let op = &local_ops[i];
        match op.kind {
            TlbShootdownKind::None => {}
            TlbShootdownKind::SinglePage => {
                unsafe { invlpg(VirtAddr::new(op.vaddr_start)) };
            }
            TlbShootdownKind::Range => {
                let start = op.vaddr_start;
                let end = op.vaddr_end;
                // Guard: a corrupt TlbOp must not underflow in a release build.
                // Also cap at TLB_RANGE_THRESHOLD_PAGES for defense in depth:
                // the sender already guarantees this, but a stale or malformed
                // op must not spend thousands of invlpg cycles on the receiver.
                if end > start {
                    let page_count = (end - start).div_ceil(4096);
                    if page_count > TLB_RANGE_THRESHOLD_PAGES as u64 {
                        unsafe { flush_tlb_all() };
                    } else {
                        for j in 0..page_count {
                            let addr = VirtAddr::new(start + j * 4096);
                            unsafe { invlpg(addr) };
                        }
                    }
                } else {
                    unsafe { flush_tlb_all() };
                }
            }
            TlbShootdownKind::Full => {
                unsafe { flush_tlb_all() };
            }
        }
    }

    // 3. Signal completion.
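    // This Release store pairs with the Acquire load in `wait_for_acks`,
    // ensuring every flush above is ordered before the initiator observes
    // the acknowledgement.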
    TLB_ACKS[cpu_idx].store(true, Ordering::Release);

    // 4. Send EOI.
    crate::arch::x86_64::apic::eoi();
}

/// Invalidate a single TLB entry (local CPU only).
#[inline]
unsafe fn invlpg(vaddr: VirtAddr) {
    core::arch::asm!("invlpg [{}]", in(reg) vaddr.as_u64(), options(nostack, preserves_flags));
}

/// Flush all TLB entries by reloading CR3 (local CPU only).
///
/// Note: a CR3 reload does not flush global (PGE) entries.
#[inline]
unsafe fn flush_tlb_all() {
    use x86_64::registers::control::Cr3;
    let (frame, flags) = Cr3::read();
    Cr3::write(frame, flags);
}

/// Send a TLB-shootdown IPI to the given APIC ID.
fn send_tlb_ipi(target_apic_id: u32) {
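    // ICR low dword: vector in bits 0..=7, fixed delivery mode (bits 8..=10
    // left zero), and bit 14 set (Level = assert), as required for
    // fixed-delivery IPIs.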
    let icr_low = crate::arch::x86_64::apic::IPI_TLB_SHOOTDOWN_VECTOR as u32 | (1 << 14);
    crate::arch::x86_64::apic::send_ipi_raw(target_apic_id, icr_low);
}

/// Collect target APIC IDs (every TLB-ready CPU except the current one) into
/// a pre-allocated buffer. Returns the number of targets written.
fn collect_tlb_targets(targets: &mut [u32]) -> usize {
    let my_cpu = crate::arch::x86_64::percpu::current_cpu_index();
    let mut count = 0;
    for cpu_idx in 0..crate::arch::x86_64::percpu::MAX_CPUS {
        if cpu_idx == my_cpu || !crate::arch::x86_64::percpu::tlb_ready(cpu_idx) {
            continue;
        }
        if let Some(apic_id) = crate::arch::x86_64::percpu::apic_id_by_cpu_index(cpu_idx) {
            if count < targets.len() {
                targets[count] = apic_id;
                count += 1;
            }
        }
    }
    count
}

/// Wait for ACKs from the given APIC IDs, with a bounded spin per CPU.
fn wait_for_acks(targets: &[u32]) {
    // Bounded spin: if an AP wedges or dies mid-shootdown, log and move on
    // rather than hanging the initiator forever.
    const MAX_WAIT_CYCLES: usize = 10_000_000;
    for &apic_id in targets {
        // If the APIC ID is gone (the AP went offline after we sent the IPI),
        // there is nothing to wait for: skip rather than panic in kernel
        // context.
        let cpu_idx = match crate::arch::x86_64::percpu::cpu_index_by_apic(apic_id) {
            Some(idx) => idx,
            None => {
                log::warn!("TLB wait_acks: APIC {} disappeared, skipping", apic_id);
                continue;
            }
        };
        let mut success = false;
        for _ in 0..MAX_WAIT_CYCLES {
            if TLB_ACKS[cpu_idx].load(Ordering::Acquire) {
                success = true;
                break;
            }
            core::hint::spin_loop();
        }
        if !success {
            log::warn!("TLB shootdown timeout on CPU {}", cpu_idx);
        }
    }
}

/// Current CPU index.
fn current_cpu_index() -> usize {
    crate::arch::x86_64::percpu::current_cpu_index()
}