
// strat9_kernel/memory/vmalloc.rs

//! VM-backed allocator for large heap objects.
//!
//! Provides virtually contiguous allocations backed by individually allocated
//! physical pages. Unlike the buddy allocator, `vmalloc` does **not** require
//! physically contiguous memory: it maps each page individually into a
//! dedicated kernel virtual memory arena.
//!
//! ## Arena layout
//!
//! ```text
//! VMALLOC_VIRT_START = 0xffffc000_0000_0000  (256 GiB boundary)
//! VMALLOC_SIZE       = 1 GiB
//! VMALLOC_VIRT_END   = 0xffffc000_4000_0000
//! ```
//!
//! This region sits well above the HHDM direct map (starting at
//! 0xffff8000_0000_0000) and kernel code/data.
//!
//! ## Allocation strategy
//!
//! 1. Allocate backing physical pages individually from the frame allocator.
//! 2. Reserve a virtually contiguous range from the vmalloc extent allocator.
//! 3. Map each page into the kernel page tables at the virtual address.
//!
//! ## Deallocation
//!
//! 1. Unmap each page from the kernel page tables.
//! 2. Flush stale TLB entries across CPUs for the unmapped range.
//! 3. Return the virtual range to the extent allocator.
//! 4. Free each physical page back to the frame allocator.
//!
//! ## Metadata model
//!
//! The allocator no longer uses a fixed allocation-record table or a bitmap.
//! It maintains:
//! - a sorted free-extent list;
//! - a sorted active-allocation list;
//! - a metadata node pool carved from raw buddy pages, independent from the
//!   general kernel heap.
//!
//! This removes the old fixed-slot ceiling and keeps vmalloc bookkeeping from
//! recursing back into heap allocation.
//!
//! ## Thread safety
//!
//! Protected by a single `SpinLock<Vmalloc>`. IRQs are disabled during
//! allocation to prevent deadlock with the buddy allocator.
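//!
//! ## Usage sketch
//!
//! A minimal, illustrative call sequence (a hedged sketch: real callers should
//! go through `crate::memory::allocate_kernel_virtual`, and error handling is
//! elided here):
//!
//! ```ignore
//! crate::sync::with_irqs_disabled(|token| {
//!     // Allocate 64 KiB of virtually contiguous kernel memory.
//!     let ptr = vmalloc(64 * 1024, token).expect("vmalloc failed");
//!     // ... use the mapping ...
//!     // Returns true once the range is unmapped and its frames are freed.
//!     assert!(vfree(ptr, token));
//! });
//! ```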

use crate::{
    arch::x86_64::tlb::shootdown_range,
    memory::{
        frame::PhysFrame,
        paging::{map_page_kernel, unmap_page_kernel},
        phys_to_virt,
    },
    serial_println,
    sync::{IrqDisabledToken, SpinLock},
};
use core::{
    mem::size_of,
    panic::Location,
    ptr,
    sync::atomic::{AtomicU64, Ordering as AtomicOrdering},
};
use x86_64::{
    structures::paging::{Page, PageTableFlags, PhysFrame as X86PhysFrame},
    VirtAddr,
};

// Arena constants =====================================================

/// Base virtual address of the vmalloc arena.
/// Placed at 0xffffc000_0000_0000: well above the HHDM direct map.
pub const VMALLOC_VIRT_START: u64 = 0xffff_c000_0000_0000;

/// Total size of the vmalloc arena: 1 GiB.
pub const VMALLOC_SIZE: usize = 1024 * 1024 * 1024;

/// End virtual address of the vmalloc arena.
pub const VMALLOC_VIRT_END: u64 = VMALLOC_VIRT_START + VMALLOC_SIZE as u64;

/// Number of pages in the arena.
const VMALLOC_PAGES: usize = VMALLOC_SIZE / 4096;

/// First allocatable page index inside the arena.
///
/// Page 0 (`VMALLOC_VIRT_START`) is permanently mapped to the bootstrap frame
/// allocated in `ensure_kernel_subtree_ready()`.  Keeping that mapping alive
/// anchors the intermediate page-table nodes (PDPT → PD → PT) so they are
/// inherited by every address space cloned after `init()` runs.
/// The free-extent list therefore starts at page 1.
const ARENA_START_PAGE: usize = 1;

/// Maximum single allocation size.
///
/// The backend is now bounded by the actual vmalloc arena size rather than an
/// arbitrary low ceiling.
const VMALLOC_MAX_ALLOC: usize = VMALLOC_SIZE;

struct FrameList {
    ptr: *mut PhysFrame,
    len: usize,
    storage_frame: PhysFrame,
    storage_order: u8,
}

impl FrameList {
    fn new(len: usize, token: &IrqDisabledToken) -> Option<Self> {
        let bytes = len.checked_mul(size_of::<PhysFrame>())?;
        let pages_needed = bytes.saturating_add(4095) / 4096;
        let order = if pages_needed <= 1 {
            0
        } else {
            pages_needed.next_power_of_two().trailing_zeros() as u8
        };
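        // Example: a list needing 3 backing pages rounds next_power_of_two()
        // up to 4 and therefore requests buddy order 2; a single-page request
        // uses order 0 directly.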
        let storage_frame = crate::memory::buddy::alloc(token, order).ok()?;
        let ptr = phys_to_virt(storage_frame.start_address.as_u64()) as *mut PhysFrame;
        Some(Self {
            ptr,
            len,
            storage_frame,
            storage_order: order,
        })
    }

    fn get(&self, index: usize) -> PhysFrame {
        debug_assert!(index < self.len);
        unsafe { *self.ptr.add(index) }
    }

    fn set(&mut self, index: usize, frame: PhysFrame) {
        debug_assert!(index < self.len);
        unsafe { *self.ptr.add(index) = frame };
    }

    fn free_storage(self, token: &IrqDisabledToken) {
        crate::memory::buddy::free(token, self.storage_frame, self.storage_order);
    }
}

// SAFETY: `FrameList` owns a contiguous region of physical memory allocated
// from the buddy allocator. The raw pointer is never aliased: access is
// exclusively mediated through `get`/`set`. Transferring ownership across
// threads is safe because the backing storage frame and all frames stored
// within are plain physical addresses that travel with the struct.
unsafe impl Send for FrameList {}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum VmallocError {
    ZeroSize,
    SizeExceedsPolicy {
        requested: usize,
        max_allowed: usize,
    },
    MetadataAllocationFailed,
    PhysicalMemoryExhausted,
    VirtualRangeExhausted,
    KernelMapFailed,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VmallocFailureSnapshot {
    pub size: usize,
    pub pages: usize,
    pub error: VmallocError,
}

/// Unified metadata node used both for free extents and live allocations.
///
/// `frames == None` means the node describes a free extent.
/// `frames == Some(_)` means the node describes an active allocation.
///
/// `attr` is only meaningful when `frames.is_some()`; it is zero-initialized
/// for free-extent nodes.
struct VmallocNode {
    start_page: usize,
    page_count: usize,
    next: *mut VmallocNode,
    frames: Option<FrameList>,
    /// Attribution captured at allocation time. Used by leak diagnostics.
    attr: VmallocAttr,
}

// SAFETY: access to nodes is serialized by `VMALLOC`; the struct only contains
// plain integers, raw pointers forming intrusive lists, and `FrameList`, which
// is already `Send`.
unsafe impl Send for VmallocNode {}

struct Vmalloc {
    initialized: bool,
    subtree_ready: bool,
    /// True once the initial single-spanning free extent has been inserted.
    /// Replaces the ambiguous `!free_head.is_null() || !alloc_head.is_null()`
    /// guard that read as a free-space check rather than an init-state check.
    arena_initialized: bool,
    /// Bootstrap frame permanently mapped at `VMALLOC_VIRT_START` (arena page 0).
    ///
    /// Keeping this mapping live anchors the intermediate page-table nodes
    /// (PDPT → PD → PT) so they are present in the canonical kernel L4 table
    /// and inherited by every address space created after `init()`.  The frame
    /// must never be freed while the kernel is running.
    bootstrap_frame: Option<PhysFrame>,
    free_head: *mut VmallocNode,
    alloc_head: *mut VmallocNode,
    node_pool_free: *mut VmallocNode,
    alloc_count: usize,
    allocated_pages: usize,
    metadata_pages: usize,
    fail_count: usize,
    last_failure: Option<VmallocFailureSnapshot>,
}

// SAFETY: all access to the intrusive raw-pointer lists in `Vmalloc` is
// serialized by `VMALLOC: SpinLock<Vmalloc>`. The raw pointers point only to
// allocator-owned metadata nodes managed under that lock.
unsafe impl Send for Vmalloc {}

impl Vmalloc {
    const fn new() -> Self {
        Self {
            initialized: false,
            subtree_ready: false,
            arena_initialized: false,
            bootstrap_frame: None,
            free_head: ptr::null_mut(),
            alloc_head: ptr::null_mut(),
            node_pool_free: ptr::null_mut(),
            alloc_count: 0,
            allocated_pages: 0,
            metadata_pages: 0,
            fail_count: 0,
            last_failure: None,
        }
    }

    fn record_failure(&mut self, size: usize, pages: usize, error: VmallocError) -> VmallocError {
        self.fail_count = self.fail_count.saturating_add(1);
        self.last_failure = Some(VmallocFailureSnapshot { size, pages, error });
        error
    }

    unsafe fn refill_node_pool(&mut self, token: &IrqDisabledToken) -> Result<(), VmallocError> {
        let frame = crate::memory::buddy::alloc(token, 0)
            .map_err(|_| self.record_failure(0, 0, VmallocError::MetadataAllocationFailed))?;
        let base = phys_to_virt(frame.start_address.as_u64()) as *mut VmallocNode;
        // Compile-time guarantee that at least one node fits in a page.
        const _: () = assert!(
            core::mem::size_of::<VmallocNode>() < 4096,
            "VmallocNode exceeds one page: refill_node_pool logic must be revised"
        );
        let nodes_per_page = 4096 / size_of::<VmallocNode>();

        for i in 0..nodes_per_page {
            let node = base.add(i);
            ptr::write(
                node,
                VmallocNode {
                    start_page: 0,
                    page_count: 0,
                    next: self.node_pool_free,
                    frames: None,
                    attr: VmallocAttr::default(),
                },
            );
            self.node_pool_free = node;
        }
        self.metadata_pages = self.metadata_pages.saturating_add(1);
        Ok(())
    }

    unsafe fn alloc_node(
        &mut self,
        token: &IrqDisabledToken,
    ) -> Result<*mut VmallocNode, VmallocError> {
        if self.node_pool_free.is_null() {
            self.refill_node_pool(token)?;
        }
        let node = self.node_pool_free;
        self.node_pool_free = (*node).next;
        (*node).next = ptr::null_mut();
        (*node).start_page = 0;
        (*node).page_count = 0;
        (*node).frames = None;
        (*node).attr = VmallocAttr::default();
        Ok(node)
    }

    unsafe fn release_node(&mut self, node: *mut VmallocNode) {
        // Releasing a node that still holds a FrameList would silently drop the
        // physical frames without freeing them: an unrecoverable leak / potential
        // double-free if the frames are later re-allocated.  Free nodes must
        // always have `frames == None` before being returned to the pool.
        debug_assert!(
            (*node).frames.is_none(),
            "release_node: node at {:p} still has live frames (start_page={}): \
             caller must take() frames before releasing",
            node,
            (*node).start_page,
        );
        (*node).frames = None; // belt-and-suspenders in release builds
        (*node).start_page = 0;
        (*node).page_count = 0;
        (*node).next = self.node_pool_free;
        self.node_pool_free = node;
    }

    unsafe fn ensure_arena_ready(&mut self, token: &IrqDisabledToken) -> Result<(), VmallocError> {
        if self.arena_initialized {
            return Ok(());
        }
        let node = self.alloc_node(token)?;
        // Page 0 (VMALLOC_VIRT_START) is reserved for the bootstrap mapping
        // established by `ensure_kernel_subtree_ready()`. The allocatable arena
        // begins at ARENA_START_PAGE to avoid colliding with that frame.
        (*node).start_page = ARENA_START_PAGE;
        (*node).page_count = VMALLOC_PAGES - ARENA_START_PAGE;
        (*node).next = ptr::null_mut();
        (*node).frames = None;
        self.free_head = node;
        self.arena_initialized = true;
        Ok(())
    }

    unsafe fn reserve_range(
        &mut self,
        pages: usize,
        token: &IrqDisabledToken,
    ) -> Result<*mut VmallocNode, VmallocError> {
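        // Best-fit search: pick the smallest free extent that can satisfy the
        // request; an exact-size match terminates the scan early.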
        let mut best_prev = ptr::null_mut();
        let mut best = ptr::null_mut();
        let mut best_size = usize::MAX;

        let mut prev = ptr::null_mut();
        let mut cur = self.free_head;
        while !cur.is_null() {
            if (*cur).page_count >= pages && (*cur).page_count < best_size {
                best = cur;
                best_prev = prev;
                best_size = (*cur).page_count;
                if best_size == pages {
                    break;
                }
            }
            prev = cur;
            cur = (*cur).next;
        }

        if best.is_null() {
            return Err(VmallocError::VirtualRangeExhausted);
        }

        if (*best).page_count == pages {
            let next = (*best).next;
            if best_prev.is_null() {
                self.free_head = next;
            } else {
                (*best_prev).next = next;
            }
            (*best).next = ptr::null_mut();
            return Ok(best);
        }

        let alloc = self.alloc_node(token)?;
        (*alloc).start_page = (*best).start_page;
        (*alloc).page_count = pages;
        (*alloc).next = ptr::null_mut();
        (*alloc).frames = None;

        (*best).start_page = (*best).start_page.saturating_add(pages);
        (*best).page_count = (*best).page_count.saturating_sub(pages);
        Ok(alloc)
    }

    unsafe fn insert_alloc_node(&mut self, node: *mut VmallocNode) {
        let mut prev: *mut VmallocNode = ptr::null_mut();
        let mut cur = self.alloc_head;
        while !cur.is_null() && (*cur).start_page < (*node).start_page {
            prev = cur;
            cur = (*cur).next;
        }
        (*node).next = cur;
        if prev.is_null() {
            self.alloc_head = node;
        } else {
            (*prev).next = node;
        }
    }

    unsafe fn take_alloc_node_by_addr(&mut self, addr: u64) -> *mut VmallocNode {
        let mut prev: *mut VmallocNode = ptr::null_mut();
        let mut cur = self.alloc_head;
        while !cur.is_null() {
            let cur_addr = VMALLOC_VIRT_START + ((*cur).start_page as u64 * 4096);
            if cur_addr == addr {
                let next = (*cur).next;
                if prev.is_null() {
                    self.alloc_head = next;
                } else {
                    (*prev).next = next;
                }
                (*cur).next = ptr::null_mut();
                return cur;
            }
            if cur_addr > addr {
                break;
            }
            prev = cur;
            cur = (*cur).next;
        }
        ptr::null_mut()
    }

    unsafe fn insert_free_node_merge(&mut self, node: *mut VmallocNode) {
        debug_assert!((*node).frames.is_none());

        let mut prev: *mut VmallocNode = ptr::null_mut();
        let mut cur = self.free_head;
        while !cur.is_null() && (*cur).start_page < (*node).start_page {
            prev = cur;
            cur = (*cur).next;
        }

        (*node).next = cur;
        if prev.is_null() {
            self.free_head = node;
        } else {
            (*prev).next = node;
        }

        let mut merged = node;
        if !prev.is_null() && (*prev).start_page + (*prev).page_count == (*node).start_page {
            (*prev).page_count = (*prev).page_count.saturating_add((*node).page_count);
            (*prev).next = (*node).next;
            self.release_node(node);
            merged = prev;
        }

        while !(*merged).next.is_null() {
            let next = (*merged).next;
            if (*merged).start_page + (*merged).page_count != (*next).start_page {
                break;
            }
            (*merged).page_count = (*merged).page_count.saturating_add((*next).page_count);
            (*merged).next = (*next).next;
            self.release_node(next);
        }
    }

    unsafe fn free_extent_count(&self) -> usize {
        let mut count = 0usize;
        let mut cur = self.free_head;
        while !cur.is_null() {
            count = count.saturating_add(1);
            cur = (*cur).next;
        }
        count
    }

    unsafe fn largest_free_extent_pages(&self) -> usize {
        let mut largest = 0usize;
        let mut cur = self.free_head;
        while !cur.is_null() {
            largest = largest.max((*cur).page_count);
            cur = (*cur).next;
        }
        largest
    }

    unsafe fn node_pool_free_count(&self) -> usize {
        let mut count = 0usize;
        let mut cur = self.node_pool_free;
        while !cur.is_null() {
            count = count.saturating_add(1);
            cur = (*cur).next;
        }
        count
    }
}

static VMALLOC: SpinLock<Vmalloc> = SpinLock::new(Vmalloc::new());

/// Counts ZeroSize / policy-limit rejections.
pub static VMALLOC_POLICY_REJECT_COUNT: AtomicU64 = AtomicU64::new(0);

/// Monotonic count of **successful** `vmalloc` calls this boot (next seq = current value).
///
/// Incremented only after mapping succeeds so failed attempts do not consume
/// sequence numbers.  Live nodes store the assigned value in [`VmallocAttr::alloc_seq`].
/// Lower `alloc_seq` still means an earlier successful allocation among live mappings.
pub static VMALLOC_ALLOC_SEQ: AtomicU64 = AtomicU64::new(0);

/// High-watermark of simultaneously allocated pages (updated on every alloc).
pub static VMALLOC_PEAK_PAGES: AtomicU64 = AtomicU64::new(0);

/// Attribution snapshot captured at vmalloc time.
///
/// Stored inside each live allocation node so that `dump_live_allocations()`
/// can attribute each mapping to a task and silo without external state.
///
/// Designed for post-mortem leak analysis:
/// - `alloc_seq` gives ordering (smallest = oldest live alloc).
/// - `pid`/`tid`/`silo_id` identify the requesting workload.
/// - `size` is the **requested** byte count, not the page-rounded value.
#[derive(Clone, Copy, Debug, Default)]
pub struct VmallocAttr {
    /// Scheduler task id (`0` = kernel or pre-scheduler context).
    pub task_id: u64,
    /// PID of the requesting task (`0` = kernel or pre-scheduler context).
    pub pid: u32,
    /// TID of the requesting task.
    pub tid: u32,
    /// Silo that owns the task (`0` = kernel / silo lookup failed / not in silo).
    pub silo_id: u32,
    /// Requested allocation size in bytes (before page-rounding).
    pub size: usize,
    /// Monotonic per-boot sequence number (see [`VMALLOC_ALLOC_SEQ`]).
    pub alloc_seq: u64,
    /// Best-effort callsite file of the allocator request.
    pub caller_file: &'static str,
    /// Best-effort callsite line of the allocator request.
    pub caller_line: u32,
    /// Best-effort callsite column of the allocator request.
    pub caller_column: u32,
}

/// Capture attribution for the calling task, without holding VMALLOC.
///
/// Must be called **before** acquiring the VMALLOC lock, so that SILO_MANAGER
/// is never taken while VMALLOC is held and vmalloc cannot deadlock when it is
/// called from within silo or scheduler code.
///
/// Uses `current_task_clone_try()` (non-blocking) so that vmalloc called
/// from within a scheduler path cannot deadlock on the per-CPU scheduler lock.
fn capture_attr(size: usize, caller: &'static Location<'static>) -> VmallocAttr {
    let (task_id, pid, tid, silo_id) = match crate::process::current_task_clone_try() {
        Some(task) => {
            let task_id = task.id.as_u64();
            let pid = task.pid;
            let tid = task.tid;
            // Non-blocking: if SILO_MANAGER is held by an outer frame on this
            // CPU, we just record silo_id=0 rather than risk a deadlock.
            let silo_id = crate::silo::try_silo_id_for_task(task.id).unwrap_or(0);
            (task_id, pid, tid, silo_id)
        }
        // No scheduler running yet, or per-CPU lock is contended.
        None => (0, 0, 0, 0),
    };

    VmallocAttr {
        task_id,
        pid,
        tid,
        silo_id,
        size,
        alloc_seq: 0,
        caller_file: caller.file(),
        caller_line: caller.line(),
        caller_column: caller.column(),
    }
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum VmallocAllocBackend {
    KernelVirtual,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VmallocLiveAllocationSnapshot {
    pub seq: u64,
    pub task_id: u64,
    pub pid: u32,
    pub tid: u32,
    pub silo_id: u32,
    pub size: usize,
    pub pages: usize,
    pub vaddr: u64,
    pub backend: VmallocAllocBackend,
    pub caller_file: &'static str,
    pub caller_line: u32,
    pub caller_column: u32,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VmallocDiagSnapshot {
    pub arena_start: u64,
    pub arena_end: u64,
    pub alloc_count: usize,
    pub allocated_pages: usize,
    pub free_pages: usize,
    pub peak_pages: u64,
    /// Cumulative successful `vmalloc` calls this boot (matches [`VMALLOC_ALLOC_SEQ`]).
    pub total_seq: u64,
    pub fail_count: usize,
    pub policy_rejects: u64,
    pub free_extent_count: usize,
    pub largest_free_pages: usize,
    pub metadata_pages: usize,
    pub node_pool_free: usize,
    pub last_failure: Option<VmallocFailureSnapshot>,
}

/// Pre-allocate the intermediate page-table nodes (PML4 → PDPT → PD) for the
/// vmalloc virtual address range in the **canonical kernel page table**.
///
/// ## Why this is necessary
///
/// Every new user address space clones `PML4[256..512]` from the kernel L4 at
/// creation time.  If the PDPT/PD nodes for the vmalloc arena do not exist at
/// that point, the new address space inherits `PML4[256] = 0` (not present).
/// Any subsequent kernel access to a vmalloc address in that process's context
/// will fault, because its page-table walk stops at the missing PML4 entry.
///
/// By mapping a page at [`VMALLOC_VIRT_START`] during `init()` and **keeping**
/// that mapping (see `bootstrap_frame`), we force the page-table allocator to
/// create and wire all intermediate nodes.  The leaf mapping anchors the
/// subtree; the allocatable arena begins at page 1 so callers never receive the
/// bootstrap virtual address.
///
/// ## Caller contract
///
/// **Called only from `init()`**, which runs before any user address space is
/// created.  Do **not** call this from `vmalloc()`: the check (`subtree_ready`)
/// would always succeed after boot and would add a gratuitous VMALLOC lock
/// acquire on every allocation hot path.
fn ensure_kernel_subtree_ready(token: &IrqDisabledToken) {
    let mut guard = VMALLOC.lock();
    if guard.subtree_ready {
        return;
    }

    let Ok(frame) = crate::memory::allocate_frame(token) else {
        serial_println!("[vmalloc] bootstrap: failed to allocate bootstrap frame");
        return;
    };

    let page = Page::containing_address(VirtAddr::new(VMALLOC_VIRT_START));
    let x86_frame = X86PhysFrame::containing_address(frame.start_address);
    let flags = PageTableFlags::PRESENT | PageTableFlags::WRITABLE | PageTableFlags::NO_EXECUTE;

    if map_page_kernel(page, x86_frame, flags).is_ok() {
        // Keep the frame permanently mapped at VMALLOC_VIRT_START (arena page 0).
        //
        // We deliberately do NOT unmap here.  The goal is to keep the intermediate
        // page-table nodes (PDPT → PD → PT) alive in the canonical kernel L4 so
        // that every address space cloned after this point inherits them.
        //
        // Previously the code did map + immediate unmap, relying on the fact that
        // `unmap_page_kernel` only removes the leaf PTE and never reclaims empty
        // intermediate tables.  That invariant is not guaranteed to hold forever;
        // anchoring through a live mapping makes the intent explicit and robust.
        //
        // The vmalloc arena consequently starts at ARENA_START_PAGE (page 1) to
        // avoid handing out the bootstrap virtual address to callers.
        guard.bootstrap_frame = Some(frame);
        guard.subtree_ready = true;
    } else {
        // Mapping failed: free the frame; the arena will be unusable.
        crate::memory::free_frame(token, frame);
        serial_println!("[vmalloc] bootstrap: failed to map bootstrap page");
    }
}

fn ensure_init() {
    let mut guard = VMALLOC.lock();
    if guard.initialized {
        return;
    }
    guard.initialized = true;
    serial_println!(
        "[vmalloc] initialized: VA=0x{:x}..0x{:x} ({} pages, {} MiB)",
        VMALLOC_VIRT_START,
        VMALLOC_VIRT_END,
        VMALLOC_PAGES,
        VMALLOC_SIZE / (1024 * 1024)
    );
}

pub fn init() {
    ensure_init();
    crate::sync::with_irqs_disabled(|token| {
        ensure_kernel_subtree_ready(token);
        let mut guard = VMALLOC.lock();
        if let Err(e) = unsafe { guard.ensure_arena_ready(token) } {
            serial_println!(
                "[vmalloc] init: ensure_arena_ready failed ({:?}): vmalloc will retry on first use",
                e
            );
        }
    });
}

pub fn last_failure_snapshot() -> Option<VmallocFailureSnapshot> {
    let guard = VMALLOC.lock();
    guard.last_failure
}

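/// Take a point-in-time snapshot of vmalloc telemetry for shell/debug
/// consumers. Returns `None` until the allocator has been initialized.
///
/// Illustrative use (a hedged sketch; the module path and consumer shown here
/// are assumptions, not something this file defines):
///
/// ```ignore
/// if let Some(d) = crate::memory::vmalloc::diag_snapshot() {
///     serial_println!(
///         "vmalloc: {} live allocs, {} pages, largest free extent {} pages",
///         d.alloc_count,
///         d.allocated_pages,
///         d.largest_free_pages
///     );
/// }
/// ```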
pub fn diag_snapshot() -> Option<VmallocDiagSnapshot> {
    let guard = VMALLOC.lock();
    let vm = &*guard;
    if !vm.initialized {
        return None;
    }

    // NOTE: holds VMALLOC for O(free_extents) traversal of the internal free
    // lists. This is acceptable for shell/debug telemetry, but should not be
    // used on allocator hot paths.
    let (free_extents, largest_free, node_pool_free) = unsafe {
        (
            vm.free_extent_count(),
            vm.largest_free_extent_pages(),
            vm.node_pool_free_count(),
        )
    };

    Some(VmallocDiagSnapshot {
        arena_start: VMALLOC_VIRT_START,
        arena_end: VMALLOC_VIRT_END,
        alloc_count: vm.alloc_count,
        allocated_pages: vm.allocated_pages,
        free_pages: (VMALLOC_PAGES - ARENA_START_PAGE).saturating_sub(vm.allocated_pages),
        peak_pages: VMALLOC_PEAK_PAGES.load(AtomicOrdering::Relaxed),
        total_seq: VMALLOC_ALLOC_SEQ.load(AtomicOrdering::Relaxed),
        fail_count: vm.fail_count,
        policy_rejects: VMALLOC_POLICY_REJECT_COUNT.load(AtomicOrdering::Relaxed),
        free_extent_count: free_extents,
        largest_free_pages: largest_free,
        metadata_pages: vm.metadata_pages,
        node_pool_free,
        last_failure: vm.last_failure,
    })
}

/// Allocate `size` bytes of virtually contiguous kernel memory.
///
/// Prefer [`crate::memory::allocate_kernel_virtual`] over calling this
/// directly.
#[track_caller]
pub(crate) fn vmalloc(size: usize, token: &IrqDisabledToken) -> Result<*mut u8, VmallocError> {
    if size == 0 {
        // Pure policy reject: no allocation attempted, no per-call context
        // worth recording. VMALLOC_POLICY_REJECT_COUNT captures the count.
        VMALLOC_POLICY_REJECT_COUNT.fetch_add(1, AtomicOrdering::Relaxed);
        return Err(VmallocError::ZeroSize);
    }
    if size > VMALLOC_MAX_ALLOC {
        VMALLOC_POLICY_REJECT_COUNT.fetch_add(1, AtomicOrdering::Relaxed);
        return Err(VmallocError::SizeExceedsPolicy {
            requested: size,
            max_allowed: VMALLOC_MAX_ALLOC,
        });
    }

    // Capture attribution before acquiring VMALLOC so that SILO_MANAGER is
    // never taken while VMALLOC is held, and to avoid a re-entrancy deadlock
    // if this vmalloc call originates from within scheduler or silo code.
    let mut attr = capture_attr(size, Location::caller());

    ensure_init();

    let pages = (size + 4095) / 4096;
    let page_flags =
        PageTableFlags::PRESENT | PageTableFlags::WRITABLE | PageTableFlags::NO_EXECUTE;

    // Two-phase lock pattern:
    //
    // Phase A (no VMALLOC lock): allocate all physical frames from the buddy.
    // Phase B (VMALLOC lock held): reserve a virtual range and map the frames.
    //
    // Keeping Phase A outside the lock allows the buddy to run concurrently with
    // other vmalloc/vfree calls.  The trade-off is that another thread may exhaust
    // the virtual arena between A and B, in which case the frames are rolled back
    // and `VirtualRangeExhausted` is returned.  This is acceptable: physical frames
    // are cheap to allocate and free compared to holding a global spinlock across
    // potentially dozens of buddy allocations.
    let mut frames = match FrameList::new(pages, token) {
        Some(frames) => frames,
        None => {
            let mut guard = VMALLOC.lock();
            return Err(guard.record_failure(size, pages, VmallocError::MetadataAllocationFailed));
        }
    };

    for i in 0..pages {
        match crate::memory::allocate_frame(token) {
            Ok(frame) => frames.set(i, frame),
            Err(_) => {
                for j in 0..i {
                    crate::memory::free_frame(token, frames.get(j));
                }
                frames.free_storage(token);
                let mut guard = VMALLOC.lock();
                return Err(guard.record_failure(
                    size,
                    pages,
                    VmallocError::PhysicalMemoryExhausted,
                ));
            }
        }
    }

    let mut guard = VMALLOC.lock();
    let vm = &mut *guard;
    unsafe {
        if let Err(error) = vm.ensure_arena_ready(token) {
            for i in 0..pages {
                crate::memory::free_frame(token, frames.get(i));
            }
            frames.free_storage(token);
            return Err(vm.record_failure(size, pages, error));
        }

        let alloc_node = match vm.reserve_range(pages, token) {
            Ok(node) => node,
            Err(error) => {
                for i in 0..pages {
                    crate::memory::free_frame(token, frames.get(i));
                }
                frames.free_storage(token);
                return Err(vm.record_failure(size, pages, error));
            }
        };

        let virt_base = VMALLOC_VIRT_START + ((*alloc_node).start_page as u64 * 4096);

        for i in 0..pages {
            let frame = frames.get(i);
            let page_virt = virt_base + (i as u64 * 4096);
            let page = Page::containing_address(VirtAddr::new(page_virt));
            let x86_frame = X86PhysFrame::containing_address(frame.start_address);
            if map_page_kernel(page, x86_frame, page_flags).is_err() {
                for j in 0..i {
                    let pv = virt_base + (j as u64 * 4096);
                    let pg = Page::containing_address(VirtAddr::new(pv));
                    let _ = unmap_page_kernel(pg);
                }
                (*alloc_node).frames = None;
                vm.insert_free_node_merge(alloc_node);
                for j in 0..pages {
                    crate::memory::free_frame(token, frames.get(j));
                }
                frames.free_storage(token);
                return Err(vm.record_failure(size, pages, VmallocError::KernelMapFailed));
            }
        }

        (*alloc_node).frames = Some(frames);
        attr.alloc_seq = VMALLOC_ALLOC_SEQ.fetch_add(1, AtomicOrdering::Relaxed);
        (*alloc_node).attr = attr;
        vm.insert_alloc_node(alloc_node);
        vm.alloc_count = vm.alloc_count.saturating_add(1);
        vm.allocated_pages = vm.allocated_pages.saturating_add(pages);
        vm.last_failure = None;

        // Update peak-pages high watermark (lock-free, best-effort).
        let current_pages = vm.allocated_pages as u64;
        let mut peak = VMALLOC_PEAK_PAGES.load(AtomicOrdering::Relaxed);
        while current_pages > peak {
            match VMALLOC_PEAK_PAGES.compare_exchange_weak(
                peak,
                current_pages,
                AtomicOrdering::Relaxed,
                AtomicOrdering::Relaxed,
            ) {
                Ok(_) => break,
                Err(p) => peak = p,
            }
        }

        Ok(virt_base as *mut u8)
    }
}

/// Free a vmalloc allocation.
///
/// Structured as three phases to avoid a spinlock-under-IPI deadlock:
///
/// 1. **Under VMALLOC lock**: unmap pages (acquires/releases KERNEL_PT_LOCK per
///    page), collect the frame list, update allocator bookkeeping, return the
///    virtual extent to the free list.
/// 2. **Lock released**: TLB shootdown. Remote CPUs servicing the IPI must
///    acknowledge before returning. If VMALLOC were still held here, any
///    remote CPU blocked on VMALLOC could not reach the acknowledgement path,
///    causing a deadlock. Releasing first eliminates the hazard.
/// 3. **No lock**: free physical frames back to the buddy allocator.
///
/// Returns `true` if a region was released (`free(NULL)` counts as success).
/// Returns `false` if the pointer was non-null but did not denote a live
/// vmalloc mapping in the arena (nothing freed: caller may be leaking).
pub fn vfree(ptr: *mut u8, token: &IrqDisabledToken) -> bool {
    if ptr.is_null() {
        return true;
    }

    let addr = ptr as u64;
    if addr < VMALLOC_VIRT_START || addr >= VMALLOC_VIRT_END {
        return false;
    }

    // Phase 1: unmap and collect under VMALLOC lock.
    let (frames, range_start, range_end) = {
        let mut guard = VMALLOC.lock();
        let vm = &mut *guard;

        unsafe {
            let node = vm.take_alloc_node_by_addr(addr);
            if node.is_null() {
                serial_println!("[vmalloc] vfree: no allocation record for 0x{:x}", addr);
                return false;
            }

            let page_count = (*node).page_count;
            let virt_start = VMALLOC_VIRT_START + ((*node).start_page as u64 * 4096);
            let frames = (*node).frames.take().unwrap();

            for i in 0..page_count {
                let page_start = virt_start + (i as u64 * 4096);
                let page = Page::containing_address(VirtAddr::new(page_start));
                // unmap_page_kernel acquires/releases KERNEL_PT_LOCK internally.
                // Lock order: VMALLOC → KERNEL_PT_LOCK, consistent with vmalloc().
                let _ = unmap_page_kernel(page);
            }

            let range_start = VirtAddr::new(virt_start);
            let range_end = VirtAddr::new(virt_start + (page_count as u64 * 4096));

            vm.alloc_count = vm.alloc_count.saturating_sub(1);
            vm.allocated_pages = vm.allocated_pages.saturating_sub(page_count);
            vm.insert_free_node_merge(node);
            (frames, range_start, range_end)
        }
    }; // Phase 1 end: VMALLOC lock released here.

    // Phase 2: TLB shootdown with no lock held.
    // All remote CPUs can freely enter vmalloc/vfree while processing the IPI.
    shootdown_range(range_start, range_end);

    // Phase 3: return physical frames to the buddy allocator.
    for i in 0..frames.len {
        crate::memory::free_frame(token, frames.get(i));
    }
    frames.free_storage(token);
    true
}

/// Dump all live large allocations with attribution to the serial console.
///
/// Output format (one line per allocation, sorted by `start_page`):
/// ```text
/// [vmalloc][live] seq=N backend=KernelVirtual task=ID pid=P tid=T silo=S size=B pages=N vaddr=0x... caller=FILE:LINE:COL
/// ```
///
/// A `silo=0` entry means the allocation was made by kernel code with no
/// associated silo, or that the silo lookup failed (SILO_MANAGER contended).
///
/// This is the primary tool for leak investigation: run it periodically under
/// a long-lived workload, diff the outputs, and identify growing sequences.
pub fn dump_live_allocations() {
    const MAX_SNAPSHOT: usize = 256;
    let mut snapshot = [VmallocLiveAllocationSnapshot {
        seq: 0,
        task_id: 0,
        pid: 0,
        tid: 0,
        silo_id: 0,
        size: 0,
        pages: 0,
        vaddr: 0,
        backend: VmallocAllocBackend::KernelVirtual,
        caller_file: "",
        caller_line: 0,
        caller_column: 0,
    }; MAX_SNAPSHOT];
    let count = live_allocations_snapshot(&mut snapshot);
    if count == 0 {
        let guard = VMALLOC.lock();
        if !guard.initialized {
            serial_println!("[vmalloc][live] not initialized");
            return;
        }
    }

    let mut total_pages = 0usize;
    for entry in snapshot.iter().take(count) {
        serial_println!(
            "[vmalloc][live] seq={} backend={:?} task={} pid={} tid={} silo={} size={} pages={} vaddr=0x{:x} caller={}:{}:{}",
            entry.seq,
            entry.backend,
            entry.task_id,
            entry.pid,
            entry.tid,
            entry.silo_id,
            entry.size,
            entry.pages,
            entry.vaddr,
            entry.caller_file,
            entry.caller_line,
            entry.caller_column,
        );
        total_pages = total_pages.saturating_add(entry.pages);
    }

    let peak = VMALLOC_PEAK_PAGES.load(AtomicOrdering::Relaxed);
    let guard = VMALLOC.lock();
    let live_count = guard.alloc_count;
    let live_pages = guard.allocated_pages;
    serial_println!(
        "[vmalloc][live] total: {} allocs, {} pages ({} KiB), peak_pages={}",
        live_count,
        live_pages,
        live_pages.saturating_mul(4),
        peak,
    );
    if live_count > count {
        serial_println!(
            "[vmalloc][live] snapshot truncated: {} additional allocations not shown",
            live_count - count,
        );
    }
}

/// Copy live vmalloc allocations into `out`, in allocation-address order.
///
/// Returns the number of entries written. If `out` is too small, the snapshot is
/// truncated; callers can compare the returned length with allocator totals to
/// detect truncation.
pub fn live_allocations_snapshot(out: &mut [VmallocLiveAllocationSnapshot]) -> usize {
    let guard = VMALLOC.lock();
    let vm = &*guard;
    if !vm.initialized {
        return 0;
    }

    let mut count = 0usize;
    let mut cur = vm.alloc_head;
    while !cur.is_null() && count < out.len() {
        let node = unsafe { &*cur };
        out[count] = VmallocLiveAllocationSnapshot {
            seq: node.attr.alloc_seq,
            task_id: node.attr.task_id,
            pid: node.attr.pid,
            tid: node.attr.tid,
            silo_id: node.attr.silo_id,
            size: node.attr.size,
            pages: node.page_count,
            vaddr: VMALLOC_VIRT_START + (node.start_page as u64 * 4096),
            backend: VmallocAllocBackend::KernelVirtual,
            caller_file: node.attr.caller_file,
            caller_line: node.attr.caller_line,
            caller_column: node.attr.caller_column,
        };
        count += 1;
        cur = node.next;
    }
    count
}

/// Returns whether `ptr` is currently the base address of a live vmalloc
/// allocation.
///
/// Returns `None` if the VMALLOC lock is contended. Intended for shell/debug
/// validation, not for allocator hot paths.
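///
/// Illustrative use (a hedged sketch for debug paths):
///
/// ```ignore
/// match is_live_allocation(ptr) {
///     Some(true) => { /* `ptr` is a live vmalloc base; vfree(ptr, token) is valid */ }
///     Some(false) => serial_println!("[debug] 0x{:x} is not a live vmalloc base", ptr as u64),
///     None => serial_println!("[debug] vmalloc lock contended; retry later"),
/// }
/// ```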
pub fn is_live_allocation(ptr: *mut u8) -> Option<bool> {
    let addr = ptr as u64;
    if addr < VMALLOC_VIRT_START || addr >= VMALLOC_VIRT_END {
        return Some(false);
    }

    let guard = VMALLOC.try_lock()?;
    let vm = &*guard;
    if !vm.initialized {
        return Some(false);
    }

    let start_page = ((addr - VMALLOC_VIRT_START) / 4096) as usize;
    let mut cur = vm.alloc_head;
    while !cur.is_null() {
        let node = unsafe { &*cur };
        if node.start_page == start_page {
            return Some(true);
        }
        if node.start_page > start_page {
            break;
        }
        cur = node.next;
    }

    Some(false)
}

/// Dump vmalloc diagnostics to the serial console.
pub fn dump_diagnostics() {
    let guard = VMALLOC.lock();
    let vm = &*guard;
    if !vm.initialized {
        serial_println!("[vmalloc][diag] not initialized");
        return;
    }

    let policy_rejects = VMALLOC_POLICY_REJECT_COUNT.load(AtomicOrdering::Relaxed);
    let peak_pages = VMALLOC_PEAK_PAGES.load(AtomicOrdering::Relaxed);
    let total_seq = VMALLOC_ALLOC_SEQ.load(AtomicOrdering::Relaxed);
    let (free_extents, largest_free, node_pool_free) = unsafe {
        (
            vm.free_extent_count(),
            vm.largest_free_extent_pages(),
            vm.node_pool_free_count(),
        )
    };
    serial_println!(
        "[vmalloc][diag] arena=0x{:x}..0x{:x} allocs={} alloc_pages={} free_pages={} \
         peak_pages={} total_seq={} fails={} policy_rejects={}",
        VMALLOC_VIRT_START,
        VMALLOC_VIRT_END,
        vm.alloc_count,
        vm.allocated_pages,
        (VMALLOC_PAGES - ARENA_START_PAGE).saturating_sub(vm.allocated_pages),
        peak_pages,
        total_seq,
        vm.fail_count,
        policy_rejects
    );
    serial_println!(
        "[vmalloc][diag] extents={} largest_free_pages={} metadata_pages={} node_pool_free={}",
        free_extents,
        largest_free,
        vm.metadata_pages,
        node_pool_free
    );
    if let Some(last) = vm.last_failure {
        serial_println!(
            "[vmalloc][diag] last_failure: size={} pages={} error={:?}",
            last.size,
            last.pages,
            last.error
        );
    }
    // Print live allocations when any are present: useful for routine health checks.
    if vm.alloc_count > 0 {
        drop(guard); // release VMALLOC before re-acquiring inside dump_live_allocations
        dump_live_allocations();
    }
}