
strat9_kernel/memory/heap.rs

// Heap allocator: slab sub-allocator + VM-backed large-allocation path.
//
// Small allocations (effective size <= 2048 B) come from per-size-class slab
// free lists.  Each slab class draws whole pages from the buddy allocator and
// carves them into fixed-size blocks.  Freed blocks return to the slab free
// list, so the buddy's page counter stabilises after warm-up instead of
// growing on every tiny allocation.
//
// Large allocations (> 2048 B) go through the kernel vmalloc backend:
// virtually contiguous, physically fragmented, and independent from
// high-order physically contiguous buddy blocks.
//
// Lock ordering: SLAB_ALLOC (outer) may call the frame-allocation helpers.
// Those helpers can hit a CPU-local cache (no global buddy lock) or fall back
// to the global buddy lock as needed.

use crate::{memory, sync::SpinLock};
use core::{
    alloc::{GlobalAlloc, Layout},
    ptr,
    sync::atomic::{AtomicUsize, Ordering as AtomicOrdering},
};
use x86_64::PhysAddr;

// ---------------------------------------------------------------------------
// Slab size classes
// ---------------------------------------------------------------------------

/// Slab block sizes chosen to bound internal fragmentation to ~25% worst-case
/// (average ~12%) instead of 50% with pure power-of-two classes.
///
/// The progression follows a roughly 1.25× step above 64 bytes.  Below 64
/// bytes the classes are the powers of two plus their 1.5× midpoints
/// (8, 16, 24, 32, 48, 64), which keeps the class count small while capping
/// absolute waste at a few tens of bytes.
///
/// | Class range    | Step      | Max waste |
/// |----------------|-----------|-----------|
/// | 8 to 64 B      | ×2 / ×1.5 | ≤ 32 B    |
/// | 64 to 256 B    | ~1.25×    | ≤ 64 B    |
/// | 256 to 2048 B  | ~1.25×    | ≤ 512 B   |

const SLAB_SIZES: [usize; 26] = [
    8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384, 448, 512, 640, 768, 896,
    1024, 1280, 1536, 1792, 2048,
];
const NUM_SLABS: usize = SLAB_SIZES.len();
/// Allocations with effective size above this threshold bypass the slab.
const MAX_SLAB_SIZE: usize = 2048;

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum KernelHeapBackend {
    Slab,
    Vmalloc,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum KernelHeapAllocError {
    InvalidLayout,
    /// [`GlobalAlloc`] large path uses vmalloc, which only guarantees 4 KiB alignment.
    AlignmentExceedsKernelPage {
        align: usize,
    },
    SlabRefillFailed {
        effective: usize,
        class_size: usize,
    },
    Vmalloc(memory::vmalloc::VmallocError),
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct KernelHeapFailureSnapshot {
    pub backend: KernelHeapBackend,
    pub requested_size: usize,
    pub align: usize,
    pub effective_size: usize,
    pub error: KernelHeapAllocError,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct SlabDiagSnapshot {
    pub pages_allocated: usize,
    pub pages_reclaimed: usize,
    pub pages_live: usize,
}

#[inline]
pub(crate) fn classify_kernel_heap_backend(layout: Layout) -> KernelHeapBackend {
    let effective = layout.size().max(layout.align());
    if effective <= MAX_SLAB_SIZE {
        KernelHeapBackend::Slab
    } else {
        KernelHeapBackend::Vmalloc
    }
}
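
// Illustrative classification (a sketch with example layouts, not requests
// taken from real call sites).  Because the effective size is
// max(size, align), alignment alone can push a small payload onto the
// vmalloc path:
//
//   size 16,   align 8    -> effective 16   -> Slab
//   size 1500, align 8    -> effective 1500 -> Slab
//   size 4096, align 8    -> effective 4096 -> Vmalloc
//   size 16,   align 4096 -> effective 4096 -> Vmalloc (alignment dominates)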

// =============================================================================
// CRITICAL: slab corruption detection
//
// Set HEAP_POISON_ENABLED to true during debugging of heap-corruption crashes.
// When enabled:
//   - Every block carved by refill() is filled with POISON_BYTE in bytes [8..N-4]
//     and stamped with SLAB_CANARY in the last 4 bytes.
//   - dealloc_block() restores the canary and re-poisons before linking.
//   - alloc_block() verifies poison and canary before handing the block out;
//     a mismatch is logged immediately via serial_println! (non-allocating).
//
// This detects:
//   - Use-after-free: a write to a freed slab block overwrites poison bytes.
//   - Buffer overflow: a write past the end overwrites the canary or the next
//     block's free-list pointer.
//
// Cost: one memset + canary write per alloc/dealloc for slab classes.
// =============================================================================
const HEAP_POISON_ENABLED: bool = true;
/// Byte pattern written to the body of freed slab blocks.
const POISON_BYTE: u8 = 0xDE;
/// Canary word placed at the last 4 bytes of each slab block.
const SLAB_CANARY: u32 = 0xDEAD_BEEF;
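
// Freed-block layout under poisoning (a sketch for the 64-byte class; the
// offsets follow from the poison range [8 .. N-4) and the 4-byte trailing
// canary described above):
//
//   [0 .. 8)    next-free pointer (intra-page free-list link)
//   [8 .. 60)   0xDE poison fill
//   [60 .. 64)  0xDEAD_BEEF canary
//
// The 8-byte class holds only the free-list pointer; the canary requires a
// block size of at least 12 bytes.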

// ---------------------------------------------------------------------------
// Slab page header: embedded at byte 0 of every buddy page used by a class.
// Blocks start at SLAB_HEADER_SIZE rounded up to the class alignment.
// ---------------------------------------------------------------------------

/// Header at the base of each 4 KiB page dedicated to a slab class.
///
/// Page layout:
/// ```text
/// [0 .. SLAB_HEADER_SIZE)                        SlabPageHeader (24 bytes)
/// [round_up(SLAB_HEADER_SIZE, align) .. 4096)    slab blocks, each SLAB_SIZES[ci] bytes
/// ```
///
/// A page lives in `partial_pages[ci]` while `0 < free_count < total_blocks`.
/// It is removed when all blocks are allocated (`free_count == 0`), and is
/// reclaimed to the buddy allocator when it becomes fully empty again
/// (`free_count == total_blocks`).
#[repr(C)]
struct SlabPageHeader {
    /// Next page in the partial list for this class (null = end of list).
    next_partial: *mut SlabPageHeader,
    /// Head of the intra-page free-block chain (null = page is full).
    free_head: *mut u8,
    /// Free blocks currently in this page.
    free_count: u32,
    /// Total blocks this page can hold (constant per class after refill).
    total_blocks: u32,
}

// SAFETY: only accessed under SLAB_ALLOC spinlock.
unsafe impl Send for SlabPageHeader {}
unsafe impl Sync for SlabPageHeader {}

/// Size of the `SlabPageHeader`; blocks begin at this offset rounded up to
/// the class alignment within each slab page.
const SLAB_HEADER_SIZE: usize = core::mem::size_of::<SlabPageHeader>();

// Compile-time invariants.
const _: () = assert!(
    SLAB_HEADER_SIZE == 24,
    "SlabPageHeader size changed: update docs"
);
const _: () = assert!(
    (4096 - SLAB_HEADER_SIZE) / SLAB_SIZES[NUM_SLABS - 1] >= 1,
    "SlabPageHeader too large: largest slab class gets 0 blocks per page"
);

// ---------------------------------------------------------------------------
// SlabState
// ---------------------------------------------------------------------------

/// Per-size-class partial-page lists.
///
/// `partial_pages[ci]` is the head of a singly-linked list of `SlabPageHeader`
/// nodes for class `ci`.  A page enters the list on `refill` and on the first
/// `dealloc` after going full.  It leaves the list when all its blocks are
/// allocated (it silently becomes "full") or when it becomes completely empty
/// (it is then returned to the buddy allocator).
struct SlabState {
    partial_pages: [*mut SlabPageHeader; NUM_SLABS],
}

// SAFETY: protected exclusively through `SLAB_ALLOC: SpinLock<SlabState>`.
unsafe impl Send for SlabState {}
unsafe impl Sync for SlabState {}

impl SlabState {
    const fn new() -> Self {
        SlabState {
            partial_pages: [ptr::null_mut(); NUM_SLABS],
        }
    }

    /// Return the slab class index for `layout`.
    ///
    /// The chosen class must be large enough for the payload and guarantee the
    /// requested alignment for every block carved from that class.
    #[inline]
    fn class_index_for_layout(layout: Layout) -> usize {
        for (i, &s) in SLAB_SIZES.iter().enumerate() {
            if layout.size() <= s && layout.align() <= slab_class_alignment(i) {
                return i;
            }
        }
        unreachable!("class_index_for_layout called for unsupported slab layout")
    }
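
    // Class-selection sketch (illustrative layouts, not taken from real call
    // sites).  The alignment test can skip an otherwise large-enough class,
    // because a class only guarantees the largest power-of-two divisor of its
    // block size:
    //
    //   size 70, align 8  -> 80-byte class (80-byte blocks are 16-byte aligned)
    //   size 24, align 32 -> 32-byte class (24-byte blocks are only 8-byte aligned)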

    /// Allocate one buddy page, write a `SlabPageHeader` at its base, carve
    /// the remaining space into blocks, and prepend the page to `partial_pages[ci]`.
    unsafe fn refill(&mut self, ci: usize, token: &crate::sync::IrqDisabledToken) {
        let slab_size = SLAB_SIZES[ci];
        let slab_align = slab_class_alignment(ci);
        let blocks_offset = (SLAB_HEADER_SIZE + slab_align - 1) & !(slab_align - 1);
        let num_blocks = (4096 - blocks_offset) / slab_size;
        debug_assert!(
            num_blocks >= 1,
            "refill: slab_size {} yields 0 blocks",
            slab_size
        );

        let frame = match memory::allocate_frame(token) {
            Ok(f) => f,
            Err(_) => return, // OOM: alloc_block will see a null partial list and return null
        };
        SLAB_PAGES_ALLOCATED.fetch_add(1, AtomicOrdering::Relaxed);

        let page_virt = super::phys_to_virt(frame.start_address.as_u64()) as *mut u8;

        // Initialise page header at byte 0.
        let header = page_virt as *mut SlabPageHeader;
        (*header).next_partial = ptr::null_mut();
        (*header).free_head = ptr::null_mut();
        (*header).free_count = 0;
        (*header).total_blocks = num_blocks as u32;

        // Carve blocks starting at an alignment-respecting offset, highest index first so
        // the lowest-address block ends up at the head (cosmetic only).
        let blocks_start = page_virt.add(blocks_offset);
        for i in (0..num_blocks).rev() {
            let block = blocks_start.add(i * slab_size);
            debug_assert_eq!(
                (block as usize) & (slab_align - 1),
                0,
                "slab block alignment invariant broken for class {}",
                slab_size
            );
            *(block as *mut *mut u8) = (*header).free_head;
            if HEAP_POISON_ENABLED {
                let end = slab_size.saturating_sub(4);
                for off in 8..end {
                    *block.add(off) = POISON_BYTE;
                }
                if slab_size >= 12 {
                    let cp = block.add(slab_size - 4) as *mut u32;
                    *cp = SLAB_CANARY;
                }
            }
            (*header).free_head = block;
            (*header).free_count += 1;
        }

        // Prepend to partial list.
        (*header).next_partial = self.partial_pages[ci];
        self.partial_pages[ci] = header;
    }

    /// Pop one block from the first partial page for class `ci`.
    /// Calls `refill` when the partial list is empty.  Returns null on OOM.
    unsafe fn alloc_block(&mut self, ci: usize, token: &crate::sync::IrqDisabledToken) -> *mut u8 {
        if self.partial_pages[ci].is_null() {
            self.refill(ci, token);
        }
        let header = self.partial_pages[ci];
        if header.is_null() {
            return ptr::null_mut();
        }

        let block = (*header).free_head;
        debug_assert!(
            !block.is_null(),
            "alloc_block: partial page has null free_head"
        );

        (*header).free_head = *(block as *const *mut u8);
        (*header).free_count -= 1;

        // Remove page from partial list when it is now full (free_count == 0).
        if (*header).free_count == 0 {
            self.partial_pages[ci] = (*header).next_partial;
            (*header).next_partial = ptr::null_mut();
        }

        if HEAP_POISON_ENABLED {
            let slab_size = SLAB_SIZES[ci];
            let end = slab_size.saturating_sub(4);
            let mut bad_off: Option<usize> = None;
            for off in 8..end {
                if *block.add(off) != POISON_BYTE {
                    bad_off = Some(off);
                    break;
                }
            }
            if let Some(off) = bad_off {
                let b0 = *block.add(off);
                let b1 = if off + 1 < slab_size {
                    *block.add(off + 1)
                } else {
                    0
                };
                let b2 = if off + 2 < slab_size {
                    *block.add(off + 2)
                } else {
                    0
                };
                let b3 = if off + 3 < slab_size {
                    *block.add(off + 3)
                } else {
                    0
                };
                crate::serial_println!(
                    "\x1b[1;31m[HEAP] USE-AFTER-FREE: slab[{}] block={:#x} off={} bytes=[{:02x} {:02x} {:02x} {:02x}]\x1b[0m",
                    slab_size,
                    block as u64,
                    off,
                    b0,
                    b1,
                    b2,
                    b3
                );
            }
            if slab_size >= 12 {
                let canary = *(block.add(slab_size - 4) as *const u32);
                if canary != SLAB_CANARY {
                    crate::serial_println!(
                        "\x1b[1;31m[HEAP] CANARY OVERFLOW: slab[{}] block={:#x} expected={:#x} got={:#x}\x1b[0m",
                        slab_size,
                        block as u64,
                        SLAB_CANARY,
                        canary
                    );
                }
            }
        }

        block
    }

    /// Return `ptr` to its slab page and reclaim the page to the buddy
    /// allocator if it becomes fully empty.
    unsafe fn dealloc_block(
        &mut self,
        ptr: *mut u8,
        ci: usize,
        token: &crate::sync::IrqDisabledToken,
    ) {
        let slab_size = SLAB_SIZES[ci];

        if HEAP_POISON_ENABLED {
            if slab_size >= 12 {
                let cp = ptr.add(slab_size - 4) as *mut u32;
                *cp = SLAB_CANARY;
            }
            let end = slab_size.saturating_sub(4);
            for off in 8..end {
                *ptr.add(off) = POISON_BYTE;
            }
        }

        // Locate the page header: round ptr down to 4 KiB boundary.
        let page_base = (ptr as usize) & !0xFFF;
        let header = page_base as *mut SlabPageHeader;

        let was_full = (*header).free_count == 0;

        // Push block onto the page's intra-page free list.
        *(ptr as *mut *mut u8) = (*header).free_head;
        (*header).free_head = ptr;
        (*header).free_count += 1;

        if was_full {
            // Page went full -> partial: re-insert at list head.
            (*header).next_partial = self.partial_pages[ci];
            self.partial_pages[ci] = header;
        }

        // Reclaim fully-empty pages to the buddy allocator.
        if (*header).free_count == (*header).total_blocks {
            self.remove_from_partial(header, ci);
            let phys = super::virt_to_phys(page_base as u64);
            // Zero the header before freeing to catch accidental reuse.
            core::ptr::write_bytes(header as *mut u8, 0, SLAB_HEADER_SIZE);
            let frame = memory::frame::PhysFrame {
                start_address: PhysAddr::new(phys),
            };
            memory::free_frame(token, frame);
            SLAB_PAGES_RECLAIMED.fetch_add(1, AtomicOrdering::Relaxed);
        }
    }

    /// Unlink `page` from `partial_pages[ci]`.  O(n) in partial-list length.
    unsafe fn remove_from_partial(&mut self, page: *mut SlabPageHeader, ci: usize) {
        if self.partial_pages[ci] == page {
            self.partial_pages[ci] = (*page).next_partial;
            (*page).next_partial = ptr::null_mut();
            return;
        }
        let mut cur = self.partial_pages[ci];
        while !cur.is_null() {
            let next = (*cur).next_partial;
            if next == page {
                (*cur).next_partial = (*page).next_partial;
                (*page).next_partial = ptr::null_mut();
                return;
            }
            cur = next;
        }
        debug_assert!(
            false,
            "remove_from_partial: page {:p} not found in class {} list",
            page, ci
        );
    }
}

static SLAB_ALLOC: SpinLock<SlabState> = SpinLock::new(SlabState::new());
static LAST_HEAP_FAILURE: SpinLock<Option<KernelHeapFailureSnapshot>> = SpinLock::new(None);

/// Total buddy pages ever handed to the slab allocator.
static SLAB_PAGES_ALLOCATED: AtomicUsize = AtomicUsize::new(0);
/// Total buddy pages ever returned from the slab allocator (fully-empty reclaim).
static SLAB_PAGES_RECLAIMED: AtomicUsize = AtomicUsize::new(0);

/// Returns the slab lock address for deadlock tracing.
pub fn debug_slab_lock_addr() -> usize {
    &SLAB_ALLOC as *const _ as usize
}

/// Register slab lock for E9 trace (call from init).
pub fn debug_register_slab_trace() {
    crate::sync::debug_set_trace_slab_addr(debug_slab_lock_addr());
}

// ---------------------------------------------------------------------------
// GlobalAlloc implementation
// ---------------------------------------------------------------------------

pub struct LockedHeap;

unsafe impl GlobalAlloc for LockedHeap {
    /// Allocate through the kernel heap policy (slab for small, vmalloc for large).
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        try_alloc_kernel_heap(layout).unwrap_or(ptr::null_mut())
    }

    /// Free a block previously returned by `alloc` with the same `layout`.
    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        let effective = layout.size().max(layout.align());

        match classify_kernel_heap_backend(layout) {
            KernelHeapBackend::Slab => {
                // --- slab path: return block to free list ---
                let ci = SlabState::class_index_for_layout(layout);
                let cpu = crate::arch::x86_64::percpu::current_cpu_index();
                let irq_enabled = crate::arch::x86_64::interrupts_enabled();
                #[cfg(debug_assertions)]
                if !irq_enabled {
                    use core::sync::atomic::{AtomicUsize, Ordering};
                    static HEAP_D_COUNT: AtomicUsize = AtomicUsize::new(0);
                    let n = HEAP_D_COUNT.fetch_add(1, Ordering::Relaxed);
                    if n % 100 == 0 {
                        crate::e9_println!(
                            "HEAP-D cpu={} irq=0 size={} ci={} n={}",
                            cpu,
                            effective,
                            ci,
                            n
                        );
                    }
                }
                // Catch layout mismatches where a vmalloc pointer is freed with
                // a small layout (classify_kernel_heap_backend routes to Slab).
                // This means the caller passed a different layout to dealloc than
                // was used for alloc: a GlobalAlloc contract violation.
                #[cfg(debug_assertions)]
                {
                    let addr = ptr as u64;
                    if addr >= crate::memory::vmalloc::VMALLOC_VIRT_START
                        && addr < crate::memory::vmalloc::VMALLOC_VIRT_END
                    {
                        crate::serial_println!(
                            "[heap][bug] slab dealloc: ptr {:#x} is in vmalloc range: layout mismatch",
                            addr
                        );
                        debug_assert!(
                            false,
                            "slab dealloc with vmalloc pointer: alloc/dealloc layout mismatch"
                        );
                    }
                }
                let mut slab = SLAB_ALLOC.lock();
                slab.with_mut_and_token(|s, token| s.dealloc_block(ptr, ci, token));
            }
            KernelHeapBackend::Vmalloc => {
                // vmalloc path: free via the vmalloc arena
                let addr = ptr as u64;
                if addr >= crate::memory::vmalloc::VMALLOC_VIRT_START
                    && addr < crate::memory::vmalloc::VMALLOC_VIRT_END
                {
                    let ok = crate::sync::with_irqs_disabled(|token| {
                        crate::memory::free_kernel_virtual(ptr, token)
                    });
                    if !ok {
                        crate::serial_println!(
                            "[heap][leak] vmalloc free: no live mapping at {:#x} (wrong base or double-free?)",
                            addr
                        );
                    }
                } else {
                    // Pointer is outside the vmalloc arena with a large-allocation
                    // layout: nothing is freed (GlobalAlloc contract violation).
                    crate::serial_println!(
                        "[heap][leak] vmalloc dealloc: ptr {:#x} outside vmalloc arena [{:#x}..{:#x}]",
                        addr,
                        crate::memory::vmalloc::VMALLOC_VIRT_START,
                        crate::memory::vmalloc::VMALLOC_VIRT_END,
                    );
                    #[cfg(debug_assertions)]
                    debug_assert!(
                        false,
                        "vmalloc dealloc with out-of-range pointer: memory leaked"
                    );
                }
            }
        }
    }
}

fn record_heap_failure(
    layout: Layout,
    effective: usize,
    backend: KernelHeapBackend,
    error: KernelHeapAllocError,
) -> KernelHeapAllocError {
    *LAST_HEAP_FAILURE.lock() = Some(KernelHeapFailureSnapshot {
        backend,
        requested_size: layout.size(),
        align: layout.align(),
        effective_size: effective,
        error,
    });
    error
}

pub fn last_heap_failure_snapshot() -> Option<KernelHeapFailureSnapshot> {
    *LAST_HEAP_FAILURE.lock()
}

pub fn slab_diag_snapshot() -> SlabDiagSnapshot {
    let allocated = SLAB_PAGES_ALLOCATED.load(AtomicOrdering::Relaxed);
    let reclaimed = SLAB_PAGES_RECLAIMED.load(AtomicOrdering::Relaxed);
    SlabDiagSnapshot {
        pages_allocated: allocated,
        pages_reclaimed: reclaimed,
        pages_live: allocated.saturating_sub(reclaimed),
    }
}

/// Number of slab size classes.
pub const SLAB_NUM_CLASSES: usize = NUM_SLABS;

/// Block size in bytes for slab class `ci`.
///
/// Panics if `ci >= SLAB_NUM_CLASSES`.
#[inline]
pub fn slab_class_size(ci: usize) -> usize {
    SLAB_SIZES[ci]
}

/// Guaranteed alignment in bytes for slab class `ci`.
///
/// This is the largest power-of-two divisor of the class size.
#[inline]
pub fn slab_class_alignment(ci: usize) -> usize {
    1usize << SLAB_SIZES[ci].trailing_zeros()
}

/// Number of blocks that fit in one buddy page for slab class `ci`.
///
/// Accounts for the `SlabPageHeader` at the base of each page, rounded up to
/// the class alignment.
#[inline]
pub fn slab_blocks_per_page(ci: usize) -> usize {
    let align = slab_class_alignment(ci);
    let blocks_offset = (SLAB_HEADER_SIZE + align - 1) & !(align - 1);
    (4096 - blocks_offset) / SLAB_SIZES[ci]
}
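
// Worked examples for `slab_blocks_per_page` (a sketch; the values follow
// from SLAB_HEADER_SIZE = 24 and the power-of-two class alignment above):
//
//   64 B class   : offset = round_up(24, 64)   = 64   -> (4096 - 64)   / 64   = 63 blocks
//   1024 B class : offset = round_up(24, 1024) = 1024 -> (4096 - 1024) / 1024 = 3 blocks
//   2048 B class : offset = round_up(24, 2048) = 2048 -> (4096 - 2048) / 2048 = 1 block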

/// Returns whether the slab page at `page_base` is currently present in the
/// partial-page list for class `ci`.
///
/// Returns `None` if the slab allocator lock is contended. Intended for
/// shell/debug validation, not for allocator hot paths.
pub fn slab_page_in_partial_list(ci: usize, page_base: u64) -> Option<bool> {
    let mut guard = SLAB_ALLOC.try_lock()?;
    Some(guard.with_mut_and_token(|s, _| unsafe {
        let mut cur = s.partial_pages[ci];
        while !cur.is_null() {
            if cur as u64 == page_base {
                return true;
            }
            cur = (*cur).next_partial;
        }
        false
    }))
}

/// Fallible heap entry point with explicit backend-aware errors.
///
/// Kernel code that can recover from allocation failure should prefer this API
/// over `Box`/`Vec`/`GlobalAlloc`, which eventually route to
/// [`alloc_error_handler`] and remain fatal by language contract.
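///
/// # Example
///
/// A minimal usage sketch (the recovery actions are placeholders, and the
/// example is `ignore`d because it needs a running kernel):
///
/// ```ignore
/// use core::alloc::Layout;
///
/// let layout = Layout::from_size_align(64 * 1024, 8).unwrap();
/// match unsafe { try_alloc_kernel_heap(layout) } {
///     Ok(ptr) => {
///         // ... use the buffer, then free it with the exact same layout ...
///         unsafe { dealloc_kernel_heap(ptr, layout) };
///     }
///     Err(KernelHeapAllocError::Vmalloc(e)) => {
///         // Large-path failure: shrink the request or degrade gracefully.
///         let _ = e;
///     }
///     Err(e) => {
///         // Slab refill failure or invalid layout: report and propagate.
///         let _ = e;
///     }
/// }
/// ```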
#[inline]
pub unsafe fn try_alloc_kernel_heap(layout: Layout) -> Result<*mut u8, KernelHeapAllocError> {
    // Effective size must satisfy both the size and alignment requirements.
    let effective = layout.size().max(layout.align());
    // `Layout` constructors guarantee a non-zero alignment; keep the power-of-two
    // check as a defensive guard for any malformed caller input.
    if !layout.align().is_power_of_two() {
        return Err(record_heap_failure(
            layout,
            effective,
            classify_kernel_heap_backend(layout),
            KernelHeapAllocError::InvalidLayout,
        ));
    }
    // Large heap path uses vmalloc, which only aligns to 4 KiB pages.
    if effective > MAX_SLAB_SIZE && layout.align() > 4096 {
        return Err(record_heap_failure(
            layout,
            effective,
            KernelHeapBackend::Vmalloc,
            KernelHeapAllocError::AlignmentExceedsKernelPage {
                align: layout.align(),
            },
        ));
    }
    let boot_reg = crate::silo::debug_boot_reg_active();
    if boot_reg {
        crate::serial_println!(
            "[trace][heap] alloc enter effective={} size={} align={}",
            effective,
            layout.size(),
            layout.align()
        );
    }

    let result = match classify_kernel_heap_backend(layout) {
        KernelHeapBackend::Slab => {
            // --- slab path ---
            let ci = SlabState::class_index_for_layout(layout);
            // Race/corruption diagnostic: log alloc when IRQs disabled (rate-limited).
            let cpu = crate::arch::x86_64::percpu::current_cpu_index();
            let irq_enabled = crate::arch::x86_64::interrupts_enabled();
            #[cfg(debug_assertions)]
            if !irq_enabled {
                use core::sync::atomic::{AtomicUsize, Ordering};
                static HEAP_A_COUNT: AtomicUsize = AtomicUsize::new(0);
                let n = HEAP_A_COUNT.fetch_add(1, Ordering::Relaxed);
                if n % 100 == 0 {
                    crate::e9_println!(
                        "HEAP-A cpu={} irq=0 size={} ci={} n={}",
                        cpu,
                        effective,
                        ci,
                        n
                    );
                }
            }
            if boot_reg {
                crate::serial_println!(
                    "[trace][heap] alloc slab ci={} slab_size={} lock={:#x}",
                    ci,
                    SLAB_SIZES[ci],
                    &SLAB_ALLOC as *const _ as usize
                );
            }
            let mut slab = SLAB_ALLOC.lock();
            if boot_reg {
                crate::serial_println!("[trace][heap] alloc slab lock acquired");
            }
            let ptr = slab.with_mut_and_token(|s, token| s.alloc_block(ci, token));
            if ptr.is_null() {
                return Err(record_heap_failure(
                    layout,
                    effective,
                    KernelHeapBackend::Slab,
                    KernelHeapAllocError::SlabRefillFailed {
                        effective,
                        class_size: SLAB_SIZES[ci],
                    },
                ));
            }
            ptr
        }
        KernelHeapBackend::Vmalloc => {
            // --- vmalloc path (large allocation) ---
            if boot_reg {
                crate::serial_println!("[trace][heap] alloc vmalloc size={}", effective);
            }

            crate::sync::with_irqs_disabled(|token| {
                crate::memory::allocate_kernel_virtual(effective, token).map_err(|error| {
                    record_heap_failure(
                        layout,
                        effective,
                        KernelHeapBackend::Vmalloc,
                        KernelHeapAllocError::Vmalloc(error),
                    )
                })
            })?
        }
    };

    Ok(result)
}

#[global_allocator]
static HEAP_ALLOCATOR: LockedHeap = LockedHeap;

/// Compatibility facade over the current global kernel heap policy.
///
/// Callers that need an explicit heap allocation entry point, rather than
/// relying on `Box`/`Vec`/`GlobalAlloc`, should use this helper. The selected
/// backend remains the current heap policy:
/// - small allocations -> slab
/// - large allocations -> vmalloc
#[inline]
pub unsafe fn alloc_kernel_heap(layout: Layout) -> *mut u8 {
    try_alloc_kernel_heap(layout).unwrap_or(ptr::null_mut())
}

/// Free memory previously returned by [`alloc_kernel_heap`].
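///
/// The layout passed here must match the one used for the allocation; a
/// mismatch is a `GlobalAlloc`-style contract violation and is flagged (in
/// debug builds) by the checks in `dealloc`. A minimal sketch:
///
/// ```ignore
/// use core::alloc::Layout;
///
/// let layout = Layout::from_size_align(256, 16).unwrap();
/// let ptr = unsafe { alloc_kernel_heap(layout) };
/// if !ptr.is_null() {
///     // ... use the 256-byte block ...
///     unsafe { dealloc_kernel_heap(ptr, layout) }; // same layout as the alloc
/// }
/// ```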
#[inline]
pub unsafe fn dealloc_kernel_heap(ptr: *mut u8, layout: Layout) {
    HEAP_ALLOCATOR.dealloc(ptr, layout);
}

fn log_common_oom_header(layout: Layout, effective: usize) {
    let cpu = crate::arch::x86_64::percpu::current_cpu_index();
    let irq_enabled = crate::arch::x86_64::interrupts_enabled();
    let tid = crate::process::current_task_id()
        .map(|t| t.as_u64())
        .unwrap_or(0);
    let task_name = crate::process::current_task_clone()
        .map(|t| t.name)
        .unwrap_or("<none>");

    crate::serial_println!(
        "[heap][oom] cpu={} irq={} tid={} task={} size={} align={} effective={}",
        cpu,
        irq_enabled,
        tid,
        task_name,
        layout.size(),
        layout.align(),
        effective
    );
}

fn log_buddy_snapshot() -> Option<(usize, usize, usize)> {
    if let Some(guard) = crate::memory::buddy::get_allocator().try_lock() {
        if let Some(alloc) = guard.as_ref() {
            let (total_pages, allocated_pages) = alloc.page_totals();
            let free_pages = total_pages.saturating_sub(allocated_pages);
            let fail_counts = crate::memory::buddy::buddy_alloc_fail_counts_snapshot();

            crate::serial_println!(
                "[heap][oom] buddy: total={} alloc={} free={}",
                total_pages,
                allocated_pages,
                free_pages
            );

            let mut fail_line = alloc::string::String::from("[heap][oom] buddy_fail_by_order:");
            for (i, &count) in fail_counts.iter().enumerate() {
                use core::fmt::Write;
                let _ = write!(fail_line, " o{}={} ", i, count);
            }
            crate::serial_println!("{}", fail_line);
            return Some((total_pages, allocated_pages, free_pages));
        }
        crate::serial_println!("[heap][oom] buddy: allocator uninitialized");
        return None;
    }

    crate::serial_println!("[heap][oom] buddy: allocator locked");
    None
}

fn log_heap_failure_policy(layout: Layout) {
    match last_heap_failure_snapshot() {
        Some(snapshot) => {
            crate::serial_println!(
                "[heap][oom] last_failure backend={:?} requested={} align={} effective={} error={:?}",
                snapshot.backend,
                snapshot.requested_size,
                snapshot.align,
                snapshot.effective_size,
                snapshot.error
            );
            if snapshot.requested_size != layout.size() || snapshot.align != layout.align() {
                crate::serial_println!(
                    "[heap][oom] note=last_heap_failure does not exactly match current layout; using best-effort context"
                );
            }
        }
        None => crate::serial_println!("[heap][oom] last_heap_failure unavailable"),
    }
}

/// Global allocation-error handler: logs OOM diagnostics and panics.
#[alloc_error_handler]
fn alloc_error_handler(layout: Layout) -> ! {
    let effective = layout.size().max(layout.align());
    let pages_needed = (effective.saturating_add(4095)) / 4096;
    let order = if pages_needed == 0 {
        0
    } else {
        pages_needed.next_power_of_two().trailing_zeros() as u8
    };
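    // Example of the order hint (an illustrative value, not a real request):
    // effective = 10_000 B -> pages_needed = ceil(10_000 / 4096) = 3
    //                      -> order = log2(next_power_of_two(3)) = log2(4) = 2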
    log_common_oom_header(layout, effective);

    if effective <= MAX_SLAB_SIZE {
        crate::serial_println!(
            "[heap][oom] backend=slab effective={} class_max={} refill_order=0",
            effective,
            MAX_SLAB_SIZE
        );
        log_heap_failure_policy(layout);
        if let Some((total_pages, _, free_pages)) = log_buddy_snapshot() {
            crate::serial_println!(
                "[heap][oom] slab-refill pages={} buddy_order={}",
                pages_needed,
                order
            );
            if free_pages > (total_pages / 4) {
                crate::serial_println!(
                    "[heap][oom] diagnosis=slab order-0 refill failed despite remaining free pages \
                     ({} free pages): allocator pressure, zone exhaustion, or transient allocator state",
                    free_pages,
                );
            }
        }
    } else {
        crate::serial_println!(
            "[heap][oom] backend=vmalloc request_pages={} legacy_buddy_order_hint={}",
            pages_needed,
            order
        );
        log_heap_failure_policy(layout);
        if let Some(snap) = last_heap_failure_snapshot() {
            if let KernelHeapAllocError::AlignmentExceedsKernelPage { align } = snap.error {
                crate::serial_println!(
                    "[heap][oom] diagnosis=layout alignment {} B exceeds 4 KiB page alignment guaranteed by vmalloc heap path",
                    align
                );
            }
        }
        match crate::memory::vmalloc::last_failure_snapshot() {
            Some(snapshot) => {
                crate::serial_println!(
                    "[heap][oom] vmalloc_last_failure size={} pages={} error={:?}",
                    snapshot.size,
                    snapshot.pages,
                    snapshot.error
                );
                match snapshot.error {
                    crate::memory::vmalloc::VmallocError::SizeExceedsPolicy {
                        requested,
                        max_allowed,
                    } => {
                        crate::serial_println!(
                            "[heap][oom] diagnosis=vmalloc policy limit exceeded requested={} max_allowed={}",
                            requested,
                            max_allowed
                        );
                    }
                    crate::memory::vmalloc::VmallocError::VirtualRangeExhausted => {
                        crate::serial_println!(
                            "[heap][oom] diagnosis=kernel virtual allocation arena exhausted or fragmented"
                        );
                    }
                    crate::memory::vmalloc::VmallocError::PhysicalMemoryExhausted => {
                        crate::serial_println!(
                            "[heap][oom] diagnosis=vmalloc could not acquire enough physical pages"
                        );
                    }
                    crate::memory::vmalloc::VmallocError::MetadataAllocationFailed => {
                        crate::serial_println!(
                            "[heap][oom] diagnosis=vmalloc metadata allocation failed"
                        );
                    }
                    crate::memory::vmalloc::VmallocError::KernelMapFailed => {
                        crate::serial_println!(
                            "[heap][oom] diagnosis=kernel page-table mapping failed during vmalloc"
                        );
                    }
                    crate::memory::vmalloc::VmallocError::ZeroSize => {
                        crate::serial_println!("[heap][oom] diagnosis=zero-sized vmalloc request");
                    }
                }
            }
            None => {
                crate::serial_println!("[heap][oom] vmalloc_last_failure unavailable");
            }
        }
        let _ = log_buddy_snapshot();
    }
    crate::serial_println!(
        "[heap][oom] policy=fatal_global_alloc_path use try_alloc_kernel_heap()/allocate_kernel_virtual() on recoverable paths"
    );
    panic!("fatal kernel heap allocation failure: {:?}", layout)
}

/// Dump heap and buddy allocator diagnostics to the serial console.
///
/// Safe to call from the shell or debug tooling. Prints:
/// - Total/allocated/free pages
/// - Per-order buddy free list head counts
/// - Buddy allocation failure counts by order (fragmentation indicator)
/// - Slab free list head pointers
pub fn dump_diagnostics() {
    crate::serial_println!("[heap][diag] === Heap Diagnostics ===");

    // Buddy allocator stats
    if let Some(guard) = crate::memory::buddy::get_allocator().try_lock() {
        if let Some(alloc) = guard.as_ref() {
            let (total_pages, allocated_pages) = alloc.page_totals();
            let mut zones =
                [crate::memory::buddy::ZoneStats::empty(); crate::memory::zone::ZoneType::COUNT];
            let zone_count = alloc.zone_snapshot(&mut zones);
            crate::serial_println!(
                "[heap][diag] buddy: total={} pages, allocated={} pages, free={} pages",
                total_pages,
                allocated_pages,
                total_pages.saturating_sub(allocated_pages)
            );

            for info in zones.iter().take(zone_count) {
                crate::serial_println!(
                    "[heap][diag] zone={:?} state={:?} managed={} present={} reserved={} free={} cached={} cu/cm={}/{} avail={} segments={}/{} pageblocks=u{}/m{} u_free={} m_free={} watermarks={}/{}/{} reserve={} largest_order={:?}",
                    info.zone_type,
                    info.pressure(),
                    info.managed_pages,
                    info.present_pages,
                    info.reserved_pages,
                    info.free_pages,
                    info.cached_pages,
                    info.cached_unmovable_pages,
                    info.cached_movable_pages,
                    info.available_after_reserve_pages(),
                    info.segment_count,
                    info.segment_capacity,
                    info.unmovable_pageblocks,
                    info.movable_pageblocks,
                    info.unmovable_free_pages,
                    info.movable_free_pages,
                    info.watermark_min,
                    info.watermark_low,
                    info.watermark_high,
                    info.lowmem_reserve_pages,
                    info.largest_free_order
                );
            }

            // Per-zone free list heads by migratetype.
            for zi in 0..zone_count {
                let zone = alloc.get_zone(zi);
                let info = zones[zi];
                let mut line = alloc::string::String::from("[heap][diag] ");
                use core::fmt::Write;
                let _ = write!(line, "zone={:?} free_heads:", zone.zone_type);
                for order in 0..=crate::memory::zone::MAX_ORDER {
                    let unmovable = zone.free_list_count_for(
                        order as u8,
                        crate::memory::zone::Migratetype::Unmovable,
                    );
                    let movable = zone.free_list_count_for(
                        order as u8,
                        crate::memory::zone::Migratetype::Movable,
                    );
                    if unmovable > 0 || movable > 0 {
                        let _ = write!(line, " o{}=u{}/m{} ", order, unmovable, movable);
                    }
                }
                crate::serial_println!("{}", line);

                let mut frag = alloc::string::String::from("[heap][diag] ");
                let _ = write!(frag, "zone={:?} frag:", zone.zone_type);
                for order in 1..=crate::memory::zone::MAX_ORDER {
                    let score = zone.fragmentation_score(order as u8, info.cached_pages);
                    let _ = write!(frag, " o{}={}%", order, score);
                }
                crate::serial_println!("{}", frag);
            }
        }
    } else {
        crate::serial_println!("[heap][diag] buddy: allocator locked (retry later)");
    }

    // Buddy failure counts
    let fail_counts = crate::memory::buddy::buddy_alloc_fail_counts_snapshot();
    let mut has_fails = false;
    for (i, &count) in fail_counts.iter().enumerate() {
        if count > 0 {
            has_fails = true;
        }
        crate::serial_println!("[heap][diag] buddy_fail[{}]: {}", i, count);
    }
    if has_fails {
        crate::serial_println!(
            "[heap][diag] => non-zero buddy_fail counts indicate fragmentation pressure"
        );
    }

    // Slab stats
    {
        let alloc = SLAB_PAGES_ALLOCATED.load(AtomicOrdering::Relaxed);
        let reclaim = SLAB_PAGES_RECLAIMED.load(AtomicOrdering::Relaxed);
        crate::serial_println!(
            "[heap][diag] slab: pages_allocated={} pages_reclaimed={} pages_live={}",
            alloc,
            reclaim,
            alloc.saturating_sub(reclaim)
        );
    }
    if let Some(mut guard) = SLAB_ALLOC.try_lock() {
        // SAFETY: we hold the slab lock; raw pointer traversal is safe.
        guard.with_mut_and_token(|s, _| unsafe {
            for ci in 0..NUM_SLABS {
                let mut head = s.partial_pages[ci];
                if head.is_null() {
                    continue;
                }
                let mut page_count = 0usize;
                let mut free_blocks = 0u32;
                while !head.is_null() {
                    page_count += 1;
                    free_blocks = free_blocks.saturating_add((*head).free_count);
                    head = (*head).next_partial;
                }
                crate::serial_println!(
                    "[heap][diag] slab[{}]: partial_pages={} free_blocks={}",
                    SLAB_SIZES[ci],
                    page_count,
                    free_blocks
                );
            }
        });
    } else {
        crate::serial_println!("[heap][diag] slab: locked (retry later)");
    }

    // Contiguous-physical allocation telemetry
    {
        let d = crate::memory::phys_contiguous_diag();
        crate::serial_println!(
            "[heap][diag] phys_contiguous: pages_allocated={} pages_freed={} pages_live={} alloc_failures={}",
            d.pages_allocated,
            d.pages_freed,
            d.pages_live,
            d.alloc_fail_count
        );
    }

    if let Some(snapshot) = last_heap_failure_snapshot() {
        crate::serial_println!(
            "[heap][diag] last_heap_failure: backend={:?} requested={} align={} effective={} error={:?}",
            snapshot.backend,
            snapshot.requested_size,
            snapshot.align,
            snapshot.effective_size,
            snapshot.error
        );
    }

    crate::memory::vmalloc::dump_diagnostics();

    crate::serial_println!("[heap][diag] === End Diagnostics ===");
}