Skip to main content

strat9_kernel/syscall/
mmap.rs

//! Memory-management syscall handlers: mmap, munmap, brk.
//!
//! Implements:
//!  - [`sys_mmap`]   – map anonymous or private file-backed memory (SYS_MMAP = 100)
//!  - [`sys_munmap`] – unmap a virtual memory range (SYS_MUNMAP = 101)
//!  - [`sys_brk`]    – set / query the program break / heap top (SYS_BRK = 102)
//!  - [`sys_mremap`] – resize/remap an existing region (SYS_MREMAP = 103)
//!  - [`sys_mprotect`] – change page permissions (SYS_MPROTECT = 104)
9
10use crate::{
11    memory::address_space::{VmaFlags, VmaType},
12    process::current_task_clone,
13    syscall::error::SyscallError,
14};
15use core::sync::atomic::Ordering;
16use x86_64::VirtAddr;
17
18// ─────────────────────────────────────────────────────────────────────────────
19// Virtual address layout constants
20// ─────────────────────────────────────────────────────────────────────────────
21
/// Base virtual address for the heap (`brk`-managed region).
pub const BRK_BASE: u64 = 0x0000_0000_2000_0000; // 512 MiB

/// Initial hint address for anonymous `mmap` allocations.
pub const MMAP_BASE: u64 = 0x0000_0000_6000_0000; // 1.5 GiB

/// Exclusive upper bound of the canonical user-space address range
/// (start of the non-canonical hole with 48-bit x86-64 addressing).
const USER_SPACE_END: u64 = 0x0000_8000_0000_0000;

// ─────────────────────────────────────────────────────────────────────────────
// PROT flags (arg3 of mmap) — values match Linux/POSIX.
// ─────────────────────────────────────────────────────────────────────────────

const PROT_READ: u32 = 1 << 0;
const PROT_WRITE: u32 = 1 << 1;
const PROT_EXEC: u32 = 1 << 2;

// ─────────────────────────────────────────────────────────────────────────────
// MAP flags (arg4 of mmap)
// ─────────────────────────────────────────────────────────────────────────────

const MAP_SHARED: u32 = 1 << 0; // matches Linux MAP_SHARED (0x01)
const MAP_PRIVATE: u32 = 1 << 1; // matches Linux MAP_PRIVATE (0x02)
const MAP_FIXED: u32 = 1 << 4; // matches Linux MAP_FIXED (0x10)
const MAP_ANONYMOUS: u32 = 1 << 5; // matches Linux MAP_ANONYMOUS (0x20)
// NOTE(review): Linux defines MAP_HUGETLB as 0x40000 (1 << 18); 1 << 11 is
// MAP_DENYWRITE there.  Confirm whether this ABI deliberately diverges before
// running Linux-built binaries against it.
const MAP_HUGETLB: u32 = 1 << 11;
const MAP_FIXED_NOREPLACE: u32 = 1 << 20; // matches Linux MAP_FIXED_NOREPLACE (0x100000)

/// `sys_mremap` flag: allow the kernel to move the mapping when it cannot be
/// resized in place (matches Linux `MREMAP_MAYMOVE`).
const MREMAP_MAYMOVE: u64 = 1 << 0;
51
52// ─────────────────────────────────────────────────────────────────────────────
53// Helpers
54// ─────────────────────────────────────────────────────────────────────────────
55
/// Round `addr` up to the next 4 KiB page boundary.
///
/// Uses wrapping arithmetic deliberately: if `addr` is close enough to
/// `u64::MAX` that the round-up overflows, the result wraps to 0, which every
/// caller treats as an invalid length and rejects.
#[inline]
fn page_align_up(addr: u64) -> u64 {
    const PAGE_MASK: u64 = 4096 - 1;
    addr.wrapping_add(PAGE_MASK) & !PAGE_MASK
}
61
/// Round `addr` up to the next 2 MiB huge-page boundary.
///
/// Like [`page_align_up`], this wraps on overflow so absurdly large inputs
/// align to 0 and get rejected by the caller.
#[inline]
fn huge_page_align_up(addr: u64) -> u64 {
    const HUGE_PAGE: u64 = 2 * 1024 * 1024;
    addr.wrapping_add(HUGE_PAGE - 1) & !(HUGE_PAGE - 1)
}
67
68/// Convert POSIX protection flags to `VmaFlags`.
69fn prot_to_vma_flags(prot: u32) -> VmaFlags {
70    VmaFlags {
71        readable: prot & PROT_READ != 0,
72        writable: prot & PROT_WRITE != 0,
73        executable: prot & PROT_EXEC != 0,
74        user_accessible: true,
75    }
76}
77
78// ─────────────────────────────────────────────────────────────────────────────
79// sys_mmap
80// ─────────────────────────────────────────────────────────────────────────────
81
/// SYS_MMAP (100): map virtual memory.
///
/// Supported modes (the previous "file-backed mmaps return `NotImplemented`"
/// claim is outdated):
///  - Anonymous (`MAP_ANONYMOUS`, with `MAP_PRIVATE` or `MAP_SHARED` — the two
///    are equivalent when there is no backing file): the range is only
///    *reserved*; physical pages are supplied lazily on first fault.
///  - File-backed `MAP_PRIVATE`: the range is mapped eagerly and the file
///    contents are copied into it through the HHDM (no page-cache sharing).
///  - File-backed `MAP_SHARED`: not yet supported → `NotImplemented`.
///
/// `MAP_FIXED`, `MAP_FIXED_NOREPLACE`, and `MAP_HUGETLB` (2 MiB pages) are
/// honoured in both modes.  Returns the mapped virtual address on success.
pub fn sys_mmap(
    addr: u64,   // requested address: 0 = kernel chooses; hint unless MAP_FIXED
    len: u64,    // requested length in bytes (rounded up to the page size)
    prot: u32,   // PROT_READ | PROT_WRITE | PROT_EXEC
    flags: u32,  // MAP_* bits, see constants above
    fd_raw: u64, // file descriptor (only used when MAP_ANONYMOUS is clear)
    offset: u64, // file offset (must be 0 for anonymous mappings)
) -> Result<u64, SyscallError> {
    // A zero-length mapping is always invalid.
    if len == 0 {
        return Err(SyscallError::InvalidArgument);
    }

    // Reject any flag bit this kernel does not recognise.
    let known_flags =
        MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED_NOREPLACE;
    if flags & !known_flags != 0 {
        return Err(SyscallError::InvalidArgument);
    }

    // MAP_HUGETLB selects 2 MiB pages; everything else uses 4 KiB pages.
    let is_huge = flags & MAP_HUGETLB != 0;
    let page_size = if is_huge {
        crate::memory::address_space::VmaPageSize::Huge
    } else {
        crate::memory::address_space::VmaPageSize::Small
    };
    let page_bytes = page_size.bytes();

    // File-backed mappings: MAP_PRIVATE + fd → copy file data into anonymous pages.
    if flags & MAP_ANONYMOUS == 0 {
        let fd = fd_raw as u32;
        let file_offset = offset;

        // Exactly one of MAP_PRIVATE / MAP_SHARED must be set.
        let is_private = flags & MAP_PRIVATE != 0;
        let is_shared = flags & MAP_SHARED != 0;
        if is_private == is_shared {
            return Err(SyscallError::InvalidArgument);
        }
        if !is_private {
            log::warn!("sys_mmap: file-backed MAP_SHARED not yet supported");
            return Err(SyscallError::NotImplemented);
        }
        // Reject unknown PROT bits.
        if prot & !(PROT_READ | PROT_WRITE | PROT_EXEC) != 0 {
            return Err(SyscallError::InvalidArgument);
        }

        // Round len up to the page granularity; 0 means the round-up overflowed.
        let len_aligned = if is_huge {
            huge_page_align_up(len)
        } else {
            page_align_up(len)
        };
        if len_aligned == 0 {
            return Err(SyscallError::InvalidArgument);
        }
        let n_pages = (len_aligned / page_bytes) as usize;

        let task = current_task_clone().ok_or(SyscallError::Fault)?;
        // Resolve the fd first so a bad descriptor fails without side effects.
        // NOTE(review): the raw deref assumes the fd table is not mutated
        // concurrently while this reference is held — confirm the process-level
        // exclusion model.
        let open_file = {
            let fd_table = unsafe { &*task.process.fd_table.get() };
            fd_table.get(fd)?
        };
        let addr_space = unsafe { &*task.process.address_space.get() };

        // Pick the target address (same placement policy as the anonymous path).
        let target = if flags & MAP_FIXED != 0 {
            // MAP_FIXED demands this exact, page-aligned, non-null address.
            if addr % page_bytes != 0 || addr == 0 {
                return Err(SyscallError::InvalidArgument);
            }
            if addr.saturating_add(len_aligned) > USER_SPACE_END {
                return Err(SyscallError::InvalidArgument);
            }
            if flags & MAP_FIXED_NOREPLACE != 0 {
                // MAP_FIXED_NOREPLACE: fail instead of clobbering existing mappings.
                if addr_space.has_mapping_in_range(addr, len_aligned) {
                    return Err(SyscallError::AlreadyExists);
                }
            } else {
                // Plain MAP_FIXED (Linux semantics): drop any overlap first.
                addr_space
                    .unmap_range(addr, len_aligned)
                    .map_err(|_| SyscallError::InvalidArgument)?;
            }
            addr
        } else {
            // Hint-based: caller's hint when non-zero, else the per-process
            // rolling mmap hint.
            let hint = if addr != 0 {
                addr
            } else {
                task.process.mmap_hint.load(Ordering::Relaxed)
            };
            // Try the hint first, then fall back to MMAP_BASE.
            addr_space
                .find_free_vma_range(hint, n_pages, page_size)
                .or_else(|| addr_space.find_free_vma_range(MMAP_BASE, n_pages, page_size))
                .ok_or(SyscallError::OutOfMemory)?
        };

        // Map eagerly (map_region, not reserve_region): the copy loop below
        // needs the pages present so they can be written through the HHDM.
        let vma_flags = prot_to_vma_flags(prot);
        addr_space
            .map_region(target, n_pages, vma_flags, VmaType::Anonymous, page_size)
            .map_err(|_| SyscallError::OutOfMemory)?;

        // Copy file content into the mapped pages via HHDM, reading through a
        // 4 KiB kernel bounce buffer.  A short read or read error (unwrap_or(0))
        // ends the loop early; the remaining bytes stay zero-filled, matching
        // mmap-past-EOF semantics.
        let read_len = len as usize;
        let mut kbuf = [0u8; 4096];
        let mut file_off = file_offset;
        let mut dst_off = 0usize;
        while dst_off < read_len {
            let chunk = core::cmp::min(4096, read_len - dst_off);
            let n = open_file.pread(file_off, &mut kbuf[..chunk]).unwrap_or(0);
            if n == 0 {
                break;
            }
            // Write the chunk out, splitting at 4 KiB boundaries so each piece
            // lies within one translated page.
            // NOTE(review): assumes `translate` returns the exact physical
            // address for `vaddr` (including inside 2 MiB pages) — confirm.
            let mut written = 0;
            while written < n {
                let vaddr = target + (dst_off + written) as u64;
                let page_off = (vaddr & 0xFFF) as usize;
                let to_write = core::cmp::min(n - written, 4096 - page_off);
                let phys = addr_space
                    .translate(VirtAddr::new(vaddr))
                    .ok_or(SyscallError::Fault)?;
                let hhdm_ptr = crate::memory::phys_to_virt(phys.as_u64()) as *mut u8;
                // SAFETY: `phys` belongs to a page mapped by map_region just
                // above, and `to_write` never crosses a 4 KiB boundary.
                unsafe {
                    core::ptr::copy_nonoverlapping(kbuf.as_ptr().add(written), hhdm_ptr, to_write);
                }
                written += to_write;
            }
            file_off += n as u64;
            dst_off += n;
        }

        // Advance the rolling hint past this mapping (non-fixed only).
        if flags & MAP_FIXED == 0 {
            let new_hint = target.saturating_add(len_aligned);
            let _ = task
                .process
                .mmap_hint
                .fetch_max(new_hint, Ordering::Relaxed);
        }

        log::trace!(
            "sys_mmap: file-backed {:#x}..{:#x} (fd={}, off={:#x})",
            target,
            target + len_aligned,
            fd,
            file_offset,
        );
        return Ok(target);
    }

    // ── Anonymous path ────────────────────────────────────────────────────
    let is_private = flags & MAP_PRIVATE != 0;
    let is_shared = flags & MAP_SHARED != 0;
    // Exactly one of MAP_PRIVATE / MAP_SHARED.
    if is_private == is_shared {
        return Err(SyscallError::InvalidArgument);
    }

    // Anonymous mapping currently requires zero offset.
    if offset != 0 {
        return Err(SyscallError::InvalidArgument);
    }

    // Reject unknown PROT bits.
    if prot & !(PROT_READ | PROT_WRITE | PROT_EXEC) != 0 {
        return Err(SyscallError::InvalidArgument);
    }

    // Round len up to a page boundary.  Overflow of len itself is caught here.
    let len_aligned = if is_huge {
        huge_page_align_up(len)
    } else {
        page_align_up(len)
    };
    if len_aligned == 0 {
        // len was so large that aligning it overflowed to 0.
        return Err(SyscallError::InvalidArgument);
    }
    let n_pages = (len_aligned / page_bytes) as usize;

    // Determine the target virtual address.
    let task = current_task_clone().ok_or(SyscallError::Fault)?;
    let addr_space = unsafe { &*task.process.address_space.get() };

    let target = if flags & MAP_FIXED != 0 {
        // MAP_FIXED: the caller demands this exact page-aligned address.
        if addr % page_bytes != 0 || addr == 0 {
            return Err(SyscallError::InvalidArgument);
        }
        if addr.saturating_add(len_aligned) > USER_SPACE_END {
            return Err(SyscallError::InvalidArgument);
        }
        if flags & MAP_FIXED_NOREPLACE != 0 {
            // MAP_FIXED_NOREPLACE: fail if any mapping overlaps.
            if addr_space.has_mapping_in_range(addr, len_aligned) {
                return Err(SyscallError::AlreadyExists);
            }
        } else {
            // Linux MAP_FIXED semantics: unmap overlaps before remap.
            addr_space
                .unmap_range(addr, len_aligned)
                .map_err(|_| SyscallError::InvalidArgument)?;
        }
        addr
    } else {
        // Hint-based: use addr as a hint when non-zero, else use mmap_hint.
        let hint = if addr != 0 {
            addr
        } else {
            task.process.mmap_hint.load(Ordering::Relaxed)
        };

        // Try the hint first, then fall back to MMAP_BASE.
        addr_space
            .find_free_vma_range(hint, n_pages, page_size)
            .or_else(|| addr_space.find_free_vma_range(MMAP_BASE, n_pages, page_size))
            .ok_or(SyscallError::OutOfMemory)?
    };

    // Reserve lazily: pages are populated by the fault handler on first touch.
    let vma_flags = prot_to_vma_flags(prot);
    addr_space
        .reserve_region(target, n_pages, vma_flags, VmaType::Anonymous, page_size)
        .map_err(|_| SyscallError::OutOfMemory)?;

    // Advance mmap_hint past the new mapping (non-fixed only).
    if flags & MAP_FIXED == 0 {
        let new_hint = target.saturating_add(len_aligned);
        // fetch_max: the hint only ever moves forward, even under races.
        let _ = task
            .process
            .mmap_hint
            .fetch_max(new_hint, Ordering::Relaxed);
    }

    log::trace!(
        "sys_mmap: mapped {:#x}..{:#x} ({} pages, prot={:#x}, flags={:#x})",
        target,
        target + len_aligned,
        n_pages,
        prot,
        flags,
    );

    Ok(target)
}
327
328// ─────────────────────────────────────────────────────────────────────────────
329// sys_munmap
330// ─────────────────────────────────────────────────────────────────────────────
331
332/// SYS_MUNMAP (101): unmap a virtual memory range.
333///
334/// `addr` must be page-aligned.  `len` is rounded up to a page boundary.
335/// Unmapping an address range that contains no mappings is silently ignored
336/// (POSIX behaviour).
337pub fn sys_munmap(addr: u64, len: u64) -> Result<u64, SyscallError> {
338    if addr == 0 || addr & 0xFFF != 0 {
339        return Err(SyscallError::InvalidArgument);
340    }
341    if len == 0 {
342        return Err(SyscallError::InvalidArgument);
343    }
344
345    let len_aligned = page_align_up(len);
346    if len_aligned == 0 {
347        return Err(SyscallError::InvalidArgument);
348    }
349    if addr.saturating_add(len_aligned) > USER_SPACE_END {
350        return Err(SyscallError::InvalidArgument);
351    }
352
353    let task = current_task_clone().ok_or(SyscallError::Fault)?;
354    unsafe { &*task.process.address_space.get() }
355        .unmap_range(addr, len_aligned)
356        .map_err(|_| SyscallError::InvalidArgument)?;
357
358    log::trace!(
359        "sys_munmap: unmapped {:#x}..{:#x}",
360        addr,
361        addr + len_aligned
362    );
363
364    Ok(0)
365}
366
/// SYS_MREMAP (103): resize an existing mapping.
///
/// Current support:
/// - Shrink in place (the tail is unmapped).
/// - Grow in place when the address range just after the region is free.
/// - If `MREMAP_MAYMOVE` is set and in-place growth fails, relocate — but only
///   while the source mapping is still fully lazy (no pages faulted in yet),
///   because then no page contents need to be moved.
///
/// `old_addr` must be the exact start of an existing VMA and `old_size` must
/// cover that VMA exactly after page-size rounding (no partial remaps).
pub fn sys_mremap(
    old_addr: u64, // start of the existing mapping (must equal a VMA start)
    old_size: u64, // current size in bytes
    new_size: u64, // requested size in bytes
    flags: u64,    // only MREMAP_MAYMOVE is understood
) -> Result<u64, SyscallError> {
    // Zero sizes and unknown flag bits are invalid.
    if old_size == 0 || new_size == 0 {
        return Err(SyscallError::InvalidArgument);
    }
    if flags & !MREMAP_MAYMOVE != 0 {
        return Err(SyscallError::InvalidArgument);
    }

    let task = current_task_clone().ok_or(SyscallError::Fault)?;
    let addr_space = unsafe { &*task.process.address_space.get() };
    // old_addr must name a VMA exactly; mid-region addresses fail with Fault.
    let vma = addr_space
        .region_by_start(old_addr)
        .ok_or(SyscallError::Fault)?;

    // All alignment below uses the VMA's own page granularity (4 KiB or 2 MiB).
    let page_bytes = vma.page_size.bytes();
    if old_addr % page_bytes != 0 {
        return Err(SyscallError::InvalidArgument);
    }

    let old_len_aligned = if vma.page_size == crate::memory::address_space::VmaPageSize::Huge {
        huge_page_align_up(old_size)
    } else {
        page_align_up(old_size)
    };
    let new_len_aligned = if vma.page_size == crate::memory::address_space::VmaPageSize::Huge {
        huge_page_align_up(new_size)
    } else {
        page_align_up(new_size)
    };
    // 0 after rounding means the round-up overflowed.
    if old_len_aligned == 0 || new_len_aligned == 0 {
        return Err(SyscallError::InvalidArgument);
    }

    // old_size must describe the tracked VMA exactly.
    let tracked_len = (vma.page_count as u64)
        .checked_mul(page_bytes)
        .ok_or(SyscallError::InvalidArgument)?;
    if old_len_aligned != tracked_len {
        return Err(SyscallError::InvalidArgument);
    }

    // Same size after rounding → nothing to do.
    if new_len_aligned == old_len_aligned {
        return Ok(old_addr);
    }

    // Shrink in place: unmap the tail [old_addr + new_len, old_addr + old_len).
    if new_len_aligned < old_len_aligned {
        let tail_addr = old_addr
            .checked_add(new_len_aligned)
            .ok_or(SyscallError::InvalidArgument)?;
        let tail_len = old_len_aligned - new_len_aligned;
        addr_space
            .unmap_range(tail_addr, tail_len)
            .map_err(|_| SyscallError::InvalidArgument)?;
        return Ok(old_addr);
    }

    // Grow in place: reserve the pages immediately after the region, if free.
    let grow_len = new_len_aligned - old_len_aligned;
    let grow_start = old_addr
        .checked_add(old_len_aligned)
        .ok_or(SyscallError::InvalidArgument)?;

    if !addr_space.has_mapping_in_range(grow_start, grow_len) {
        let grow_pages = (grow_len / page_bytes) as usize;
        // New pages inherit the VMA's flags/type and are populated lazily.
        addr_space
            .reserve_region(
                grow_start,
                grow_pages,
                vma.flags,
                vma.vma_type,
                vma.page_size,
            )
            .map_err(|_| SyscallError::OutOfMemory)?;
        return Ok(old_addr);
    }

    // In-place growth is blocked and the caller forbade moving.
    if flags & MREMAP_MAYMOVE == 0 {
        return Err(SyscallError::OutOfMemory);
    }

    // Relocation is only supported while the region is fully lazy: once any
    // page is present we would have to migrate its contents, which is not
    // implemented yet.
    let has_present_pages = addr_space
        .any_mapped_in_range(old_addr, old_len_aligned, vma.page_size)
        .map_err(|_| SyscallError::InvalidArgument)?;
    if has_present_pages {
        return Err(SyscallError::OutOfMemory);
    }

    // Move: find a fresh range, drop the old reservation, reserve the new one.
    let new_pages = (new_len_aligned / page_bytes) as usize;
    let new_addr = addr_space
        .find_free_vma_range(MMAP_BASE, new_pages, vma.page_size)
        .ok_or(SyscallError::OutOfMemory)?;

    addr_space
        .unmap_range(old_addr, old_len_aligned)
        .map_err(|_| SyscallError::InvalidArgument)?;
    addr_space
        .reserve_region(new_addr, new_pages, vma.flags, vma.vma_type, vma.page_size)
        .map_err(|_| SyscallError::OutOfMemory)?;
    Ok(new_addr)
}
477
478/// SYS_MPROTECT (104): change permissions in an existing mapping range.
479pub fn sys_mprotect(addr: u64, len: u64, prot: u64) -> Result<u64, SyscallError> {
480    if len == 0 || addr == 0 || addr & 0xFFF != 0 {
481        return Err(SyscallError::InvalidArgument);
482    }
483    let prot_u32 = u32::try_from(prot).map_err(|_| SyscallError::InvalidArgument)?;
484    if prot_u32 & !(PROT_READ | PROT_WRITE | PROT_EXEC) != 0 {
485        return Err(SyscallError::InvalidArgument);
486    }
487
488    let len_aligned = page_align_up(len);
489    if len_aligned == 0 {
490        return Err(SyscallError::InvalidArgument);
491    }
492    if addr.saturating_add(len_aligned) > USER_SPACE_END {
493        return Err(SyscallError::InvalidArgument);
494    }
495
496    let task = current_task_clone().ok_or(SyscallError::Fault)?;
497    let addr_space = unsafe { &*task.process.address_space.get() };
498    let flags = prot_to_vma_flags(prot_u32);
499
500    addr_space
501        .protect_range(addr, len_aligned, flags)
502        .map_err(|_| SyscallError::InvalidArgument)?;
503
504    Ok(0)
505}
506
507// ─────────────────────────────────────────────────────────────────────────────
508// sys_brk
509// ─────────────────────────────────────────────────────────────────────────────
510
/// SYS_BRK (102): set or query the program break (top of heap).
///
/// Calling convention (matches Linux):
///
/// | `addr`          | Behaviour                                              |
/// |-----------------|--------------------------------------------------------|
/// | `0`             | Query — return current break unchanged.                |
/// | `> current_brk` | Extend heap; new pages are zero-filled RW anonymous.   |
/// | `< current_brk` | Shrink heap; backing pages are freed.                  |
/// | `< BRK_BASE`    | Invalid — return current break unchanged (Linux compat).|
///
/// On any error (OOM, out-of-range) the **unchanged** break is returned rather
/// than a negative code — this is the Linux `brk(2)` contract.
pub fn sys_brk(addr: u64) -> Result<u64, SyscallError> {
    let task = current_task_clone().ok_or(SyscallError::Fault)?;

    // ── Lazy initialisation ───────────────────────────────────────────────
    // `task.process.brk == 0` means this task has never called brk.  The heap starts
    // empty at BRK_BASE; no pages are mapped yet.
    // NOTE(review): the load→store pair here (and the final commit below) is
    // not a single atomic RMW; two threads of the same process calling brk
    // concurrently can interleave and lose an update.  Confirm brk is
    // serialised elsewhere, or use compare_exchange / a process-level lock.
    let current_brk = {
        let raw = task.process.brk.load(Ordering::Relaxed);
        if raw == 0 {
            task.process.brk.store(BRK_BASE, Ordering::Relaxed);
            BRK_BASE
        } else {
            raw
        }
    };

    // ── Query ─────────────────────────────────────────────────────────────
    if addr == 0 {
        return Ok(current_brk);
    }

    // ── Range checks ─────────────────────────────────────────────────────
    // Reject attempts to move the break below the heap base or into kernel AS.
    if addr < BRK_BASE || addr >= USER_SPACE_END {
        return Ok(current_brk); // return unchanged (Linux behaviour)
    }

    // ── Compute page-aligned extents ──────────────────────────────────────
    // The heap occupies [BRK_BASE, page_align_up(current_brk)).
    // Any bytes in the last partial page are already backed but not accounted
    // for in the page-end calculation — they stay mapped on shrink.
    let old_page_end = page_align_up(current_brk);
    let new_page_end = page_align_up(addr);

    if new_page_end > old_page_end {
        // ── Grow: map [old_page_end, new_page_end) ────────────────────────
        // The brk heap always uses 4 KiB pages, hence the hard-coded 4096.
        let n_pages = ((new_page_end - old_page_end) / 4096) as usize;
        // Heap pages are always RW, non-executable, user-accessible.
        let vma_flags = VmaFlags {
            readable: true,
            writable: true,
            executable: false,
            user_accessible: true,
        };
        // reserve_region is lazy: pages are zero-filled on first fault.
        if unsafe { &*task.process.address_space.get() }
            .reserve_region(
                old_page_end,
                n_pages,
                vma_flags,
                VmaType::Anonymous,
                crate::memory::address_space::VmaPageSize::Small,
            )
            .is_err()
        {
            // OOM — return the unchanged break (Linux behaviour).
            return Ok(current_brk);
        }
        log::trace!(
            "sys_brk: grow {:#x}..{:#x} ({} pages, lazy)",
            old_page_end,
            new_page_end,
            n_pages,
        );
    } else if new_page_end < old_page_end {
        // ── Shrink: unmap [new_page_end, old_page_end) ───────────────────
        let len = old_page_end - new_page_end;
        if unsafe { &*task.process.address_space.get() }
            .unmap_range(new_page_end, len)
            .is_err()
        {
            return Ok(current_brk);
        }
        log::trace!(
            "sys_brk: shrink {:#x}..{:#x} (-{} pages)",
            new_page_end,
            old_page_end,
            len / 4096,
        );
    }
    // If new_page_end == old_page_end, only the sub-page byte offset changed;
    // no page-table operations are needed.

    // ── Commit the new exact-byte program break ───────────────────────────
    task.process.brk.store(addr, Ordering::Relaxed);
    Ok(addr)
}