Skip to main content

strat9_kernel/syscall/
exec.rs

1//! `execve()` syscall implementation.
2//! Replaces the current process image with a new one.
3
4use crate::{
5    memory::{AddressSpace, UserSliceRead, VmaFlags, VmaPageSize, VmaType},
6    process::{
7        current_task_clone,
8        elf::{load_elf_image, LoadedElfInfo, USER_STACK_BASE, USER_STACK_PAGES, USER_STACK_TOP},
9    },
10    syscall::{error::SyscallError, SyscallFrame},
11    vfs,
12};
13use alloc::vec::Vec;
14
// ELF auxiliary-vector entry types placed on the initial user stack.
// Each auxv entry is a `(type, value)` pair of 64-bit words.

/// Terminates the auxiliary vector.
const AT_NULL: u64 = 0;

/// Address of the program headers of the loaded image.
const AT_PHDR: u64 = 3;
/// Size in bytes of one program-header entry.
const AT_PHENT: u64 = 4;
/// Number of program-header entries.
const AT_PHNUM: u64 = 5;

/// System page size.
const AT_PAGESZ: u64 = 6;
/// Base address of the program interpreter, when one was loaded.
const AT_BASE: u64 = 7;
/// Entry point of the program itself.
const AT_ENTRY: u64 = 9;

/// Address of 16 seed bytes on the user stack.
const AT_RANDOM: u64 = 25;
/// Address of the NUL-terminated path used to execute the program.
const AT_EXECFN: u64 = 31;
24
25/// SYS_PROC_EXECVE (301): replace current process image.
26pub fn sys_execve(
27    frame: &mut SyscallFrame,
28    path_ptr: u64,
29    argv_ptr: u64,
30    envp_ptr: u64,
31) -> Result<u64, SyscallError> {
32    let current = current_task_clone().ok_or(SyscallError::PermissionDenied)?;
33
34    let mut path_buf = [0u8; 4096];
35    let path_slice = UserSliceRead::new(path_ptr, 4096).map_err(|_| SyscallError::Fault)?;
36
37    let mut len = 0;
38
39    loop {
40        if len >= 4096 {
41            return Err(SyscallError::ArgumentListTooLong);
42        } // Reused error code
43        let b = path_slice.read_u8(len).map_err(|_| SyscallError::Fault)?;
44        if b == 0 {
45            break;
46        }
47        path_buf[len] = b;
48        len += 1;
49    }
50    let path_str =
51        core::str::from_utf8(&path_buf[..len]).map_err(|_| SyscallError::InvalidArgument)?;
52
53    let fd = vfs::open(path_str, vfs::OpenFlags::READ)?;
54
55    // Read into memory
56    const MAX_EXEC_SIZE: usize = 64 * 1024 * 1024;
57    let mut elf_data = Vec::new();
58    let mut buf = [0u8; 4096];
59    loop {
60        match vfs::read(fd, &mut buf) {
61            Ok(n) => {
62                if n == 0 {
63                    break;
64                }
65                if elf_data.len() + n > MAX_EXEC_SIZE {
66                    let _ = vfs::close(fd);
67                    return Err(SyscallError::OutOfMemory);
68                }
69                elf_data.extend_from_slice(&buf[..n]);
70            }
71            Err(e) => {
72                let _ = vfs::close(fd);
73                return Err(e);
74            }
75        }
76    }
77    let _ = vfs::close(fd);
78
79    if elf_data.len() < 4 {
80        return Err(SyscallError::ExecFormatError);
81    }
82
83    let new_as = AddressSpace::new_user().map_err(|_| SyscallError::OutOfMemory)?;
84    let new_as_arc = alloc::sync::Arc::new(new_as);
85
86    let load_info =
87        load_elf_image(&elf_data, &new_as_arc).map_err(|_| SyscallError::ExecFormatError)?;
88
89    let stack_flags = VmaFlags {
90        readable: true,
91        writable: true,
92        executable: false,
93        user_accessible: true,
94    };
95    new_as_arc
96        .map_region(
97            USER_STACK_BASE,
98            USER_STACK_PAGES,
99            stack_flags,
100            VmaType::Stack,
101            VmaPageSize::Small,
102        )
103        .map_err(|_| SyscallError::OutOfMemory)?;
104
105    let sp = setup_user_stack(
106        &new_as_arc,
107        argv_ptr,
108        envp_ptr,
109        &load_info,
110        path_str.as_bytes(),
111    )?;
112
113    // TLS setup (Variant II) if the ELF has a PT_TLS segment.
114    let mut new_fs_base = 0u64;
115    if load_info.tls_memsz > 0 {
116        let tls_align = core::cmp::max(load_info.tls_align, 8).next_power_of_two();
117        let aligned_memsz = (load_info.tls_memsz + tls_align - 1) & !(tls_align - 1);
118        let total_size = aligned_memsz + 8;
119        let n_pages = ((total_size + 4095) / 4096) as usize;
120        let tls_flags = VmaFlags {
121            readable: true,
122            writable: true,
123            executable: false,
124            user_accessible: true,
125        };
126        let tls_base = new_as_arc
127            .find_free_vma_range(0x7FFF_E000_0000, n_pages, VmaPageSize::Small)
128            .ok_or(SyscallError::OutOfMemory)?;
129        new_as_arc
130            .map_region(
131                tls_base,
132                n_pages,
133                tls_flags,
134                VmaType::Anonymous,
135                VmaPageSize::Small,
136            )
137            .map_err(|_| SyscallError::OutOfMemory)?;
138        if load_info.tls_filesz > 0 && load_info.tls_vaddr != 0 {
139            let src_vaddr = load_info.tls_vaddr;
140            let mut off = 0u64;
141            let mut tmp = [0u8; 256];
142            while off < load_info.tls_filesz {
143                let chunk = core::cmp::min(256, (load_info.tls_filesz - off) as usize);
144                crate::process::elf::read_user_mapped_bytes_pub(
145                    &new_as_arc,
146                    src_vaddr + off,
147                    &mut tmp[..chunk],
148                )
149                .map_err(|_| SyscallError::Fault)?;
150                crate::process::elf::write_user_mapped_bytes_pub(
151                    &new_as_arc,
152                    tls_base + off,
153                    &tmp[..chunk],
154                )
155                .map_err(|_| SyscallError::Fault)?;
156                off += chunk as u64;
157            }
158        }
159        let tp = tls_base + aligned_memsz;
160        crate::process::elf::write_user_u64_pub(&new_as_arc, tp, tp)
161            .map_err(|_| SyscallError::Fault)?;
162        new_fs_base = tp;
163    }
164
165    // === EXECVE CLEANUP (POSIX semantics) ===
166    // Now that ELF is valid and loaded, perform cleanup before switching address space.
167
168    // 1. Close all file descriptors with CLOEXEC flag
169    unsafe {
170        let fd_table = &mut *current.process.fd_table.get();
171        fd_table.close_cloexec();
172    }
173
174    // 2. Reset all signal handlers to SIG_DFL
175    current.reset_signals();
176
177    // 3. Clear thread-local storage address and TID pointer : POSIX exec semantics.
178    current
179        .clear_child_tid
180        .store(0, core::sync::atomic::Ordering::Relaxed);
181    current
182        .user_fs_base
183        .store(new_fs_base, core::sync::atomic::Ordering::Relaxed);
184
185    // 4. Reset memory layout: brk and mmap_hint belong to the old image.
186    current
187        .process
188        .brk
189        .store(0, core::sync::atomic::Ordering::Relaxed);
190    current
191        .process
192        .mmap_hint
193        .store(0x0000_0000_6000_0000, core::sync::atomic::Ordering::Relaxed);
194    // Set FS.base MSR for the new image TLS (or 0 if no PT_TLS).
195    unsafe {
196        let lo = new_fs_base as u32;
197        let hi = (new_fs_base >> 32) as u32;
198        core::arch::asm!(
199            "mov ecx, 0xC0000100", // MSR_FS_BASE
200            "wrmsr",
201            in("eax") lo,
202            in("edx") hi,
203            options(nostack, preserves_flags),
204        );
205    }
206
207    let old_as = unsafe {
208        core::mem::replace(
209            &mut *current.process.address_space.get(),
210            new_as_arc.clone(),
211        )
212    };
213
214    unsafe {
215        (&*current.process.address_space.get()).switch_to();
216    }
217
218    frame.iret_rip = load_info.runtime_entry;
219    frame.iret_rsp = sp;
220    frame.iret_rflags = 0x200; // IF=1, clean slate for the new image
221
222    frame.rdi = 0;
223    frame.rsi = 0;
224    frame.rdx = 0;
225    frame.rcx = 0;
226    frame.r8 = 0;
227    frame.r9 = 0;
228    frame.r10 = 0;
229    frame.r11 = 0;
230    frame.rbx = 0;
231    frame.rbp = 0;
232    frame.r12 = 0;
233    frame.r13 = 0;
234    frame.r14 = 0;
235    frame.r15 = 0;
236    frame.rax = 0;
237
238    // Safely drop the old address space now that the new CR3 is loaded
239    drop(old_as);
240
241    Ok(0)
242}
243
244/// Performs the setup user stack operation.
245fn setup_user_stack(
246    new_as: &AddressSpace,
247    argv_ptr: u64,
248    envp_ptr: u64,
249    elf_info: &LoadedElfInfo,
250    exec_path: &[u8],
251) -> Result<u64, SyscallError> {
252    let args = read_string_array(argv_ptr)?;
253    let envs = read_string_array(envp_ptr)?;
254
255    let mut sp = USER_STACK_TOP;
256    let mut str_ptrs: Vec<u64> = Vec::with_capacity(args.len()); // stores pointers to arguments
257    let mut env_ptrs: Vec<u64> = Vec::with_capacity(envs.len()); // stores pointers to env vars
258
259    // Push strings to stack (highest addresses)
260    // We push them in reverse order so they appear in memory roughly sequentially for cache locality?
261    // Actually standard is to put them at very top. Order doesn't strictly matter as long as pointers are correct.
262    // We'll push ENV strings first (highest), then ARG strings.
263
264    // Push ENV strings
265    for env in envs.iter().rev() {
266        let len = (env.len() + 1) as u64;
267        sp -= len;
268        write_bytes_to_as(new_as, sp, env)?;
269        write_bytes_to_as(new_as, sp + env.len() as u64, &[0])?;
270        env_ptrs.push(sp);
271    }
272    // env_ptrs: [ptr_to_highest_env, ptr_to_second_highest...] which corresponds to [env[last], env[last-1]...]
273    // Userspace expects envp[0] to point to first env string.
274    // So we need to reverse env_ptrs to match original order.
275    env_ptrs.reverse();
276
277    // Push ARG strings
278    for arg in args.iter().rev() {
279        let len = (arg.len() + 1) as u64;
280        sp -= len;
281        write_bytes_to_as(new_as, sp, arg)?;
282        write_bytes_to_as(new_as, sp + arg.len() as u64, &[0])?;
283        str_ptrs.push(sp);
284    }
285    str_ptrs.reverse();
286
287    // Push exec path (for AT_EXECFN).
288    let mut execfn_ptr = 0u64;
289    if !exec_path.is_empty() {
290        let len = (exec_path.len() + 1) as u64;
291        sp -= len;
292        write_bytes_to_as(new_as, sp, exec_path)?;
293        write_bytes_to_as(new_as, sp + exec_path.len() as u64, &[0])?;
294        execfn_ptr = sp;
295    }
296
297    // Push 16 bytes of random seed for AT_RANDOM (deterministic fallback source).
298    sp -= 16;
299    let rand_ptr = sp;
300    let seed = generate_aux_random_seed();
301    write_bytes_to_as(new_as, rand_ptr, &seed)?;
302
303    // Align SP to 16 bytes for System V ABI
304    sp &= !0xF;
305
306    // Phase 2: Push auxv, pointer arrays, then argc.
307    let size_ptr = 8u64;
308
309    // auxv entries end with AT_NULL.
310    let mut auxv: Vec<(u64, u64)> = Vec::with_capacity(10);
311    auxv.push((AT_PHDR, elf_info.phdr_vaddr));
312    auxv.push((AT_PHENT, elf_info.phent as u64));
313    auxv.push((AT_PHNUM, elf_info.phnum as u64));
314    auxv.push((AT_PAGESZ, 4096));
315    if let Some(base) = elf_info.interp_base {
316        auxv.push((AT_BASE, base));
317    }
318    auxv.push((AT_ENTRY, elf_info.program_entry));
319    auxv.push((AT_RANDOM, rand_ptr));
320    if execfn_ptr != 0 {
321        auxv.push((AT_EXECFN, execfn_ptr));
322    }
323
324    // AT_NULL terminator.
325    sp -= size_ptr;
326    write_u64_to_as(new_as, sp, 0)?;
327    sp -= size_ptr;
328    write_u64_to_as(new_as, sp, AT_NULL)?;
329    for &(key, val) in auxv.iter().rev() {
330        sp -= size_ptr;
331        write_u64_to_as(new_as, sp, val)?;
332        sp -= size_ptr;
333        write_u64_to_as(new_as, sp, key)?;
334    }
335
336    // Push ENVP array
337    // [NULL]
338    // [envp[n]]
339    // ...
340    // [envp[0]]
341    sp -= size_ptr;
342    write_u64_to_as(new_as, sp, 0)?; // NULL terminator
343
344    for &ptr in env_ptrs.iter().rev() {
345        sp -= size_ptr;
346        write_u64_to_as(new_as, sp, ptr)?;
347    }
348    // Note: sp now points to envp[0]
349
350    // Push ARGV array
351    // [NULL]
352    // [argv[n]]
353    // ...
354    // [argv[0]]
355    sp -= size_ptr;
356    write_u64_to_as(new_as, sp, 0)?; // NULL terminator
357
358    for &ptr in str_ptrs.iter().rev() {
359        sp -= size_ptr;
360        write_u64_to_as(new_as, sp, ptr)?;
361    }
362    // Note: sp now points to argv[0]
363
364    // Push ARGC
365    sp -= size_ptr;
366    write_u64_to_as(new_as, sp, args.len() as u64)?;
367
368    Ok(sp)
369}
370
371/// Reads string array.
372fn read_string_array(ptr: u64) -> Result<Vec<Vec<u8>>, SyscallError> {
373    let mut res = Vec::new();
374    if ptr == 0 {
375        return Ok(res);
376    }
377
378    let mut arr_off = 0;
379    loop {
380        // Read string pointer from user memory (current AS)
381        let str_ptr = match UserSliceRead::new(ptr + arr_off, 8) {
382            Ok(slice) => match slice.read_u64(0) {
383                Ok(p) => p,
384                Err(_) => return Err(SyscallError::Fault),
385            },
386            Err(_) => return Err(SyscallError::Fault),
387        };
388
389        if str_ptr == 0 {
390            break;
391        }
392        if res.len() > 1024 {
393            return Err(SyscallError::ArgumentListTooLong);
394        }
395
396        let mut s = Vec::new();
397        let mut i = 0;
398        loop {
399            if i > 4096 {
400                return Err(SyscallError::ArgumentListTooLong);
401            }
402            let b = match UserSliceRead::new(str_ptr + i, 1) {
403                Ok(slice) => match slice.read_u8(0) {
404                    Ok(byte) => byte,
405                    Err(_) => return Err(SyscallError::Fault),
406                },
407                Err(_) => return Err(SyscallError::Fault),
408            };
409            if b == 0 {
410                break;
411            }
412            s.push(b);
413            i += 1;
414        }
415        res.push(s);
416        arr_off += 8;
417    }
418    Ok(res)
419}
420
421/// Writes bytes to as.
422fn write_bytes_to_as(as_ref: &AddressSpace, vaddr: u64, data: &[u8]) -> Result<(), SyscallError> {
423    use x86_64::VirtAddr;
424    let mut written = 0;
425    // We assume data is small enough or we loop? Using unsafe pointer arithmetic.
426    // The `AddressSpace` methods like `translate` are needed.
427
428    // Since `load_elf_image` in `elf.rs` used `translate`, we should verify visibility.
429    // `AddressSpace` is usually public. `translate` is on `Mapper` trait?
430    // `AddressSpace` in `strat9` likely implements `Mapper` or has it.
431    // `elf.rs` used `user_as.translate(...)`.
432
433    // I need to import Translate? `AddressSpace` usually has `translate`.
434
435    while written < data.len() {
436        let curr_vaddr = vaddr + written as u64;
437        let page_offset = (curr_vaddr & 0xFFF) as usize;
438        let chunk_size = core::cmp::min(data.len() - written, 4096 - page_offset);
439
440        // translate might fail if page not mapped.
441        // `USER_STACK_BASE`..`USER_STACK_TOP` is mapped.
442        let phys = as_ref
443            .translate(VirtAddr::new(curr_vaddr))
444            .ok_or(SyscallError::Fault)?;
445        let virt = crate::memory::phys_to_virt(phys.as_u64()) as *mut u8;
446
447        unsafe {
448            core::ptr::copy_nonoverlapping(data.as_ptr().add(written), virt, chunk_size);
449        }
450        written += chunk_size;
451    }
452    Ok(())
453}
454
455/// Writes u64 to as.
456fn write_u64_to_as(as_ref: &AddressSpace, vaddr: u64, val: u64) -> Result<(), SyscallError> {
457    let bytes = val.to_ne_bytes();
458    write_bytes_to_as(as_ref, vaddr, &bytes)
459}
460
461/// Performs the generate aux random seed operation.
462fn generate_aux_random_seed() -> [u8; 16] {
463    use x86_64::registers::control::Cr3;
464    let mut s = [0u8; 16];
465    let t = crate::process::scheduler::ticks();
466    let (cr3, _) = Cr3::read();
467    let x = t
468        ^ (cr3
469            .start_address()
470            .as_u64()
471            .wrapping_mul(0x9e37_79b9_7f4a_7c15));
472    s[..8].copy_from_slice(&x.to_le_bytes());
473    s[8..].copy_from_slice(&(x.rotate_left(17) ^ 0xa076_1d64_78bd_642f).to_le_bytes());
474    s
475}