strat9_kernel/syscall/exec.rs
1//! `execve()` syscall implementation.
2//! Replaces the current process image with a new one.
3
4use crate::{
5    memory::{AddressSpace, UserSliceRead, VmaFlags, VmaPageSize, VmaType},
6    process::{
7        current_task_clone,
8        elf::{load_elf_image, LoadedElfInfo, USER_STACK_BASE, USER_STACK_PAGES, USER_STACK_TOP},
9        get_task_ids_in_tgid,
10    },
11    syscall::{error::SyscallError, SyscallFrame},
12    vfs,
13};
14use alloc::vec::Vec;
15
// ELF auxiliary vector (auxv) tags pushed onto the new image's stack in
// `setup_user_stack`; numeric values match the System V / Linux <elf.h>
// definitions so standard libc startup code can consume them.
const AT_NULL: u64 = 0; // terminator entry: end of the auxv array
const AT_PHDR: u64 = 3; // user vaddr of the program headers
const AT_PHENT: u64 = 4; // size in bytes of one program header entry
const AT_PHNUM: u64 = 5; // number of program header entries
const AT_PAGESZ: u64 = 6; // system page size (4096 here)
const AT_BASE: u64 = 7; // base address of the ELF interpreter, if any
const AT_ENTRY: u64 = 9; // program entry point (the executable's, not the interp's)
const AT_RANDOM: u64 = 25; // user vaddr of 16 bytes of seed material
const AT_EXECFN: u64 = 31; // user vaddr of the executed file's path string
25
26/// Read an executable image, borrowing static initfs bytes when available.
27fn read_exec_image(path: &str) -> Result<Option<Vec<u8>>, SyscallError> {
28    if crate::vfs::get_initfs_file_bytes(path).is_some() {
29        return Ok(None);
30    }
31
32    let fd = vfs::open(path, vfs::OpenFlags::READ)?;
33
34    const MAX_EXEC_SIZE: usize = 64 * 1024 * 1024;
35    let mut elf_data = Vec::new();
36    let mut buf = [0u8; 4096];
37    loop {
38        match vfs::read(fd, &mut buf) {
39            Ok(n) => {
40                if n == 0 {
41                    break;
42                }
43                if elf_data.len() + n > MAX_EXEC_SIZE {
44                    let _ = vfs::close(fd);
45                    return Err(SyscallError::OutOfMemory);
46                }
47                elf_data.extend_from_slice(&buf[..n]);
48            }
49            Err(e) => {
50                let _ = vfs::close(fd);
51                return Err(e);
52            }
53        }
54    }
55    let _ = vfs::close(fd);
56
57    Ok(Some(elf_data))
58}
59
60/// SYS_PROC_EXECVE (301): replace current process image.
61/// On success, does not return. On failure, returns an appropriate error code.
62/// This is the main syscall handler for execve, which performs the entire execve sequence:
63/// 1. Validate and read the executable image from the given path.
64/// 2. Create a new address space and load the ELF segments.
65/// 3. Set up the user stack with arguments, environment variables, and auxiliary vector.
66/// 4. Perform cleanup of the current process state (close fds, reset signals,
67///   clear TLS and TID pointer, etc) according to POSIX exec semantics.
68/// 5. Switch to the new address space and transfer control to the new image's entry point.
69/// The `setup_user_stack` function is a helper that performs step 3, which is complex enough to warrant its own function.
70/// The implementation assumes a simple model where sibling threads are not runnable during execve, which allows it to replace the entire address space without complex synchronization. This is a common approach in many kernels, but it does mean that multithreaded execve is not supported until the kernel can safely handle it.
71/// It also includes robust error handling to ensure that any failure during the execve sequence results in an appropriate error code without leaving the process in an inconsistent state.
72/// Note: This implementation does not currently support some features like setuid binaries, but it lays the groundwork for a full execve implementation with proper ELF loading and stack setup.
73///
74pub fn sys_execve(
75    frame: &mut SyscallFrame,
76    path_ptr: u64,
77    argv_ptr: u64,
78    envp_ptr: u64,
79) -> Result<u64, SyscallError> {
80    let current = current_task_clone().ok_or(SyscallError::PermissionDenied)?;
81
82    // Replacing a shared address space while sibling threads are still runnable
83    // is unsafe in the current model. Refuse multithreaded exec until the
84    // kernel can synchronize and reap sibling threads atomically.
85    if get_task_ids_in_tgid(current.tgid).len() > 1 {
86        return Err(SyscallError::NotSupported);
87    }
88
89    let mut path_buf = [0u8; 4096];
90    let path_slice = UserSliceRead::new(path_ptr, 4096).map_err(|_| SyscallError::Fault)?;
91
92    let mut len = 0;
93
94    loop {
95        if len >= 4096 {
96            return Err(SyscallError::ArgumentListTooLong);
97        } // Reused error code
98        let b = path_slice.read_u8(len).map_err(|_| SyscallError::Fault)?;
99        if b == 0 {
100            break;
101        }
102        path_buf[len] = b;
103        len += 1;
104    }
105    let path_str =
106        core::str::from_utf8(&path_buf[..len]).map_err(|_| SyscallError::InvalidArgument)?;
107
108    let owned_elf_data = read_exec_image(path_str)?;
109    let elf_data = owned_elf_data
110        .as_deref()
111        .or_else(|| crate::vfs::get_initfs_file_bytes(path_str))
112        .ok_or(SyscallError::NotFound)?;
113
114    if elf_data.len() < 4 {
115        return Err(SyscallError::ExecFormatError);
116    }
117
118    let new_as = AddressSpace::new_user().map_err(|_| SyscallError::OutOfMemory)?;
119    let new_as_arc = alloc::sync::Arc::new(new_as);
120    new_as_arc.set_owner_pid(current.pid);
121
122    let load_info =
123        load_elf_image(elf_data, &new_as_arc).map_err(|_| SyscallError::ExecFormatError)?;
124
125    let stack_flags = VmaFlags {
126        readable: true,
127        writable: true,
128        executable: false,
129        user_accessible: true,
130    };
131    new_as_arc
132        .map_region(
133            USER_STACK_BASE,
134            USER_STACK_PAGES,
135            stack_flags,
136            VmaType::Stack,
137            VmaPageSize::Small,
138        )
139        .map_err(|_| SyscallError::OutOfMemory)?;
140
141    let sp = setup_user_stack(
142        &new_as_arc,
143        argv_ptr,
144        envp_ptr,
145        &load_info,
146        path_str.as_bytes(),
147    )?;
148
149    // TLS setup (Variant II) if the ELF has a PT_TLS segment.
150    let mut new_fs_base = 0u64;
151    if load_info.tls_memsz > 0 {
152        let tls_align = core::cmp::max(load_info.tls_align, 8).next_power_of_two();
153        let aligned_memsz = (load_info.tls_memsz + tls_align - 1) & !(tls_align - 1);
154        let total_size = aligned_memsz + 8;
155        let n_pages = ((total_size + 4095) / 4096) as usize;
156        let tls_flags = VmaFlags {
157            readable: true,
158            writable: true,
159            executable: false,
160            user_accessible: true,
161        };
162        let tls_base = new_as_arc
163            .find_free_vma_range(0x7FFF_E000_0000, n_pages, VmaPageSize::Small)
164            .ok_or(SyscallError::OutOfMemory)?;
165        new_as_arc
166            .map_region(
167                tls_base,
168                n_pages,
169                tls_flags,
170                VmaType::Anonymous,
171                VmaPageSize::Small,
172            )
173            .map_err(|_| SyscallError::OutOfMemory)?;
174        if load_info.tls_filesz > 0 && load_info.tls_vaddr != 0 {
175            let src_vaddr = load_info.tls_vaddr;
176            let mut off = 0u64;
177            let mut tmp = [0u8; 256];
178            while off < load_info.tls_filesz {
179                let chunk = core::cmp::min(256, (load_info.tls_filesz - off) as usize);
180                crate::process::elf::read_user_mapped_bytes_pub(
181                    &new_as_arc,
182                    src_vaddr + off,
183                    &mut tmp[..chunk],
184                )
185                .map_err(|_| SyscallError::Fault)?;
186                crate::process::elf::write_user_mapped_bytes_pub(
187                    &new_as_arc,
188                    tls_base + off,
189                    &tmp[..chunk],
190                )
191                .map_err(|_| SyscallError::Fault)?;
192                off += chunk as u64;
193            }
194        }
195        let tp = tls_base + aligned_memsz;
196        crate::process::elf::write_user_u64_pub(&new_as_arc, tp, tp)
197            .map_err(|_| SyscallError::Fault)?;
198        new_fs_base = tp;
199    }
200
201    // === EXECVE CLEANUP (POSIX semantics) ===
202    // Now that ELF is valid and loaded, perform cleanup before switching address space.
203
204    // 1. Close all file descriptors with CLOEXEC flag
205    unsafe {
206        let fd_table = &mut *current.process.fd_table.get();
207        fd_table.close_cloexec();
208    }
209
210    // 2. Reset all signal handlers to SIG_DFL
211    current.reset_signals();
212
213    // 2b. POSIX: exec disables the alternate signal stack for the new image.
214    unsafe {
215        *current.signal_stack.get() = None;
216    }
217
218    // 3. Clear thread-local storage address and TID pointer : POSIX exec semantics.
219    current
220        .clear_child_tid
221        .store(0, core::sync::atomic::Ordering::Relaxed);
222    current
223        .user_fs_base
224        .store(new_fs_base, core::sync::atomic::Ordering::Relaxed);
225
226    // 4. Reset memory layout: brk and mmap_hint belong to the old image.
227    current
228        .process
229        .brk
230        .store(0, core::sync::atomic::Ordering::Relaxed);
231    current
232        .process
233        .mmap_hint
234        .store(0x0000_0000_6000_0000, core::sync::atomic::Ordering::Relaxed);
235    // Set FS.base MSR for the new image TLS (or 0 if no PT_TLS).
236    unsafe {
237        let lo = new_fs_base as u32;
238        let hi = (new_fs_base >> 32) as u32;
239        core::arch::asm!(
240            "mov ecx, 0xC0000100", // MSR_FS_BASE
241            "wrmsr",
242            in("eax") lo,
243            in("edx") hi,
244            options(nostack, preserves_flags),
245        );
246    }
247
248    let old_as = current.process.replace_address_space(new_as_arc.clone());
249
250    unsafe {
251        current.process.address_space_arc().switch_to();
252    }
253
254    frame.iret_rip = load_info.runtime_entry;
255    frame.iret_rsp = sp;
256    frame.iret_rflags = 0x200; // IF=1, clean slate for the new image
257
258    frame.rdi = 0;
259    frame.rsi = 0;
260    frame.rdx = 0;
261    frame.rcx = 0;
262    frame.r8 = 0;
263    frame.r9 = 0;
264    frame.r10 = 0;
265    frame.r11 = 0;
266    frame.rbx = 0;
267    frame.rbp = 0;
268    frame.r12 = 0;
269    frame.r13 = 0;
270    frame.r14 = 0;
271    frame.r15 = 0;
272    frame.rax = 0;
273
274    // Safely drop the old address space now that the new CR3 is loaded
275    drop(old_as);
276
277    Ok(0)
278}
279
280/// Performs the setup user stack operation.
281fn setup_user_stack(
282    new_as: &AddressSpace,
283    argv_ptr: u64,
284    envp_ptr: u64,
285    elf_info: &LoadedElfInfo,
286    exec_path: &[u8],
287) -> Result<u64, SyscallError> {
288    let args = read_string_array(argv_ptr)?;
289    let envs = read_string_array(envp_ptr)?;
290
291    let mut sp = USER_STACK_TOP;
292    let mut str_ptrs: Vec<u64> = Vec::with_capacity(args.len()); // stores pointers to arguments
293    let mut env_ptrs: Vec<u64> = Vec::with_capacity(envs.len()); // stores pointers to env vars
294
295    // Push strings to stack (highest addresses)
296    // We push them in reverse order so they appear in memory roughly sequentially for cache locality?
297    // Actually standard is to put them at very top. Order doesn't strictly matter as long as pointers are correct.
298    // We'll push ENV strings first (highest), then ARG strings.
299
300    // Push ENV strings
301    for env in envs.iter().rev() {
302        let len = (env.len() + 1) as u64;
303        sp -= len;
304        write_bytes_to_as(new_as, sp, env)?;
305        write_bytes_to_as(new_as, sp + env.len() as u64, &[0])?;
306        env_ptrs.push(sp);
307    }
308    // env_ptrs: [ptr_to_highest_env, ptr_to_second_highest...] which corresponds to [env[last], env[last-1]...]
309    // Userspace expects envp[0] to point to first env string.
310    // So we need to reverse env_ptrs to match original order.
311    env_ptrs.reverse();
312
313    // Push ARG strings
314    for arg in args.iter().rev() {
315        let len = (arg.len() + 1) as u64;
316        sp -= len;
317        write_bytes_to_as(new_as, sp, arg)?;
318        write_bytes_to_as(new_as, sp + arg.len() as u64, &[0])?;
319        str_ptrs.push(sp);
320    }
321    str_ptrs.reverse();
322
323    // Push exec path (for AT_EXECFN).
324    let mut execfn_ptr = 0u64;
325    if !exec_path.is_empty() {
326        let len = (exec_path.len() + 1) as u64;
327        sp -= len;
328        write_bytes_to_as(new_as, sp, exec_path)?;
329        write_bytes_to_as(new_as, sp + exec_path.len() as u64, &[0])?;
330        execfn_ptr = sp;
331    }
332
333    // Push 16 bytes of random seed for AT_RANDOM (deterministic fallback source).
334    sp -= 16;
335    let rand_ptr = sp;
336    let seed = generate_aux_random_seed();
337    write_bytes_to_as(new_as, rand_ptr, &seed)?;
338
339    // Align SP to 16 bytes for System V ABI
340    sp &= !0xF;
341
342    // Phase 2: Push auxv, pointer arrays, then argc.
343    let size_ptr = 8u64;
344
345    // auxv entries end with AT_NULL.
346    let mut auxv: Vec<(u64, u64)> = Vec::with_capacity(10);
347    auxv.push((AT_PHDR, elf_info.phdr_vaddr));
348    auxv.push((AT_PHENT, elf_info.phent as u64));
349    auxv.push((AT_PHNUM, elf_info.phnum as u64));
350    auxv.push((AT_PAGESZ, 4096));
351    if let Some(base) = elf_info.interp_base {
352        auxv.push((AT_BASE, base));
353    }
354    auxv.push((AT_ENTRY, elf_info.program_entry));
355    auxv.push((AT_RANDOM, rand_ptr));
356    if execfn_ptr != 0 {
357        auxv.push((AT_EXECFN, execfn_ptr));
358    }
359
360    // AT_NULL terminator.
361    sp -= size_ptr;
362    write_u64_to_as(new_as, sp, 0)?;
363    sp -= size_ptr;
364    write_u64_to_as(new_as, sp, AT_NULL)?;
365    for &(key, val) in auxv.iter().rev() {
366        sp -= size_ptr;
367        write_u64_to_as(new_as, sp, val)?;
368        sp -= size_ptr;
369        write_u64_to_as(new_as, sp, key)?;
370    }
371
372    // Push ENVP array
373    // [NULL]
374    // [envp[n]]
375    // ...
376    // [envp[0]]
377    sp -= size_ptr;
378    write_u64_to_as(new_as, sp, 0)?; // NULL terminator
379
380    for &ptr in env_ptrs.iter().rev() {
381        sp -= size_ptr;
382        write_u64_to_as(new_as, sp, ptr)?;
383    }
384    // Note: sp now points to envp[0]
385
386    // Push ARGV array
387    // [NULL]
388    // [argv[n]]
389    // ...
390    // [argv[0]]
391    sp -= size_ptr;
392    write_u64_to_as(new_as, sp, 0)?; // NULL terminator
393
394    for &ptr in str_ptrs.iter().rev() {
395        sp -= size_ptr;
396        write_u64_to_as(new_as, sp, ptr)?;
397    }
398    // Note: sp now points to argv[0]
399
400    // Push ARGC
401    sp -= size_ptr;
402    write_u64_to_as(new_as, sp, args.len() as u64)?;
403
404    Ok(sp)
405}
406
407/// Reads string array.
408fn read_string_array(ptr: u64) -> Result<Vec<Vec<u8>>, SyscallError> {
409    let mut res = Vec::new();
410    if ptr == 0 {
411        return Ok(res);
412    }
413
414    let mut arr_off = 0;
415    loop {
416        // Read string pointer from user memory (current AS)
417        let str_ptr = match UserSliceRead::new(ptr + arr_off, 8) {
418            Ok(slice) => match slice.read_u64(0) {
419                Ok(p) => p,
420                Err(_) => return Err(SyscallError::Fault),
421            },
422            Err(_) => return Err(SyscallError::Fault),
423        };
424
425        if str_ptr == 0 {
426            break;
427        }
428        if res.len() > 1024 {
429            return Err(SyscallError::ArgumentListTooLong);
430        }
431
432        let mut s = Vec::new();
433        let mut i = 0;
434        loop {
435            if i > 4096 {
436                return Err(SyscallError::ArgumentListTooLong);
437            }
438            let b = match UserSliceRead::new(str_ptr + i, 1) {
439                Ok(slice) => match slice.read_u8(0) {
440                    Ok(byte) => byte,
441                    Err(_) => return Err(SyscallError::Fault),
442                },
443                Err(_) => return Err(SyscallError::Fault),
444            };
445            if b == 0 {
446                break;
447            }
448            s.push(b);
449            i += 1;
450        }
451        res.push(s);
452        arr_off += 8;
453    }
454    Ok(res)
455}
456
457/// Writes bytes to as.
458fn write_bytes_to_as(as_ref: &AddressSpace, vaddr: u64, data: &[u8]) -> Result<(), SyscallError> {
459    use x86_64::VirtAddr;
460    let mut written = 0;
461    // We assume data is small enough or we loop? Using unsafe pointer arithmetic.
462    // The `AddressSpace` methods like `translate` are needed.
463
464    // Since `load_elf_image` in `elf.rs` used `translate`, we should verify visibility.
465    // `AddressSpace` is usually public. `translate` is on `Mapper` trait?
466    // `AddressSpace` in `strat9` likely implements `Mapper` or has it.
467    // `elf.rs` used `user_as.translate(...)`.
468
469    // I need to import Translate? `AddressSpace` usually has `translate`.
470
471    while written < data.len() {
472        let curr_vaddr = vaddr + written as u64;
473        let page_offset = (curr_vaddr & 0xFFF) as usize;
474        let chunk_size = core::cmp::min(data.len() - written, 4096 - page_offset);
475
476        // translate might fail if page not mapped.
477        // `USER_STACK_BASE`..`USER_STACK_TOP` is mapped.
478        let phys = as_ref
479            .translate(VirtAddr::new(curr_vaddr))
480            .ok_or(SyscallError::Fault)?;
481        let virt = crate::memory::phys_to_virt(phys.as_u64()) as *mut u8;
482
483        unsafe {
484            core::ptr::copy_nonoverlapping(data.as_ptr().add(written), virt, chunk_size);
485        }
486        written += chunk_size;
487    }
488    Ok(())
489}
490
491/// Writes u64 to as.
492fn write_u64_to_as(as_ref: &AddressSpace, vaddr: u64, val: u64) -> Result<(), SyscallError> {
493    let bytes = val.to_ne_bytes();
494    write_bytes_to_as(as_ref, vaddr, &bytes)
495}
496
497/// Performs the generate aux random seed operation.
498fn generate_aux_random_seed() -> [u8; 16] {
499    use x86_64::registers::control::Cr3;
500    let mut s = [0u8; 16];
501    let t = crate::process::scheduler::ticks();
502    let (cr3, _) = Cr3::read();
503    let x = t
504        ^ (cr3
505            .start_address()
506            .as_u64()
507            .wrapping_mul(0x9e37_79b9_7f4a_7c15));
508    s[..8].copy_from_slice(&x.to_le_bytes());
509    s[8..].copy_from_slice(&(x.rotate_left(17) ^ 0xa076_1d64_78bd_642f).to_le_bytes());
510    s
511}