// strat9_kernel/process/elf.rs

//! ELF64 loader for Strat9-OS.
//!
//! Parses ELF64 headers and loads PT_LOAD segments into a user address space,
//! then creates a kernel task that trampolines into Ring 3 via IRETQ.
//!
//! Supports:
//!   - ET_EXEC
//!   - ET_DYN (PIE/static-PIE)
//!   - ELF64 little-endian x86_64 binaries.
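//!
//! A minimal usage sketch (the `initrd` helper below is hypothetical and only
//! illustrates the calling convention):
//!
//! ```ignore
//! let bytes: &[u8] = initrd::file_bytes("/bin/init"); // hypothetical helper
//! let tid = load_and_run_elf(bytes, "init")?;
//! log::info!("spawned init as task {}", tid.as_u64());
//! ```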

use alloc::{sync::Arc, vec::Vec};
use x86_64::{
    structures::paging::{Mapper, Page, Size4KiB},
    VirtAddr,
};

use crate::{
    capability::Capability,
    memory::address_space::{AddressSpace, VmaFlags, VmaPageSize, VmaType},
    process::{
        task::{CpuContext, KernelStack, SyncUnsafeCell, Task},
        TaskId, TaskPriority, TaskState,
    },
};

// ---------------------------------------------------------------------------
// ELF64 constants
// ---------------------------------------------------------------------------

const ELF_MAGIC: [u8; 4] = [0x7F, b'E', b'L', b'F'];
const ELFCLASS64: u8 = 2;
const ELFDATA2LSB: u8 = 1;
const ET_EXEC: u16 = 2;
const ET_DYN: u16 = 3;
const EV_CURRENT: u32 = 1;
const EM_X86_64: u16 = 62;
const PT_LOAD: u32 = 1;
const PT_DYNAMIC: u32 = 2;
const PT_INTERP: u32 = 3;
const PT_TLS: u32 = 7;
const PF_X: u32 = 1;
const PF_W: u32 = 2;
const PF_R: u32 = 4;
const DT_NULL: i64 = 0;
const DT_RELA: i64 = 7;
const DT_RELASZ: i64 = 8;
const DT_RELAENT: i64 = 9;
const DT_STRTAB: i64 = 5;
const DT_SYMTAB: i64 = 6;
const DT_SYMENT: i64 = 11;
const DT_JMPREL: i64 = 23;
const DT_PLTRELSZ: i64 = 2;
const DT_PLTREL: i64 = 20;
const DT_RELACOUNT: i64 = 0x6fff_fff9;
const DT_RELR: i64 = 36;
const DT_RELRSZ: i64 = 35;
const DT_RELRENT: i64 = 37;
const R_X86_64_RELATIVE: u32 = 8;
const R_X86_64_64: u32 = 1;
const R_X86_64_COPY: u32 = 5;
const R_X86_64_GLOB_DAT: u32 = 6;
const R_X86_64_JUMP_SLOT: u32 = 7;
const R_X86_64_TPOFF64: u32 = 18;
const R_X86_64_IRELATIVE: u32 = 37;

/// Maximum virtual address we accept for user-space mappings.
pub const USER_ADDR_MAX: u64 = 0x0000_8000_0000_0000;
/// Preferred base when placing ET_DYN (PIE) images.
const PIE_BASE_ADDR: u64 = 0x0000_0001_0000_0000;

/// User stack location (below the non-canonical gap).
pub const USER_STACK_BASE: u64 = 0x0000_7FFF_F000_0000;
/// Number of 4 KiB pages for the user stack (16 pages = 64 KiB).
pub const USER_STACK_PAGES: usize = 16;
/// Top of the user stack (stack grows down).
pub const USER_STACK_TOP: u64 = USER_STACK_BASE + (USER_STACK_PAGES as u64) * 4096;

/// Result of loading an ELF image into an address space.
#[derive(Debug, Clone, Copy)]
pub struct LoadedElfInfo {
    pub runtime_entry: u64,
    pub program_entry: u64,
    pub phdr_vaddr: u64,
    pub phent: u16,
    pub phnum: u16,
    pub interp_base: Option<u64>,
    pub tls_vaddr: u64,
    pub tls_filesz: u64,
    pub tls_memsz: u64,
    pub tls_align: u64,
}

// ---------------------------------------------------------------------------
// ELF64 header structures
// ---------------------------------------------------------------------------

/// ELF64 file header (64 bytes).
#[repr(C, packed)]
#[derive(Debug, Clone, Copy)]
struct Elf64Header {
    e_ident: [u8; 16],
    e_type: u16,
    e_machine: u16,
    e_version: u32,
    e_entry: u64,
    e_phoff: u64,
    e_shoff: u64,
    e_flags: u32,
    e_ehsize: u16,
    e_phentsize: u16,
    e_phnum: u16,
    e_shentsize: u16,
    e_shnum: u16,
    e_shstrndx: u16,
}

/// ELF64 program header (56 bytes).
#[repr(C, packed)]
#[derive(Debug, Clone, Copy)]
struct Elf64Phdr {
    p_type: u32,
    p_flags: u32,
    p_offset: u64,
    p_vaddr: u64,
    p_paddr: u64,
    p_filesz: u64,
    p_memsz: u64,
    p_align: u64,
}

#[repr(C, packed)]
#[derive(Debug, Clone, Copy)]
struct Elf64Dyn {
    d_tag: i64,
    d_val: u64,
}

#[repr(C, packed)]
#[derive(Debug, Clone, Copy)]
struct Elf64Rela {
    r_offset: u64,
    r_info: u64,
    r_addend: i64,
}

#[repr(C, packed)]
#[derive(Debug, Clone, Copy)]
struct Elf64Sym {
    st_name: u32,
    st_info: u8,
    st_other: u8,
    st_shndx: u16,
    st_value: u64,
    st_size: u64,
}

// ---------------------------------------------------------------------------
// Parsing
// ---------------------------------------------------------------------------

/// Parse and validate the ELF64 file header from raw bytes.
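///
/// A minimal sketch of the rejection behavior (buffer contents are
/// illustrative; only the magic is set, so validation still fails later):
///
/// ```ignore
/// let mut img = alloc::vec![0u8; 128];
/// img[..4].copy_from_slice(&[0x7F, b'E', b'L', b'F']);
/// assert!(parse_header(&img[..8]).is_err()); // too small for a header
/// assert!(parse_header(&img).is_err());      // EI_CLASS is 0, not ELFCLASS64
/// ```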
fn parse_header(data: &[u8]) -> Result<Elf64Header, &'static str> {
    if data.len() < core::mem::size_of::<Elf64Header>() {
        return Err("ELF data too small for header");
    }

    // SAFETY: data is large enough and Elf64Header is repr(C, packed) with no
    // alignment requirements beyond 1.
    let header: Elf64Header =
        unsafe { core::ptr::read_unaligned(data.as_ptr() as *const Elf64Header) };

    // Validate magic
    if header.e_ident[0..4] != ELF_MAGIC {
        return Err("Bad ELF magic");
    }

    // Class: 64-bit
    if header.e_ident[4] != ELFCLASS64 {
        return Err("Not ELF64");
    }

    // Data: little-endian
    if header.e_ident[5] != ELFDATA2LSB {
        return Err("Not little-endian ELF");
    }

    // Machine: x86_64
    if header.e_machine != EM_X86_64 {
        return Err("Not x86_64 ELF");
    }

    // Type: executable or shared object (PIE/static PIE executable image)
    if header.e_type != ET_EXEC && header.e_type != ET_DYN {
        return Err("Unsupported ELF type (expected ET_EXEC or ET_DYN)");
    }

    // ELF version
    if header.e_version != EV_CURRENT {
        return Err("Unsupported ELF version");
    }

    // Entry point must lie in canonical user space (for ET_DYN it is
    // image-relative and validated again after relocation).
    if header.e_entry >= USER_ADDR_MAX {
        return Err("Entry point outside user address range");
    }
    // Some toolchains/images can emit ET_EXEC with e_entry=0.
    // We handle this case later by deriving a fallback entry from PT_LOAD|PF_X.

    // Sanity check program headers
    if header.e_phentsize as usize != core::mem::size_of::<Elf64Phdr>() {
        return Err("Unexpected phentsize");
    }

    let ph_end = (header.e_phoff as usize)
        .checked_add((header.e_phnum as usize) * (header.e_phentsize as usize))
        .ok_or("Program header table overflows")?;
    if ph_end > data.len() {
        return Err("Program headers extend past file");
    }

    Ok(header)
}

/// Iterate over program headers in the ELF.
fn program_headers<'a>(
    data: &'a [u8],
    header: &Elf64Header,
) -> impl Iterator<Item = Elf64Phdr> + 'a {
    let phoff = header.e_phoff as usize;
    let phsize = header.e_phentsize as usize;
    let phnum = header.e_phnum as usize;

    (0..phnum).map(move |i| {
        let offset = phoff + i * phsize;
        // SAFETY: parse_header already validated that all program headers fit
        // within `data`, and Elf64Phdr is packed (align 1).
        unsafe { core::ptr::read_unaligned(data.as_ptr().add(offset) as *const Elf64Phdr) }
    })
}

/// Parse the PT_INTERP interpreter path, if present.
fn parse_interp_path<'a>(
    elf_data: &'a [u8],
    phdrs: &[Elf64Phdr],
) -> Result<Option<&'a str>, &'static str> {
    let Some(interp) = phdrs.iter().find(|ph| ph.p_type == PT_INTERP) else {
        return Ok(None);
    };
    if interp.p_filesz == 0 {
        return Err("PT_INTERP has empty path");
    }
    let start = interp.p_offset as usize;
    let end = start
        .checked_add(interp.p_filesz as usize)
        .ok_or("PT_INTERP range overflow")?;
    if end > elf_data.len() {
        return Err("PT_INTERP extends past file");
    }
    let raw = &elf_data[start..end];
    let nul = raw
        .iter()
        .position(|&b| b == 0)
        .ok_or("PT_INTERP path is not NUL terminated")?;
    let s = core::str::from_utf8(&raw[..nul]).map_err(|_| "PT_INTERP path is not UTF-8")?;
    if s.is_empty() {
        return Err("PT_INTERP path is empty");
    }
    Ok(Some(s))
}

/// Find the user virtual address of the program header table after loading:
/// locate the PT_LOAD segment whose file range covers `e_phoff`, then compute
/// `p_vaddr + (e_phoff - p_offset) + load_bias`.
fn find_relocated_phdr_vaddr(
    header: &Elf64Header,
    phdrs: &[Elf64Phdr],
    load_bias: u64,
) -> Result<u64, &'static str> {
    let phoff = header.e_phoff;
    for ph in phdrs {
        if ph.p_type != PT_LOAD || ph.p_filesz == 0 {
            continue;
        }
        let file_start = ph.p_offset;
        let file_end = ph
            .p_offset
            .checked_add(ph.p_filesz)
            .ok_or("PHDR location overflow")?;
        if phoff >= file_start && phoff < file_end {
            let delta = phoff - file_start;
            let vaddr = ph
                .p_vaddr
                .checked_add(delta)
                .and_then(|v| v.checked_add(load_bias))
                .ok_or("Relocated PHDR address overflow")?;
            if vaddr >= USER_ADDR_MAX {
                return Err("Relocated PHDR outside user address space");
            }
            return Ok(vaddr);
        }
    }
    Err("Program headers are not covered by a PT_LOAD segment")
}

/// Read an entire ELF file from the VFS into a heap buffer (capped at 64 MiB).
fn read_elf_from_vfs(path: &str) -> Result<Vec<u8>, &'static str> {
    const MAX_ELF_SIZE: usize = 64 * 1024 * 1024;
    let fd =
        crate::vfs::open(path, crate::vfs::OpenFlags::READ).map_err(|_| "PT_INTERP open failed")?;
    let mut out = Vec::new();
    let mut buf = [0u8; 4096];
    loop {
        let n = match crate::vfs::read(fd, &mut buf) {
            Ok(n) => n,
            Err(_) => {
                let _ = crate::vfs::close(fd);
                return Err("PT_INTERP read failed");
            }
        };
        if n == 0 {
            break;
        }
        if out.len().saturating_add(n) > MAX_ELF_SIZE {
            let _ = crate::vfs::close(fd);
            return Err("PT_INTERP file too large");
        }
        out.extend_from_slice(&buf[..n]);
    }
    let _ = crate::vfs::close(fd);
    if out.is_empty() {
        return Err("PT_INTERP file is empty");
    }
    Ok(out)
}

/// Compute total mapped bounds for all PT_LOAD segments.
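///
/// Worked example (hypothetical segments): PT_LOAD at vaddr 0x1200 with
/// memsz 0x100 and PT_LOAD at vaddr 0x3000 with memsz 0x2100 produce the
/// page-aligned bounds (0x1000, 0x6000).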
fn compute_load_bounds(phdrs: &[Elf64Phdr]) -> Result<(u64, u64), &'static str> {
    let mut min_vaddr = u64::MAX;
    let mut max_vaddr = 0u64;
    let mut saw_load = false;

    for phdr in phdrs {
        if phdr.p_type != PT_LOAD {
            continue;
        }
        if phdr.p_memsz == 0 {
            continue;
        }
        saw_load = true;

        if phdr.p_memsz < phdr.p_filesz {
            return Err("PT_LOAD memsz < filesz");
        }

        // ELF requires p_vaddr % page == p_offset % page for PT_LOAD.
        if ((phdr.p_vaddr ^ phdr.p_offset) & 0xFFF) != 0 {
            return Err("PT_LOAD alignment mismatch (vaddr/offset)");
        }

        let seg_end = phdr
            .p_vaddr
            .checked_add(phdr.p_memsz)
            .ok_or("PT_LOAD vaddr+memsz overflow")?;
        if seg_end > USER_ADDR_MAX {
            return Err("PT_LOAD exceeds user address space");
        }

        let seg_start_page = phdr.p_vaddr & !0xFFF;
        let seg_end_page = (seg_end + 0xFFF) & !0xFFF;
        min_vaddr = min_vaddr.min(seg_start_page);
        max_vaddr = max_vaddr.max(seg_end_page);
    }

    if !saw_load {
        return Err("ELF has no PT_LOAD segments");
    }
    Ok((min_vaddr, max_vaddr))
}

/// Compute load bias and relocated entry for ET_EXEC / ET_DYN.
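///
/// For ET_EXEC the bias is 0; for ET_DYN it is `load_base - min_vaddr`, where
/// `load_base` comes from the address-space allocator, so every segment lands
/// at `p_vaddr + load_bias`. The returned entry is `e_entry + load_bias` (or
/// a fallback executable-segment vaddr for ET_EXEC images with a null entry).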
fn compute_load_bias_and_entry(
    user_as: &AddressSpace,
    header: &Elf64Header,
    phdrs: &[Elf64Phdr],
) -> Result<(u64, u64), &'static str> {
    let (min_vaddr, max_vaddr) = compute_load_bounds(phdrs)?;
    let span = max_vaddr
        .checked_sub(min_vaddr)
        .ok_or("Invalid PT_LOAD bounds")?;

    let load_bias = if header.e_type == ET_EXEC {
        0
    } else {
        let n_pages = (span as usize).div_ceil(4096);
        let load_base = user_as
            .find_free_vma_range(PIE_BASE_ADDR, n_pages, VmaPageSize::Small)
            .or_else(|| {
                user_as.find_free_vma_range(0x0000_0000_1000_0000, n_pages, VmaPageSize::Small)
            })
            .ok_or("No virtual range for ET_DYN image")?;
        load_base
            .checked_sub(min_vaddr)
            .ok_or("ET_DYN load bias underflow")?
    };

    let relocated_end = max_vaddr
        .checked_add(load_bias)
        .ok_or("Relocated PT_LOAD range overflow")?;
    if relocated_end > USER_ADDR_MAX {
        return Err("Relocated PT_LOAD range exceeds user space");
    }

    let entry_raw = if header.e_type == ET_EXEC && header.e_entry == 0 {
        let fallback = phdrs
            .iter()
            .find(|ph| ph.p_type == PT_LOAD && ph.p_memsz != 0 && (ph.p_flags & PF_X) != 0)
            .map(|ph| ph.p_vaddr)
            .ok_or("ET_EXEC has null entry and no executable PT_LOAD")?;
        log::warn!(
            "[elf] ET_EXEC has null entry, using fallback executable segment vaddr={:#x}",
            fallback
        );
        fallback
    } else {
        header.e_entry
    };

    let relocated_entry = entry_raw
        .checked_add(load_bias)
        .ok_or("Relocated entry overflow")?;
    if relocated_entry == 0 || relocated_entry >= USER_ADDR_MAX {
        return Err("Relocated entry outside user space");
    }

    Ok((load_bias, relocated_entry))
}

/// Restore the final ELF segment permissions on pages that were mapped
/// writable during the copy.
fn apply_segment_permissions(
    user_as: &AddressSpace,
    page_start: u64,
    page_count: usize,
    flags: VmaFlags,
) -> Result<(), &'static str> {
    use x86_64::registers::control::Cr3;

    let pte_flags = flags.to_page_flags();
    // SAFETY: loader owns this AddressSpace during image construction.
    let mut mapper = unsafe { user_as.mapper() };
    for i in 0..page_count {
        let vaddr = page_start
            .checked_add((i as u64) * 4096)
            .ok_or("Permission update address overflow")?;
        let page = Page::<Size4KiB>::from_start_address(VirtAddr::new(vaddr))
            .map_err(|_| "Invalid page while updating segment flags")?;
        // SAFETY: the page is already mapped by map_region for this segment.
        let _ = unsafe {
            mapper
                .update_flags(page, pte_flags)
                .map_err(|_| "Failed to update segment page flags")?
        };
        // We ignore the flush here and make a targeted flush decision below.
    }

    // During ELF loading we update a freshly-created user address space that is
    // not active on other CPUs.  Cross-CPU shootdowns here only add boot-time
    // latency and can time out while APs are not yet servicing IPIs.
    // If this address space is currently active on this CPU, local invalidation
    // is enough for the loader path.
    let (current_cr3, _) = Cr3::read();
    if current_cr3.start_address() == user_as.cr3() {
        let end = page_start + (page_count as u64) * 4096;
        let mut v = page_start;
        while v < end {
            unsafe {
                core::arch::asm!("invlpg [{}]", in(reg) v, options(nostack, preserves_flags));
            }
            v += 4096;
        }
    }

    Ok(())
}

/// Read bytes from already-mapped user memory, page by page, via the HHDM.
fn read_user_mapped_bytes(
    user_as: &AddressSpace,
    mut vaddr: u64,
    out: &mut [u8],
) -> Result<(), &'static str> {
    let end = vaddr
        .checked_add(out.len() as u64)
        .ok_or("Read range overflow")?;
    if end > USER_ADDR_MAX {
        return Err("Read range outside user space");
    }
    let mut copied = 0usize;
    while copied < out.len() {
        let page_off = (vaddr & 0xFFF) as usize;
        let chunk = core::cmp::min(out.len() - copied, 4096 - page_off);
        let phys = user_as
            .translate(VirtAddr::new(vaddr))
            .ok_or("Failed to translate mapped user bytes")?;
        let src = crate::memory::phys_to_virt(phys.as_u64()) as *const u8;
        // SAFETY: src points to mapped physical memory via HHDM.
        unsafe { core::ptr::copy_nonoverlapping(src, out.as_mut_ptr().add(copied), chunk) };
        copied += chunk;
        vaddr = vaddr
            .checked_add(chunk as u64)
            .ok_or("Virtual address overflow while reading mapped bytes")?;
    }
    Ok(())
}

/// Write bytes into already-mapped user memory, page by page, via the HHDM.
fn write_user_mapped_bytes(
    user_as: &AddressSpace,
    mut vaddr: u64,
    src: &[u8],
) -> Result<(), &'static str> {
    let end = vaddr
        .checked_add(src.len() as u64)
        .ok_or("Write range overflow")?;
    if end > USER_ADDR_MAX {
        return Err("Write range outside user space");
    }
    let mut written = 0usize;
    while written < src.len() {
        let page_off = (vaddr & 0xFFF) as usize;
        let chunk = core::cmp::min(src.len() - written, 4096 - page_off);
        let phys = user_as
            .translate(VirtAddr::new(vaddr))
            .ok_or("Failed to translate relocation target")?;
        let dst = crate::memory::phys_to_virt(phys.as_u64()) as *mut u8;
        // SAFETY: destination points to mapped user frame through HHDM.
        unsafe { core::ptr::copy_nonoverlapping(src.as_ptr().add(written), dst, chunk) };
        written += chunk;
        vaddr = vaddr
            .checked_add(chunk as u64)
            .ok_or("Virtual address overflow while writing mapped bytes")?;
    }
    Ok(())
}

/// Read a little-endian u64 from mapped user memory.
fn read_user_u64(user_as: &AddressSpace, vaddr: u64) -> Result<u64, &'static str> {
    let mut raw = [0u8; 8];
    read_user_mapped_bytes(user_as, vaddr, &mut raw)?;
    Ok(u64::from_le_bytes(raw))
}

/// Write a little-endian u64 into mapped user memory.
fn write_user_u64(user_as: &AddressSpace, vaddr: u64, value: u64) -> Result<(), &'static str> {
    write_user_mapped_bytes(user_as, vaddr, &value.to_le_bytes())
}

/// Apply DT_RELR (compressed relative) relocations.
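///
/// Encoding walk-through (illustrative values): an even entry 0x5000 adds the
/// load bias to the word at `bias + 0x5000` and leaves the running pointer at
/// `bias + 0x5008`; a following odd entry 0b111 is a bitmap whose set bits 0
/// and 1 relocate the words at `bias + 0x5008` and `bias + 0x5010`, after
/// which the pointer advances by 63 words.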
fn apply_relr_relocations(
    user_as: &AddressSpace,
    load_bias: u64,
    relr_base: u64,
    relr_size: usize,
    relr_ent: usize,
) -> Result<usize, &'static str> {
    if relr_size == 0 {
        return Ok(0);
    }
    if relr_ent != core::mem::size_of::<u64>() {
        return Err("Unsupported DT_RELRENT size");
    }
    if relr_size % relr_ent != 0 {
        return Err("DT_RELR table size is not aligned");
    }

    let count = relr_size / relr_ent;
    let mut applied = 0usize;
    let mut where_addr = 0u64;

    for i in 0..count {
        let entry_addr = relr_base
            .checked_add((i * relr_ent) as u64)
            .ok_or("DT_RELR walk overflow")?;
        let entry = read_user_u64(user_as, entry_addr)?;

        if (entry & 1) == 0 {
            where_addr = load_bias
                .checked_add(entry)
                .ok_or("DT_RELR absolute relocation overflow")?;
            if where_addr >= USER_ADDR_MAX {
                return Err("DT_RELR target outside user space");
            }
            let cur = read_user_u64(user_as, where_addr)?;
            write_user_u64(
                user_as,
                where_addr,
                cur.checked_add(load_bias)
                    .ok_or("DT_RELR relocated value overflow")?,
            )?;
            where_addr = where_addr
                .checked_add(8)
                .ok_or("DT_RELR where pointer overflow")?;
            applied += 1;
        } else {
            let mut bitmap = entry >> 1;
            for bit in 0..63u64 {
                if (bitmap & 1) != 0 {
                    let slot = where_addr
                        .checked_add(bit * 8)
                        .ok_or("DT_RELR bitmap target overflow")?;
                    if slot >= USER_ADDR_MAX {
                        return Err("DT_RELR bitmap target outside user space");
                    }
                    let cur = read_user_u64(user_as, slot)?;
                    write_user_u64(
                        user_as,
                        slot,
                        cur.checked_add(load_bias)
                            .ok_or("DT_RELR bitmap relocated value overflow")?,
                    )?;
                    applied += 1;
                }
                bitmap >>= 1;
                if bitmap == 0 {
                    break;
                }
            }
            where_addr = where_addr
                .checked_add(63 * 8)
                .ok_or("DT_RELR where advance overflow")?;
        }
    }
    Ok(applied)
}

/// Apply the dynamic relocations described by PT_DYNAMIC (RELA, PLT/JMPREL,
/// and RELR tables). ET_EXEC images are left untouched.
fn apply_dynamic_relocations(
    user_as: &AddressSpace,
    phdrs: &[Elf64Phdr],
    elf_type: u16,
    load_bias: u64,
) -> Result<(), &'static str> {
    if elf_type != ET_DYN {
        return Ok(());
    }

    let dynamic = phdrs.iter().find(|ph| ph.p_type == PT_DYNAMIC);
    let Some(dynamic_ph) = dynamic else {
        return Ok(());
    };
    if dynamic_ph.p_filesz == 0 {
        return Ok(());
    }

    let dyn_addr = dynamic_ph
        .p_vaddr
        .checked_add(load_bias)
        .ok_or("PT_DYNAMIC relocated address overflow")?;
    let dyn_count = (dynamic_ph.p_filesz as usize) / core::mem::size_of::<Elf64Dyn>();

    let mut rela_addr: Option<u64> = None;
    let mut rela_size: usize = 0;
    let mut rela_ent: usize = core::mem::size_of::<Elf64Rela>();
    let mut jmprel_addr: Option<u64> = None;
    let mut jmprel_size: usize = 0;
    let mut pltrel_kind: Option<u64> = None;
    let mut symtab_addr: Option<u64> = None;
    let mut sym_ent: usize = core::mem::size_of::<Elf64Sym>();
    let _strtab_addr: Option<u64> = None;
    let mut rela_count_hint: Option<usize> = None;
    let mut relr_addr: Option<u64> = None;
    let mut relr_size: usize = 0;
    let mut relr_ent: usize = 0;

    for i in 0..dyn_count {
        let entry_addr = dyn_addr
            .checked_add((i * core::mem::size_of::<Elf64Dyn>()) as u64)
            .ok_or("PT_DYNAMIC walk overflow")?;
        let mut raw = [0u8; core::mem::size_of::<Elf64Dyn>()];
        read_user_mapped_bytes(user_as, entry_addr, &mut raw)?;
        // SAFETY: raw has exact size of Elf64Dyn; read_unaligned handles packing.
        let dyn_entry = unsafe { core::ptr::read_unaligned(raw.as_ptr() as *const Elf64Dyn) };

        match dyn_entry.d_tag {
            DT_NULL => break,
            DT_RELA => {
                rela_addr = Some(
                    dyn_entry
                        .d_val
                        .checked_add(load_bias)
                        .ok_or("DT_RELA relocated address overflow")?,
                )
            }
            DT_RELASZ => rela_size = dyn_entry.d_val as usize,
            DT_RELAENT => rela_ent = dyn_entry.d_val as usize,
            DT_RELACOUNT => rela_count_hint = Some(dyn_entry.d_val as usize),
            DT_JMPREL => {
                jmprel_addr = Some(
                    dyn_entry
                        .d_val
                        .checked_add(load_bias)
                        .ok_or("DT_JMPREL relocated address overflow")?,
                )
            }
            DT_PLTRELSZ => jmprel_size = dyn_entry.d_val as usize,
            DT_PLTREL => pltrel_kind = Some(dyn_entry.d_val),
            DT_SYMTAB => {
                symtab_addr = Some(
                    dyn_entry
                        .d_val
                        .checked_add(load_bias)
                        .ok_or("DT_SYMTAB relocated address overflow")?,
                )
            }
            DT_SYMENT => sym_ent = dyn_entry.d_val as usize,
            DT_STRTAB => {
                let _ = dyn_entry
                    .d_val
                    .checked_add(load_bias)
                    .ok_or("DT_STRTAB relocated address overflow")?;
            }
            DT_RELR => {
                relr_addr = Some(
                    dyn_entry
                        .d_val
                        .checked_add(load_bias)
                        .ok_or("DT_RELR relocated address overflow")?,
                )
            }
            DT_RELRSZ => relr_size = dyn_entry.d_val as usize,
            DT_RELRENT => relr_ent = dyn_entry.d_val as usize,
            _ => {}
        }
    }

    let mut relr_applied = 0usize;
    if let Some(relr_base) = relr_addr {
        relr_applied = apply_relr_relocations(user_as, load_bias, relr_base, relr_size, relr_ent)?;
    } else if relr_size != 0 || relr_ent != 0 {
        return Err("DT_RELR metadata present without DT_RELR base");
    }
    if rela_ent != core::mem::size_of::<Elf64Rela>() {
        return Err("Unsupported DT_RELAENT size");
    }
    if sym_ent != core::mem::size_of::<Elf64Sym>() {
        return Err("Unsupported DT_SYMENT size");
    }
    if pltrel_kind.is_some() && pltrel_kind != Some(DT_RELA as u64) {
        return Err("Only DT_PLTREL=DT_RELA is supported");
    }

    let read_sym_entry = |sym_idx: u32| -> Result<Elf64Sym, &'static str> {
        let symtab = symtab_addr.ok_or("Missing DT_SYMTAB for symbol relocations")?;
        let sym_addr = symtab
            .checked_add((sym_idx as u64) * (sym_ent as u64))
            .ok_or("Symbol table address overflow")?;
        let mut raw = [0u8; core::mem::size_of::<Elf64Sym>()];
        read_user_mapped_bytes(user_as, sym_addr, &mut raw)?;
        Ok(unsafe { core::ptr::read_unaligned(raw.as_ptr() as *const Elf64Sym) })
    };

    let resolve_symbol = |sym_idx: u32| -> Result<u64, &'static str> {
        if sym_idx == 0 {
            return Ok(0);
        }
        let sym = read_sym_entry(sym_idx)?;
        if sym.st_shndx == 0 {
            return Err("Undefined symbol relocation not supported");
        }
        sym.st_value
            .checked_add(load_bias)
            .ok_or("Symbol value relocation overflow")
    };

    let resolve_symbol_raw = |sym_idx: u32| -> Result<u64, &'static str> {
        if sym_idx == 0 {
            return Ok(0);
        }
        let sym = read_sym_entry(sym_idx)?;
        Ok(sym.st_value)
    };

    let resolve_symbol_size = |sym_idx: u32| -> Result<u64, &'static str> {
        if sym_idx == 0 {
            return Ok(0);
        }
        let sym = read_sym_entry(sym_idx)?;
        Ok(sym.st_size)
    };

    let apply_rela_table = |table_base: u64,
                            table_size: usize,
                            count_hint: Option<usize>|
     -> Result<usize, &'static str> {
        if table_size == 0 {
            return Ok(0);
        }
        let mut count = table_size / rela_ent;
        if let Some(hint) = count_hint {
            count = core::cmp::min(count, hint);
        }
        let mut applied = 0usize;
        for i in 0..count {
            let rela_addr_i = table_base
                .checked_add((i * rela_ent) as u64)
                .ok_or("Rela table overflow")?;
            let mut raw = [0u8; core::mem::size_of::<Elf64Rela>()];
            read_user_mapped_bytes(user_as, rela_addr_i, &mut raw)?;
            // SAFETY: raw has exact size of Elf64Rela.
            let rela = unsafe { core::ptr::read_unaligned(raw.as_ptr() as *const Elf64Rela) };

            let r_type = (rela.r_info & 0xffff_ffff) as u32;
            let r_sym = (rela.r_info >> 32) as u32;
            let target = rela
                .r_offset
                .checked_add(load_bias)
                .ok_or("Relocation target overflow")?;
            if target >= USER_ADDR_MAX {
                return Err("Relocation target outside user space");
            }

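            // Value formulas applied below (S = symbol value, A = addend,
            // B = load bias):
            //   RELATIVE                  -> B + A
            //   64 / GLOB_DAT / JUMP_SLOT -> S + A
            //   TPOFF64                   -> raw S + A (thread-pointer offset)
            //   IRELATIVE                 -> B + A; the ifunc resolver address
            //                                is stored, the resolver itself is
            //                                not executed by this loader.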
            let value = match r_type {
                R_X86_64_RELATIVE => {
                    if r_sym != 0 {
                        return Err("R_X86_64_RELATIVE with non-zero symbol");
                    }
                    (load_bias as i128)
                        .checked_add(rela.r_addend as i128)
                        .ok_or("Relocation value overflow")?
                }
                R_X86_64_GLOB_DAT | R_X86_64_JUMP_SLOT | R_X86_64_64 => {
                    let sym_val = resolve_symbol(r_sym)? as i128;
                    sym_val
                        .checked_add(rela.r_addend as i128)
                        .ok_or("Relocation value overflow")?
                }
                R_X86_64_COPY => {
                    let sym_val = resolve_symbol(r_sym)?;
                    if sym_val == 0 {
                        continue;
                    }
                    let sym_sz = resolve_symbol_size(r_sym)?;
                    if sym_sz > 0 && sym_val < USER_ADDR_MAX {
                        let mut tmp = [0u8; 256];
                        let mut off = 0usize;
                        while off < sym_sz as usize {
                            let chunk = core::cmp::min(256, sym_sz as usize - off);
                            read_user_mapped_bytes(
                                user_as,
                                sym_val + off as u64,
                                &mut tmp[..chunk],
                            )?;
                            write_user_mapped_bytes(user_as, target + off as u64, &tmp[..chunk])?;
                            off += chunk;
                        }
                    }
                    applied += 1;
                    continue;
                }
                R_X86_64_TPOFF64 => {
                    let sym_val = if r_sym != 0 {
                        resolve_symbol_raw(r_sym)? as i128
                    } else {
                        0i128
                    };
                    sym_val
                        .checked_add(rela.r_addend as i128)
                        .ok_or("TPOFF64 value overflow")?
                }
                R_X86_64_IRELATIVE => (load_bias as i128)
                    .checked_add(rela.r_addend as i128)
                    .ok_or("IRELATIVE value overflow")?,
                _ => {
                    log::warn!("[elf] Unsupported relocation type {}", r_type);
                    continue;
                }
            };
            if value < 0 || value > u64::MAX as i128 {
                return Err("Relocation value out of range");
            }
            write_user_mapped_bytes(user_as, target, &(value as u64).to_le_bytes())?;
            applied += 1;
        }
        Ok(applied)
    };

    let mut total_applied = 0usize;
    if let Some(rela_base) = rela_addr {
        total_applied += apply_rela_table(rela_base, rela_size, rela_count_hint)?;
    }
    if let Some(jmprel_base) = jmprel_addr {
        total_applied += apply_rela_table(jmprel_base, jmprel_size, None)?;
    }

    if total_applied > 0 {
        log::debug!("[elf] Applied {} RELA relocations", total_applied);
    }
    if relr_applied > 0 {
        log::debug!("[elf] Applied {} RELR relocations", relr_applied);
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// Loading
// ---------------------------------------------------------------------------

/// Convert ELF p_flags to VmaFlags.
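///
/// For example, `PF_R | PF_X` (0x5) yields readable + executable,
/// non-writable, user-accessible flags.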
fn elf_flags_to_vma(p_flags: u32) -> VmaFlags {
    VmaFlags {
        readable: p_flags & PF_R != 0,
        writable: p_flags & PF_W != 0,
        executable: p_flags & PF_X != 0,
        user_accessible: true,
    }
}

/// Load a single PT_LOAD segment into the given address space.
///
/// Allocates physical frames, maps them with appropriate permissions, and
/// copies file data into the mapping. BSS (memsz > filesz) is already
/// zero-filled because `map_region` zeroes newly allocated frames.
fn load_segment(
    user_as: &AddressSpace,
    elf_data: &[u8],
    phdr: &Elf64Phdr,
    load_bias: u64,
) -> Result<(), &'static str> {
    let vaddr = phdr
        .p_vaddr
        .checked_add(load_bias)
        .ok_or("PT_LOAD relocated vaddr overflow")?;
    let memsz = phdr.p_memsz;
    let filesz = phdr.p_filesz;
    let offset = phdr.p_offset;

    // Validate addresses are in user space
    if vaddr >= USER_ADDR_MAX {
        return Err("PT_LOAD vaddr outside user space");
    }
    let end = vaddr
        .checked_add(memsz)
        .ok_or("PT_LOAD vaddr+memsz overflows")?;
    if end > USER_ADDR_MAX {
        return Err("PT_LOAD segment extends past user space");
    }

    // Validate file region
    let file_end = (offset as usize)
        .checked_add(filesz as usize)
        .ok_or("PT_LOAD offset+filesz overflows")?;
    if file_end > elf_data.len() {
        return Err("PT_LOAD file data extends past ELF");
    }

    // Calculate page-aligned mapping
    let page_start = vaddr & !0xFFF;
    let page_end = (end + 0xFFF) & !0xFFF;
    let page_count = ((page_end - page_start) / 4096) as usize;

    // Map writable during copy, then restore final ELF flags.
    let actual_flags = elf_flags_to_vma(phdr.p_flags);
    let load_flags = VmaFlags {
        readable: true,
        writable: true, // Need write access to copy data in
        executable: actual_flags.executable,
        user_accessible: true,
    };

    let vma_type = if actual_flags.executable {
        VmaType::Code
    } else {
        VmaType::Anonymous
    };
    log::debug!(
        "[elf] map PT_LOAD: start={:#x} pages={} filesz={:#x}",
        page_start,
        page_count,
        filesz
    );
    user_as.map_region(
        page_start,
        page_count,
        load_flags,
        vma_type,
        VmaPageSize::Small,
    )?;

    // Copy file data into the mapped pages.
    // We translate each page through the user AS to find its physical frame,
    // then access it via HHDM to write.
    if filesz > 0 {
        let src = &elf_data[offset as usize..file_end];
        let mut copied = 0usize;

        while copied < src.len() {
            let dst_vaddr = vaddr + copied as u64;
            let page_offset = (dst_vaddr & 0xFFF) as usize;
            let chunk = core::cmp::min(src.len() - copied, 4096 - page_offset);

            // Translate user virtual address → physical → HHDM virtual
            let phys = user_as
                .translate(VirtAddr::new(dst_vaddr))
                .ok_or("Failed to translate user page after mapping")?;
            let hhdm_ptr = crate::memory::phys_to_virt(phys.as_u64()) as *mut u8;

            // SAFETY: hhdm_ptr points to a freshly mapped, zeroed frame via HHDM.
            // The source slice is validated above.
            unsafe {
                core::ptr::copy_nonoverlapping(src.as_ptr().add(copied), hhdm_ptr, chunk);
            }

            copied += chunk;
        }
    }

    // Tighten PTE permissions after copy.
    apply_segment_permissions(user_as, page_start, page_count, actual_flags)?;

    log::debug!(
        "  PT_LOAD: {:#x}..{:#x} ({} pages, file {:#x}+{:#x}, flags {:?})",
        page_start,
        page_end,
        page_count,
        offset,
        filesz,
        actual_flags,
    );

    Ok(())
}

// ---------------------------------------------------------------------------
// Task creation with IRETQ trampoline
// ---------------------------------------------------------------------------

/// Trampoline that switches to the user address space and does IRETQ to Ring 3.
///
/// Parameters (entry point, stack top, arg0, address space) are read from the
/// *current task* so that each ELF task carries its own copy.  This makes the
/// trampoline safe under SMP: two tasks can run their trampolines concurrently
/// on different CPUs without any shared mutable state.
extern "C" fn elf_ring3_trampoline() -> ! {
    use crate::arch::x86_64::gdt;
    use core::sync::atomic::Ordering;

    crate::serial_force_println!("[trace][elf] ring3_trampoline before current_task");
    let task = crate::process::scheduler::current_task_clone_spin_debug("ring3_trampoline")
        .expect("elf_ring3_trampoline: no current task");
    crate::serial_force_println!(
        "[trace][elf] ring3_trampoline enter tid={} name={}",
        task.id.as_u64(),
        task.name
    );

    let user_rip = task.trampoline_entry.load(Ordering::Acquire);
    let user_rsp = task.trampoline_stack_top.load(Ordering::Acquire);
    let user_arg0 = task.trampoline_arg0.load(Ordering::Acquire);
    crate::serial_force_println!(
        "[trace][elf] ring3_trampoline args tid={} rip={:#x} rsp={:#x} arg0={:#x}",
        task.id.as_u64(),
        user_rip,
        user_rsp,
        user_arg0
    );

    // Switch to the user address space stored in the task.
    // SAFETY: The address space was set up during task creation and is valid.
    unsafe {
        let as_ref = &*task.process.address_space.get();
        as_ref.switch_to();
    }
    crate::serial_force_println!(
        "[trace][elf] ring3_trampoline switch_to done tid={}",
        task.id.as_u64()
    );

    let user_cs = gdt::user_code_selector().0 as u64;
    let user_ss = gdt::user_data_selector().0 as u64;
    let user_rflags: u64 = 0x202; // IF=1, reserved bit 1 = 1
    crate::serial_force_println!(
        "[trace][elf] ring3_trampoline iret tid={} cs={:#x} ss={:#x} rflags={:#x}",
        task.id.as_u64(),
        user_cs,
        user_ss,
        user_rflags
    );

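    // IRETQ frame built below, pushed top-down: SS, RSP, RFLAGS, CS, RIP.
    // RDI carries arg0 (e.g. a bootstrap capability handle) to the entry
    // point, and SWAPGS restores the user GS base before the privilege switch.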
    // SAFETY: Valid user mappings have been set up. IRETQ switches to Ring 3.
    unsafe {
        core::arch::asm!(
            "push {ss}",
            "push {rsp_val}",
            "push {rflags}",
            "push {cs}",
            "push {rip}",
            "mov rdi, {arg0}",
            "swapgs",
            "iretq",
            ss = in(reg) user_ss,
            rsp_val = in(reg) user_rsp,
            rflags = in(reg) user_rflags,
            cs = in(reg) user_cs,
            rip = in(reg) user_rip,
            arg0 = in(reg) user_arg0,
            options(noreturn),
        );
    }
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// Load an ELF64 binary and schedule it as a Ring 3 user task.
///
/// # Arguments
/// * `elf_data` - Raw ELF file bytes (must remain valid until load completes).
/// * `name` - Name for the task (debugging purposes).
///
/// # Returns
/// `Ok(TaskId)` of the new task on success, `Err` with a static error message
/// on failure.
pub fn load_and_run_elf(elf_data: &[u8], name: &'static str) -> Result<TaskId, &'static str> {
    load_and_run_elf_with_caps(elf_data, name, &[])
}

/// Like [`load_and_run_elf`], but seeds the new task's capability table with
/// `seed_caps` before scheduling it.
pub fn load_and_run_elf_with_caps(
    elf_data: &[u8],
    name: &'static str,
    seed_caps: &[Capability],
) -> Result<TaskId, &'static str> {
    let task = load_elf_task_with_caps(elf_data, name, seed_caps)?;
    let task_id = task.id;
    let runtime_entry = task
        .trampoline_entry
        .load(core::sync::atomic::Ordering::Acquire);
    crate::process::add_task(task);

    log::info!(
        "[elf] Task '{}' created: entry={:#x}, stack_top={:#x}",
        name,
        runtime_entry,
        USER_STACK_TOP,
    );

    Ok(task_id)
}

const AT_PHDR: u64 = 3;
const AT_PHENT: u64 = 4;
const AT_PHNUM: u64 = 5;
const AT_PAGESZ: u64 = 6;
const AT_BASE: u64 = 7;
const AT_ENTRY: u64 = 9;
const AT_RANDOM: u64 = 25;

/// Push one auxv `(tag, value)` pair onto the user stack: the value word is
/// written first, then the tag below it, leaving `sp` at the tag.
fn push_auxv(user_as: &AddressSpace, sp: &mut u64, tag: u64, val: u64) -> Result<(), &'static str> {
    *sp -= 8;
    write_user_u64(user_as, *sp, val)?;
    *sp -= 8;
    write_user_u64(user_as, *sp, tag)?;
    Ok(())
}

/// Build the initial System V user stack: argv[0] string, AT_RANDOM bytes,
/// auxv, envp, argv, and argc. Returns the final 16-byte-aligned stack
/// pointer.
fn setup_boot_user_stack(
    user_as: &AddressSpace,
    name: &str,
    phdr_vaddr: u64,
    phent: u16,
    phnum: u16,
    program_entry: u64,
    interp_base: Option<u64>,
) -> Result<u64, &'static str> {
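    // Resulting layout, from the returned sp upward: argc, argv[0] pointer,
    // argv NULL, envp NULL, auxv pairs (AT_PHDR first, AT_NULL last), padding,
    // the 16 AT_RANDOM bytes, then the NUL-terminated argv[0] string just
    // below USER_STACK_TOP.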
    let mut sp = USER_STACK_TOP;

    let name_nul_len = (name.len() + 1) as u64;
    sp -= name_nul_len;
    let argv0_ptr = sp;
    write_user_mapped_bytes(user_as, sp, name.as_bytes())?;
    write_user_mapped_bytes(user_as, sp + name.len() as u64, &[0])?;

    sp -= 16;
    let random_ptr = sp;
    write_user_mapped_bytes(user_as, sp, &[0x42u8; 16])?;

    sp &= !0xF;

    // AT_NULL
    push_auxv(user_as, &mut sp, 0, 0)?;
    push_auxv(user_as, &mut sp, AT_RANDOM, random_ptr)?;
    push_auxv(user_as, &mut sp, AT_ENTRY, program_entry)?;
    if let Some(base) = interp_base {
        push_auxv(user_as, &mut sp, AT_BASE, base)?;
    }
    push_auxv(user_as, &mut sp, AT_PAGESZ, 4096)?;
    push_auxv(user_as, &mut sp, AT_PHNUM, phnum as u64)?;
    push_auxv(user_as, &mut sp, AT_PHENT, phent as u64)?;
    push_auxv(user_as, &mut sp, AT_PHDR, phdr_vaddr)?;

    // envp NULL terminator
    sp -= 8;
    write_user_u64(user_as, sp, 0)?;
    // argv[0], argv NULL terminator
    sp -= 8;
    write_user_u64(user_as, sp, 0)?;
    sp -= 8;
    write_user_u64(user_as, sp, argv0_ptr)?;
    // argc
    sp -= 8;
    write_user_u64(user_as, sp, 1)?;

    // System V ABI: %rsp % 16 == 0 at process entry
    sp &= !0xF;
    Ok(sp)
}

/// Load an ELF image into a fresh address space and construct its Ring 3
/// task with seeded capabilities, without scheduling it.
pub fn load_elf_task_with_caps(
    elf_data: &[u8],
    name: &'static str,
    seed_caps: &[Capability],
) -> Result<Arc<Task>, &'static str> {
    log::info!("[elf] Loading ELF '{}'...", name);

    // Step 1: Parse and validate ELF header
    let header = parse_header(elf_data)?;
    // Step 2: Create user address space
    let user_as = Arc::new(AddressSpace::new_user()?);

    let phdrs: Vec<Elf64Phdr> = program_headers(elf_data, &header).collect();
    let interp_path = parse_interp_path(elf_data, &phdrs)?;
    let (load_bias, entry) = compute_load_bias_and_entry(&user_as, &header, &phdrs)?;
    let phdr_vaddr = find_relocated_phdr_vaddr(&header, &phdrs, load_bias)?;

    let phnum = header.e_phnum;
    log::info!(
        "[elf] ELF '{}': type={}, entry={:#x}, bias={:#x}, {} program headers",
        name,
        if header.e_type == ET_DYN {
            "ET_DYN"
        } else {
            "ET_EXEC"
        },
        entry,
        load_bias,
        phnum,
    );

    // Step 3: Load all PT_LOAD segments
    let mut load_count = 0u32;
    for phdr in phdrs.iter() {
        if phdr.p_type == PT_LOAD && phdr.p_memsz != 0 {
            load_segment(&user_as, elf_data, phdr, load_bias)?;
            load_count += 1;
        }
    }
    if interp_path.is_none() {
        apply_dynamic_relocations(&user_as, &phdrs, header.e_type, load_bias)?;
    }

    log::info!("[elf] Loaded {} PT_LOAD segment(s)", load_count);

    let mut runtime_entry = entry;
    let mut interp_base: Option<u64> = None;
    if let Some(path) = interp_path {
        let interp_data = read_elf_from_vfs(path)?;
        let interp_header = parse_header(&interp_data)?;
        let interp_phdrs: Vec<Elf64Phdr> = program_headers(&interp_data, &interp_header).collect();
        if parse_interp_path(&interp_data, &interp_phdrs)?.is_some() {
            return Err("Nested PT_INTERP is not supported");
        }
        let (interp_bias, interp_entry) =
            compute_load_bias_and_entry(&user_as, &interp_header, &interp_phdrs)?;
        let (interp_min_vaddr, _) = compute_load_bounds(&interp_phdrs)?;
        let mut interp_load_count = 0u32;
        for phdr in interp_phdrs.iter() {
            if phdr.p_type == PT_LOAD && phdr.p_memsz != 0 {
                load_segment(&user_as, &interp_data, phdr, interp_bias)?;
                interp_load_count += 1;
            }
        }
        apply_dynamic_relocations(&user_as, &interp_phdrs, interp_header.e_type, interp_bias)?;
        runtime_entry = interp_entry;
        interp_base = Some(interp_min_vaddr.saturating_add(interp_bias));
        log::info!(
            "[elf] PT_INTERP '{}' loaded: {} PT_LOAD, entry={:#x}",
            path,
            interp_load_count,
            runtime_entry
        );
    }

    // TLS setup (Variant II: data at negative offsets from FS:0)
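    // Layout sketch: [tls data (aligned_memsz bytes)][self-pointer word]; the
    // FS base (tp) points at the trailing word, which stores its own address
    // so that `mov rax, fs:[0]` yields tp (x86_64 TLS Variant II convention).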
    let mut user_fs_base_val = 0u64;
    if let Some(tls) = phdrs.iter().find(|p| p.p_type == PT_TLS) {
        let tls_memsz = tls.p_memsz;
        let tls_filesz = tls.p_filesz;
        let tls_align = core::cmp::max(tls.p_align, 8).next_power_of_two();
        let aligned_memsz = (tls_memsz + tls_align - 1) & !(tls_align - 1);
        let total_size = aligned_memsz + 8;
        let n_tls_pages = ((total_size + 4095) / 4096) as usize;
        let tls_flags = VmaFlags {
            readable: true,
            writable: true,
            executable: false,
            user_accessible: true,
        };
        let tls_base = user_as
            .find_free_vma_range(0x7FFF_E000_0000, n_tls_pages, VmaPageSize::Small)
            .ok_or("No space for TLS block")?;
        user_as.map_region(
            tls_base,
            n_tls_pages,
            tls_flags,
            VmaType::Anonymous,
            VmaPageSize::Small,
        )?;
        if tls_filesz > 0 {
            let src_off = tls.p_offset as usize;
            let src_end = src_off + tls_filesz as usize;
            if src_end <= elf_data.len() {
                write_user_mapped_bytes(&user_as, tls_base, &elf_data[src_off..src_end])?;
            }
        }
        let tp = tls_base + aligned_memsz;
        write_user_u64(&user_as, tp, tp)?;
        user_fs_base_val = tp;
    }

    // Step 4: Map user stack
    let stack_flags = VmaFlags {
        readable: true,
        writable: true,
        executable: false,
        user_accessible: true,
    };
    user_as.map_region(
        USER_STACK_BASE,
        USER_STACK_PAGES,
        stack_flags,
        VmaType::Stack,
        VmaPageSize::Small,
    )?;
    log::debug!(
        "[elf] User stack: {:#x}..{:#x} ({} pages)",
        USER_STACK_BASE,
        USER_STACK_TOP,
        USER_STACK_PAGES,
    );

    let boot_sp = setup_boot_user_stack(
        &user_as,
        name,
        phdr_vaddr,
        header.e_phentsize,
        header.e_phnum,
        entry,
        interp_base,
    )?;

    // Step 5: Create kernel task — trampoline params are stored inside the task
    // itself so that concurrent SMP execution of multiple trampolines is safe.
    let kernel_stack = KernelStack::allocate(Task::DEFAULT_STACK_SIZE)?;
    let context = CpuContext::new(elf_ring3_trampoline as *const () as u64, &kernel_stack);
    let (pid, tid, tgid) = Task::allocate_process_ids();
    let fpu_state = crate::process::task::ExtendedState::new();
    let xcr0_mask = fpu_state.xcr0_mask;

    let task = Arc::new(Task {
        id: TaskId::new(),
        pid,
        tid,
        tgid,
        pgid: core::sync::atomic::AtomicU32::new(pid),
        sid: core::sync::atomic::AtomicU32::new(pid),
        uid: core::sync::atomic::AtomicU32::new(0),
        euid: core::sync::atomic::AtomicU32::new(0),
        gid: core::sync::atomic::AtomicU32::new(0),
        egid: core::sync::atomic::AtomicU32::new(0),
        state: SyncUnsafeCell::new(TaskState::Ready),
        priority: TaskPriority::Normal,
        context: SyncUnsafeCell::new(context),
        kernel_stack,
        user_stack: None,
        name,
        process: Arc::new(crate::process::process::Process::new(pid, user_as)),
        pending_signals: super::signal::SignalSet::new(),
        blocked_signals: super::signal::SignalSet::new(),
        signal_stack: SyncUnsafeCell::new(None),
        itimers: super::timer::ITimers::new(),
        wake_pending: core::sync::atomic::AtomicBool::new(false),
        wake_deadline_ns: core::sync::atomic::AtomicU64::new(0),
        trampoline_entry: core::sync::atomic::AtomicU64::new(runtime_entry),
        trampoline_stack_top: core::sync::atomic::AtomicU64::new(boot_sp),
        trampoline_arg0: core::sync::atomic::AtomicU64::new(0),
        ticks: core::sync::atomic::AtomicU64::new(0),
        sched_policy: crate::process::task::SyncUnsafeCell::new(Task::default_sched_policy(
            TaskPriority::Normal,
        )),
        vruntime: core::sync::atomic::AtomicU64::new(0),
        clear_child_tid: core::sync::atomic::AtomicU64::new(0),
        user_fs_base: core::sync::atomic::AtomicU64::new(user_fs_base_val),
        fpu_state: crate::process::task::SyncUnsafeCell::new(fpu_state),
        xcr0_mask: core::sync::atomic::AtomicU64::new(xcr0_mask),
    });

    // Seed capabilities into the new task (before scheduling).
    let mut bootstrap_handle: Option<u64> = None;
    if !seed_caps.is_empty() {
        let caps = unsafe { &mut *task.process.capabilities.get() };
        for cap in seed_caps {
            let id = caps.insert(cap.clone());
            if bootstrap_handle.is_none()
                && cap.resource_type == crate::capability::ResourceType::Volume
            {
                bootstrap_handle = Some(id.as_u64());
            }
        }
    }

    // Setup stdin/stdout/stderr (fd 0/1/2) pointing to /dev/console
    // SAFETY: task is not yet scheduled, exclusive access to fd_table
    {
        let fd_table = unsafe { &mut *task.process.fd_table.get() };
        crate::vfs::console_scheme::setup_stdio(fd_table);
    }

    if let Some(h) = bootstrap_handle {
        // Program entry will see this in its first argument register (RDI).
        task.trampoline_arg0
            .store(h, core::sync::atomic::Ordering::Release);
    }

    // Bootstrapping: grant Silo Admin capability to the initial userspace task.
    if name == "init"
        || name == "silo-admin"
        || name.starts_with("strate-admin:")
        || name.contains("/strate-admin-")
    {
        let _ = crate::silo::grant_silo_admin_to_task(&task);
    }

    {
        let arc_data_ptr = alloc::sync::Arc::as_ptr(&task) as usize;
        let fpu_ptr = task.fpu_state.get() as usize;
        if let Some(cur) = crate::process::scheduler::current_task_clone() {
            let cur_data_ptr = alloc::sync::Arc::as_ptr(&cur) as usize;
            let cur_strong = alloc::sync::Arc::strong_count(&cur);
            log::info!(
                "[elf] Task '{}' prepared: entry={:#x}, stack_top={:#x} \
                 new_arc={:#x} new_fpu={:#x} cur_arc={:#x} cur_strong={}",
                name,
                runtime_entry,
                boot_sp,
                arc_data_ptr,
                fpu_ptr,
                cur_data_ptr,
                cur_strong,
            );
        } else {
            log::info!(
                "[elf] Task '{}' prepared: entry={:#x}, stack_top={:#x} \
                 new_arc={:#x} new_fpu={:#x} (no current task)",
                name,
                runtime_entry,
                boot_sp,
                arc_data_ptr,
                fpu_ptr,
            );
        }
    }

    Ok(task)
}

/// Load an ELF binary into the provided address space.
/// Returns a `LoadedElfInfo` describing the entry points, program-header
/// location, interpreter base, and TLS template.
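///
/// A sketch of consuming the result (address-space construction elided;
/// `user_as` is assumed to be a fresh user AddressSpace):
///
/// ```ignore
/// let info = load_elf_image(bytes, &user_as)?;
/// let start = info.runtime_entry;      // interpreter entry for dynamic images
/// let at_entry = info.program_entry;   // value for the AT_ENTRY auxv slot
/// ```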
pub fn load_elf_image(
    elf_data: &[u8],
    user_as: &AddressSpace,
) -> Result<LoadedElfInfo, &'static str> {
    let header = parse_header(elf_data)?;
    let phdrs: Vec<Elf64Phdr> = program_headers(elf_data, &header).collect();
    let interp_path = parse_interp_path(elf_data, &phdrs)?;
    let (load_bias, entry) = compute_load_bias_and_entry(user_as, &header, &phdrs)?;
    let phdr_vaddr = find_relocated_phdr_vaddr(&header, &phdrs, load_bias)?;

    for phdr in phdrs.iter() {
        if phdr.p_type == PT_LOAD && phdr.p_memsz != 0 {
            load_segment(user_as, elf_data, phdr, load_bias)?;
        }
    }
    if interp_path.is_none() {
        apply_dynamic_relocations(user_as, &phdrs, header.e_type, load_bias)?;
    }

    let (tls_vaddr, tls_filesz, tls_memsz, tls_align) =
        if let Some(tls) = phdrs.iter().find(|ph| ph.p_type == PT_TLS) {
            let align = core::cmp::max(tls.p_align, 1).next_power_of_two();
            (
                tls.p_vaddr.saturating_add(load_bias),
                tls.p_filesz,
                tls.p_memsz,
                align,
            )
        } else {
            (0, 0, 0, 1)
        };

    let mut runtime_entry = entry;
    let mut interp_base = None;
    if let Some(path) = interp_path {
        let interp_data = read_elf_from_vfs(path)?;
        let interp_header = parse_header(&interp_data)?;
        let interp_phdrs: Vec<Elf64Phdr> = program_headers(&interp_data, &interp_header).collect();
        if parse_interp_path(&interp_data, &interp_phdrs)?.is_some() {
            return Err("Nested PT_INTERP is not supported");
        }
        let (interp_bias, interp_entry) =
            compute_load_bias_and_entry(user_as, &interp_header, &interp_phdrs)?;
        let (interp_min_vaddr, _) = compute_load_bounds(&interp_phdrs)?;
        for phdr in interp_phdrs.iter() {
            if phdr.p_type == PT_LOAD && phdr.p_memsz != 0 {
                load_segment(user_as, &interp_data, phdr, interp_bias)?;
            }
        }
        apply_dynamic_relocations(user_as, &interp_phdrs, interp_header.e_type, interp_bias)?;
        runtime_entry = interp_entry;
        interp_base = Some(interp_min_vaddr.saturating_add(interp_bias));
    }

    Ok(LoadedElfInfo {
        runtime_entry,
        program_entry: entry,
        phdr_vaddr,
        phent: header.e_phentsize,
        phnum: header.e_phnum,
        interp_base,
        tls_vaddr,
        tls_filesz,
        tls_memsz,
        tls_align,
    })
}

/// Public wrapper around `read_user_mapped_bytes`.
pub fn read_user_mapped_bytes_pub(
    user_as: &AddressSpace,
    vaddr: u64,
    out: &mut [u8],
) -> Result<(), &'static str> {
    read_user_mapped_bytes(user_as, vaddr, out)
}

/// Public wrapper around `write_user_mapped_bytes`.
pub fn write_user_mapped_bytes_pub(
    user_as: &AddressSpace,
    vaddr: u64,
    src: &[u8],
) -> Result<(), &'static str> {
    write_user_mapped_bytes(user_as, vaddr, src)
}

/// Public wrapper around `write_user_u64`.
pub fn write_user_u64_pub(
    user_as: &AddressSpace,
    vaddr: u64,
    value: u64,
) -> Result<(), &'static str> {
    write_user_u64(user_as, vaddr, value)
}