
strat9_kernel/process/scheduler/task_ops.rs

use super::{runtime_ops::finish_switch, *};
use crate::{memory::UserSliceWrite, sync::FixedQueue};

const PENDING_SILO_CLEANUPS_CAPACITY: usize = 256;

static PENDING_SILO_CLEANUPS: SpinLock<FixedQueue<TaskId, PENDING_SILO_CLEANUPS_CAPACITY>> =
    SpinLock::new(FixedQueue::new());

/// Mark the current task as Dead and yield to the scheduler.
///
/// Called by SYS_PROC_EXIT. The task will not be re-queued because
/// `pick_next_task()` only re-queues tasks in `Running` state.
/// This function does not return.
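///
/// A minimal caller sketch (hypothetical syscall-dispatch fragment; the real
/// SYS_PROC_EXIT plumbing lives in the syscall layer):
///
/// ```ignore
/// fn sys_proc_exit(code: i32) -> ! {
///     // Never returns: the task is marked Dead and the scheduler reaps it.
///     exit_current_task(code)
/// }
/// ```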
pub fn exit_current_task(exit_code: i32) -> ! {
    // -- clear_child_tid (POSIX pthread join) --
    // Must happen BEFORE we drop the address space - write 0 to the TID pointer
    // and do a futex_wake so any waiting pthread_join() can proceed.
    if let Some(task) = current_task_clone() {
        let tidptr = task
            .clear_child_tid
            .load(core::sync::atomic::Ordering::Relaxed);
        if tidptr != 0 {
            let zero = 0u32.to_ne_bytes();
            // POSIX clear_child_tid targets a userspace u32; keep the existing
            // alignment check for futex semantics, but validate the mapping via
            // UserSliceWrite instead of dereferencing the raw userspace pointer.
            if (tidptr & 3) == 0 {
                if let Ok(user) = UserSliceWrite::new(tidptr, zero.len()) {
                    user.copy_from(&zero);
                }
                // Futex wake: wake all threads waiting on this address (e.g. pthread_join).
                let _ = crate::syscall::futex::sys_futex_wake(tidptr, u32::MAX);
            }
        }
    }

    let cpu_index = current_cpu_index();
    let mut parent_to_signal: Option<TaskId> = None;
    let mut ipi_to_cpu: Option<usize> = None;
    {
        let saved_flags = save_flags_and_cli();
        let mut scheduler = GLOBAL_SCHED_STATE.lock();
        let current = {
            let local = LOCAL_SCHEDULERS[cpu_index].lock();
            local.as_ref().and_then(|cpu| cpu.current_task.clone())
        };
        if let Some(ref mut sched) = *scheduler {
            if let Some(current) = current {
                let current_id = current.id;
                let current_pid = current.pid;
                let parent = {
                    let identity = SCHED_IDENTITY.read();
                    identity.parent_of.get(&current_id).copied()
                };
                let _ = sched.clear_task_wake_deadline_locked(current_id);
                current.set_state(TaskState::Dead);
                // Do NOT call cleanup_task_resources or all_tasks.remove() here!
                // The task is still in current_task[cpu_index], and an interrupt
                // could access it. Instead, mark it Dead and let pick_next_task
                // handle the cleanup when it moves the task to task_to_drop.
                // We only remove task_cpu and identity mappings to prevent
                // lookups while the task is dying.
                sched.task_cpu.remove(&current_id);
                {
                    let mut identity = SCHED_IDENTITY.write();
                    GlobalSchedState::unregister_identity_locked(
                        &mut identity,
                        current_id,
                        current_pid,
                        current.tid,
                    );
                    identity.parent_of.remove(&current_id);
                    // Reparent children under the same identity write lock.
                    ipi_to_cpu = reparent_children(sched, &mut identity, current_id);
                }

                if let Some(parent_id) = parent {
                    sched.zombies.insert(current_id, (exit_code, current_pid));
                    let (_, ipi_wake) = sched.wake_task_locked(parent_id);
                    if ipi_to_cpu.is_none() {
                        ipi_to_cpu = ipi_wake;
                    }
                    parent_to_signal = Some(parent_id);
                }
            }
        }
        drop(scheduler);
        restore_flags(saved_flags);
    }
    if let Some(ci) = ipi_to_cpu {
        send_resched_ipi_to_cpu(ci);
    }

    if let Some(parent_id) = parent_to_signal {
        // Must happen outside scheduler lock to avoid lock recursion.
        let _ =
            crate::process::signal::send_signal(parent_id, crate::process::signal::Signal::SIGCHLD);
    }

    // Yield to pick the next task. Since we're Dead, we won't come back.
    // Use yield_dead_task(), which bypasses the PreemptGuard check: the task
    // is already marked Dead and will never run again, so the guard is irrelevant.
    // Using yield_task() here would silently return if a PreemptGuard is active,
    // leaving the dead task spinning in the hlt() loop below.
    yield_dead_task();

    // Safety net - should never reach here.
    loop {
        crate::arch::x86_64::hlt();
    }
}

/// Get the current task's ID (if any task is running).
pub fn current_task_id() -> Option<TaskId> {
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();
    let id = LOCAL_SCHEDULERS[cpu_index]
        .lock()
        .as_ref()
        .and_then(|cpu| cpu.current_task.as_ref().map(|t| t.id));
    restore_flags(saved_flags);
    id
}

/// Get the current task's ID without blocking (safe for exceptions).
pub fn current_task_id_try() -> Option<TaskId> {
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();
    let id = LOCAL_SCHEDULERS[cpu_index]
        .try_lock_no_irqsave()
        .and_then(|guard| {
            guard
                .as_ref()
                .and_then(|cpu| cpu.current_task.as_ref().map(|t| t.id))
        });
    restore_flags(saved_flags);
    id
}

/// Get the current process ID (POSIX pid).
pub fn current_pid() -> Option<Pid> {
    current_task_clone().map(|t| t.pid)
}

/// Get the current thread ID (POSIX tid).
pub fn current_tid() -> Option<Tid> {
    current_task_clone().map(|t| t.tid)
}

/// Get the current process group id.
pub fn current_pgid() -> Option<Pid> {
    current_task_clone().map(|t| t.pgid.load(Ordering::Relaxed))
}

/// Get the current session id.
pub fn current_sid() -> Option<Pid> {
    current_task_clone().map(|t| t.sid.load(Ordering::Relaxed))
}

/// Get the current task (cloned Arc), if any.
#[track_caller]
pub fn current_task_clone() -> Option<Arc<Task>> {
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();
    let caller = core::panic::Location::caller();
    let task = LOCAL_SCHEDULERS[cpu_index].lock().as_ref().and_then(|cpu| {
        let arc = cpu.current_task.as_ref()?;
        let strong = Arc::strong_count(arc);
        // Heuristic only: keep the warning, but do not mutate scheduler state here.
        if strong == 0 || strong > (isize::MAX as usize) / 2 {
            let ptr = Arc::as_ptr(arc) as *const u8;
            crate::serial_println!(
                "[sched] CORRUPT Arc refcount! cpu={} strong={:#x} ptr={:p} caller={}:{}",
                cpu_index,
                strong,
                ptr,
                caller.file(),
                caller.line(),
            );
        }
        Some(arc.clone())
    });
    restore_flags(saved_flags);
    task
}

/// Best-effort, non-blocking variant of [`current_task_clone`].
///
/// Returns `None` when the per-CPU scheduler lock is contended.
/// Useful in cleanup paths where blocking on the scheduler locks could deadlock.
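///
/// A hedged usage sketch (hypothetical cleanup-path caller):
///
/// ```ignore
/// let task = match current_task_clone_try() {
///     Some(t) => t,
///     None => return, // lock contended: bail out rather than risk a deadlock
/// };
/// ```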
#[track_caller]
pub fn current_task_clone_try() -> Option<Arc<Task>> {
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();
    let caller = core::panic::Location::caller();
    let task = LOCAL_SCHEDULERS[cpu_index]
        .try_lock_no_irqsave()
        .and_then(|guard| {
            guard.as_ref().and_then(|cpu| {
                let arc = cpu.current_task.as_ref()?;
                let strong = Arc::strong_count(arc);
                // Heuristic only: keep the warning, but do not mutate scheduler state here.
                if strong == 0 || strong > (isize::MAX as usize) / 2 {
                    let ptr = Arc::as_ptr(arc) as *const u8;
                    crate::serial_println!(
                        "[sched] CORRUPT Arc refcount! cpu={} strong={:#x} ptr={:p} caller={}:{}",
                        cpu_index,
                        strong,
                        ptr,
                        caller.file(),
                        caller.line(),
                    );
                }
                Some(arc.clone())
            })
        });
    restore_flags(saved_flags);
    task
}

/// Debug-only blocking variant used to diagnose early ring3 entry stalls.
///
/// Spins with `try_lock()` so we can emit progress logs instead of blocking
/// silently on `GLOBAL_SCHED_STATE.lock()`.
pub fn current_task_clone_spin_debug(trace_label: &str) -> Option<Arc<Task>> {
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();
    let mut spins = 0usize;
    let result = loop {
        if let Some(guard) = LOCAL_SCHEDULERS[cpu_index].try_lock_no_irqsave() {
            break guard.as_ref().and_then(|cpu| {
                if cpu.current_task.is_none() {
                    unsafe { core::arch::asm!("mov al, 'N'; out 0xe9, al", out("al") _) };
                    return None;
                }
                let arc = cpu.current_task.as_ref().unwrap();
                let strong = Arc::strong_count(arc);
                // Racy, rule-of-thumb diagnostic only: strong_count can move
                // concurrently, so this is a heuristic for suspicious
                // scheduler state, not a formal corruption proof.
                if strong == 0 || strong > (isize::MAX as usize) / 2 {
                    let ptr = Arc::as_ptr(arc) as *const u8;
                    crate::serial_force_println!(
                        "[trace][sched] {} suspicious current_task heuristic cpu={} strong={:#x} ptr={:p}",
                        trace_label,
                        cpu_index,
                        strong,
                        ptr,
                    );
                }
                Some(arc.clone())
            });
        }

        spins = spins.saturating_add(1);
        if spins == 2_000_000 {
            crate::serial_force_println!(
                "[trace][sched] {} waiting current_task cpu={} owner_cpu={}",
                trace_label,
                cpu_index,
                GLOBAL_SCHED_STATE.owner_cpu()
            );
            spins = 0;
        }
        core::hint::spin_loop();
    };
    restore_flags(saved_flags);
    result
}

/// Resolve a POSIX pid to the internal TaskId.
pub fn get_task_id_by_pid(pid: Pid) -> Option<TaskId> {
    SCHED_IDENTITY.read().pid_to_task.get(&pid).copied()
}

/// Resolve a POSIX pid to the corresponding task.
pub fn get_task_by_pid(pid: Pid) -> Option<Arc<Task>> {
    let tid = get_task_id_by_pid(pid)?;
    get_task_by_id(tid)
}

/// Resolve a direct child of `parent` by POSIX pid.
///
/// Unlike the global pid index, this remains valid after the child has called
/// exit and before it is reaped, because the task object stays in `all_tasks`
/// until waitpid consumes the zombie.
///
/// Lock order: `GLOBAL_SCHED_STATE` before `SCHED_IDENTITY` (see module docs).
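///
/// A hedged lookup sketch (hypothetical waitpid()-style fragment):
///
/// ```ignore
/// if let Some(child) = get_child_task_id_by_pid(parent_id, child_pid) {
///     // Valid even if the child already exited but has not been reaped yet.
///     let _ = try_wait_child(parent_id, Some(child));
/// }
/// ```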
pub fn get_child_task_id_by_pid(parent: TaskId, pid: Pid) -> Option<TaskId> {
    let saved_flags = save_flags_and_cli();
    let out = {
        let scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref sched) = *scheduler {
            let children = {
                let identity = SCHED_IDENTITY.read();
                identity
                    .children_of
                    .get(&parent)
                    .cloned()
                    .unwrap_or_default()
            };
            children.iter().copied().find(|child_id| {
                sched
                    .all_tasks
                    .get(child_id)
                    .map(|task| task.pid == pid)
                    .unwrap_or(false)
            })
        } else {
            None
        }
    };
    restore_flags(saved_flags);
    out
}

/// Resolve a POSIX tid to the corresponding internal task id.
pub fn get_task_id_by_tid(tid: Tid) -> Option<TaskId> {
    let identity = SCHED_IDENTITY.read();
    identity
        .tid_to_task
        .get(&tid)
        .copied()
        .or_else(|| identity.pid_to_task.get(&(tid as Pid)).copied())
}

/// Resolve a direct child of `parent` by POSIX tid.
///
/// This remains valid for dead-but-not-yet-reaped threads because it scans the
/// caller's child set and the retained task object instead of relying on the
/// global tid index, which is removed during exit.
///
/// Lock order: `GLOBAL_SCHED_STATE` before `SCHED_IDENTITY` (see module docs).
pub fn get_child_task_id_by_tid(parent: TaskId, tid: Tid) -> Option<TaskId> {
    let saved_flags = save_flags_and_cli();
    let out = {
        let scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref sched) = *scheduler {
            let children = {
                let identity = SCHED_IDENTITY.read();
                identity
                    .children_of
                    .get(&parent)
                    .cloned()
                    .unwrap_or_default()
            };
            children.iter().copied().find(|child_id| {
                sched
                    .all_tasks
                    .get(child_id)
                    .map(|task| task.tid == tid)
                    .unwrap_or(false)
            })
        } else {
            None
        }
    };
    restore_flags(saved_flags);
    out
}

/// Resolve a PID to the current process group id.
pub fn get_pgid_by_pid(pid: Pid) -> Option<Pid> {
    SCHED_IDENTITY.read().pid_to_pgid.get(&pid).copied()
}

/// Resolve a PID to the current session id.
pub fn get_sid_by_pid(pid: Pid) -> Option<Pid> {
    SCHED_IDENTITY.read().pid_to_sid.get(&pid).copied()
}

/// Collect task IDs that currently belong to process group `pgid`.
pub fn get_task_ids_in_pgid(pgid: Pid) -> alloc::vec::Vec<TaskId> {
    SCHED_IDENTITY
        .read()
        .pgid_members
        .get(&pgid)
        .cloned()
        .unwrap_or_default()
}

/// Collect task IDs that currently belong to thread group `tgid`.
pub fn get_task_ids_in_tgid(tgid: Pid) -> alloc::vec::Vec<TaskId> {
    use alloc::vec::Vec;
    let saved_flags = save_flags_and_cli();
    let out = {
        let scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref sched) = *scheduler {
            sched
                .all_tasks
                .values()
                .filter(|task| task.tgid == tgid)
                .map(|task| task.id)
                .collect::<Vec<_>>()
        } else {
            Vec::new()
        }
    };
    restore_flags(saved_flags);
    out
}

/// Set the process group id for `target_pid` (or the current task if `None`).
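///
/// A hedged usage sketch (hypothetical setpgid(0, 0) handler; `requester` is
/// the calling task's TaskId):
///
/// ```ignore
/// // Move the caller into a new process group led by itself.
/// let pgid = set_process_group(requester, None, None)?;
/// ```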
pub fn set_process_group(
    requester: TaskId,
    target_pid: Option<Pid>,
    new_pgid: Option<Pid>,
) -> Result<Pid, crate::syscall::error::SyscallError> {
    use crate::syscall::error::SyscallError;

    let saved_flags = save_flags_and_cli();
    let result = (|| -> Result<Pid, SyscallError> {
        // Step 1: Validate the request and resolve the target under GLOBAL_SCHED_STATE.
        let (target_id, target_task, desired_pgid) = {
            let scheduler = GLOBAL_SCHED_STATE.lock();
            let sched = scheduler.as_ref().ok_or(SyscallError::Fault)?;

            let requester_sid = sched
                .all_tasks
                .get(&requester)
                .ok_or(SyscallError::Fault)?
                .sid
                .load(Ordering::Relaxed);

            let target_id = match target_pid {
                None => requester,
                Some(pid) => SCHED_IDENTITY
                    .read()
                    .pid_to_task
                    .get(&pid)
                    .copied()
                    .ok_or(SyscallError::NotFound)?,
            };

            if target_id != requester {
                let is_child = SCHED_IDENTITY
                    .read()
                    .children_of
                    .get(&requester)
                    .map(|children| children.iter().any(|child| *child == target_id))
                    .unwrap_or(false);
                if !is_child {
                    return Err(SyscallError::PermissionDenied);
                }
            }

            let target_task = sched
                .all_tasks
                .get(&target_id)
                .cloned()
                .ok_or(SyscallError::NotFound)?;
            let target_pid_value = target_task.pid;
            let target_sid = target_task.sid.load(Ordering::Relaxed);

            if target_sid != requester_sid {
                return Err(SyscallError::PermissionDenied);
            }

            // A session leader may not change its process group.
            if target_pid_value == target_sid {
                return Err(SyscallError::PermissionDenied);
            }

            let desired_pgid = new_pgid.unwrap_or(target_pid_value);
            if desired_pgid != target_pid_value {
                // Joining an existing group: its leader must live in the same session.
                let group_leader_tid = SCHED_IDENTITY
                    .read()
                    .pid_to_task
                    .get(&desired_pgid)
                    .copied()
                    .ok_or(SyscallError::NotFound)?;
                let group_leader = sched
                    .all_tasks
                    .get(&group_leader_tid)
                    .ok_or(SyscallError::NotFound)?;
                if group_leader.sid.load(Ordering::Relaxed) != target_sid {
                    return Err(SyscallError::PermissionDenied);
                }
            }
            Ok::<_, SyscallError>((target_id, target_task, desired_pgid))
        }?;

        // Step 2: Mutate identity maps under the SCHED_IDENTITY lock.
        let old_pgid = target_task.pgid.load(Ordering::Relaxed);
        target_task.pgid.store(desired_pgid, Ordering::Relaxed);
        {
            let mut identity = SCHED_IDENTITY.write();
            GlobalSchedState::member_remove(&mut identity.pgid_members, old_pgid, target_id);
            GlobalSchedState::member_add(&mut identity.pgid_members, desired_pgid, target_id);
            identity.pid_to_pgid.insert(target_task.pid, desired_pgid);
        }
        Ok(desired_pgid)
    })();
    restore_flags(saved_flags);
    result
}

/// Create a new session for the calling task.
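///
/// A hedged usage sketch (hypothetical setsid() handler):
///
/// ```ignore
/// // The caller becomes leader of a new session and of a new process group.
/// let sid = create_session(requester)?; // sid == the caller's pid
/// ```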
pub fn create_session(requester: TaskId) -> Result<Pid, crate::syscall::error::SyscallError> {
    use crate::syscall::error::SyscallError;

    let saved_flags = save_flags_and_cli();
    let result = (|| -> Result<Pid, SyscallError> {
        // Get task reference from GLOBAL_SCHED_STATE.
        let requester_task = {
            let scheduler = GLOBAL_SCHED_STATE.lock();
            let sched = scheduler.as_ref().ok_or(SyscallError::Fault)?;
            sched
                .all_tasks
                .get(&requester)
                .cloned()
                .ok_or(SyscallError::Fault)?
        };
        let pid = requester_task.pid;
        // POSIX: setsid() fails if the caller is already a process-group leader.
        if requester_task.pgid.load(Ordering::Relaxed) == pid {
            return Err(SyscallError::PermissionDenied);
        }

        let old_sid = requester_task.sid.load(Ordering::Relaxed);
        let old_pgid = requester_task.pgid.load(Ordering::Relaxed);
        requester_task.sid.store(pid, Ordering::Relaxed);
        requester_task.pgid.store(pid, Ordering::Relaxed);
        {
            let mut identity = SCHED_IDENTITY.write();
            GlobalSchedState::member_remove(&mut identity.sid_members, old_sid, requester);
            GlobalSchedState::member_remove(&mut identity.pgid_members, old_pgid, requester);
            GlobalSchedState::member_add(&mut identity.sid_members, pid, requester);
            GlobalSchedState::member_add(&mut identity.pgid_members, pid, requester);
            identity.pid_to_sid.insert(pid, pid);
            identity.pid_to_pgid.insert(pid, pid);
        }
        Ok(pid)
    })();
    restore_flags(saved_flags);
    result
}

/// Get a task by its TaskId (if still registered).
pub fn get_task_by_id(id: TaskId) -> Option<Arc<Task>> {
    let saved_flags = save_flags_and_cli();
    let task = {
        let scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref sched) = *scheduler {
            sched.all_tasks.get(&id).cloned()
        } else {
            None
        }
    };
    restore_flags(saved_flags);
    task
}

/// Update a task's scheduling policy and requeue it if needed.
pub fn set_task_sched_policy(id: TaskId, policy: crate::process::sched::SchedPolicy) -> bool {
    let saved_flags = save_flags_and_cli();
    let mut ipi_to_cpu: Option<usize> = None;
    let updated = {
        let mut scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref mut sched) = *scheduler {
            if let Some(task) = sched.all_tasks.get(&id).cloned() {
                let cpu_index = sched.task_cpu.get(&id).copied().unwrap_or(0);
                task.set_sched_policy(policy);
                let class = sched.class_table.class_for_task(&task);

                if let Some(ref mut local_cpu) = *LOCAL_SCHEDULERS[cpu_index].lock() {
                    // If the task is queued in the ready classes, migrate it to the new class.
                    if local_cpu.class_rqs.remove(id) {
                        local_cpu.class_rqs.enqueue(class, task.clone());
                    }
                    local_cpu.need_resched = true;
                }
                if cpu_index != current_cpu_index() {
                    ipi_to_cpu = Some(cpu_index);
                }
                sched_trace(format_args!(
                    "set_policy task={} cpu={} policy={:?}",
                    id.as_u64(),
                    cpu_index,
                    policy
                ));
                true
            } else {
                // Unknown task: fall through (instead of an early return) so the
                // saved interrupt flags are restored below.
                false
            }
        } else {
            false
        }
    };
    if let Some(ci) = ipi_to_cpu {
        send_resched_ipi_to_cpu(ci);
    }
    restore_flags(saved_flags);
    updated
}

/// Get parent task ID for a child task.
pub fn get_parent_id(child: TaskId) -> Option<TaskId> {
    SCHED_IDENTITY.read().parent_of.get(&child).copied()
}

/// Get parent process ID for a child task.
pub fn get_parent_pid(child: TaskId) -> Option<Pid> {
    let parent_tid = get_parent_id(child)?;
    let parent = get_task_by_id(parent_tid)?;
    Some(parent.pid)
}

/// Try to reap a zombie child.
///
/// `target=None` means "any child".
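///
/// A hedged reaping sketch (only `WaitChildResult::NoChildren` is visible in
/// this file; the other variants are assumed to carry the reaped status):
///
/// ```ignore
/// match try_wait_child(parent, None) {
///     WaitChildResult::NoChildren => { /* ECHILD */ }
///     other => { /* reaped a zombie, or children are still running */ }
/// }
/// ```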
pub fn try_wait_child(parent: TaskId, target: Option<TaskId>) -> WaitChildResult {
    let saved_flags = save_flags_and_cli();
    let result = {
        let mut scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref mut sched) = *scheduler {
            sched.try_reap_child_locked(parent, target)
        } else {
            WaitChildResult::NoChildren
        }
    };
    restore_flags(saved_flags);
    result
}

/// Block the current task and yield to the scheduler.
///
/// The current task is moved from Running to Blocked state and placed
/// in the `blocked_tasks` map. It will not be re-scheduled until
/// `wake_task(id)` is called.
///
/// ## Lock design
///
/// This function acquires **only** the `BLOCKED_TASKS` lock + the current
/// CPU's `LOCAL_SCHEDULERS[cpu]` lock. It does **not** touch
/// `GLOBAL_SCHED_STATE`, avoiding contention with cold-path operations
/// (fork, exit, kill).
///
/// ## Lost-wakeup prevention
///
/// Before actually blocking, this function checks the task's `wake_pending`
/// flag. If a concurrent `wake_task()` fired between the moment the task
/// added itself to a `WaitQueue` and this call, the flag will be set and
/// the function returns immediately without blocking.
///
/// Must NOT be called with interrupts disabled or while holding the
/// scheduler lock (this function acquires both).
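///
/// A hedged sleep-pattern sketch (the `WaitQueue` API shown is hypothetical):
///
/// ```ignore
/// // 1. Register interest, 2. re-check the condition, 3. block.
/// // A wake_task() racing between (1) and (3) sets wake_pending, so the
/// // call below returns immediately instead of sleeping forever.
/// wait_queue.add(current_task_id().unwrap());
/// if !condition_ready() {
///     block_current_task();
/// }
/// ```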
pub fn block_current_task() {
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();

    let switch_target = {
        // Hold BLOCKED_TASKS and LOCAL together through the state transition
        // and task selection so a concurrent wake cannot observe the task as
        // blocked, requeue it, and race with us tearing down current_task.
        let mut blocked = super::BLOCKED_TASKS.lock();
        let mut local = LOCAL_SCHEDULERS[cpu_index].lock();
        let out = if let Some(ref mut cpu) = *local {
            if let Some(ref current) = cpu.current_task {
                if current
                    .wake_pending
                    .swap(false, core::sync::atomic::Ordering::AcqRel)
                {
                    // Pending wakeup consumed - do not block.
                    None
                } else {
                    current.set_state(TaskState::Blocked);
                    // Record home CPU so wake_task can route without GLOBAL.
                    current
                        .home_cpu
                        .store(cpu_index, core::sync::atomic::Ordering::Relaxed);
                    blocked.insert(current.id, current.clone());
                    super::core_impl::yield_cpu_local(cpu, cpu_index)
                }
            } else {
                None
            }
        } else {
            None
        };
        drop(local);
        drop(blocked);
        out
    }; // Locks released

    if let Some(ref target) = switch_target {
        unsafe {
            crate::process::task::do_switch_context(target);
        }
        finish_switch();
    }

    restore_flags(saved_flags);
}

/// Wake a blocked task by its ID.
///
/// Moves the task from `blocked_tasks` to the ready queue and sets its
/// state to Ready. Returns `true` if the task was found and woken.
///
/// ## Lock design
///
/// The primary path (task found in `BLOCKED_TASKS`) acquires **only** the
/// `BLOCKED_TASKS` lock + the target CPU's `LOCAL_SCHEDULERS[cpu]` lock.
/// It does **not** touch `GLOBAL_SCHED_STATE`, avoiding contention with
/// cold-path operations (fork, exit, kill).
///
/// ## Lost-wakeup prevention
///
/// If the task is not yet in `blocked_tasks` (it is still transitioning
/// from Ready -> Blocked inside `block_current_task()`), this function sets
/// the task's `wake_pending` flag so that `block_current_task()` will see
/// the pending wakeup and return immediately without actually blocking.
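///
/// A hedged producer-side sketch (counterpart of the `block_current_task`
/// pattern above; `publish_item` and `consumer_id` are illustrative):
///
/// ```ignore
/// publish_item(item);             // make the condition observable first
/// let _ = wake_task(consumer_id); // true if the task was found and woken
/// ```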
pub fn wake_task(id: TaskId) -> bool {
    let saved_flags = save_flags_and_cli();

    // --- Primary path: task is in BLOCKED_TASKS ---
    // Acquire only BLOCKED_TASKS + LOCAL[target_cpu]. No GLOBAL_SCHED_STATE.
    let mut ipi_cpu: Option<usize> = None;
    let mut woken = false;

    {
        let mut blocked = super::BLOCKED_TASKS.lock();
        if let Some(task) = blocked.remove(&id) {
            task.set_state(TaskState::Ready);
            let home = task.home_cpu.load(core::sync::atomic::Ordering::Relaxed);
            let cpu_index = if home != usize::MAX { home } else { 0 };

            // Compute the scheduling class for this task (done without GLOBAL).
            let class = {
                use crate::process::sched::SchedClassId;
                match task.sched_policy() {
                    crate::process::sched::SchedPolicy::RealTimeRR { .. }
                    | crate::process::sched::SchedPolicy::RealTimeFifo { .. } => {
                        SchedClassId::RealTime
                    }
                    crate::process::sched::SchedPolicy::Fair(_) => SchedClassId::Fair,
                    crate::process::sched::SchedPolicy::Idle => SchedClassId::Idle,
                }
            };

            if let Some(ref mut local_cpu) = *LOCAL_SCHEDULERS[cpu_index].lock() {
                local_cpu.class_rqs.enqueue(class, task.clone());
                local_cpu.need_resched = true;
            }

            ipi_cpu = if cpu_index != current_cpu_index() {
                Some(cpu_index)
            } else {
                None
            };
            woken = true;
        }
    } // BLOCKED_TASKS lock released

    if woken {
        if let Some(ci) = ipi_cpu {
            send_resched_ipi_to_cpu(ci);
        }
        restore_flags(saved_flags);
        return true;
    }

    // --- Fallback path: task not yet in BLOCKED_TASKS ---
    // Set wake_pending so block_current_task skips blocking, and forward the
    // IPI hint from wake_task_locked to the target CPU.
    {
        let mut scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref mut sched) = *scheduler {
            let (fallback_woken, ipi_wake) = sched.wake_task_locked(id);
            woken = fallback_woken;
            ipi_cpu = ipi_wake;
        }
    }

    if let Some(ci) = ipi_cpu {
        send_resched_ipi_to_cpu(ci);
    }
    restore_flags(saved_flags);
    woken
}

/// Set a task's wake deadline in nanoseconds (0 clears it).
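///
/// A hedged timed-sleep sketch (assumes an external timer path calls
/// `wake_task()` when the deadline fires; `now_ns` is illustrative):
///
/// ```ignore
/// set_task_wake_deadline(id, now_ns + timeout_ns);
/// block_current_task(); // woken by the deadline or an explicit wake_task()
/// ```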
pub fn set_task_wake_deadline(id: TaskId, deadline_ns: u64) -> bool {
    let saved_flags = save_flags_and_cli();
    let out = {
        let mut scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref mut sched) = *scheduler {
            sched.set_task_wake_deadline_locked(id, deadline_ns)
        } else {
            false
        }
    };
    restore_flags(saved_flags);
    out
}

/// Clear a task's wake deadline (equivalent to a deadline of 0).
pub fn clear_task_wake_deadline(id: TaskId) -> bool {
    set_task_wake_deadline(id, 0)
}

/// Suspend a task by ID (best-effort).
///
/// Moves the task to the blocked map and marks it Blocked.
/// - If the task is the *current* task on *this* CPU, a context switch is
///   performed immediately.
/// - If the task is the *current* task on *another* CPU, an IPI is sent to
///   trigger preemption on that CPU. The task will not be re-queued at the
///   next tick because its state is Blocked.
pub fn suspend_task(id: TaskId) -> bool {
    let saved_flags = save_flags_and_cli();

    let mut switch_target: Option<SwitchTarget> = None;
    let mut suspended = false;
    let mut ipi_to_cpu: Option<usize> = None;

    let my_cpu = current_cpu_index();
    let n = active_cpu_count();

    // Check if the task is the current task on any CPU.
    for ci in 0..n {
        let task_id_on_cpu = LOCAL_SCHEDULERS[ci]
            .lock()
            .as_ref()
            .and_then(|cpu| cpu.current_task.as_ref().map(|t| (t.id, t.clone())));
        if let Some((tid, current)) = task_id_on_cpu {
            if tid == id {
                current.set_state(TaskState::Blocked);
                current
                    .home_cpu
                    .store(ci, core::sync::atomic::Ordering::Relaxed);
                super::BLOCKED_TASKS
                    .lock()
                    .insert(current.id, current.clone());
                suspended = true;
                if ci == my_cpu {
                    // Re-acquire LOCAL to yield. The gap between the
                    // probe above and this lock is safe because IRQs
                    // are disabled (save_flags_and_cli), so no timer
                    // tick can preempt us or mutate current_task.
                    let mut local = LOCAL_SCHEDULERS[ci].lock();
                    if let Some(ref mut cpu) = *local {
                        switch_target = super::core_impl::yield_cpu_local(cpu, ci);
                    }
                } else {
                    // Cross-CPU: IPI will make the remote CPU preempt.
                    ipi_to_cpu = Some(ci);
                }
                break;
            }
        }
    }

    // Remove from ready queues (task was not running anywhere).
    if !suspended {
        for ci in 0..n {
            let removed = {
                let mut local = LOCAL_SCHEDULERS[ci].lock();
                if let Some(ref mut cpu) = *local {
                    cpu.class_rqs.remove(id)
                } else {
                    false
                }
            };
            if removed {
                if let Some(task) = get_task_by_id(id) {
                    task.set_state(TaskState::Blocked);
                    task.home_cpu
                        .store(ci, core::sync::atomic::Ordering::Relaxed);
                    super::BLOCKED_TASKS.lock().insert(task.id, task.clone());
                }
                suspended = true;
                break;
            }
        }
    }

    // Already blocked.
    if !suspended && super::BLOCKED_TASKS.lock().contains_key(&id) {
        suspended = true;
    }

    if let Some(ref target) = switch_target {
        unsafe {
            crate::process::task::do_switch_context(target);
        }
        finish_switch();
    }

    if let Some(ci) = ipi_to_cpu {
        send_resched_ipi_to_cpu(ci);
    }

    restore_flags(saved_flags);
    suspended
}

/// Resume a previously suspended task by ID.
///
/// Moves the task from the blocked map to a ready queue and marks it Ready.
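///
/// A hedged stop/continue sketch (hypothetical job-control fragment):
///
/// ```ignore
/// if suspend_task(id) {
///     // ... later, e.g. on SIGCONT ...
///     let _ = resume_task(id);
/// }
/// ```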
pub fn resume_task(id: TaskId) -> bool {
    let saved_flags = save_flags_and_cli();
    let mut ipi_to_cpu: Option<usize> = None;

    let mut resumed = false;
    {
        let mut blocked = super::BLOCKED_TASKS.lock();
        if let Some(task) = blocked.remove(&id) {
            task.set_state(TaskState::Ready);
            let home = task.home_cpu.load(core::sync::atomic::Ordering::Relaxed);
            let cpu_index = if home != usize::MAX { home } else { 0 };

            let class = {
                use crate::process::sched::SchedClassId;
                match task.sched_policy() {
                    crate::process::sched::SchedPolicy::RealTimeRR { .. }
                    | crate::process::sched::SchedPolicy::RealTimeFifo { .. } => {
                        SchedClassId::RealTime
                    }
                    crate::process::sched::SchedPolicy::Fair(_) => SchedClassId::Fair,
                    crate::process::sched::SchedPolicy::Idle => SchedClassId::Idle,
                }
            };

            if let Some(ref mut local_cpu) = *LOCAL_SCHEDULERS[cpu_index].lock() {
                local_cpu.class_rqs.enqueue(class, task);
                local_cpu.need_resched = true;
            }

            if cpu_index != current_cpu_index() {
                ipi_to_cpu = Some(cpu_index);
            }
            resumed = true;
        }
    }

    if let Some(ci) = ipi_to_cpu {
        send_resched_ipi_to_cpu(ci);
    }
    restore_flags(saved_flags);
    resumed
}

/// Kill a task by ID (best-effort).
///
/// - Ready / blocked tasks are removed and marked Dead immediately.
/// - If the task is the *current* task on *this* CPU, a context switch is
///   performed immediately.
/// - If the task is the *current* task on *another* CPU, an IPI triggers
///   preemption on that CPU; the task will not be re-queued because its
///   state is Dead.
///
/// Returns `true` if the task was found and killed.
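///
/// A hedged forced-termination sketch (the parent still reaps via waitpid):
///
/// ```ignore
/// if kill_task(child_id) {
///     // The child is now a zombie; the parent receives SIGCHLD and can call
///     // try_wait_child() to collect the forced exit code.
/// }
/// ```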
pub fn kill_task(id: TaskId) -> bool {
    let pid = crate::process::get_task_by_id(id)
        .map(|t| t.pid)
        .unwrap_or(0);
    crate::audit::log(
        crate::audit::AuditCategory::Process,
        pid,
        crate::silo::task_silo_id(id).unwrap_or(0),
        alloc::format!("kill_task tid={}", id.as_u64()),
    );
    let saved_flags = save_flags_and_cli();

    let mut switch_target: Option<SwitchTarget> = None;
    let mut killed = false;
    let mut ipi_to_cpu: Option<usize> = None;
    let mut parent_to_signal: Option<TaskId> = None;

    {
        let mut scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref mut sched) = *scheduler {
            // Keep parent/waitpid semantics even for forced termination paths.
            // A killed child must still become a zombie until reaped by waitpid().
            const FORCED_KILL_EXIT_CODE: i32 = 1;
            let my_cpu = current_cpu_index();

            // Check if the task is the current task on any CPU.
            let n = active_cpu_count();
            let mut running_hit: Option<(usize, Arc<Task>)> = None;
            for ci in 0..n {
                let hit = LOCAL_SCHEDULERS[ci].lock().as_ref().and_then(|cpu| {
                    cpu.current_task
                        .as_ref()
                        .map(|t| (t.id, t.get_state(), t.clone()))
                });
                if let Some((tid, state, current)) = hit {
                    if tid == id {
                        // Check if already marked Dead by a previous kill attempt.
                        if state != TaskState::Dead {
                            running_hit = Some((ci, current));
                        }
                        break;
                    }
                }
            }
            if let Some((ci, current)) = running_hit {
                let task_pid = current.pid;
                let _ = sched.clear_task_wake_deadline_locked(id);
                current.set_state(TaskState::Dead);
                // Do NOT call cleanup_task_resources or all_tasks.remove() here!
                // The task is still in current_task[ci], and an interrupt could
                // access it. Instead, mark it Dead and let pick_next_task handle
                // the cleanup when it moves the task to task_to_drop.
                sched.task_cpu.remove(&id);
                {
                    let mut identity = SCHED_IDENTITY.write();
                    GlobalSchedState::unregister_identity_locked(
                        &mut identity,
                        id,
                        task_pid,
                        current.tid,
                    );
                }
                let (parent, ipi_death) =
                    finalize_forced_death(sched, id, FORCED_KILL_EXIT_CODE, task_pid);
                parent_to_signal = parent;
                killed = true;
                if ci == my_cpu {
                    let mut local = LOCAL_SCHEDULERS[ci].lock();
                    if let Some(ref mut cpu) = *local {
                        switch_target = super::core_impl::yield_cpu_local(cpu, ci);
                    }
                } else {
                    ipi_to_cpu = Some(ci);
                }
                if ipi_to_cpu.is_none() {
                    ipi_to_cpu = ipi_death;
                }
            }

            // Remove from ready queues.
            if !killed {
                let mut removed_from_ready = false;
                for ci in 0..n {
                    let removed = {
                        let mut local = LOCAL_SCHEDULERS[ci].lock();
                        if let Some(ref mut cpu) = *local {
                            cpu.class_rqs.remove(id)
                        } else {
                            false
                        }
                    };
                    if removed {
                        removed_from_ready = true;
                        break;
                    }
                }
                if removed_from_ready {
                    let _ = sched.clear_task_wake_deadline_locked(id);
                    if let Some(task) = sched.remove_all_task_locked(id) {
                        let task_pid = task.pid;
                        task.set_state(TaskState::Dead);
                        cleanup_task_resources(&task);
                        sched.task_cpu.remove(&id);
                        {
                            let mut identity = SCHED_IDENTITY.write();
                            GlobalSchedState::unregister_identity_locked(
                                &mut identity,
                                id,
                                task_pid,
                                task.tid,
                            );
                        }
                        let (parent, ipi_death) =
                            finalize_forced_death(sched, id, FORCED_KILL_EXIT_CODE, task_pid);
                        parent_to_signal = parent;
                        if ipi_to_cpu.is_none() {
                            ipi_to_cpu = ipi_death;
                        }
                    }
                    killed = true;
                }
            }

            // Remove from blocked map.
            if !killed {
                if let Some(task) = super::BLOCKED_TASKS.lock().remove(&id) {
                    let task_pid = task.pid;
                    let _ = sched.clear_task_wake_deadline_locked(id);
                    task.set_state(TaskState::Dead);
                    cleanup_task_resources(&task);
                    let _ = sched.remove_all_task_locked(id);
                    sched.task_cpu.remove(&id);
                    {
                        let mut identity = SCHED_IDENTITY.write();
                        GlobalSchedState::unregister_identity_locked(
                            &mut identity,
                            id,
                            task_pid,
                            task.tid,
                        );
                    }
                    let (parent, ipi_death) =
                        finalize_forced_death(sched, id, FORCED_KILL_EXIT_CODE, task_pid);
                    parent_to_signal = parent;
                    if ipi_to_cpu.is_none() {
                        ipi_to_cpu = ipi_death;
                    }
                    killed = true;
                }
            }
        }
    } // scheduler lock released before IPI and context switch

    if let Some(ref target) = switch_target {
        unsafe {
            crate::process::task::do_switch_context(target);
        }
        finish_switch();
    }

    if let Some(ci) = ipi_to_cpu {
        send_resched_ipi_to_cpu(ci);
    }

    if let Some(parent_id) = parent_to_signal {
        // Must happen outside scheduler lock to avoid lock recursion.
        let _ =
            crate::process::signal::send_signal(parent_id, crate::process::signal::Signal::SIGCHLD);
    }

    restore_flags(saved_flags);
    killed
}

/// Finalize bookkeeping for a forcibly terminated task: reparent its
/// children, record the zombie for waitpid, and wake the parent.
fn finalize_forced_death(
    sched: &mut GlobalSchedState,
    task_id: TaskId,
    exit_code: i32,
    task_pid: Pid,
) -> (Option<TaskId>, Option<usize>) {
    let (ipi_reparent, parent) = {
        let mut identity = SCHED_IDENTITY.write();
        let ipi = reparent_children(sched, &mut identity, task_id);
        (ipi, identity.parent_of.remove(&task_id))
    };
    if let Some(parent_id) = parent {
        sched.zombies.insert(task_id, (exit_code, task_pid));
        let (_, ipi_wake) = sched.wake_task_locked(parent_id);
        (Some(parent_id), ipi_reparent.or(ipi_wake))
    } else {
        (None, ipi_reparent)
    }
}

/// Reparent a dying task's children onto init (pid 1), or any registered
/// task as a fallback. Wakes the new parent if it inherits a zombie child.
fn reparent_children(
    sched: &mut GlobalSchedState,
    identity: &mut SchedIdentity,
    dying: TaskId,
) -> Option<usize> {
    let children = match identity.children_of.remove(&dying) {
        Some(c) => c,
        None => return None,
    };
    let init_id = identity
        .pid_to_task
        .get(&1)
        .copied()
        .or_else(|| sched.all_tasks.keys().next().copied());
    let Some(init_id) = init_id else {
        for child in &children {
            identity.parent_of.remove(child);
        }
        return None;
    };
    if init_id == dying {
        for child in &children {
            identity.parent_of.remove(child);
        }
        return None;
    }
    let mut has_zombie = false;
    let init_children = identity.children_of.entry(init_id).or_default();
    for child in children {
        if !has_zombie && sched.zombies.contains_key(&child) {
            has_zombie = true;
        }
        identity.parent_of.insert(child, init_id);
        init_children.push(child);
    }
    if has_zombie {
        let (_, ipi) = sched.wake_task_locked(init_id);
        ipi
    } else {
        None
    }
}

/// Queue a task for deferred silo cleanup.
///
/// Called from `cleanup_task_resources()` with the scheduler lock held; the
/// actual silo teardown runs later via `flush_deferred_silo_cleanups()`
/// outside that lock.
fn queue_silo_cleanup(task_id: TaskId) {
    let mut guard = PENDING_SILO_CLEANUPS.lock();
    guard
        .push_back(task_id)
        .unwrap_or_else(|_| panic!("pending silo cleanup queue overflow"));
}

/// Drain the deferred silo-cleanup queue.
///
/// Non-blocking: if the queue lock is held (by a preempted task or another
/// CPU), this returns immediately and a later call drains the entries.
pub fn flush_deferred_silo_cleanups() {
    let mut guard = match PENDING_SILO_CLEANUPS.try_lock() {
        Some(g) => g,
        None => return, // Lock held by a preempted task or another CPU; skip safely.
    };
    if guard.is_empty() {
        return;
    }
    // Swap the queue out so silo teardown runs without holding the lock.
    let mut drained = FixedQueue::<TaskId, PENDING_SILO_CLEANUPS_CAPACITY>::new();
    core::mem::swap(&mut *guard, &mut drained);
    drop(guard);
    while let Some(task_id) = drained.pop_front() {
        crate::silo::on_task_terminated(task_id);
    }
}

/// Release a dead task's resources.
///
/// Called when a task exits or is killed to release ports, capabilities,
/// and user address space mappings.
///
/// # Safety
/// Must be called with the scheduler lock held and the task no longer
/// accessible from any global map (all_tasks, current_task, etc.).
pub(crate) fn cleanup_task_resources(task: &Arc<Task>) {
    crate::ipc::port::cleanup_ports_for_task(task.id);
    queue_silo_cleanup(task.id);

    // SAFETY: strong_count is racy (a concurrent get_task_by_id may temporarily
    // hold an extra Arc ref). Worst case: cleanup is deferred until the last ref
    // drops elsewhere - no resource leak, just delayed release.
    let is_last_process_ref = Arc::strong_count(&task.process) == 1;
    if !is_last_process_ref {
        return;
    }

    unsafe {
        (&mut *task.process.fd_table.get()).close_all();
        let capabilities = (&mut *task.process.capabilities.get()).take_all();
        for capability in &capabilities {
            crate::capability::release_capability(capability, Some(task.id));
        }
    }

    let as_ref = task.process.address_space_arc();
    if !as_ref.is_kernel() && Arc::strong_count(&as_ref) == 1 {
        as_ref.unmap_all_user_regions();
    }
}