use super::{runtime_ops::idle_task_main, *};

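/// Builds the per-CPU scheduler state for `cpu_idx`, creating that CPU's
/// dedicated idle task and seeding the per-class run queues with it.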
pub(super) fn create_cpu_scheduler(cpu_idx: usize) -> SchedulerCpu {
    crate::serial_println!(
        "[trace][sched] create_cpu_scheduler cpu={} create idle begin",
        cpu_idx
    );
    let idle_task = Task::new_kernel_task(idle_task_main, "idle", TaskPriority::Idle)
        .expect("Failed to create idle task");
    crate::serial_println!(
        "[trace][sched] create_cpu_scheduler cpu={} create idle done id={}",
        cpu_idx,
        idle_task.id.as_u64()
    );
    idle_task.set_sched_policy(crate::process::sched::SchedPolicy::Idle);
    let mut class_rqs = PerCpuClassRqSet::new();
    class_rqs.enqueue(crate::process::sched::SchedClassId::Idle, idle_task.clone());
    SchedulerCpu {
        class_rqs,
        current_task: None,
        current_runtime: crate::process::sched::CurrentRuntime::new(),
        idle_task,
        task_to_requeue: None,
        task_to_drop: None,
        need_resched: false,
        class_table: crate::process::sched::SchedClassTable::default(),
    }
}

impl GlobalSchedState {
    pub fn new() -> Self {
        crate::serial_println!("[trace][sched] GlobalSchedState::new enter");
        GlobalSchedState {
            all_tasks: BTreeMap::new(),
            all_tasks_scan: Vec::new(),
            task_cpu: BTreeMap::new(),
            wake_deadlines: BTreeMap::new(),
            wake_deadline_of: BTreeMap::new(),
            zombies: BTreeMap::new(),
            class_table: crate::process::sched::SchedClassTable::default(),
        }
    }

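    /// Adds `task_id` to the member list for `key`, creating the list on
    /// first use and skipping duplicates.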
    pub(crate) fn member_add(
        map: &mut BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
        key: Pid,
        task_id: TaskId,
    ) {
        let members = map.entry(key).or_default();
        if !members.iter().any(|id| *id == task_id) {
            members.push(task_id);
        }
    }

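    /// Removes `task_id` from the member list for `key`, dropping the map
    /// entry entirely once the list becomes empty.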
    pub(crate) fn member_remove(
        map: &mut BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
        key: Pid,
        task_id: TaskId,
    ) {
        let mut clear = false;
        if let Some(members) = map.get_mut(&key) {
            members.retain(|id| *id != task_id);
            clear = members.is_empty();
        }
        if clear {
            map.remove(&key);
        }
    }

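    /// Records a task's pid→pgid and pid→sid mappings and adds it to the
    /// process-group and session member lists. Takes `&mut SchedIdentity`,
    /// so the caller necessarily already holds the `SCHED_IDENTITY` write
    /// lock (hence the `_locked` suffix).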
    pub(crate) fn register_identity_locked(identity: &mut SchedIdentity, task: &Arc<Task>) {
        let task_id = task.id;
        let pid = task.pid;
        let pgid = task.pgid.load(Ordering::Relaxed);
        let sid = task.sid.load(Ordering::Relaxed);
        crate::serial_println!(
            "[trace][sched] register_identity enter tid={} pid={} pgid={} sid={}",
            task_id.as_u64(),
            pid,
            pgid,
            sid
        );
        identity.pid_to_pgid.insert(pid, pgid);
        crate::serial_println!(
            "[trace][sched] register_identity pid_to_pgid inserted pid={}",
            pid
        );
        identity.pid_to_sid.insert(pid, sid);
        crate::serial_println!(
            "[trace][sched] register_identity pid_to_sid inserted pid={}",
            pid
        );
        Self::member_add(&mut identity.pgid_members, pgid, task_id);
        Self::member_add(&mut identity.sid_members, sid, task_id);
        crate::serial_println!(
            "[trace][sched] register_identity done tid={}",
            task_id.as_u64()
        );
    }

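    /// Reverses `register_identity_locked`: drops the task from the pid/tid
    /// lookup maps and from its process-group and session member lists.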
    pub(crate) fn unregister_identity_locked(
        identity: &mut SchedIdentity,
        task_id: TaskId,
        pid: Pid,
        tid: Tid,
    ) {
        identity.pid_to_task.remove(&pid);
        identity.tid_to_task.remove(&tid);
        if let Some(pgid) = identity.pid_to_pgid.remove(&pid) {
            Self::member_remove(&mut identity.pgid_members, pgid, task_id);
        }
        if let Some(sid) = identity.pid_to_sid.remove(&pid) {
            Self::member_remove(&mut identity.sid_members, sid, task_id);
        }
    }

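    /// Admits a new task, placing it on the least-loaded CPU. Returns
    /// `Some(cpu)` when the chosen CPU is not the calling one, presumably so
    /// the caller can send that CPU a reschedule IPI.
    ///
    /// A caller-side sketch (`worker_main` and `TaskPriority::Normal` are
    /// illustrative names, not part of this module; error handling elided):
    ///
    /// ```ignore
    /// let task = Task::new_kernel_task(worker_main, "worker", TaskPriority::Normal)?;
    /// if let Some(target_cpu) = sched.add_task(task) {
    ///     // Notify `target_cpu` so it notices the newly runnable task.
    /// }
    /// ```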
    pub fn add_task(&mut self, task: Arc<Task>) -> Option<usize> {
        let cpu_index = self.select_cpu_for_task();
        self.add_task_on_cpu(task, cpu_index)
    }

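    /// Like `add_task`, but also links the new task into the parent/child
    /// maps so the parent can later reap it via `try_reap_child_locked`.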
    pub fn add_task_with_parent(&mut self, task: Arc<Task>, parent: TaskId) -> Option<usize> {
        let child = task.id;
        let cpu_index = self.select_cpu_for_task();
        let ipi = self.add_task_on_cpu(task, cpu_index);
        {
            let mut identity = SCHED_IDENTITY.write();
            identity.parent_of.insert(child, parent);
            identity.children_of.entry(parent).or_default().push(child);
        }
        ipi
    }

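    /// Registers `task` in the global bookkeeping maps, records its home
    /// CPU, publishes its pid/tid identity, and enqueues it on
    /// `cpu_index`'s run queue. Returns `Some(cpu_index)` when the target
    /// CPU is not the calling one and therefore needs a cross-CPU wakeup.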
    fn add_task_on_cpu(&mut self, task: Arc<Task>, cpu_index: usize) -> Option<usize> {
        let task_id = task.id;
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu enter tid={} cpu={}",
            task_id.as_u64(),
            cpu_index
        );
        task.set_state(TaskState::Ready);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu state ready tid={}",
            task_id.as_u64()
        );

        crate::serial_println!(
            "[trace][sched] add_task_on_cpu before clone tid={} all_tasks_len={}",
            task_id.as_u64(),
            self.all_tasks.len()
        );
        let task_clone = task.clone();
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu before all_tasks.insert tid={}",
            task_id.as_u64()
        );
        self.insert_all_task_locked(task_id, task_clone);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu all_tasks inserted tid={}",
            task_id.as_u64()
        );
        self.task_cpu.insert(task_id, cpu_index);
        task.home_cpu
            .store(cpu_index, core::sync::atomic::Ordering::Relaxed);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu task_cpu inserted tid={}",
            task_id.as_u64()
        );
        {
            let mut identity = SCHED_IDENTITY.write();
            identity.pid_to_task.insert(task.pid, task_id);
            crate::serial_println!(
                "[trace][sched] add_task_on_cpu pid map inserted tid={}",
                task_id.as_u64()
            );
            identity.tid_to_task.insert(task.tid, task_id);
            crate::serial_println!(
                "[trace][sched] add_task_on_cpu tid map inserted tid={}",
                task_id.as_u64()
            );
            Self::register_identity_locked(&mut identity, &task);
            crate::serial_println!(
                "[trace][sched] add_task_on_cpu identity registered tid={}",
                task_id.as_u64()
            );
        }
        {
            let class = self.class_table.class_for_task(&task);
            if let Some(ref mut local_cpu) = *LOCAL_SCHEDULERS[cpu_index].lock() {
                local_cpu.class_rqs.enqueue(class, task);
                local_cpu.need_resched = true;
                crate::serial_println!(
                    "[trace][sched] add_task_on_cpu enqueued tid={} cpu={}",
                    task_id.as_u64(),
                    cpu_index
                );
            }
        }
        sched_trace(format_args!(
            "enqueue task={} cpu={}",
            task_id.as_u64(),
            cpu_index
        ));
        if cpu_index != current_cpu_index() {
            Some(cpu_index)
        } else {
            None
        }
    }

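    /// Inserts `task` into both `all_tasks` (keyed by id) and the flat
    /// `all_tasks_scan` vector, panicking if the two structures ever
    /// disagree. The byte written to port 0xe9 marks the failure on the
    /// QEMU/Bochs debug console, presumably so it stays visible even if
    /// serial logging itself is wedged.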
    pub(super) fn insert_all_task_locked(&mut self, task_id: TaskId, task: Arc<Task>) {
        assert_eq!(
            task.id,
            task_id,
            "scheduler corruption: insert_all_task_locked task.id={} != task_id={}",
            task.id.as_u64(),
            task_id.as_u64()
        );
        if self.all_tasks.contains_key(&task_id) {
            // Emit a marker byte on the QEMU/Bochs debug port before panicking.
            unsafe {
                core::arch::asm!("mov al, 'D'; out 0xe9, al", out("al") _);
            }
            crate::serial_force_println!(
                "[RACE] insert_all_task_locked: duplicate tid={} all_tasks={} all_tasks_scan={}",
                task_id.as_u64(),
                self.all_tasks.len(),
                self.all_tasks_scan.len()
            );
            panic!(
                "scheduler corruption: duplicate insert_all_task_locked tid={}",
                task_id.as_u64()
            );
        }
        self.all_tasks.insert(task_id, task.clone());
        self.all_tasks_scan.push(task);
        let bt_len = self.all_tasks.len();
        let scan_len = self.all_tasks_scan.len();
        if bt_len != scan_len {
            unsafe {
                core::arch::asm!("mov al, 'X'; out 0xe9, al", out("al") _);
            }
            crate::serial_force_println!(
                "[RACE] insert_all_task_locked: all_tasks={} != all_tasks_scan={} tid={}",
                bt_len,
                scan_len,
                task_id.as_u64()
            );
            panic!(
                "scheduler corruption: insert_all_task_locked len mismatch all_tasks={} all_tasks_scan={} tid={}",
                bt_len,
                scan_len,
                task_id.as_u64()
            );
        }
    }

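    /// Removes a task from both bookkeeping structures, returning the
    /// removed `Arc<Task>` if it was present. Like the insert path, any
    /// divergence between `all_tasks` and `all_tasks_scan` is treated as
    /// scheduler corruption and panics.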
    pub(super) fn remove_all_task_locked(&mut self, task_id: TaskId) -> Option<Arc<Task>> {
        let removed = self.all_tasks.remove(&task_id);
        if removed.is_some() {
            if let Some(idx) = self
                .all_tasks_scan
                .iter()
                .position(|task| task.id == task_id)
            {
                // swap_remove keeps this O(1); the scan vector's order is
                // not relied upon.
                self.all_tasks_scan.swap_remove(idx);
            } else {
                unsafe { core::arch::asm!("mov al, 'Z'; out 0xe9, al", out("al") _) };
                crate::serial_force_println!(
                    "[RACE] remove_all_task_locked: tid={} in all_tasks but NOT in all_tasks_scan",
                    task_id.as_u64()
                );
                panic!(
                    "scheduler corruption: remove_all_task_locked missing scan entry tid={}",
                    task_id.as_u64()
                );
            }
        }
        let bt_len = self.all_tasks.len();
        let scan_len = self.all_tasks_scan.len();
        if bt_len != scan_len {
            unsafe {
                core::arch::asm!("mov al, 'X'; out 0xe9, al", out("al") _);
            }
            crate::serial_force_println!(
                "[RACE] remove_all_task_locked: all_tasks={} != all_tasks_scan={} tid={}",
                bt_len,
                scan_len,
                task_id.as_u64()
            );
            panic!(
                "scheduler corruption: remove_all_task_locked len mismatch all_tasks={} all_tasks_scan={} tid={}",
                bt_len,
                scan_len,
                task_id.as_u64()
            );
        }
        removed
    }

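    /// Clears a task's wake deadline (0 means "no deadline"). Returns
    /// `false` if the task is unknown.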
    pub fn clear_task_wake_deadline_locked(&mut self, id: TaskId) -> bool {
        if let Some(task) = self.all_tasks.get(&id) {
            task.wake_deadline_ns.store(0, Ordering::Relaxed);
            true
        } else {
            false
        }
    }

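    /// Sets a task's wake deadline in nanoseconds; a deadline of 0 is
    /// treated as a clear. Returns `false` if the task is unknown.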
    pub fn set_task_wake_deadline_locked(&mut self, id: TaskId, deadline: u64) -> bool {
        if deadline == 0 {
            return self.clear_task_wake_deadline_locked(id);
        }
        if let Some(task) = self.all_tasks.get(&id) {
            task.wake_deadline_ns.store(deadline, Ordering::Relaxed);
            true
        } else {
            false
        }
    }

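    /// Marks a task as having a pending wakeup, publishing the flag with
    /// `Release` ordering so the owning CPU observes it on its next
    /// scheduling pass. The second slot of the return tuple is always
    /// `None` here; it appears to mirror the IPI-target convention used by
    /// `add_task`.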
    pub fn wake_task_locked(&mut self, id: TaskId) -> (bool, Option<usize>) {
        self.clear_task_wake_deadline_locked(id);
        if let Some(task) = self.all_tasks.get(&id) {
            task.wake_pending
                .store(true, core::sync::atomic::Ordering::Release);
            (true, None)
        } else {
            (false, None)
        }
    }

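    /// Attempts to reap a zombie child of `parent`. With `target == None`
    /// any zombie child qualifies (wait-any); with `Some(id)` only that
    /// child does. On success the child is removed from every scheduler
    /// structure and its exit status is returned.
    ///
    /// A caller-side sketch (the surrounding syscall-layer comments are
    /// illustrative):
    ///
    /// ```ignore
    /// match sched.try_reap_child_locked(parent_id, None) {
    ///     WaitChildResult::Reaped { pid, status, .. } => { /* report (pid, status) */ }
    ///     WaitChildResult::StillRunning => { /* block until a child exits */ }
    ///     WaitChildResult::NoChildren => { /* nothing to wait for: error out */ }
    /// }
    /// ```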
    pub fn try_reap_child_locked(
        &mut self,
        parent: TaskId,
        target: Option<TaskId>,
    ) -> WaitChildResult {
        let target_is_child = {
            let identity = SCHED_IDENTITY.read();
            let Some(children_view) = identity.children_of.get(&parent) else {
                return WaitChildResult::NoChildren;
            };

            if children_view.is_empty() {
                return WaitChildResult::NoChildren;
            }

            if let Some(target_id) = target {
                children_view.iter().any(|&id| id == target_id)
            } else {
                true
            }
        };

        if !target_is_child {
            return WaitChildResult::NoChildren;
        }

        let zombie = {
            let identity = SCHED_IDENTITY.read();
            let children = match identity.children_of.get(&parent) {
                Some(c) => c.clone(),
                None => return WaitChildResult::NoChildren,
            };
            children
                .iter()
                .copied()
                .find(|id| target.map_or(true, |t| t == *id) && self.zombies.contains_key(id))
        };

        if let Some(child) = zombie {
            let (status, child_pid) = self.zombies.remove(&child).unwrap_or((0, 0));
            let reaped_task = self.remove_all_task_locked(child);
            if let Some(task) = reaped_task.as_ref() {
                super::task_ops::cleanup_task_resources(task);
            }
            let child_tid = reaped_task.as_ref().map(|t| t.tid);
            if child_pid != 0 {
                if let Some(tid) = child_tid {
                    let mut identity = SCHED_IDENTITY.write();
                    Self::unregister_identity_locked(&mut identity, child, child_pid, tid);
                }
            }
            {
                let mut identity = SCHED_IDENTITY.write();
                if let Some(children) = identity.children_of.get_mut(&parent) {
                    children.retain(|&id| id != child);
                    if children.is_empty() {
                        identity.children_of.remove(&parent);
                    }
                }
                identity.parent_of.remove(&child);
            }
            return WaitChildResult::Reaped {
                child,
                pid: child_pid,
                status,
            };
        }

        WaitChildResult::StillRunning
    }

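    /// Picks the CPU with the lightest load for a new task: each CPU's load
    /// is its runnable-queue length, plus one if it is currently running a
    /// non-idle task. During early boot, when every CPU is idle, CPU 0 is
    /// chosen outright.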
    fn select_cpu_for_task(&self) -> usize {
        let n = active_cpu_count();
        // If no CPU is running anything yet (early boot), default to CPU 0.
        let all_idle = (0..n).all(|i| {
            LOCAL_SCHEDULERS[i]
                .lock()
                .as_ref()
                .map(|cpu| cpu.current_task.is_none())
                .unwrap_or(true)
        });
        if all_idle {
            crate::serial_println!("[trace][sched] select_cpu_for_task early-boot best=0");
            return 0;
        }
        let mut best = 0usize;
        let mut best_load = usize::MAX;
        for idx in 0..n {
            let load = {
                let guard = LOCAL_SCHEDULERS[idx].lock();
                if let Some(ref cpu) = *guard {
                    let mut l = cpu.class_rqs.runnable_len();
                    if let Some(current) = cpu.current_task.as_ref() {
                        if self.class_table.class_for_task(current)
                            != crate::process::sched::SchedClassId::Idle
                        {
                            l += 1;
                        }
                    }
                    l
                } else {
                    0
                }
            };
            if load < best_load {
                best = idx;
                best_load = load;
            }
        }
        crate::serial_println!(
            "[trace][sched] select_cpu_for_task best={} load={}",
            best,
            best_load
        );
        best
    }

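    /// After the scheduling-class table changes, re-enqueues every Ready
    /// task on its CPU so it lands in the run queue of its (possibly new)
    /// class.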
    pub fn migrate_ready_tasks_for_new_class_table(&mut self) {
        let mut ready: Vec<(TaskId, Arc<Task>, usize)> = Vec::new();
        for (id, task) in self.all_tasks.iter() {
            let state = task.get_state();
            if state != TaskState::Ready {
                continue;
            }
            let cpu = self.task_cpu.get(id).copied().unwrap_or(0);
            ready.push((*id, task.clone(), cpu));
        }

        for (id, task, cpu_idx) in ready {
            let mut guard = LOCAL_SCHEDULERS[cpu_idx].lock();
            let Some(ref mut cpu) = *guard else {
                continue;
            };
            if cpu.class_rqs.remove(id) {
                let class = self.class_table.class_for_task(&task);
                cpu.class_rqs.enqueue(class, task);
                cpu.need_resched = true;
            }
        }
    }
}

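/// Work stealing: when this CPU runs dry it tries to pull one runnable task
/// from the most loaded sibling CPU. The attempt is throttled by a per-CPU
/// cooldown tick and gated on a minimum load imbalance, and every lock is
/// taken with a non-blocking `try_lock` variant so the caller never blocks
/// on another CPU's scheduler lock.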
pub(super) fn steal_task_local(cpu: &mut SchedulerCpu, cpu_index: usize) -> Option<Arc<Task>> {
    let now_tick = TICK_COUNT.load(Ordering::Relaxed);
    if now_tick < LAST_STEAL_TICK[cpu_index].load(Ordering::Relaxed) + STEAL_COOLDOWN_TICKS {
        return None;
    }

    let mut scheduler = GLOBAL_SCHED_STATE.try_lock_no_irqsave()?;
    let sched = scheduler.as_mut()?;

    let n = active_cpu_count();
    let my_load = cpu.class_rqs.runnable_len();

    let mut best_cpu = None;
    let mut best_load = 0usize;

    for i in 0..n {
        if i == cpu_index {
            continue;
        }
        if let Some(guard) = LOCAL_SCHEDULERS[i].try_lock_no_irqsave() {
            if let Some(ref sib) = *guard {
                let load = sib.class_rqs.runnable_len();
                if load > best_load {
                    best_load = load;
                    best_cpu = Some(i);
                }
            }
        }
    }

    if best_load < my_load.saturating_add(STEAL_IMBALANCE_MIN) {
        return None;
    }
    let steal_from = best_cpu?;

    if let Some(mut guard) = LOCAL_SCHEDULERS[steal_from].try_lock_no_irqsave() {
        if let Some(ref mut sib) = *guard {
            if sib.class_rqs.runnable_len() < 2 {
                return None;
            }
            if let Some(task) = sib.class_rqs.steal_candidate(&sib.class_table) {
                sched.task_cpu.insert(task.id, cpu_index);
                task.home_cpu
                    .store(cpu_index, core::sync::atomic::Ordering::Relaxed);
                if cpu_is_valid(cpu_index) {
                    CPU_STEAL_IN_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
                }
                if cpu_is_valid(steal_from) {
                    CPU_STEAL_OUT_COUNT[steal_from].fetch_add(1, Ordering::Relaxed);
                }
                LAST_STEAL_TICK[cpu_index].store(now_tick, Ordering::Relaxed);
                return Some(task);
            }
        }
    }
    None
}

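/// Core dispatch decision for one CPU: retires the outgoing task according
/// to its state (stage for requeue if still Running, stage for drop if
/// Dead, leave alone otherwise), then picks the next task from the class
/// run queues, falling back to work stealing and finally to the idle task.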
pub(super) fn pick_next_task_local(cpu: &mut SchedulerCpu, cpu_index: usize) -> Arc<Task> {
    if let Some(task) = cpu.current_task.take() {
        match task.get_state() {
            TaskState::Running => {
                task.set_state(TaskState::Ready);
                if !Arc::ptr_eq(&task, &cpu.idle_task) {
                    // Requeue is deferred to drain_post_switch_local so the
                    // task's context is saved before it becomes runnable again.
                    cpu.task_to_requeue = Some(task);
                }
            }
            TaskState::Dead => {
                // The final drop is deferred too: releasing the last Arc here
                // could free resources this CPU is still using mid-switch.
                cpu.task_to_drop = Some(task);
            }
            TaskState::Blocked | TaskState::Ready => {
                // No dispatch-side action is needed for these states.
            }
        }
    }

    let next = if let Some(next) = cpu.class_rqs.pick_next(&cpu.class_table) {
        next
    } else if let Some(stolen) = steal_task_local(cpu, cpu_index) {
        stolen
    } else {
        cpu.idle_task.clone()
    };

    next.set_state(TaskState::Running);
    cpu.current_task = Some(next.clone());
    cpu.current_runtime = crate::process::sched::CurrentRuntime::new();
    next
}

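/// Prepares a context switch on this CPU. Picks the successor, validates its
/// saved context (backing out and restoring the current task if the
/// successor looks corrupt), points the TSS and syscall entry at the
/// successor's kernel stack, switches address spaces, and hands back the raw
/// pointers the low-level switch routine needs. Returns `None` when no
/// switch is required.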
pub(super) fn yield_cpu_local(cpu: &mut SchedulerCpu, cpu_index: usize) -> Option<SwitchTarget> {
    let current = cpu.current_task.as_ref()?.clone();

    let next = pick_next_task_local(cpu, cpu_index);

    if Arc::ptr_eq(&current, &next) {
        return None;
    }
    if cpu_is_valid(cpu_index) {
        CPU_SWITCH_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
    }

    if let Err(e) = validate_task_context(&next) {
        let bad_rsp = unsafe { (*next.context.get()).saved_rsp };
        let stk_base = next.kernel_stack.virt_base.as_u64();
        let stk_top = stk_base + next.kernel_stack.size as u64;
        crate::serial_println!(
            "[sched-local] WARN: invalid ctx task='{}' id={} cpu={}: {} \
             rsp={:#x} stack=[{:#x}..{:#x}] : restoring current",
            next.name,
            next.id.as_u64(),
            cpu_index,
            e,
            bad_rsp,
            stk_base,
            stk_top,
        );

        // Back out of the switch: keep running whichever task was current
        // and return the rejected task to its run queue (the idle task is
        // not re-enqueued here).
        let is_idle = Arc::ptr_eq(&next, &cpu.idle_task);
        drop(cpu.task_to_drop.take());
        if let Some(prev) = cpu.task_to_requeue.take() {
            prev.set_state(TaskState::Running);
            cpu.current_task = Some(prev);
        } else {
            current.set_state(TaskState::Running);
            cpu.current_task = Some(current.clone());
        }
        if !is_idle {
            next.set_state(TaskState::Ready);
            let class = cpu.class_table.class_for_task(&next);
            cpu.class_rqs.enqueue(class, next);
        }
        return None;
    }

    let stack_top = next.kernel_stack.virt_base.as_u64() + next.kernel_stack.size as u64;
    crate::arch::x86_64::tss::set_kernel_stack(x86_64::VirtAddr::new(stack_top));
    crate::arch::x86_64::syscall::set_kernel_rsp(stack_top);

    unsafe {
        next.process.address_space_arc().switch_to();
    }

    Some(SwitchTarget {
        old_rsp_ptr: unsafe { &raw mut (*current.context.get()).saved_rsp },
        new_rsp_ptr: unsafe { &raw const (*next.context.get()).saved_rsp },
        old_fpu_ptr: current.fpu_state.get() as *mut u8,
        new_fpu_ptr: next.fpu_state.get() as *const u8,
        old_xcr0: current
            .xcr0_mask
            .load(core::sync::atomic::Ordering::Relaxed),
        new_xcr0: next.xcr0_mask.load(core::sync::atomic::Ordering::Relaxed),
    })
}

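/// Runs after the low-level context switch completes: re-enqueues the task
/// staged by `pick_next_task_local` and, when `take_drop` is set, hands the
/// dead task back to the caller, presumably so its final `Arc` drop happens
/// on a stack other than the dead task's own.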
pub(super) fn drain_post_switch_local(
    cpu: &mut SchedulerCpu,
    take_drop: bool,
) -> Option<Arc<Task>> {
    let task_to_drop = if take_drop {
        cpu.task_to_drop.take()
    } else {
        None
    };
    if let Some(task) = cpu.task_to_requeue.take() {
        let class = cpu.class_table.class_for_task(&task);
        cpu.class_rqs.enqueue(class, task);
    }
    task_to_drop
}