use super::{runtime_ops::idle_task_main, *};

impl Scheduler {
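    /// Creates a scheduler for `cpu_count` CPUs, giving each CPU its own
    /// per-class run queues and a dedicated idle task.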
    pub fn new(cpu_count: usize) -> Self {
        crate::serial_println!(
            "[trace][sched] Scheduler::new enter cpu_count={}",
            cpu_count
        );
        let mut cpus = alloc::vec::Vec::new();
        for cpu_idx in 0..cpu_count {
            crate::serial_println!(
                "[trace][sched] Scheduler::new cpu={} create idle begin",
                cpu_idx
            );
            let idle_task = Task::new_kernel_task(idle_task_main, "idle", TaskPriority::Idle)
                .expect("Failed to create idle task");
            crate::serial_println!(
                "[trace][sched] Scheduler::new cpu={} create idle done id={}",
                cpu_idx,
                idle_task.id.as_u64()
            );
            idle_task.set_sched_policy(crate::process::sched::SchedPolicy::Idle);
            let mut class_rqs = PerCpuClassRqSet::new();
            class_rqs.enqueue(crate::process::sched::SchedClassId::Idle, idle_task.clone());
            cpus.push(SchedulerCpu {
                class_rqs,
                current_task: None,
                current_runtime: crate::process::sched::CurrentRuntime::new(),
                idle_task,
                task_to_requeue: None,
                task_to_drop: None,
                need_resched: false,
            });
            crate::serial_println!("[trace][sched] Scheduler::new cpu={} push done", cpu_idx);
        }
        crate::serial_println!(
            "[trace][sched] Scheduler::new cpus ready len={}",
            cpus.len()
        );

        Scheduler {
            cpus,
            blocked_tasks: BTreeMap::new(),
            all_tasks: BTreeMap::new(),
            task_cpu: BTreeMap::new(),
            pid_to_task: BTreeMap::new(),
            tid_to_task: BTreeMap::new(),
            pid_to_pgid: BTreeMap::new(),
            pid_to_sid: BTreeMap::new(),
            pgid_members: BTreeMap::new(),
            sid_members: BTreeMap::new(),
            wake_deadlines: BTreeMap::new(),
            wake_deadline_of: BTreeMap::new(),
            parent_of: BTreeMap::new(),
            children_of: BTreeMap::new(),
            zombies: BTreeMap::new(),
            class_table: crate::process::sched::SchedClassTable::default(),
        }
    }

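    /// Adds `task_id` to the member list for `key`, creating the list on
    /// first use; duplicate entries are ignored.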
    pub(crate) fn member_add(
        map: &mut BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
        key: Pid,
        task_id: TaskId,
    ) {
        crate::serial_println!(
            "[trace][sched] member_add enter key={} tid={}",
            key,
            task_id.as_u64()
        );
        let members = map.entry(key).or_default();
        crate::serial_println!(
            "[trace][sched] member_add entry key={} len={}",
            key,
            members.len()
        );
        if !members.iter().any(|id| *id == task_id) {
            members.push(task_id);
            crate::serial_println!(
                "[trace][sched] member_add pushed key={} tid={}",
                key,
                task_id.as_u64()
            );
        }
    }

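    /// Removes `task_id` from the member list for `key`, dropping the map
    /// entry once the list becomes empty.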
    pub(crate) fn member_remove(
        map: &mut BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
        key: Pid,
        task_id: TaskId,
    ) {
        let mut clear = false;
        if let Some(members) = map.get_mut(&key) {
            members.retain(|id| *id != task_id);
            clear = members.is_empty();
        }
        if clear {
            map.remove(&key);
        }
    }

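    /// Records a task's pid->pgid and pid->sid mappings and adds it to the
    /// process-group and session member lists. Callers must hold the
    /// scheduler lock.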
    pub(crate) fn register_identity_locked(&mut self, task: &Arc<Task>) {
        let task_id = task.id;
        let pid = task.pid;
        let pgid = task.pgid.load(Ordering::Relaxed);
        let sid = task.sid.load(Ordering::Relaxed);
        crate::serial_println!(
            "[trace][sched] register_identity enter tid={} pid={} pgid={} sid={}",
            task_id.as_u64(),
            pid,
            pgid,
            sid
        );
        self.pid_to_pgid.insert(pid, pgid);
        crate::serial_println!(
            "[trace][sched] register_identity pid_to_pgid inserted pid={}",
            pid
        );
        self.pid_to_sid.insert(pid, sid);
        crate::serial_println!(
            "[trace][sched] register_identity pid_to_sid inserted pid={}",
            pid
        );
        Self::member_add(&mut self.pgid_members, pgid, task_id);
        Self::member_add(&mut self.sid_members, sid, task_id);
        crate::serial_println!(
            "[trace][sched] register_identity done tid={}",
            task_id.as_u64()
        );
    }

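    /// Removes a task's pid/tid lookup entries along with its process-group
    /// and session memberships. Callers must hold the scheduler lock.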
    pub(crate) fn unregister_identity_locked(&mut self, task_id: TaskId, pid: Pid, tid: Tid) {
        self.pid_to_task.remove(&pid);
        self.tid_to_task.remove(&tid);
        if let Some(pgid) = self.pid_to_pgid.remove(&pid) {
            Self::member_remove(&mut self.pgid_members, pgid, task_id);
        }
        if let Some(sid) = self.pid_to_sid.remove(&pid) {
            Self::member_remove(&mut self.sid_members, sid, task_id);
        }
    }

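    /// Enqueues a task on the least-loaded CPU. Returns the CPU index to
    /// send a reschedule IPI to, or `None` if the task landed on the
    /// current CPU.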
    pub fn add_task(&mut self, task: Arc<Task>) -> Option<usize> {
        let cpu_index = self.select_cpu_for_task();
        self.add_task_on_cpu(task, cpu_index)
    }

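    /// Like `add_task`, but also records the parent/child link consumed
    /// later by `try_reap_child_locked`.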
    pub fn add_task_with_parent(&mut self, task: Arc<Task>, parent: TaskId) -> Option<usize> {
        let child = task.id;
        let cpu_index = self.select_cpu_for_task();
        let ipi = self.add_task_on_cpu(task, cpu_index);
        self.parent_of.insert(child, parent);
        self.children_of.entry(parent).or_default().push(child);
        ipi
    }

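    /// Marks a task `Ready`, registers it in the lookup maps, and enqueues
    /// it on `cpu_index`. Returns `Some(cpu_index)` when a remote CPU needs
    /// a reschedule IPI.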
    fn add_task_on_cpu(&mut self, task: Arc<Task>, cpu_index: usize) -> Option<usize> {
        let task_id = task.id;
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu enter tid={} cpu={}",
            task_id.as_u64(),
            cpu_index
        );
        unsafe {
            *task.state.get() = TaskState::Ready;
        }
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu state ready tid={}",
            task_id.as_u64()
        );

        crate::serial_println!(
            "[trace][sched] add_task_on_cpu before clone tid={} all_tasks_len={}",
            task_id.as_u64(),
            self.all_tasks.len()
        );
        let task_clone = task.clone();
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu before all_tasks.insert tid={}",
            task_id.as_u64()
        );
        self.all_tasks.insert(task_id, task_clone);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu all_tasks inserted tid={}",
            task_id.as_u64()
        );
        self.task_cpu.insert(task_id, cpu_index);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu task_cpu inserted tid={}",
            task_id.as_u64()
        );
        self.pid_to_task.insert(task.pid, task_id);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu pid map inserted tid={}",
            task_id.as_u64()
        );
        self.tid_to_task.insert(task.tid, task_id);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu tid map inserted tid={}",
            task_id.as_u64()
        );
        self.register_identity_locked(&task);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu identity registered tid={}",
            task_id.as_u64()
        );
        if let Some(cpu) = self.cpus.get_mut(cpu_index) {
            let class = self.class_table.class_for_task(&task);
            cpu.class_rqs.enqueue(class, task);
            cpu.need_resched = true;
            crate::serial_println!(
                "[trace][sched] add_task_on_cpu enqueued tid={} cpu={}",
                task_id.as_u64(),
                cpu_index
            );
        }
        sched_trace(format_args!(
            "enqueue task={} cpu={}",
            task_id.as_u64(),
            cpu_index
        ));
        if cpu_index != current_cpu_index() {
            Some(cpu_index)
        } else {
            None
        }
    }

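    /// Clears a task's wake deadline. Returns `false` if the task is
    /// unknown to the scheduler.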
    pub fn clear_task_wake_deadline_locked(&mut self, id: TaskId) -> bool {
        if let Some(task) = self.all_tasks.get(&id) {
            task.wake_deadline_ns.store(0, Ordering::Relaxed);
            true
        } else {
            false
        }
    }

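    /// Sets a task's wake deadline in nanoseconds; a `deadline` of 0 clears
    /// it. Returns `false` if the task is unknown.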
    pub fn set_task_wake_deadline_locked(&mut self, id: TaskId, deadline: u64) -> bool {
        if deadline == 0 {
            return self.clear_task_wake_deadline_locked(id);
        }
        if let Some(task) = self.all_tasks.get(&id) {
            task.wake_deadline_ns.store(deadline, Ordering::Relaxed);
            true
        } else {
            false
        }
    }

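    /// Wakes a blocked task by re-enqueueing it on its last CPU. If the
    /// task exists but is not blocked, a pending wake is recorded instead.
    /// Returns `(found, ipi_target)`, where `ipi_target` is the remote CPU
    /// that should receive a reschedule IPI, if any.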
    pub fn wake_task_locked(&mut self, id: TaskId) -> (bool, Option<usize>) {
        self.clear_task_wake_deadline_locked(id);
        if let Some(task) = self.blocked_tasks.remove(&id) {
            unsafe {
                *task.state.get() = TaskState::Ready;
            }
            let cpu_index = self.task_cpu.get(&id).copied().unwrap_or(0);
            if let Some(cpu) = self.cpus.get_mut(cpu_index) {
                let class = self.class_table.class_for_task(&task);
                cpu.class_rqs.enqueue(class, task);
                cpu.need_resched = true;
            }
            let ipi = if cpu_index != current_cpu_index() {
                Some(cpu_index)
            } else {
                None
            };
            (true, ipi)
        } else if let Some(task) = self.all_tasks.get(&id) {
            task.wake_pending
                .store(true, core::sync::atomic::Ordering::Release);
            (true, None)
        } else {
            (false, None)
        }
    }

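    /// Attempts to reap a zombie child of `parent` (optionally restricted
    /// to `target`), removing it from all bookkeeping and returning its
    /// exit status.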
    pub fn try_reap_child_locked(
        &mut self,
        parent: TaskId,
        target: Option<TaskId>,
    ) -> WaitChildResult {
        let Some(children_view) = self.children_of.get(&parent) else {
            return WaitChildResult::NoChildren;
        };

        if children_view.is_empty() {
            return WaitChildResult::NoChildren;
        }

        if let Some(target_id) = target {
            if !children_view.iter().any(|&id| id == target_id) {
                return WaitChildResult::NoChildren;
            }
        }

        // Find the first matching child that has already exited.
        let zombie = children_view
            .iter()
            .copied()
            .find(|id| target.map_or(true, |t| t == *id) && self.zombies.contains_key(id));

        if let Some(child) = zombie {
            let (status, child_pid) = self.zombies.remove(&child).unwrap_or((0, 0));
            let reaped_task = self.all_tasks.remove(&child);
            let child_tid = reaped_task.as_ref().map(|t| t.tid);
            if child_pid != 0 {
                if let Some(tid) = child_tid {
                    self.unregister_identity_locked(child, child_pid, tid);
                }
            }
            if let Some(children) = self.children_of.get_mut(&parent) {
                children.retain(|&id| id != child);
                if children.is_empty() {
                    self.children_of.remove(&parent);
                }
            }
            self.parent_of.remove(&child);
            return WaitChildResult::Reaped {
                child,
                pid: child_pid,
                status,
            };
        }

        WaitChildResult::StillRunning
    }

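    /// Retires the current task on `cpu_index` (requeueing it if still
    /// runnable, tearing it down if dead), then returns the next runnable
    /// task, work-stealing from other CPUs before falling back to the idle
    /// task.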
    pub fn pick_next_task(&mut self, cpu_index: usize) -> Arc<Task> {
        let current_task = self.cpus[cpu_index].current_task.take();
        if let Some(task) = current_task {
            let task_state = unsafe { *task.state.get() };
            if task_state == TaskState::Running {
                unsafe {
                    *task.state.get() = TaskState::Ready;
                }
                // Never hand the per-CPU idle task back for requeueing.
                if !Arc::ptr_eq(&task, &self.cpus[cpu_index].idle_task) {
                    self.cpus[cpu_index].task_to_requeue = Some(task);
                }
            } else if task_state == TaskState::Dead {
                let task_id = task.id;
                let task_pid = task.pid;
                let task_tid = task.tid;

                let was_registered = self.all_tasks.remove(&task_id).is_some();
                self.task_cpu.remove(&task_id);
                self.unregister_identity_locked(task_id, task_pid, task_tid);

                if was_registered {
                    super::task_ops::cleanup_task_resources(&task);
                }

                // Hand the dead task off to be dropped after the switch
                // away from it.
                self.cpus[cpu_index].task_to_drop = Some(task);
            }
            // Blocked (and any other) states are handled by the blocking
            // path; nothing to requeue here.
        }

        let next_task = if let Some(task) =
            self.cpus[cpu_index].class_rqs.pick_next(&self.class_table)
        {
            task
        } else if let Some(task) = self.steal_task(cpu_index, TICK_COUNT.load(Ordering::Relaxed)) {
            task
        } else {
            self.cpus[cpu_index].idle_task.clone()
        };

        unsafe {
            *next_task.state.get() = TaskState::Running;
        }
        let cloned = next_task.clone();
        let strong_after = Arc::strong_count(&cloned);
        self.cpus[cpu_index].current_task = Some(cloned);
        self.cpus[cpu_index].current_runtime = crate::process::sched::CurrentRuntime::new();
        sched_trace(format_args!(
            "cpu={} pick_next task={} policy={:?} strong={}",
            cpu_index,
            next_task.id.as_u64(),
            next_task.sched_policy(),
            strong_after,
        ));
        next_task
    }

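    /// Attempts to steal a runnable task from the most loaded CPU, subject
    /// to a per-CPU cooldown and a minimum load imbalance.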
    pub fn steal_task(&mut self, dst_cpu: usize, now_tick: u64) -> Option<Arc<Task>> {
        if !cpu_is_valid(dst_cpu) || dst_cpu >= self.cpus.len() {
            return None;
        }
        // Rate-limit stealing per destination CPU.
        let last = LAST_STEAL_TICK[dst_cpu].load(Ordering::Relaxed);
        if now_tick.saturating_sub(last) < STEAL_COOLDOWN_TICKS {
            return None;
        }

        let dst_load = {
            let cpu = &self.cpus[dst_cpu];
            cpu.class_rqs.runnable_len() + usize::from(cpu.current_task.is_some())
        };

        // Find the most loaded CPU other than the destination.
        let (best_src, best_src_load) = (0..self.cpus.len())
            .filter(|&i| i != dst_cpu)
            .map(|i| {
                let cpu = &self.cpus[i];
                let load = cpu.class_rqs.runnable_len() + usize::from(cpu.current_task.is_some());
                (i, load)
            })
            .max_by_key(|(_, load)| *load)?;

        if best_src_load < dst_load.saturating_add(STEAL_IMBALANCE_MIN) {
            return None;
        }

        // Only steal from a source with at least two runnable tasks, so it
        // is not left empty.
        if self.cpus[best_src].class_rqs.runnable_len() < 2 {
            return None;
        }

        let task = self.cpus[best_src]
            .class_rqs
            .steal_candidate(&self.class_table)?;
        self.task_cpu.insert(task.id, dst_cpu);
        log::trace!(
            "WS: CPU {} stole task {:?} from CPU {} (src had {} tasks)",
            dst_cpu,
            task.id,
            best_src,
            self.cpus[best_src].class_rqs.runnable_len() + 1
        );
        sched_trace(format_args!(
            "work-steal dst_cpu={} src_cpu={} task={}",
            dst_cpu,
            best_src,
            task.id.as_u64()
        ));
        if cpu_is_valid(dst_cpu) {
            CPU_STEAL_IN_COUNT[dst_cpu].fetch_add(1, Ordering::Relaxed);
        }
        if cpu_is_valid(best_src) {
            CPU_STEAL_OUT_COUNT[best_src].fetch_add(1, Ordering::Relaxed);
        }
        LAST_STEAL_TICK[dst_cpu].store(now_tick, Ordering::Relaxed);
        Some(task)
    }

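    /// Switches `cpu_index` to the next task. Returns the pointers needed
    /// by the low-level context switch, or `None` if no switch is required
    /// (or the next task's context fails validation).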
    pub(super) fn yield_cpu(&mut self, cpu_index: usize) -> Option<SwitchTarget> {
        if cpu_index >= self.cpus.len() {
            return None;
        }
        let current_ref = self.cpus[cpu_index].current_task.as_ref()?;
        // Sanity-check the Arc before cloning; an impossible strong count
        // indicates memory corruption.
        let strong_before = Arc::strong_count(current_ref);
        if strong_before == 0 || strong_before > (isize::MAX as usize) {
            log::error!(
                "[sched] CORRUPT Arc in yield_cpu before clone! cpu={} strong={:#x} task={} ptr={:p}",
                cpu_index,
                strong_before,
                current_ref.id.as_u64(),
                Arc::as_ptr(current_ref) as *const u8,
            );
            return None;
        }
        let current = current_ref.clone();

        let next = self.pick_next_task(cpu_index);

        if Arc::ptr_eq(&current, &next) {
            return None;
        }
        if cpu_is_valid(cpu_index) {
            CPU_SWITCH_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
        }

        if let Err(e) = validate_task_context(&next) {
            let bad_rsp = unsafe { (*next.context.get()).saved_rsp };
            let stk_base = next.kernel_stack.virt_base.as_u64();
            let stk_top = stk_base + next.kernel_stack.size as u64;
            crate::serial_println!(
                "[sched] WARN: invalid ctx for task '{}' (id={}) cpu={}: {} \
                 rsp={:#x} stack=[{:#x}..{:#x}] - restoring current task",
                next.name,
                next.id.as_u64(),
                cpu_index,
                e,
                bad_rsp,
                stk_base,
                stk_top,
            );
            log::error!(
                "[sched] invalid context for task '{}' (id={:?}) cpu={}: {} \
                 rsp={:#x} stack=[{:#x}..{:#x}]",
                next.name,
                next.id,
                cpu_index,
                e,
                bad_rsp,
                stk_base,
                stk_top,
            );

            let is_idle_fallback = Arc::ptr_eq(&next, &self.cpus[cpu_index].idle_task);

            drop(self.cpus[cpu_index].task_to_drop.take());

            // Keep the CPU on a known-good task: restore the requeued
            // previous task if there is one, otherwise the current task.
            if let Some(prev) = self.cpus[cpu_index].task_to_requeue.take() {
                unsafe {
                    *prev.state.get() = TaskState::Running;
                }
                self.cpus[cpu_index].current_task = Some(prev);
            } else {
                unsafe {
                    *current.state.get() = TaskState::Running;
                }
                self.cpus[cpu_index].current_task = Some(current.clone());
            }

            // Put the rejected task back on its run queue unless it is the
            // idle task.
            if !is_idle_fallback {
                unsafe {
                    *next.state.get() = TaskState::Ready;
                }
                let class = self.class_table.class_for_task(&next);
                if let Some(cpu) = self.cpus.get_mut(cpu_index) {
                    cpu.class_rqs.enqueue(class, next);
                }
            }
            return None;
        }

        let stack_top = next.kernel_stack.virt_base.as_u64() + next.kernel_stack.size as u64;
        crate::arch::x86_64::tss::set_kernel_stack(x86_64::VirtAddr::new(stack_top));

        crate::arch::x86_64::syscall::set_kernel_rsp(stack_top);

        unsafe {
            (*next.process.address_space.get()).switch_to();
        }

        Some(SwitchTarget {
            old_rsp_ptr: unsafe { &raw mut (*current.context.get()).saved_rsp },
            new_rsp_ptr: unsafe { &raw const (*next.context.get()).saved_rsp },
            old_fpu_ptr: current.fpu_state.get() as *mut u8,
            new_fpu_ptr: next.fpu_state.get() as *const u8,
            old_xcr0: current
                .xcr0_mask
                .load(core::sync::atomic::Ordering::Relaxed),
            new_xcr0: next.xcr0_mask.load(core::sync::atomic::Ordering::Relaxed),
        })
    }

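    /// Picks the CPU with the fewest runnable tasks; a CPU running only its
    /// idle task counts as unloaded.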
    fn select_cpu_for_task(&self) -> usize {
        let mut best = 0usize;
        let mut best_load = usize::MAX;
        for (idx, cpu) in self.cpus.iter().enumerate() {
            let mut load = cpu.class_rqs.runnable_len();
            if let Some(current) = cpu.current_task.as_ref() {
                if self.class_table.class_for_task(current)
                    != crate::process::sched::SchedClassId::Idle
                {
                    load += 1;
                }
            }
            if load < best_load {
                best = idx;
                best_load = load;
            }
        }
        crate::serial_println!(
            "[trace][sched] select_cpu_for_task best={} load={}",
            best,
            best_load
        );
        best
    }

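    /// Re-enqueues every ready task so its class assignment reflects an
    /// updated class table.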
    pub fn migrate_ready_tasks_for_new_class_table(&mut self) {
        let mut ready: Vec<(TaskId, Arc<Task>, usize)> = Vec::new();
        for (id, task) in self.all_tasks.iter() {
            let state = unsafe { *task.state.get() };
            if state != TaskState::Ready {
                continue;
            }
            let cpu = self.task_cpu.get(id).copied().unwrap_or(0);
            ready.push((*id, task.clone(), cpu));
        }

        for (id, task, cpu_idx) in ready {
            let Some(cpu) = self.cpus.get_mut(cpu_idx) else {
                continue;
            };
            if cpu.class_rqs.remove(id) {
                let class = self.class_table.class_for_task(&task);
                cpu.class_rqs.enqueue(class, task);
                cpu.need_resched = true;
            }
        }
    }
}