forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit d3c9f4e
sched/eevdf: Fix NULL deref when avg_vruntime nears overflow
Fix a kernel NULL pointer dereference in pick_next_task_fair() caused by
EEVDF scheduler arithmetic overflows when cfs_rq->avg_vruntime
approaches the s64 lower bound (S64_MIN).
The issue occurs when:
1. cfs_rq->avg_vruntime is driven downward by dynamic reweight
operations on se->vruntime combined with frequent enqueue/dequeue of
another sched_entity with large se->vlag values. Note that the presence
of only one other sched_entity (besides the current one) is critical
because having more would average out the effect and prevent the
continuous and rapid decrease of cfs_rq->avg_vruntime.
2. These factors `reweight` and `frequent enqueue/dequeue` persistently
suppress cfs_rq->min_vruntime, causing cfs_rq->avg_vruntime to
decrease rapidly toward S64_MIN.
3. In vruntime_eligible(), the calculation (int64_t)(vruntime -
cfs_rq->min_vruntime) * load may overflow downward (wrap past S64_MIN),
becoming a large positive value.
4. This causes vruntime_eligible() to incorrectly judge all tasks as
ineligible, leading to NULL pointer dereference in
pick_next_task_fair().
The fix addresses this by adjusting the current sched_entity's vruntime
during reweight operations when:
- The entity is cfs_rq->curr and the only running task
- The entity is on the runqueue
- Its vruntime is below min_vruntime
The most straightforward fix would be to adjust the vruntime during
dequeue, but that would require checking and possibly modifying the
curr's vruntime on every dequeue, which has a broader impact and
concurrency concerns. Therefore, we choose to apply the fix in the
reweight path, which is one of the necessary conditions for the problem
to occur.
BUG: kernel NULL pointer dereference, address: 00000000000000a0
RIP: 0010:pick_next_task_fair+0x39b/0xab03
KERNEL: vmlinux [TAINTED]
DUMPFILE: 127.0.0.1-2025-10-30-13:52:24/vmcore [PARTIAL DUMP]
CPUS: 4
DATE: Thu Oct 30 05:52:18 UTC 2025
UPTIME: 02:02:50
LOAD AVERAGE: 15.00, 15.00, 15.00
TASKS: 151
NODENAME: SangforOS.localdomain
RELEASE: 6.6.0+
VERSION: #4 SMP Thu Oct 30 11:25:11 CST 2025
MACHINE: x86_64 (2194 Mhz)
MEMORY: 4 GB
PANIC: "Oops: 0000 [#1] SMP PTI" (check log for details)
PID: 4702
COMMAND: "test_sched_2/-1"
TASK: ffff8881362dcf80 [THREAD_INFO: ffff8881362dcf80]
CPU: 1
STATE: TASK_UNINTERRUPTIBLE (PANIC)
crash> bt
PID: 4702 TASK: ffff8881362dcf80 CPU: 1 COMMAND: "test_sched_2/-1"
#0 [ffffc90000fffab0] machine_kexec at ffffffffb567e767
#1 [ffffc90000fffb10] __crash_kexec at ffffffffb580474a
#2 [ffffc90000fffbd0] crash_kexec at ffffffffb5805768
#3 [ffffc90000fffbd8] oops_end at ffffffffb5639599
#4 [ffffc90000fffbf8] page_fault_oops at ffffffffb56954a8
#5 [ffffc90000fffc50] exc_page_fault at ffffffffb63424a9
#6 [ffffc90000fffcb0] asm_exc_page_fault at ffffffffb6400c12
[exception RIP: pick_next_task_fair+923]
RIP: ffffffffb576f22b RSP: ffffc90000fffd60 RFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff8881340b4d80 RCX: 82a3cdbe7f1c7aed
RDX: 01721730951583fc RSI: 0000000000015f5f RDI: 00105468401dc9e3
RBP: ffffc90000fffe18 R8: 00000000000003fa R9: 0000000000000002
R10: 0000000000000002 R11: 0000000000000064 R12: ffff8881362dcf80
R13: ffffc90000fffdc0 R14: ffff8881340b4e00 R15: ffff8881340b4e00
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000
#7 [ffffc90000fffdb0] __schedule at ffffffffb6348cc8
#8 [ffffc90000fffe20] schedule at ffffffffb63493ab
#9 [ffffc90000fffe38] schedule_timeout at ffffffffb634eeaf
crash>
crash>
crash> p runqueues
PER-CPU DATA TYPE:
struct rq runqueues;
PER-CPU ADDRESSES:
[0]: ffff888134034d80
[1]: ffff8881340b4d80
[2]: ffff888134134d80
[3]: ffff8881341b4d80
crash>
crash> struct -o rq.cfs ffff8881340b4d80
struct rq {
[ffff8881340b4e00] struct cfs_rq cfs;
}
crash> struct cfs_rq.nr_running,curr,next,tasks_timeline,min_vruntime,avg_vruntime,avg_load,load,exec_clock ffff8881340b4e00
nr_running = 3,
curr = 0xffff888139b57c00,
next = 0xffff888139b57c00,
tasks_timeline = {
rb_root = {
rb_node = 0xffff8881362d80d0
},
rb_leftmost = 0xffff8881362d9b50
},
min_vruntime = 4596406356396515,
avg_vruntime = -9137321448325056783,
avg_load = 88933,
load = {
weight = 92109859,
inv_weight = 0
},
exec_clock = 0,
crash> struct sched_entity.on_rq,deadline,min_vruntime,vruntime,load,vlag,slice,exec_start,sum_exec_runtime,prev_sum_exec_runtime,my_q,run_node 0xffff888139b57c00
on_rq = 1,
deadline = 4705706610399852,
min_vruntime = 4493662477571149,
vruntime = 4698735667604793,
load = {
weight = 1042467,
inv_weight = 0
},
vlag = 4493662483537817,
slice = 2250000,
exec_start = 7308537586004,
sum_exec_runtime = 7196457582967,
prev_sum_exec_runtime = 7196456203065,
my_q = 0xffff888139b55000,
run_node = {
__rb_parent_color = 1,
rb_right = 0xffff8881362d80d0,
rb_left = 0x0
},
crash> struct sched_entity.deadline,min_vruntime,vruntime,load,vlag,slice,exec_start,sum_exec_runtime,prev_sum_exec_runtime,my_q,run_node -l sched_entity.run_node 0xffff8881362d80d0
deadline = 4493662533339551,
min_vruntime = 4493662476669436,
vruntime = 4493662519944203,
load = {
weight = 176128,
inv_weight = 24970740
},
vlag = 4493662519002535,
slice = 2250000,
exec_start = 7308527703195,
sum_exec_runtime = 4759831,
prev_sum_exec_runtime = 2351660,
my_q = 0x0,
run_node = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0xffff8881362d9b50
},
crash> struct sched_entity.deadline,min_vruntime,vruntime,load,vlag,slice,exec_start,sum_exec_runtime,prev_sum_exec_runtime,my_q,run_node -l sched_entity.run_node 0xffff8881362d9b50
deadline = 4493662476695393,
min_vruntime = 4493662476669436,
vruntime = 4493662476669436,
load = {
weight = 90891264,
inv_weight = 48388
},
vlag = 51914,
slice = 2250000,
exec_start = 7308536206102,
sum_exec_runtime = 2102797408,
prev_sum_exec_runtime = 2102198648,
my_q = 0x0,
run_node = {
__rb_parent_color = 18446612687273951440,
rb_right = 0x0,
rb_left = 0x0
},
crash>
In vruntime_eligible():
for sched_entity curr [0xffff888139b57c00]: avg [-9033150209515029779], (int64_t)(vruntime - cfs_rq->min_vruntime) * load [9204623872495814378], so return false
for sched_entity root [0xffff8881362d80d0]: avg [-9033150209515029779], (int64_t)(vruntime - cfs_rq->min_vruntime) * load [9204833240987634904], so return false
for sched_entity leftmost [0xffff8881362d9b50]: avg [-9033150209515029779], (int64_t)(vruntime - cfs_rq->min_vruntime) * load [9204829348379068487], so return false
Therefore, all sched_entities on this cfs_rq have no eligibility to run
to cause the NULL pointer dereference in pick_next_task_fair().
Fixes: 147f3ef ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Zicheng Qu <[email protected]>
Signed-off-by: wulibin163 <[email protected]>
1 parent 977b9a0 · commit d3c9f4e
Copy full SHA for d3c9f4e
File tree
Expand file tree / Collapse file tree
1 file changed
+8
-0
lines changed
Open diff view settings
Filter options
- kernel/sched
Expand file tree / Collapse file tree
1 file changed
+8
-0
lines changed
Open diff view settings
Collapse file
+8 lines changed: 8 additions & 0 deletions
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
3772 | 3772 | | |
3773 | 3773 | | |
3774 | 3774 | | |
| 3775 | + | |
| 3776 | + | |
| 3777 | + | |
| 3778 | + | |
| 3779 | + | |
| 3780 | + | |
| 3781 | + | |
| 3782 | + | |
3775 | 3783 | | |
3776 | 3784 | | |
3777 | 3785 | | |
| |||
0 commit comments