Skip to content

Commit fd6db58

Browse files
committed
slab: fix barn NULL pointer dereference on memoryless nodes
Phil reported a boot failure once sheaves become used in commits 59faa4d ("maple_tree: use percpu sheaves for maple_node_cache") and 3accabd ("mm, vma: use percpu sheaves for vm_area_struct cache"):

  BUG: kernel NULL pointer dereference, address: 0000000000000040
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  PGD 0 P4D 0
  Oops: Oops: 0000 [#1] SMP NOPTI
  CPU: 21 UID: 0 PID: 818 Comm: kworker/u398:0 Not tainted 6.17.0-rc3.slab+ #5 PREEMPT(voluntary)
  Hardware name: Dell Inc. PowerEdge R7425/02MJ3T, BIOS 1.26.0 07/30/2025
  RIP: 0010:__pcs_replace_empty_main+0x44/0x1d0
  Code: ec 08 48 8b 46 10 48 8b 76 08 48 85 c0 74 0b 8b 48 18 85 c9 0f 85 e5 00 00 00 65 48 63 05 e4 ee 50 02 49 8b 84 c6 e0 00 00 00 <4c> 8b 68 40 4c 89 ef e8 b0 81 ff ff 48 89 c5 48 85 c0 74 1d 48 89
  RSP: 0018:ffffd2d10950bdb0 EFLAGS: 00010246
  RAX: 0000000000000000 RBX: ffff8a775dab74b0 RCX: 00000000ffffffff
  RDX: 0000000000000cc0 RSI: ffff8a6800804000 RDI: ffff8a680004e300
  RBP: ffffd2d10950be40 R08: 0000000000000060 R09: ffffffffb9367388
  R10: 00000000000149e8 R11: ffff8a6f87a38000 R12: 0000000000000cc0
  R13: 0000000000000cc0 R14: ffff8a680004e300 R15: 00000000000000c0
  FS: 0000000000000000(0000) GS:ffff8a77a3541000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000040 CR3: 0000000e1aa24000 CR4: 00000000003506f0
  Call Trace:
   <TASK>
   ? srso_return_thunk+0x5/0x5f
   ? vm_area_alloc+0x1e/0x60
   kmem_cache_alloc_noprof+0x4ec/0x5b0
   vm_area_alloc+0x1e/0x60
   create_init_stack_vma+0x26/0x210
   alloc_bprm+0x139/0x200
   kernel_execve+0x4a/0x140
   call_usermodehelper_exec_async+0xd0/0x190
   ? __pfx_call_usermodehelper_exec_async+0x10/0x10
   ret_from_fork+0xf0/0x110
   ? __pfx_call_usermodehelper_exec_async+0x10/0x10
   ret_from_fork_asm+0x1a/0x30
   </TASK>
  Modules linked in:
  CR2: 0000000000000040
  ---[ end trace 0000000000000000 ]---
  RIP: 0010:__pcs_replace_empty_main+0x44/0x1d0
  Code: ec 08 48 8b 46 10 48 8b 76 08 48 85 c0 74 0b 8b 48 18 85 c9 0f 85 e5 00 00 00 65 48 63 05 e4 ee 50 02 49 8b 84 c6 e0 00 00 00 <4c> 8b 68 40 4c 89 ef e8 b0 81 ff ff 48 89 c5 48 85 c0 74 1d 48 89
  RSP: 0018:ffffd2d10950bdb0 EFLAGS: 00010246
  RAX: 0000000000000000 RBX: ffff8a775dab74b0 RCX: 00000000ffffffff
  RDX: 0000000000000cc0 RSI: ffff8a6800804000 RDI: ffff8a680004e300
  RBP: ffffd2d10950be40 R08: 0000000000000060 R09: ffffffffb9367388
  R10: 00000000000149e8 R11: ffff8a6f87a38000 R12: 0000000000000cc0
  R13: 0000000000000cc0 R14: ffff8a680004e300 R15: 00000000000000c0
  FS: 0000000000000000(0000) GS:ffff8a77a3541000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000040 CR3: 0000000e1aa24000 CR4: 00000000003506f0
  Kernel panic - not syncing: Fatal exception
  Kernel Offset: 0x36a00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
  ---[ end Kernel panic - not syncing: Fatal exception ]---

And noted "this is an AMD EPYC 7401 with 8 NUMA nodes configured such that memory is only on 2 of them."

  # numactl --hardware
  available: 8 nodes (0-7)
  node 0 cpus: 0 8 16 24 32 40 48 56 64 72 80 88
  node 0 size: 0 MB
  node 0 free: 0 MB
  node 1 cpus: 2 10 18 26 34 42 50 58 66 74 82 90
  node 1 size: 31584 MB
  node 1 free: 30397 MB
  node 2 cpus: 4 12 20 28 36 44 52 60 68 76 84 92
  node 2 size: 0 MB
  node 2 free: 0 MB
  node 3 cpus: 6 14 22 30 38 46 54 62 70 78 86 94
  node 3 size: 0 MB
  node 3 free: 0 MB
  node 4 cpus: 1 9 17 25 33 41 49 57 65 73 81 89
  node 4 size: 0 MB
  node 4 free: 0 MB
  node 5 cpus: 3 11 19 27 35 43 51 59 67 75 83 91
  node 5 size: 32214 MB
  node 5 free: 31625 MB
  node 6 cpus: 5 13 21 29 37 45 53 61 69 77 85 93
  node 6 size: 0 MB
  node 6 free: 0 MB
  node 7 cpus: 7 15 23 31 39 47 55 63 71 79 87 95
  node 7 size: 0 MB
  node 7 free: 0 MB

Linus decoded the stacktrace to get_barn() and get_node() and determined that kmem_cache->node[numa_mem_id()] is NULL.

The problem is due to a wrong assumption that memoryless nodes only exist on systems with CONFIG_HAVE_MEMORYLESS_NODES, where numa_mem_id() points to the nearest node that has memory. SLUB has been allocating its kmem_cache_node structures only on nodes with memory and so it does with struct node_barn.

For kmem_cache_node, get_partial_node() checks if get_node() result is not NULL, which I assumed was for protection from a bogus node id passed to kmalloc_node() but apparently it's also for systems where numa_mem_id() (used when no specific node is given) might return a memoryless node.

Fix the sheaves code the same way by checking the result of get_node() and bailing out if it's NULL. Note that cpus on such memoryless nodes will have degraded sheaves performance, which can be improved later, preferably by making numa_mem_id() work properly on such systems.

Fixes: 2d517aa ("slab: add opt-in caching layer of percpu sheaves")
Reported-and-tested-by: Phil Auld <pauld@redhat.com>
Closes: https://lore.kernel.org/all/20251010151116.GA436967@pauld.westford.csb/
Analyzed-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/all/CAHk-%3Dwg1xK%2BBr%3DFJ5QipVhzCvq7uQVPt5Prze6HDhQQ%3DQD_BcQ@mail.gmail.com/
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
1 parent f76b168 commit fd6db58

File tree

1 file changed

+51
-14
lines changed

1 file changed

+51
-14
lines changed

mm/slub.c

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -504,10 +504,18 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
504504
return s->node[node];
505505
}
506506

507-
/* Get the barn of the current cpu's memory node */
507+
/*
508+
* Get the barn of the current cpu's closest memory node. It may not exist on
509+
* systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES
510+
*/
508511
static inline struct node_barn *get_barn(struct kmem_cache *s)
509512
{
510-
return get_node(s, numa_mem_id())->barn;
513+
struct kmem_cache_node *n = get_node(s, numa_mem_id());
514+
515+
if (!n)
516+
return NULL;
517+
518+
return n->barn;
511519
}
512520

513521
/*
@@ -4982,6 +4990,10 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
49824990
}
49834991

49844992
barn = get_barn(s);
4993+
if (!barn) {
4994+
local_unlock(&s->cpu_sheaves->lock);
4995+
return NULL;
4996+
}
49854997

49864998
full = barn_replace_empty_sheaf(barn, pcs->main);
49874999

@@ -5153,13 +5165,20 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
51535165
if (unlikely(pcs->main->size == 0)) {
51545166

51555167
struct slab_sheaf *full;
5168+
struct node_barn *barn;
51565169

51575170
if (pcs->spare && pcs->spare->size > 0) {
51585171
swap(pcs->main, pcs->spare);
51595172
goto do_alloc;
51605173
}
51615174

5162-
full = barn_replace_empty_sheaf(get_barn(s), pcs->main);
5175+
barn = get_barn(s);
5176+
if (!barn) {
5177+
local_unlock(&s->cpu_sheaves->lock);
5178+
return allocated;
5179+
}
5180+
5181+
full = barn_replace_empty_sheaf(barn, pcs->main);
51635182

51645183
if (full) {
51655184
stat(s, BARN_GET);
@@ -5314,6 +5333,7 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
53145333
{
53155334
struct slub_percpu_sheaves *pcs;
53165335
struct slab_sheaf *sheaf = NULL;
5336+
struct node_barn *barn;
53175337

53185338
if (unlikely(size > s->sheaf_capacity)) {
53195339

@@ -5355,8 +5375,11 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
53555375
pcs->spare = NULL;
53565376
stat(s, SHEAF_PREFILL_FAST);
53575377
} else {
5378+
barn = get_barn(s);
5379+
53585380
stat(s, SHEAF_PREFILL_SLOW);
5359-
sheaf = barn_get_full_or_empty_sheaf(get_barn(s));
5381+
if (barn)
5382+
sheaf = barn_get_full_or_empty_sheaf(barn);
53605383
if (sheaf && sheaf->size)
53615384
stat(s, BARN_GET);
53625385
else
@@ -5426,7 +5449,7 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
54265449
* If the barn has too many full sheaves or we fail to refill the sheaf,
54275450
* simply flush and free it.
54285451
*/
5429-
if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
5452+
if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
54305453
refill_sheaf(s, sheaf, gfp)) {
54315454
sheaf_flush_unused(s, sheaf);
54325455
free_empty_sheaf(s, sheaf);
@@ -5943,10 +5966,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
59435966
* put the full sheaf there.
59445967
*/
59455968
static void __pcs_install_empty_sheaf(struct kmem_cache *s,
5946-
struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty)
5969+
struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
5970+
struct node_barn *barn)
59475971
{
5948-
struct node_barn *barn;
5949-
59505972
lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
59515973

59525974
/* This is what we expect to find if nobody interrupted us. */
@@ -5956,8 +5978,6 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s,
59565978
return;
59575979
}
59585980

5959-
barn = get_barn(s);
5960-
59615981
/*
59625982
* Unlikely because if the main sheaf had space, we would have just
59635983
* freed to it. Get rid of our empty sheaf.
@@ -6002,6 +6022,11 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
60026022
lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
60036023

60046024
barn = get_barn(s);
6025+
if (!barn) {
6026+
local_unlock(&s->cpu_sheaves->lock);
6027+
return NULL;
6028+
}
6029+
60056030
put_fail = false;
60066031

60076032
if (!pcs->spare) {
@@ -6084,7 +6109,7 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
60846109
}
60856110

60866111
pcs = this_cpu_ptr(s->cpu_sheaves);
6087-
__pcs_install_empty_sheaf(s, pcs, empty);
6112+
__pcs_install_empty_sheaf(s, pcs, empty, barn);
60886113

60896114
return pcs;
60906115
}
@@ -6121,8 +6146,9 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
61216146

61226147
static void rcu_free_sheaf(struct rcu_head *head)
61236148
{
6149+
struct kmem_cache_node *n;
61246150
struct slab_sheaf *sheaf;
6125-
struct node_barn *barn;
6151+
struct node_barn *barn = NULL;
61266152
struct kmem_cache *s;
61276153

61286154
sheaf = container_of(head, struct slab_sheaf, rcu_head);
@@ -6139,7 +6165,11 @@ static void rcu_free_sheaf(struct rcu_head *head)
61396165
*/
61406166
__rcu_free_sheaf_prepare(s, sheaf);
61416167

6142-
barn = get_node(s, sheaf->node)->barn;
6168+
n = get_node(s, sheaf->node);
6169+
if (!n)
6170+
goto flush;
6171+
6172+
barn = n->barn;
61436173

61446174
/* due to slab_free_hook() */
61456175
if (unlikely(sheaf->size == 0))
@@ -6157,11 +6187,12 @@ static void rcu_free_sheaf(struct rcu_head *head)
61576187
return;
61586188
}
61596189

6190+
flush:
61606191
stat(s, BARN_PUT_FAIL);
61616192
sheaf_flush_unused(s, sheaf);
61626193

61636194
empty:
6164-
if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
6195+
if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
61656196
barn_put_empty_sheaf(barn, sheaf);
61666197
return;
61676198
}
@@ -6191,6 +6222,10 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
61916222
}
61926223

61936224
barn = get_barn(s);
6225+
if (!barn) {
6226+
local_unlock(&s->cpu_sheaves->lock);
6227+
goto fail;
6228+
}
61946229

61956230
empty = barn_get_empty_sheaf(barn);
61966231

@@ -6304,6 +6339,8 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
63046339
goto do_free;
63056340

63066341
barn = get_barn(s);
6342+
if (!barn)
6343+
goto no_empty;
63076344

63086345
if (!pcs->spare) {
63096346
empty = barn_get_empty_sheaf(barn);

0 commit comments

Comments
 (0)