Skip to content

Commit 7366c55

Browse files
committed
bpf: Support atomic update for htab of maps
JIRA: https://issues.redhat.com/browse/RHEL-110274 Conflicts: changed context due to missing upstream commit 4fa8d68 ("bpf: Convert hashtab.c to rqspinlock") commit 2c30417 Author: Hou Tao <houtao1@huawei.com> Date: Tue Apr 1 14:22:47 2025 +0800 bpf: Support atomic update for htab of maps As reported by Cody Haas [1], when there are concurrent map lookup and map update operations on an existing element for htab of maps, the map lookup procedure may return -ENOENT unexpectedly. The root cause is twofold: 1) the update of an existing element involves two separate list operations In htab_map_update_elem(), it first inserts the new element at the head of the list, then it deletes the old element. Therefore, it is possible that a lookup operation has already iterated to the middle of the list when a concurrent update operation begins, and the lookup operation will fail to find the target element. 2) the immediate reuse of the htab element. It is more subtle. Even though the lookup operation finds the old element, it is possible that the target element has been removed by a concurrent update operation, and the element has been reused immediately by another update operation which runs on the same CPU as the previous update operation, and the element is inserted into the same bucket list. After these steps, when the lookup operation tries to compare the key in the old element with the expected key, the match will fail because the key in the old element has been overwritten by the other update operation. The two-step update process is relatively straightforward to address. The more challenging aspect is the immediate reuse. As Alexei pointed out: So since 2022 both prealloc and no_prealloc reuse elements. We can consider a new flag for the hash map like F_REUSE_AFTER_RCU_GP that will use the _rcu() flavor of freeing into bpf_ma, but it has to have a strong reason. 
Given that htab of maps doesn't support special fields in values and directly stores the inner map pointer in the htab element, just do an in-place update for htab of maps instead of attempting to address the immediate-reuse issue. [1]: https://lore.kernel.org/xdp-newbies/CAH7f-ULFTwKdoH_t2SFc5rWCVYLEg-14d1fBYWH2eekudsnTRg@mail.gmail.com/ Acked-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Hou Tao <houtao1@huawei.com> Link: https://lore.kernel.org/r/20250401062250.543403-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Viktor Malik <vmalik@redhat.com>
1 parent 7249fd8 commit 7366c55

File tree

1 file changed

+21
-23
lines changed

1 file changed

+21
-23
lines changed

kernel/bpf/hashtab.c

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,10 +1107,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
11071107
u64 map_flags)
11081108
{
11091109
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1110-
struct htab_elem *l_new = NULL, *l_old;
1110+
struct htab_elem *l_new, *l_old;
11111111
struct hlist_nulls_head *head;
11121112
unsigned long flags;
1113-
void *old_map_ptr;
11141113
struct bucket *b;
11151114
u32 key_size, hash;
11161115
int ret;
@@ -1191,24 +1190,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
11911190
hlist_nulls_del_rcu(&l_old->hash_node);
11921191

11931192
/* l_old has already been stashed in htab->extra_elems, free
1194-
* its special fields before it is available for reuse. Also
1195-
* save the old map pointer in htab of maps before unlock
1196-
* and release it after unlock.
1193+
* its special fields before it is available for reuse.
11971194
*/
1198-
old_map_ptr = NULL;
1199-
if (htab_is_prealloc(htab)) {
1200-
if (map->ops->map_fd_put_ptr)
1201-
old_map_ptr = fd_htab_map_get_ptr(map, l_old);
1195+
if (htab_is_prealloc(htab))
12021196
check_and_free_fields(htab, l_old);
1203-
}
12041197
}
12051198
htab_unlock_bucket(htab, b, hash, flags);
1206-
if (l_old) {
1207-
if (old_map_ptr)
1208-
map->ops->map_fd_put_ptr(map, old_map_ptr, true);
1209-
if (!htab_is_prealloc(htab))
1210-
free_htab_elem(htab, l_old);
1211-
}
1199+
if (l_old && !htab_is_prealloc(htab))
1200+
free_htab_elem(htab, l_old);
12121201
return 0;
12131202
err:
12141203
htab_unlock_bucket(htab, b, hash, flags);
@@ -1296,6 +1285,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
12961285
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
12971286
struct htab_elem *l_new, *l_old;
12981287
struct hlist_nulls_head *head;
1288+
void *old_map_ptr = NULL;
12991289
unsigned long flags;
13001290
struct bucket *b;
13011291
u32 key_size, hash;
@@ -1327,8 +1317,15 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
13271317

13281318
if (l_old) {
13291319
/* Update value in-place */
1330-
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1331-
value, onallcpus);
1320+
if (percpu) {
1321+
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1322+
value, onallcpus);
1323+
} else {
1324+
void **inner_map_pptr = htab_elem_value(l_old, key_size);
1325+
1326+
old_map_ptr = *inner_map_pptr;
1327+
WRITE_ONCE(*inner_map_pptr, *(void **)value);
1328+
}
13321329
} else {
13331330
l_new = alloc_htab_elem(htab, key, value, key_size,
13341331
hash, percpu, onallcpus, NULL);
@@ -1340,6 +1337,8 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
13401337
}
13411338
err:
13421339
htab_unlock_bucket(htab, b, hash, flags);
1340+
if (old_map_ptr)
1341+
map->ops->map_fd_put_ptr(map, old_map_ptr, true);
13431342
return ret;
13441343
}
13451344

@@ -2567,24 +2566,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
25672566
return ret;
25682567
}
25692568

2570-
/* only called from syscall */
2569+
/* Only called from syscall */
25712570
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
25722571
void *key, void *value, u64 map_flags)
25732572
{
25742573
void *ptr;
25752574
int ret;
2576-
u32 ufd = *(u32 *)value;
25772575

2578-
ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
2576+
ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
25792577
if (IS_ERR(ptr))
25802578
return PTR_ERR(ptr);
25812579

25822580
/* The htab bucket lock is always held during update operations in fd
25832581
* htab map, and the following rcu_read_lock() is only used to avoid
2584-
* the WARN_ON_ONCE in htab_map_update_elem().
2582+
* the WARN_ON_ONCE in htab_map_update_elem_in_place().
25852583
*/
25862584
rcu_read_lock();
2587-
ret = htab_map_update_elem(map, key, &ptr, map_flags);
2585+
ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
25882586
rcu_read_unlock();
25892587
if (ret)
25902588
map->ops->map_fd_put_ptr(map, ptr, false);

0 commit comments

Comments
 (0)