Merge branch 'bpf-reduce-the-use-of-migrate_-disable-enable'

Hou Tao says:

====================
The use of the migrate_{disable|enable} pair in BPF is mainly due to
the introduction of the bpf memory allocator and its use of per-CPU
data structures in its internal implementation. The caller needs to
disable migration before invoking the alloc or free APIs of the bpf
memory allocator, and to enable migration again after the invocation.
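
As a rough sketch of that contract (illustrative only, not a hunk from
this series; bpf_mem_cache_alloc() and the migrate pair are the real
kernel APIs, while the wrapper name is made up):

/* The allocator's per-CPU caches are only safe to use while the task
 * cannot migrate to another CPU, so a caller that is not already
 * pinned brackets the call with the migrate pair.
 */
static void *alloc_from_bpf_ma(struct bpf_mem_alloc *ma)
{
        void *obj;

        migrate_disable();              /* pin the task to this CPU */
        obj = bpf_mem_cache_alloc(ma);  /* touches a per-CPU freelist */
        migrate_enable();

        return obj;
}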

The main users of the bpf memory allocator are the various kinds of
bpf maps, in which the map values, or the special fields in the map
values, are allocated by using the bpf memory allocator.

At present, the running context for a bpf program has already disabled
migration, either explicitly or implicitly, so when these maps are
manipulated in a bpf program it is OK not to invoke the
migrate_disable() and migrate_enable() pair. However, that is not
always the case when these maps are manipulated through the bpf
syscall; therefore, many migrate_{disable|enable} pairs were added for
maps that can be manipulated either by a BPF program or through the
BPF syscall, as sketched below.
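
For example (a sketch with an illustrative wrapper name, not code from
this series): a bpf program is entered with migration already disabled
by its run-time wrapper, e.g. the trampoline does rcu_read_lock() plus
migrate_disable() before the program body runs, while the syscall path
has had to disable migration explicitly:

/* Hypothetical syscall-side helper: no pre-existing guarantee here,
 * hence the explicit migrate pair around the map operation.
 */
static long update_from_syscall(struct bpf_map *map, void *key,
                                void *value, u64 flags)
{
        long err;

        migrate_disable();
        err = map->ops->map_update_elem(map, key, value, flags);
        migrate_enable();
        return err;
}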

The initial idea of reducing the use of migrate_{disable|enable} comes
from Alexei [1]. I turned it into a patch set that achieves the goal
through the following three methods:

1. remove the unnecessary migrate_{disable|enable} pair
When the BPF syscall path also disables migration, it is OK to remove
the pair. Patches #1~#3 fall into this category, while patches #4~#5
do so partially.

2. move the migrate_{disable|enable} pair from the inner callee to the
   outer caller
Instead of invoking migrate_disable() in the inner callee, invoke
migrate_disable() in the outer caller, which simplifies reasoning about
when migrate_disable() is needed. Patches #4~#5 and #6~#19 belong to
this category; a combined sketch of methods 2 and 3 follows this list.

3. add a cant_migrate() check in the inner callee
Add a cant_migrate() check in the inner callee to ensure that the
guarantee of disabled migration is not broken. Patches #1~#5, #13, and
#16~#19 also belong to this category.
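
As a combined sketch of methods 2 and 3, this is essentially the
post-patch bpf_task_storage.c code from the diff below; cant_migrate()
is a debug-build assertion that warns if migration is still possible
and compiles away otherwise:

static void bpf_task_storage_lock(void)
{
        cant_migrate();         /* caller must have disabled migration */
        this_cpu_inc(bpf_task_storage_busy);
}

void bpf_task_storage_free(struct task_struct *task)
{
        struct bpf_local_storage *local_storage;

        migrate_disable();      /* pair moved out of the inner helpers */
        rcu_read_lock();

        local_storage = rcu_dereference(task->bpf_storage);
        if (!local_storage)
                goto out;

        bpf_task_storage_lock();
        bpf_local_storage_destroy(local_storage);
        bpf_task_storage_unlock();
out:
        rcu_read_unlock();
        migrate_enable();
}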

Please check the individual patches for more details. Comments are
always welcome.

Change Log:
v2:
  * squash the ->map_free related patches (#10~#12, #15) into one patch
  * remove unnecessary cant_migrate() checks.

v1: https://lore.kernel.org/bpf/[email protected]
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
Alexei Starovoitov committed Jan 9, 2025
2 parents bfaac2a + d86088e commit e8ec1c9
Showing 12 changed files with 55 additions and 88 deletions.
6 changes: 2 additions & 4 deletions kernel/bpf/arraymap.c
@@ -735,13 +735,13 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
         u64 ret = 0;
         void *val;
 
+        cant_migrate();
+
         if (flags != 0)
                 return -EINVAL;
 
         is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
         array = container_of(map, struct bpf_array, map);
-        if (is_percpu)
-                migrate_disable();
         for (i = 0; i < map->max_entries; i++) {
                 if (is_percpu)
                         val = this_cpu_ptr(array->pptrs[i]);
@@ -756,8 +756,6 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
                 break;
         }
 
-        if (is_percpu)
-                migrate_enable();
         return num_elems;
 }
15 changes: 7 additions & 8 deletions kernel/bpf/bpf_cgrp_storage.c
@@ -15,22 +15,20 @@ static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
 
 static void bpf_cgrp_storage_lock(void)
 {
-        migrate_disable();
+        cant_migrate();
         this_cpu_inc(bpf_cgrp_storage_busy);
 }
 
 static void bpf_cgrp_storage_unlock(void)
 {
         this_cpu_dec(bpf_cgrp_storage_busy);
-        migrate_enable();
 }
 
 static bool bpf_cgrp_storage_trylock(void)
 {
-        migrate_disable();
+        cant_migrate();
         if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
                 this_cpu_dec(bpf_cgrp_storage_busy);
-                migrate_enable();
                 return false;
         }
         return true;
@@ -47,17 +45,18 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
 {
         struct bpf_local_storage *local_storage;
 
+        migrate_disable();
         rcu_read_lock();
         local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
-        if (!local_storage) {
-                rcu_read_unlock();
-                return;
-        }
+        if (!local_storage)
+                goto out;
 
         bpf_cgrp_storage_lock();
         bpf_local_storage_destroy(local_storage);
         bpf_cgrp_storage_unlock();
+out:
         rcu_read_unlock();
+        migrate_enable();
 }
 
 static struct bpf_local_storage_data *
9 changes: 5 additions & 4 deletions kernel/bpf/bpf_inode_storage.c
@@ -62,16 +62,17 @@ void bpf_inode_storage_free(struct inode *inode)
         if (!bsb)
                 return;
 
+        migrate_disable();
         rcu_read_lock();
 
         local_storage = rcu_dereference(bsb->storage);
-        if (!local_storage) {
-                rcu_read_unlock();
-                return;
-        }
+        if (!local_storage)
+                goto out;
 
         bpf_local_storage_destroy(local_storage);
+out:
         rcu_read_unlock();
+        migrate_enable();
 }
 
 static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
30 changes: 9 additions & 21 deletions kernel/bpf/bpf_local_storage.c
@@ -81,9 +81,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
                 return NULL;
 
         if (smap->bpf_ma) {
-                migrate_disable();
                 selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
-                migrate_enable();
                 if (selem)
                         /* Keep the original bpf_map_kzalloc behavior
                          * before started using the bpf_mem_cache_alloc.
@@ -174,17 +172,14 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
                 return;
         }
 
-        if (smap) {
-                migrate_disable();
+        if (smap)
                 bpf_mem_cache_free(&smap->storage_ma, local_storage);
-                migrate_enable();
-        } else {
+        else
                 /* smap could be NULL if the selem that triggered
                  * this 'local_storage' creation had been long gone.
                  * In this case, directly do call_rcu().
                  */
                 call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
-        }
 }
 
 /* rcu tasks trace callback for bpf_ma == false */
@@ -217,7 +212,10 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
         selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
         /* The bpf_local_storage_map_free will wait for rcu_barrier */
         smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+        migrate_disable();
         bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+        migrate_enable();
         bpf_mem_cache_raw_free(selem);
 }
 
@@ -256,9 +254,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
          * bpf_mem_cache_free will be able to reuse selem
          * immediately.
          */
-        migrate_disable();
         bpf_mem_cache_free(&smap->selem_ma, selem);
-        migrate_enable();
         return;
 }
 
@@ -497,15 +493,11 @@ int bpf_local_storage_alloc(void *owner,
         if (err)
                 return err;
 
-        if (smap->bpf_ma) {
-                migrate_disable();
+        if (smap->bpf_ma)
                 storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
-                migrate_enable();
-        } else {
+        else
                 storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
                                           gfp_flags | __GFP_NOWARN);
-        }
-
         if (!storage) {
                 err = -ENOMEM;
                 goto uncharge;
@@ -902,15 +894,11 @@ void bpf_local_storage_map_free(struct bpf_map *map,
         while ((selem = hlist_entry_safe(
                         rcu_dereference_raw(hlist_first_rcu(&b->list)),
                         struct bpf_local_storage_elem, map_node))) {
-                if (busy_counter) {
-                        migrate_disable();
+                if (busy_counter)
                         this_cpu_inc(*busy_counter);
-                }
                 bpf_selem_unlink(selem, true);
-                if (busy_counter) {
+                if (busy_counter)
                         this_cpu_dec(*busy_counter);
-                        migrate_enable();
-                }
                 cond_resched_rcu();
         }
         rcu_read_unlock();
15 changes: 7 additions & 8 deletions kernel/bpf/bpf_task_storage.c
@@ -24,22 +24,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
 
 static void bpf_task_storage_lock(void)
 {
-        migrate_disable();
+        cant_migrate();
         this_cpu_inc(bpf_task_storage_busy);
 }
 
 static void bpf_task_storage_unlock(void)
 {
         this_cpu_dec(bpf_task_storage_busy);
-        migrate_enable();
 }
 
 static bool bpf_task_storage_trylock(void)
 {
-        migrate_disable();
+        cant_migrate();
         if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
                 this_cpu_dec(bpf_task_storage_busy);
-                migrate_enable();
                 return false;
         }
         return true;
@@ -72,18 +70,19 @@ void bpf_task_storage_free(struct task_struct *task)
 {
         struct bpf_local_storage *local_storage;
 
+        migrate_disable();
         rcu_read_lock();
 
         local_storage = rcu_dereference(task->bpf_storage);
-        if (!local_storage) {
-                rcu_read_unlock();
-                return;
-        }
+        if (!local_storage)
+                goto out;
 
         bpf_task_storage_lock();
         bpf_local_storage_destroy(local_storage);
         bpf_task_storage_unlock();
+out:
         rcu_read_unlock();
+        migrate_enable();
 }
 
 static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
2 changes: 0 additions & 2 deletions kernel/bpf/cpumask.c
@@ -91,9 +91,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
         if (!refcount_dec_and_test(&cpumask->usage))
                 return;
 
-        migrate_disable();
         bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
-        migrate_enable();
 }
 
 __bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
19 changes: 7 additions & 12 deletions kernel/bpf/hashtab.c
@@ -897,11 +897,9 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
         check_and_free_fields(htab, l);
 
-        migrate_disable();
         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
                 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
         bpf_mem_cache_free(&htab->ma, l);
-        migrate_enable();
 }
 
 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -1502,10 +1500,9 @@ static void delete_all_elements(struct bpf_htab *htab)
 {
         int i;
 
-        /* It's called from a worker thread, so disable migration here,
-         * since bpf_mem_cache_free() relies on that.
+        /* It's called from a worker thread and migration has been disabled,
+         * therefore, it is OK to invoke bpf_mem_cache_free() directly.
          */
-        migrate_disable();
         for (i = 0; i < htab->n_buckets; i++) {
                 struct hlist_nulls_head *head = select_bucket(htab, i);
                 struct hlist_nulls_node *n;
@@ -1517,7 +1514,6 @@ static void delete_all_elements(struct bpf_htab *htab)
                 }
                 cond_resched();
         }
-        migrate_enable();
 }
 
 static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
@@ -2208,17 +2204,18 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
         bool is_percpu;
         u64 ret = 0;
 
+        cant_migrate();
+
         if (flags != 0)
                 return -EINVAL;
 
         is_percpu = htab_is_percpu(htab);
 
         roundup_key_size = round_up(map->key_size, 8);
-        /* disable migration so percpu value prepared here will be the
-         * same as the one seen by the bpf program with bpf_map_lookup_elem().
+        /* migration has been disabled, so percpu value prepared here will be
+         * the same as the one seen by the bpf program with
+         * bpf_map_lookup_elem().
          */
-        if (is_percpu)
-                migrate_disable();
         for (i = 0; i < htab->n_buckets; i++) {
                 b = &htab->buckets[i];
                 rcu_read_lock();
@@ -2244,8 +2241,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
                 rcu_read_unlock();
         }
 out:
-        if (is_percpu)
-                migrate_enable();
         return num_elems;
 }
4 changes: 0 additions & 4 deletions kernel/bpf/helpers.c
@@ -2066,9 +2066,7 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
                 /* The contained type can also have resources, including a
                  * bpf_list_head which needs to be freed.
                  */
-                migrate_disable();
                 __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
-                migrate_enable();
         }
 }
 
@@ -2105,9 +2103,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
                 obj -= field->graph_root.node_offset;
 
 
-                migrate_disable();
                 __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
-                migrate_enable();
         }
 }
20 changes: 4 additions & 16 deletions kernel/bpf/lpm_trie.c
@@ -289,16 +289,11 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 }
 
 static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
-                                                 const void *value,
-                                                 bool disable_migration)
+                                                 const void *value)
 {
         struct lpm_trie_node *node;
 
-        if (disable_migration)
-                migrate_disable();
         node = bpf_mem_cache_alloc(&trie->ma);
-        if (disable_migration)
-                migrate_enable();
 
         if (!node)
                 return NULL;
@@ -342,10 +337,8 @@ static long trie_update_elem(struct bpf_map *map,
         if (key->prefixlen > trie->max_prefixlen)
                 return -EINVAL;
 
-        /* Allocate and fill a new node. Need to disable migration before
-         * invoking bpf_mem_cache_alloc().
-         */
-        new_node = lpm_trie_node_alloc(trie, value, true);
+        /* Allocate and fill a new node */
+        new_node = lpm_trie_node_alloc(trie, value);
         if (!new_node)
                 return -ENOMEM;
 
@@ -425,8 +418,7 @@ static long trie_update_elem(struct bpf_map *map,
                 goto out;
         }
 
-        /* migration is disabled within the locked scope */
-        im_node = lpm_trie_node_alloc(trie, NULL, false);
+        im_node = lpm_trie_node_alloc(trie, NULL);
         if (!im_node) {
                 trie->n_entries--;
                 ret = -ENOMEM;
@@ -452,11 +444,9 @@ static long trie_update_elem(struct bpf_map *map,
 out:
         raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 
-        migrate_disable();
         if (ret)
                 bpf_mem_cache_free(&trie->ma, new_node);
         bpf_mem_cache_free_rcu(&trie->ma, free_node);
-        migrate_enable();
 
         return ret;
 }
@@ -555,10 +545,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 out:
         raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 
-        migrate_disable();
         bpf_mem_cache_free_rcu(&trie->ma, free_parent);
         bpf_mem_cache_free_rcu(&trie->ma, free_node);
-        migrate_enable();
 
         return ret;
 }
2 changes: 0 additions & 2 deletions kernel/bpf/range_tree.c
@@ -259,9 +259,7 @@ void range_tree_destroy(struct range_tree *rt)
 
         while ((rn = range_it_iter_first(rt, 0, -1U))) {
                 range_it_remove(rn, rt);
-                migrate_disable();
                 bpf_mem_free(&bpf_global_ma, rn);
-                migrate_enable();
         }
 }
(Diffs for the remaining 2 changed files are not shown.)