diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f3f50e29d6392..bcd6504daad2a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -184,8 +184,8 @@ struct bpf_map_ops { }; enum { - /* Support at most 11 fields in a BTF type */ - BTF_FIELDS_MAX = 11, + /* Support at most 13 fields in a BTF type */ + BTF_FIELDS_MAX = 13, }; enum btf_field_type { @@ -204,6 +204,7 @@ enum btf_field_type { BPF_REFCOUNT = (1 << 9), BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), + BPF_DYNPTR = (1 << 12), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -257,6 +258,14 @@ struct bpf_list_node_kern { void *owner; } __attribute__((aligned(8))); +/* Internal map flags */ +enum { + /* map key supports bpf_dynptr */ + BPF_INT_F_DYNPTR_IN_KEY = (1U << 31), +}; + +#define BPF_INT_F_MASK (1U << 31) + struct bpf_map { const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; @@ -268,9 +277,20 @@ struct bpf_map { u32 value_size; u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ + /* The topmost bit of map_flags is used as an internal map flag + * (aka BPF_INT_F_DYNPTR_IN_KEY) and it can't be set through bpf + * syscall. + */ u32 map_flags; u32 id; + /* BTF record for special fields in map value. bpf_dynptr is disallowed + * at present. + */ struct btf_record *record; + /* BTF record for special fields in map key. Only bpf_dynptr is allowed + * at present. + */ + struct btf_record *key_record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; @@ -309,6 +329,11 @@ struct bpf_map { s64 __percpu *elem_count; }; +static inline bool bpf_map_has_dynptr_key(const struct bpf_map *map) +{ + return map->map_flags & BPF_INT_F_DYNPTR_IN_KEY; +} + static inline const char *btf_field_type_name(enum btf_field_type type) { switch (type) { @@ -335,6 +360,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; + case BPF_DYNPTR: + return "bpf_dynptr"; default: WARN_ON_ONCE(1); return "unknown"; @@ -365,6 +392,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); + case BPF_DYNPTR: + return sizeof(struct bpf_dynptr); default: WARN_ON_ONCE(1); return 0; @@ -395,6 +424,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); + case BPF_DYNPTR: + return __alignof__(struct bpf_dynptr); default: WARN_ON_ONCE(1); return 0; @@ -425,6 +456,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: + case BPF_DYNPTR: break; default: WARN_ON_ONCE(1); @@ -603,7 +635,8 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return (map->btf_value_type_id || map->btf_vmlinux_value_type_id) && - map->ops->map_seq_show_elem; + map->ops->map_seq_show_elem && + !bpf_map_has_dynptr_key(map); } int map_check_no_btf(const struct bpf_map *map, @@ -1319,6 +1352,7 @@ enum bpf_dynptr_type { }; int bpf_dynptr_check_size(u32 size); +void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); diff --git a/include/linux/btf.h 
b/include/linux/btf.h index 2a08a2b55592e..ee1488494c73d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -223,8 +223,10 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +struct btf_record *btf_new_bpf_dynptr_record(void); int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); +bool btf_type_is_dynptr(const struct btf *btf, const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2acf9b3363717..7d96685513c55 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7335,6 +7335,12 @@ struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_dynptr_user { + __bpf_md_ptr(void *, data); + __u32 size; + __u32 reserved; +} __attribute__((aligned(8))); + struct bpf_list_head { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9de6acddd479b..37910b7f9bac5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3500,6 +3500,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); + field_mask_test_name(BPF_DYNPTR, "bpf_dynptr"); /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { @@ -3538,6 +3539,7 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, case BPF_UPTR: case BPF_LIST_HEAD: case BPF_RB_ROOT: + case BPF_DYNPTR: break; default: return -EINVAL; @@ -3660,6 +3662,7 @@ static int btf_find_field_one(const struct btf *btf, case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: + case BPF_DYNPTR: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? &info[0] : &tmp); if (ret < 0) @@ -3925,6 +3928,16 @@ static int btf_field_cmp(const void *_a, const void *_b, const void *priv) return 0; } +static void btf_init_record(struct btf_record *record) +{ + record->cnt = 0; + record->field_mask = 0; + record->spin_lock_off = -EINVAL; + record->timer_off = -EINVAL; + record->wq_off = -EINVAL; + record->refcount_off = -EINVAL; +} + struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { @@ -3943,14 +3956,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. 
*/ - rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); - rec->spin_lock_off = -EINVAL; - rec->timer_off = -EINVAL; - rec->wq_off = -EINVAL; - rec->refcount_off = -EINVAL; + btf_init_record(rec); for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -4010,6 +4020,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type break; case BPF_LIST_NODE: case BPF_RB_NODE: + case BPF_DYNPTR: break; default: ret = -EFAULT; @@ -4041,6 +4052,25 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type return ERR_PTR(ret); } +struct btf_record *btf_new_bpf_dynptr_record(void) +{ + struct btf_record *record; + + record = kzalloc(struct_size(record, fields, 1), GFP_KERNEL | __GFP_NOWARN); + if (!record) + return ERR_PTR(-ENOMEM); + + btf_init_record(record); + + record->cnt = 1; + record->field_mask = BPF_DYNPTR; + record->fields[0].offset = 0; + record->fields[0].size = sizeof(struct bpf_dynptr); + record->fields[0].type = BPF_DYNPTR; + + return record; +} + int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; @@ -7439,6 +7469,12 @@ static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t) return false; } +bool btf_type_is_dynptr(const struct btf *btf, const struct btf_type *t) +{ + return __btf_type_is_struct(t) && t->size == sizeof(struct bpf_dynptr) && + !strcmp(__btf_name_by_offset(btf, t->name_off), "bpf_dynptr"); +} + struct bpf_cand_cache { const char *name; u32 name_len; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4a9eeb7aef855..948fbc87e47fb 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -19,7 +19,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED | BPF_INT_F_DYNPTR_IN_KEY) #define BATCH_OPS(_name) \ .map_lookup_batch = \ @@ -88,6 +88,7 @@ struct bpf_htab { struct bpf_map map; struct bpf_mem_alloc ma; struct bpf_mem_alloc pcpu_ma; + struct bpf_mem_alloc dynptr_ma; struct bucket *buckets; void *elems; union { @@ -425,6 +426,7 @@ static int htab_map_alloc_check(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); + bool dynptr_in_key = (attr->map_flags & BPF_INT_F_DYNPTR_IN_KEY); int numa_node = bpf_map_attr_numa_node(attr); BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != @@ -438,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr) !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; + if (dynptr_in_key) { + if (percpu || lru || prealloc || !attr->map_extra) + return -EINVAL; + if ((attr->map_extra >> 32) || bpf_dynptr_check_size(attr->map_extra) || + bpf_mem_alloc_check_size(percpu, attr->map_extra)) + return -E2BIG; + } + if (!lru && percpu_lru) return -EINVAL; @@ -482,6 +492,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + bool dynptr_in_key = (attr->map_flags & BPF_INT_F_DYNPTR_IN_KEY); struct bpf_htab *htab; int err, i; @@ -598,6 +609,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if 
(err) goto free_map_locked; } + if (dynptr_in_key) { + err = bpf_mem_alloc_init(&htab->dynptr_ma, 0, false); + if (err) + goto free_map_locked; + } } return &htab->map; @@ -610,6 +626,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->dynptr_ma); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); free_elem_count: @@ -620,13 +637,55 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) return ERR_PTR(err); } -static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) +static inline u32 __htab_map_hash(const void *key, u32 key_len, u32 hashrnd) { if (likely(key_len % 4 == 0)) return jhash2(key, key_len / 4, hashrnd); return jhash(key, key_len, hashrnd); } +static u32 htab_map_dynptr_hash(const void *key, u32 key_len, u32 hashrnd, + const struct btf_record *rec) +{ + unsigned int i, cnt = rec->cnt; + unsigned int hash = hashrnd; + unsigned int offset = 0; + + for (i = 0; i < cnt; i++) { + const struct btf_field *field = &rec->fields[i]; + const struct bpf_dynptr_kern *kptr; + unsigned int len; + + if (field->type != BPF_DYNPTR) + continue; + + /* non-dynptr part ? */ + if (offset < field->offset) + hash = jhash(key + offset, field->offset - offset, hash); + + /* Skip nullified dynptr */ + kptr = key + field->offset; + if (kptr->data) { + len = __bpf_dynptr_size(kptr); + hash = jhash(__bpf_dynptr_data(kptr, len), len, hash); + } + offset = field->offset + field->size; + } + + if (offset < key_len) + hash = jhash(key + offset, key_len - offset, hash); + + return hash; +} + +static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd, + const struct btf_record *rec) +{ + if (likely(!rec)) + return __htab_map_hash(key, key_len, hashrnd); + return htab_map_dynptr_hash(key, key_len, hashrnd, rec); +} + static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; @@ -637,15 +696,68 @@ static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 return &__select_bucket(htab, hash)->head; } +static bool is_same_dynptr_key(const void *key, const void *tgt, unsigned int key_size, + const struct btf_record *rec) +{ + unsigned int i, cnt = rec->cnt; + unsigned int offset = 0; + + for (i = 0; i < cnt; i++) { + const struct btf_field *field = &rec->fields[i]; + const struct bpf_dynptr_kern *kptr, *tgt_kptr; + const void *data, *tgt_data; + unsigned int len; + + if (field->type != BPF_DYNPTR) + continue; + + if (offset < field->offset && + memcmp(key + offset, tgt + offset, field->offset - offset)) + return false; + + /* + * For a nullified dynptr in the target key, __bpf_dynptr_size() + * will return 0, and there will be no match for the target key. 
+ */ + kptr = key + field->offset; + tgt_kptr = tgt + field->offset; + len = __bpf_dynptr_size(kptr); + if (len != __bpf_dynptr_size(tgt_kptr)) + return false; + + data = __bpf_dynptr_data(kptr, len); + tgt_data = __bpf_dynptr_data(tgt_kptr, len); + if (memcmp(data, tgt_data, len)) + return false; + + offset = field->offset + field->size; + } + + if (offset < key_size && + memcmp(key + offset, tgt + offset, key_size - offset)) + return false; + + return true; +} + +static inline bool htab_is_same_key(const void *key, const void *tgt, unsigned int key_size, + const struct btf_record *rec) +{ + if (likely(!rec)) + return !memcmp(key, tgt, key_size); + return is_same_dynptr_key(key, tgt, key_size, rec); +} + /* this lookup function can only be called with bucket lock taken */ -static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, - void *key, u32 key_size) +static __always_inline struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, + void *key, u32 key_size, + const struct btf_record *record) { struct hlist_nulls_node *n; struct htab_elem *l; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) - if (l->hash == hash && !memcmp(&l->key, key, key_size)) + if (l->hash == hash && htab_is_same_key(l->key, key, key_size, record)) return l; return NULL; @@ -655,16 +767,17 @@ static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash * the unlikely event when elements moved from one bucket into another * while link list is being walked */ -static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, - u32 hash, void *key, - u32 key_size, u32 n_buckets) +static __always_inline struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, + u32 hash, void *key, + u32 key_size, u32 n_buckets, + const struct btf_record *record) { struct hlist_nulls_node *n; struct htab_elem *l; again: hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) - if (l->hash == hash && !memcmp(&l->key, key, key_size)) + if (l->hash == hash && htab_is_same_key(l->key, key, key_size, record)) return l; if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) @@ -681,6 +794,7 @@ static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + const struct btf_record *record; struct hlist_nulls_head *head; struct htab_elem *l; u32 hash, key_size; @@ -689,12 +803,13 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) !rcu_read_lock_bh_held()); key_size = map->key_size; + record = map->key_record; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = htab_map_hash(key, key_size, htab->hashrnd, record); head = select_bucket(htab, hash); - l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets, record); return l; } @@ -784,6 +899,26 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } +static void htab_free_dynptr_key(struct bpf_htab *htab, void *key) +{ + const struct btf_record *record = htab->map.key_record; + unsigned int i, cnt = record->cnt; + + for (i = 0; i < cnt; i++) { + const struct btf_field *field = &record->fields[i]; + struct bpf_dynptr_kern *kptr; + + if (field->type != BPF_DYNPTR) + continue; + + /* It may be accessed concurrently, so don't overwrite + * the kptr. 
+ */ + kptr = key + field->offset; + bpf_mem_free_rcu(&htab->dynptr_ma, kptr->data); + } +} + static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { @@ -835,10 +970,95 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) return l == tgt_l; } +static int htab_copy_dynptr_key(struct bpf_htab *htab, void *dst_key, const void *key, u32 key_size, + bool copy_in) +{ + const struct btf_record *rec = htab->map.key_record; + struct bpf_dynptr_kern *dst_kptr; + const struct btf_field *field; + unsigned int i, cnt, offset; + int err; + + offset = 0; + cnt = rec->cnt; + for (i = 0; i < cnt; i++) { + const struct bpf_dynptr_kern *kptr; + unsigned int len; + const void *data; + void *dst_data; + + field = &rec->fields[i]; + if (field->type != BPF_DYNPTR) + continue; + + if (offset < field->offset) + memcpy(dst_key + offset, key + offset, field->offset - offset); + + /* Doesn't support nullified dynptr in map key */ + kptr = key + field->offset; + if (copy_in && !kptr->data) { + err = -EINVAL; + goto out; + } + len = __bpf_dynptr_size(kptr); + data = __bpf_dynptr_data(kptr, len); + + dst_kptr = dst_key + field->offset; + if (copy_in) { + dst_data = bpf_mem_alloc(&htab->dynptr_ma, len); + if (!dst_data) { + err = -ENOMEM; + goto out; + } + bpf_dynptr_init(dst_kptr, dst_data, BPF_DYNPTR_TYPE_LOCAL, 0, len); + } else { + dst_data = __bpf_dynptr_data_rw(dst_kptr, len); + if (!dst_data) { + err = -ENOSPC; + goto out; + } + + if (__bpf_dynptr_size(dst_kptr) > len) + bpf_dynptr_set_size(dst_kptr, len); + } + memcpy(dst_data, data, len); + + offset = field->offset + field->size; + } + + if (offset < key_size) + memcpy(dst_key + offset, key + offset, key_size - offset); + + return 0; + +out: + for (; i > 0 && copy_in; i--) { + field = &rec->fields[i - 1]; + if (field->type != BPF_DYNPTR) + continue; + + dst_kptr = dst_key + field->offset; + bpf_mem_free(&htab->dynptr_ma, dst_kptr->data); + } + return err; +} + +static inline int htab_copy_next_key(struct bpf_htab *htab, void *next_key, const void *key, + u32 key_size) +{ + if (!bpf_map_has_dynptr_key(&htab->map)) { + memcpy(next_key, key, key_size); + return 0; + } + + return htab_copy_dynptr_key(htab, next_key, key, key_size, false); +} + /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + const struct btf_record *key_record = map->key_record; struct hlist_nulls_head *head; struct htab_elem *l, *next_l; u32 hash, key_size; @@ -851,12 +1071,12 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) if (!key) goto find_first_elem; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = htab_map_hash(key, key_size, htab->hashrnd, key_record); head = select_bucket(htab, hash); /* lookup the key */ - l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets, key_record); if (!l) goto find_first_elem; @@ -867,8 +1087,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) if (next_l) { /* if next elem in this hash list is non-zero, just return it */ - memcpy(next_key, next_l->key, key_size); - return 0; + return htab_copy_next_key(htab, next_key, next_l->key, key_size); } /* no more elements in this hash list, go to the next bucket */ @@ -885,8 +1104,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) struct 
htab_elem, hash_node); if (next_l) { /* if it's not empty, just return it */ - memcpy(next_key, next_l->key, key_size); - return 0; + return htab_copy_next_key(htab, next_key, next_l->key, key_size); } } @@ -896,11 +1114,27 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { + bool dynptr_in_key = bpf_map_has_dynptr_key(&htab->map); + + if (dynptr_in_key) + htab_free_dynptr_key(htab, l->key); + check_and_free_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); - bpf_mem_cache_free(&htab->ma, l); + + /* + * For dynptr key, the update of dynptr in the key is not atomic: + * both the pointer and the size are updated. If the element is reused + * immediately, the access of the dynptr key during lookup procedure may + * incur invalid memory access due to mismatch between the size and the + * data pointer, so reuse the element after one RCU GP. + */ + if (dynptr_in_key) + bpf_mem_cache_free_rcu(&htab->ma, l); + else + bpf_mem_cache_free(&htab->ma, l); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -1047,7 +1281,19 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - memcpy(l_new->key, key, key_size); + if (bpf_map_has_dynptr_key(&htab->map)) { + int copy_err; + + copy_err = htab_copy_dynptr_key(htab, l_new->key, key, key_size, true); + if (copy_err) { + bpf_mem_cache_free(&htab->ma, l_new); + l_new = ERR_PTR(copy_err); + goto dec_count; + } + } else { + memcpy(l_new->key, key, key_size); + } + if (percpu) { if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); @@ -1103,6 +1349,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + const struct btf_record *key_record = map->key_record; struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; @@ -1120,7 +1367,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, key_size = map->key_size; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = htab_map_hash(key, key_size, htab->hashrnd, key_record); b = __select_bucket(htab, hash); head = &b->head; @@ -1130,7 +1377,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, - htab->n_buckets); + htab->n_buckets, key_record); ret = check_flags(htab, l_old, map_flags); if (ret) return ret; @@ -1151,7 +1398,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) return ret; - l_old = lookup_elem_raw(head, hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size, key_record); ret = check_flags(htab, l_old, map_flags); if (ret) @@ -1238,7 +1485,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value key_size = map->key_size; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = __htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1258,7 +1505,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value if (ret) goto err_lock_bucket; - l_old = lookup_elem_raw(head, hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size, NULL); ret = check_flags(htab, l_old, 
map_flags); if (ret) @@ -1307,7 +1554,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, key_size = map->key_size; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = __htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1316,7 +1563,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, if (ret) return ret; - l_old = lookup_elem_raw(head, hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size, NULL); ret = check_flags(htab, l_old, map_flags); if (ret) @@ -1362,7 +1609,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, key_size = map->key_size; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = htab_map_hash(key, key_size, htab->hashrnd, NULL); b = __select_bucket(htab, hash); head = &b->head; @@ -1382,7 +1629,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, if (ret) goto err_lock_bucket; - l_old = lookup_elem_raw(head, hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size, NULL); ret = check_flags(htab, l_old, map_flags); if (ret) @@ -1428,6 +1675,7 @@ static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, static long htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + const struct btf_record *key_record = map->key_record; struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; @@ -1440,7 +1688,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = htab_map_hash(key, key_size, htab->hashrnd, key_record); b = __select_bucket(htab, hash); head = &b->head; @@ -1448,7 +1696,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) if (ret) return ret; - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_elem_raw(head, hash, key, key_size, key_record); if (l) hlist_nulls_del_rcu(&l->hash_node); else @@ -1476,7 +1724,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = __htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1484,7 +1732,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) if (ret) return ret; - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_elem_raw(head, hash, key, key_size, NULL); if (l) hlist_nulls_del_rcu(&l->hash_node); @@ -1579,6 +1827,7 @@ static void htab_map_free(struct bpf_map *map) bpf_map_free_elem_count(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->dynptr_ma); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); if (htab->use_percpu_counter) @@ -1615,6 +1864,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, bool is_percpu, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + const struct btf_record *key_record; struct hlist_nulls_head *head; unsigned long bflags; struct htab_elem *l; @@ -1623,8 +1873,9 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, int ret; key_size = map->key_size; + key_record = map->key_record; - hash = htab_map_hash(key, key_size, htab->hashrnd); + hash = htab_map_hash(key, key_size, htab->hashrnd, key_record); b = __select_bucket(htab, hash); 
head = &b->head; @@ -1632,7 +1883,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, if (ret) return ret; - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_elem_raw(head, hash, key, key_size, key_record); if (!l) { ret = -ENOENT; goto out_unlock; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f27ce162427ab..de4f90b968374 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1688,7 +1688,7 @@ u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) return ptr->size & DYNPTR_SIZE_MASK; } -static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 645bd30bc9a9d..564ebcc857564 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; CLASS(fd, f)(inner_map_ufd); + int ret; inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) @@ -45,10 +46,15 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * invalid/empty/valid, but ERR_PTR in case of errors. During * equality NULL or IS_ERR is equivalent. */ - struct bpf_map *ret = ERR_CAST(inner_map_meta->record); - kfree(inner_map_meta); - return ret; + ret = PTR_ERR(inner_map_meta->record); + goto free_meta; } + inner_map_meta->key_record = btf_record_dup(inner_map->key_record); + if (IS_ERR(inner_map_meta->key_record)) { + ret = PTR_ERR(inner_map_meta->key_record); + goto free_record; + } + /* Note: We must use the same BTF, as we also used btf_record_dup above * which relies on BTF being same for both maps, as some members like * record->fields.list_head have pointers like value_rec pointing into @@ -71,6 +77,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; } return inner_map_meta; + +free_record: + btf_record_free(inner_map_meta->record); +free_meta: + kfree(inner_map_meta); + return ERR_PTR(ret); } void bpf_map_meta_free(struct bpf_map *map_meta) @@ -88,7 +100,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->map_flags == meta1->map_flags && - btf_record_equal(meta0->record, meta1->record); + btf_record_equal(meta0->record, meta1->record) && + btf_record_equal(meta0->key_record, meta1->key_record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c420edbfb7c87..ca57e126b45f7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -651,6 +651,7 @@ void btf_record_free(struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_DYNPTR: /* Nothing to release */ break; default: @@ -664,7 +665,9 @@ void btf_record_free(struct btf_record *rec) void bpf_map_free_record(struct bpf_map *map) { btf_record_free(map->record); + btf_record_free(map->key_record); map->record = NULL; + map->key_record = NULL; } struct btf_record *btf_record_dup(const struct btf_record *rec) @@ -703,6 +706,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_DYNPTR: /* Nothing to acquire */ break; default: @@ -821,6 +825,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_RB_NODE: case 
BPF_REFCOUNT: break; + case BPF_DYNPTR: + break; default: WARN_ON_ONCE(1); continue; @@ -830,6 +836,7 @@ static void bpf_map_free(struct bpf_map *map) { + struct btf_record *key_rec = map->key_record; struct btf_record *rec = map->record; struct btf *btf = map->btf; @@ -850,6 +857,7 @@ static void bpf_map_free(struct bpf_map *map) * eventually calls bpf_map_free_meta, since inner_map_meta is only a * template bpf_map struct used during verification. */ + btf_record_free(key_rec); btf_record_free(rec); /* Delay freeing of btf for maps, as map_free callback may need * struct_meta info which will be freed with btf_put(). @@ -1180,6 +1188,8 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } +#define MAX_DYNPTR_CNT_IN_MAP_KEY 1 + static int map_check_btf(struct bpf_map *map, struct bpf_token *token, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -1202,6 +1212,37 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, if (!value_type || value_size != map->value_size) return -EINVAL; + /* Key BTF type can't be a data section */ + if (btf_type_is_dynptr(btf, key_type)) + map->key_record = btf_new_bpf_dynptr_record(); + else if (__btf_type_is_struct(key_type)) + map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size); + else + map->key_record = NULL; + if (!IS_ERR_OR_NULL(map->key_record)) { + if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) { + ret = -E2BIG; + goto free_map_tab; + } + if (map->map_type != BPF_MAP_TYPE_HASH) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + if (!bpf_token_capable(token, CAP_BPF)) { + ret = -EPERM; + goto free_map_tab; + } + /* Disallow key with dynptr for special map */ + if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { + ret = -EACCES; + goto free_map_tab; + } + } else if (IS_ERR(map->key_record)) { + /* Return an error early even if the bpf program doesn't use it */ + ret = PTR_ERR(map->key_record); + goto free_map_tab; + } + map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, @@ -1304,6 +1345,49 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } +static struct btf *get_map_btf(int btf_fd) +{ + struct btf *btf = btf_get_by_fd(btf_fd); + + if (IS_ERR(btf)) + return btf; + + if (btf_is_kernel(btf)) { + btf_put(btf); + return ERR_PTR(-EACCES); + } + + return btf; +} + +static int map_has_dynptr_in_key_type(struct btf *btf, u32 btf_key_id, u32 key_size) +{ + const struct btf_type *type; + struct btf_record *record; + u32 btf_key_size; + + if (!btf_key_id) + return 0; + + type = btf_type_id_size(btf, &btf_key_id, &btf_key_size); + if (!type || btf_key_size != key_size) + return -EINVAL; + + /* For dynptr key, key BTF type must be a struct */ + if (!__btf_type_is_struct(type)) + return 0; + + if (btf_type_is_dynptr(btf, type)) + return 1; + + record = btf_parse_fields(btf, type, BPF_DYNPTR, key_size); + if (IS_ERR(record)) + return PTR_ERR(record); + + btf_record_free(record); + return !!record; +} + #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -1312,6 +1396,7 @@ static int map_create(union bpf_attr *attr) struct bpf_token *token = NULL; int numa_node = bpf_map_attr_numa_node(attr); u32 map_type = attr->map_type; + struct btf *btf = NULL; struct bpf_map *map; bool token_flag; int f_flags; @@ 
-1321,6 +1406,8 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (attr->map_flags & BPF_INT_F_MASK) + return -EINVAL; /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it * to avoid per-map type checks tripping on unknown flag */ @@ -1335,43 +1422,72 @@ static int map_create(union bpf_attr *attr) return -EINVAL; } + if (attr->btf_key_type_id || attr->btf_value_type_id) { + btf = get_map_btf(attr->btf_fd); + if (IS_ERR(btf)) + return PTR_ERR(btf); + + err = map_has_dynptr_in_key_type(btf, attr->btf_key_type_id, attr->key_size); + if (err < 0) + goto put_btf; + if (err > 0) { + attr->map_flags |= BPF_INT_F_DYNPTR_IN_KEY; + err = 0; + } + } + if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && - attr->map_extra != 0) - return -EINVAL; + !(attr->map_flags & BPF_INT_F_DYNPTR_IN_KEY) && + attr->map_extra != 0) { + err = -EINVAL; + goto put_btf; + } f_flags = bpf_get_file_flag(attr->map_flags); - if (f_flags < 0) - return f_flags; + if (f_flags < 0) { + err = f_flags; + goto put_btf; + } if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || - !node_online(numa_node))) - return -EINVAL; + !node_online(numa_node))) { + err = -EINVAL; + goto put_btf; + } /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map_type = attr->map_type; - if (map_type >= ARRAY_SIZE(bpf_map_types)) - return -EINVAL; + if (map_type >= ARRAY_SIZE(bpf_map_types)) { + err = -EINVAL; + goto put_btf; + } map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); ops = bpf_map_types[map_type]; - if (!ops) - return -EINVAL; + if (!ops) { + err = -EINVAL; + goto put_btf; + } if (ops->map_alloc_check) { err = ops->map_alloc_check(attr); if (err) - return err; + goto put_btf; } if (attr->map_ifindex) ops = &bpf_map_offload_ops; - if (!ops->map_mem_usage) - return -EINVAL; + if (!ops->map_mem_usage) { + err = -EINVAL; + goto put_btf; + } if (token_flag) { token = bpf_token_get_from_fd(attr->map_token_fd); - if (IS_ERR(token)) - return PTR_ERR(token); + if (IS_ERR(token)) { + err = PTR_ERR(token); + goto put_btf; + } /* if current token doesn't grant map creation permissions, * then we can't use this token, so ignore it and rely on @@ -1461,30 +1577,27 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); - if (attr->btf_key_type_id || attr->btf_value_type_id || - /* Even the map's value is a kernel's struct, - * the bpf_prog.o must have BTF to begin with - * to figure out the corresponding kernel's - * counter part. Thus, attr->btf_fd has - * to be valid also. - */ - attr->btf_vmlinux_value_type_id) { - struct btf *btf; - - btf = btf_get_by_fd(attr->btf_fd); - if (IS_ERR(btf)) { - err = PTR_ERR(btf); - goto free_map; - } - if (btf_is_kernel(btf)) { - btf_put(btf); - err = -EACCES; - goto free_map; + /* Even the struct_ops map's value is a kernel's struct, + * the bpf_prog.o must have BTF to begin with + * to figure out the corresponding kernel's + * counter part. Thus, attr->btf_fd has + * to be valid also. 
+ */ + if (btf || attr->btf_vmlinux_value_type_id) { + if (!btf) { + btf = get_map_btf(attr->btf_fd); + if (IS_ERR(btf)) { + err = PTR_ERR(btf); + btf = NULL; + goto free_map; + } } + map->btf = btf; + btf = NULL; if (attr->btf_value_type_id) { - err = map_check_btf(map, token, btf, attr->btf_key_type_id, + err = map_check_btf(map, token, map->btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) goto free_map; @@ -1516,7 +1629,6 @@ static int map_create(union bpf_attr *attr) * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ bpf_map_put_with_uref(map); - return err; } return err; @@ -1527,6 +1639,8 @@ static int map_create(union bpf_attr *attr) bpf_map_free(map); put_token: bpf_token_put(token); +put_btf: + btf_put(btf); return err; } @@ -1597,10 +1711,87 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return -ENOTSUPP; } -static void *__bpf_copy_key(void __user *ukey, u64 key_size) +static void *bpf_copy_from_dynptr_ukey(const struct bpf_map *map, bpfptr_t ukey, bool copy_data) { - if (key_size) - return vmemdup_user(ukey, key_size); + const struct btf_record *record; + const struct btf_field *field; + struct bpf_dynptr_user *uptr; + struct bpf_dynptr_kern *kptr; + void *key, *new_key, *kdata; + unsigned int key_size, size; + unsigned int i; + int err; + + key_size = map->key_size; + key = kvmemdup_bpfptr(ukey, key_size); + if (IS_ERR(key)) + return ERR_CAST(key); + + size = key_size; + record = map->key_record; + for (i = 0; i < record->cnt; i++) { + field = &record->fields[i]; + if (field->type != BPF_DYNPTR) + continue; + + uptr = key + field->offset; + if (!uptr->size || uptr->size > map->map_extra || uptr->reserved) { + err = -EINVAL; + goto free_key; + } + + size += uptr->size; + /* Overflow ? */ + if (size < uptr->size) { + err = -E2BIG; + goto free_key; + } + } + + /* Place all dynptrs' data in the end of the key */ + new_key = kvrealloc(key, size, GFP_USER | __GFP_NOWARN); + if (!new_key) { + err = -ENOMEM; + goto free_key; + } + + key = new_key; + kdata = key + key_size; + for (i = 0; i < record->cnt; i++) { + field = &record->fields[i]; + if (field->type != BPF_DYNPTR) + continue; + + uptr = key + field->offset; + size = uptr->size; + if (copy_data) { + bpfptr_t udata = make_bpfptr((u64)(uintptr_t)uptr->data, + bpfptr_is_kernel(ukey)); + + if (copy_from_bpfptr(kdata, udata, size)) { + err = -EFAULT; + goto free_key; + } + } + kptr = (struct bpf_dynptr_kern *)uptr; + bpf_dynptr_init(kptr, kdata, BPF_DYNPTR_TYPE_LOCAL, 0, size); + kdata += size; + } + + return key; + +free_key: + kvfree(key); + return ERR_PTR(err); +} + +static void *__bpf_copy_key(const struct bpf_map *map, void __user *ukey) +{ + if (bpf_map_has_dynptr_key(map)) + return bpf_copy_from_dynptr_ukey(map, USER_BPFPTR(ukey), true); + + if (map->key_size) + return vmemdup_user(ukey, map->key_size); if (ukey) return ERR_PTR(-EINVAL); @@ -1608,10 +1799,13 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) return NULL; } -static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) +static void *___bpf_copy_key(const struct bpf_map *map, bpfptr_t ukey) { - if (key_size) - return kvmemdup_bpfptr(ukey, key_size); + if (bpf_map_has_dynptr_key(map)) + return bpf_copy_from_dynptr_ukey(map, ukey, true); + + if (map->key_size) + return kvmemdup_bpfptr(ukey, map->key_size); if (!bpfptr_is_null(ukey)) return ERR_PTR(-EINVAL); @@ -1619,6 +1813,51 @@ static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) return NULL; } +static int bpf_copy_to_dynptr_ukey(const struct bpf_map 
*map, + void __user *ukey, void *key) +{ + struct bpf_dynptr_user __user *uptr; + struct bpf_dynptr_kern *kptr; + struct btf_record *record; + unsigned int i, offset; + + offset = 0; + record = map->key_record; + for (i = 0; i < record->cnt; i++) { + struct btf_field *field; + unsigned int size; + void *udata; + + field = &record->fields[i]; + if (field->type != BPF_DYNPTR) + continue; + + /* Any non-dynptr part before the dynptr? */ + if (offset < field->offset && + copy_to_user(ukey + offset, key + offset, field->offset - offset)) + return -EFAULT; + + /* dynptr part */ + uptr = ukey + field->offset; + if (copy_from_user(&udata, &uptr->data, sizeof(udata))) + return -EFAULT; + + kptr = key + field->offset; + size = __bpf_dynptr_size(kptr); + if (copy_to_user((void __user *)udata, __bpf_dynptr_data(kptr, size), size) || + put_user(size, &uptr->size) || put_user(0, &uptr->reserved)) + return -EFAULT; + + offset = field->offset + field->size; + } + + if (offset < map->key_size && + copy_to_user(ukey + offset, key + offset, map->key_size - offset)) + return -EFAULT; + + return 0; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags @@ -1648,7 +1887,7 @@ static int map_lookup_elem(union bpf_attr *attr) !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; - key = __bpf_copy_key(ukey, map->key_size); + key = __bpf_copy_key(map, ukey); if (IS_ERR(key)) return PTR_ERR(key); @@ -1715,7 +1954,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - key = ___bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(map, ukey); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1762,7 +2001,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - key = ___bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(map, ukey); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1814,17 +2053,26 @@ static int map_get_next_key(union bpf_attr *attr) return -EPERM; if (ukey) { - key = __bpf_copy_key(ukey, map->key_size); + key = __bpf_copy_key(map, ukey); if (IS_ERR(key)) return PTR_ERR(key); } else { key = NULL; } - err = -ENOMEM; - next_key = kvmalloc(map->key_size, GFP_USER); - if (!next_key) + if (bpf_map_has_dynptr_key(map)) + next_key = bpf_copy_from_dynptr_ukey(map, USER_BPFPTR(unext_key), false); + else + next_key = kvmalloc(map->key_size, GFP_USER); + if (IS_ERR_OR_NULL(next_key)) { + if (!next_key) { + err = -ENOMEM; + } else { + err = PTR_ERR(next_key); + next_key = NULL; + } goto free_key; + } if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_get_next_key(map, key, next_key); @@ -1838,12 +2086,13 @@ static int map_get_next_key(union bpf_attr *attr) if (err) goto free_next_key; - err = -EFAULT; - if (copy_to_user(unext_key, next_key, map->key_size) != 0) + if (bpf_map_has_dynptr_key(map)) + err = bpf_copy_to_dynptr_ukey(map, unext_key, next_key); + else + err = copy_to_user(unext_key, next_key, map->key_size) ? 
-EFAULT : 0; + if (err) goto free_next_key; - err = 0; - free_next_key: kvfree(next_key); free_key: @@ -2111,7 +2360,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = __bpf_copy_key(map, ukey); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -4982,7 +5231,7 @@ static int bpf_map_get_info_by_fd(struct file *file, info.key_size = map->key_size; info.value_size = map->value_size; info.max_entries = map->max_entries; - info.map_flags = map->map_flags; + info.map_flags = map->map_flags & ~BPF_INT_F_MASK; info.map_extra = map->map_extra; memcpy(info.name, map->name, sizeof(map->name)); @@ -5293,6 +5542,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr, err = -EPERM; goto err_put; } + if (bpf_map_has_dynptr_key(map)) { + err = -EOPNOTSUPP; + goto err_put; + } if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9971c03adfd5d..b2fd849415971 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -791,7 +791,7 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat * While we don't allow reading STACK_INVALID, it is still possible to * do <8 byte writes marking some but not all slots as STACK_MISC. Then, * helpers or insns can do partial read of that part without failing, - * but check_stack_range_initialized, check_stack_read_var_off, and + * but check_stack_range_access, check_stack_read_var_off, and * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of * the slot conservatively. Hence we need to prevent those liveness * marking walks. @@ -5301,11 +5301,11 @@ enum bpf_access_src { ACCESS_HELPER = 2, /* the access is performed by a helper */ }; -static int check_stack_range_initialized(struct bpf_verifier_env *env, - int regno, int off, int access_size, - bool zero_size_allowed, - enum bpf_access_type type, - struct bpf_call_arg_meta *meta); +static int check_stack_range_access(struct bpf_verifier_env *env, + int regno, int off, int access_size, + bool zero_size_allowed, + enum bpf_access_type type, + struct bpf_call_arg_meta *meta); static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) { @@ -5336,8 +5336,8 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, /* Note that we pass a NULL meta, so raw access will not be permitted. */ - err = check_stack_range_initialized(env, ptr_regno, off, size, - false, BPF_READ, NULL); + err = check_stack_range_access(env, ptr_regno, off, size, + false, BPF_READ, NULL); if (err) return err; @@ -7625,44 +7625,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return 0; } -/* When register 'regno' is used to read the stack (either directly or through - * a helper function) make sure that it's within stack boundary and, depending - * on the access type and privileges, that all elements of the stack are - * initialized. - * - * 'off' includes 'regno->off', but not its dynamic part (if any). - * - * All registers that have been spilled on the stack in the slots within the - * read offsets are marked as read. 
- */ -static int check_stack_range_initialized( - struct bpf_verifier_env *env, int regno, int off, - int access_size, bool zero_size_allowed, - enum bpf_access_type type, struct bpf_call_arg_meta *meta) +static int get_stack_access_range(struct bpf_verifier_env *env, int regno, int off, + int *min_off, int *max_off) { struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_func_state *state = func(env, reg); - int err, min_off, max_off, i, j, slot, spi; - /* Some accesses can write anything into the stack, others are - * read-only. - */ - bool clobber = false; - - if (access_size == 0 && !zero_size_allowed) { - verbose(env, "invalid zero-sized read\n"); - return -EACCES; - } - - if (type == BPF_WRITE) - clobber = true; - - err = check_stack_access_within_bounds(env, regno, off, access_size, type); - if (err) - return err; - if (tnum_is_const(reg->var_off)) { - min_off = max_off = reg->var_off.value + off; + *min_off = *max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in @@ -7677,50 +7646,158 @@ static int check_stack_range_initialized( regno, tn_buf); return -EACCES; } - /* Only initialized buffer on stack is allowed to be accessed - * with variable offset. With uninitialized buffer it's hard to - * guarantee that whole memory is marked as initialized on - * helper return since specific bounds are unknown what may - * cause uninitialized stack leaking. - */ - if (meta && meta->raw_mode) - meta = NULL; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + *min_off = reg->smin_value + off; + *max_off = reg->smax_value + off; } - if (meta && meta->raw_mode) { - /* Ensure we won't be overwriting dynptrs when simulating byte - * by byte access in check_helper_call using meta.access_size. - * This would be a problem if we have a helper in the future - * which takes: - * - * helper(uninit_mem, len, dynptr) - * - * Now, uninint_mem may overlap with dynptr pointer. Hence, it - * may end up writing to dynptr itself when touching memory from - * arg 1. This can be relaxed on a case by case basis for known - * safe cases, but reject due to the possibilitiy of aliasing by - * default. - */ - for (i = min_off; i < max_off + access_size; i++) { - int stack_off = -i - 1; + return 0; +} - spi = __get_spi(i); - /* raw_mode may write past allocated_stack */ - if (state->allocated_stack <= stack_off) - continue; - if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) { - verbose(env, "potential write to dynptr at off=%d disallowed\n", i); - return -EACCES; +static int allow_uninitialized_stack_range(struct bpf_verifier_env *env, int regno, + int min_off, int max_off, int access_size, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_func_state *state = func(env, reg); + int i, stack_off, spi; + + /* Disallow uninitialized buffer on stack */ + if (!meta || !meta->raw_mode) + return 0; + + /* Only initialized buffer on stack is allowed to be accessed + * with variable offset. With uninitialized buffer it's hard to + * guarantee that whole memory is marked as initialized on + * helper return since specific bounds are unknown what may + * cause uninitialized stack leaking. + */ + if (!tnum_is_const(reg->var_off)) + return 0; + + /* Ensure we won't be overwriting dynptrs when simulating byte + * by byte access in check_helper_call using meta.access_size. 
+ * This would be a problem if we have a helper in the future + * which takes: + * + * helper(uninit_mem, len, dynptr) + * + * Now, uninint_mem may overlap with dynptr pointer. Hence, it + * may end up writing to dynptr itself when touching memory from + * arg 1. This can be relaxed on a case by case basis for known + * safe cases, but reject due to the possibilitiy of aliasing by + * default. + */ + for (i = min_off; i < max_off + access_size; i++) { + stack_off = -i - 1; + spi = __get_spi(i); + /* raw_mode may write past allocated_stack */ + if (state->allocated_stack <= stack_off) + continue; + if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) { + verbose(env, "potential write to dynptr at off=%d disallowed\n", i); + return -EACCES; + } + } + meta->access_size = access_size; + meta->regno = regno; + + return 1; +} + +struct dynptr_key_state { + const struct btf_record *rec; + const struct btf_field *cur_dynptr; + bool valid_dynptr_id; + int cur_dynptr_id; +}; + +static int init_dynptr_key_state(struct bpf_verifier_env *env, const struct btf_record *rec, + struct dynptr_key_state *state) +{ + unsigned int i; + + /* Find the first dynptr in the dynptr-key */ + for (i = 0; i < rec->cnt; i++) { + if (rec->fields[i].type == BPF_DYNPTR) + break; + } + if (i >= rec->cnt) { + verbose(env, "verifier bug: dynptr not found\n"); + return -EFAULT; + } + + state->rec = rec; + state->cur_dynptr = &rec->fields[i]; + state->valid_dynptr_id = false; + + return 0; +} + +static int check_dynptr_key_access(struct bpf_verifier_env *env, struct dynptr_key_state *state, + struct bpf_reg_state *reg, u8 stype, int offset) +{ + const struct btf_field *dynptr = state->cur_dynptr; + + /* Non-dynptr part before a dynptr or non-dynptr part after + * the last dynptr. + */ + if (offset < dynptr->offset || offset >= dynptr->offset + dynptr->size) { + if (stype == STACK_DYNPTR) { + verbose(env, + "dynptr-key expects non-dynptr at offset %d cur_dynptr_offset %u\n", + offset, dynptr->offset); + return -EACCES; + } + } else { + if (stype != STACK_DYNPTR) { + verbose(env, + "dynptr-key expects dynptr at offset %d cur_dynptr_offset %u\n", + offset, dynptr->offset); + return -EACCES; + } + + /* A dynptr is composed of parts from two dynptrs */ + if (state->valid_dynptr_id && reg->id != state->cur_dynptr_id) { + verbose(env, "malformed dynptr-key at offset %d cur_dynptr_offset %u\n", + offset, dynptr->offset); + return -EACCES; + } + if (!state->valid_dynptr_id) { + state->valid_dynptr_id = true; + state->cur_dynptr_id = reg->id; + } + + if (offset == dynptr->offset + dynptr->size - 1) { + const struct btf_record *rec = state->rec; + unsigned int i; + + for (i = dynptr - rec->fields + 1; i < rec->cnt; i++) { + if (rec->fields[i].type == BPF_DYNPTR) { + state->cur_dynptr = &rec->fields[i]; + state->valid_dynptr_id = false; + break; + } } } - meta->access_size = access_size; - meta->regno = regno; - return 0; } + return 0; +} + +static int check_stack_range_initialized(struct bpf_verifier_env *env, int regno, + int min_off, int max_off, int access_size, + enum bpf_access_type type, + struct dynptr_key_state *dynkey) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_func_state *state = func(env, reg); + int i, j, slot, spi; + /* Some accesses can write anything into the stack, others are + * read-only. 
+ */ + bool clobber = type == BPF_WRITE; + for (i = min_off; i < max_off + access_size; i++) { u8 *stype; @@ -7734,6 +7811,8 @@ static int check_stack_range_initialized( stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; if (*stype == STACK_MISC) goto mark; + if (dynkey && *stype == STACK_DYNPTR) + goto mark; if ((*stype == STACK_ZERO) || (*stype == STACK_INVALID && env->allow_uninit_stack)) { if (clobber) { @@ -7766,21 +7845,122 @@ static int check_stack_range_initialized( } return -EACCES; mark: + if (dynkey) { + int err = check_dynptr_key_access(env, dynkey, + &state->stack[spi].spilled_ptr, + *stype, i - min_off); + + if (err) + return err; + } + /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' - */ - mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); - /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + * + * We do not set REG_LIVE_WRITTEN for stack slot, as we can not * be sure that whether stack slot is written to or not. Hence, * we must still conservatively propagate reads upwards even if * helper may write to the entire memory range. */ + mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); } + return 0; } +/* When register 'regno' is used to read the stack (either directly or through + * a helper function) make sure that it's within stack boundary and, depending + * on the access type and privileges, that all elements of the stack are + * initialized. + * + * 'off' includes 'regno->off', but not its dynamic part (if any). + * + * All registers that have been spilled on the stack in the slots within the + * read offsets are marked as read. + */ +static int check_stack_range_access(struct bpf_verifier_env *env, int regno, int off, + int access_size, bool zero_size_allowed, + enum bpf_access_type type, struct bpf_call_arg_meta *meta) +{ + int err, min_off, max_off; + + if (access_size == 0 && !zero_size_allowed) { + verbose(env, "invalid zero-sized read\n"); + return -EACCES; + } + + err = check_stack_access_within_bounds(env, regno, off, access_size, type); + if (err) + return err; + + err = get_stack_access_range(env, regno, off, &min_off, &max_off); + if (err) + return err; + + err = allow_uninitialized_stack_range(env, regno, min_off, max_off, access_size, meta); + if (err < 0) + return err; + if (err > 0) + return 0; + + return check_stack_range_initialized(env, regno, min_off, max_off, access_size, type, NULL); +} + +static int check_dynkey_stack_access_offset(struct bpf_verifier_env *env, int regno, int off) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d variable offset prohibited for dynptr-key\n", regno); + return -EACCES; + } + + off = reg->var_off.value + off; + if (off % BPF_REG_SIZE) { + verbose(env, "R%d misaligned offset %d for dynptr-key\n", regno, off); + return -EACCES; + } + + return 0; +} + +/* It is almost the same as check_stack_range_access(), except the following + * things: + * (1) no need to check whether access_size is zero (due to non-zero key_size) + * (2) disallow uninitialized stack range + * (3) need BPF_REG_SIZE-aligned access with fixed-size offset + * (4) need to check whether the layout of bpf_dynptr part and non-bpf_dynptr + * part in the stack range is the same as the layout of dynptr key + */ +static int check_dynkey_stack_range_access(struct bpf_verifier_env *env, int regno, int off, + int 
access_size, struct bpf_call_arg_meta *meta) +{ + enum bpf_access_type type = BPF_READ; + struct dynptr_key_state dynkey; + int err, min_off, max_off; + + err = check_stack_access_within_bounds(env, regno, off, access_size, type); + if (err) + return err; + + err = check_dynkey_stack_access_offset(env, regno, off); + if (err) + return err; + + err = get_stack_access_range(env, regno, off, &min_off, &max_off); + if (err) + return err; + + err = init_dynptr_key_state(env, meta->map_ptr->key_record, &dynkey); + if (err) + return err; + + return check_stack_range_initialized(env, regno, min_off, max_off, access_size, type, + &dynkey); +} + static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, enum bpf_access_type access_type, bool zero_size_allowed, @@ -7834,10 +8014,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: - return check_stack_range_initialized( - env, - regno, reg->off, access_size, - zero_size_allowed, access_type, meta); + return check_stack_range_access(env, regno, reg->off, access_size, + zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: return check_ptr_to_btf_access(env, regs, regno, reg->off, access_size, BPF_READ, -1); @@ -9350,13 +9528,26 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } + key_size = meta->map_ptr->key_size; - err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); + /* Only allow PTR_TO_STACK for dynptr-key */ + if (bpf_map_has_dynptr_key(meta->map_ptr)) { + if (base_type(reg->type) != PTR_TO_STACK) { + verbose(env, "map dynptr-key requires stack ptr but got %s\n", + reg_type_str(env, reg->type)); + return -EACCES; + } + err = check_dynkey_stack_range_access(env, regno, reg->off, key_size, meta); + } else { + err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); + if (!err) { + meta->const_map_key = get_constant_map_key(env, reg, key_size); + if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP) + err = meta->const_map_key; + } + } if (err) return err; - meta->const_map_key = get_constant_map_key(env, reg, key_size); - if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP) - return meta->const_map_key; break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2acf9b3363717..7d96685513c55 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7335,6 +7335,12 @@ struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_dynptr_user { + __bpf_md_ptr(void *, data); + __u32 size; + __u32 reserved; +} __attribute__((aligned(8))); + struct bpf_list_head { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6722080b2107a..392036406f01b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -812,6 +812,7 @@ $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.ske $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h +$(OUTPUT)/bench_dynptr_key.o: $(OUTPUT)/dynptr_key_bench.skel.h $(OUTPUT)/bench.o: bench.h 
testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -832,6 +833,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_local_storage_create.o \ $(OUTPUT)/bench_htab_mem.o \ $(OUTPUT)/bench_bpf_crypto.o \ + $(OUTPUT)/bench_dynptr_key.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 1bd403a5ef7b3..b13271600bc02 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -283,6 +283,7 @@ extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; extern struct argp bench_trigger_batch_argp; extern struct argp bench_crypto_argp; +extern struct argp bench_dynptr_key_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -297,6 +298,7 @@ static const struct argp_child bench_parsers[] = { { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, + { &bench_dynptr_key_argp, 0, "dynptr key benchmark", 0 }, {}, }; @@ -549,6 +551,10 @@ extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; extern const struct bench bench_crypto_encrypt; extern const struct bench bench_crypto_decrypt; +extern const struct bench bench_norm_htab_lookup; +extern const struct bench bench_dynkey_htab_lookup; +extern const struct bench bench_norm_htab_update; +extern const struct bench bench_dynkey_htab_update; static const struct bench *benchs[] = { &bench_count_global, @@ -609,6 +615,10 @@ static const struct bench *benchs[] = { &bench_htab_mem, &bench_crypto_encrypt, &bench_crypto_decrypt, + &bench_norm_htab_lookup, + &bench_dynkey_htab_lookup, + &bench_norm_htab_update, + &bench_dynkey_htab_update, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_dynptr_key.c b/tools/testing/selftests/bpf/benchs/bench_dynptr_key.c new file mode 100644 index 0000000000000..713f00cdaac69 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_dynptr_key.c @@ -0,0 +1,612 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ +#include +#include +#include +#include +#include "bench.h" +#include "bpf_util.h" +#include "cgroup_helpers.h" + +#include "dynptr_key_bench.skel.h" + +enum { + NORM_HTAB = 0, + DYNPTR_KEY_HTAB, +}; + +static struct dynptr_key_ctx { + struct dynptr_key_bench *skel; + int cgrp_dfd; + u64 map_slab_mem; +} ctx; + +static struct { + const char *file; + __u32 entries; + __u32 max_size; +} args = { + .max_size = 256, +}; + +struct run_stat { + __u64 stats[2]; +}; + +struct dynkey_key { + /* prevent unnecessary hole */ + __u64 cookie; + struct bpf_dynptr_user desc; +}; + +struct var_size_str { + /* the same size as cookie */ + __u64 len; + unsigned char data[]; +}; + +enum { + ARG_DATA_FILE = 11001, + ARG_DATA_ENTRIES = 11002, + ARG_MAX_SIZE = 11003, +}; + +static const struct argp_option opts[] = { + { "file", ARG_DATA_FILE, "DATA-FILE", 0, "Set data file" }, + { "entries", ARG_DATA_ENTRIES, "DATA-ENTRIES", 0, "Set data entries" }, + { "max_size", ARG_MAX_SIZE, "MAX-SIZE", 0, "Set data max size" }, + {}, +}; + +static error_t dynptr_key_parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_DATA_FILE: + args.file = strdup(arg); + if (!args.file) { + fprintf(stderr, "no mem for file name\n"); + argp_usage(state); + } + break; + case ARG_DATA_ENTRIES: + args.entries = strtoul(arg, NULL, 10); + break; + case ARG_MAX_SIZE: + args.max_size = strtoul(arg, NULL, 10); + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_dynptr_key_argp = { + .options = opts, + .parser = dynptr_key_parse_arg, +}; + +static int count_nr_item(const char *name, char *buf, size_t size, unsigned int *nr_items) +{ + unsigned int i = 0; + FILE *file; + int err; + + file = fopen(name, "rb"); + if (!file) { + fprintf(stderr, "open %s err %s\n", name, strerror(errno)); + return -1; + } + + err = 0; + while (true) { + unsigned int len; + char *got; + + got = fgets(buf, size, file); + if (!got) { + if (!feof(file)) { + fprintf(stderr, "read file %s error\n", name); + err = -1; + } + break; + } + + len = strlen(got); + if (len && got[len - 1] == '\n') { + got[len - 1] = 0; + len -= 1; + } + if (!len) + continue; + i++; + } + fclose(file); + + if (!err) + *nr_items = i; + + return err; +} + +static int parse_data_set(const char *name, struct var_size_str ***set, unsigned int *nr, + unsigned int *max_len) +{ +#define FILE_DATA_MAX_SIZE 4095 + unsigned int i, nr_items, item_max_len; + char line[FILE_DATA_MAX_SIZE + 1]; + struct var_size_str **items; + struct var_size_str *cur; + int err = 0; + FILE *file; + char *got; + + if (count_nr_item(name, line, sizeof(line), &nr_items)) + return -1; + if (!nr_items) { + fprintf(stderr, "empty file ?\n"); + return -1; + } + fprintf(stdout, "%u items in %s\n", nr_items, name); + + file = fopen(name, "rb"); + if (!file) { + fprintf(stderr, "open %s err %s\n", name, strerror(errno)); + return -1; + } + + items = (struct var_size_str **)calloc(nr_items, sizeof(*items) + FILE_DATA_MAX_SIZE); + if (!items) { + fprintf(stderr, "no mem for items\n"); + err = -1; + goto out; + } + + i = 0; + item_max_len = 0; + cur = (void *)items + sizeof(*items) * nr_items; + while (true) { + unsigned int len; + + got = fgets(line, sizeof(line), file); + if (!got) { + if (!feof(file)) { + fprintf(stderr, "read file %s error\n", name); + err = -1; + } + break; + } + + len = strlen(got); + if (len && got[len - 1] == '\n') { + got[len - 1] = 0; + len -= 1; + } + if (!len) + continue; + + if (i >= nr_items) { + 
fprintf(stderr, "too many line in %s\n", name); + break; + } + + if (len > item_max_len) + item_max_len = len; + cur->len = len; + memcpy(cur->data, got, len); + items[i++] = cur; + cur = (void *)cur + FILE_DATA_MAX_SIZE; + } + + if (!err) { + if (i != nr_items) + fprintf(stdout, "few lines in %s (exp %u got %u)\n", name, nr_items, i); + *nr = i; + *set = items; + *max_len = item_max_len; + } else { + free(items); + } + +out: + fclose(file); + return err; +} + +static int gen_data_set(unsigned int max_size, struct var_size_str ***set, unsigned int *nr, + unsigned int *max_len) +{ +#define GEN_DATA_MAX_SIZE 4088 + struct var_size_str **items; + size_t ptr_size, data_size; + struct var_size_str *cur; + unsigned int i, nr_items; + size_t left; + ssize_t got; + int err = 0; + void *dst; + + ptr_size = *nr * sizeof(*items); + data_size = *nr * (sizeof(*cur) + max_size); + items = (struct var_size_str **)malloc(ptr_size + data_size); + if (!items) { + fprintf(stderr, "no mem for items\n"); + err = -1; + goto out; + } + + cur = (void *)items + ptr_size; + dst = cur; + left = data_size; + while (left > 0) { + got = syscall(__NR_getrandom, dst, left, 0); + if (got <= 0) { + fprintf(stderr, "getrandom error %s got %zd\n", strerror(errno), got); + err = -1; + goto out; + } + left -= got; + dst += got; + } + + nr_items = 0; + for (i = 0; i < *nr; i++) { + cur->len &= (max_size - 1); + cur->len += 1; + if (cur->len > GEN_DATA_MAX_SIZE) + cur->len = GEN_DATA_MAX_SIZE; + items[nr_items++] = cur; + memset(cur->data + cur->len, 0, max_size - cur->len); + cur = (void *)cur + (sizeof(*cur) + max_size); + } + if (!nr_items) { + fprintf(stderr, "no valid key in random data\n"); + err = -1; + goto out; + } + fprintf(stdout, "generate %u random keys\n", nr_items); + + *nr = nr_items; + *set = items; + *max_len = max_size <= GEN_DATA_MAX_SIZE ? 
max_size : GEN_DATA_MAX_SIZE; +out: + if (err && items) + free(items); + return err; +} + +static inline bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + +static void dynptr_key_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "dynptr_key benchmark doesn't support consumer!\n"); + exit(1); + } + + if (!args.file && !args.entries) { + fprintf(stderr, "must specify entries when using a randomly generated data set\n"); + exit(1); + } + + if (args.file && access(args.file, R_OK)) { + fprintf(stderr, "data file is not accessible\n"); + exit(1); + } + + if (args.entries && !is_pow_of_2(args.max_size)) { + fprintf(stderr, "invalid max size %u (should be power-of-two)\n", args.max_size); + exit(1); + } +} + +static void dynptr_key_init_map_opts(struct dynptr_key_bench *skel, unsigned int data_size, + unsigned int nr) +{ + /* The value will be used as the key for the hash map */ + bpf_map__set_value_size(skel->maps.array, + offsetof(struct dynkey_key, desc) + data_size); + bpf_map__set_max_entries(skel->maps.array, nr); + + bpf_map__set_key_size(skel->maps.htab, offsetof(struct dynkey_key, desc) + data_size); + bpf_map__set_max_entries(skel->maps.htab, nr); + + bpf_map__set_map_extra(skel->maps.dynkey_htab, data_size); + bpf_map__set_max_entries(skel->maps.dynkey_htab, nr); +} + +static void dynptr_key_setup_key_map(struct bpf_map *map, struct var_size_str **set, + unsigned int nr) +{ + int fd = bpf_map__fd(map); + unsigned int i; + + for (i = 0; i < nr; i++) { + void *value; + int err; + + value = (void *)set[i]; + err = bpf_map_update_elem(fd, &i, value, 0); + if (err) { + fprintf(stderr, "add #%u key (%s) on %s error %d\n", + i, set[i]->data, bpf_map__name(map), err); + exit(1); + } + } +} + +static u64 dynptr_key_get_slab_mem(int dfd) +{ + const char *magic = "slab "; + const char *name = "memory.stat"; + int fd; + ssize_t nr; + char buf[4096]; + char *from; + + fd = openat(dfd, name, 0); + if (fd < 0) { + fprintf(stdout, "no %s (cgroup v1 ?)\n", name); + return 0; + } + + nr = read(fd, buf, sizeof(buf)); + if (nr <= 0) { + fprintf(stderr, "empty %s ?\n", name); + exit(1); + } + buf[nr - 1] = 0; + + close(fd); + + from = strstr(buf, magic); + if (!from) { + fprintf(stderr, "no slab in %s\n", name); + exit(1); + } + + return strtoull(from + strlen(magic), NULL, 10); +} + +static void dynptr_key_setup_lookup_map(struct bpf_map *map, unsigned int map_type, + struct var_size_str **set, unsigned int nr) +{ + int fd = bpf_map__fd(map); + unsigned int i; + + for (i = 0; i < nr; i++) { + struct dynkey_key dynkey; + void *key; + int err; + + if (map_type == NORM_HTAB) { + key = set[i]; + } else { + dynkey.cookie = set[i]->len; + bpf_dynptr_user_init(set[i]->data, set[i]->len, &dynkey.desc); + key = &dynkey; + } + /* May have duplicate keys */ + err = bpf_map_update_elem(fd, key, &i, 0); + if (err) { + fprintf(stderr, "add #%u key (%s) on %s error %d\n", + i, set[i]->data, bpf_map__name(map), err); + exit(1); + } + } +} + +static void dump_data_set_metric(struct var_size_str **set, unsigned int nr) +{ + double mean = 0.0, stddev = 0.0; + unsigned int max = 0; + unsigned int i; + + for (i = 0; i < nr; i++) { + if (set[i]->len > max) + max = set[i]->len; + mean += set[i]->len / (0.0 + nr); + } + + if (nr > 1) { + for (i = 0; i < nr; i++) + stddev += (mean - set[i]->len) * (mean - set[i]->len) / (nr - 1.0); + stddev = sqrt(stddev); + } + + fprintf(stdout, "str length: max %u mean %.0f stdev %.0f\n", max, mean, stddev); +} + +static void dynptr_key_setup(unsigned int map_type,
const char *prog_name) +{ + struct var_size_str **set = NULL; + struct dynptr_key_bench *skel; + unsigned int nr = 0, max_len = 0; + struct bpf_program *prog; + struct bpf_link *link; + struct bpf_map *map; + u64 before, after; + int dfd; + int err; + + if (!args.file) { + nr = args.entries; + err = gen_data_set(args.max_size, &set, &nr, &max_len); + } else { + err = parse_data_set(args.file, &set, &nr, &max_len); + } + if (err < 0) + exit(1); + + if (args.entries && args.entries < nr) + nr = args.entries; + + dump_data_set_metric(set, nr); + + dfd = cgroup_setup_and_join("/dynptr_key"); + if (dfd < 0) { + fprintf(stderr, "failed to setup cgroup env\n"); + goto free_str_set; + } + + setup_libbpf(); + + before = dynptr_key_get_slab_mem(dfd); + + skel = dynptr_key_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + goto leave_cgroup; + } + + dynptr_key_init_map_opts(skel, max_len, nr); + + skel->rodata->max_dynkey_size = max_len; + skel->bss->update_nr = nr; + skel->bss->update_chunk = nr / env.producer_cnt; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!prog) { + fprintf(stderr, "no such prog %s\n", prog_name); + goto destroy_skel; + } + bpf_program__set_autoload(prog, true); + + err = dynptr_key_bench__load(skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + goto destroy_skel; + } + + dynptr_key_setup_key_map(skel->maps.array, set, nr); + + map = (map_type == NORM_HTAB) ? skel->maps.htab : skel->maps.dynkey_htab; + dynptr_key_setup_lookup_map(map, map_type, set, nr); + + after = dynptr_key_get_slab_mem(dfd); + + link = bpf_program__attach(prog); + if (!link) { + fprintf(stderr, "failed to attach %s\n", prog_name); + goto destroy_skel; + } + + ctx.skel = skel; + ctx.cgrp_dfd = dfd; + ctx.map_slab_mem = after - before; + free(set); + return; + +destroy_skel: + dynptr_key_bench__destroy(skel); +leave_cgroup: + close(dfd); + cleanup_cgroup_environment(); +free_str_set: + free(set); + exit(1); +} + +static void dynkey_htab_lookup_setup(void) +{ + dynptr_key_setup(DYNPTR_KEY_HTAB, "dynkey_htab_lookup"); +} + +static void norm_htab_lookup_setup(void) +{ + dynptr_key_setup(NORM_HTAB, "htab_lookup"); +} + +static void dynkey_htab_update_setup(void) +{ + dynptr_key_setup(DYNPTR_KEY_HTAB, "dynkey_htab_update"); +} + +static void norm_htab_update_setup(void) +{ + dynptr_key_setup(NORM_HTAB, "htab_update"); +} + +static void *dynptr_key_producer(void *ctx) +{ + while (true) + (void)syscall(__NR_getpgid); + return NULL; +} + +static void dynptr_key_measure(struct bench_res *res) +{ + static __u64 last_hits, last_drops; + __u64 total_hits = 0, total_drops = 0; + unsigned int i, nr_cpus; + + nr_cpus = bpf_num_possible_cpus(); + for (i = 0; i < nr_cpus; i++) { + struct run_stat *s = (void *)&ctx.skel->bss->percpu_stats[i & 255]; + + total_hits += s->stats[0]; + total_drops += s->stats[1]; + } + + res->hits = total_hits - last_hits; + res->drops = total_drops - last_drops; + + last_hits = total_hits; + last_drops = total_drops; +} + +static void dynptr_key_report_final(struct bench_res res[], int res_cnt) +{ + close(ctx.cgrp_dfd); + cleanup_cgroup_environment(); + + fprintf(stdout, "Slab: %.3f MiB\n", (float)ctx.map_slab_mem / 1024 / 1024); + hits_drops_report_final(res, res_cnt); +} + +const struct bench bench_dynkey_htab_lookup = { + .name = "dynkey-htab-lookup", + .argp = &bench_dynptr_key_argp, + .validate = dynptr_key_validate, + .setup = dynkey_htab_lookup_setup, + .producer_thread = dynptr_key_producer, + .measure = 
dynptr_key_measure, + .report_progress = hits_drops_report_progress, + .report_final = dynptr_key_report_final, +}; + +const struct bench bench_norm_htab_lookup = { + .name = "norm-htab-lookup", + .argp = &bench_dynptr_key_argp, + .validate = dynptr_key_validate, + .setup = norm_htab_lookup_setup, + .producer_thread = dynptr_key_producer, + .measure = dynptr_key_measure, + .report_progress = hits_drops_report_progress, + .report_final = dynptr_key_report_final, +}; + +const struct bench bench_dynkey_htab_update = { + .name = "dynkey-htab-update", + .argp = &bench_dynptr_key_argp, + .validate = dynptr_key_validate, + .setup = dynkey_htab_update_setup, + .producer_thread = dynptr_key_producer, + .measure = dynptr_key_measure, + .report_progress = hits_drops_report_progress, + .report_final = dynptr_key_report_final, +}; + +const struct bench bench_norm_htab_update = { + .name = "norm-htab-update", + .argp = &bench_dynptr_key_argp, + .validate = dynptr_key_validate, + .setup = norm_htab_update_setup, + .producer_thread = dynptr_key_producer, + .measure = dynptr_key_measure, + .report_progress = hits_drops_report_progress, + .report_final = dynptr_key_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_dynptr_key.sh b/tools/testing/selftests/bpf/benchs/run_bench_dynptr_key.sh new file mode 100755 index 0000000000000..ec074ce55a363 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_dynptr_key.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ./benchs/run_common.sh + +set -eufo pipefail + +prod_list=${PROD_LIST:-"1 2 4 8"} +entries=${ENTRIES:-8192} +max_size=${MAX_SIZE:-256} +str_file=${STR_FILE:-} + +summarize_rate_and_mem() +{ + local bench="$1" + local mem=$(echo $2 | grep Slab: | \ + sed -E "s/.*Slab:\s+([0-9]+\.[0-9]+ MiB).*/\1/") + local summary=$(echo $2 | tail -n1) + + printf "%-20s %s (drops %s, mem %s)\n" "$bench" "$(hits $summary)" \ + "$(drops $summary)" "$mem" +} + +htab_bench() +{ + local opts="--entries ${entries} --max_size ${max_size}" + local desc="${entries}" + local name + local prod + + if test -n "${str_file}" && test -f "${str_file}" + then + opts="--file ${str_file}" + desc="${str_file}" + fi + + for name in htab-lookup htab-update + do + for prod in ${prod_list} + do + summarize_rate_and_mem "${name}-p${prod}-${desc}" \ + "$($RUN_BENCH -p${prod} ${1}-${name} ${opts})" + done + done +} + +header "normal hash map" +htab_bench norm + +header "dynptr-keyed hash map" +htab_bench dynkey diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 5f6963a320d73..8ad7e97006c75 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -71,4 +71,13 @@ static inline void bpf_strlcpy(char *dst, const char *src, size_t sz) #define ENOTSUPP 524 #endif +/* sys_bpf() will check the validity of data and size */ +static inline void bpf_dynptr_user_init(void *data, __u32 size, + struct bpf_dynptr_user *dynptr) +{ + dynptr->data = data; + dynptr->size = size; + dynptr->reserved = 0; +} + #endif /* __BPF_UTIL__ */ diff --git a/tools/testing/selftests/bpf/prog_tests/htab_dynkey_test.c b/tools/testing/selftests/bpf/prog_tests/htab_dynkey_test.c new file mode 100644 index 0000000000000..b1f86642e89c1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/htab_dynkey_test.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ +#include +#include +#include +#include +#include + +#include "htab_dynkey_test_success.skel.h" +#include "htab_dynkey_test_failure.skel.h" + +struct id_dname_key { + int id; + struct bpf_dynptr_user name; +}; + +struct dname_key { + struct bpf_dynptr_user name; +}; + +struct nested_dynptr_key { + unsigned long f_1; + struct id_dname_key f_2; + unsigned long f_3; +}; + +static char *name_list[] = { + "systemd", + "[rcu_sched]", + "[kworker/42:0H-events_highpri]", + "[ksoftirqd/58]", + "[rcu_tasks_trace]", +}; + +#define INIT_VALUE 100 +#define INIT_ID 1000 + +static void setup_pure_dynptr_key_map(int fd) +{ + struct bpf_dynptr_user key, _cur_key, _next_key; + struct bpf_dynptr_user *cur_key, *next_key; + bool marked[ARRAY_SIZE(name_list)]; + unsigned int i, next_idx, size; + unsigned long value, got; + char name[2][64]; + char msg[64]; + void *data; + int err; + + /* lookup non-existent keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u bad lookup", i); + /* Use strdup() to ensure that the content pointed by dynptr is + * used for lookup instead of the pointer in dynptr. sys_bpf() + * will handle the NULL case properly. + */ + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key); + err = bpf_map_lookup_elem(fd, &key, &value); + ASSERT_EQ(err, -ENOENT, msg); + free(data); + } + + /* update keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u insert", i); + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key); + value = INIT_VALUE + i; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + ASSERT_OK(err, msg); + free(data); + } + + /* lookup existent keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u lookup", i); + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key); + got = 0; + err = bpf_map_lookup_elem(fd, &key, &got); + ASSERT_OK(err, msg); + free(data); + + value = INIT_VALUE + i; + ASSERT_EQ(got, value, msg); + } + + /* delete keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u delete", i); + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key); + err = bpf_map_delete_elem(fd, &key); + ASSERT_OK(err, msg); + free(data); + } + + /* re-insert keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u re-insert", i); + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key); + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + ASSERT_OK(err, msg); + free(data); + } + + /* overwrite keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u overwrite", i); + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key); + value = INIT_VALUE + i; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + ASSERT_OK(err, msg); + free(data); + } + + /* get_next keys */ + next_idx = 0; + cur_key = NULL; + next_key = &_next_key; + memset(&marked, 0, sizeof(marked)); + while (true) { + bpf_dynptr_user_init(name[next_idx], sizeof(name[next_idx]), next_key); + err = bpf_map_get_next_key(fd, cur_key, next_key); + if (err) { + ASSERT_EQ(err, -ENOENT, "get_next_key"); + break; + } + + size = next_key->size; + data = next_key->data; + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + if (size == strlen(name_list[i]) + 1 && + 
!memcmp(name_list[i], data, size)) { + ASSERT_FALSE(marked[i], name_list[i]); + marked[i] = true; + break; + } + } + ASSERT_EQ(next_key->reserved, 0, "reserved"); + + if (!cur_key) + cur_key = &_cur_key; + *cur_key = *next_key; + next_idx ^= 1; + } + + for (i = 0; i < ARRAY_SIZE(marked); i++) + ASSERT_TRUE(marked[i], name_list[i]); + + /* lookup_and_delete all elements except the first one */ + for (i = 1; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u lookup_delete", i); + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key); + got = 0; + err = bpf_map_lookup_and_delete_elem(fd, &key, &got); + ASSERT_OK(err, msg); + free(data); + + value = INIT_VALUE + i; + ASSERT_EQ(got, value, msg); + } + + /* get the key after the first element */ + cur_key = &_cur_key; + strncpy(name[0], name_list[0], sizeof(name[0]) - 1); + name[0][sizeof(name[0]) - 1] = 0; + bpf_dynptr_user_init(name[0], strlen(name[0]) + 1, cur_key); + + next_key = &_next_key; + bpf_dynptr_user_init(name[1], sizeof(name[1]), next_key); + err = bpf_map_get_next_key(fd, cur_key, next_key); + ASSERT_EQ(err, -ENOENT, "get_last"); +} + +static void setup_mixed_dynptr_key_map(int fd) +{ + struct id_dname_key key, _cur_key, _next_key; + struct id_dname_key *cur_key, *next_key; + bool marked[ARRAY_SIZE(name_list)]; + unsigned int i, next_idx, size; + unsigned long value; + char name[2][64]; + char msg[64]; + void *data; + int err; + + /* Zero the hole */ + memset(&key, 0, sizeof(key)); + + /* lookup non-existent keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u bad lookup", i); + key.id = INIT_ID + i; + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key.name); + err = bpf_map_lookup_elem(fd, &key, &value); + ASSERT_EQ(err, -ENOENT, msg); + free(data); + } + + /* update keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u insert", i); + key.id = INIT_ID + i; + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key.name); + value = INIT_VALUE + i; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + ASSERT_OK(err, msg); + free(data); + } + + /* lookup existent keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + unsigned long got = 0; + + snprintf(msg, sizeof(msg), "#%u lookup", i); + key.id = INIT_ID + i; + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key.name); + err = bpf_map_lookup_elem(fd, &key, &got); + ASSERT_OK(err, msg); + free(data); + + value = INIT_VALUE + i; + ASSERT_EQ(got, value, msg); + } + + /* delete keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u delete", i); + key.id = INIT_ID + i; + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key.name); + err = bpf_map_delete_elem(fd, &key); + ASSERT_OK(err, msg); + free(data); + } + + /* re-insert keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u re-insert", i); + key.id = INIT_ID + i; + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, strlen(name_list[i]) + 1, &key.name); + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + ASSERT_OK(err, msg); + free(data); + } + + /* overwrite keys */ + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + snprintf(msg, sizeof(msg), "#%u overwrite", i); + key.id = INIT_ID + i; + data = strdup(name_list[i]); + bpf_dynptr_user_init(data, 
strlen(name_list[i]) + 1, &key.name); + value = INIT_VALUE + i; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + ASSERT_OK(err, msg); + free(data); + } + + /* get_next keys */ + next_idx = 0; + cur_key = NULL; + next_key = &_next_key; + memset(&marked, 0, sizeof(marked)); + while (true) { + bpf_dynptr_user_init(name[next_idx], sizeof(name[next_idx]), &next_key->name); + err = bpf_map_get_next_key(fd, cur_key, next_key); + if (err) { + ASSERT_EQ(err, -ENOENT, "last get_next"); + break; + } + + size = next_key->name.size; + data = next_key->name.data; + for (i = 0; i < ARRAY_SIZE(name_list); i++) { + if (size == strlen(name_list[i]) + 1 && + !memcmp(name_list[i], data, size)) { + ASSERT_FALSE(marked[i], name_list[i]); + ASSERT_EQ(next_key->id, INIT_ID + i, name_list[i]); + marked[i] = true; + break; + } + } + ASSERT_EQ(next_key->name.reserved, 0, "reserved"); + + if (!cur_key) + cur_key = &_cur_key; + *cur_key = *next_key; + next_idx ^= 1; + } + + for (i = 0; i < ARRAY_SIZE(marked); i++) + ASSERT_TRUE(marked[i], name_list[i]); +} + +static void setup_nested_dynptr_key_map(int fd) +{ + struct nested_dynptr_key key, cur_key, next_key; + unsigned long value; + unsigned int size; + char name[2][64]; + void *data; + int err; + + /* Zero the hole */ + memset(&key, 0, sizeof(key)); + + key.f_1 = 1; + key.f_2.id = 2; + key.f_3 = 3; + + /* lookup a non-existent key */ + data = strdup(name_list[0]); + bpf_dynptr_user_init(data, strlen(name_list[0]) + 1, &key.f_2.name); + err = bpf_map_lookup_elem(fd, &key, &value); + ASSERT_EQ(err, -ENOENT, "lookup"); + + /* update key */ + value = INIT_VALUE; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + ASSERT_OK(err, "update"); + free(data); + + /* lookup key */ + data = strdup(name_list[0]); + bpf_dynptr_user_init(data, strlen(name_list[0]) + 1, &key.f_2.name); + err = bpf_map_lookup_elem(fd, &key, &value); + ASSERT_OK(err, "lookup"); + ASSERT_EQ(value, INIT_VALUE, "lookup"); + + /* delete key */ + err = bpf_map_delete_elem(fd, &key); + ASSERT_OK(err, "delete"); + free(data); + + /* re-insert keys */ + bpf_dynptr_user_init(name_list[0], strlen(name_list[0]) + 1, &key.f_2.name); + value = 0; + err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST); + ASSERT_OK(err, "re-insert"); + + /* overwrite keys */ + data = strdup(name_list[0]); + bpf_dynptr_user_init(data, strlen(name_list[0]) + 1, &key.f_2.name); + value = INIT_VALUE; + err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); + ASSERT_OK(err, "overwrite"); + free(data); + + /* get_next_key */ + bpf_dynptr_user_init(name[0], sizeof(name[0]), &next_key.f_2.name); + err = bpf_map_get_next_key(fd, NULL, &next_key); + ASSERT_OK(err, "first get_next"); + + ASSERT_EQ(next_key.f_1, 1, "f_1"); + + ASSERT_EQ(next_key.f_2.id, 2, "f_2 id"); + size = next_key.f_2.name.size; + data = next_key.f_2.name.data; + if (ASSERT_EQ(size, strlen(name_list[0]) + 1, "f_2 size")) + ASSERT_TRUE(!memcmp(name_list[0], data, size), "f_2 data"); + ASSERT_EQ(next_key.f_2.name.reserved, 0, "f_2 reserved"); + + ASSERT_EQ(next_key.f_3, 3, "f_3"); + + cur_key = next_key; + bpf_dynptr_user_init(name[1], sizeof(name[1]), &next_key.f_2.name); + err = bpf_map_get_next_key(fd, &cur_key, &next_key); + ASSERT_EQ(err, -ENOENT, "last get_next_key"); +} + +static void test_htab_dynptr_key(bool pure, bool nested) +{ + struct htab_dynkey_test_success *skel; + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct bpf_program *prog; + int err; + + skel = htab_dynkey_test_success__open(); + if (!ASSERT_OK_PTR(skel, "open()")) + return; + 
+ prog = pure ? skel->progs.pure_dynptr_key : + (nested ? skel->progs.nested_dynptr_key : skel->progs.mixed_dynptr_key); + bpf_program__set_autoload(prog, true); + + err = htab_dynkey_test_success__load(skel); + if (!ASSERT_OK(err, "load()")) + goto out; + + if (pure) { + setup_pure_dynptr_key_map(bpf_map__fd(skel->maps.htab_1)); + setup_pure_dynptr_key_map(bpf_map__fd(skel->maps.htab_2)); + } else if (nested) { + setup_nested_dynptr_key_map(bpf_map__fd(skel->maps.htab_4)); + } else { + setup_mixed_dynptr_key_map(bpf_map__fd(skel->maps.htab_3)); + } + + err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + ASSERT_OK(err, "run"); + ASSERT_EQ(opts.retval, 0, "retval"); +out: + htab_dynkey_test_success__destroy(skel); +} + +void test_htab_dynkey_test(void) +{ + if (test__start_subtest("pure_dynptr_key")) + test_htab_dynptr_key(true, false); + if (test__start_subtest("mixed_dynptr_key")) + test_htab_dynptr_key(false, false); + if (test__start_subtest("nested_dynptr_key")) + test_htab_dynptr_key(false, true); + + RUN_TESTS(htab_dynkey_test_failure); +} diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index 4ece7873ba609..afbf2e99b1bb8 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -10,7 +10,7 @@ /* Should use BTF_FIELDS_MAX, but it is not always available in vmlinux.h, * so use the hard-coded number as a workaround. */ -#define CPUMASK_KPTR_FIELDS_MAX 11 +#define CPUMASK_KPTR_FIELDS_MAX 13 int err; diff --git a/tools/testing/selftests/bpf/progs/dynptr_key_bench.c b/tools/testing/selftests/bpf/progs/dynptr_key_bench.c new file mode 100644 index 0000000000000..2f3dea926776b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/dynptr_key_bench.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ +#include +#include +#include +#include +#include + +struct bpf_map; + +struct dynkey_key { + /* Use 8 bytes to prevent unnecessary hole */ + __u64 cookie; + struct bpf_dynptr desc; +}; + +struct var_size_key { + __u64 len; + unsigned char data[]; +}; + +/* Its value will be used as the key of hash map. The size of value is fixed, + * however, the first 8 bytes denote the length of valid data in the value. 
+ */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, 4); +} array SEC(".maps"); + +/* key_size will be set by benchmark */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(value_size, 4); + __uint(map_flags, BPF_F_NO_PREALLOC); +} htab SEC(".maps"); + +/* map_extra will be set by benchmark */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct dynkey_key); + __type(value, unsigned int); + __uint(map_flags, BPF_F_NO_PREALLOC); +} dynkey_htab SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +struct { + __u64 stats[2]; +} __attribute__((__aligned__(256))) percpu_stats[256]; + +struct update_ctx { + unsigned int max; + unsigned int from; +}; + +volatile const unsigned int max_dynkey_size; +unsigned int update_nr; +unsigned int update_chunk; + +static __always_inline void update_stats(int idx) +{ + __u32 cpu = bpf_get_smp_processor_id(); + + percpu_stats[cpu & 255].stats[idx]++; +} + +static int lookup_htab(struct bpf_map *map, __u32 *key, void *value, void *data) +{ + __u32 *index; + + index = bpf_map_lookup_elem(&htab, value); + if (index && *index == *key) + update_stats(0); + else + update_stats(1); + return 0; +} + +static int lookup_dynkey_htab(struct bpf_map *map, __u32 *key, void *value, void *data) +{ + struct var_size_key *var_size_key = value; + struct dynkey_key dynkey; + __u32 *index; + __u64 len; + + len = var_size_key->len; + if (len > max_dynkey_size) + return 0; + + dynkey.cookie = len; + bpf_dynptr_from_mem(var_size_key->data, len, 0, &dynkey.desc); + index = bpf_map_lookup_elem(&dynkey_htab, &dynkey); + if (index && *index == *key) + update_stats(0); + else + update_stats(1); + return 0; +} + +static int update_htab_loop(unsigned int i, void *ctx) +{ + struct update_ctx *update = ctx; + void *value; + int err; + + if (update->from >= update->max) + update->from = 0; + value = bpf_map_lookup_elem(&array, &update->from); + if (!value) + return 1; + + err = bpf_map_update_elem(&htab, value, &update->from, 0); + if (!err) + update_stats(0); + else + update_stats(1); + update->from++; + + return 0; +} + +static int delete_htab_loop(unsigned int i, void *ctx) +{ + struct update_ctx *update = ctx; + void *value; + int err; + + if (update->from >= update->max) + update->from = 0; + value = bpf_map_lookup_elem(&array, &update->from); + if (!value) + return 1; + + err = bpf_map_delete_elem(&htab, value); + if (!err) + update_stats(0); + update->from++; + + return 0; +} + +static int update_dynkey_htab_loop(unsigned int i, void *ctx) +{ + struct update_ctx *update = ctx; + struct var_size_key *value; + struct dynkey_key dynkey; + __u64 len; + int err; + + if (update->from >= update->max) + update->from = 0; + value = bpf_map_lookup_elem(&array, &update->from); + if (!value) + return 1; + len = value->len; + if (len > max_dynkey_size) + return 1; + + dynkey.cookie = len; + bpf_dynptr_from_mem(value->data, len, 0, &dynkey.desc); + err = bpf_map_update_elem(&dynkey_htab, &dynkey, &update->from, 0); + if (!err) + update_stats(0); + else + update_stats(1); + update->from++; + + return 0; +} + +static int delete_dynkey_htab_loop(unsigned int i, void *ctx) +{ + struct update_ctx *update = ctx; + struct var_size_key *value; + struct dynkey_key dynkey; + __u64 len; + int err; + + if (update->from >= update->max) + update->from = 0; + value = bpf_map_lookup_elem(&array, &update->from); + if (!value) + return 1; + len = value->len; + if (len > max_dynkey_size) + return 1; + + dynkey.cookie = len; + bpf_dynptr_from_mem(value->data, len, 0, 
&dynkey.desc); + err = bpf_map_delete_elem(&dynkey_htab, &dynkey); + if (!err) + update_stats(0); + update->from++; + + return 0; +} + +SEC("?tp/syscalls/sys_enter_getpgid") +int htab_lookup(void *ctx) +{ + bpf_for_each_map_elem(&array, lookup_htab, NULL, 0); + return 0; +} + +SEC("?tp/syscalls/sys_enter_getpgid") +int dynkey_htab_lookup(void *ctx) +{ + bpf_for_each_map_elem(&array, lookup_dynkey_htab, NULL, 0); + return 0; +} + +SEC("?tp/syscalls/sys_enter_getpgid") +int htab_update(void *ctx) +{ + unsigned int index = bpf_get_smp_processor_id() * update_chunk; + struct update_ctx update; + + update.max = update_nr; + if (update.max && index >= update.max) + index %= update.max; + + /* Only operate part of keys according to cpu id */ + update.from = index; + bpf_loop(update_chunk, update_htab_loop, &update, 0); + + update.from = index; + bpf_loop(update_chunk, delete_htab_loop, &update, 0); + + return 0; +} + +SEC("?tp/syscalls/sys_enter_getpgid") +int dynkey_htab_update(void *ctx) +{ + unsigned int index = bpf_get_smp_processor_id() * update_chunk; + struct update_ctx update; + + update.max = update_nr; + if (update.max && index >= update.max) + index %= update.max; + + /* Only operate part of keys according to cpu id */ + update.from = index; + bpf_loop(update_chunk, update_dynkey_htab_loop, &update, 0); + + update.from = index; + bpf_loop(update_chunk, delete_dynkey_htab_loop, &update, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/htab_dynkey_test_failure.c b/tools/testing/selftests/bpf/progs/htab_dynkey_test_failure.c new file mode 100644 index 0000000000000..2899f1041624b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/htab_dynkey_test_failure.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ +#include +#include +#include +#include +#include + +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct bpf_map; + +struct id_dname_key { + int id; + struct bpf_dynptr name; +}; + +struct dname_id_key { + struct bpf_dynptr name; + int id; +}; + +struct id_name_key { + int id; + char name[20]; +}; + +struct dname_key { + struct bpf_dynptr name; +}; + +struct dname_dname_key { + struct bpf_dynptr name_1; + struct bpf_dynptr name_2; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, struct id_dname_key); + __type(value, unsigned long); + __uint(map_extra, 1024); +} htab_1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, struct dname_key); + __type(value, unsigned long); + __uint(map_extra, 1024); +} htab_2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, struct bpf_dynptr); + __type(value, unsigned long); + __uint(map_extra, 1024); +} htab_3 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); +} ringbuf SEC(".maps"); + +char dynptr_buf[32] = {}; + +/* uninitialized dynptr */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("dynptr-key expects dynptr at offset 8") +int BPF_PROG(uninit_dynptr) +{ + struct id_dname_key key; + + key.id = 100; + bpf_map_lookup_elem(&htab_1, &key); + + return 0; +} + +/* invalid dynptr */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("dynptr-key expects dynptr at offset 8") +int BPF_PROG(invalid_dynptr) +{ + struct id_dname_key key; + + key.id = 100; + bpf_ringbuf_reserve_dynptr(&ringbuf, 10, 0, &key.name); + bpf_ringbuf_discard_dynptr(&key.name, 0); + bpf_map_lookup_elem(&htab_1, &key); + + return 0; +} + +/* expect no-dynptr got dynptr */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("dynptr-key expects non-dynptr at offset 0") +int BPF_PROG(invalid_non_dynptr) +{ + struct dname_id_key key; + + __builtin_memcpy(dynptr_buf, "test", 4); + bpf_dynptr_from_mem(dynptr_buf, 4, 0, &key.name); + key.id = 100; + bpf_map_lookup_elem(&htab_1, &key); + + return 0; +} + +/* expect dynptr get non-dynptr */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("dynptr-key expects dynptr at offset 8") +int BPF_PROG(no_dynptr) +{ + struct id_name_key key; + + key.id = 100; + __builtin_memset(key.name, 0, sizeof(key.name)); + __builtin_memcpy(key.name, "test", 4); + bpf_map_lookup_elem(&htab_1, &key); + + return 0; +} + +/* malformed */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("malformed dynptr-key at offset 8") +int BPF_PROG(malformed_dynptr) +{ + struct dname_dname_key key; + + bpf_dynptr_from_mem(dynptr_buf, 4, 0, &key.name_1); + bpf_dynptr_from_mem(dynptr_buf, 4, 0, &key.name_2); + + bpf_map_lookup_elem(&htab_2, (void *)&key + 8); + + return 0; +} + +/* misaligned */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("R2 misaligned offset -28 for dynptr-key") +int BPF_PROG(misaligned_dynptr) +{ + struct dname_dname_key key; + + bpf_map_lookup_elem(&htab_1, (char *)&key + 4); + + return 0; +} + +/* variable offset */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("R2 variable offset prohibited for dynptr-key") +int BPF_PROG(variable_offset_dynptr) +{ + struct bpf_dynptr dynptr_1; + struct bpf_dynptr dynptr_2; + char *key; + + 
bpf_dynptr_from_mem(dynptr_buf, 4, 0, &dynptr_1); + bpf_dynptr_from_mem(dynptr_buf, 4, 0, &dynptr_2); + + key = (char *)&dynptr_2; + key = key + (bpf_get_prandom_u32() & 1) * 16; + + bpf_map_lookup_elem(&htab_2, key); + + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("map dynptr-key requires stack ptr but got map_value") +int BPF_PROG(map_value_as_key) +{ + bpf_map_lookup_elem(&htab_1, dynptr_buf); + + return 0; +} + +static int lookup_htab(struct bpf_map *map, struct id_dname_key *key, void *value, void *data) +{ + bpf_map_lookup_elem(&htab_1, key); + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("map dynptr-key requires stack ptr but got map_key") +int BPF_PROG(map_key_as_key) +{ + bpf_for_each_map_elem(&htab_1, lookup_htab, NULL, 0); + return 0; +} + +__noinline __weak int subprog_lookup_htab(struct bpf_dynptr *dynptr) +{ + bpf_map_lookup_elem(&htab_3, dynptr); + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +__failure __msg("R2 type=dynptr_ptr expected=") +int BPF_PROG(subprog_dynptr) +{ + struct bpf_dynptr dynptr; + + bpf_dynptr_from_mem(dynptr_buf, 4, 0, &dynptr); + subprog_lookup_htab(&dynptr); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/htab_dynkey_test_success.c b/tools/testing/selftests/bpf/progs/htab_dynkey_test_success.c new file mode 100644 index 0000000000000..ff37f22f07da4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/htab_dynkey_test_success.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ +#include +#include +#include +#include +#include + +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct pure_dynptr_key { + struct bpf_dynptr name; +}; + +struct mixed_dynptr_key { + int id; + struct bpf_dynptr name; +}; + +struct nested_dynptr_key { + unsigned long f_1; + struct mixed_dynptr_key f_2; + unsigned long f_3; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, struct bpf_dynptr); + __type(value, unsigned long); + __uint(map_extra, 1024); +} htab_1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, struct pure_dynptr_key); + __type(value, unsigned long); + __uint(map_extra, 1024); +} htab_2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, struct mixed_dynptr_key); + __type(value, unsigned long); + __uint(map_extra, 1024); +} htab_3 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, struct nested_dynptr_key); + __type(value, unsigned long); + __uint(map_extra, 1024); +} htab_4 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); +} ringbuf SEC(".maps"); + +char dynptr_buf[2][32] = {{}, {}}; + +static const char systemd_name[] = "systemd"; +static const char udevd_name[] = "udevd"; +static const char rcu_sched_name[] = "[rcu_sched]"; + +struct bpf_map; + +static int test_pure_dynptr_key_htab(struct bpf_map *htab) +{ + unsigned long new_value, *value; + struct bpf_dynptr key; + int err = 0; + + /* Lookup a existent key */ + __builtin_memcpy(dynptr_buf[0], systemd_name, sizeof(systemd_name)); + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(systemd_name), 0, &key); + value = bpf_map_lookup_elem(htab, &key); + if 
(!value) { + err = 1; + goto out; + } + if (*value != 100) { + err = 2; + goto out; + } + + /* Look up a non-existent key */ + __builtin_memcpy(dynptr_buf[0], udevd_name, sizeof(udevd_name)); + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(udevd_name), 0, &key); + value = bpf_map_lookup_elem(htab, &key); + if (value) { + err = 3; + goto out; + } + + /* Insert a new key */ + new_value = 42; + err = bpf_map_update_elem(htab, &key, &new_value, BPF_NOEXIST); + if (err) { + err = 4; + goto out; + } + + /* Insert an existent key */ + bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(udevd_name), 0, &key); + err = bpf_dynptr_write(&key, 0, (void *)udevd_name, sizeof(udevd_name), 0); + if (err) { + bpf_ringbuf_discard_dynptr(&key, 0); + err = 5; + goto out; + } + + err = bpf_map_update_elem(htab, &key, &new_value, BPF_NOEXIST); + bpf_ringbuf_discard_dynptr(&key, 0); + if (err != -EEXIST) { + err = 6; + goto out; + } + + /* Lookup it again */ + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(udevd_name), 0, &key); + value = bpf_map_lookup_elem(htab, &key); + if (!value) { + err = 7; + goto out; + } + if (*value != 42) { + err = 8; + goto out; + } + + /* Delete then lookup it */ + bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(udevd_name), 0, &key); + err = bpf_dynptr_write(&key, 0, (void *)udevd_name, sizeof(udevd_name), 0); + if (err) { + bpf_ringbuf_discard_dynptr(&key, 0); + err = 9; + goto out; + } + err = bpf_map_delete_elem(htab, &key); + bpf_ringbuf_discard_dynptr(&key, 0); + if (err) { + err = 10; + goto out; + } + + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(udevd_name), 0, &key); + value = bpf_map_lookup_elem(htab, &key); + if (value) { + err = 10; + goto out; + } +out: + return err; +} + +static int test_mixed_dynptr_key_htab(struct bpf_map *htab) +{ + unsigned long new_value, *value; + char udevd_name[] = "udevd"; + struct mixed_dynptr_key key; + int err = 0; + + __builtin_memset(&key, 0, sizeof(key)); + key.id = 1000; + + /* Lookup a existent key */ + __builtin_memcpy(dynptr_buf[0], systemd_name, sizeof(systemd_name)); + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(systemd_name), 0, &key.name); + value = bpf_map_lookup_elem(htab, &key); + if (!value) { + err = 1; + goto out; + } + if (*value != 100) { + err = 2; + goto out; + } + + /* Look up a non-existent key */ + __builtin_memcpy(dynptr_buf[0], udevd_name, sizeof(udevd_name)); + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(udevd_name), 0, &key.name); + value = bpf_map_lookup_elem(htab, &key); + if (value) { + err = 3; + goto out; + } + + /* Insert a new key */ + new_value = 42; + err = bpf_map_update_elem(htab, &key, &new_value, BPF_NOEXIST); + if (err) { + err = 4; + goto out; + } + + /* Insert an existent key */ + bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(udevd_name), 0, &key.name); + err = bpf_dynptr_write(&key.name, 0, (void *)udevd_name, sizeof(udevd_name), 0); + if (err) { + bpf_ringbuf_discard_dynptr(&key.name, 0); + err = 5; + goto out; + } + + err = bpf_map_update_elem(htab, &key, &new_value, BPF_NOEXIST); + bpf_ringbuf_discard_dynptr(&key.name, 0); + if (err != -EEXIST) { + err = 6; + goto out; + } + + /* Lookup it again */ + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(udevd_name), 0, &key.name); + value = bpf_map_lookup_elem(htab, &key); + if (!value) { + err = 7; + goto out; + } + if (*value != 42) { + err = 8; + goto out; + } + + /* Delete then lookup it */ + bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(udevd_name), 0, &key.name); + err = bpf_dynptr_write(&key.name, 0, (void *)udevd_name, sizeof(udevd_name), 0); + if (err) { + 
bpf_ringbuf_discard_dynptr(&key.name, 0); + err = 9; + goto out; + } + err = bpf_map_delete_elem(htab, &key); + bpf_ringbuf_discard_dynptr(&key.name, 0); + if (err) { + err = 10; + goto out; + } + + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(udevd_name), 0, &key.name); + value = bpf_map_lookup_elem(htab, &key); + if (value) { + err = 10; + goto out; + } +out: + return err; +} + +static int test_nested_dynptr_key_htab(struct bpf_map *htab) +{ + unsigned long new_value, *value; + struct nested_dynptr_key key; + int err = 0; + + __builtin_memset(&key, 0, sizeof(key)); + key.f_1 = 1; + key.f_2.id = 2; + key.f_3 = 3; + + /* Lookup a existent key */ + __builtin_memcpy(dynptr_buf[0], systemd_name, sizeof(systemd_name)); + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(systemd_name), 0, &key.f_2.name); + value = bpf_map_lookup_elem(htab, &key); + if (!value) { + err = 1; + goto out; + } + if (*value != 100) { + err = 2; + goto out; + } + + /* Look up a non-existent key */ + __builtin_memcpy(dynptr_buf[0], rcu_sched_name, sizeof(rcu_sched_name)); + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(rcu_sched_name), 0, &key.f_2.name); + value = bpf_map_lookup_elem(htab, &key); + if (value) { + err = 3; + goto out; + } + + /* Insert a new key */ + new_value = 42; + err = bpf_map_update_elem(htab, &key, &new_value, BPF_NOEXIST); + if (err) { + err = 4; + goto out; + } + + /* Insert an existent key */ + bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(rcu_sched_name), 0, &key.f_2.name); + err = bpf_dynptr_write(&key.f_2.name, 0, (void *)rcu_sched_name, sizeof(rcu_sched_name), 0); + if (err) { + bpf_ringbuf_discard_dynptr(&key.f_2.name, 0); + err = 5; + goto out; + } + err = bpf_map_update_elem(htab, &key, &new_value, BPF_NOEXIST); + bpf_ringbuf_discard_dynptr(&key.f_2.name, 0); + if (err != -EEXIST) { + err = 6; + goto out; + } + + /* Lookup a non-existent key */ + bpf_dynptr_from_mem(dynptr_buf[0], sizeof(rcu_sched_name), 0, &key.f_2.name); + key.f_3 = 0; + value = bpf_map_lookup_elem(htab, &key); + if (value) { + err = 7; + goto out; + } + + /* Lookup an existent key */ + key.f_3 = 3; + value = bpf_map_lookup_elem(htab, &key); + if (!value) { + err = 8; + goto out; + } + if (*value != 42) { + err = 9; + goto out; + } + + /* Delete the newly-inserted key */ + bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(systemd_name), 0, &key.f_2.name); + err = bpf_dynptr_write(&key.f_2.name, 0, (void *)systemd_name, sizeof(systemd_name), 0); + if (err) { + bpf_ringbuf_discard_dynptr(&key.f_2.name, 0); + err = 10; + goto out; + } + err = bpf_map_delete_elem(htab, &key); + if (err) { + bpf_ringbuf_discard_dynptr(&key.f_2.name, 0); + err = 11; + goto out; + } + + /* Lookup it again */ + value = bpf_map_lookup_elem(htab, &key); + bpf_ringbuf_discard_dynptr(&key.f_2.name, 0); + if (value) { + err = 12; + goto out; + } +out: + return err; +} + +SEC("?raw_tp") +int BPF_PROG(pure_dynptr_key) +{ + int err; + + err = test_pure_dynptr_key_htab((struct bpf_map *)&htab_1); + err |= test_pure_dynptr_key_htab((struct bpf_map *)&htab_2) << 8; + + return err; +} + +SEC("?raw_tp") +int BPF_PROG(mixed_dynptr_key) +{ + return test_mixed_dynptr_key_htab((struct bpf_map *)&htab_3); +} + +SEC("?raw_tp") +int BPF_PROG(nested_dynptr_key) +{ + return test_nested_dynptr_key_htab((struct bpf_map *)&htab_4); +}