From a5969cabbb4660eab42b6ef0412cbbd1200cf14d Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Sat, 12 Oct 2024 07:10:09 +0000
Subject: [PATCH] 修改led为gpio
---
kernel/kernel/bpf/hashtab.c | 857 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 773 insertions(+), 84 deletions(-)
diff --git a/kernel/kernel/bpf/hashtab.c b/kernel/kernel/bpf/hashtab.c
index 3f3ed33..0ce445a 100644
--- a/kernel/kernel/bpf/hashtab.c
+++ b/kernel/kernel/bpf/hashtab.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/btf.h>
@@ -17,17 +9,81 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
- BPF_F_RDONLY | BPF_F_WRONLY)
+ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
+#define BATCH_OPS(_name) \
+ .map_lookup_batch = \
+ _name##_map_lookup_batch, \
+ .map_lookup_and_delete_batch = \
+ _name##_map_lookup_and_delete_batch, \
+ .map_update_batch = \
+ generic_map_update_batch, \
+ .map_delete_batch = \
+ generic_map_delete_batch
+
+/*
+ * The bucket lock has two protection scopes:
+ *
+ * 1) Serializing concurrent operations from BPF programs on differrent
+ * CPUs
+ *
+ * 2) Serializing concurrent operations from BPF programs and sys_bpf()
+ *
+ * BPF programs can execute in any context including perf, kprobes and
+ * tracing. As there are almost no limits where perf, kprobes and tracing
+ * can be invoked from the lock operations need to be protected against
+ * deadlocks. Deadlocks can be caused by recursion and by an invocation in
+ * the lock held section when functions which acquire this lock are invoked
+ * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
+ * variable bpf_prog_active, which prevents BPF programs attached to perf
+ * events, kprobes and tracing to be invoked before the prior invocation
+ * from one of these contexts completed. sys_bpf() uses the same mechanism
+ * by pinning the task to the current CPU and incrementing the recursion
+ * protection accross the map operation.
+ *
+ * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
+ * operations like memory allocations (even with GFP_ATOMIC) from atomic
+ * contexts. This is required because even with GFP_ATOMIC the memory
+ * allocator calls into code pathes which acquire locks with long held lock
+ * sections. To ensure the deterministic behaviour these locks are regular
+ * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
+ * true atomic contexts on an RT kernel are the low level hardware
+ * handling, scheduling, low level interrupt handling, NMIs etc. None of
+ * these contexts should ever do memory allocations.
+ *
+ * As regular device interrupt handlers and soft interrupts are forced into
+ * thread context, the existing code which does
+ * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
+ * just works.
+ *
+ * In theory the BPF locks could be converted to regular spinlocks as well,
+ * but the bucket locks and percpu_freelist locks can be taken from
+ * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
+ * atomic contexts even on RT. These mechanisms require preallocated maps,
+ * so there is no need to invoke memory allocations within the lock held
+ * sections.
+ *
+ * BPF maps which need dynamic allocation are only used from (forced)
+ * thread context on RT and can therefore use regular spinlocks which in
+ * turn allows to invoke memory allocations from the lock held section.
+ *
+ * On a non RT kernel this distinction is neither possible nor required.
+ * spinlock maps to raw_spinlock and the extra code is optimized out by the
+ * compiler.
+ */
struct bucket {
struct hlist_nulls_head head;
- raw_spinlock_t lock;
+ union {
+ raw_spinlock_t raw_lock;
+ spinlock_t lock;
+ };
};
struct bpf_htab {
@@ -54,6 +110,7 @@
union {
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
+ struct htab_elem *batch_flink;
};
};
};
@@ -62,8 +119,53 @@
struct bpf_lru_node lru_node;
};
u32 hash;
- char key[0] __aligned(8);
+ char key[] __aligned(8);
};
+
+static inline bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+ return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
+{
+ return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
+}
+
+static void htab_init_buckets(struct bpf_htab *htab)
+{
+ unsigned i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
+ if (htab_use_raw_lock(htab))
+ raw_spin_lock_init(&htab->buckets[i].raw_lock);
+ else
+ spin_lock_init(&htab->buckets[i].lock);
+ }
+}
+
+static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
+ struct bucket *b)
+{
+ unsigned long flags;
+
+ if (htab_use_raw_lock(htab))
+ raw_spin_lock_irqsave(&b->raw_lock, flags);
+ else
+ spin_lock_irqsave(&b->lock, flags);
+ return flags;
+}
+
+static inline void htab_unlock_bucket(const struct bpf_htab *htab,
+ struct bucket *b,
+ unsigned long flags)
+{
+ if (htab_use_raw_lock(htab))
+ raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+ else
+ spin_unlock_irqrestore(&b->lock, flags);
+}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -77,11 +179,6 @@
{
return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
-}
-
-static bool htab_is_prealloc(const struct bpf_htab *htab)
-{
- return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
@@ -124,6 +221,17 @@
bpf_map_area_free(htab->elems);
}
+/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
+ * (bucket_lock). If both locks need to be acquired together, the lock
+ * order is always lru_lock -> bucket_lock and this only happens in
+ * bpf_lru_list.c logic. For example, certain code path of
+ * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
+ * will acquire lru_lock first followed by acquiring bucket_lock.
+ *
+ * In hashtab.c, to avoid deadlock, lock acquisition of
+ * bucket_lock followed by lru_lock is not allowed. In such cases,
+ * bucket_lock needs to be released first before acquiring lru_lock.
+ */
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
u32 hash)
{
@@ -244,6 +352,7 @@
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
@@ -251,14 +360,18 @@
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !capable(CAP_SYS_ADMIN))
+ if (lru && !bpf_capable())
/* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ * maps. Hence, limit to CAP_BPF.
*/
return -EPERM;
- if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
- /* reserved bits should not be used */
+ if (zero_seed && !capable(CAP_SYS_ADMIN))
+ /* Guard against local DoS, and discourage production use. */
+ return -EPERM;
+
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
return -EINVAL;
if (!lru && percpu_lru)
@@ -309,8 +422,8 @@
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
- int err, i;
u64 cost;
+ int err;
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
@@ -355,14 +468,8 @@
else
cost += (u64) htab->elem_size * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- /* make sure page count doesn't overflow */
- goto free_htab;
-
- htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* if map size is larger than memlock limit, reject it early */
- err = bpf_map_precharge_memlock(htab->map.pages);
+ /* if map size is larger than memlock limit, reject it */
+ err = bpf_map_charge_init(&htab->map.memory, cost);
if (err)
goto free_htab;
@@ -371,13 +478,14 @@
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
- goto free_htab;
+ goto free_charge;
- htab->hashrnd = get_random_int();
- for (i = 0; i < htab->n_buckets; i++) {
- INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- raw_spin_lock_init(&htab->buckets[i].lock);
- }
+ if (htab->map.map_flags & BPF_F_ZERO_SEED)
+ htab->hashrnd = 0;
+ else
+ htab->hashrnd = get_random_int();
+
+ htab_init_buckets(htab);
if (prealloc) {
err = prealloc_init(htab);
@@ -400,6 +508,8 @@
prealloc_destroy(htab);
free_buckets:
bpf_map_area_free(htab->buckets);
+free_charge:
+ bpf_map_charge_finish(&htab->map.memory);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -468,8 +578,7 @@
struct htab_elem *l;
u32 hash, key_size;
- /* Must be called with rcu_read_lock. */
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -503,7 +612,7 @@
* bpf_prog
* __htab_map_lookup_elem
*/
-static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
@@ -542,7 +651,7 @@
return __htab_lru_map_lookup_elem(map, key, false);
}
-static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+static int htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
@@ -583,7 +692,7 @@
b = __select_bucket(htab, tgt_l->hash);
head = &b->head;
- raw_spin_lock_irqsave(&b->lock, flags);
+ flags = htab_lock_bucket(htab, b);
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
@@ -591,7 +700,7 @@
break;
}
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, flags);
return l == tgt_l;
}
@@ -712,19 +821,36 @@
}
}
+static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ /* When using prealloc and not setting the initial value on all cpus,
+ * zero-fill element values for other cpus (just as what happens when
+ * not using prealloc). Otherwise, bpf program has no way to ensure
+ * known initial values for cpus other than current one
+ * (onallcpus=false always when coming from bpf prog).
+ */
+ if (htab_is_prealloc(htab) && !onallcpus) {
+ u32 size = round_up(htab->map.value_size, 8);
+ int current_cpu = raw_smp_processor_id();
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == current_cpu)
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
+ size);
+ else
+ memset(per_cpu_ptr(pptr, cpu), 0, size);
+ }
+ } else {
+ pcpu_copy_value(htab, pptr, value, onallcpus);
+ }
+}
+
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
BITS_PER_LONG == 64;
-}
-
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
- u32 size = htab->map.value_size;
-
- if (percpu || fd_htab_map_needs_adjust(htab))
- size = round_up(size, 8);
- return size;
}
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
@@ -732,7 +858,7 @@
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab_size_value(htab, percpu);
+ u32 size = htab->map.value_size;
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -771,10 +897,13 @@
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ check_and_init_map_lock(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
if (percpu) {
+ size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -788,12 +917,17 @@
}
}
- pcpu_copy_value(htab, pptr, value, onallcpus);
+ pcpu_init_value(htab, pptr, value, onallcpus);
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
- } else {
+ } else if (fd_htab_map_needs_adjust(htab)) {
+ size = round_up(size, 8);
memcpy(l_new->key + round_up(key_size, 8), value, size);
+ } else {
+ copy_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8),
+ value);
}
l_new->hash = hash;
@@ -806,11 +940,11 @@
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
u64 map_flags)
{
- if (l_old && map_flags == BPF_NOEXIST)
+ if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
/* elem already exists */
return -EEXIST;
- if (!l_old && map_flags == BPF_EXIST)
+ if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
/* elem doesn't exist, cannot update it */
return -ENOENT;
@@ -829,11 +963,11 @@
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -842,14 +976,49 @@
b = __select_bucket(htab, hash);
head = &b->head;
- /* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&b->lock, flags);
+ if (unlikely(map_flags & BPF_F_LOCK)) {
+ if (unlikely(!map_value_has_spin_lock(map)))
+ return -EINVAL;
+ /* find an element without taking the bucket lock */
+ l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
+ htab->n_buckets);
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ return ret;
+ if (l_old) {
+ /* grab the element lock and update value in place */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ return 0;
+ }
+ /* fall through, grab the bucket lock and lookup again.
+ * 99.9% chance that the element won't be found,
+ * but second lookup under lock has to be done.
+ */
+ }
+
+ flags = htab_lock_bucket(htab, b);
l_old = lookup_elem_raw(head, hash, key, key_size);
ret = check_flags(htab, l_old, map_flags);
if (ret)
goto err;
+
+ if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
+ /* first lookup without the bucket lock didn't find the element,
+ * but second lookup with the bucket lock found it.
+ * This case is highly unlikely, but has to be dealt with:
+ * grab the element lock in addition to the bucket lock
+ * and update element in place
+ */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ ret = 0;
+ goto err;
+ }
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
l_old);
@@ -870,7 +1039,7 @@
}
ret = 0;
err:
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, flags);
return ret;
}
@@ -889,7 +1058,7 @@
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -908,8 +1077,7 @@
return -ENOMEM;
memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
- /* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&b->lock, flags);
+ flags = htab_lock_bucket(htab, b);
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -928,7 +1096,7 @@
ret = 0;
err:
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, flags);
if (ret)
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
@@ -963,8 +1131,7 @@
b = __select_bucket(htab, hash);
head = &b->head;
- /* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&b->lock, flags);
+ flags = htab_lock_bucket(htab, b);
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -987,7 +1154,7 @@
}
ret = 0;
err:
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, flags);
return ret;
}
@@ -1027,8 +1194,7 @@
return -ENOMEM;
}
- /* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&b->lock, flags);
+ flags = htab_lock_bucket(htab, b);
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1043,14 +1209,14 @@
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
value, onallcpus);
} else {
- pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
+ pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
value, onallcpus);
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
l_new = NULL;
}
ret = 0;
err:
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, flags);
if (l_new)
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
return ret;
@@ -1080,7 +1246,7 @@
u32 hash, key_size;
int ret = -ENOENT;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1088,7 +1254,7 @@
b = __select_bucket(htab, hash);
head = &b->head;
- raw_spin_lock_irqsave(&b->lock, flags);
+ flags = htab_lock_bucket(htab, b);
l = lookup_elem_raw(head, hash, key, key_size);
@@ -1098,7 +1264,7 @@
ret = 0;
}
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, flags);
return ret;
}
@@ -1112,7 +1278,7 @@
u32 hash, key_size;
int ret = -ENOENT;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1120,7 +1286,7 @@
b = __select_bucket(htab, hash);
head = &b->head;
- raw_spin_lock_irqsave(&b->lock, flags);
+ flags = htab_lock_bucket(htab, b);
l = lookup_elem_raw(head, hash, key, key_size);
@@ -1129,7 +1295,7 @@
ret = 0;
}
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, flags);
if (l)
bpf_lru_push_free(&htab->lru, &l->lru_node);
return ret;
@@ -1156,12 +1322,10 @@
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete
+ /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
+ * bpf_free_used_maps() is called after bpf prog is no longer executing.
+ * There is no need to synchronize_rcu() here to protect map elements.
*/
- synchronize_rcu();
/* some of free_htab_elem() callbacks for elements of this map may
* not have executed. Wait for them.
@@ -1198,7 +1362,476 @@
rcu_read_unlock();
}
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete, bool is_lru_map,
+ bool is_percpu)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
+ void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+ void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+ void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+ void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ u32 batch, max_count, size, bucket_size;
+ struct htab_elem *node_to_free = NULL;
+ u64 elem_map_flags, map_flags;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ unsigned long flags = 0;
+ bool locked = false;
+ struct htab_elem *l;
+ struct bucket *b;
+ int ret = 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ if ((elem_map_flags & ~BPF_F_LOCK) ||
+ ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ batch = 0;
+ if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
+ return -EFAULT;
+
+ if (batch >= htab->n_buckets)
+ return -ENOENT;
+
+ key_size = htab->map.key_size;
+ roundup_key_size = round_up(htab->map.key_size, 8);
+ value_size = htab->map.value_size;
+ size = round_up(value_size, 8);
+ if (is_percpu)
+ value_size = size * num_possible_cpus();
+ total = 0;
+ /* while experimenting with hash tables with sizes ranging from 10 to
+ * 1000, it was observed that a bucket can have upto 5 entries.
+ */
+ bucket_size = 5;
+
+alloc:
+ /* We cannot do copy_from_user or copy_to_user inside
+ * the rcu_read_lock. Allocate enough space here.
+ */
+ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ if (!keys || !values) {
+ ret = -ENOMEM;
+ goto after_loop;
+ }
+
+again:
+ bpf_disable_instrumentation();
+ rcu_read_lock();
+again_nocopy:
+ dst_key = keys;
+ dst_val = values;
+ b = &htab->buckets[batch];
+ head = &b->head;
+ /* do not grab the lock unless need it (bucket_cnt > 0). */
+ if (locked)
+ flags = htab_lock_bucket(htab, b);
+
+ bucket_cnt = 0;
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ bucket_cnt++;
+
+ if (bucket_cnt && !locked) {
+ locked = true;
+ goto again_nocopy;
+ }
+
+ if (bucket_cnt > (max_count - total)) {
+ if (total == 0)
+ ret = -ENOSPC;
+ /* Note that since bucket_cnt > 0 here, it is implicit
+ * that the locked was grabbed, so release it.
+ */
+ htab_unlock_bucket(htab, b, flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
+
+ if (bucket_cnt > bucket_size) {
+ bucket_size = bucket_cnt;
+ /* Note that since bucket_cnt > 0 here, it is implicit
+ * that the locked was grabbed, so release it.
+ */
+ htab_unlock_bucket(htab, b, flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ kvfree(keys);
+ kvfree(values);
+ goto alloc;
+ }
+
+ /* Next block is only safe to run if you have grabbed the lock */
+ if (!locked)
+ goto next_batch;
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ memcpy(dst_key, l->key, key_size);
+
+ if (is_percpu) {
+ int off = 0, cpu;
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(dst_val + off,
+ per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ } else {
+ value = l->key + roundup_key_size;
+ if (elem_map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, dst_val, value,
+ true);
+ else
+ copy_map_value(map, dst_val, value);
+ check_and_init_map_lock(map, dst_val);
+ }
+ if (do_delete) {
+ hlist_nulls_del_rcu(&l->hash_node);
+
+ /* bpf_lru_push_free() will acquire lru_lock, which
+ * may cause deadlock. See comments in function
+ * prealloc_lru_pop(). Let us do bpf_lru_push_free()
+ * after releasing the bucket lock.
+ */
+ if (is_lru_map) {
+ l->batch_flink = node_to_free;
+ node_to_free = l;
+ } else {
+ free_htab_elem(htab, l);
+ }
+ }
+ dst_key += key_size;
+ dst_val += value_size;
+ }
+
+ htab_unlock_bucket(htab, b, flags);
+ locked = false;
+
+ while (node_to_free) {
+ l = node_to_free;
+ node_to_free = node_to_free->batch_flink;
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ }
+
+next_batch:
+ /* If we are not copying data, we can go to next bucket and avoid
+ * unlocking the rcu.
+ */
+ if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
+ batch++;
+ goto again_nocopy;
+ }
+
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
+ key_size * bucket_cnt) ||
+ copy_to_user(uvalues + total * value_size, values,
+ value_size * bucket_cnt))) {
+ ret = -EFAULT;
+ goto after_loop;
+ }
+
+ total += bucket_cnt;
+ batch++;
+ if (batch >= htab->n_buckets) {
+ ret = -ENOENT;
+ goto after_loop;
+ }
+ goto again;
+
+after_loop:
+ if (ret == -EFAULT)
+ goto out;
+
+ /* copy # of entries and next batch */
+ ubatch = u64_to_user_ptr(attr->batch.out_batch);
+ if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
+ put_user(total, &uattr->batch.count))
+ ret = -EFAULT;
+
+out:
+ kvfree(keys);
+ kvfree(values);
+ return ret;
+}
+
+static int
+htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, true);
+}
+
+static int
+htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, true);
+}
+
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, false);
+}
+
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, false);
+}
+
+static int
+htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, true);
+}
+
+static int
+htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, true);
+}
+
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, false);
+}
+
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, false);
+}
+
+struct bpf_iter_seq_hash_map_info {
+ struct bpf_map *map;
+ struct bpf_htab *htab;
+ void *percpu_value_buf; // non-zero means percpu hash
+ u32 bucket_id;
+ u32 skip_elems;
+};
+
+static struct htab_elem *
+bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
+ struct htab_elem *prev_elem)
+{
+ const struct bpf_htab *htab = info->htab;
+ u32 skip_elems = info->skip_elems;
+ u32 bucket_id = info->bucket_id;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ struct htab_elem *elem;
+ struct bucket *b;
+ u32 i, count;
+
+ if (bucket_id >= htab->n_buckets)
+ return NULL;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ /* no update/deletion on this bucket, prev_elem should be still valid
+ * and we won't skip elements.
+ */
+ n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
+ elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
+ if (elem)
+ return elem;
+
+ /* not found, unlock and go to the next bucket */
+ b = &htab->buckets[bucket_id++];
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ for (i = bucket_id; i < htab->n_buckets; i++) {
+ b = &htab->buckets[i];
+ rcu_read_lock();
+
+ count = 0;
+ head = &b->head;
+ hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ if (count >= skip_elems) {
+ info->bucket_id = i;
+ info->skip_elems = count;
+ return elem;
+ }
+ count++;
+ }
+
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ info->bucket_id = i;
+ info->skip_elems = 0;
+ return NULL;
+}
+
+static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+ struct htab_elem *elem;
+
+ elem = bpf_hash_map_seq_find_next(info, NULL);
+ if (!elem)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return elem;
+}
+
+static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+
+ ++*pos;
+ ++info->skip_elems;
+ return bpf_hash_map_seq_find_next(info, v);
+}
+
+static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+ u32 roundup_key_size, roundup_value_size;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_map *map = info->map;
+ struct bpf_iter_meta meta;
+ int ret = 0, off = 0, cpu;
+ struct bpf_prog *prog;
+ void __percpu *pptr;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, elem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ roundup_key_size = round_up(map->key_size, 8);
+ ctx.key = elem->key;
+ if (!info->percpu_value_buf) {
+ ctx.value = elem->key + roundup_key_size;
+ } else {
+ roundup_value_size = round_up(map->value_size, 8);
+ pptr = htab_elem_get_ptr(elem, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu),
+ roundup_value_size);
+ off += roundup_value_size;
+ }
+ ctx.value = info->percpu_value_buf;
+ }
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
+}
+
+static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_hash_map_seq_show(seq, v);
+}
+
+static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_hash_map_seq_show(seq, NULL);
+ else
+ rcu_read_unlock();
+}
+
+static int bpf_iter_init_hash_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+ struct bpf_map *map = aux->map;
+ void *value_buf;
+ u32 buf_size;
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+ if (!value_buf)
+ return -ENOMEM;
+
+ seq_info->percpu_value_buf = value_buf;
+ }
+
+ bpf_map_inc_with_uref(map);
+ seq_info->map = map;
+ seq_info->htab = container_of(map, struct bpf_htab, map);
+ return 0;
+}
+
+static void bpf_iter_fini_hash_map(void *priv_data)
+{
+ struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+
+ bpf_map_put_with_uref(seq_info->map);
+ kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_hash_map_seq_ops = {
+ .start = bpf_hash_map_seq_start,
+ .next = bpf_hash_map_seq_next,
+ .stop = bpf_hash_map_seq_stop,
+ .show = bpf_hash_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_hash_map_seq_ops,
+ .init_seq_private = bpf_iter_init_hash_map,
+ .fini_seq_private = bpf_iter_fini_hash_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
+};
+
+static int htab_map_btf_id;
const struct bpf_map_ops htab_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
@@ -1208,9 +1841,15 @@
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab),
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
+static int htab_lru_map_btf_id;
const struct bpf_map_ops htab_lru_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
@@ -1221,6 +1860,10 @@
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab_lru),
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_lru_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
/* Called from eBPF program */
@@ -1296,7 +1939,38 @@
return ret;
}
+static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ struct htab_elem *l;
+ void __percpu *pptr;
+ int cpu;
+
+ rcu_read_lock();
+
+ l = __htab_map_lookup_elem(map, key);
+ if (!l) {
+ rcu_read_unlock();
+ return;
+ }
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ seq_puts(m, ": {\n");
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "\tcpu%d: ", cpu);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ per_cpu_ptr(pptr, cpu), m);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "}\n");
+
+ rcu_read_unlock();
+}
+
+static int htab_percpu_map_btf_id;
const struct bpf_map_ops htab_percpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
@@ -1304,9 +1978,16 @@
.map_lookup_elem = htab_percpu_map_lookup_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_percpu),
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_percpu_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
+static int htab_lru_percpu_map_btf_id;
const struct bpf_map_ops htab_lru_percpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
@@ -1314,6 +1995,11 @@
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_lru_percpu),
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_lru_percpu_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
@@ -1412,7 +2098,7 @@
return READ_ONCE(*inner_map);
}
-static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+static int htab_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
@@ -1436,6 +2122,7 @@
fd_htab_map_free(map);
}
+static int htab_of_maps_map_btf_id;
const struct bpf_map_ops htab_of_maps_map_ops = {
.map_alloc_check = fd_htab_map_alloc_check,
.map_alloc = htab_of_map_alloc,
@@ -1448,4 +2135,6 @@
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_of_maps_map_btf_id,
};
--
Gitblit v1.6.2