In the secure kexec_file_load case, the buffer which holds the kernel
image must not be accessible from userspace.

Typically, BPF data flow occurs between user space and kernel space in
either direction.  However, kexec_file_load presents a unique case where
user-originated data must be parsed and then forwarded to the kernel for
subsequent parsing stages.  This necessitates a mechanism to channel the
intermediate data from the BPF program directly to the kernel.

bpf_kexec_carrier() is introduced to serve that purpose.

Signed-off-by: Pingfan Liu <pi...@redhat.com>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: John Fastabend <john.fastab...@gmail.com>
Cc: Andrii Nakryiko <and...@kernel.org>
Cc: Martin KaFai Lau <martin....@linux.dev>
Cc: Eduard Zingerman <eddy...@gmail.com>
Cc: Song Liu <s...@kernel.org>
Cc: Yonghong Song <yonghong.s...@linux.dev>
Cc: KP Singh <kpsi...@kernel.org>
Cc: Stanislav Fomichev <s...@fomichev.me>
Cc: Hao Luo <hao...@google.com>
Cc: Jiri Olsa <jo...@kernel.org>
To: b...@vger.kernel.org
---
 include/linux/bpf.h          |  42 +++++++
 kernel/bpf/Makefile          |   3 +
 kernel/bpf/helpers.c         |   4 +
 kernel/bpf/helpers_carrier.c | 215 +++++++++++++++++++++++++++++++++++
 4 files changed, 264 insertions(+)
 create mode 100644 kernel/bpf/helpers_carrier.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cc700925b802f..f40e14eb63178 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3703,4 +3703,46 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, 
unsigned long ip, const char *
                           const char **linep, int *nump);
 struct bpf_prog *bpf_prog_find_from_stack(void);
 
+/* How mem_range_result::buf was allocated; determines how it is freed */
+enum alloc_type {
+       TYPE_KALLOC,
+       TYPE_VMALLOC,
+       TYPE_VMAP,
+};
+
+/*
+ * Refcounted container for a kernel-private copy of data produced by a
+ * BPF program.  Freed when the last reference is dropped via
+ * mem_range_result_put().
+ */
+struct mem_range_result {
+       struct kref ref;
+       char *buf;
+       /* capacity of @buf / number of valid bytes in it */
+       uint32_t buf_sz;
+       uint32_t data_sz;
+       /* kmalloc-ed, vmalloc-ed, or vmap-ed */
+       enum alloc_type alloc_type;
+       /* Valid if vmap-ed */
+       struct page **pages;
+       unsigned int pg_cnt;
+       int status;
+       /* memory cgroup the buffer allocation was charged to */
+       struct mem_cgroup *memcg;
+};
+
+struct mem_range_result *mem_range_result_alloc(void);
+void mem_range_result_get(struct mem_range_result *r);
+void mem_range_result_put(struct mem_range_result *r);
+
+__bpf_kfunc int bpf_mem_range_result_put(struct mem_range_result *result);
+__bpf_kfunc int bpf_copy_to_kernel(const char *name, char *buf, int size);
+
+typedef int (*resource_handler)(const char *name, struct mem_range_result *r);
+
+/* A named in-kernel consumer of buffers passed via bpf_copy_to_kernel() */
+struct carrier_listener {
+       struct hlist_node node;
+       /* hash key; must stay valid while the listener is registered */
+       char *name;
+       /* invoked with the copied data, inside an SRCU read-side section */
+       resource_handler handler;
+       /*
+        * bpf_copy_to_kernel() knows the size in advance, so vmap-ed is not
+        * supported.
+        */
+       enum alloc_type alloc_type;
+};
+
+int register_carrier_listener(struct carrier_listener *listener);
+int unregister_carrier_listener(char *str);
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 269c04a246640..3912ed4300472 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -56,6 +56,9 @@ obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
 ifeq ($(CONFIG_DMA_SHARED_BUFFER),y)
 obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o
 endif
+ifeq ($(CONFIG_KEXEC_PE_IMAGE),y)
+obj-$(CONFIG_BPF_SYSCALL) += helpers_carrier.o
+endif
 
 CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68c..bd83ec9a2b2a6 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3709,6 +3709,10 @@ BTF_KFUNCS_START(generic_btf_ids)
 #ifdef CONFIG_CRASH_DUMP
 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 #endif
+#ifdef CONFIG_KEXEC_PE_IMAGE
+BTF_ID_FLAGS(func, bpf_mem_range_result_put, KF_RELEASE | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_to_kernel, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+#endif
 BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
diff --git a/kernel/bpf/helpers_carrier.c b/kernel/bpf/helpers_carrier.c
new file mode 100644
index 0000000000000..7af4ef07ce750
--- /dev/null
+++ b/kernel/bpf/helpers_carrier.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+DEFINE_STATIC_SRCU(srcu);
+static DEFINE_MUTEX(carrier_listeners_mutex);
+static DEFINE_HASHTABLE(carrier_listeners, 8);
+
+/*
+ * Look up a registered listener by name.
+ *
+ * Callers must hold either carrier_listeners_mutex (register/unregister
+ * paths) or the SRCU read lock (bpf_copy_to_kernel) so the returned item
+ * cannot be freed while in use.
+ */
+static struct carrier_listener *find_listener(const char *str)
+{
+       struct carrier_listener *item;
+       unsigned int hash = jhash(str, strlen(str), 0);
+
+       hash_for_each_possible_rcu(carrier_listeners, item, node, hash) {
+               if (strcmp(item->name, str) == 0)
+                       return item;
+       }
+       return NULL;
+}
+
+/* Final kref release callback: free the carried buffer, then the result */
+static void __mem_range_result_free(struct kref *kref)
+{
+       struct mem_range_result *result = container_of(kref, struct mem_range_result, ref);
+       struct mem_cgroup *memcg, *old_memcg;
+
+       /* vunmap() is blocking */
+       might_sleep();
+       /* Uncharge against the memcg the buffer was originally charged to */
+       memcg = result->memcg;
+       old_memcg = set_active_memcg(memcg);
+       if (result->buf) {
+               switch (result->alloc_type) {
+               case TYPE_KALLOC:
+                       kfree(result->buf);
+                       break;
+               case TYPE_VMALLOC:
+                       vfree(result->buf);
+                       break;
+               case TYPE_VMAP:
+                       vunmap(result->buf);
+                       for (unsigned int i = 0; i < result->pg_cnt; i++)
+                               __free_pages(result->pages[i], 0);
+                       vfree(result->pages);
+                       break;
+               }
+       }
+       kfree(result);
+       set_active_memcg(old_memcg);
+       mem_cgroup_put(memcg);
+}
+
+/*
+ * Allocate a zeroed mem_range_result with an initial reference.
+ *
+ * Zero-initialisation matters: the release path inspects ->buf,
+ * ->alloc_type, ->pages and ->memcg, which stay unset if a caller
+ * errors out before fully populating the result.
+ */
+struct mem_range_result *mem_range_result_alloc(void)
+{
+       struct mem_range_result *range;
+
+       range = kzalloc(sizeof(*range), GFP_KERNEL);
+       if (!range)
+               return NULL;
+       kref_init(&range->ref);
+       return range;
+}
+
+/* Take an extra reference on @r; a NULL @r is tolerated as a no-op */
+void mem_range_result_get(struct mem_range_result *r)
+{
+       if (!r)
+               return;
+       kref_get(&r->ref);
+}
+
+/*
+ * Drop a reference on @r; the final put frees the buffer and the result.
+ * May sleep: the release path can call vunmap(), hence might_sleep().
+ */
+void mem_range_result_put(struct mem_range_result *r)
+{
+       might_sleep();
+       if (!r)
+               return;
+       kref_put(&r->ref, __mem_range_result_free);
+}
+
+/* BPF kfunc (KF_RELEASE): drop the caller's reference on @result */
+__bpf_kfunc int bpf_mem_range_result_put(struct mem_range_result *result)
+{
+       mem_range_result_put(result);
+       return 0;
+}
+
+/*
+ * Cache the content in @buf into a kernel-private buffer and hand it to
+ * the listener registered under @name.
+ *
+ * The copy is charged to the current memory cgroup.  Returns 0 on
+ * success, -EINVAL for an unknown listener, an unsupported allocation
+ * type or a failed copy, -ENOMEM on allocation failure, or the
+ * handler's own error code.
+ */
+__bpf_kfunc int bpf_copy_to_kernel(const char *name, char *buf, int size)
+{
+       struct mem_range_result *range;
+       struct mem_cgroup *memcg, *old_memcg;
+       struct carrier_listener *item;
+       resource_handler handler;
+       enum alloc_type alloc_type;
+       char *kbuf;
+       int id, ret = 0;
+
+       /*
+        * The SRCU read-side section ensures no use of @item after free and
+        * that unregister_carrier_listener() waits for any in-flight handler.
+        */
+       id = srcu_read_lock(&srcu);
+       item = find_listener(name);
+       if (!item) {
+               srcu_read_unlock(&srcu, id);
+               return -EINVAL;
+       }
+       alloc_type = item->alloc_type;
+       handler = item->handler;
+       memcg = get_mem_cgroup_from_current();
+       old_memcg = set_active_memcg(memcg);
+       range = mem_range_result_alloc();
+       if (!range) {
+               pr_err("fail to allocate mem_range_result\n");
+               ret = -ENOMEM;
+               goto out_unlock;
+       }
+
+       switch (alloc_type) {
+       case TYPE_KALLOC:
+               kbuf = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
+               break;
+       case TYPE_VMALLOC:
+               kbuf = __vmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
+               break;
+       default:
+               /* vmap-ed is rejected at registration; be defensive anyway */
+               ret = -EINVAL;
+               goto out_range;
+       }
+       if (!kbuf) {
+               ret = -ENOMEM;
+               goto out_range;
+       }
+       ret = copy_from_kernel_nofault(kbuf, buf, size);
+       if (unlikely(ret < 0)) {
+               /* range->alloc_type is not assigned yet; use the local copy */
+               if (alloc_type == TYPE_KALLOC)
+                       kfree(kbuf);
+               else
+                       vfree(kbuf);
+               ret = -EINVAL;
+               goto out_range;
+       }
+       range->buf = kbuf;
+       range->buf_sz = size;
+       range->data_sz = size;
+       range->memcg = memcg;
+       /* The result keeps its own memcg reference, dropped on final put */
+       mem_cgroup_tryget(memcg);
+       range->status = 0;
+       range->alloc_type = alloc_type;
+       /* The handler runs inside the SRCU section; see the note above */
+       ret = handler(name, range);
+       mem_range_result_put(range);
+       goto out_unlock;
+out_range:
+       kfree(range);
+out_unlock:
+       /* srcu_read_unlock() exactly once on every exit path */
+       srcu_read_unlock(&srcu, id);
+       set_active_memcg(old_memcg);
+       mem_cgroup_put(memcg);
+       return ret;
+}
+
+/**
+ * register_carrier_listener - register a named sink for bpf_copy_to_kernel()
+ * @listener: listener with @name, @handler and @alloc_type filled in
+ *
+ * TYPE_VMAP is rejected: bpf_copy_to_kernel() knows the size in advance,
+ * so only kmalloc-ed and vmalloc-ed buffers are supported.
+ *
+ * Return: 0 on success, -EINVAL on an invalid listener, -EBUSY if the
+ * name is already registered.
+ */
+int register_carrier_listener(struct carrier_listener *listener)
+{
+       unsigned int hash;
+       int ret = 0;
+       char *str = listener->name;
+
+       /* Not support vmap-ed */
+       if (listener->alloc_type > TYPE_VMALLOC)
+               return -EINVAL;
+       if (!str)
+               return -EINVAL;
+       hash = jhash(str, strlen(str), 0);
+       mutex_lock(&carrier_listeners_mutex);
+       if (!find_listener(str))
+               hash_add_rcu(carrier_listeners, &listener->node, hash);
+       else
+               ret = -EBUSY;
+       mutex_unlock(&carrier_listeners_mutex);
+
+       return ret;
+}
+EXPORT_SYMBOL(register_carrier_listener);
+
+/**
+ * unregister_carrier_listener - remove a listener registered under @str
+ * @str: name the listener was registered with
+ *
+ * Blocks until any in-flight handler has finished (via synchronize_srcu()),
+ * so the caller may free the listener immediately after return.
+ *
+ * Return: 0 on success, -EINVAL if no listener matches @str.
+ */
+int unregister_carrier_listener(char *str)
+{
+       struct carrier_listener *item;
+       int ret = 0;
+
+       mutex_lock(&carrier_listeners_mutex);
+       item = find_listener(str);
+       if (!!item) {
+               hash_del_rcu(&item->node);
+               /*
+                * It also waits on in-flight handler. Refer to note on the read
+                * side
+                */
+               synchronize_srcu(&srcu);
+       } else {
+               ret = -EINVAL;
+       }
+       mutex_unlock(&carrier_listeners_mutex);
+
+       return ret;
+}
+EXPORT_SYMBOL(unregister_carrier_listener);
+
-- 
2.49.0

Reply via email to