Historically, binfmt_misc has matched binaries only by magic bytes or file
extension, and has always redirected a match to a fixed interpreter recorded
in the registration. This is insufficient when the match requires parsing the
binary header (e.g. inspecting ELF program headers) or when the interpreter
must be computed per-binary rather than hard-coded.
Introduce a new 'B' (BPF) handler type. The path of a pinned SOCKET_FILTER
program is given in the offset field; the magic, mask and interpreter fields
must be left empty:
echo ':name:B:<bpf_pinned_path>::::<flags>' \
> /proc/sys/fs/binfmt_misc/register
When a binary is executed, binfmt_misc runs the program with the
BINPRM_BUF_SIZE file-header buffer as context. Returning 1 selects the
handler; any other return value falls through to the remaining handlers.
Unlike magic/extension handlers, a 'B' handler carries no interpreter of its
own: the program chooses it via a new helper, bpf_binprm_set_interp(). This
lets the program compute the interpreter path however it sees fit (for example
relative to the binary). A 'B' handler is therefore a strict superset of the
existing magic handlers -- any of them can be expressed as a program that
matches on the header and sets a fixed interpreter.
bpf_binprm_set_interp() is exposed to SOCKET_FILTER programs and stashes the
chosen path in a per-CPU area that binfmt_misc reads back immediately after
the run, under migrate_disable(). The path is copied into a heap allocation
only when the program matched and set an interpreter.
Assisted-by: Gemini <[email protected]>
Signed-off-by: Farid Zakaria <[email protected]>
---
Hey Christian,
Thank you for the suggestion and discussion on the mailing list. I took a stab
at your idea of driving binfmt_misc interpreter selection from a BPF program
(hopefully this is what you had in mind). The other approach was also to use a
static interpreter during registration but this current approach feels right.
This prototype demonstrates binfmt_misc interpreter selection from a BPF program
so the interpreter can be *computed* (e.g. $ORIGIN-relative) rather than
recorded statically. It replaces my earlier "pluggable ELF interpreter loader
registry" RFC.
As this is a first draft, there are a few things you will probably have notes
on. I tried my best, but I am a novice here, so I welcome your feedback.
* The prototype uses the socket filter program type, which seemed the easiest
to wire up to start with. My guess is that it should be a dedicated type
(e.g. BPF_PROG_TYPE_BINFMT).
* bpf_binprm_set_interp() is a "classic helper", but that list
is frozen in favour of kfuncs. A kfunc needs BTF and can't be called from
the raw-bytecode selftest I wrote.
* The program only gets bprm->buf today. That is enough to match on the ELF
header but not to compute an $ORIGIN-relative path from the binary's
location, which needs bprm->filename.
* I have some selftests that exercise match + program-chosen interpreter. I
don't include them here yet, since they seem unnecessary while discussing the
idea.
To demo the functionality, I thought it would be neat to show how this is a
superset of, and can replace, all the qemu binfmt usage [1].
$ ./hello-aarch64;
-bash: /etc/binfmt-demo/hello-aarch64: cannot execute binary file: Exec format error
$ bpftool prog load filter.bpf.o /sys/fs/bpf/qemu type socket
$ echo ':qemu-bpf:B:/sys/fs/bpf/qemu::::P' > /proc/sys/fs/binfmt_misc/register
$ ./hello-aarch64
AARCH64_RAN_VIA_QEMU
All the per-CPU guards were recommended by AI.
[1] https://gist.github.com/fzakaria/bef27d2e21b0e36ffccda1cbf417b636
fs/binfmt_misc.c | 181 ++++++++++++++++++++++++++++++++++++---
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 1 +
net/core/filter.c | 8 ++
4 files changed, 180 insertions(+), 11 deletions(-)
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 84349fcb9..cf6698d59 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -29,6 +29,11 @@
#include <linux/fs.h>
#include <linux/uaccess.h>
+#ifdef CONFIG_BPF_SYSCALL
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#endif
+
#include "internal.h"
#ifdef DEBUG
@@ -41,12 +46,14 @@ enum {
VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
};
-enum {Enabled, Magic};
+enum {Enabled, Magic, Bpf};
#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
#define MISC_FMT_OPEN_BINARY (1UL << 30)
#define MISC_FMT_CREDENTIALS (1UL << 29)
#define MISC_FMT_OPEN_FILE (1UL << 28)
+struct bpf_prog;
+
typedef struct {
struct list_head list;
unsigned long flags; /* type, status, etc. */
@@ -59,6 +66,9 @@ typedef struct {
struct dentry *dentry;
struct file *interp_file;
refcount_t users; /* sync removal with load_misc_binary()
*/
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_prog *bpf_prog;
+#endif
} Node;
static struct file_system_type bm_fs_type;
@@ -78,10 +88,51 @@ static struct file_system_type bm_fs_type;
*/
#define MAX_REGISTER_LENGTH 1920
+#ifdef CONFIG_BPF_SYSCALL
+struct binfmt_bpf_interp {
+ char path[PATH_MAX]; /* NUL-terminated by bpf_binprm_set_interp() */
+ int len; /* bytes in path; < 0 if the current program set no interpreter */
+};
+static DEFINE_PER_CPU(struct binfmt_bpf_interp, binfmt_bpf_interp);
+
+/*
+ * bpf_binprm_set_interp - let a binfmt_misc 'B' program pick the interpreter.
+ * @path: interpreter path in BPF-accessible memory, not NUL-terminated
+ * @len: number of bytes in @path; must be non-zero and less than PATH_MAX
+ *
+ * The program computes the path however it sees fit. It is copied into this
+ * CPU's binfmt_bpf_interp slot and NUL-terminated; binfmt_misc reads the slot
+ * back right after the run, under migrate_disable(). Returns 0, or -EINVAL if
+ * @len is out of range. NOTE(review): confirm same-CPU callers cannot clobber.
+ */
+BPF_CALL_2(bpf_binprm_set_interp, const char *, path, u32, len)
+{
+ struct binfmt_bpf_interp *sc = this_cpu_ptr(&binfmt_bpf_interp);
+
+ if (len == 0 || len >= PATH_MAX)
+ return -EINVAL;
+ /* @path is @len bytes of BPF memory and is not NUL-terminated. */
+ memcpy(sc->path, path, len);
+ sc->path[len] = '\0';
+ sc->len = len;
+ return 0;
+}
+
+const struct bpf_func_proto bpf_binprm_set_interp_proto = {
+ .func = bpf_binprm_set_interp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER, /* 0 on success, -EINVAL on a bad @len */
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, /* @path */
+ .arg2_type = ARG_CONST_SIZE, /* @len: number of bytes in @path */
+};
+#endif /* CONFIG_BPF_SYSCALL */
+
/**
* search_binfmt_handler - search for a binary handler for @bprm
* @misc: handle to binfmt_misc instance
* @bprm: binary for which we are looking for a handler
+ * @bpf_interp: receives a kmalloc'd interpreter path if a 'B' program chooses
+ * one via bpf_binprm_set_interp(); the caller must kfree() it
*
* Search for a binary type handler for @bprm in the list of registered binary
* type handlers.
@@ -89,7 +140,8 @@ static struct file_system_type bm_fs_type;
* Return: binary type list entry on success, NULL on failure
*/
static Node *search_binfmt_handler(struct binfmt_misc *misc,
- struct linux_binprm *bprm)
+ struct linux_binprm *bprm,
+ char **bpf_interp)
{
char *p = strrchr(bprm->interp, '.');
Node *e;
@@ -103,6 +155,37 @@ static Node *search_binfmt_handler(struct binfmt_misc
*misc,
if (!test_bit(Enabled, &e->flags))
continue;
+ /* Do matching based on BPF if applicable. */
+ if (test_bit(Bpf, &e->flags)) {
+#ifdef CONFIG_BPF_SYSCALL
+ if (e->bpf_prog) {
+ struct binfmt_bpf_interp *sc;
+ u32 ret;
+
+ migrate_disable();
+ sc = this_cpu_ptr(&binfmt_bpf_interp);
+ sc->len = -1;
+
+ rcu_read_lock();
+ ret = bpf_prog_run(e->bpf_prog, bprm->buf);
+ rcu_read_unlock();
+
+ if (ret == 1 && sc->len > 0)
+ *bpf_interp = kmemdup_nul(sc->path,
+ sc->len,
+ GFP_ATOMIC);
+ migrate_enable();
+
+ pr_debug("binfmt_misc: ran BPF program for %s,
ret = %u\n",
+ bprm->filename, ret);
+
+ if (ret == 1)
+ return e;
+ }
+#endif
+ continue;
+ }
+
/* Do matching based on extension if applicable. */
if (!test_bit(Magic, &e->flags)) {
if (p && !strcmp(e->magic, p + 1))
@@ -139,12 +222,13 @@ static Node *search_binfmt_handler(struct binfmt_misc
*misc,
* Return: binary type list entry on success, NULL on failure
*/
static Node *get_binfmt_handler(struct binfmt_misc *misc,
- struct linux_binprm *bprm)
+ struct linux_binprm *bprm,
+ char **bpf_interp)
{
Node *e;
read_lock(&misc->entries_lock);
- e = search_binfmt_handler(misc, bprm);
+ e = search_binfmt_handler(misc, bprm, bpf_interp);
if (e)
refcount_inc(&e->users);
read_unlock(&misc->entries_lock);
@@ -164,6 +248,10 @@ static void put_binfmt_handler(Node *e)
if (refcount_dec_and_test(&e->users)) {
if (e->flags & MISC_FMT_OPEN_FILE)
filp_close(e->interp_file, NULL);
+#ifdef CONFIG_BPF_SYSCALL
+ if (test_bit(Bpf, &e->flags) && e->bpf_prog)
+ bpf_prog_put(e->bpf_prog);
+#endif
kfree(e);
}
}
@@ -206,15 +294,27 @@ static int load_misc_binary(struct linux_binprm *bprm)
struct file *interp_file = NULL;
int retval = -ENOEXEC;
struct binfmt_misc *misc;
+ char *bpf_interp __free(kfree) = NULL;
+ const char *interpreter;
misc = load_binfmt_misc();
if (!misc->enabled)
return retval;
- fmt = get_binfmt_handler(misc, bprm);
+ fmt = get_binfmt_handler(misc, bprm, &bpf_interp);
if (!fmt)
return retval;
+ /*
+ * A 'B' (BPF) handler carries no interpreter of its own; the program
+ * chooses it via bpf_binprm_set_interp(). Other handlers use the
+ * interpreter recorded at registration.
+ */
+ interpreter = bpf_interp ? bpf_interp : fmt->interpreter;
+ retval = -ENOEXEC;
+ if (!interpreter[0])
+ goto ret;
+
/* Need to be able to load the file after exec */
retval = -ENOENT;
if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
@@ -238,22 +338,27 @@ static int load_misc_binary(struct linux_binprm *bprm)
bprm->argc++;
/* add the interp as argv[0] */
- retval = copy_string_kernel(fmt->interpreter, bprm);
+ retval = copy_string_kernel(interpreter, bprm);
if (retval < 0)
goto ret;
bprm->argc++;
/* Update interp in case binfmt_script needs it. */
- retval = bprm_change_interp(fmt->interpreter, bprm);
+ retval = bprm_change_interp(interpreter, bprm);
if (retval < 0)
goto ret;
- if (fmt->flags & MISC_FMT_OPEN_FILE) {
+ /*
+ * The pre-opened interp_file (MISC_FMT_OPEN_FILE / 'F' flag) only
+ * applies to the statically registered interpreter; a program-supplied
+ * path is opened here.
+ */
+ if ((fmt->flags & MISC_FMT_OPEN_FILE) && !bpf_interp) {
interp_file = file_clone_open(fmt->interp_file);
if (!IS_ERR(interp_file))
deny_write_access(interp_file);
} else {
- interp_file = open_exec(fmt->interpreter);
+ interp_file = open_exec(interpreter);
}
retval = PTR_ERR(interp_file);
if (IS_ERR(interp_file))
@@ -404,6 +509,10 @@ static Node *create_entry(const char __user *buffer,
size_t count)
pr_debug("register: type: M (magic)\n");
e->flags = (1 << Enabled) | (1 << Magic);
break;
+ case 'B':
+ pr_debug("register: type: B (bpf)\n");
+ e->flags = (1 << Enabled) | (1 << Bpf);
+ break;
default:
goto einval;
}
@@ -492,6 +601,45 @@ static Node *create_entry(const char __user *buffer,
size_t count)
}
}
}
+ } else if (test_bit(Bpf, &e->flags)) {
+ /* Handle the 'B' (BPF) format. */
+ char *s;
+
+ /* The offset field actually holds the pinned BPF program path
*/
+ s = strchr(p, del);
+ if (!s)
+ goto einval;
+ *s++ = '\0';
+ e->magic = p; /* Keep path in e->magic */
+ pr_debug("register: bpf program path: %s\n", e->magic);
+
+#ifdef CONFIG_BPF_SYSCALL
+ e->bpf_prog = bpf_prog_get_type_path(e->magic,
BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(e->bpf_prog)) {
+ err = PTR_ERR(e->bpf_prog);
+ e->bpf_prog = NULL;
+ kfree(e);
+ return ERR_PTR(err);
+ }
+#else
+ goto einval;
+#endif
+
+ p = s;
+
+ /* The magic field is unused, must be empty */
+ s = strchr(p, del);
+ if (!s || p != s)
+ goto einval;
+ *s++ = '\0';
+ p = s;
+
+ /* The mask field is unused, must be empty */
+ s = strchr(p, del);
+ if (!s || p != s)
+ goto einval;
+ *s++ = '\0';
+ p = s;
} else {
/* Handle the 'E' (extension) format. */
@@ -524,8 +672,17 @@ static Node *create_entry(const char __user *buffer,
size_t count)
if (!p)
goto einval;
*p++ = '\0';
- if (!e->interpreter[0])
+ if (test_bit(Bpf, &e->flags)) {
+ /*
+ * A 'B' (BPF) handler carries no interpreter of its own; the
+ * program picks it via bpf_binprm_set_interp(). Reject a
+ * statically registered one.
+ */
+ if (e->interpreter[0])
+ goto einval;
+ } else if (!e->interpreter[0]) {
goto einval;
+ }
pr_debug("register: interpreter: {%s}\n", e->interpreter);
/* Parse the 'flags' field. */
@@ -602,7 +759,9 @@ static void entry_status(Node *e, char *page)
*dp++ = 'F';
*dp++ = '\n';
- if (!test_bit(Magic, &e->flags)) {
+ if (test_bit(Bpf, &e->flags)) {
+ sprintf(dp, "bpf %s\n", e->magic);
+ } else if (!test_bit(Magic, &e->flags)) {
sprintf(dp, "extension .%s\n", e->magic);
} else {
dp += sprintf(dp, "offset %i\nmagic ", e->offset);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7719f6528..3aef44b69 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3874,6 +3874,7 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto;
extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
+extern const struct bpf_func_proto bpf_binprm_set_interp_proto;
extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 89b36de5f..dce155c5b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6142,6 +6142,7 @@ union bpf_attr {
FN(user_ringbuf_drain, 209, ##ctx) \
FN(cgrp_storage_get, 210, ##ctx) \
FN(cgrp_storage_delete, 211, ##ctx) \
+ FN(binprm_set_interp, 212, ##ctx) \
/* This helper list is effectively frozen. If you are trying to \
* add a new helper, you should add a kfunc instead which has \
* less stability guarantees. See Documentation/bpf/kfuncs.rst \
diff --git a/net/core/filter.c b/net/core/filter.c
index 2e96b4b84..187692a4a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8397,6 +8397,14 @@ sk_filter_func_proto(enum bpf_func_id func_id, const
struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
+#if IS_BUILTIN(CONFIG_BINFMT_MISC)
+ case BPF_FUNC_binprm_set_interp:
+ /*
+ * binfmt_misc reuses SOCKET_FILTER programs to select an
+ * interpreter; expose the helper that lets them set it.
+ */
+ return &bpf_binprm_set_interp_proto;
+#endif
default:
return bpf_sk_base_func_proto(func_id, prog);
}
--
2.51.2