From: Andrey Ignatov <r...@fb.com> "Post-hooks" are hooks that are called right before returning from sys_bind. At this time IP and port are already allocated and no further changes to `struct sock` can happen before returning from sys_bind but BPF program has a chance to inspect the socket and change sys_bind result.
Specifically it can e.g. inspect what port was allocated and if it doesn't satisfy some policy, BPF program can force sys_bind to release that port and return an error to user. Another example of usage is recording the IP:port pair to some map to use it in later calls to sys_connect. E.g. if some TCP server inside cgroup was bound to some IP:port and then some TCP client inside same cgroup is trying to connect to 127.0.0.1:port then BPF hook for sys_connect can override the destination and connect application to IP:port instead of 127.0.0.1:port. That helps force all applications inside a cgroup to use the desired IP and not break those applications if they use e.g. localhost to communicate with each other. == Implementation details == Post-hooks are implemented as two new prog types `BPF_PROG_TYPE_CGROUP_INET4_POST_BIND` and `BPF_PROG_TYPE_CGROUP_INET6_POST_BIND` and corresponding attach types `BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND`. Separate prog types for IPv4 and IPv6 are introduced to avoid access to IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from `inet6_bind()` since those fields might not make sense in such cases. `BPF_PROG_TYPE_CGROUP_SOCK` prog type is not reused because it provides write access to some `struct sock` fields, but socket must not be changed in post-hooks for sys_bind. 
Signed-off-by: Andrey Ignatov <r...@fb.com> Signed-off-by: Alexei Starovoitov <a...@kernel.org> --- include/linux/bpf-cgroup.h | 16 ++++- include/linux/bpf_types.h | 2 + include/uapi/linux/bpf.h | 13 ++++ kernel/bpf/syscall.c | 14 ++++ kernel/bpf/verifier.c | 2 + net/core/filter.c | 170 ++++++++++++++++++++++++++++++++++++++++----- net/ipv4/af_inet.c | 3 +- net/ipv6/af_inet6.c | 3 +- 8 files changed, 202 insertions(+), 21 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 6b5c25ef1482..693c542632e3 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, \ - BPF_CGROUP_INET_SOCK_CREATE); \ + __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ @@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) diff --git 
a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 52a571827b9f..23a97978b544 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -10,6 +10,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET4_BIND, cg_inet4_bind) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET6_BIND, cg_inet6_bind) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET4_POST_BIND, cg_inet4_post_bind) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET6_POST_BIND, cg_inet6_post_bind) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET4_CONNECT, cg_inet4_connect) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET6_CONNECT, cg_inet6_connect) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 441a674f385a..7dcc75a65a97 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -137,6 +137,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_INET6_BIND, BPF_PROG_TYPE_CGROUP_INET4_CONNECT, BPF_PROG_TYPE_CGROUP_INET6_CONNECT, + BPF_PROG_TYPE_CGROUP_INET4_POST_BIND, + BPF_PROG_TYPE_CGROUP_INET6_POST_BIND, }; enum bpf_attach_type { @@ -151,6 +153,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_BIND, BPF_CGROUP_INET4_CONNECT, BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -903,6 +907,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. 
+ * Stored in network byte order + */ }; #define XDP_PACKET_HEADROOM 256 diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 145de3332e32..2eb941dacbc5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1382,6 +1382,12 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_BIND: ptype = BPF_PROG_TYPE_CGROUP_INET6_BIND; break; + case BPF_CGROUP_INET4_POST_BIND: + ptype = BPF_PROG_TYPE_CGROUP_INET4_POST_BIND; + break; + case BPF_CGROUP_INET6_POST_BIND: + ptype = BPF_PROG_TYPE_CGROUP_INET6_POST_BIND; + break; case BPF_CGROUP_INET4_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_INET4_CONNECT; break; @@ -1449,6 +1455,12 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_BIND: ptype = BPF_PROG_TYPE_CGROUP_INET6_BIND; break; + case BPF_CGROUP_INET4_POST_BIND: + ptype = BPF_PROG_TYPE_CGROUP_INET4_POST_BIND; + break; + case BPF_CGROUP_INET6_POST_BIND: + ptype = BPF_PROG_TYPE_CGROUP_INET6_POST_BIND; + break; case BPF_CGROUP_INET4_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_INET4_CONNECT; break; @@ -1504,6 +1516,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cda7830a2c1b..84faec85fe3e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3874,6 +3874,8 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_INET4_BIND: case BPF_PROG_TYPE_CGROUP_INET6_BIND: + case BPF_PROG_TYPE_CGROUP_INET4_POST_BIND: + case BPF_PROG_TYPE_CGROUP_INET6_POST_BIND: case BPF_PROG_TYPE_CGROUP_INET4_CONNECT: case BPF_PROG_TYPE_CGROUP_INET6_CONNECT: case BPF_PROG_TYPE_SOCK_OPS: diff --git a/net/core/filter.c b/net/core/filter.c index 
916195b86a23..e27196248c10 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3840,6 +3840,62 @@ static bool sock_filter_is_valid_access(int off, int size, return true; } +static bool __sock_is_valid_access(unsigned short ctx_family, int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + unsigned short requested_family = 0; + + if (off < 0 || off >= sizeof(struct bpf_sock)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case bpf_ctx_range(struct bpf_sock, src_ip4): + requested_family = AF_INET; + /* FALLTHROUGH */ + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + if (!requested_family) + requested_family = AF_INET6; + /* Disallow access to IPv6 fields from IPv4 contex and vise + * versa. + */ + if (requested_family != ctx_family) + return false; + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + break; + case bpf_ctx_range(struct bpf_sock, family): + case bpf_ctx_range(struct bpf_sock, type): + case bpf_ctx_range(struct bpf_sock, protocol): + case bpf_ctx_range(struct bpf_sock, src_port): + if (size != size_default) + return false; + break; + default: + return false; + } + + return true; +} + +static bool sock4_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return __sock_is_valid_access(AF_INET, off, size, type, info); +} + +static bool sock6_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return __sock_is_valid_access(AF_INET6, off, size, type, info); +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -4406,6 +4462,40 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 
__sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sock, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_family)); + break; + + case offsetof(struct bpf_sock, type): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock, protocol): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -4447,26 +4537,56 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, offsetof(struct sock, sk_priority)); break; - case offsetof(struct bpf_sock, family): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); + default: + return __sock_convert_ctx_access(type, si, insn_buf, prog, + target_size); + } - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_family)); - break; + return insn - insn_buf; +} - case offsetof(struct bpf_sock, type): - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); - break; +static u32 sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, 
+ struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; - case offsetof(struct bpf_sock, protocol): - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); + switch (si->off) { + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr), + target_size)); break; + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); + break; + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + FIELD_SIZEOF(struct sock_common, + skc_num), + target_size)); + break; + default: + return __sock_convert_ctx_access(type, si, insn_buf, prog, + target_size); } return insn - insn_buf; @@ -5122,6 +5242,24 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = { const struct bpf_prog_ops cg_sock_prog_ops = { }; +const struct bpf_verifier_ops cg_inet4_post_bind_verifier_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = sock4_is_valid_access, + .convert_ctx_access = sock_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_inet4_post_bind_prog_ops = { +}; + +const struct bpf_verifier_ops cg_inet6_post_bind_verifier_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = sock6_is_valid_access, + 
.convert_ctx_access = sock_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_inet6_post_bind_prog_ops = { +}; + const struct bpf_verifier_ops cg_inet4_bind_verifier_ops = { .get_func_proto = inet_bind_func_proto, .is_valid_access = sock_addr4_is_valid_access, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 488fe26ac8e5..28e2e7fdd5b1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -521,7 +521,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if ((snum || !(inet->bind_address_no_port || force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { + (sk->sk_prot->get_port(sk, snum) || + BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk))) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 13110bee5c14..473cc55a3a7d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -414,7 +414,8 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if ((snum || !(inet->bind_address_no_port || force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { + (sk->sk_prot->get_port(sk, snum) || + BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk))) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; -- 2.9.5