eBPF programs attached to ingress and egress qdiscs see inconsistent skb->data.
For ingress L2 header is already pulled, whereas for egress it's present.
This is known to program writers which are currently forced to use
BPF_LL_OFF workaround.
Since programs don't change skb internal pointers it is safe to do
pull/push right around invocation of the program and earlier taps and
later pt->func() will not be affected.
Multiple taps via packet_rcv(), tpacket_rcv() are doing the same trick
around run_filter/BPF_PROG_RUN even if skb_shared.
This fix finally allows programs to use optimized LD_ABS/IND instructions
without BPF_LL_OFF for higher performance.
tc ingress + cls_bpf + samples/bpf/tcbpf1_kern.o
w/o JIT w/JIT
before 20.5 23.6 Mpps
after 21.8 26.6 Mpps
Old programs with BPF_LL_OFF will still work as-is.
We can now undo most of the earlier workaround commit:
a166151cbe33 ("bpf: fix bpf helpers to use skb->mac_header relative offsets")
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
new V1-V2: fixed u32->bool and added a check for CONFIG_NET_CLS_ACT
This patch is on top of 'fix build due to tc_verd':
http://patchwork.ozlabs.org/patch/480783/
Earlier versions were trying to do too much to make ingress and egress qdisc
consistent for all classifiers and actions or had too big of a scope of
push/pull:
v1: http://thread.gmane.org/gmane.linux.network/358168/focus=358168
v2: http://thread.gmane.org/gmane.linux.network/358524/focus=358532
v3: http://thread.gmane.org/gmane.linux.network/358733/focus=358734
v4: http://thread.gmane.org/gmane.linux.network/359129/focus=359694
skb->data will still be different for all non-bpf classifiers/actions.
This fix will still allow us to explore further optimizations like
moving skb_pull() from eth_type_trans() into netif_receive_skb() in the future.
Here is how ingress callchain looks:
netif_receive_skb, // likely skb->users == 1
deliver_skb
packet_rcv // skb->users == 2
orig_skb_data = skb->data
push l2
res = BPF_PROG_RUN
if (!res) {
skb->data = orig_skb_data
consume_skb(skb) // skb->users == 1
goto out
}
skb2 = skb_clone(skb)
skb->data = orig_skb_data
consume_skb(skb) // skb->users == 1
__skb_queue_tail(skb2)
deliver_skb
tpacket_rcv // skb->users == 2
orig_skb_data = skb->data
push l2
res = BPF_PROG_RUN
if (!res) {
skb->data = orig_skb_data
kfree_skb(skb) // skb->users == 1
goto out
}
if (...) {
skb2 = skb_clone(skb)
__skb_queue_tail(skb2)
}
skb_copy_bits(skb)
skb->data = orig_skb_data
kfree_skb(skb) // skb->users == 1
tc_classify
cls_u32 and other classifiers don't touch skb
actions like mirred do clone before redirect, etc.
cls_bpf // skb->users == 1
push l2
res = BPF_PROG_RUN
pull l2
actions // still see skb->data at L3
cls_xxx // still see skb->data at L3
actions
netfilter
vlan_do_receive
bridge
deliver_skb
mpls_forward and other ptype specific taps
ip_rcv // skb->users == 1
net/core/filter.c | 26 +++---
net/sched/act_bpf.c |9 -
net/sched/cls_bpf.c | 16 +++-
samples/bpf/tcbpf1_kern.c |8
4 files changed, 30 insertions(+), 29 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 09b2062eb5b8..36a69e33d76b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1238,21 +1238,6 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
return 0;
}
-/**
- * bpf_skb_clone_not_writable - is the header of a clone not writable
- * @skb: buffer to check
- * @len: length up to which to write, can be negative
- *
- * Returns true if modifying the header part of the cloned buffer
- * does require the data to be copied. I.e. this version works with
- * negative lengths needed for eBPF case!
- */
-static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len)
-{
- return skb_header_cloned(skb) ||
- (int) skb_headroom(skb) + len > skb->hdr_len;
-}
-
#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1)
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
@@ -1275,9 +1260,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
return -EFAULT;
- offset -= skb->data - skb_mac_header(skb);
if (unlikely(skb_cloned(skb) &&
-bpf_skb_clone_unwritable(skb, offset + len)))
+!skb_clone_writable(skb, offset + len)))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, len, buf);
@@ -1321,9 +1305,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64