On 11/23/2018 02:38 AM, John Fastabend wrote: > This adds a BPF SK_MSG program helper so that we can pop data from a > msg. We use this to pop metadata from a previous push data call. > > Signed-off-by: John Fastabend <john.fastab...@gmail.com> > --- > include/uapi/linux/bpf.h | 13 +++- > net/core/filter.c | 169 > +++++++++++++++++++++++++++++++++++++++++++++++ > net/ipv4/tcp_bpf.c | 14 +++- > 3 files changed, 192 insertions(+), 4 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index c1554aa..64681f8 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -2268,6 +2268,16 @@ union bpf_attr { > * > * Return > * 0 on success, or a negative error in case of failure. > + * > + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 > flags) > + * Description > + * Will remove 'pop' bytes from a msg starting at byte 'start'. > + * This result in ENOMEM errors under certain situations where > + * a allocation and copy are required due to a full ring buffer. > + * However, the helper will try to avoid doing the allocation > + * if possible. Other errors can occur if input parameters are > + * invalid either do to start byte not being valid part of msg > + * payload and/or pop value being to large. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -2360,7 +2370,8 @@ union bpf_attr { > FN(map_push_elem), \ > FN(map_pop_elem), \ > FN(map_peek_elem), \ > - FN(msg_push_data), > + FN(msg_push_data), \ > + FN(msg_pop_data), > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > diff --git a/net/core/filter.c b/net/core/filter.c > index f6ca38a..c6b35b5 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2428,6 +2428,173 @@ static const struct bpf_func_proto > bpf_msg_push_data_proto = { > .arg4_type = ARG_ANYTHING, > }; > > +static void sk_msg_shift_left(struct sk_msg *msg, int i) > +{ > + int prev; > + > + do { > + prev = i; > + sk_msg_iter_var_next(i); > + msg->sg.data[prev] = msg->sg.data[i]; > + } while (i != msg->sg.end); > + > + sk_msg_iter_prev(msg, end); > +} > + > +static void sk_msg_shift_right(struct sk_msg *msg, int i) > +{ > + struct scatterlist tmp, sge; > + > + sk_msg_iter_next(msg, end); > + sge = sk_msg_elem_cpy(msg, i); > + sk_msg_iter_var_next(i); > + tmp = sk_msg_elem_cpy(msg, i); > + > + while (i != msg->sg.end) { > + msg->sg.data[i] = sge; > + sk_msg_iter_var_next(i); > + sge = tmp; > + tmp = sk_msg_elem_cpy(msg, i); > + } > +} > + > +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, > + u32, len, u64, flags) > +{ > + u32 i = 0, l, space, offset = 0; > + u64 last = start + len; > + int pop; > + > + if (unlikely(flags)) > + return -EINVAL; > + > + /* First find the starting scatterlist element */ > + i = msg->sg.start; > + do { > + l = sk_msg_elem(msg, i)->length; > + > + if (start < offset + l) > + break; > + offset += l; > + sk_msg_iter_var_next(i); > + } while (i != msg->sg.end); > + > + /* Bounds checks: start and pop must be inside message */ > + if (start >= offset + l || last >= msg->sg.size) > + return -EINVAL; > + > + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); > + > + pop = len; > + /* --------------| offset > + * -| start |------- len ------| > + * > + * |----- a ----|-------- pop -------|----- b ----| > + * |______________________________________________| length > + * > + * > + * a: region at front of scatter element to save > + * b: region at back of scatter element to save when length > A + pop > + * pop: region to pop from element, same as input 'pop' here will be > + * decremented below per iteration. > + * > + * Two top-level cases to handle when start != offset, first B is non > + * zero and second B is zero corresponding to when a pop includes more > + * than one element. > + * > + * Then if B is non-zero AND there is no space allocate space and > + * compact A, B regions into page. If there is space shift ring to > + * the rigth free'ing the next element in ring to place B, leaving > + * A untouched except to reduce length. > + */ > + if (start != offset) { > + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); > + int a = start; > + int b = sge->length - pop - a; > + > + sk_msg_iter_var_next(i); > + > + if (pop < sge->length - a) { > + if (space) { > + sge->length = a; > + sk_msg_shift_right(msg, i); > + nsge = sk_msg_elem(msg, i); > + get_page(sg_page(sge)); > + sg_set_page(nsge, > + sg_page(sge), b, offset + pop); > + } else { > + struct page *page, *orig; > + u8 *to, *from; > + > + page = alloc_pages(__GFP_NOWARN | > + __GFP_COMP | GFP_ATOMIC, > + get_order(a + b)); > + if (unlikely(!page)) > + return -ENOMEM; > + > + sge->length = a; > + orig = sg_page(sge); > + from = sg_virt(sge); > + to = page_address(page); > + memcpy(to, from, a); > + memcpy(to + a, from + a + pop, b); > + sg_set_page(sge, page, a + b, 0); > + put_page(orig); > + } > + pop = 0; > + } else if (pop >= sge->length - a) { > + sge->length = a; > + pop -= (sge->length - a); > + } > + } > + > + /* From above the current layout _must_ be as follows, > + * > + * -| offset > + * -| start > + * > + * |---- pop ---|---------------- b ------------| > + * |____________________________________________| length > + * > + * Offset and start of the current msg elem are equal because in the > + * previous case we handled offset != start and either consumed the > + * entire element and advanced to the next element OR pop == 0. > + * > + * Two cases to handle here are first pop is less than the length > + * leaving some remainder b above. Simply adjust the element's layout > + * in this case. Or pop >= length of the element so that b = 0. In this > + * case advance to next element decrementing pop. > + */ > + while (pop) { > + struct scatterlist *sge = sk_msg_elem(msg, i); > + > + if (pop < sge->length) { > + sge->length -= pop; > + sge->offset += pop; > + pop = 0; > + } else { > + pop -= sge->length; > + sk_msg_shift_left(msg, i); > + } > + sk_msg_iter_var_next(i); > + } > + > + sk_mem_uncharge(msg->sk, len - pop); > + msg->sg.size -= (len - pop); > + sk_msg_compute_data_pointers(msg); > + return 0; > +}
Hmm, had to revert. I noticed this will have a use after free. While you do the sk_msg_compute_data_pointers() there is nothing from verifier that forces a reload of the payload pointers that were set up before the helper call, so they can still be used after the bpf_msg_pop_data() even though we dropped ref from page etc. Thanks, Daniel