On 03/12/2018 08:23 PM, John Fastabend wrote: [...] > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > diff --git a/net/core/filter.c b/net/core/filter.c > index 2c73af0..7b9e63e 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -1956,6 +1956,134 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff > *msg) > .arg2_type = ARG_ANYTHING, > }; > > +BPF_CALL_4(bpf_msg_pull_data, > + struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) > +{ > + unsigned int len = 0, offset = 0, copy = 0; > + struct scatterlist *sg = msg->sg_data; > + int first_sg, last_sg, i, shift; > + unsigned char *p, *to, *from; > + int bytes = end - start; > + struct page *page; > + > + if (unlikely(end < start)) > + return -EINVAL;
Actually this should also reject non-zero flags (none are supported by this helper yet) and the empty range, i.e. it should be: if (unlikely(flags || end <= start)) return -EINVAL;
> + /* First find the starting scatterlist element */
> + i = msg->sg_start;
> + do {
> + len = sg[i].length;
> + offset += len;
> + if (start < offset + len)
> + break;
> + i++;
> + if (i == MAX_SKB_FRAGS)
> + i = 0;
> + } while (i != msg->sg_end);
> +
> + if (unlikely(start >= offset + len))
> + return -EINVAL;
> +
> + if (!msg->sg_copy[i] && bytes <= len)
> + goto out;
> +
> + first_sg = i;
> +
> + /* At this point we need to linearize multiple scatterlist
> + * elements or a single shared page. Either way we need to
> + * copy into a linear buffer exclusively owned by BPF. Then
> + * place the buffer in the scatterlist and fixup the original
> + * entries by removing the entries now in the linear buffer
> + * and shifting the remaining entries. For now we do not try
> + * to copy partial entries to avoid complexity of running out
> + * of sg_entry slots. The downside is reading a single byte
> + * will copy the entire sg entry.
> + */
> + do {
> + copy += sg[i].length;
> + i++;
> + if (i == MAX_SKB_FRAGS)
> + i = 0;
> + if (bytes < copy)
> + break;
> + } while (i != msg->sg_end);
> + last_sg = i;
> +
> + if (unlikely(copy < end - start))
> + return -EINVAL;
> +
> + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
The allocation can fail, so a NULL check is needed right after it: if (unlikely(!page)) return -ENOMEM;
> + p = page_address(page);
> + offset = 0;
> +
> + i = first_sg;
> + do {
> + from = sg_virt(&sg[i]);
> + len = sg[i].length;
> + to = p + offset;
> +
> + memcpy(to, from, len);
> + offset += len;
> + sg[i].length = 0;
> + put_page(sg_page(&sg[i]));
> +
> + i++;
> + if (i == MAX_SKB_FRAGS)
> + i = 0;
> + } while (i != last_sg);
> +
> + sg[first_sg].length = copy;
> + sg_set_page(&sg[first_sg], page, copy, 0);
> +
> + /* To repair sg ring we need to shift entries. If we only
> + * had a single entry though we can just replace it and
> + * be done. Otherwise walk the ring and shift the entries. 
> + */ > + shift = last_sg - first_sg - 1; > + if (!shift) > + goto out; > + > + i = first_sg + 1; > + do { > + int move_from; > + > + if (i + shift >= MAX_SKB_FRAGS) > + move_from = i + shift - MAX_SKB_FRAGS; > + else > + move_from = i + shift; > + > + if (move_from == msg->sg_end) > + break; > + > + sg[i] = sg[move_from]; > + sg[move_from].length = 0; > + sg[move_from].page_link = 0; > + sg[move_from].offset = 0; > + > + i++; > + if (i == MAX_SKB_FRAGS) > + i = 0; > + } while (1); > + msg->sg_end -= shift; > + if (msg->sg_end < 0) > + msg->sg_end += MAX_SKB_FRAGS; > +out: > + msg->data = sg_virt(&sg[i]) + start - offset; > + msg->data_end = msg->data + bytes; > + > + return 0; > +} > + > +static const struct bpf_func_proto bpf_msg_pull_data_proto = { > + .func = bpf_msg_pull_data, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_ANYTHING, > + .arg3_type = ARG_ANYTHING, > + .arg4_type = ARG_ANYTHING, > +}; > + > BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) > { > return task_get_classid(skb); > @@ -2897,7 +3025,8 @@ bool bpf_helper_changes_pkt_data(void *func) > func == bpf_l3_csum_replace || > func == bpf_l4_csum_replace || > func == bpf_xdp_adjust_head || > - func == bpf_xdp_adjust_meta) > + func == bpf_xdp_adjust_meta || > + func == bpf_msg_pull_data) > return true; > > return false; > @@ -3666,6 +3795,8 @@ static const struct bpf_func_proto > *sk_msg_func_proto(enum bpf_func_id func_id) > return &bpf_msg_apply_bytes_proto; > case BPF_FUNC_msg_cork_bytes: > return &bpf_msg_cork_bytes_proto; > + case BPF_FUNC_msg_pull_data: > + return &bpf_msg_pull_data_proto; > default: > return bpf_base_func_proto(func_id); > } >