Re: [PATCH v5 bpf-next 2/9] veth: Add driver XDP

2018-07-26 Thread Toshiaki Makita
Hi John,

On 2018/07/27 12:02, John Fastabend wrote:
> On 07/26/2018 07:40 AM, Toshiaki Makita wrote:
>> From: Toshiaki Makita 
>>
>> This is the basic implementation of veth driver XDP.
>>
>> Incoming packets are sent from the peer veth device in the form of skb,
>> so this is generally doing the same thing as generic XDP.
>>
>> This itself is not so useful, but a starting point to implement other
>> useful veth XDP features like TX and REDIRECT.
>>
>> This introduces NAPI when XDP is enabled, because XDP now heavily
>> relies on NAPI context. Use ptr_ring to emulate NIC ring. Tx function
>> enqueues packets to the ring and peer NAPI handler drains the ring.
>>
>> Currently only one ring is allocated for each veth device, so it does
>> not scale on multiqueue env. This can be resolved by allocating rings
>> on the per-queue basis later.
>>
>> Note that NAPI is not used but netif_rx is used when XDP is not loaded,
>> so this does not change the default behaviour.
>>
>> v3:
>> - Fix race on closing the device.
>> - Add extack messages in ndo_bpf.
>>
>> v2:
>> - Squashed with the patch adding NAPI.
>> - Implement adjust_tail.
>> - Don't acquire consumer lock because it is guarded by NAPI.
>> - Make poll_controller noop since it is unnecessary.
>> - Register rxq_info on enabling XDP rather than on opening the device.
>>
>> Signed-off-by: Toshiaki Makita 
>> ---
> 
> 
> [...]
> 
> One nit and one question.
> 
>> +
>> +static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
>> +struct sk_buff *skb)
>> +{
>> +u32 pktlen, headroom, act, metalen;
>> +void *orig_data, *orig_data_end;
>> +int size, mac_len, delta, off;
>> +struct bpf_prog *xdp_prog;
>> +struct xdp_buff xdp;
>> +
>> +rcu_read_lock();
>> +xdp_prog = rcu_dereference(priv->xdp_prog);
>> +if (unlikely(!xdp_prog)) {
>> +rcu_read_unlock();
>> +goto out;
>> +}
>> +
>> +mac_len = skb->data - skb_mac_header(skb);
>> +pktlen = skb->len + mac_len;
>> +size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
>> +   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> +if (size > PAGE_SIZE)
>> +goto drop;
> 
> I'm not sure why it matters if size > PAGE_SIZE here. Why not
> just consume it and use the correct page order in alloc_page if
> its not linear.

Indeed. We can allow such skbs here at least if we don't need
reallocation (which is highly unlikely though).

But I'm not sure we should allocate multiple pages in atomic context.
It tends to cause random allocation failure which is IMO more
frustrating. We are now prohibiting such a situation by max_mtu and
dropping features, which looks more robust to me.

>> +
>> +headroom = skb_headroom(skb) - mac_len;
>> +if (skb_shared(skb) || skb_head_is_locked(skb) ||
>> +skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
>> +struct sk_buff *nskb;
>> +void *head, *start;
>> +struct page *page;
>> +int head_off;
>> +
>> +page = alloc_page(GFP_ATOMIC);
> 
> Should also have __NO_WARN here as well this can be triggered by
> external events so we don't want DDOS here to flood system logs.

Sure, thanks!

-- 
Toshiaki Makita



Re: [PATCH v5 bpf-next 2/9] veth: Add driver XDP

2018-07-26 Thread John Fastabend
On 07/26/2018 07:40 AM, Toshiaki Makita wrote:
> From: Toshiaki Makita 
> 
> This is the basic implementation of veth driver XDP.
> 
> Incoming packets are sent from the peer veth device in the form of skb,
> so this is generally doing the same thing as generic XDP.
> 
> This itself is not so useful, but a starting point to implement other
> useful veth XDP features like TX and REDIRECT.
> 
> This introduces NAPI when XDP is enabled, because XDP now heavily
> relies on NAPI context. Use ptr_ring to emulate NIC ring. Tx function
> enqueues packets to the ring and peer NAPI handler drains the ring.
> 
> Currently only one ring is allocated for each veth device, so it does
> not scale on multiqueue env. This can be resolved by allocating rings
> on the per-queue basis later.
> 
> Note that NAPI is not used but netif_rx is used when XDP is not loaded,
> so this does not change the default behaviour.
> 
> v3:
> - Fix race on closing the device.
> - Add extack messages in ndo_bpf.
> 
> v2:
> - Squashed with the patch adding NAPI.
> - Implement adjust_tail.
> - Don't acquire consumer lock because it is guarded by NAPI.
> - Make poll_controller noop since it is unnecessary.
> - Register rxq_info on enabling XDP rather than on opening the device.
> 
> Signed-off-by: Toshiaki Makita 
> ---


[...]

One nit and one question.

> +
> +static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
> + struct sk_buff *skb)
> +{
> + u32 pktlen, headroom, act, metalen;
> + void *orig_data, *orig_data_end;
> + int size, mac_len, delta, off;
> + struct bpf_prog *xdp_prog;
> + struct xdp_buff xdp;
> +
> + rcu_read_lock();
> + xdp_prog = rcu_dereference(priv->xdp_prog);
> + if (unlikely(!xdp_prog)) {
> + rcu_read_unlock();
> + goto out;
> + }
> +
> + mac_len = skb->data - skb_mac_header(skb);
> + pktlen = skb->len + mac_len;
> + size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
> +SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> + if (size > PAGE_SIZE)
> + goto drop;

I'm not sure why it matters if size > PAGE_SIZE here. Why not
just consume it and use the correct page order in alloc_page if
its not linear.

> +
> + headroom = skb_headroom(skb) - mac_len;
> + if (skb_shared(skb) || skb_head_is_locked(skb) ||
> + skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> + struct sk_buff *nskb;
> + void *head, *start;
> + struct page *page;
> + int head_off;
> +
> + page = alloc_page(GFP_ATOMIC);

Should also have __NO_WARN here as well this can be triggered by
external events so we don't want DDOS here to flood system logs.

> + if (!page)
> + goto drop;
> +
> + head = page_address(page);
> + start = head + VETH_XDP_HEADROOM;
> + if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
> + page_frag_free(head);
> + goto drop;
> + }
> +
> + nskb = veth_build_skb(head,
> +   VETH_XDP_HEADROOM + mac_len, skb->len,
> +   PAGE_SIZE);
> + if (!nskb) {
> + page_frag_free(head);
> + goto drop;
> + }
> +
> + skb_copy_header(nskb, skb);
> + head_off = skb_headroom(nskb) - skb_headroom(skb);
> + skb_headers_offset_update(nskb, head_off);
> + if (skb->sk)
> + skb_set_owner_w(nskb, skb->sk);
> + consume_skb(skb);
> + skb = nskb;
> + }
> +
> + xdp.data_hard_start = skb->head;
> + xdp.data = skb_mac_header(skb);
> + xdp.data_end = xdp.data + pktlen;
> + xdp.data_meta = xdp.data;
> + xdp.rxq = &priv->xdp_rxq;
> + orig_data = xdp.data;
> + orig_data_end = xdp.data_end;
> +
> + act = bpf_prog_run_xdp(xdp_prog, &xdp);
> +
> + switch (act) {
> + case XDP_PASS:
> + break;
> + default:
> + bpf_warn_invalid_xdp_action(act);
> + case XDP_ABORTED:
> + trace_xdp_exception(priv->dev, xdp_prog, act);
> + case XDP_DROP:
> + goto drop;
> + }
> + rcu_read_unlock();
> +
> + delta = orig_data - xdp.data;
> + off = mac_len + delta;
> + if (off > 0)
> + __skb_push(skb, off);
> + else if (off < 0)
> + __skb_pull(skb, -off);
> + skb->mac_header -= delta;
> + off = xdp.data_end - orig_data_end;
> + if (off != 0)
> + __skb_put(skb, off);
> + skb->protocol = eth_type_trans(skb, priv->dev);
> +
> + metalen = xdp.data - xdp.data_meta;
> + if (metalen)
> + skb_metadata_set(skb, metalen);
> +out:
> + return skb;
> +drop:
> + rcu_read_unlock();
> + kfree_skb(skb);
> + return NULL;
> +}
> +

Thanks,
John



[PATCH v5 bpf-next 2/9] veth: Add driver XDP

2018-07-26 Thread Toshiaki Makita
From: Toshiaki Makita 

This is the basic implementation of veth driver XDP.

Incoming packets are sent from the peer veth device in the form of skb,
so this is generally doing the same thing as generic XDP.

This itself is not so useful, but a starting point to implement other
useful veth XDP features like TX and REDIRECT.

This introduces NAPI when XDP is enabled, because XDP now heavily
relies on NAPI context. Use ptr_ring to emulate NIC ring. Tx function
enqueues packets to the ring and peer NAPI handler drains the ring.

Currently only one ring is allocated for each veth device, so it does
not scale on multiqueue env. This can be resolved by allocating rings
on the per-queue basis later.

Note that NAPI is not used but netif_rx is used when XDP is not loaded,
so this does not change the default behaviour.

v3:
- Fix race on closing the device.
- Add extack messages in ndo_bpf.

v2:
- Squashed with the patch adding NAPI.
- Implement adjust_tail.
- Don't acquire consumer lock because it is guarded by NAPI.
- Make poll_controller noop since it is unnecessary.
- Register rxq_info on enabling XDP rather than on opening the device.

Signed-off-by: Toshiaki Makita 
---
 drivers/net/veth.c | 373 -
 1 file changed, 366 insertions(+), 7 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39ee57e..78fa08cb6e24 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -19,10 +19,18 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 #define DRV_NAME   "veth"
 #define DRV_VERSION"1.0"
 
+#define VETH_RING_SIZE 256
+#define VETH_XDP_HEADROOM  (XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+
 struct pcpu_vstats {
u64 packets;
u64 bytes;
@@ -30,9 +38,16 @@ struct pcpu_vstats {
 };
 
 struct veth_priv {
+   struct napi_struct  xdp_napi;
+   struct net_device   *dev;
+   struct bpf_prog __rcu   *xdp_prog;
+   struct bpf_prog *_xdp_prog;
struct net_device __rcu *peer;
atomic64_t  dropped;
unsignedrequested_headroom;
+   boolrx_notify_masked;
+   struct ptr_ring xdp_ring;
+   struct xdp_rxq_info xdp_rxq;
 };
 
 /*
@@ -98,11 +113,43 @@ static const struct ethtool_ops veth_ethtool_ops = {
.get_link_ksettings = veth_get_link_ksettings,
 };
 
-static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
+/* general routines */
+
+static void __veth_xdp_flush(struct veth_priv *priv)
+{
+   /* Write ptr_ring before reading rx_notify_masked */
+   smp_mb();
+   if (!priv->rx_notify_masked) {
+   priv->rx_notify_masked = true;
+   napi_schedule(&priv->xdp_napi);
+   }
+}
+
+static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb)
+{
+   if (unlikely(ptr_ring_produce(&priv->xdp_ring, skb))) {
+   dev_kfree_skb_any(skb);
+   return NET_RX_DROP;
+   }
+
+   return NET_RX_SUCCESS;
+}
+
+static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, bool 
xdp)
 {
struct veth_priv *priv = netdev_priv(dev);
+
+   return __dev_forward_skb(dev, skb) ?: xdp ?
+   veth_xdp_rx(priv, skb) :
+   netif_rx(skb);
+}
+
+static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+   struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
struct net_device *rcv;
int length = skb->len;
+   bool rcv_xdp = false;
 
rcu_read_lock();
rcv = rcu_dereference(priv->peer);
@@ -111,7 +158,10 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct 
net_device *dev)
goto drop;
}
 
-   if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+   rcv_priv = netdev_priv(rcv);
+   rcv_xdp = rcu_access_pointer(rcv_priv->xdp_prog);
+
+   if (likely(veth_forward_skb(rcv, skb, rcv_xdp) == NET_RX_SUCCESS)) {
struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
u64_stats_update_begin(&stats->syncp);
@@ -122,14 +172,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct 
net_device *dev)
 drop:
atomic64_inc(&priv->dropped);
}
+
+   if (rcv_xdp)
+   __veth_xdp_flush(rcv_priv);
+
rcu_read_unlock();
+
return NETDEV_TX_OK;
 }
 
-/*
- * general routines
- */
-
 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
 {
struct veth_priv *priv = netdev_priv(dev);
@@ -179,18 +230,253 @@ static void veth_set_multicast_list(struct net_device 
*dev)
 {
 }
 
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+ int buflen)
+{
+   struct sk_buff *skb;
+
+   if (!buflen) {
+   buflen = SKB_DATA_ALIGN(headroom + len) +
+