Re: [PATCH v3 net-next 2/2] tun: enable napi_gro_frags() for TUN/TAP driver

2017-09-25 Thread महेश बंडेवार
On Fri, Sep 22, 2017 at 1:49 PM, Petar Penkov  wrote:
> Add a TUN/TAP receive mode that exercises the napi_gro_frags()
> interface. This mode is available only in TAP mode, as the interface
> expects packets with Ethernet headers.
>
> Furthermore, packets follow the layout of the iovec_iter that was
> received. The first iovec is the linear data, and every one after the
> first is a fragment. If there are more fragments than the max number,
> drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
> dissector code and to verify that the header resides in the linear data.
>
> The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
> This is imposed because this mode is intended for testing via tools like
> syzkaller and packetdrill, and the increased flexibility it provides can
> introduce security vulnerabilities. This flag is accepted only if the
> device is in TAP mode and has the IFF_NAPI flag set as well. This is
> done because both of these are explicit requirements for correct
> operation in this mode.
>
> Signed-off-by: Petar Penkov 

Thank you Petar.

Acked-by: Mahesh Bandewar 

[PATCH v3 net-next 2/2] tun: enable napi_gro_frags() for TUN/TAP driver

2017-09-22 Thread Petar Penkov
Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iovec_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than the max number,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities. This flag is accepted only if the
device is in TAP mode and has the IFF_NAPI flag set as well. This is
done because both of these are explicit requirements for correct
operation in this mode.
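
As a rough illustration of the intended use (this is not part of the patch itself), a
minimal userspace sketch of driving the new mode might look like the following. The
device name "tap-test", the buffer sizes, and the zeroed payload are placeholders, and
IFF_NAPI / IFF_NAPI_FRAGS are assumed to be available from the uapi headers added by
this series.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <unistd.h>
#include <net/if.h>
#include <linux/if_tun.h>

int main(void)
{
    struct ifreq ifr;
    unsigned char hdr[64] = { 0 };    /* Ethernet header plus start of payload */
    unsigned char frag[256] = { 0 };  /* data for one fragment */
    struct iovec iov[2] = {
        { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
        { .iov_base = frag, .iov_len = sizeof(frag) },
    };
    int fd = open("/dev/net/tun", O_RDWR);

    if (fd < 0)
        return 1;

    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "tap-test", IFNAMSIZ - 1);
    /* TAP mode and IFF_NAPI must both be set before IFF_NAPI_FRAGS is accepted. */
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI | IFF_NAPI_FRAGS;
    if (ioctl(fd, TUNSETIFF, &ifr) < 0)
        return 1;

    /*
     * The first iovec becomes the skb's linear area (it must contain at least
     * the Ethernet header); every following iovec becomes one fragment.
     * Supplying more than MAX_SKB_FRAGS + 1 segments makes the kernel drop
     * the packet.
     */
    writev(fd, iov, 2);

    close(fd);
    return 0;
}

Each writev() then feeds one packet into napi_gro_frags() from process context, which
is what lets syzkaller/packetdrill-style tools exercise the GRO and flow-dissector
paths.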

Signed-off-by: Petar Penkov 
Cc: Eric Dumazet 
Cc: Mahesh Bandewar 
Cc: Willem de Bruijn 
Cc: da...@davemloft.net
Cc: ppen...@stanford.edu
---
 drivers/net/tun.c   | 134 ++--
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 129 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f16407242b18..9880b3bc8fa5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@
 #include 
 #include 
 #include 
+#include <linux/mutex.h>
 
 #include 
 
@@ -121,7 +122,8 @@ do { \
 #define TUN_VNET_BE 0x4000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
- IFF_MULTI_QUEUE | IFF_NAPI)
+ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +175,7 @@ struct tun_file {
unsigned int ifindex;
};
struct napi_struct napi;
+   struct mutex napi_mutex;/* Protects access to the above napi */
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
@@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
   NAPI_POLL_WEIGHT);
napi_enable(&tfile->napi);
+   mutex_init(&tfile->napi_mutex);
}
 }
 
@@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
netif_napi_del(&tfile->napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+   return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev)
 * supports polling, which enables bridge devices in virt setups to
 * still use netconsole
 * If NAPI is enabled, however, we need to schedule polling for all
-* queues.
+* queues unless we are using napi_gro_frags(), which we call in
+* process context and not in NAPI context.
 */
struct tun_struct *tun = netdev_priv(dev);
 
@@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev)
struct tun_file *tfile;
int i;
 
+   if (tun_napi_frags_enabled(tun))
+   return;
+
rcu_read_lock();
for (i = 0; i < tun->numqueues; i++) {
tfile = rcu_dereference(tun->tfiles[i]);
@@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+   size_t len,
+   const struct iov_iter *it)
+{
+   struct sk_buff *skb;
+   size_t linear;
+   int err;
+   int i;
+
+   if (it->nr_segs > MAX_SKB_FRAGS + 1)
+   return ERR_PTR(-ENOMEM);
+
+   local_bh_disable();
+   skb = napi_get_frags(&tfile->napi);
+   local_bh_enable();
+   if (!skb)
+   return ERR_PTR(-ENOMEM);
+
+   linear = iov_iter_single_seg_count(it);
+   err = __skb_grow(skb, linear);
+   if (err)
+   goto free;
+
+   skb->len = len;
+   skb->data_len = len - linear;
+   skb->truesize += skb->data_len;
+
+   for (i = 1; i < it->nr_segs; i++) {
+   size_t fragsz = it->iov[i].iov_len;
+   unsigned long offset;
+   struct page *page;
+   void *data;
+
+   if (fragsz == 0 || fragsz > PAGE_SIZE) {
+   err = -EINVAL;
+   goto free;