Re: [PATCH bpf-next] xsk: build skb by page

2021-01-19 Thread Alexander Lobakin
From: Xuan Zhuo 
Date: Sat, 16 Jan 2021 10:44:53 +0800

> This patch is used to construct skb based on page to save memory copy
> overhead.
> 
> This has one problem:
> 
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
> 
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, 
> opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> : 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 0030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> 
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
> 
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
> 
>  Performance Testing 
> 
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t  -S -s 
> ```
> 
> Test result data:
> 
> size    64      512     1024    1500
> copy    1916747 1775988 1600203 1440054
> page    1974058 1953655 1945463 1904478
> percent 3.0%    10.0%   21.58%  32.3%
> 
> Signed-off-by: Xuan Zhuo 
> Reviewed-by: Dust Li 
> ---
>  drivers/net/virtio_net.c|   2 +-
>  include/linux/netdev_features.h |   5 +-
>  net/ethtool/common.c|   1 +
>  net/xdp/xsk.c   | 108 
> +---
>  4 files changed, 97 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4ecccb8..841a331 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>   /* Set up network device as normal. */
>   dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
>   dev->netdev_ops = &virtnet_netdev;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>  
>   dev->ethtool_ops = &virtnet_ethtool_ops;
>   SET_NETDEV_DEV(dev, &vdev->dev);
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56..8dd28e2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -85,9 +85,11 @@ enum {
>  
>   NETIF_F_HW_MACSEC_BIT,  /* Offload MACsec operations */
>  
> + NETIF_F_SKB_NO_LINEAR_BIT,  /* Allow skb linear is empty */
> +
>   /*
>* Add your fresh new feature above and remember to update
> -  * netdev_features_strings[] in net/core/ethtool.c and maybe
> +  * netdev_features_strings[] in net/ethtool/common.c and maybe
>* some feature mask #defines below. Please also describe it
>* in Documentation/networking/netdev-features.rst.
>*/
> @@ -157,6 +159,7 @@ enum {
>  #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
>  #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
>  #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>  
>  /* Finds the next feature with the highest number of the range of start till 
> 0.
>   */
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 24036e3..2f3d309 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -68,6 +68,7 @@
>   [NETIF_F_HW_TLS_RX_BIT] ="tls-hw-rx-offload",
>   [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
>   [NETIF_F_HW_MACSEC_BIT] ="macsec-hw-offload",
> + [NETIF_F_SKB_NO_LINEAR_BIT] ="skb-no-linear",
>  };
>  
>  const char
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 8037b04..94d17dc 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>   sock_wfree(skb);
>  }
>  
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> +   struct xdp_desc *desc)
> +{
> + u32 len, offset, copy, copied;
> + struct sk_buff *skb;
> + struct page *page;
> + char *buffer;
> 

Re: [PATCH bpf-next] xsk: build skb by page

2021-01-18 Thread Magnus Karlsson
On Mon, Jan 18, 2021 at 5:38 PM Alexander Lobakin  wrote:
>
> > From: Magnus Karlsson 
> > Date: Mon, 18 Jan 2021 16:10:40 +0100
> >
> > On Mon, Jan 18, 2021 at 3:47 PM Alexander Lobakin  wrote:
> > >
> > > From: Alexander Lobakin 
> > > Date: Mon, 18 Jan 2021 13:00:17 +
> > >
> > > > From: Yunsheng Lin 
> > > > Date: Mon, 18 Jan 2021 20:40:52 +0800
> > > >
> > > >> On 2021/1/16 10:44, Xuan Zhuo wrote:
> > > >>> This patch is used to construct skb based on page to save memory copy
> > > >>> overhead.
> > > >>>
> > > >>> This has one problem:
> > > >>>
> > > >>> We construct the skb by fill the data page as a frag into the skb. In
> > > >>> this way, the linear space is empty, and the header information is 
> > > >>> also
> > > >>> in the frag, not in the linear space, which is not allowed for some
> > > >>> network cards. For example, Mellanox Technologies MT27710 Family
> > > >>> [ConnectX-4 Lx] will get the following error message:
> > > >>>
> > > >>> mlx5_core :3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 
> > > >>> 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> > > >>> : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> 0030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> > > >>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> > > >>> : 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> > > >>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> 0020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> > > >>> 0030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> mlx5_core :3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> > > >>>
> > > >>> I also tried to use build_skb to construct skb, but because of the
> > > >>> existence of skb_shinfo, it must be behind the linear space, so this
> > > >>> method is not working. We can't put skb_shinfo on desc->addr, it will 
> > > >>> be
> > > >>> exposed to users, this is not safe.
> > > >>>
> > > >>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether 
> > > >>> the
> > > >>
> > > >> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> > > >> configure if the data is copied or not?
> > > >
> > > > As far as I can grep, only mlx4 supports this, and it has a different
> > > > meaning in that driver.
> > > > So I guess a new netdev_feature would be a better solution.
> > > >
> > > >>> network card supports the header information of the packet in the frag
> > > >>> and not in the linear space.
> > > >>>
> > > >>>  Performance Testing 
> > > >>>
> > > >>> The test environment is Aliyun ECS server.
> > > >>> Test cmd:
> > > >>> ```
> > > >>> xdpsock -i eth0 -t  -S -s 
> > > >>> ```
> > > >>>
> > > >>> Test result data:
> > > >>>
> > > > >>> size    64      512     1024    1500
> > > > >>> copy    1916747 1775988 1600203 1440054
> > > > >>> page    1974058 1953655 1945463 1904478
> > > > >>> percent 3.0%    10.0%   21.58%  32.3%
> > > >>>
> > > >>> Signed-off-by: Xuan Zhuo 
> > > >>> Reviewed-by: Dust Li 
> > > >>> ---
> > > >>>  drivers/net/virtio_net.c|   2 +-
> > > >>>  include/linux/netdev_features.h |   5 +-
> > > >>>  net/ethtool/common.c|   1 +
> > > >>>  net/xdp/xsk.c   | 108 
> > > >>> +---
> > > >>>  4 files changed, 97 insertions(+), 19 deletions(-)
> > > >>>
> > > >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > >>> index 4ecccb8..841a331 100644
> > > >>> --- a/drivers/net/virtio_net.c
> > > >>> +++ b/drivers/net/virtio_net.c
> > > >>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device 
> > > >>> *vdev)
> > > >>> /* Set up network device as normal. */
> > > >>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> > > >>> dev->netdev_ops = _netdev;
> > > >>> -   dev->features = NETIF_F_HIGHDMA;
> > > >>> +   dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
> > > >>>
> > > >>> dev->ethtool_ops = _ethtool_ops;
> > > >>> SET_NETDEV_DEV(dev, >dev);
> > > >>> diff --git a/include/linux/netdev_features.h 
> > > >>> b/include/linux/netdev_features.h
> > > >>> index 934de56..8dd28e2 100644
> > > >>> --- a/include/linux/netdev_features.h
> > > >>> +++ b/include/linux/netdev_features.h
> > > >>> @@ -85,9 +85,11 @@ enum {
> > > >>>
> > > >>> NETIF_F_HW_MACSEC_BIT,  /* Offload MACsec operations */
> > > >>>
> > > >>> +   NETIF_F_SKB_NO_LINEAR_BIT,  /* Allow skb linear is empty */
> > > >>> +
> > > >>> /*
> > > >>>  * Add your fresh new feature above and remember to update
> > > >>> -* netdev_features_strings[] in net/core/ethtool.c and maybe
> > > >>> +* netdev_features_strings[] in net/ethtool/common.c and maybe
> > > >>>  * some feature mask 

Re: [PATCH bpf-next] xsk: build skb by page

2021-01-18 Thread Alexander Lobakin
> From: Magnus Karlsson 
> Date: Mon, 18 Jan 2021 16:10:40 +0100
> 
> On Mon, Jan 18, 2021 at 3:47 PM Alexander Lobakin  wrote:
> >
> > From: Alexander Lobakin 
> > Date: Mon, 18 Jan 2021 13:00:17 +
> >
> > > From: Yunsheng Lin 
> > > Date: Mon, 18 Jan 2021 20:40:52 +0800
> > >
> > >> On 2021/1/16 10:44, Xuan Zhuo wrote:
> > >>> This patch is used to construct skb based on page to save memory copy
> > >>> overhead.
> > >>>
> > >>> This has one problem:
> > >>>
> > >>> We construct the skb by fill the data page as a frag into the skb. In
> > >>> this way, the linear space is empty, and the header information is also
> > >>> in the frag, not in the linear space, which is not allowed for some
> > >>> network cards. For example, Mellanox Technologies MT27710 Family
> > >>> [ConnectX-4 Lx] will get the following error message:
> > >>>
> > >>> mlx5_core :3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 
> > >>> 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> > >>> : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> 0030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> > >>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> > >>> : 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> > >>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> 0020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> > >>> 0030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> mlx5_core :3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> > >>>
> > >>> I also tried to use build_skb to construct skb, but because of the
> > >>> existence of skb_shinfo, it must be behind the linear space, so this
> > >>> method is not working. We can't put skb_shinfo on desc->addr, it will be
> > >>> exposed to users, this is not safe.
> > >>>
> > >>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> > >>
> > >> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> > >> configure if the data is copied or not?
> > >
> > > As far as I can grep, only mlx4 supports this, and it has a different
> > > meaning in that driver.
> > > So I guess a new netdev_feature would be a better solution.
> > >
> > >>> network card supports the header information of the packet in the frag
> > >>> and not in the linear space.
> > >>>
> > >>>  Performance Testing 
> > >>>
> > >>> The test environment is Aliyun ECS server.
> > >>> Test cmd:
> > >>> ```
> > >>> xdpsock -i eth0 -t  -S -s 
> > >>> ```
> > >>>
> > >>> Test result data:
> > >>>
> > > >>> size    64      512     1024    1500
> > > >>> copy    1916747 1775988 1600203 1440054
> > > >>> page    1974058 1953655 1945463 1904478
> > > >>> percent 3.0%    10.0%   21.58%  32.3%
> > >>>
> > >>> Signed-off-by: Xuan Zhuo 
> > >>> Reviewed-by: Dust Li 
> > >>> ---
> > >>>  drivers/net/virtio_net.c|   2 +-
> > >>>  include/linux/netdev_features.h |   5 +-
> > >>>  net/ethtool/common.c|   1 +
> > >>>  net/xdp/xsk.c   | 108 
> > >>> +---
> > >>>  4 files changed, 97 insertions(+), 19 deletions(-)
> > >>>
> > >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > >>> index 4ecccb8..841a331 100644
> > >>> --- a/drivers/net/virtio_net.c
> > >>> +++ b/drivers/net/virtio_net.c
> > >>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device 
> > >>> *vdev)
> > >>> /* Set up network device as normal. */
> > >>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> > >>> dev->netdev_ops = _netdev;
> > >>> -   dev->features = NETIF_F_HIGHDMA;
> > >>> +   dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
> > >>>
> > >>> dev->ethtool_ops = _ethtool_ops;
> > >>> SET_NETDEV_DEV(dev, >dev);
> > >>> diff --git a/include/linux/netdev_features.h 
> > >>> b/include/linux/netdev_features.h
> > >>> index 934de56..8dd28e2 100644
> > >>> --- a/include/linux/netdev_features.h
> > >>> +++ b/include/linux/netdev_features.h
> > >>> @@ -85,9 +85,11 @@ enum {
> > >>>
> > >>> NETIF_F_HW_MACSEC_BIT,  /* Offload MACsec operations */
> > >>>
> > >>> +   NETIF_F_SKB_NO_LINEAR_BIT,  /* Allow skb linear is empty */
> > >>> +
> > >>> /*
> > >>>  * Add your fresh new feature above and remember to update
> > >>> -* netdev_features_strings[] in net/core/ethtool.c and maybe
> > >>> +* netdev_features_strings[] in net/ethtool/common.c and maybe
> > >>>  * some feature mask #defines below. Please also describe it
> > >>>  * in Documentation/networking/netdev-features.rst.
> > >>>  */
> > >>> @@ -157,6 +159,7 @@ enum {
> > >>>  #define NETIF_F_GRO_FRAGLIST   __NETIF_F(GRO_FRAGLIST)
> > >>>  #define NETIF_F_GSO_FRAGLIST   __NETIF_F(GSO_FRAGLIST)
> > >>>  #define 

Re: [PATCH bpf-next] xsk: build skb by page

2021-01-18 Thread Magnus Karlsson
On Mon, Jan 18, 2021 at 3:47 PM Alexander Lobakin  wrote:
>
> From: Alexander Lobakin 
> Date: Mon, 18 Jan 2021 13:00:17 +
>
> > From: Yunsheng Lin 
> > Date: Mon, 18 Jan 2021 20:40:52 +0800
> >
> >> On 2021/1/16 10:44, Xuan Zhuo wrote:
> >>> This patch is used to construct skb based on page to save memory copy
> >>> overhead.
> >>>
> >>> This has one problem:
> >>>
> >>> We construct the skb by fill the data page as a frag into the skb. In
> >>> this way, the linear space is empty, and the header information is also
> >>> in the frag, not in the linear space, which is not allowed for some
> >>> network cards. For example, Mellanox Technologies MT27710 Family
> >>> [ConnectX-4 Lx] will get the following error message:
> >>>
> >>> mlx5_core :3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 
> >>> 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> >>> : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 0030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> >>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> >>> : 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> >>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 0020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> >>> 0030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> mlx5_core :3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> >>>
> >>> I also tried to use build_skb to construct skb, but because of the
> >>> existence of skb_shinfo, it must be behind the linear space, so this
> >>> method is not working. We can't put skb_shinfo on desc->addr, it will be
> >>> exposed to users, this is not safe.
> >>>
> >>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> >>
> >> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> >> configure if the data is copied or not?
> >
> > As far as I can grep, only mlx4 supports this, and it has a different
> > meaning in that driver.
> > So I guess a new netdev_feature would be a better solution.
> >
> >>> network card supports the header information of the packet in the frag
> >>> and not in the linear space.
> >>>
> >>>  Performance Testing 
> >>>
> >>> The test environment is Aliyun ECS server.
> >>> Test cmd:
> >>> ```
> >>> xdpsock -i eth0 -t  -S -s 
> >>> ```
> >>>
> >>> Test result data:
> >>>
> > >>> size    64      512     1024    1500
> > >>> copy    1916747 1775988 1600203 1440054
> > >>> page    1974058 1953655 1945463 1904478
> > >>> percent 3.0%    10.0%   21.58%  32.3%
> >>>
> >>> Signed-off-by: Xuan Zhuo 
> >>> Reviewed-by: Dust Li 
> >>> ---
> >>>  drivers/net/virtio_net.c|   2 +-
> >>>  include/linux/netdev_features.h |   5 +-
> >>>  net/ethtool/common.c|   1 +
> >>>  net/xdp/xsk.c   | 108 
> >>> +---
> >>>  4 files changed, 97 insertions(+), 19 deletions(-)
> >>>
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index 4ecccb8..841a331 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> >>> /* Set up network device as normal. */
> >>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> >>> dev->netdev_ops = _netdev;
> >>> -   dev->features = NETIF_F_HIGHDMA;
> >>> +   dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
> >>>
> >>> dev->ethtool_ops = _ethtool_ops;
> >>> SET_NETDEV_DEV(dev, >dev);
> >>> diff --git a/include/linux/netdev_features.h 
> >>> b/include/linux/netdev_features.h
> >>> index 934de56..8dd28e2 100644
> >>> --- a/include/linux/netdev_features.h
> >>> +++ b/include/linux/netdev_features.h
> >>> @@ -85,9 +85,11 @@ enum {
> >>>
> >>> NETIF_F_HW_MACSEC_BIT,  /* Offload MACsec operations */
> >>>
> >>> +   NETIF_F_SKB_NO_LINEAR_BIT,  /* Allow skb linear is empty */
> >>> +
> >>> /*
> >>>  * Add your fresh new feature above and remember to update
> >>> -* netdev_features_strings[] in net/core/ethtool.c and maybe
> >>> +* netdev_features_strings[] in net/ethtool/common.c and maybe
> >>>  * some feature mask #defines below. Please also describe it
> >>>  * in Documentation/networking/netdev-features.rst.
> >>>  */
> >>> @@ -157,6 +159,7 @@ enum {
> >>>  #define NETIF_F_GRO_FRAGLIST   __NETIF_F(GRO_FRAGLIST)
> >>>  #define NETIF_F_GSO_FRAGLIST   __NETIF_F(GSO_FRAGLIST)
> >>>  #define NETIF_F_HW_MACSEC  __NETIF_F(HW_MACSEC)
> >>> +#define NETIF_F_SKB_NO_LINEAR  __NETIF_F(SKB_NO_LINEAR)
> >>>
> >>>  /* Finds the next feature with the highest number of the range of start 
> >>> till 0.
> >>>   */
> >>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> >>> index 

Re: [PATCH bpf-next] xsk: build skb by page

2021-01-18 Thread Magnus Karlsson
On Mon, Jan 18, 2021 at 3:47 PM Alexander Lobakin  wrote:
>
> From: Alexander Lobakin 
> Date: Mon, 18 Jan 2021 13:00:17 +
>
> > From: Yunsheng Lin 
> > Date: Mon, 18 Jan 2021 20:40:52 +0800
> >
> >> On 2021/1/16 10:44, Xuan Zhuo wrote:
> >>> This patch is used to construct skb based on page to save memory copy
> >>> overhead.
> >>>
> >>> This has one problem:
> >>>
> >>> We construct the skb by fill the data page as a frag into the skb. In
> >>> this way, the linear space is empty, and the header information is also
> >>> in the frag, not in the linear space, which is not allowed for some
> >>> network cards. For example, Mellanox Technologies MT27710 Family
> >>> [ConnectX-4 Lx] will get the following error message:
> >>>
> >>> mlx5_core :3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 
> >>> 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> >>> : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 0030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> >>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> >>> : 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> >>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 0020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> >>> 0030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> mlx5_core :3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> >>>
> >>> I also tried to use build_skb to construct skb, but because of the
> >>> existence of skb_shinfo, it must be behind the linear space, so this
> >>> method is not working. We can't put skb_shinfo on desc->addr, it will be
> >>> exposed to users, this is not safe.
> >>>
> >>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> >>
> >> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> >> configure if the data is copied or not?
> >
> > As far as I can grep, only mlx4 supports this, and it has a different
> > meaning in that driver.
> > So I guess a new netdev_feature would be a better solution.
> >
> >>> network card supports the header information of the packet in the frag
> >>> and not in the linear space.
> >>>
> >>>  Performance Testing 
> >>>
> >>> The test environment is Aliyun ECS server.
> >>> Test cmd:
> >>> ```
> >>> xdpsock -i eth0 -t  -S -s 
> >>> ```
> >>>
> >>> Test result data:
> >>>
> > >>> size    64      512     1024    1500
> > >>> copy    1916747 1775988 1600203 1440054
> > >>> page    1974058 1953655 1945463 1904478
> > >>> percent 3.0%    10.0%   21.58%  32.3%
> >>>
> >>> Signed-off-by: Xuan Zhuo 
> >>> Reviewed-by: Dust Li 
> >>> ---
> >>>  drivers/net/virtio_net.c|   2 +-
> >>>  include/linux/netdev_features.h |   5 +-
> >>>  net/ethtool/common.c|   1 +
> >>>  net/xdp/xsk.c   | 108 
> >>> +---
> >>>  4 files changed, 97 insertions(+), 19 deletions(-)
> >>>
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index 4ecccb8..841a331 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> >>> /* Set up network device as normal. */
> >>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> >>> dev->netdev_ops = _netdev;
> >>> -   dev->features = NETIF_F_HIGHDMA;
> >>> +   dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
> >>>
> >>> dev->ethtool_ops = _ethtool_ops;
> >>> SET_NETDEV_DEV(dev, >dev);
> >>> diff --git a/include/linux/netdev_features.h 
> >>> b/include/linux/netdev_features.h
> >>> index 934de56..8dd28e2 100644
> >>> --- a/include/linux/netdev_features.h
> >>> +++ b/include/linux/netdev_features.h
> >>> @@ -85,9 +85,11 @@ enum {
> >>>
> >>> NETIF_F_HW_MACSEC_BIT,  /* Offload MACsec operations */
> >>>
> >>> +   NETIF_F_SKB_NO_LINEAR_BIT,  /* Allow skb linear is empty */
> >>> +
> >>> /*
> >>>  * Add your fresh new feature above and remember to update
> >>> -* netdev_features_strings[] in net/core/ethtool.c and maybe
> >>> +* netdev_features_strings[] in net/ethtool/common.c and maybe
> >>>  * some feature mask #defines below. Please also describe it
> >>>  * in Documentation/networking/netdev-features.rst.
> >>>  */
> >>> @@ -157,6 +159,7 @@ enum {
> >>>  #define NETIF_F_GRO_FRAGLIST   __NETIF_F(GRO_FRAGLIST)
> >>>  #define NETIF_F_GSO_FRAGLIST   __NETIF_F(GSO_FRAGLIST)
> >>>  #define NETIF_F_HW_MACSEC  __NETIF_F(HW_MACSEC)
> >>> +#define NETIF_F_SKB_NO_LINEAR  __NETIF_F(SKB_NO_LINEAR)
> >>>
> >>>  /* Finds the next feature with the highest number of the range of start 
> >>> till 0.
> >>>   */
> >>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> >>> index 

Re: [PATCH bpf-next] xsk: build skb by page

2021-01-18 Thread Alexander Lobakin
From: Alexander Lobakin 
Date: Mon, 18 Jan 2021 13:00:17 +

> From: Yunsheng Lin 
> Date: Mon, 18 Jan 2021 20:40:52 +0800
>
>> On 2021/1/16 10:44, Xuan Zhuo wrote:
>>> This patch is used to construct skb based on page to save memory copy
>>> overhead.
>>>
>>> This has one problem:
>>>
>>> We construct the skb by fill the data page as a frag into the skb. In
>>> this way, the linear space is empty, and the header information is also
>>> in the frag, not in the linear space, which is not allowed for some
>>> network cards. For example, Mellanox Technologies MT27710 Family
>>> [ConnectX-4 Lx] will get the following error message:
>>>
>>> mlx5_core :3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, 
>>> opcode 0xd, syndrome 0x1, vendor syndrome 0x68
>>> : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> 0030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
>>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
>>> : 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
>>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> 0020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
>>> 0030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> mlx5_core :3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>>>
>>> I also tried to use build_skb to construct skb, but because of the
>>> existence of skb_shinfo, it must be behind the linear space, so this
>>> method is not working. We can't put skb_shinfo on desc->addr, it will be
>>> exposed to users, this is not safe.
>>>
>>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
>>
>> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
>> configure if the data is copied or not?
>
> As far as I can grep, only mlx4 supports this, and it has a different
> meaning in that driver.
> So I guess a new netdev_feature would be a better solution.
>
>>> network card supports the header information of the packet in the frag
>>> and not in the linear space.
>>>
>>>  Performance Testing 
>>>
>>> The test environment is Aliyun ECS server.
>>> Test cmd:
>>> ```
>>> xdpsock -i eth0 -t  -S -s 
>>> ```
>>>
>>> Test result data:
>>>
>>> size    64      512     1024    1500
>>> copy    1916747 1775988 1600203 1440054
>>> page    1974058 1953655 1945463 1904478
>>> percent 3.0%    10.0%   21.58%  32.3%
>>>
>>> Signed-off-by: Xuan Zhuo 
>>> Reviewed-by: Dust Li 
>>> ---
>>>  drivers/net/virtio_net.c|   2 +-
>>>  include/linux/netdev_features.h |   5 +-
>>>  net/ethtool/common.c|   1 +
>>>  net/xdp/xsk.c   | 108 
>>> +---
>>>  4 files changed, 97 insertions(+), 19 deletions(-)
>>> 
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 4ecccb8..841a331 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>>> /* Set up network device as normal. */
>>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
>>> dev->netdev_ops = _netdev;
>>> -   dev->features = NETIF_F_HIGHDMA;
>>> +   dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>>>  
>>> dev->ethtool_ops = _ethtool_ops;
>>> SET_NETDEV_DEV(dev, >dev);
>>> diff --git a/include/linux/netdev_features.h 
>>> b/include/linux/netdev_features.h
>>> index 934de56..8dd28e2 100644
>>> --- a/include/linux/netdev_features.h
>>> +++ b/include/linux/netdev_features.h
>>> @@ -85,9 +85,11 @@ enum {
>>>  
>>> NETIF_F_HW_MACSEC_BIT,  /* Offload MACsec operations */
>>>  
>>> +   NETIF_F_SKB_NO_LINEAR_BIT,  /* Allow skb linear is empty */
>>> +
>>> /*
>>>  * Add your fresh new feature above and remember to update
>>> -* netdev_features_strings[] in net/core/ethtool.c and maybe
>>> +* netdev_features_strings[] in net/ethtool/common.c and maybe
>>>  * some feature mask #defines below. Please also describe it
>>>  * in Documentation/networking/netdev-features.rst.
>>>  */
>>> @@ -157,6 +159,7 @@ enum {
>>>  #define NETIF_F_GRO_FRAGLIST   __NETIF_F(GRO_FRAGLIST)
>>>  #define NETIF_F_GSO_FRAGLIST   __NETIF_F(GSO_FRAGLIST)
>>>  #define NETIF_F_HW_MACSEC  __NETIF_F(HW_MACSEC)
>>> +#define NETIF_F_SKB_NO_LINEAR  __NETIF_F(SKB_NO_LINEAR)
>>>  
>>>  /* Finds the next feature with the highest number of the range of start 
>>> till 0.
>>>   */
>>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
>>> index 24036e3..2f3d309 100644
>>> --- a/net/ethtool/common.c
>>> +++ b/net/ethtool/common.c
>>> @@ -68,6 +68,7 @@
>>> [NETIF_F_HW_TLS_RX_BIT] ="tls-hw-rx-offload",
>>> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
>>> [NETIF_F_HW_MACSEC_BIT] ="macsec-hw-offload",
>>> 

Re: [PATCH bpf-next] xsk: build skb by page

2021-01-18 Thread Alexander Lobakin
From: Yunsheng Lin 
Date: Mon, 18 Jan 2021 20:40:52 +0800

> On 2021/1/16 10:44, Xuan Zhuo wrote:
>> This patch is used to construct skb based on page to save memory copy
>> overhead.
>>
>> This has one problem:
>>
>> We construct the skb by fill the data page as a frag into the skb. In
>> this way, the linear space is empty, and the header information is also
>> in the frag, not in the linear space, which is not allowed for some
>> network cards. For example, Mellanox Technologies MT27710 Family
>> [ConnectX-4 Lx] will get the following error message:
>>
>> mlx5_core :3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, 
>> opcode 0xd, syndrome 0x1, vendor syndrome 0x68
>> : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 0030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
>> : 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
>> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 0020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
>> 0030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> mlx5_core :3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>>
>> I also tried to use build_skb to construct skb, but because of the
>> existence of skb_shinfo, it must be behind the linear space, so this
>> method is not working. We can't put skb_shinfo on desc->addr, it will be
>> exposed to users, this is not safe.
>>
>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
>
> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> configure if the data is copied or not?

As far as I can grep, only mlx4 supports this, and it has a different
meaning in that driver.
So I guess a new netdev_feature would be a better solution.

>> network card supports the header information of the packet in the frag
>> and not in the linear space.
>>
>>  Performance Testing 
>>
>> The test environment is Aliyun ECS server.
>> Test cmd:
>> ```
>> xdpsock -i eth0 -t  -S -s 
>> ```
>>
>> Test result data:
>>
>> size    64      512     1024    1500
>> copy    1916747 1775988 1600203 1440054
>> page    1974058 1953655 1945463 1904478
>> percent 3.0%    10.0%   21.58%  32.3%
>>
>> Signed-off-by: Xuan Zhuo 
>> Reviewed-by: Dust Li 
>> ---
>>  drivers/net/virtio_net.c|   2 +-
>>  include/linux/netdev_features.h |   5 +-
>>  net/ethtool/common.c|   1 +
>>  net/xdp/xsk.c   | 108 
>> +---
>>  4 files changed, 97 insertions(+), 19 deletions(-)
>> 
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 4ecccb8..841a331 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>>  /* Set up network device as normal. */
>>  dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
>>  dev->netdev_ops = _netdev;
>> -dev->features = NETIF_F_HIGHDMA;
>> +dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>>  
>>  dev->ethtool_ops = _ethtool_ops;
>>  SET_NETDEV_DEV(dev, >dev);
>> diff --git a/include/linux/netdev_features.h 
>> b/include/linux/netdev_features.h
>> index 934de56..8dd28e2 100644
>> --- a/include/linux/netdev_features.h
>> +++ b/include/linux/netdev_features.h
>> @@ -85,9 +85,11 @@ enum {
>>  
>>  NETIF_F_HW_MACSEC_BIT,  /* Offload MACsec operations */
>>  
>> +NETIF_F_SKB_NO_LINEAR_BIT,  /* Allow skb linear is empty */
>> +
>>  /*
>>   * Add your fresh new feature above and remember to update
>> - * netdev_features_strings[] in net/core/ethtool.c and maybe
>> + * netdev_features_strings[] in net/ethtool/common.c and maybe
>>   * some feature mask #defines below. Please also describe it
>>   * in Documentation/networking/netdev-features.rst.
>>   */
>> @@ -157,6 +159,7 @@ enum {
>>  #define NETIF_F_GRO_FRAGLIST__NETIF_F(GRO_FRAGLIST)
>>  #define NETIF_F_GSO_FRAGLIST__NETIF_F(GSO_FRAGLIST)
>>  #define NETIF_F_HW_MACSEC   __NETIF_F(HW_MACSEC)
>> +#define NETIF_F_SKB_NO_LINEAR   __NETIF_F(SKB_NO_LINEAR)
>>  
>>  /* Finds the next feature with the highest number of the range of start 
>> till 0.
>>   */
>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
>> index 24036e3..2f3d309 100644
>> --- a/net/ethtool/common.c
>> +++ b/net/ethtool/common.c
>> @@ -68,6 +68,7 @@
>>  [NETIF_F_HW_TLS_RX_BIT] ="tls-hw-rx-offload",
>>  [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
>>  [NETIF_F_HW_MACSEC_BIT] ="macsec-hw-offload",
>> +[NETIF_F_SKB_NO_LINEAR_BIT] ="skb-no-linear",

I completely forgot to add that you'd better mention in both the
enumeration/feature and its Ethtool string 

Re: [PATCH bpf-next] xsk: build skb by page

2021-01-18 Thread Alexander Lobakin
From: Xuan Zhuo 
Date: Sat, 16 Jan 2021 10:44:53 +0800

> This patch is used to construct skb based on page to save memory copy
> overhead.
> 
> This has one problem:
> 
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
> 
> mlx5_core :3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, 
> opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> : 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 0030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core :3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> 
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
> 
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
> 
>  Performance Testing 
> 
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t  -S -s 
> ```
> 
> Test result data:
> 
> size     64      512     1024    1500
> copy     1916747 1775988 1600203 1440054
> page     1974058 1953655 1945463 1904478
> percent  3.0%    10.0%   21.58%  32.3%
> 
> Signed-off-by: Xuan Zhuo 
> Reviewed-by: Dust Li 
> ---
>  drivers/net/virtio_net.c|   2 +-
>  include/linux/netdev_features.h |   5 +-
>  net/ethtool/common.c|   1 +
>  net/xdp/xsk.c   | 108 
> +---
>  4 files changed, 97 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4ecccb8..841a331 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>   /* Set up network device as normal. */
>   dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
>   dev->netdev_ops = _netdev;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>  
>   dev->ethtool_ops = _ethtool_ops;
>   SET_NETDEV_DEV(dev, >dev);
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56..8dd28e2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -85,9 +85,11 @@ enum {
>  
>   NETIF_F_HW_MACSEC_BIT,  /* Offload MACsec operations */
>  
> + NETIF_F_SKB_NO_LINEAR_BIT,  /* Allow skb linear is empty */
> +
>   /*
>* Add your fresh new feature above and remember to update
> -  * netdev_features_strings[] in net/core/ethtool.c and maybe
> +  * netdev_features_strings[] in net/ethtool/common.c and maybe
>* some feature mask #defines below. Please also describe it
>* in Documentation/networking/netdev-features.rst.
>*/
> @@ -157,6 +159,7 @@ enum {
>  #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
>  #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
>  #define NETIF_F_HW_MACSEC__NETIF_F(HW_MACSEC)
> +#define NETIF_F_SKB_NO_LINEAR__NETIF_F(SKB_NO_LINEAR)
>  
>  /* Finds the next feature with the highest number of the range of start till 
> 0.
>   */
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 24036e3..2f3d309 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -68,6 +68,7 @@
>   [NETIF_F_HW_TLS_RX_BIT] ="tls-hw-rx-offload",
>   [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
>   [NETIF_F_HW_MACSEC_BIT] ="macsec-hw-offload",
> + [NETIF_F_SKB_NO_LINEAR_BIT] ="skb-no-linear",
>  };
>  
>  const char

I think it would be best to split this patch into three:
 - the first one will introduce NETIF_F_SKB_NO_LINEAR;
 - the second will add this feature to virtio_net;
 - the third will do the rest.

Also, it would be nice if you mentioned (in the cover letter or
in the third patch) that, in order to get a nice boost on non-ZC
XSK xmit, developers can add support for completely non-linear
skbs and advertise this new feature in their drivers. I think
there'll be enough folks 

RE: [PATCH bpf-next] xsk: build skb by page

2020-12-31 Thread John Fastabend
Xuan Zhuo wrote:
> This patch is used to construct skb based on page to save memory copy
> overhead.
> 
> Taking into account the problem of addr unaligned, and the
> possibility of frame size greater than page in the future.
> 
> Signed-off-by: Xuan Zhuo 
> ---
>  net/xdp/xsk.c | 68 
> ---
>  1 file changed, 51 insertions(+), 17 deletions(-)
> 
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index ac4a317..7cab40f 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,55 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>   sock_wfree(skb);
>  }
>  
> +static struct sk_buff *xsk_build_skb_bypage(struct xdp_sock *xs, struct 
> xdp_desc *desc)
> +{
> + char *buffer;
> + u64 addr;
> + u32 len, offset, copy, copied;
> + int err, i;
> + struct page *page;
> + struct sk_buff *skb;
> +
> + skb = sock_alloc_send_skb(>sk, 0, 1, );

Because this is just grabbing an skb, did you consider build_skb?

> + if (unlikely(!skb))
> + return NULL;

I think it would be best to push err back to caller here with ERR_PTR().

> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);

Is it obvious why this get_page() is needed? Maybe a small comment would
be nice. Something like, "we need to inc refcnt on page to ensure skb
does not release page from pool".

> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +

nit: take it or leave it, seems like a lot of new lines imo. I would
just put all these together. Not really important though.

> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, >sk.sk_wmem_alloc);
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
>  static int xsk_generic_xmit(struct sock *sk)
>  {
>   struct xdp_sock *xs = xdp_sk(sk);
> @@ -445,40 +494,25 @@ static int xsk_generic_xmit(struct sock *sk)
>   goto out;
>  
>   while (xskq_cons_peek_desc(xs->tx, , xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
>   if (max_batch-- == 0) {
>   err = -EAGAIN;
>   goto out;
>   }
>  
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, );
> + skb = xsk_build_skb_bypage(xs, );
>   if (unlikely(!skb))

Is err set here? Either way if skb is an ERR_PTR we can use that
here for better error handling.

>   goto out;
>  
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
>   /* This is the backpressure mechanism for the Tx path.
>* Reserve space in the completion queue and only proceed
>* if there is space in it. This avoids having to implement
>* any buffering in the Tx path.
>*/
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
>   kfree_skb(skb);

Same here: do we need to set err now that it's no longer set explicitly
above by err = skb_store_bits...

>   goto out;
>   }
>  
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
>   err = __dev_direct_xmit(skb, xs->queue_id);
>   if  (err == NETDEV_TX_BUSY) {
>   /* Tell user-space to retry the send */
> -- 
> 1.8.3.1
> 


[PATCH bpf-next] xsk: build skb by page

2020-12-29 Thread Xuan Zhuo
This patch is used to construct skb based on page to save memory copy
overhead.

It takes into account unaligned addresses and the possibility of
frame sizes greater than a page in the future.

The test environment is Aliyun ECS server.
Test cmd:
```
xdpsock -i eth0 -t  -S -s 
```

Test result data:

size     64      512     1024    1500
copy     1916747 1775988 1600203 1440054
page     1974058 1953655 1945463 1904478
percent  3.0%    10.0%   21.58%  32.3%

Signed-off-by: Xuan Zhuo 
---
 net/xdp/xsk.c | 68 ---
 1 file changed, 51 insertions(+), 17 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ac4a317..7cab40f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -430,6 +430,55 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
 }
 
+static struct sk_buff *xsk_build_skb_bypage(struct xdp_sock *xs, struct 
xdp_desc *desc)
+{
+   char *buffer;
+   u64 addr;
+   u32 len, offset, copy, copied;
+   int err, i;
+   struct page *page;
+   struct sk_buff *skb;
+
+   skb = sock_alloc_send_skb(>sk, 0, 1, );
+   if (unlikely(!skb))
+   return NULL;
+
+   addr = desc->addr;
+   len = desc->len;
+
+   buffer = xsk_buff_raw_get_data(xs->pool, addr);
+   offset = offset_in_page(buffer);
+   addr = buffer - (char *)xs->pool->addrs;
+
+   for (copied = 0, i = 0; copied < len; ++i) {
+   page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
+
+   get_page(page);
+
+   copy = min((u32)(PAGE_SIZE - offset), len - copied);
+
+   skb_fill_page_desc(skb, i, page, offset, copy);
+
+   copied += copy;
+   addr += copy;
+   offset = 0;
+   }
+
+   skb->len += len;
+   skb->data_len += len;
+   skb->truesize += len;
+
+   refcount_add(len, >sk.sk_wmem_alloc);
+
+   skb->dev = xs->dev;
+   skb->priority = xs->sk.sk_priority;
+   skb->mark = xs->sk.sk_mark;
+   skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+   skb->destructor = xsk_destruct_skb;
+
+   return skb;
+}
+
 static int xsk_generic_xmit(struct sock *sk)
 {
struct xdp_sock *xs = xdp_sk(sk);
@@ -445,40 +494,25 @@ static int xsk_generic_xmit(struct sock *sk)
goto out;
 
while (xskq_cons_peek_desc(xs->tx, , xs->pool)) {
-   char *buffer;
-   u64 addr;
-   u32 len;
-
if (max_batch-- == 0) {
err = -EAGAIN;
goto out;
}
 
-   len = desc.len;
-   skb = sock_alloc_send_skb(sk, len, 1, );
+   skb = xsk_build_skb_bypage(xs, );
if (unlikely(!skb))
goto out;
 
-   skb_put(skb, len);
-   addr = desc.addr;
-   buffer = xsk_buff_raw_get_data(xs->pool, addr);
-   err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
 * Reserve space in the completion queue and only proceed
 * if there is space in it. This avoids having to implement
 * any buffering in the Tx path.
 */
-   if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+   if (xskq_prod_reserve(xs->pool->cq)) {
kfree_skb(skb);
goto out;
}
 
-   skb->dev = xs->dev;
-   skb->priority = sk->sk_priority;
-   skb->mark = sk->sk_mark;
-   skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
-   skb->destructor = xsk_destruct_skb;
-
err = __dev_direct_xmit(skb, xs->queue_id);
if  (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
-- 
1.8.3.1



Re: [PATCH bpf-next] xsk: build skb by page

2020-12-23 Thread Magnus Karlsson
On Wed, Dec 23, 2020 at 9:57 AM Xuan Zhuo  wrote:
>
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> Taking into account the problem of addr unaligned, and the
> possibility of frame size greater than page in the future.

Thanks Xuan for the patch set. Could you please share performance
numbers so we know how much this buys us? Would be good if you could
produce them for 64 bytes, 1500 bytes and something in the middle so
we can judge the benefits of this.

Please note that responses will be delayed this week and next due to
the Christmas and New Years holidays over here.

> Signed-off-by: Xuan Zhuo 
> ---
>  net/xdp/xsk.c | 68 
> ---
>  1 file changed, 51 insertions(+), 17 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index ac4a317..7cab40f 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,55 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
>  }
>
> +static struct sk_buff *xsk_build_skb_bypage(struct xdp_sock *xs, struct 
> xdp_desc *desc)
> +{
> +   char *buffer;
> +   u64 addr;
> +   u32 len, offset, copy, copied;
> +   int err, i;
> +   struct page *page;
> +   struct sk_buff *skb;
> +
> +   skb = sock_alloc_send_skb(>sk, 0, 1, );
> +   if (unlikely(!skb))
> +   return NULL;
> +
> +   addr = desc->addr;
> +   len = desc->len;
> +
> +   buffer = xsk_buff_raw_get_data(xs->pool, addr);
> +   offset = offset_in_page(buffer);
> +   addr = buffer - (char *)xs->pool->addrs;
> +
> +   for (copied = 0, i = 0; copied < len; ++i) {
> +   page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> +   get_page(page);
> +
> +   copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +
> +   skb_fill_page_desc(skb, i, page, offset, copy);
> +
> +   copied += copy;
> +   addr += copy;
> +   offset = 0;
> +   }
> +
> +   skb->len += len;
> +   skb->data_len += len;
> +   skb->truesize += len;
> +
> +   refcount_add(len, >sk.sk_wmem_alloc);
> +
> +   skb->dev = xs->dev;
> +   skb->priority = xs->sk.sk_priority;
> +   skb->mark = xs->sk.sk_mark;
> +   skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
> +   skb->destructor = xsk_destruct_skb;
> +
> +   return skb;
> +}
> +
>  static int xsk_generic_xmit(struct sock *sk)
>  {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -445,40 +494,25 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, , xs->pool)) {
> -   char *buffer;
> -   u64 addr;
> -   u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> -   len = desc.len;
> -   skb = sock_alloc_send_skb(sk, len, 1, );
> +   skb = xsk_build_skb_bypage(xs, );
> if (unlikely(!skb))
> goto out;
>
> -   skb_put(skb, len);
> -   addr = desc.addr;
> -   buffer = xsk_buff_raw_get_data(xs->pool, addr);
> -   err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
>  * Reserve space in the completion queue and only proceed
>  * if there is space in it. This avoids having to implement
>  * any buffering in the Tx path.
>  */
> -   if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> +   if (xskq_prod_reserve(xs->pool->cq)) {
> kfree_skb(skb);
> goto out;
> }
>
> -   skb->dev = xs->dev;
> -   skb->priority = sk->sk_priority;
> -   skb->mark = sk->sk_mark;
> -   skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> -   skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if  (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 1.8.3.1
>


[PATCH bpf-next] xsk: build skb by page

2020-12-23 Thread Xuan Zhuo
This patch is used to construct skb based on page to save memory copy
overhead.

It takes into account unaligned addresses and the possibility of
frame sizes greater than a page in the future.

Signed-off-by: Xuan Zhuo 
---
 net/xdp/xsk.c | 68 ---
 1 file changed, 51 insertions(+), 17 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ac4a317..7cab40f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -430,6 +430,55 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
 }
 
+static struct sk_buff *xsk_build_skb_bypage(struct xdp_sock *xs, struct 
xdp_desc *desc)
+{
+   char *buffer;
+   u64 addr;
+   u32 len, offset, copy, copied;
+   int err, i;
+   struct page *page;
+   struct sk_buff *skb;
+
+   skb = sock_alloc_send_skb(>sk, 0, 1, );
+   if (unlikely(!skb))
+   return NULL;
+
+   addr = desc->addr;
+   len = desc->len;
+
+   buffer = xsk_buff_raw_get_data(xs->pool, addr);
+   offset = offset_in_page(buffer);
+   addr = buffer - (char *)xs->pool->addrs;
+
+   for (copied = 0, i = 0; copied < len; ++i) {
+   page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
+
+   get_page(page);
+
+   copy = min((u32)(PAGE_SIZE - offset), len - copied);
+
+   skb_fill_page_desc(skb, i, page, offset, copy);
+
+   copied += copy;
+   addr += copy;
+   offset = 0;
+   }
+
+   skb->len += len;
+   skb->data_len += len;
+   skb->truesize += len;
+
+   refcount_add(len, >sk.sk_wmem_alloc);
+
+   skb->dev = xs->dev;
+   skb->priority = xs->sk.sk_priority;
+   skb->mark = xs->sk.sk_mark;
+   skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+   skb->destructor = xsk_destruct_skb;
+
+   return skb;
+}
+
 static int xsk_generic_xmit(struct sock *sk)
 {
struct xdp_sock *xs = xdp_sk(sk);
@@ -445,40 +494,25 @@ static int xsk_generic_xmit(struct sock *sk)
goto out;
 
while (xskq_cons_peek_desc(xs->tx, , xs->pool)) {
-   char *buffer;
-   u64 addr;
-   u32 len;
-
if (max_batch-- == 0) {
err = -EAGAIN;
goto out;
}
 
-   len = desc.len;
-   skb = sock_alloc_send_skb(sk, len, 1, );
+   skb = xsk_build_skb_bypage(xs, );
if (unlikely(!skb))
goto out;
 
-   skb_put(skb, len);
-   addr = desc.addr;
-   buffer = xsk_buff_raw_get_data(xs->pool, addr);
-   err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
 * Reserve space in the completion queue and only proceed
 * if there is space in it. This avoids having to implement
 * any buffering in the Tx path.
 */
-   if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+   if (xskq_prod_reserve(xs->pool->cq)) {
kfree_skb(skb);
goto out;
}
 
-   skb->dev = xs->dev;
-   skb->priority = sk->sk_priority;
-   skb->mark = sk->sk_mark;
-   skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
-   skb->destructor = xsk_destruct_skb;
-
err = __dev_direct_xmit(skb, xs->queue_id);
if  (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
-- 
1.8.3.1