[PATCH net-next v6 06/15] mm: page_frag: add '_va' suffix to page_frag API

2024-06-05 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

We are about to support new use cases in which the caller
needs to deal with 'struct page', or with both 'va' and
'struct page'. In order to differentiate the API handling
between 'va' and 'struct page', add a '_va' suffix to the
corresponding APIs, mirroring the page_pool_alloc_va() API
of the page_pool, so that callers expecting to deal with
va, page, or both va and page may call the page_frag_alloc_va*,
page_frag_alloc_pg*, or page_frag_alloc* APIs accordingly.

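As a quick illustration of the intended split (a rough usage sketch only;
the page-returning '_pg' variants come later in this series and their
exact signatures below are assumptions based on the naming scheme, not
taken from this patch):

	/* caller only cares about the virtual address */
	va = page_frag_alloc_va(&nc, fragsz, GFP_ATOMIC);
	...
	page_frag_free_va(va);

	/* caller wants the struct page instead (assumed '_pg' form) */
	page = page_frag_alloc_pg(&nc, &offset, fragsz, GFP_ATOMIC);
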
CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
	err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbe

[PATCH net-next v6 07/15] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-06-05 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access the 'page_frag_cache' fields directly.

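A minimal sketch of the resulting call pattern, using only helpers that
exist after this patch (illustrative, not a hunk from the patch):

	struct page_frag_cache nc;
	void *buf;

	page_frag_cache_init(&nc);		/* instead of nc.va = NULL */
	buf = page_frag_alloc_va(&nc, len, GFP_KERNEL);
	if (buf && page_frag_cache_is_pfmemalloc(&nc))
		/* instead of reading nc.pfmemalloc directly */;
	page_frag_cache_drain(&nc);		/* instead of open-coding __page_frag_cache_drain() */
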
CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index c6fde197a6eb..6ac3a25089d1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index c3cfce87fbbf..8b259e422fae 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -341,7 +341,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
	atomic_set(&nthreads, 2);
	init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dca4e7445348..caee22db1cc7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
		data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
	rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
	call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
	rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;
 
trace_svcsock_free(svsk, sock);
@@ -1619,8 +1618,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
sockfd_put(sock);

[PATCH net-next v5 07/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-05-28 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access the 'page_frag_cache' fields directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index c6fde197a6eb..6ac3a25089d1 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index bb9f5aa84631..641255136105 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -337,7 +337,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
	atomic_set(&nthreads, 2);
	init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dca4e7445348..caee22db1cc7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
		data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
	rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
	call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
	rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;
 
trace_svcsock_free(svsk, sock);
@@ -1619,8 +1618,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
sockfd_put(sock);

[PATCH net-next v5 06/13] mm: page_frag: add '_va' suffix to page_frag API

2024-05-28 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

We are about to support new use cases in which the caller
needs to deal with 'struct page', or with both 'va' and
'struct page'. In order to differentiate the API handling
between 'va' and 'struct page', add a '_va' suffix to the
corresponding APIs, mirroring the page_pool_alloc_va() API
of the page_pool, so that callers expecting to deal with
va, page, or both va and page may call the page_frag_alloc_va*,
page_frag_alloc_pg*, or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 11 +-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
	err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbe

[RFC v4 06/13] mm: page_frag: add '_va' suffix to page_frag API

2024-05-15 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

We are about to support new use cases in which the caller
needs to deal with 'struct page', or with both 'va' and
'struct page'. In order to differentiate the API handling
between 'va' and 'struct page', add a '_va' suffix to the
corresponding APIs, mirroring the page_pool_alloc_va() API
of the page_pool, so that callers expecting to deal with
va, page, or both va and page may call the page_frag_alloc_va*,
page_frag_alloc_pg*, or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 11 +-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
	err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbe

[RFC v4 07/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-05-15 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access the 'page_frag_cache' fields directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index a5747cf7a3a1..024ff73a7ea4 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index bb9f5aa84631..641255136105 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -337,7 +337,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
	atomic_set(&nthreads, 2);
	init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dca4e7445348..caee22db1cc7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
		data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
	rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
	call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
	rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;
 
trace_svcsock_free(svsk, sock);
@@ -1619,8 +1618,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
sockfd_put(sock);

[PATCH net-next v3 07/13] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-05-08 Thread Yunsheng Lin
Use the appropriate page_frag API instead of having callers
access the 'page_frag_cache' fields directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index a5747cf7a3a1..024ff73a7ea4 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -23,6 +23,16 @@ struct page_frag_cache {
bool pfmemalloc;
 };
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 92eb288aab75..8a974d0588bf 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -329,7 +329,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
	atomic_set(&nthreads, 2);
	init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dca4e7445348..caee22db1cc7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -741,12 +741,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
*dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_bh_enable();
}
 
@@ -834,7 +834,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
		data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
 
if (unlikely(!data))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
*work)
 */
	rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
	call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
	rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;
 
trace_svcsock_free(svsk, sock);
@@ -1619,8 +1618,7 @@ static void svc_sock_free(struct svc_xprt *xprt)
sockfd_put(sock);

[PATCH net-next v3 06/13] mm: page_frag: add '_va' suffix to page_frag API

2024-05-08 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address' or
'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

We are about to support new use cases in which the caller
needs to deal with 'struct page', or with both 'va' and
'struct page'. In order to differentiate the API handling
between 'va' and 'struct page', add a '_va' suffix to the
corresponding APIs, mirroring the page_pool_alloc_va() API
of the page_pool, so that callers expecting to deal with
va, page, or both va and page may call the page_frag_alloc_va*,
page_frag_alloc_pg*, or page_frag_alloc* APIs accordingly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 11 +-
 net/core/skbuff.c | 18 +++
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
	err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbe

Re: [PATCH net-next v2 07/15] mm: page_frag: add '_va' suffix to page_frag API

2024-04-17 Thread Yunsheng Lin
On 2024/4/17 0:12, Alexander H Duyck wrote:
> On Mon, 2024-04-15 at 21:19 +0800, Yunsheng Lin wrote:
>> Currently most of the page_frag API returns a 'virtual address'
>> as output or expects a 'virtual address' as input. In order to
>> differentiate the API handling between 'virtual address' and
>> 'struct page', add a '_va' suffix to the corresponding APIs,
>> mirroring the page_pool_alloc_va() API of the page_pool.
>>
>> Signed-off-by: Yunsheng Lin 
> 
> This patch is a total waste of time. By that logic we should be
> renaming __get_free_pages since it essentially does the same thing.
> 
> This just seems like more code changes for the sake of adding code
> changes rather than fixing anything. In my opinion it should be dropped
> from the set.

The rename is to support the different use cases mentioned below in patch
14:
"Depending on different use cases, callers expecting to deal with va, page or
both va and page for them may call page_frag_alloc_va*, page_frag_alloc_pg*,
or page_frag_alloc* API accordingly."

Naming is hard anyway; I am open to better API naming for the above use cases.

> 
> .
> 



[PATCH net-next v2 07/15] mm: page_frag: add '_va' suffix to page_frag API

2024-04-15 Thread Yunsheng Lin
Currently most of the page_frag API returns a 'virtual address'
as output or expects a 'virtual address' as input. In order to
differentiate the API handling between 'virtual address' and
'struct page', add a '_va' suffix to the corresponding APIs,
mirroring the page_pool_alloc_va() API of the page_pool.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 -
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 24 ++-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 10 
 mm/page_frag_test.c   |  6 ++---
 net/core/skbuff.c | 15 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 ++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 71 insertions(+), 67 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index cd727e55ae0f..820874c1c570 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -687,7 +687,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -700,7 +700,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
	err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index df072ce767b1..c34cc02ad578 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -288,7 +288,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 3161a13079fe..c35b8f675b48 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -303,7 +303,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
 
/* free the skb */
   

[PATCH net-next v1 04/12] mm: page_frag: add '_va' suffix to page_frag API

2024-04-07 Thread Yunsheng Lin
Currently most of the page_frag API returns a 'virtual address'
as output or expects a 'virtual address' as input. In order to
differentiate the API handling between 'virtual address' and
'struct page', add a '_va' suffix to the corresponding APIs,
mirroring the page_pool_alloc_va() API of the page_pool.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 -
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 24 ++-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 10 
 net/core/skbuff.c | 15 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 ++--
 net/sunrpc/svcsock.c  |  4 ++--
 18 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index cd727e55ae0f..820874c1c570 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -687,7 +687,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -700,7 +700,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
	err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index df072ce767b1..c34cc02ad578 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -288,7 +288,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 3161a13079fe..c35b8f675b48 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -303,7 +303,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
 
/* free the skb */
if (ring_is_xdp(tx_ring))
- 

[PATCH RFC 04/10] mm: page_frag: add '_va' suffix to page_frag API

2024-03-28 Thread Yunsheng Lin
Currently most of the page_frag API returns a 'virtual address'
as output or expects a 'virtual address' as input. In order to
differentiate the API handling between 'virtual address' and
'struct page', add a '_va' suffix to the corresponding APIs,
mirroring the page_pool_alloc_va() API of the page_pool.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 -
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 24 ++-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_alloc.c  | 10 
 net/core/skbuff.c | 15 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 ++--
 net/sunrpc/svcsock.c  |  4 ++--
 18 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
b/drivers/net/ethernet/google/gve/gve_rx.c
index 20f5a9e7fae9..58091de93430 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -687,7 +687,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
	if (!frame) {
		u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -700,7 +700,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct 
gve_rx_ring *rx,
 
	err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 97d41d6ebf1f..87f23995b657 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct 
ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
b/drivers/net/ethernet/intel/ice/ice_txrx.h
index af955b0e5dc5..65ad1757824f 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index f8f1d2bdc1be..312f351ac601 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -279,7 +279,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf 
*tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 9c960017a6de..f781c5f202c9 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -303,7 +303,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
 
/* free the skb */
if (ring_is_xdp(tx_ring))
- 

[PATCH net-next v6 4/5] vhost/net: remove vhost_net_page_frag_refill()

2024-02-28 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when tested with the vhost_net_test introduced in the
last patch.

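In sketch form, the conversion collapses the open-coded refill, offset
alignment, and refcount-bias bookkeeping into one aligned allocation from
the per-device cache (taken from the hunks below):

	buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
				    SMP_CACHE_BYTES);
	if (unlikely(!buf))
		return -ENOMEM;
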
Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 91 ++---
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = &net->page_frag;
struct virtio_net_hdr *gso;
	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
	gso = &hdr->gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
   vho

[PATCH net-next v6 5/5] tools: virtio: introduce vhost_net_test

2024-02-28 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to exercise both the
vhost_net tx and rx paths for the vhost_net changes in the kernel.

Steps for vhost_net tx testing:
1. Prepare an out buf.
2. Kick the vhost_net to do tx processing.
3. Do the receiving on the tun side.
4. Verify that the data received by tun is correct.

Steps for vhost_net rx testing:
1. Prepare an in buf.
2. Do the sending on the tun side.
3. Kick the vhost_net to do rx processing.
4. Verify that the data received by vhost_net is correct.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/.gitignore|   1 +
 tools/virtio/Makefile  |   8 +-
 tools/virtio/linux/virtio_config.h |   4 +
 tools/virtio/vhost_net_test.c  | 532 +
 4 files changed, 542 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/linux/virtio_config.h 
b/tools/virtio/linux/virtio_config.h
index 2a8a70e2a950..42a564f22f2d 100644
--- a/tools/virtio/linux/virtio_config.h
+++ b/tools/virtio/linux/virtio_config.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VIRTIO_CONFIG_H
+#define LINUX_VIRTIO_CONFIG_H
 #include 
 #include 
 #include 
@@ -95,3 +97,5 @@ static inline __virtio64 cpu_to_virtio64(struct virtio_device 
*vdev, u64 val)
 {
return __cpu_to_virtio64(virtio_is_little_endian(vdev), val);
 }
+
+#endif
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..389d99a6d7c7
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define HDR_LEN		sizeof(struct virtio_net_hdr_mrg_rxbuf)
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+#define DESC_NUM   256
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev, char *tun_name)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   strncpy(ifr.ifr_name, tun_name, IFNAMSIZ);
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perror("ioctl[SI

[PATCH net-next v6 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-02-28 Thread Yunsheng Lin
Currently there seem to be three page frag implementations
which all try to allocate an order 3 page; if that fails,
they fall back to allocating an order 0 page, and each of
them allows the order 3 page allocation to fail under
certain conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd to forbid access to
the emergency memory reserves in __page_frag_cache_refill(),
but it is not or'd in the other implementations, and
__GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in
vhost_net_page_frag_refill(), but it is not masked off in
__page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
possible pressure on mm.

Leave the gfp unification for the page frag implementation in
sock.c alone for now, as suggested by Paolo Abeni.

Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-- 
2.33.0




Re: [PATCH net-next v5 5/5] tools: virtio: introduce vhost_net_test

2024-02-05 Thread Yunsheng Lin
On 2024/2/6 11:08, Jason Wang wrote:

...

>> +
>> +static void wait_for_interrupt(struct vq_info *vq)
>> +{
>> +   unsigned long long val;
>> +
>> +   poll(>fds, 1, -1);
> 
> It's not good to wait indefinitely.

How about a timeout value of 100ms, as below?
poll(&vq->fds, 1, 100);
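Roughly, the whole helper would then look something like this (just a
sketch of the change being discussed, reusing the test's vq_info layout):

static void wait_for_interrupt(struct vq_info *vq)
{
        unsigned long long val;

        /* bounded wait so a missing interrupt cannot hang the test */
        if (poll(&vq->fds, 1, 100) <= 0)
                return;

        /* drain the call eventfd so the next poll() does not fire early */
        if (vq->fds.revents & POLLIN)
                read(vq->fds.fd, &val, sizeof(val));
}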

> 
>> +
>> +   if (vq->fds.revents & POLLIN)
>> +   read(vq->fds.fd, , sizeof(val));
>> +}
>> +
>> +static void verify_res_buf(char *res_buf)
>> +{
>> +   int i;
>> +
>> +   for (i = ETHER_HDR_LEN; i < TEST_BUF_LEN; i++)
>> +   assert(res_buf[i] == (char)i);
>> +}
>> +
>> +static void run_tx_test(struct vdev_info *dev, struct vq_info *vq,
>> +   bool delayed, int bufs)
>> +{
>> +   long long spurious = 0;
>> +   struct scatterlist sl;
>> +   unsigned int len;
>> +   int r;
>> +
>> +   for (;;) {
>> +   long started_before = vq->started;
>> +   long completed_before = vq->completed;
>> +
>> +   virtqueue_disable_cb(vq->vq);
>> +   do {
>> +   while (vq->started < bufs &&
>> +  (vq->started - vq->completed) < 1) {
>> +   sg_init_one(, dev->test_buf, HDR_LEN + 
>> TEST_BUF_LEN);
>> +   r = virtqueue_add_outbuf(vq->vq, , 1,
>> +dev->test_buf + 
>> vq->started,
>> +GFP_ATOMIC);
>> +   if (unlikely(r != 0))
>> +   break;
>> +
>> +   ++vq->started;
> 
> If we never decrease started/completed shouldn't we use unsigned here?
> (as well as completed)
> 
> Otherwise we may get unexpected results for vq->started as well as
> vq->completed.

We have "vq->started < bufs" checking before the increasing as above,
and there is 'assert(nbufs > 0)' when getting optarg in main(), which
means we never allow started/completed to be greater than nbufs as
my understanding.
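Put differently, the bounds argument amounts to something like this
(illustrative asserts only, reusing the test's names):

        /* started only grows while vq->started < bufs, and completed only
         * grows by one per virtqueue_get_buf() result, so both stay within
         * [0, bufs] and a signed long cannot wrap here. */
        assert(vq->started <= bufs);
        assert(vq->completed <= vq->started);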

> 
>> +
>> +   if (unlikely(!virtqueue_kick(vq->vq))) {
>> +   r = -1;
>> +   break;
>> +   }
>> +   }
>> +
>> +   if (vq->started >= bufs)
>> +   r = -1;
> 
> Which condition do we reach here?

It is also copied & pasted from virtio_test.c.
It means we have finished adding the outbufs to the virtqueue, and 'r'
is set to '-1' so that we can break out of the inner while loop if there
is no result from virtqueue_get_buf(), as I understand it.
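In other words, the inner loop has roughly this shape (a sketch with
comments added, not the exact code):

        do {
                /* add outbufs while there is room and budget left ... */

                if (vq->started >= bufs)
                        r = -1; /* nothing left to add, only drain below */

                /* each completion resets r to 0 so the loop runs again */
                while (virtqueue_get_buf(vq->vq, &len))
                        r = 0;
        } while (r == 0);       /* exit once nothing was added or completed */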

> 
>> +
>> +   /* Flush out completed bufs if any */
>> +   while (virtqueue_get_buf(vq->vq, )) {
>> +   int n;
>> +
>> +   n = recvfrom(dev->sock, dev->res_buf, 
>> TEST_BUF_LEN, 0, NULL, NULL);
>> +   assert(n == TEST_BUF_LEN);
>> +   verify_res_buf(dev->res_buf);
>> +
>> +   ++vq->completed;
>> +   r = 0;
>> +   }
>> +   } while (r == 0);
>> +
>> +   if (vq->completed == completed_before && vq->started == 
>> started_before)
>> +   ++spurious;
>> +
>> +   assert(vq->completed <= bufs);
>> +   assert(vq->started <= bufs);
>> +   if (vq->completed == bufs)
>> +   break;
>> +
>> +   if (delayed) {
>> +   if (virtqueue_enable_cb_delayed(vq->vq))
>> +   wait_for_interrupt(vq);
>> +   } else {
>> +   if (virtqueue_enable_cb(vq->vq))
>> +   wait_for_interrupt(vq);
>> +   }
> 
> This could be simplified with
> 
> if (delayed)
> else
> 
> wait_for_interrupt(vq)

I am not sure I understand the above comment.
wait_for_interrupt() is only called conditionally, depending on the
return value of virtqueue_enable_cb_delayed() or virtqueue_enable_cb().
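If the suggestion is simply to avoid repeating the wait call, one
possible shape would be (untested sketch):

        bool need_wait;

        need_wait = delayed ? virtqueue_enable_cb_delayed(vq->vq) :
                              virtqueue_enable_cb(vq->vq);
        if (need_wait)
                wait_for_interrupt(vq);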

> 
>> +   }
>> +   printf("TX spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n",
>> +  spurious, vq->started, vq->completed);
>> +}
>> +

...

>> +
>> +   /* Flush out completed bufs if any */
>> +   while (virtqueue_get_buf(vq->vq, )) {
>> +   struct ether_header *eh;
>> +
>> +   eh = (struct ether_header *)(dev->res_buf + 
>> HDR_LEN);
>> +
>> +   /* tun netdev is up and running, ignore the
>> +* non-TEST_PTYPE packet.
>> +*/
>> +   if (eh->ether_type != htons(TEST_PTYPE)) {
>> +

[PATCH net-next v5 5/5] tools: virtio: introduce vhost_net_test

2024-02-05 Thread Yunsheng Lin
Introduce vhost_net_test for both vhost_net tx and rx, based
on virtio_test, to test vhost_net changes in the kernel.

Steps for vhost_net tx testing:
1. Prepare an out buf.
2. Kick the vhost_net to do tx processing.
3. Do the receiving in the tun side.
4. Verify that the data received by tun is correct.

Steps for vhost_net rx testing:
1. Prepare an in buf.
2. Do the sending in the tun side.
3. Kick the vhost_net to do rx processing.
4. Verify that the data received by vhost_net is correct.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/.gitignore|   1 +
 tools/virtio/Makefile  |   8 +-
 tools/virtio/linux/virtio_config.h |   4 +
 tools/virtio/vhost_net_test.c  | 536 +
 4 files changed, 546 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/linux/virtio_config.h 
b/tools/virtio/linux/virtio_config.h
index 2a8a70e2a950..42a564f22f2d 100644
--- a/tools/virtio/linux/virtio_config.h
+++ b/tools/virtio/linux/virtio_config.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VIRTIO_CONFIG_H
+#define LINUX_VIRTIO_CONFIG_H
 #include 
 #include 
 #include 
@@ -95,3 +97,5 @@ static inline __virtio64 cpu_to_virtio64(struct virtio_device 
*vdev, u64 val)
 {
return __cpu_to_virtio64(virtio_is_little_endian(vdev), val);
 }
+
+#endif
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..6c41204e6707
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define HDR_LEN        sizeof(struct virtio_net_hdr_mrg_rxbuf)
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+#define DESC_NUM   256
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev, char *tun_name)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   strncpy(ifr.ifr_name, tun_name, IFNAMSIZ);
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perror("ioctl[SI

[PATCH net-next v5 4/5] vhost/net: remove vhost_net_page_frag_refill()

2024-02-05 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also brings about a 0.5%
performance boost in testing with the vhost_net_test
introduced in the last patch.

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 91 ++---
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = >page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = >xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(>pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = >gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
   vho

[PATCH net-next v5 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-02-05 Thread Yunsheng Lin
Currently there seem to be three page frag implementations
which all try to allocate an order 3 page; if that fails,
they fall back to allocating an order 0 page, and each of
them allows the order 3 page allocation to fail under
certain conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd to forbid access to
the emergency memory reserves in __page_frag_cache_refill(),
but it is not or'd in the other implementations, and
__GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in
vhost_net_page_frag_refill(), but it is not masked off in
__page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
possible pressure on mm.

Leave the gfp unification for the page frag implementation in
sock.c alone for now, as suggested by Paolo Abeni.

Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-- 
2.33.0




Re: [PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test

2024-02-03 Thread Yunsheng Lin
On 2024/2/4 9:30, Jason Wang wrote:
> On Fri, Feb 2, 2024 at 8:24 PM Yunsheng Lin  wrote:
>>
>> On 2024/2/2 12:05, Jason Wang wrote:
>>> On Tue, Jan 30, 2024 at 7:38 PM Yunsheng Lin  wrote:
>>>>
>>>> introduce vhost_net_test basing on virtio_test to test
>>>> vhost_net changing in the kernel.
>>>
>>> Let's describe what kind of test is being done and how it is done here.
>>
>> How about something like below:
>>
>> This patch introduces testing for both vhost_net tx and rx.
>> Steps for vhost_net tx testing:
>> 1. Prepare a out buf
>> 2. Kick the vhost_net to do tx processing
>> 3. Do the receiving in the tun side
>> 4. verify the data received by tun is correct
>>
>> Steps for vhost_net rx testing::
>> 1. Prepare a in buf
>> 2. Do the sending in the tun side
>> 3. Kick the vhost_net to do rx processing
>> 4. verify the data received by vhost_net is correct
> 
> It looks like some important details were lost, e.g the logic for batching 
> etc.

I am supposing you are referring to the virtio desc batch handling,
right?

It was copied & pasted from virtio_test.c. I was thinking about removing
the virtio desc batch handling for now, as this patchset does not require
it for the testing; it mainly depends on "sock->sk->sk_sndbuf" being
INT_MAX so that vhost_net_build_xdp() gets called, which seems to be the
default case for vhost_net.

> 
>>

...

>>>> +static void vdev_create_socket(struct vdev_info *dev)
>>>> +{
>>>> +   struct ifreq ifr;
>>>> +
>>>> +   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
>>>> +   assert(dev->sock != -1);
>>>> +
>>>> +   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
>>>
>>> Nit: it might be better to accept the device name instead of repeating
>>> the snprintf trick here, this would facilitate the future changes.
>>
I am not sure I understand what you meant by "accept the device name"
here.

The above is used to get the ifindex of the tun netdevice created in
tun_alloc(), so that we can use it in vdev_send_packet() to send a
packet through that netdevice. Is there anything obvious I missed here?
> 
> I meant a const char *ifname for this function and let the caller to
> pass the name.

Sure.
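Something like the following, presumably (a sketch of that refactor
only; the parameter name is illustrative):

static void vdev_create_socket(struct vdev_info *dev, const char *ifname)
{
        struct ifreq ifr;

        dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
        assert(dev->sock != -1);

        /* the caller passes the tun device name instead of rebuilding it */
        snprintf(ifr.ifr_name, IFNAMSIZ, "%s", ifname);
        assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0);

        dev->ifindex = ifr.ifr_ifindex;
}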

> 
>>

>>>> +
>>>> +static void run_rx_test(struct vdev_info *dev, struct vq_info *vq,
>>>> +   bool delayed, int batch, int bufs)
>>>> +{
>>>> +   const bool random_batch = batch == RANDOM_BATCH;
>>>> +   long long spurious = 0;
>>>> +   struct scatterlist sl;
>>>> +   unsigned int len;
>>>> +   int r;
>>>> +
>>>> +   for (;;) {
>>>> +   long started_before = vq->started;
>>>> +   long completed_before = vq->completed;
>>>> +
>>>> +   do {
>>>> +   if (random_batch)
>>>> +   batch = (random() % vq->vring.num) + 1;
>>>> +
>>>> +   while (vq->started < bufs &&
>>>> +  (vq->started - vq->completed) < batch) {
>>>> +   sg_init_one(, dev->res_buf, HDR_LEN + 
>>>> TEST_BUF_LEN);
>>>> +
>>>> +   r = virtqueue_add_inbuf(vq->vq, , 1,
>>>> +   dev->res_buf + 
>>>> vq->started,
>>>> +   GFP_ATOMIC);
>>>> +   if (unlikely(r != 0)) {
>>>> +   if (r == -ENOSPC &&
>>>
>>> Drivers usually maintain a #free_slots, this can help to avoid the
>>> trick for checking ENOSPC?
>>
>> The above "(vq->started - vq->completed) < batch" seems to ensure that
>> the 'r' can't be '-ENOSPC'?
> 
> Well, if this is true any reason we still check ENOSPEC here?

As mentioned above, it was copied & pasted from virtio_test.c.
Will remove the 'r == -ENOSPC' checking.
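For reference, the invariant that makes the check redundant looks
roughly like this (sketch only, names as in the test):

        /* batch is clamped to the ring size, so at most vq->vring.num
         * buffers are ever outstanding and virtqueue_add_inbuf() cannot
         * return -ENOSPC; an assert documents that assumption instead */
        assert(batch <= vq->vring.num);
        while (vq->started < bufs &&
               (vq->started - vq->completed) < batch) {
                sg_init_one(&sl, dev->res_buf, HDR_LEN + TEST_BUF_LEN);
                r = virtqueue_add_inbuf(vq->vq, &sl, 1,
                                        dev->res_buf + vq->started,
                                        GFP_ATOMIC);
                assert(r == 0);
                ++vq->started;
        }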

> 
>> We just need to ensure the batch <= desc_num,
>> and the 'r == -ENOSPC' checking seems to be unnecessary.
>>
>>>
>>>> + 

Re: [PATCH net-next v4 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-02-02 Thread Yunsheng Lin
On 2024/2/2 16:36, Paolo Abeni wrote:
> On Fri, 2024-02-02 at 10:10 +0800, Yunsheng Lin wrote:
>> On 2024/2/1 21:16, Paolo Abeni wrote:
>>
>>> from the __page_frag_cache_refill() allocator - which never accesses
>>> the memory reserves.
>>
>> I am not really sure I understand the above commemt.
>> The semantic is the same as skb_page_frag_refill() as explained above
>> as my understanding. Note that __page_frag_cache_refill() use 'gfp_mask'
>> for allocating order 3 pages and use the original 'gfp' for allocating
>> order 0 pages.
> 
> You are right! I got fooled misreading 'gfp' as 'gfp_mask' in there.
> 
>>> I'm unsure we want to propagate the __page_frag_cache_refill behavior
>>> here, the current behavior could be required by some systems.
>>>
>>> It looks like this series still leave the skb_page_frag_refill()
>>> allocator alone, what about dropping this chunk, too? 
>>
>> As explained above, I would prefer to keep it as it is as it seems
>> to be quite obvious that we can avoid possible pressure for mm by
>> not using memory reserve for order 3 pages as we have the fallback
>> for order 0 pages.
>>
>> Please let me know if there is anything obvious I missed.
>>
> 
> I still think/fear that behaviours changes here could have
> subtle/negative side effects - even if I agree the change looks safe.
> 
> I think the series without this patch would still achieve its goals and
> would be much more uncontroversial. What about move this patch as a
> standalone follow-up?

Fair enough, will remove that for now.

> 
> Thanks!
> 
> Paolo
> 
> .
> 



Re: [PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test

2024-02-02 Thread Yunsheng Lin
On 2024/2/2 12:05, Jason Wang wrote:
> On Tue, Jan 30, 2024 at 7:38 PM Yunsheng Lin  wrote:
>>
>> introduce vhost_net_test basing on virtio_test to test
>> vhost_net changing in the kernel.
> 
> Let's describe what kind of test is being done and how it is done here.

How about something like below:

This patch introduces testing for both vhost_net tx and rx.
Steps for vhost_net tx testing:
1. Prepare an out buf
2. Kick the vhost_net to do tx processing
3. Do the receiving in the tun side
4. Verify that the data received by tun is correct

Steps for vhost_net rx testing:
1. Prepare an in buf
2. Do the sending in the tun side
3. Kick the vhost_net to do rx processing
4. Verify that the data received by vhost_net is correct


>> +
>> +static int tun_alloc(struct vdev_info *dev)
>> +{
>> +   struct ifreq ifr;
>> +   int len = HDR_LEN;
> 
> Any reason you can't just use the virtio_net uapi?

I didn't find a macro for that in include/uapi/linux/virtio_net.h.

Did you mean using something like below?
sizeof(struct virtio_net_hdr_mrg_rxbuf)
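For example (assuming the mergeable-rx-buffer header, which is the
12-byte layout the test currently hard-codes):

#include <linux/virtio_net.h>

/* keep the length passed to TUNSETVNETHDRSZ in sync with the uapi struct
 * instead of hard-coding 12 */
#define HDR_LEN         sizeof(struct virtio_net_hdr_mrg_rxbuf)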

> 
>> +   int fd, e;
>> +
>> +   fd = open("/dev/net/tun", O_RDWR);
>> +   if (fd < 0) {
>> +   perror("Cannot open /dev/net/tun");
>> +   return fd;
>> +   }
>> +
>> +   memset(, 0, sizeof(ifr));
>> +
>> +   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
>> +   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
>> +
>> +   e = ioctl(fd, TUNSETIFF, );
>> +   if (e < 0) {
>> +   perror("ioctl[TUNSETIFF]");
>> +   close(fd);
>> +   return e;
>> +   }
>> +
>> +   e = ioctl(fd, TUNSETVNETHDRSZ, );
>> +   if (e < 0) {
>> +   perror("ioctl[TUNSETVNETHDRSZ]");
>> +   close(fd);
>> +   return e;
>> +   }
>> +
>> +   e = ioctl(fd, SIOCGIFHWADDR, );
>> +   if (e < 0) {
>> +   perror("ioctl[SIOCGIFHWADDR]");
>> +   close(fd);
>> +   return e;
>> +   }
>> +
>> +   memcpy(dev->mac, _hwaddr.sa_data, ETHER_ADDR_LEN);
>> +   return fd;
>> +}
>> +
>> +static void vdev_create_socket(struct vdev_info *dev)
>> +{
>> +   struct ifreq ifr;
>> +
>> +   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
>> +   assert(dev->sock != -1);
>> +
>> +   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
> 
> Nit: it might be better to accept the device name instead of repeating
> the snprintf trick here, this would facilitate the future changes.

I am not sure I understand what you meant by "accept the device name"
here.

The above is used to get the ifindex of the tun netdevice created in
tun_alloc(), so that we can use it in vdev_send_packet() to send a
packet through that netdevice. Is there anything obvious I missed here?

> 
>> +   assert(ioctl(dev->sock, SIOCGIFINDEX, ) >= 0);
>> +
>> +   dev->ifindex = ifr.ifr_ifindex;
>> +
>> +   /* Set the flags that bring the device up */
>> +   assert(ioctl(dev->sock, SIOCGIFFLAGS, ) >= 0);
>> +   ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
>> +   assert(ioctl(dev->sock, SIOCSIFFLAGS, ) >= 0);
>> +}
>> +
>> +static void vdev_send_packet(struct vdev_info *dev)
>> +{
>> +   char *sendbuf = dev->test_buf + HDR_LEN;
>> +   struct sockaddr_ll saddrll = {0};
>> +   int sockfd = dev->sock;
>> +   int ret;
>> +
>> +   saddrll.sll_family = PF_PACKET;
>> +   saddrll.sll_ifindex = dev->ifindex;
>> +   saddrll.sll_halen = ETH_ALEN;
>> +   saddrll.sll_protocol = htons(TEST_PTYPE);
>> +
>> +   ret = sendto(sockfd, sendbuf, TEST_BUF_LEN, 0,
>> +(struct sockaddr *),
>> +sizeof(struct sockaddr_ll));
>> +   assert(ret >= 0);
>> +}
>> +

...

>> +
>> +static void vq_info_add(struct vdev_info *dev, int idx, int num, int fd)
>> +{
>> +   struct vhost_vring_file backend = { .index = idx, .fd = fd };
>> +   struct vq_info *info = >vqs[idx];
>> +   int r;
>> +
>> +   info->idx = idx;
>> +   info->kick = eventfd(0, EFD_NONBLOCK);
>> +   info->call = eventfd(0, EFD_NONBLOCK);
> 
> If we don't care about the callback, let's just avoid to set the call here?
> 
> (As I see vq_cal

Re: [PATCH net-next v4 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-02-01 Thread Yunsheng Lin
On 2024/2/1 21:16, Paolo Abeni wrote:
> On Tue, 2024-01-30 at 19:37 +0800, Yunsheng Lin wrote:
>> Currently there seems to be three page frag implementions
>> which all try to allocate order 3 page, if that fails, it
>> then fail back to allocate order 0 page, and each of them
>> all allow order 3 page allocation to fail under certain
>> condition by using specific gfp bits.
>>
>> The gfp bits for order 3 page allocation are different
>> between different implementation, __GFP_NOMEMALLOC is
>> or'd to forbid access to emergency reserves memory for
>> __page_frag_cache_refill(), but it is not or'd in other
>> implementions, __GFP_DIRECT_RECLAIM is masked off to avoid
>> direct reclaim in skb_page_frag_refill(), but it is not
>> masked off in __page_frag_cache_refill().
>>
>> This patch unifies the gfp bits used between different
>> implementions by or'ing __GFP_NOMEMALLOC and masking off
>> __GFP_DIRECT_RECLAIM for order 3 page allocation to avoid
>> possible pressure for mm.
>>
>> Signed-off-by: Yunsheng Lin 
>> Reviewed-by: Alexander Duyck 
>> CC: Alexander Duyck 
>> ---
>>  drivers/vhost/net.c | 2 +-
>>  mm/page_alloc.c | 4 ++--
>>  net/core/sock.c | 2 +-
>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index f2ed7167c848..e574e21cc0ca 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
>> *net, unsigned int sz,
>>  /* Avoid direct reclaim but allow kswapd to wake */
>>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
>>__GFP_COMP | __GFP_NOWARN |
>> -  __GFP_NORETRY,
>> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
>>SKB_FRAG_PAGE_ORDER);
> 
>>  if (likely(pfrag->page)) {
>>  pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index c0f7e67c4250..636145c29f70 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
>> page_frag_cache *nc,
>>  gfp_t gfp = gfp_mask;
>>  
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
>> -__GFP_NOMEMALLOC;
>> +gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>> +   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>  page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>>  PAGE_FRAG_CACHE_MAX_ORDER);
>>  nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 88bf810394a5..8289a3d8c375 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -2919,7 +2919,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
>> page_frag *pfrag, gfp_t gfp)
>>  /* Avoid direct reclaim but allow kswapd to wake */
>>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
>>__GFP_COMP | __GFP_NOWARN |
>> -  __GFP_NORETRY,
>> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
>>SKB_FRAG_PAGE_ORDER);
> 
> This will prevent memory reserve usage when allocating order 3 pages,
> but not when allocating a single page as a fallback. Still different

More accurately, the above ensures the memory reserve is never used
for order 3 pages; whether the memory reserve is used for order 0 pages
depends on the original 'gfp' flags: if 'gfp' does not have the
__GFP_NOMEMALLOC bit set, the memory reserve may still be used for
order 0 pages.

> from the __page_frag_cache_refill() allocator - which never accesses
> the memory reserves.

I am not really sure I understand the above comment.
The semantics are the same as skb_page_frag_refill(), as explained above,
as I understand it. Note that __page_frag_cache_refill() uses 'gfp_mask'
for allocating order 3 pages and uses the original 'gfp' for allocating
order 0 pages.
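Spelled out as a sketch (illustrative only, mirroring the hunk above
rather than adding anything new):

        gfp_t gfp = gfp_mask;   /* original flags, reused for order 0 below */

        /* order 3 attempt: never touch the emergency reserves and never
         * direct-reclaim, regardless of what the caller passed in */
        gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
                   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;

        /* the order 0 fallback still uses 'gfp', so it may dip into the
         * reserves unless the caller itself set __GFP_NOMEMALLOC */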

> 
> I'm unsure we want to propagate the __page_frag_cache_refill behavior
> here, the current behavior could be required by some systems.
> 
> It looks like this series still leave the skb_page_frag_refill()
> allocator alone, what about dropping this chunk, too? 

As explained above, I would prefer to keep it as it is, since it seems
quite obvious that we can avoid possible pressure on mm by not using
the memory reserve for order 3 pages, given that we have the fallback
to order 0 pages.

Please let me know if there is anything obvious I missed.

> 
> Thanks!
> 
> Paolo
> 
> 
> .
> 



[PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test

2024-01-30 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/.gitignore   |   1 +
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 576 ++
 3 files changed, 582 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..e336792a0d77
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,576 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH   -1
+#define HDR_LEN        12
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perror("ioctl[SIOCGIFHWADDR]");
+   close(fd);
+   return e;
+   }
+
+   memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+   return fd;
+}
+
+static void vdev_create_socket(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+
+   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
+   assert(dev->sock != -1);
+
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+   assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0);
+
+   dev->ifindex = ifr.ifr_ifindex;
+
+   /* Set the flags that bring the device up */
+   assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0);
+   ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+   assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0);
+}
+
+static void vdev_send_packet(struct vdev_info *dev)
+{
+   char *sendbuf = dev->test_buf + HDR_LEN;
+   struct sockaddr_ll saddrll = {0};
+   int sockfd = dev->sock;
+   int ret;
+
+   saddrll.sll_family = PF_PACKET;
+

[PATCH net-next v4 4/5] vhost/net: remove vhost_net_page_frag_refill()

2024-01-30 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also brings about a 0.5%
performance boost in testing with the vhost_net_test
introduced in the last patch.

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 91 ++---
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = >page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = >xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(>pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = >gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
   vho

[PATCH net-next v4 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-01-30 Thread Yunsheng Lin
Currently there seem to be three page frag implementations
which all try to allocate an order 3 page; if that fails,
they fall back to allocating an order 0 page, and each of
them allows the order 3 page allocation to fail under
certain conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd to forbid access to
the emergency memory reserves in __page_frag_cache_refill(),
but it is not or'd in the other implementations, and
__GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in
skb_page_frag_refill(), but it is not masked off in
__page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
possible pressure on mm.

Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index 88bf810394a5..8289a3d8c375 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2919,7 +2919,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




[PATCH net-next v3 5/5] tools: virtio: introduce vhost_net_test

2024-01-23 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/.gitignore   |   1 +
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 576 ++
 3 files changed, 582 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore
index 9934d48d9a55..7e47b281c442 100644
--- a/tools/virtio/.gitignore
+++ b/tools/virtio/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 *.d
 virtio_test
+vhost_net_test
 vringh_test
 virtio-trace/trace-agent
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..e336792a0d77
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,576 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH   -1
+#define HDR_LEN        12
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(&ifr, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+
+   e = ioctl(fd, TUNSETIFF, &ifr);
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, &len);
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, &ifr);
+   if (e < 0) {
+   perror("ioctl[SIOCGIFHWADDR]");
+   close(fd);
+   return e;
+   }
+
+   memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+   return fd;
+}
+
+static void vdev_create_socket(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+
+   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
+   assert(dev->sock != -1);
+
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+   assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0);
+
+   dev->ifindex = ifr.ifr_ifindex;
+
+   /* Set the flags that bring the device up */
+   assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0);
+   ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+   assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0);
+}
+
+static void vdev_send_packet(struct vdev_info *dev)
+{
+   char *sendbuf = dev->test_buf + HDR_LEN;
+   struct sockaddr_ll saddrll = {0};
+   int sockfd = dev->sock;
+   int ret;
+
+   saddrll.sll_family = PF_PACKET;
+

[PATCH net-next v3 4/5] vhost/net: remove vhost_net_page_frag_refill()

2024-01-23 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating frags using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also brings about a 0.5%
performance boost in testing with the vhost_net_test
introduced in the last patch.

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 91 ++---
 1 file changed, 27 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..4b2fcb228a0a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = >page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = >xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(>pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = >gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
   vho

[PATCH net-next v3 2/5] page_frag: unify gfp bits for order 3 page allocation

2024-01-23 Thread Yunsheng Lin
Currently there seem to be three page frag implementations
which all try to allocate an order 3 page; if that fails,
they fall back to allocating an order 0 page, and each of
them allows the order 3 page allocation to fail under
certain conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd to forbid access to
the emergency memory reserves in __page_frag_cache_refill(),
but it is not or'd in the other implementations, and
__GFP_DIRECT_RECLAIM is masked off to avoid direct reclaim in
skb_page_frag_refill(), but it is not masked off in
__page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for order 3 page allocation, to avoid
possible pressure on mm.

Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0f7e67c4250..636145c29f70 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4685,8 +4685,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index 158dbdebce6a..d4bc4269d7d7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2908,7 +2908,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




Re: [PATCH net-next 4/6] vhost/net: remove vhost_net_page_frag_refill()

2024-01-08 Thread Yunsheng Lin
On 2024/1/6 0:06, Alexander H Duyck wrote:
>>  
>>  static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>> @@ -1353,8 +1318,7 @@ static int vhost_net_open(struct inode *inode, struct 
>> file *f)
>>  vqs[VHOST_NET_VQ_RX]);
>>  
>>  f->private_data = n;
>> -n->page_frag.page = NULL;
>> -n->refcnt_bias = 0;
>> +n->pf_cache.va = NULL;
>>  
>>  return 0;
>>  }
>> @@ -1422,8 +1386,9 @@ static int vhost_net_release(struct inode *inode, 
>> struct file *f)
>>  kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
>>  kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
>>  kfree(n->dev.vqs);
>> -if (n->page_frag.page)
>> -__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
>> +if (n->pf_cache.va)
>> +__page_frag_cache_drain(virt_to_head_page(n->pf_cache.va),
>> +n->pf_cache.pagecnt_bias);
>>  kvfree(n);
>>  return 0;
>>  }
> 
> I would recommend reordering this patch with patch 5. Then you could
> remove the block that is setting "n->pf_cache.va = NULL" above and just
> make use of page_frag_cache_drain in the lower block which would also
> return the va to NULL.

I am not sure we can, as there is no zeroing of 'struct vhost_net' in
vhost_net_open().

If we don't have "n->pf_cache.va = NULL", don't we use uninitialized data
when calling page_frag_alloc_align() for the first time?
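A sketch of the concern (the non-zeroing allocation detail is from memory
of vhost_net_open(), so treat it as an assumption):

        /* 'n' is not allocated with a zeroing allocator, so any field that
         * page_frag_alloc_align() later tests must be initialized by hand */
        n->pf_cache.va = NULL;  /* first allocation then takes the refill
                                 * path instead of dereferencing garbage */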

> .
> 



Re: [PATCH net-next 2/6] page_frag: unify gfp bits for order 3 page allocation

2024-01-08 Thread Yunsheng Lin
On 2024/1/5 23:35, Alexander H Duyck wrote:
> On Wed, 2024-01-03 at 17:56 +0800, Yunsheng Lin wrote:
>> Currently there seem to be three page frag implementations
>> which all try to allocate an order 3 page; if that fails, they
>> then fall back to allocating an order 0 page, and each of them
>> allows the order 3 page allocation to fail under certain
>> conditions by using specific gfp bits.
>>
>> The gfp bits for order 3 page allocation differ between the
>> implementations: __GFP_NOMEMALLOC is or'd to forbid access to
>> emergency reserve memory for __page_frag_cache_refill(), but it
>> is not or'd in the other implementations; __GFP_DIRECT_RECLAIM
>> is masked off to avoid direct reclaim in skb_page_frag_refill(),
>> but it is not masked off in __page_frag_cache_refill().
>>
>> This patch unifies the gfp bits used between the different
>> implementations by or'ing __GFP_NOMEMALLOC and masking off
>> __GFP_DIRECT_RECLAIM for the order 3 page allocation to avoid
>> possible pressure on mm.
>>
>> Signed-off-by: Yunsheng Lin 
>> CC: Alexander Duyck 
>> ---
>>  drivers/vhost/net.c | 2 +-
>>  mm/page_alloc.c | 4 ++--
>>  net/core/sock.c | 2 +-
>>  3 files changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index f2ed7167c848..e574e21cc0ca 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
>> *net, unsigned int sz,
>>  /* Avoid direct reclaim but allow kswapd to wake */
>>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
>>__GFP_COMP | __GFP_NOWARN |
>> -  __GFP_NORETRY,
>> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
>>SKB_FRAG_PAGE_ORDER);
>>  if (likely(pfrag->page)) {
>>  pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 9a16305cf985..1f0b36dd81b5 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
>> page_frag_cache *nc,
>>  gfp_t gfp = gfp_mask;
>>  
>>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> -gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
>> -__GFP_NOMEMALLOC;
>> +gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>> +   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>>  page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>>  PAGE_FRAG_CACHE_MAX_ORDER);
>>  nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 446e945f736b..d643332c3ee5 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -2900,7 +2900,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
>> page_frag *pfrag, gfp_t gfp)
>>  /* Avoid direct reclaim but allow kswapd to wake */
>>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
>>__GFP_COMP | __GFP_NOWARN |
>> -  __GFP_NORETRY,
>> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
>>SKB_FRAG_PAGE_ORDER);
>>  if (likely(pfrag->page)) {
>>  pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
> 
> Looks fine to me.
> 
> One thing you may want to consider would be to place this all in an
> inline function that could just consolidate all the code.

Do you think it is possible to further unify the implementations of
'struct page_frag_cache' and 'struct page_frag', so that adding an inline
function for the above becomes unnecessary?
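
For reference, the kind of inline helper suggested above could look roughly
like the below; this is only a sketch that consolidates the gfp handling
from the three call sites touched by this patch, not code from any posted
series:

        /* Hypothetical helper: order-3 page frag allocation that avoids
         * direct reclaim, skips the emergency reserves and is allowed to
         * fail quietly, with the caller falling back to an order-0 page.
         */
        static inline struct page *page_frag_alloc_order3_page(gfp_t gfp)
        {
                return alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
                                   __GFP_COMP | __GFP_NOWARN |
                                   __GFP_NORETRY | __GFP_NOMEMALLOC,
                                   SKB_FRAG_PAGE_ORDER);
        }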

> 
> Reviewed-by: Alexander Duyck 
> 
> .
> 



Re: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2024-01-04 Thread Yunsheng Lin
On 2024/1/5 0:17, Eugenio Perez Martin wrote:
> On Wed, Jan 3, 2024 at 11:00 AM Yunsheng Lin  wrote:

...

>> +
>> +static void run_tx_test(struct vdev_info *dev, struct vq_info *vq,
>> +   bool delayed, int batch, int bufs)
>> +{
>> +   const bool random_batch = batch == RANDOM_BATCH;
>> +   long long spurious = 0;
>> +   struct scatterlist sl;
>> +   unsigned int len;
>> +   int r;
>> +
>> +   for (;;) {
>> +   long started_before = vq->started;
>> +   long completed_before = vq->completed;
>> +
>> +   virtqueue_disable_cb(vq->vq);
>> +   do {
>> +   if (random_batch)
>> +   batch = (random() % vq->vring.num) + 1;
>> +
>> +   while (vq->started < bufs &&
>> +  (vq->started - vq->completed) < batch) {
>> +   sg_init_one(, dev->test_buf, HDR_LEN + 
>> TEST_BUF_LEN);
>> +   r = virtqueue_add_outbuf(vq->vq, , 1,
>> +dev->test_buf + 
>> vq->started,
>> +GFP_ATOMIC);
>> +   if (unlikely(r != 0)) {
>> +   if (r == -ENOSPC &&
>> +   vq->started > started_before)
>> +   r = 0;
>> +   else
>> +   r = -1;
>> +   break;
>> +   }
>> +
>> +   ++vq->started;
>> +
>> +   if (unlikely(!virtqueue_kick(vq->vq))) {
>> +   r = -1;
>> +   break;
>> +   }
>> +   }
>> +
>> +   if (vq->started >= bufs)
>> +   r = -1;
>> +
>> +   /* Flush out completed bufs if any */
>> +   while (virtqueue_get_buf(vq->vq, )) {
>> +   int n, i;
>> +
>> +   n = recvfrom(dev->sock, dev->res_buf, 
>> TEST_BUF_LEN, 0, NULL, NULL);
>> +   assert(n == TEST_BUF_LEN);
>> +
>> +   for (i = ETHER_HDR_LEN; i < n; i++)
>> +   assert(dev->res_buf[i] == (char)i);
>> +
>> +   ++vq->completed;
>> +   r = 0;
>> +   }
>> +   } while (r == 0);
>> +
>> +   if (vq->completed == completed_before && vq->started == 
>> started_before)
>> +   ++spurious;
>> +
>> +   assert(vq->completed <= bufs);
>> +   assert(vq->started <= bufs);
>> +   if (vq->completed == bufs)
>> +   break;
>> +
>> +   if (delayed) {
>> +   if (virtqueue_enable_cb_delayed(vq->vq))
>> +   wait_for_interrupt(vq);
>> +   } else {
>> +   if (virtqueue_enable_cb(vq->vq))
>> +   wait_for_interrupt(vq);
>> +   }
>> +   }
>> +   printf("TX spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n",
>> +  spurious, vq->started, vq->completed);
>> +}
>> +
>> +static void run_rx_test(struct vdev_info *dev, struct vq_info *vq,
>> +   bool delayed, int batch, int bufs)
>> +{
>> +   const bool random_batch = batch == RANDOM_BATCH;
>> +   long long spurious = 0;
>> +   struct scatterlist sl;
>> +   unsigned int len;
>> +   int r;
>> +
>> +   for (;;) {
>> +   long started_before = vq->started;
>> +   long completed_before = vq->completed;
>> +
>> +   do {
>> +   if (random_batch)
>> +   batch = (random() % vq->vring.num) + 1;
>> +
>>

[PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2024-01-03 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 574 ++
 2 files changed, 579 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..cfffcef53d94
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,574 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH   -1
+#define HDR_LEN12
+#define TEST_BUF_LEN   256
+#define TEST_PTYPE ETH_P_LOOPBACK
+
+/* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int idx;
+   long started;
+   long completed;
+   struct pollfd fds;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct vq_info vqs[2];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   char *test_buf;
+   char *res_buf;
+   struct vhost_memory *mem;
+   int sock;
+   int ifindex;
+   unsigned char mac[ETHER_ADDR_LEN];
+};
+
+static int tun_alloc(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+   int len = HDR_LEN;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+
+   e = ioctl(fd, TUNSETIFF, );
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, TUNSETVNETHDRSZ, );
+   if (e < 0) {
+   perror("ioctl[TUNSETVNETHDRSZ]");
+   close(fd);
+   return e;
+   }
+
+   e = ioctl(fd, SIOCGIFHWADDR, );
+   if (e < 0) {
+   perror("ioctl[SIOCGIFHWADDR]");
+   close(fd);
+   return e;
+   }
+
+   memcpy(dev->mac, _hwaddr.sa_data, ETHER_ADDR_LEN);
+   return fd;
+}
+
+static void vdev_create_socket(struct vdev_info *dev)
+{
+   struct ifreq ifr;
+
+   dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE));
+   assert(dev->sock != -1);
+
+   snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());
+   assert(ioctl(dev->sock, SIOCGIFINDEX, ) >= 0);
+
+   dev->ifindex = ifr.ifr_ifindex;
+
+   /* Set the flags that bring the device up */
+   assert(ioctl(dev->sock, SIOCGIFFLAGS, ) >= 0);
+   ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+   assert(ioctl(dev->sock, SIOCSIFFLAGS, ) >= 0);
+}
+
+static void vdev_send_packet(struct vdev_info *dev)
+{
+   char *sendbuf = dev->test_buf + HDR_LEN;
+   struct sockaddr_ll saddrll = {0};
+   int sockfd = dev->sock;
+   int ret;
+
+   memset(, 0, sizeof(saddrll));
+   saddrll.sll_family = PF_PACKET;
+   saddrll.sll_ifindex = dev->ifindex;
+   saddrll.sll_halen = ETH_ALEN;
+   saddrll.sll_protocol = htons(TEST_PTYPE);
+
+   ret = sendto(sockfd, sendbuf, TEST_BUF_LEN, 0,
+(struct sockaddr *),
+sizeof(struct sockaddr_ll));
+   as

[PATCH net-next 5/6] net: introduce page_frag_cache_drain()

2024-01-03 Thread Yunsheng Lin
When draining a page_frag_cache, most users are doing
similar steps, so introduce an API to avoid code
duplication.

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---
 drivers/nvme/host/tcp.c|  7 +--
 drivers/nvme/target/tcp.c  |  4 +---
 drivers/vhost/net.c|  4 +---
 include/linux/gfp.h|  2 ++
 mm/page_alloc.c| 10 ++
 7 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c 
b/drivers/net/ethernet/google/gve/gve_main.c
index 619bf63ec935..d976190b0f4d 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-   struct page_frag_cache *nc;
int i;
 
-   for (i = 0; i < priv->rx_cfg.num_queues; i++) {
-   nc = >rx[i].page_cache;
-   if (nc->va) {
-   __page_frag_cache_drain(virt_to_page(nc->va),
-   nc->pagecnt_bias);
-   nc->va = NULL;
-   }
-   }
+   for (i = 0; i < priv->rx_cfg.num_queues; i++)
+   page_frag_cache_drain(>rx[i].page_cache);
 }
 
 static int gve_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c 
b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index d58b07e7e123..7063c78bd35f 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
int i;
 
for (i = 0; i < q->n_desc; i++) {
@@ -301,19 +300,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
entry->buf = NULL;
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(>cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(>cache);
 }
 
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
-
for (;;) {
void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true);
 
@@ -323,12 +315,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
skb_free_frag(buf);
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(>cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(>cache);
 }
 
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 08805f027810..c80037a78066 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1344,7 +1344,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl 
*ctrl)
 
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-   struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = >queues[qid];
unsigned int noreclaim_flag;
@@ -1355,11 +1354,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, 
int qid)
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
 
-   if (queue->pf_cache.va) {
-   page = virt_to_head_page(queue->pf_cache.va);
-   __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-   queue->pf_cache.va = NULL;
-   }
+   page_frag_cache_drain(>pf_cache);
 
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 4cc27856aa8f..11237557cfc5 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1576,7 +1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct 
nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-   struct page *page;
struct nvmet_tcp_queue *queue =
container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct 
work_struct *w)
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
ida_free(_tcp_queue_ida, queue->idx);
-   p
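
For reference, the page_frag_cache_drain() helper added by this patch
(declared in include/linux/gfp.h and implemented in mm/page_alloc.c per the
diffstat) presumably follows the same pattern being removed from the call
sites above; a sketch under that assumption:

        void page_frag_cache_drain(struct page_frag_cache *nc)
        {
                if (!nc->va)
                        return;

                __page_frag_cache_drain(virt_to_head_page(nc->va),
                                        nc->pagecnt_bias);
                nc->va = NULL;
        }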

[PATCH net-next 4/6] vhost/net: remove vhost_net_page_frag_refill()

2024-01-03 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is similar to page_frag_alloc_align() now.

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating the frag using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when testing with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 93 ++---
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..805e11d598e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = >page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = >xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(>pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = >gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
   vho

[PATCH net-next 2/6] page_frag: unify gfp bits for order 3 page allocation

2024-01-03 Thread Yunsheng Lin
Currently there seem to be three page frag implementations
which all try to allocate an order 3 page; if that fails, they
then fall back to allocating an order 0 page, and each of them
allows the order 3 page allocation to fail under certain
conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd to forbid access to
emergency reserve memory for __page_frag_cache_refill(), but it
is not or'd in the other implementations; __GFP_DIRECT_RECLAIM
is masked off to avoid direct reclaim in skb_page_frag_refill(),
but it is not masked off in __page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for the order 3 page allocation to avoid
possible pressure on mm.

Signed-off-by: Yunsheng Lin 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a16305cf985..1f0b36dd81b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index 446e945f736b..d643332c3ee5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2900,7 +2900,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




Re: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2023-12-20 Thread Yunsheng Lin
On 2023/12/21 10:33, Jason Wang wrote:
> On Wed, Dec 20, 2023 at 8:45 PM Yunsheng Lin  wrote:
>>
>> On 2023/12/12 12:35, Jason Wang wrote:
>>>> +done:
>>>>>> +   backend.fd = tun_alloc();
>>>>>> +   assert(backend.fd >= 0);
>>>>>> +   vdev_info_init(, features);
>>>>>> +   vq_info_add(, 256);
>>>>>> +   run_test(, [0], delayed, batch, reset, nbufs);
>>>>>
>>>>> I'd expect we are testing some basic traffic here. E.g can we use a
>>>>> packet socket then we can test both tx and rx?
>>>>
>>>> Yes, only rx for tun is tested.
>>>> Do you have an idea how to test the tx too? As I am not familar enough
>>>> with vhost_net and tun yet.
>>>
>>> Maybe you can have a packet socket to bind to the tun/tap. Then you can 
>>> test:
>>>
>>> 1) TAP RX: by write a packet via virtqueue through vhost_net and read
>>> it from packet socket
>>> 2) TAP TX:  by write via packet socket and read it from the virtqueue
>>> through vhost_net
>>
>> When implementing the TAP TX by adding VHOST_NET_F_VIRTIO_NET_HDR,
>> I found one possible use of uninitialized data in vhost_net_build_xdp().
>>
>> And vhost_hlen is set to sizeof(struct virtio_net_hdr_mrg_rxbuf) and
>> sock_hlen is set to zero in vhost_net_set_features() for both tx and rx
>> queue.
>>
>> For vhost_net_build_xdp() called by handle_tx_copy():
>>
>> The (gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) checking below may cause a
>> read of uninitialized data if sock_hlen is zero.
> 
> Which data is uninitialized here?

The 'gso' header: as sock_hlen is zero, there is no copying into it by:

 copied = copy_page_from_iter(alloc_frag->page,
  alloc_frag->offset +
  offsetof(struct tun_xdp_hdr, gso),
  sock_hlen, from);

> 
>>
>> And it seems vhost_hdr is skipped in get_tx_bufs():
>> https://elixir.bootlin.com/linux/latest/source/drivers/vhost/net.c#L616
>>
>> static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
>>struct iov_iter *from)
>> {
>> ...
>> buflen += SKB_DATA_ALIGN(len + pad);
>> alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
>> if (unlikely(!vhost_net_page_frag_refill(net, buflen,
>>  alloc_frag, GFP_KERNEL)))
>> return -ENOMEM;
>>
>> buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>> copied = copy_page_from_iter(alloc_frag->page,
>>  alloc_frag->offset +
>>  offsetof(struct tun_xdp_hdr, gso),
>>  sock_hlen, from);
>> if (copied != sock_hlen)
>> return -EFAULT;
>>
>> hdr = buf;
>> gso = >gso;
>>
>> if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
>> vhost16_to_cpu(vq, gso->csum_start) +
>> vhost16_to_cpu(vq, gso->csum_offset) + 2 >
>> vhost16_to_cpu(vq, gso->hdr_len)) {
>> ...
>> }
>>
>> It seems handle_tx_copy() does not handle the VHOST_NET_F_VIRTIO_NET_HDR
>> case correctly, or do I miss something obvious here?
> 
> In get_tx_bufs() we did:
> 
> *len = init_iov_iter(vq, >msg_iter, nvq->vhost_hlen, *out);
> 
> Which covers this case?

It does not seem to cover it, as the vhost_hdr is just skipped without any
handling in get_tx_bufs():
https://elixir.bootlin.com/linux/v6.7-rc6/source/drivers/vhost/net.c#L616

> 
> Thanks



Re: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2023-12-20 Thread Yunsheng Lin
On 2023/12/12 12:35, Jason Wang wrote:
 +done:
 +   backend.fd = tun_alloc();
 +   assert(backend.fd >= 0);
 +   vdev_info_init(, features);
 +   vq_info_add(, 256);
 +   run_test(, [0], delayed, batch, reset, nbufs);
>>>
>>> I'd expect we are testing some basic traffic here. E.g can we use a
>>> packet socket then we can test both tx and rx?
>>
>> Yes, only rx for tun is tested.
>> Do you have an idea how to test the tx too? As I am not familar enough
>> with vhost_net and tun yet.
> 
> Maybe you can have a packet socket to bind to the tun/tap. Then you can test:
> 
> 1) TAP RX: by write a packet via virtqueue through vhost_net and read
> it from packet socket
> 2) TAP TX:  by write via packet socket and read it from the virtqueue
> through vhost_net

When implementing the TAP TX by adding VHOST_NET_F_VIRTIO_NET_HDR,
I found one possible use of uninitialized data in vhost_net_build_xdp().

And vhost_hlen is set to sizeof(struct virtio_net_hdr_mrg_rxbuf) and
sock_hlen is set to zero in vhost_net_set_features() for both the tx and rx
queues.

For vhost_net_build_xdp() called by handle_tx_copy():

The (gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) check below may cause a
read of uninitialized data if sock_hlen is zero.

And it seems vhost_hdr is skipped in get_tx_bufs():
https://elixir.bootlin.com/linux/latest/source/drivers/vhost/net.c#L616

static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
   struct iov_iter *from)
{
...
buflen += SKB_DATA_ALIGN(len + pad);
alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
if (unlikely(!vhost_net_page_frag_refill(net, buflen,
 alloc_frag, GFP_KERNEL)))
return -ENOMEM;

buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
copied = copy_page_from_iter(alloc_frag->page,
 alloc_frag->offset +
 offsetof(struct tun_xdp_hdr, gso),
 sock_hlen, from);
if (copied != sock_hlen)
return -EFAULT;

hdr = buf;
gso = >gso;

if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
vhost16_to_cpu(vq, gso->csum_start) +
vhost16_to_cpu(vq, gso->csum_offset) + 2 >
vhost16_to_cpu(vq, gso->hdr_len)) {
...
}

It seems handle_tx_copy() does not handle the VHOST_NET_F_VIRTIO_NET_HDR
case correctly, or do I miss something obvious here?
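
To make the concern concrete, a hypothetical guard like the below would
avoid the uninitialized read when sock_hlen is zero; this is only an
illustration of the problem, not a fix proposed in this thread:

        hdr = buf;
        gso = &hdr->gso;

        /* Hypothetical: when the socket supplies no virtio header
         * (sock_hlen == 0), nothing was copied into 'gso' above, so
         * zero it instead of testing uninitialized frag memory.
         */
        if (!sock_hlen)
                memset(gso, 0, sizeof(*gso));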

> 
> Thanks
> 
>>
>>>
>>> Thanks
>>
> 
> .
> 



Re: [PATCH net-next v2 3/3] net: add netmem_t to skb_frag_t

2023-12-18 Thread Yunsheng Lin
On 2023/12/17 16:09, Mina Almasry wrote:
> Use netmem_t instead of page directly in skb_frag_t. Currently netmem_t
> is always a struct page underneath, but the abstraction allows efforts
> to add support for skb frags not backed by pages.
> 
> There is unfortunately 1 instance where the skb_frag_t is assumed to be
> a bio_vec in kcm. For this case, add a debug assert that the skb frag is
> indeed backed by a page, and do a cast.
> 
> Add skb[_frag]_fill_netmem_*() and skb_add_rx_frag_netmem() helpers so
> that the API can be used to create netmem skbs.
> 
> Signed-off-by: Mina Almasry 
> 

...

>  
> -typedef struct bio_vec skb_frag_t;
> +typedef struct skb_frag {
> + struct netmem *bv_page;

bv_page -> bv_netmem?

> + unsigned int bv_len;
> + unsigned int bv_offset;
> +} skb_frag_t;
>  
>  /**
>   * skb_frag_size() - Returns the size of a skb fragment
> @@ -2431,22 +2436,37 @@ static inline unsigned int skb_pagelen(const struct 
> sk_buff *skb)
>   return skb_headlen(skb) + __skb_pagelen(skb);
>  }
>  

...

>  /**
> @@ -2462,10 +2482,10 @@ static inline void skb_len_add(struct sk_buff *skb, 
> int delta)
>  }
>  
>  /**
> - * __skb_fill_page_desc - initialise a paged fragment in an skb
> + * __skb_fill_netmem_desc - initialise a paged fragment in an skb
>   * @skb: buffer containing fragment to be initialised
>   * @i: paged fragment index to initialise
> - * @page: the page to use for this fragment
> + * @netmem: the netmem to use for this fragment
>   * @off: the offset to the data with @page
>   * @size: the length of the data
>   *
> @@ -2474,10 +2494,13 @@ static inline void skb_len_add(struct sk_buff *skb, 
> int delta)
>   *
>   * Does not take any additional reference on the fragment.
>   */
> -static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> - struct page *page, int off, int size)
> +static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
> +   struct netmem *netmem, int off,
> +   int size)
>  {
> - __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
> + struct page *page = netmem_to_page(netmem);
> +
> + __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
>  
>   /* Propagate page pfmemalloc to the skb if we can. The problem is
>* that not all callers have unique ownership of the page but rely
> @@ -2485,7 +2508,21 @@ static inline void __skb_fill_page_desc(struct sk_buff 
> *skb, int i,
>*/
>   page = compound_head(page);
>   if (page_is_pfmemalloc(page))
> - skb->pfmemalloc = true;
> + skb->pfmemalloc = true;

Is it possible to introduce netmem_is_pfmemalloc() and netmem_compound_head()
for netmem, and have some build-time testing to ensure the implementation
is the same between page_is_pfmemalloc()/compound_head() and
netmem_is_pfmemalloc()/netmem_compound_head()? That way we could avoid
calling netmem_to_page() as much as possible, especially in drivers.
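
A sketch of what such helpers could look like while a netmem is still always
backed by a page (hypothetical names; the page-backed assumption matches the
current patch but is not guaranteed going forward):

        /* Hypothetical helpers; only valid while netmem is page-backed. */
        static inline struct page *netmem_compound_head(struct netmem *netmem)
        {
                return compound_head(netmem_to_page(netmem));
        }

        static inline bool netmem_is_pfmemalloc(struct netmem *netmem)
        {
                return page_is_pfmemalloc(netmem_to_page(netmem));
        }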


> +}
> +
> +static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
> + struct page *page, int off, int size)
> +{
> + __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
> +}
> +

...

>   */
>  static inline struct page *skb_frag_page(const skb_frag_t *frag)
>  {
> - return frag->bv_page;
> + return netmem_to_page(frag->bv_page);

It seems we are not able to have safe type protection for the above
function, as a driver may pass a devmem frag as a parameter here and pass
the returned page into the mm subsystem, and the compiler is not able to
catch that at compile time.

>  }
>  
>  /**
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 83af8aaeb893..053d220aa2f2 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -845,16 +845,24 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct 
> *napi, unsigned int len,
>  }
>  EXPORT_SYMBOL(__napi_alloc_skb);
>  

...

> diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
> index 65d1f6755f98..5c46db045f4c 100644
> --- a/net/kcm/kcmsock.c
> +++ b/net/kcm/kcmsock.c
> @@ -636,9 +636,15 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
>   for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
>   msize += skb_shinfo(skb)->frags[i].bv_len;
>  
> + /* The cast to struct bio_vec* here assumes the frags are
> +  * struct page based.
> +  */
> + DEBUG_NET_WARN_ON_ONCE(
> + !skb_frag_page(_shinfo(skb)->frags[0]));

It seems skb_frag_page() always returns non-NULL in this patch, so the
above check seems unnecessary?



Re: [PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2023-12-07 Thread Yunsheng Lin
On 2023/12/7 14:00, Jason Wang wrote:
> On Tue, Dec 5, 2023 at 7:35 PM Yunsheng Lin  wrote:
...

>> +
>> +static int tun_alloc(void)
>> +{
>> +   struct ifreq ifr;
>> +   int fd, e;
>> +
>> +   fd = open("/dev/net/tun", O_RDWR);
>> +   if (fd < 0) {
>> +   perror("Cannot open /dev/net/tun");
>> +   return fd;
>> +   }
>> +
>> +   memset(, 0, sizeof(ifr));
>> +
>> +   ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
> 
> Why did you use IFF_TUN but not IFF_TAP here?

To be honest, no particular reason; I just picked IFF_TUN and it happened
to work for me to test the change in vhost_net_build_xdp().

Is there a particular reason you prefer IFF_TAP over IFF_TUN?

> 
>> +   strncpy(ifr.ifr_name, "tun0", IFNAMSIZ);
> 
> tun0 is pretty common if there's a VPN. Do we need some randomized name here?

How about something like below?

snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());

> 
> 
>> +
>> +   e = ioctl(fd, TUNSETIFF, (void *) );
>> +   if (e < 0) {
>> +   perror("ioctl[TUNSETIFF]");
>> +   close(fd);
>> +   return e;
>> +   }
>> +
>> +   return fd;
>> +}
>> +
>> +/* Unused */
>> +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
> 
> Why do we need trick like these here?

That is because of the below error:
tools/virtio/./linux/kernel.h:58: undefined reference to `__kmalloc_fake'

When virtio_ring.c is compiled in userspace, the kmalloc-related functions
are implemented in tools/virtio/./linux/kernel.h, which requires those
variables to be defined.

> 
>> +
>> +struct vq_info {
>> +   int kick;
>> +   int call;
>> +   int num;
>> +   int idx;
>> +   void *ring;
>> +   /* copy used for control */
>> +   struct vring vring;
>> +   struct virtqueue *vq;
>> +};
>> +
>> +struct vdev_info {
>> +   struct virtio_device vdev;
>> +   int control;
>> +   struct pollfd fds[1];
>> +   struct vq_info vqs[1];
>> +   int nvqs;
>> +   void *buf;
>> +   size_t buf_size;
>> +   struct vhost_memory *mem;
>> +};
>> +
>> +static struct vhost_vring_file no_backend = { .index = 1, .fd = -1 },
>> +backend = { .index = 1, .fd = 1 };
> 
> A magic number like fd = 1 is pretty confusing.
> 
> And I don't see why we need global variables here.

I was using virtio_test.c as a reference; I will try to remove it
if possible.

> 
>> +static const struct vhost_vring_state null_state = {};
>> +

..

>> +
>> +done:
>> +   backend.fd = tun_alloc();
>> +   assert(backend.fd >= 0);
>> +   vdev_info_init(, features);
>> +   vq_info_add(, 256);
>> +   run_test(, [0], delayed, batch, reset, nbufs);
> 
> I'd expect we are testing some basic traffic here. E.g can we use a
> packet socket then we can test both tx and rx?

Yes, only rx for tun is tested.
Do you have an idea how to test the tx too? I am not familiar enough
with vhost_net and tun yet.

> 
> Thanks



Re: [PATCH net-next 2/6] page_frag: unify gfp bit for order 3 page allocation

2023-12-07 Thread Yunsheng Lin
On 2023/12/7 11:15, Jakub Kicinski wrote:
> On Tue, 5 Dec 2023 19:34:40 +0800 Yunsheng Lin wrote:
>> __GFP_DIRECT_RECLAIM is xor'd to avoid
>> direct reclaim in skb_page_frag_refill(), but it is not
>> xor'd in __page_frag_cache_refill().
> 
> xor is not the same thing as masking a bit off.

You are right.
Will use 'mask off', thanks.

> The patch itself LGTM.
> .
> 



[PATCH net-next 6/6] tools: virtio: introduce vhost_net_test

2023-12-05 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 441 ++
 2 files changed, 446 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..7e7b7aba3668
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH -1
+
+static int tun_alloc(void)
+{
+   struct ifreq ifr;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
+   strncpy(ifr.ifr_name, "tun0", IFNAMSIZ);
+
+   e = ioctl(fd, TUNSETIFF, (void *) );
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   return fd;
+}
+
+/* Unused */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int num;
+   int idx;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct pollfd fds[1];
+   struct vq_info vqs[1];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   struct vhost_memory *mem;
+};
+
+static struct vhost_vring_file no_backend = { .index = 1, .fd = -1 },
+backend = { .index = 1, .fd = 1 };
+static const struct vhost_vring_state null_state = {};
+
+bool vq_notify(struct virtqueue *vq)
+{
+   struct vq_info *info = vq->priv;
+   unsigned long long v = 1;
+   int r;
+   r = write(info->kick, , sizeof v);
+   assert(r == sizeof v);
+   return true;
+}
+
+void vq_callback(struct virtqueue *vq)
+{
+}
+
+
+void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info)
+{
+   struct vhost_vring_state state = { .index = info->idx };
+   struct vhost_vring_file file = { .index = info->idx };
+   unsigned long long features = dev->vdev.features;
+   struct vhost_vring_addr addr = {
+   .index = info->idx,
+   .desc_user_addr = (uint64_t)(unsigned long)info->vring.desc,
+   .avail_user_addr = (uint64_t)(unsigned long)info->vring.avail,
+   .used_user_addr = (uint64_t)(unsigned long)info->vring.used,
+   };
+   int r;
+   r = ioctl(dev->control, VHOST_SET_FEATURES, );
+   assert(r >= 0);
+   state.num = info->vring.num;
+   r = ioctl(dev->control, VHOST_SET_VRING_NUM, );
+   assert(r >= 0);
+   state.num = 0;
+   r = ioctl(dev->control, VHOST_SET_VRING_BASE, );
+   assert(r >= 0);
+   r = ioctl(dev->control, VHOST_SET_VRING_ADDR, );
+   assert(r >= 0);
+   file.fd = info->kick;
+   r = ioctl(dev->control, VHOST_SET_VRING_KICK, );
+   assert(r >= 0);
+   file.fd = info->call;
+   r = ioctl(dev->control, VHOST_SET_VRING_CALL, );
+   assert(r >= 0);
+}
+
+static void vq_reset(struct vq_info *info, int num, struct virtio_device *vdev)
+{
+   if (info->vq)
+   vring_del_virtqueue(info->vq);
+
+   memset(info->ring, 0, vring_size(num, 4096));
+   vring_init(>vring, num, info->ring, 4096

[PATCH net-next 5/6] net: introduce page_frag_cache_drain()

2023-12-05 Thread Yunsheng Lin
When draining a page_frag_cache, most users are doing
similar steps, so introduce an API to avoid code
duplication.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---
 drivers/nvme/host/tcp.c|  7 +--
 drivers/nvme/target/tcp.c  |  4 +---
 drivers/vhost/net.c|  4 +---
 include/linux/gfp.h|  2 ++
 mm/page_alloc.c| 10 ++
 7 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c 
b/drivers/net/ethernet/google/gve/gve_main.c
index 619bf63ec935..d976190b0f4d 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-   struct page_frag_cache *nc;
int i;
 
-   for (i = 0; i < priv->rx_cfg.num_queues; i++) {
-   nc = >rx[i].page_cache;
-   if (nc->va) {
-   __page_frag_cache_drain(virt_to_page(nc->va),
-   nc->pagecnt_bias);
-   nc->va = NULL;
-   }
-   }
+   for (i = 0; i < priv->rx_cfg.num_queues; i++)
+   page_frag_cache_drain(>rx[i].page_cache);
 }
 
 static int gve_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c 
b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index 7ffbd4fca881..df0a3ceaf59b 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
int i;
 
for (i = 0; i < q->n_desc; i++) {
@@ -298,19 +297,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
entry->buf = NULL;
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(>cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(>cache);
 }
 
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
-
for (;;) {
void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true);
 
@@ -320,12 +312,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
skb_free_frag(buf);
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(>cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(>cache);
 }
 
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d79811cfa0ce..1c85e1398e4e 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1344,7 +1344,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl 
*ctrl)
 
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-   struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = >queues[qid];
unsigned int noreclaim_flag;
@@ -1355,11 +1354,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, 
int qid)
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
 
-   if (queue->pf_cache.va) {
-   page = virt_to_head_page(queue->pf_cache.va);
-   __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-   queue->pf_cache.va = NULL;
-   }
+   page_frag_cache_drain(>pf_cache);
 
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 4cc27856aa8f..11237557cfc5 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1576,7 +1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct 
nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-   struct page *page;
struct nvmet_tcp_queue *queue =
container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct 
work_struct *w)
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
ida_free(_tcp_queue_ida, queue->idx);
-   page = virt_to_head_page(queue->pf_cache

[PATCH net-next 4/6] vhost/net: remove vhost_net_page_frag_refill()

2023-12-05 Thread Yunsheng Lin
The page frag in vhost_net_page_frag_refill() uses the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is similar to page_frag_alloc_align() now.

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating the frag using page_frag_alloc_align().

The added benefit is that it not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when testing with the vhost_net_test introduced in the
last patch.

Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c | 93 ++---
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..805e11d598e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = >page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = >xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(>pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = >gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
   vhost16_to_cpu(vq, gso->csum_start) +
 

[PATCH net-next 2/6] page_frag: unify gfp bit for order 3 page allocation

2023-12-05 Thread Yunsheng Lin
Currently there seem to be three page frag implementations
which all try to allocate an order 3 page; if that fails, they
then fall back to allocating an order 0 page, and each of them
allows the order 3 page allocation to fail under certain
conditions by using specific gfp bits.

The gfp bits for order 3 page allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd to forbid access to
emergency reserve memory for __page_frag_cache_refill(), but it
is not or'd in the other implementations; __GFP_DIRECT_RECLAIM
is xor'd to avoid direct reclaim in skb_page_frag_refill(), but
it is not xor'd in __page_frag_cache_refill().

This patch unifies the gfp bits used between the different
implementations by or'ing __GFP_NOMEMALLOC and xor'ing
__GFP_DIRECT_RECLAIM for the order 3 page allocation to avoid
possible pressure on mm.

Signed-off-by: Yunsheng Lin 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a16305cf985..1f0b36dd81b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index fef349dd72fa..4efa9cae4b0d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2904,7 +2904,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




[PATCH RFC 5/6] net: introduce page_frag_cache_drain()

2023-12-01 Thread Yunsheng Lin
When draining a page_frag_cache, most users are doing
similar steps, so introduce an API to avoid code
duplication.

Signed-off-by: Yunsheng Lin 
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---
 drivers/nvme/host/tcp.c|  7 +--
 drivers/nvme/target/tcp.c  |  4 +---
 drivers/vhost/net.c|  4 +---
 include/linux/gfp.h|  2 ++
 mm/page_alloc.c| 10 ++
 7 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c 
b/drivers/net/ethernet/google/gve/gve_main.c
index 619bf63ec935..d976190b0f4d 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1278,17 +1278,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-   struct page_frag_cache *nc;
int i;
 
-   for (i = 0; i < priv->rx_cfg.num_queues; i++) {
-   nc = >rx[i].page_cache;
-   if (nc->va) {
-   __page_frag_cache_drain(virt_to_page(nc->va),
-   nc->pagecnt_bias);
-   nc->va = NULL;
-   }
-   }
+   for (i = 0; i < priv->rx_cfg.num_queues; i++)
+   page_frag_cache_drain(>rx[i].page_cache);
 }
 
 static int gve_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c 
b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index 7ffbd4fca881..df0a3ceaf59b 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
int i;
 
for (i = 0; i < q->n_desc; i++) {
@@ -298,19 +297,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
entry->buf = NULL;
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(>cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(>cache);
 }
 
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-   struct page *page;
-
for (;;) {
void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true);
 
@@ -320,12 +312,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct 
mtk_wed_wo_queue *q)
skb_free_frag(buf);
}
 
-   if (!q->cache.va)
-   return;
-
-   page = virt_to_page(q->cache.va);
-   __page_frag_cache_drain(page, q->cache.pagecnt_bias);
-   memset(>cache, 0, sizeof(q->cache));
+   page_frag_cache_drain(>cache);
 }
 
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 89661a9cf850..8d4f4a06f9d9 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1338,7 +1338,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl 
*ctrl)
 
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-   struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = >queues[qid];
unsigned int noreclaim_flag;
@@ -1349,11 +1348,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, 
int qid)
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
 
-   if (queue->pf_cache.va) {
-   page = virt_to_head_page(queue->pf_cache.va);
-   __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-   queue->pf_cache.va = NULL;
-   }
+   page_frag_cache_drain(>pf_cache);
 
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 92b74d0b8686..f9a553d70a61 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1576,7 +1576,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct 
nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-   struct page *page;
struct nvmet_tcp_queue *queue =
container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1600,8 +1599,7 @@ static void nvmet_tcp_release_queue_work(struct 
work_struct *w)
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
ida_free(_tcp_queue_ida, queue->idx);
-   page = virt_to_head_page(queue->pf_cache

[PATCH RFC 6/6] tools: virtio: introduce vhost_net_test

2023-12-01 Thread Yunsheng Lin
Introduce vhost_net_test, based on virtio_test, to test
vhost_net changes in the kernel.

Signed-off-by: Yunsheng Lin 
---
 tools/virtio/Makefile |   8 +-
 tools/virtio/vhost_net_test.c | 441 ++
 2 files changed, 446 insertions(+), 3 deletions(-)
 create mode 100644 tools/virtio/vhost_net_test.c

diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d128925980e0..e25e99c1c3b7 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 all: test mod
-test: virtio_test vringh_test
+test: virtio_test vringh_test vhost_net_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
+vhost_net_test: virtio_ring.o vhost_net_test.o
 
 try-run = $(shell set -e;  \
if ($(1)) >/dev/null 2>&1;  \
@@ -49,6 +50,7 @@ oot-clean: OOT_BUILD+=clean
 
 .PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
-   ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
-  vhost_test/Module.symvers vhost_test/modules.order *.d
+   ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \
+  vhost_test/.*.cmd vhost_test/Module.symvers \
+  vhost_test/modules.order *.d
 -include *.d
diff --git a/tools/virtio/vhost_net_test.c b/tools/virtio/vhost_net_test.c
new file mode 100644
index ..7e7b7aba3668
--- /dev/null
+++ b/tools/virtio/vhost_net_test.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RANDOM_BATCH -1
+
+static int tun_alloc(void)
+{
+   struct ifreq ifr;
+   int fd, e;
+
+   fd = open("/dev/net/tun", O_RDWR);
+   if (fd < 0) {
+   perror("Cannot open /dev/net/tun");
+   return fd;
+   }
+
+   memset(, 0, sizeof(ifr));
+
+   ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
+   strncpy(ifr.ifr_name, "tun0", IFNAMSIZ);
+
+   e = ioctl(fd, TUNSETIFF, (void *) );
+   if (e < 0) {
+   perror("ioctl[TUNSETIFF]");
+   close(fd);
+   return e;
+   }
+
+   return fd;
+}
+
+/* Unused */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
+struct vq_info {
+   int kick;
+   int call;
+   int num;
+   int idx;
+   void *ring;
+   /* copy used for control */
+   struct vring vring;
+   struct virtqueue *vq;
+};
+
+struct vdev_info {
+   struct virtio_device vdev;
+   int control;
+   struct pollfd fds[1];
+   struct vq_info vqs[1];
+   int nvqs;
+   void *buf;
+   size_t buf_size;
+   struct vhost_memory *mem;
+};
+
+static struct vhost_vring_file no_backend = { .index = 1, .fd = -1 },
+backend = { .index = 1, .fd = 1 };
+static const struct vhost_vring_state null_state = {};
+
+bool vq_notify(struct virtqueue *vq)
+{
+   struct vq_info *info = vq->priv;
+   unsigned long long v = 1;
+   int r;
+   r = write(info->kick, , sizeof v);
+   assert(r == sizeof v);
+   return true;
+}
+
+void vq_callback(struct virtqueue *vq)
+{
+}
+
+
+void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info)
+{
+   struct vhost_vring_state state = { .index = info->idx };
+   struct vhost_vring_file file = { .index = info->idx };
+   unsigned long long features = dev->vdev.features;
+   struct vhost_vring_addr addr = {
+   .index = info->idx,
+   .desc_user_addr = (uint64_t)(unsigned long)info->vring.desc,
+   .avail_user_addr = (uint64_t)(unsigned long)info->vring.avail,
+   .used_user_addr = (uint64_t)(unsigned long)info->vring.used,
+   };
+   int r;
+   r = ioctl(dev->control, VHOST_SET_FEATURES, );
+   assert(r >= 0);
+   state.num = info->vring.num;
+   r = ioctl(dev->control, VHOST_SET_VRING_NUM, );
+   assert(r >= 0);
+   state.num = 0;
+   r = ioctl(dev->control, VHOST_SET_VRING_BASE, );
+   assert(r >= 0);
+   r = ioctl(dev->control, VHOST_SET_VRING_ADDR, );
+   assert(r >= 0);
+   file.fd = info->kick;
+   r = ioctl(dev->control, VHOST_SET_VRING_KICK, );
+   assert(r >= 0);
+   file.fd = info->call;
+   r = ioctl(dev->control, VHOST_SET_VRING_CALL, );
+   assert(r >= 0);
+}
+
+static void vq_reset(struct vq_info *info, int num, struct virtio_device *vdev)
+{
+   if (info->vq)
+   vring_del_virtqueue(info->vq);
+
+   memset(info->ring, 0, vring_size(num, 4096));
+   vring_init(>vring, num, info->ring, 4096

[PATCH RFC 4/6] vhost/net: remove vhost_net_page_frag_refill()

2023-12-01 Thread Yunsheng Lin
The page frag used in vhost_net_page_frag_refill() is the
'struct page_frag' from skb_page_frag_refill(), but its
implementation is now similar to page_frag_alloc_align().

This patch removes vhost_net_page_frag_refill() by using
'struct page_frag_cache' instead of 'struct page_frag',
and allocating the frag using page_frag_alloc_align().

The added benefit is that this not only unifies the page frag
implementation a little, but also gives about a 0.5% performance
boost when testing with the vhost_net_test introduced in the
last patch.
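
As a rough sketch of the conversion (not the exact vhost code; see
the diff below for that), the per-device state and the allocation
call change roughly as follows, with 'struct some_dev' and the
helpers being illustrative names only:

/* Sketch only: the shape of the conversion, mirroring the diff below. */
struct some_dev {
	struct page_frag_cache pf_cache; /* was: struct page_frag + refcount bias */
};

static void *xmit_buf_alloc(struct some_dev *d, unsigned int buflen)
{
	/* One call replaces the open-coded refill and offset management. */
	return page_frag_alloc_align(&d->pf_cache, buflen, GFP_KERNEL,
				     SMP_CACHE_BYTES);
}

static void xmit_buf_free(void *buf)
{
	page_frag_free(buf);	/* releases one fragment reference */
}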

Signed-off-by: Yunsheng Lin 
---
 drivers/vhost/net.c | 93 ++---
 1 file changed, 29 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e574e21cc0ca..805e11d598e4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -141,10 +141,8 @@ struct vhost_net {
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
-   /* Private page frag */
-   struct page_frag page_frag;
-   /* Refcount bias of page frag */
-   int refcnt_bias;
+   /* Private page frag cache */
+   struct page_frag_cache pf_cache;
 };
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -655,41 +653,6 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, 
size_t total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
-  struct page_frag *pfrag, gfp_t gfp)
-{
-   if (pfrag->page) {
-   if (pfrag->offset + sz <= pfrag->size)
-   return true;
-   __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
-   }
-
-   pfrag->offset = 0;
-   net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
-   }
-   pfrag->page = alloc_page(gfp);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE;
-   goto done;
-   }
-   return false;
-
-done:
-   net->refcnt_bias = USHRT_MAX;
-   page_ref_add(pfrag->page, USHRT_MAX - 1);
-   return true;
-}
-
 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
@@ -699,7 +662,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
struct vhost_net *net = container_of(vq->dev, struct vhost_net,
 dev);
struct socket *sock = vhost_vq_get_backend(vq);
-   struct page_frag *alloc_frag = >page_frag;
struct virtio_net_hdr *gso;
struct xdp_buff *xdp = >xdp[nvq->batched_xdp];
struct tun_xdp_hdr *hdr;
@@ -710,6 +672,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
int sock_hlen = nvq->sock_hlen;
void *buf;
int copied;
+   int ret;
 
if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
@@ -719,18 +682,17 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
return -ENOSPC;
 
buflen += SKB_DATA_ALIGN(len + pad);
-   alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-   if (unlikely(!vhost_net_page_frag_refill(net, buflen,
-alloc_frag, GFP_KERNEL)))
+   buf = page_frag_alloc_align(>pf_cache, buflen, GFP_KERNEL,
+   SMP_CACHE_BYTES);
+   if (unlikely(!buf))
return -ENOMEM;
 
-   buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-   copied = copy_page_from_iter(alloc_frag->page,
-alloc_frag->offset +
-offsetof(struct tun_xdp_hdr, gso),
-sock_hlen, from);
-   if (copied != sock_hlen)
-   return -EFAULT;
+   copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
+   sock_hlen, from);
+   if (copied != sock_hlen) {
+   ret = -EFAULT;
+   goto err;
+   }
 
hdr = buf;
gso = >gso;
@@ -743,27 +705,30 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue 
*nvq,
   vhost16_to_cpu(vq, gso->csum_start) +
 

[PATCH RFC 2/6] page_frag: unify gfp bit for order 3 page allocation

2023-12-01 Thread Yunsheng Lin
Currently there are three page frag implementations, all of
which first try to allocate an order-3 page and fall back to
an order-0 page if that fails, and each of them allows the
order-3 allocation to fail under certain conditions by using
specific gfp bits.

The gfp bits for the order-3 allocation differ between the
implementations: __GFP_NOMEMALLOC is or'd in to forbid access
to the emergency memory reserves in __page_frag_cache_refill(),
but not in the other implementations, while __GFP_DIRECT_RECLAIM
is masked off to avoid direct reclaim in skb_page_frag_refill(),
but not in __page_frag_cache_refill().

This patch unifies the gfp bits used by the different
implementations by or'ing in __GFP_NOMEMALLOC and masking off
__GFP_DIRECT_RECLAIM for the order-3 allocation, to avoid
possible memory pressure.
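
For reference, a minimal sketch of the unified allocation pattern
described above (the order-3 attempt plus the order-0 fallback); it
only mirrors the gfp handling in the hunks below and is not a
drop-in replacement for any caller. SKB_FRAG_PAGE_ORDER is assumed
from the existing callers and frag_refill_sketch() is an
illustrative name:

/* Sketch only: unified gfp bits for the order-3 attempt, falling
 * back to an order-0 page.
 */
static struct page *frag_refill_sketch(gfp_t gfp, unsigned int *size)
{
	struct page *page;

	/* Order-3 attempt: no direct reclaim, no emergency reserves,
	 * allowed to fail quickly without retries or warnings.
	 */
	page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
			   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC,
			   SKB_FRAG_PAGE_ORDER);
	if (page) {
		*size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
		return page;
	}

	/* Fall back to a single order-0 page with the caller's gfp. */
	page = alloc_page(gfp);
	if (page)
		*size = PAGE_SIZE;
	return page;
}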

Signed-off-by: Yunsheng Lin 
CC: Alexander Duyck 
---
 drivers/vhost/net.c | 2 +-
 mm/page_alloc.c | 4 ++--
 net/core/sock.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..e574e21cc0ca 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a16305cf985..1f0b36dd81b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
page_frag_cache *nc,
gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-   gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-   __GFP_NOMEMALLOC;
+   gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
+  __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
diff --git a/net/core/sock.c b/net/core/sock.c
index fef349dd72fa..4efa9cae4b0d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2904,7 +2904,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
page_frag *pfrag, gfp_t gfp)
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
  __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NORETRY | __GFP_NOMEMALLOC,
  SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-- 
2.33.0




Re: [PATCH net v4 1/2] net: sched: fix packet stuck problem for lockless qdisc

2021-04-19 Thread Yunsheng Lin
On 2021/4/20 7:55, Michal Kubecek wrote:
> On Mon, Apr 19, 2021 at 05:29:46PM +0200, Michal Kubecek wrote:
>>
>> As pointed out in the discussion on v3, this patch may result in
>> significantly higher CPU consumption with multiple threads competing on
>> a saturated outgoing device. I missed this submission so that I haven't
>> checked it yet but given the description of v3->v4 changes above, it's
>> quite likely that it suffers from the same problem.
> 
> And it indeed does. However, with the additional patch from the v3
> discussion, the numbers are approximately the same as with an unpatched
> mainline kernel.
> 
> As with v4, I tried this patch on top of 5.12-rc7 with real devices.
> I used two machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs
> (2 8-core CPUs with HT disabled) and 16 Rx/Tx queues, receiver has
> 48 CPUs (2 12-core CPUs with HT enabled) and 48 Rx/Tx queues.
> 
>   threads    5.12-rc7    5.12-rc7 + v4    5.12-rc7 + v4 + stop
>         1       25.1%            38.1%                   22.9%
>         8       66.2%           277.0%                   74.1%
>        16       90.1%           150.7%                   91.0%
>        32      107.2%           272.6%                  108.3%
>        64      116.3%           487.5%                  118.1%
>       128      126.1%           946.7%                  126.9%
> 
> (The values are normalized to one core, i.e. 100% corresponds to one
> fully used logical CPU.)
> 
> So it seems that repeated scheduling while the queue was stopped is
> indeed the main performance issue and that other cases of the logic
> being too pessimistic do not play significant role. There is an
> exception with 8 connections/threads and the result with just this
> series also looks abnormally high (e.g. much higher than with
> 16 threads). It might be worth investigating what happens there and
> what do the results with other thread counts around 8 look like.

I will try to investigate the 8 connections/threads case.

> 
> I'll run some more tests with other traffic patterns tomorrow and
> I'm also going to take a closer look at the additional patch.

Thanks for the detailed testing and review.

> 
> Michal
> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-19 Thread Yunsheng Lin
On 2021/4/19 22:57, Michal Kubecek wrote:
> On Mon, Apr 19, 2021 at 10:04:27AM +0800, Yunsheng Lin wrote:
>>>
>>> I tried this patch o top of 5.12-rc7 with real devices. I used two
>>> machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs (2 8-core CPUs
>>> with HT disabled) and 16 Rx/Tx queues, receiver has 48 CPUs (2 12-core
>>> CPUs with HT enabled) and 48 Rx/Tx queues. With multiple TCP streams on
>>> a saturated ethernet, the CPU consumption grows quite a lot:
>>>
>>> threads   unpatched 5.12-rc7   5.12-rc7 + v3
>>>   1             25.6%               30.6%
>>>   8             73.1%              241.4%
>>> 128            132.2%             1012.0%
>>>
>>> (The values are normalized to one core, i.e. 100% corresponds to one
>>> fully used logical CPU.) I didn't perform a full statistical evaluation
>>> but the growth is way beyond any statistical fluctuation with one
>>> exception: 8-thread test of patched kernel showed values from 155.5% to
>>> 311.4%. Closer look shows that most of the CPU time was spent in softirq
>>> and running top in parallel with the test confirms that there are
>>> multiple ksofirqd threads running at 100% CPU. I had similar problem
>>> with earlier versions of my patch (work in progress, I still need to
>>> check some corner cases and write commit message explaining the logic)
>>
>> Great, if there is a better idea, maybe share the core idea first so
>> that we both can work on the that?
> 
> I'm not sure if it's really better but to minimize the false positives
> and unnecessary calls to __netif_schedule(), I replaced q->seqlock with
> an atomic combination of a "running" flag (which corresponds to current
> seqlock being locked) and a "drainers" count (number of other threads
> going to clean up the qdisc queue). This way we could keep track of them
> and get reliable information if another thread is going to run a cleanup
> after we leave the qdisc_run() critical section (so that there is no
> need to schedule).

It seems you are trying to match the skb enqueuing with the calling of
__qdisc_run() here, which is not reliable when dequeue batching is
considered; see try_bulk_dequeue_skb() or try_bulk_dequeue_skb_slow()
in dequeue_skb().

> 
>>> The biggest problem IMHO is that the loop in __qdisc_run() may finish
>>> without rescheduling not only when the qdisc queue is empty but also
>>> when the corresponding device Tx queue is stopped which devices tend to
>>> do whenever they cannot send any more packets out. Thus whenever
>>> __QDISC_STATE_NEED_RESCHEDULE is set with device queue stopped or
>>> frozen, we keep rescheduling the queue cleanup without any chance to
>>> progress or clear __QDISC_STATE_NEED_RESCHEDULE. For this to happen, all
>>> we need is another thready to fail the first spin_trylock() while device
>>> queue is stopped and qdisc queue not empty.
>>
>> Yes, We could just return false before doing the second spin_trylock() if
>> the netdev queue corresponding qdisc is stopped, and when the netdev queue
>> is restarted, __netif_schedule() is called again, see netif_tx_wake_queue().
>>
>> Maybe add a sch_qdisc_stopped() function and do the testting in 
>> qdisc_run_begin:
>>
>> if (dont_retry || sch_qdisc_stopped())
>>  return false;
>>
>> bool sch_qdisc_stopped(struct Qdisc *q)
>> {
>>  const struct netdev_queue *txq = q->dev_queue;
>>
>>  if (netif_xmit_frozen_or_stopped(txq))
>>  return true;
>>
>>  return false;
>> }
>>
>> At least for qdisc with TCQ_F_ONETXQUEUE flags set is doable?
> 
> Either this or you can do the check in qdisc_run_end() - when the device
> queue is stopped or frozen, there is no need to schedule as we know it's
> going to be done when the flag is cleared again (and we cannot do
> anything until then anyway).
> 
>>> Another part of the problem may be that to avoid the race, the logic is
>>> too pessimistic: consider e.g. (dotted lines show "barriers" where
>>> ordering is important):
>>>
>>> CPU ACPU B
>>> spin_trylock() succeeds
>>>  pfifo_fast_enqueue()
>>> ..
>>> skb_array empty, exit loop
>>>  first spin_trylock() fails
>>>  set __QDISC_STATE_NEED_RESCHEDU

Re: [Linuxarm] Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-19 Thread Yunsheng Lin
On 2021/4/19 10:04, Yunsheng Lin wrote:
> On 2021/4/19 6:59, Michal Kubecek wrote:
>> On Thu, Mar 25, 2021 at 11:13:11AM +0800, Yunsheng Lin wrote:
>>> Lockless qdisc has below concurrent problem:
>>> cpu0 cpu1
>>>  . .
>>> q->enqueue .
>>>  . .
>>> qdisc_run_begin()  .
>>>  . .
>>> dequeue_skb()  .
>>>  . .
>>> sch_direct_xmit()  .
>>>  . .
>>>  .q->enqueue
>>>  . qdisc_run_begin()
>>>  .return and do nothing
>>>  . .
>>> qdisc_run_end().
>>>
>>> cpu1 enqueue a skb without calling __qdisc_run() because cpu0
>>> has not released the lock yet and spin_trylock() return false
>>> for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb
>>> enqueued by cpu1 when calling dequeue_skb() because cpu1 may
>>> enqueue the skb after cpu0 calling dequeue_skb() and before
>>> cpu0 calling qdisc_run_end().
>>>
>>> Lockless qdisc has below another concurrent problem when
>>> tx_action is involved:
>>>
>>> cpu0(serving tx_action) cpu1 cpu2
>>>   .   ..
>>>   .  q->enqueue.
>>>   .qdisc_run_begin()   .
>>>   .  dequeue_skb() .
>>>   .   .q->enqueue
>>>   .   ..
>>>   . sch_direct_xmit()  .
>>>   .   . qdisc_run_begin()
>>>   .   .   return and do nothing
>>>   .   ..
>>>  clear __QDISC_STATE_SCHED..
>>>  qdisc_run_begin()..
>>>  return and do nothing..
>>>   .   ..
>>>   .qdisc_run_end() .
>>>
>>> This patch fixes the above data race by:
>>> 1. Get the flag before doing spin_trylock().
>>> 2. If the first spin_trylock() return false and the flag is not
>>>set before the first spin_trylock(), Set the flag and retry
>>>another spin_trylock() in case other CPU may not see the new
>>>flag after it releases the lock.
>>> 3. reschedule if the flags is set after the lock is released
>>>at the end of qdisc_run_end().
>>>
>>> For tx_action case, the flags is also set when cpu1 is at the
>>> end if qdisc_run_end(), so tx_action will be rescheduled
>>> again to dequeue the skb enqueued by cpu2.
>>>
>>> Only clear the flag before retrying a dequeuing when dequeuing
>>> returns NULL in order to reduce the overhead of the above double
>>> spin_trylock() and __netif_schedule() calling.
>>>
>>> The performance impact of this patch, tested using pktgen and
>>> dummy netdev with pfifo_fast qdisc attached:
>>>
>>>  threads  without+this_patch   with+this_patch      delta
>>>     1          2.61Mpps             2.60Mpps        -0.3%
>>>     2          3.97Mpps             3.82Mpps        -3.7%
>>>     4          5.62Mpps             5.59Mpps        -0.5%
>>>     8          2.78Mpps             2.77Mpps        -0.3%
>>>    16          2.22Mpps             2.22Mpps        -0.0%
>>>
>>> Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
>>> Signed-off-by: Yunsheng Lin 
>>> ---
>>> V3: fix a compile error and a few comment typo, remove the
>>> __QDISC_STATE_DEACTIVATED checking, and update the
>>> performance data.
>>> V2: Avoid the overhead of fixing the data race as much as
>>> possible.
>>
>> I tried this patch o top of 5.12-rc7 with real devices. I used two
>> machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs (2 8-core CPUs
>> with HT disabled) and 16 Rx/Tx queues, receiver has 48 CPUs (2 12-core
>> CPUs with HT enabled) and 48 Rx/Tx queues. With multiple TCP streams on
>> a saturated ethernet, the CPU consumption grows quite a lot:
>>
>> threads   unpatched 5.12-rc7   5.12-rc7 + v3
>>   1             25.6%               30.6%
>>   8

Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-18 Thread Yunsheng Lin
On 2021/4/19 6:59, Michal Kubecek wrote:
> On Thu, Mar 25, 2021 at 11:13:11AM +0800, Yunsheng Lin wrote:
>> Lockless qdisc has below concurrent problem:
>> cpu0 cpu1
>>  . .
>> q->enqueue .
>>  . .
>> qdisc_run_begin()  .
>>  . .
>> dequeue_skb()  .
>>  . .
>> sch_direct_xmit()  .
>>  . .
>>  .q->enqueue
>>  . qdisc_run_begin()
>>  .return and do nothing
>>  . .
>> qdisc_run_end().
>>
>> cpu1 enqueue a skb without calling __qdisc_run() because cpu0
>> has not released the lock yet and spin_trylock() return false
>> for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb
>> enqueued by cpu1 when calling dequeue_skb() because cpu1 may
>> enqueue the skb after cpu0 calling dequeue_skb() and before
>> cpu0 calling qdisc_run_end().
>>
>> Lockless qdisc has below another concurrent problem when
>> tx_action is involved:
>>
>> cpu0(serving tx_action) cpu1 cpu2
>>   .   ..
>>   .  q->enqueue.
>>   .qdisc_run_begin()   .
>>   .  dequeue_skb() .
>>   .   .q->enqueue
>>   .   ..
>>   . sch_direct_xmit()  .
>>   .   . qdisc_run_begin()
>>   .   .   return and do nothing
>>   .   ..
>>  clear __QDISC_STATE_SCHED..
>>  qdisc_run_begin()..
>>  return and do nothing..
>>   .   ..
>>   .qdisc_run_end() .
>>
>> This patch fixes the above data race by:
>> 1. Get the flag before doing spin_trylock().
>> 2. If the first spin_trylock() return false and the flag is not
>>set before the first spin_trylock(), Set the flag and retry
>>another spin_trylock() in case other CPU may not see the new
>>flag after it releases the lock.
>> 3. reschedule if the flags is set after the lock is released
>>at the end of qdisc_run_end().
>>
>> For tx_action case, the flags is also set when cpu1 is at the
>> end if qdisc_run_end(), so tx_action will be rescheduled
>> again to dequeue the skb enqueued by cpu2.
>>
>> Only clear the flag before retrying a dequeuing when dequeuing
>> returns NULL in order to reduce the overhead of the above double
>> spin_trylock() and __netif_schedule() calling.
>>
>> The performance impact of this patch, tested using pktgen and
>> dummy netdev with pfifo_fast qdisc attached:
>>
>>  threads  without+this_patch   with+this_patch      delta
>>     1          2.61Mpps             2.60Mpps        -0.3%
>>     2          3.97Mpps             3.82Mpps        -3.7%
>>     4          5.62Mpps             5.59Mpps        -0.5%
>>     8          2.78Mpps             2.77Mpps        -0.3%
>>    16          2.22Mpps             2.22Mpps        -0.0%
>>
>> Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
>> Signed-off-by: Yunsheng Lin 
>> ---
>> V3: fix a compile error and a few comment typo, remove the
>> __QDISC_STATE_DEACTIVATED checking, and update the
>> performance data.
>> V2: Avoid the overhead of fixing the data race as much as
>> possible.
> 
> I tried this patch o top of 5.12-rc7 with real devices. I used two
> machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs (2 8-core CPUs
> with HT disabled) and 16 Rx/Tx queues, receiver has 48 CPUs (2 12-core
> CPUs with HT enabled) and 48 Rx/Tx queues. With multiple TCP streams on
> a saturated ethernet, the CPU consumption grows quite a lot:
> 
> threads   unpatched 5.12-rc7   5.12-rc7 + v3
>   1             25.6%               30.6%
>   8             73.1%              241.4%
> 128            132.2%             1012.0%
> 
> (The values are normalized to one core, i.e. 100% corresponds to one
> fully used logical CPU.) I didn't perform a full statistical evaluation
> but the growth is way beyond any statistical fluctuation with one
> exception: 8-thread test of patched kernel showed values from 155.5% to
> 311.4%.

Re: [PATCH net] net: fix use-after-free when UDP GRO with shared fraglist

2021-04-16 Thread Yunsheng Lin
On 2021/1/6 11:32, Dongseok Yi wrote:
> On 2021-01-06 12:07, Willem de Bruijn wrote:
>>
>> On Tue, Jan 5, 2021 at 8:29 PM Dongseok Yi  wrote:
>>>
>>> On 2021-01-05 06:03, Willem de Bruijn wrote:

 On Mon, Jan 4, 2021 at 4:00 AM Dongseok Yi  wrote:
>
> skbs in frag_list could be shared by pskb_expand_head() from BPF.

 Can you elaborate on the BPF connection?
>>>
>>> With the following registered ptypes,
>>>
>>> /proc/net # cat ptype
>>> Type Device  Function
>>> ALL   tpacket_rcv
>>> 0800  ip_rcv.cfi_jt
>>> 0011  llc_rcv.cfi_jt
>>> 0004  llc_rcv.cfi_jt
>>> 0806  arp_rcv
>>> 86dd  ipv6_rcv.cfi_jt
>>>
>>> BPF checks skb_ensure_writable between tpacket_rcv and ip_rcv
>>> (or ipv6_rcv). And it calls pskb_expand_head.
>>>
>>> [  132.051228] pskb_expand_head+0x360/0x378
>>> [  132.051237] skb_ensure_writable+0xa0/0xc4
>>> [  132.051249] bpf_skb_pull_data+0x28/0x60
>>> [  132.051262] bpf_prog_331d69c77ea5e964_schedcls_ingres+0x5f4/0x1000
>>> [  132.051273] cls_bpf_classify+0x254/0x348
>>> [  132.051284] tcf_classify+0xa4/0x180
>>
>> Ah, you have a BPF program loaded at TC. That was not entirely obvious.
>>
>> This program gets called after packet sockets with ptype_all, before
>> those with a specific protocol.
>>
>> Tcpdump will have inserted a program with ptype_all, which cloned the
>> skb. This triggers skb_ensure_writable -> pskb_expand_head ->
>> skb_clone_fraglist -> skb_get.
>>
>>> [  132.051294] __netif_receive_skb_core+0x590/0xd28
>>> [  132.051303] __netif_receive_skb+0x50/0x17c
>>> [  132.051312] process_backlog+0x15c/0x1b8
>>>

> While tcpdump, sk_receive_queue of PF_PACKET has the original frag_list.
> But the same frag_list is queued to PF_INET (or PF_INET6) as the fraglist
> chain made by skb_segment_list().
>
> If the new skb (not frag_list) is queued to one of the sk_receive_queue,
> multiple ptypes can see this. The skb could be released by ptypes and
> it causes use-after-free.

 If I understand correctly, a udp-gro-list skb makes it up the receive
 path with one or more active packet sockets.

 The packet socket will call skb_clone after accepting the filter. This
 replaces the head_skb, but shares the skb_shinfo and thus frag_list.

 udp_rcv_segment later converts the udp-gro-list skb to a list of
 regular packets to pass these one-by-one to udp_queue_rcv_one_skb.
 Now all the frags are fully fledged packets, with headers pushed
 before the payload. This does not change their refcount anymore than
 the skb_clone in pf_packet did. This should be 1.

 Eventually udp_recvmsg will call skb_consume_udp on each packet.

 The packet socket eventually also frees its cloned head_skb, which triggers

   kfree_skb_list(shinfo->frag_list)
 kfree_skb
   skb_unref
 refcount_dec_and_test(>users)
>>>
>>> Every your understanding is right, but
>>>

>
> [ 4443.426215] [ cut here ]
> [ 4443.426222] refcount_t: underflow; use-after-free.
> [ 4443.426291] WARNING: CPU: 7 PID: 28161 at lib/refcount.c:190
> refcount_dec_and_test_checked+0xa4/0xc8
> [ 4443.426726] pstate: 6045 (nZCv daif +PAN -UAO)
> [ 4443.426732] pc : refcount_dec_and_test_checked+0xa4/0xc8
> [ 4443.426737] lr : refcount_dec_and_test_checked+0xa0/0xc8
> [ 4443.426808] Call trace:
> [ 4443.426813]  refcount_dec_and_test_checked+0xa4/0xc8
> [ 4443.426823]  skb_release_data+0x144/0x264
> [ 4443.426828]  kfree_skb+0x58/0xc4
> [ 4443.426832]  skb_queue_purge+0x64/0x9c
> [ 4443.426844]  packet_set_ring+0x5f0/0x820
> [ 4443.426849]  packet_setsockopt+0x5a4/0xcd0
> [ 4443.426853]  __sys_setsockopt+0x188/0x278
> [ 4443.426858]  __arm64_sys_setsockopt+0x28/0x38
> [ 4443.426869]  el0_svc_common+0xf0/0x1d0
> [ 4443.426873]  el0_svc_handler+0x74/0x98
> [ 4443.426880]  el0_svc+0x8/0xc
>
> Fixes: 3a1296a38d0c (net: Support GRO/GSO fraglist chaining.)
> Signed-off-by: Dongseok Yi 
> ---
>  net/core/skbuff.c | 20 +++-
>  1 file changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f62cae3..1dcbda8 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3655,7 +3655,8 @@ struct sk_buff *skb_segment_list(struct sk_buff 
> *skb,
> unsigned int delta_truesize = 0;
> unsigned int delta_len = 0;
> struct sk_buff *tail = NULL;
> -   struct sk_buff *nskb;
> +   struct sk_buff *nskb, *tmp;
> +   int err;
>
> skb_push(skb, -skb_network_offset(skb) + offset);
>
> @@ -3665,11 +3666,28 @@ struct sk_buff *skb_segment_list(struct sk_buff 
> *skb,
> nskb = list_skb;
> list_skb = list_skb->next;
>
> +   

[PATCH net v4 1/2] net: sched: fix packet stuck problem for lockless qdisc

2021-04-15 Thread Yunsheng Lin
Lockless qdisc has below concurrent problem:
cpu0                    cpu1
  .                       .
q->enqueue                .
  .                       .
qdisc_run_begin()         .
  .                       .
dequeue_skb()             .
  .                       .
sch_direct_xmit()         .
  .                       .
  .                  q->enqueue
  .                  qdisc_run_begin()
  .                  return and do nothing
  .                       .
qdisc_run_end()           .

cpu1 enqueues a skb without calling __qdisc_run() because cpu0
has not released the lock yet and spin_trylock() returns false
for cpu1 in qdisc_run_begin(), and cpu0 does not see the skb
enqueued by cpu1 when calling dequeue_skb() because cpu1 may
enqueue the skb after cpu0 calls dequeue_skb() and before
cpu0 calls qdisc_run_end().

Lockless qdisc has below another concurrent problem when
tx_action is involved:

cpu0(serving tx_action)       cpu1                  cpu2
          .                     .                     .
          .                q->enqueue                 .
          .                qdisc_run_begin()          .
          .                dequeue_skb()              .
          .                     .                q->enqueue
          .                     .                     .
          .                sch_direct_xmit()          .
          .                     .                qdisc_run_begin()
          .                     .                return and do nothing
          .                     .                     .
clear __QDISC_STATE_SCHED       .                     .
qdisc_run_begin()               .                     .
return and do nothing           .                     .
          .                     .                     .
          .                qdisc_run_end()            .

This patch fixes the above data race by:
1. Testing STATE_MISSED before doing spin_trylock().
2. If the first spin_trylock() returns false and STATE_MISSED was
   not set before that first spin_trylock(), setting STATE_MISSED
   and retrying another spin_trylock(), in case the other CPU does
   not see STATE_MISSED after it releases the lock.
3. Rescheduling if STATE_MISSED is set after the lock is released
   at the end of qdisc_run_end().

For the tx_action case, STATE_MISSED is also set when cpu1 is at
the end of qdisc_run_end(), so tx_action will be rescheduled again
to dequeue the skb enqueued by cpu2.

Clear STATE_MISSED before retrying a dequeue when dequeuing returns
NULL, in order to reduce the overhead of the above double
spin_trylock() and the extra __netif_schedule() call.
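
A condensed sketch of the three steps above (the exact hunks are in
the diff below; this omits the TCQ_F_NOLOCK check and the empty
handling, and the helper names are illustrative only):

/* Sketch only: the MISSED handling described above, for the
 * lockless (TCQ_F_NOLOCK) path.
 */
static inline bool sketch_run_begin(struct Qdisc *q)
{
	bool missed = test_bit(__QDISC_STATE_MISSED, &q->state);

	if (spin_trylock(&q->seqlock))
		return true;

	/* Step 2: another cpu holds seqlock. If MISSED was already set
	 * before the trylock, that cpu is guaranteed to notice it, so
	 * give up here.
	 */
	if (missed)
		return false;

	set_bit(__QDISC_STATE_MISSED, &q->state);

	/* Retry once in case the lock owner released seqlock before it
	 * could see the newly set flag.
	 */
	return spin_trylock(&q->seqlock);
}

static inline void sketch_run_end(struct Qdisc *q)
{
	spin_unlock(&q->seqlock);

	/* Step 3: someone failed the trylock while we held seqlock. */
	if (test_bit(__QDISC_STATE_MISSED, &q->state))
		__netif_schedule(q);
}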

The performance impact of this patch, tested using pktgen and
dummy netdev with pfifo_fast qdisc attached:

 threads  without+this_patch   with+this_patch      delta
    1          2.61Mpps             2.60Mpps        -0.3%
    2          3.97Mpps             3.82Mpps        -3.7%
    4          5.62Mpps             5.59Mpps        -0.5%
    8          2.78Mpps             2.77Mpps        -0.3%
   16          2.22Mpps             2.22Mpps        -0.0%

Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Yunsheng Lin 
Tested-by: Juergen Gross 
---
V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED mirroring
NAPI's NAPIF_STATE_MISSED, and add Juergen's "Tested-by"
tag for there is only renaming and typo fixing between
V4 and V3.
V3: Fix a compile error and a few comment typo, remove the
__QDISC_STATE_DEACTIVATED checking, and update the
performance data.
V2: Avoid the overhead of fixing the data race as much as
possible.
---
 include/net/sch_generic.h | 37 -
 net/sched/sch_generic.c   | 12 
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f7a6e14..b85b8ea 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,7 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
__QDISC_STATE_SCHED,
__QDISC_STATE_DEACTIVATED,
+   __QDISC_STATE_MISSED,
 };
 
 struct qdisc_size_table {
@@ -159,8 +160,37 @@ static inline bool qdisc_is_empty(const struct Qdisc 
*qdisc)
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
if (qdisc->flags & TCQ_F_NOLOCK) {
+   bool dont_retry = test_bit(__QDISC_STATE_MISSED,
+  >state);
+
+   if (spin_trylock(>seqlock))
+   goto nolock_empty;
+
+   /* If the flag is set before doing the spin_trylock() and
+* the above spin_trylock() return false, it means other cpu
+* holding the lock will do dequeuing for us, or it wil see
+* the flag set after releasing lock and reschedule the
+* net_tx_action() to do the dequeuing.
+*/
+   if (dont_retry)
+   return false;
+
+   /* We could do set_bit() before the first spin_trylock(),
+* and

[PATCH net v4 2/2] net: sched: fix endless tx action reschedule during deactivation

2021-04-15 Thread Yunsheng Lin
Currently qdisc_run() checks STATE_DEACTIVATED of the lockless
qdisc before calling __qdisc_run(), which ultimately clears
STATE_MISSED once all the skbs are dequeued. If STATE_DEACTIVATED
is set before STATE_MISSED is cleared, there may be endless
rescheduling of net_tx_action() at the end of qdisc_run_end(),
see below:

CPU0(net_tx_action)  CPU1(__dev_xmit_skb)  CPU2(dev_deactivate)
  .   . .
  .set STATE_MISSED .
  .   __netif_schedule().
  .   .   set STATE_DEACTIVATED
  .   .qdisc_reset()
  .   . .
  .<---   .  synchronize_net()
clear __QDISC_STATE_SCHED  |  . .
  .|  . .
  .|  . .
  .|  .   ->.
  .|  .  |  .
  test STATE_DEACTIVATED   |  .  | some_qdisc_is_busy()
__qdisc_run() *not* called |  .  |-return *true*
  .|  . .
   test STATE_MISS |  . .
 __netif_schedule()|  . .
  .   . .
  .   . .

__qdisc_run() is not called by net_tx_action() on CPU0 because
CPU2 has set the STATE_DEACTIVATED flag during dev_deactivate(),
and since STATE_MISSED is only cleared in __qdisc_run(),
__netif_schedule() is called endlessly at the end of
qdisc_run_end(), causing an endless tx action rescheduling
problem.

qdisc_run() called by net_tx_action() runs in softirq context,
which should have the same semantics as the qdisc_run() called by
__dev_xmit_skb() under rcu_read_lock_bh(). And as there is a
synchronize_net() between the STATE_DEACTIVATED flag being set and
qdisc_reset()/some_qdisc_is_busy() in dev_deactivate(), we can
safely bail out for the deactivated lockless qdisc in
net_tx_action(), and qdisc_reset() will reset all skbs not yet
dequeued.

So add the rcu_read_lock() explicitly to protect the qdisc_run()
and do the STATE_DEACTIVATED check in net_tx_action() before
calling qdisc_run_begin(). Another option is to do the check in
qdisc_run_end(), but that would add unnecessary overhead for the
non-tx_action case, because __dev_queue_xmit() will not see a
qdisc with STATE_DEACTIVATED after synchronize_net(); such a qdisc
can only be seen by net_tx_action() because of __netif_schedule().

The STATE_DEACTIVATED check in qdisc_run() is there to avoid a
race between net_tx_action() and qdisc_reset(), see:
commit d518d2ed8640 ("net/sched: fix race between deactivation
and dequeue for NOLOCK qdisc"). As the bailout added above for the
deactivated lockless qdisc in net_tx_action() provides better
protection against the race without calling qdisc_run() at all,
remove the STATE_DEACTIVATED check from qdisc_run().

After qdisc_reset(), there are no skbs in the qdisc left to
dequeue, so clear STATE_MISSED in dev_reset_queue() too.
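
A rough sketch of the bailout described above, condensed into a
helper (the real change is open-coded in the net_tx_action() hunk
below; sketch_tx_action_should_run() is an illustrative name):

/* Sketch only: whether net_tx_action() should bother running a
 * rescheduled qdisc at all.
 */
static bool sketch_tx_action_should_run(struct Qdisc *q)
{
	if (!(q->flags & TCQ_F_NOLOCK))
		return true;	/* locked qdisc: take qdisc_lock() and run */

	/* Deactivated lockless qdisc: dev_deactivate() has already done
	 * synchronize_net(), and qdisc_reset() drops any skbs still
	 * queued, so just clear SCHED and skip qdisc_run() entirely.
	 */
	if (test_bit(__QDISC_STATE_DEACTIVATED, &q->state)) {
		clear_bit(__QDISC_STATE_SCHED, &q->state);
		return false;
	}

	return true;
}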

Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Yunsheng Lin 
---
 include/net/pkt_sched.h |  7 +--
 net/core/dev.c  | 26 ++
 net/sched/sch_generic.c |  4 +++-
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index f5c1bee..6d7b12c 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -128,12 +128,7 @@ void __qdisc_run(struct Qdisc *q);
 static inline void qdisc_run(struct Qdisc *q)
 {
if (qdisc_run_begin(q)) {
-   /* NOLOCK qdisc must check 'state' under the qdisc seqlock
-* to avoid racing with dev_qdisc_reset()
-*/
-   if (!(q->flags & TCQ_F_NOLOCK) ||
-   likely(!test_bit(__QDISC_STATE_DEACTIVATED, >state)))
-   __qdisc_run(q);
+   __qdisc_run(q);
qdisc_run_end(q);
}
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index be941ed..47cefcc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4958,25 +4958,43 @@ static __latent_entropy void net_tx_action(struct 
softirq_action *h)
sd->output_queue_tailp = >output_queue;
local_irq_enable();
 
+   rcu_read_lock();
+
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
 
head = head->next_sched;
 
-   if (!(q->flags & TCQ_F_NOLOCK)) {
-   root_lock = qdisc_lock(q);
-   spin_lock(root_lock);
-   }
/* We need to make sure head-&

[PATCH net v4 0/2] fix packet stuck problem for lockless qdisc

2021-04-15 Thread Yunsheng Lin
This patchset fixes the packet stuck problem mentioned in [1].

Patch 1: Add STATE_MISSED flag to fix packet stuck problem.
Patch 2: Fix a tx_action rescheduling problem after STATE_MISSED
 flag is added in patch 1.

V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED and add patch 2.

[1]. https://lkml.org/lkml/2019/10/9/42

Yunsheng Lin (2):
  net: sched: fix packet stuck problem for lockless qdisc
  net: sched: fix endless tx action reschedule during deactivation

 include/net/pkt_sched.h   |  7 +--
 include/net/sch_generic.h | 37 -
 net/core/dev.c| 26 ++
 net/sched/sch_generic.c   | 16 +++-
 4 files changed, 74 insertions(+), 12 deletions(-)

-- 
2.7.4



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-13 Thread Yunsheng Lin
On 2021/4/13 16:33, Hillf Danton wrote:
> On Tue, 13 Apr 2021 15:57:29  Yunsheng Lin wrote:
>> On 2021/4/13 15:12, Hillf Danton wrote:
>>> On Tue, 13 Apr 2021 11:34:27 Yunsheng Lin wrote:
>>>> On 2021/4/13 11:26, Hillf Danton wrote:
>>>>> On Tue, 13 Apr 2021 10:56:42 Yunsheng Lin wrote:
>>>>>> On 2021/4/13 10:21, Hillf Danton wrote:
>>>>>>> On Mon, 12 Apr 2021 20:00:43  Yunsheng Lin wrote:
>>>>>>>>
>>>>>>>> Yes, the below patch seems to fix the data race described in
>>>>>>>> the commit log.
>>>>>>>> Then what is the difference between my patch and your patch below:)
>>>>>>>
>>>>>>> Hehe, this is one of the tough questions over a bounch of weeks.
>>>>>>>
>>>>>>> If a seqcount can detect the race between skb enqueue and dequeue then 
>>>>>>> we
>>>>>>> cant see any excuse for not rolling back to the point without NOLOCK.
>>>>>>
>>>>>> I am not sure I understood what you meant above.
>>>>>>
>>>>>> As my understanding, the below patch is essentially the same as
>>>>>> your previous patch, the only difference I see is it uses qdisc->pad
>>>>>> instead of __QDISC_STATE_NEED_RESCHEDULE.
>>>>>>
>>>>>> So instead of proposing another patch, it would be better if you
>>>>>> comment on my patch, and make improvement upon that.
>>>>>>
>>>>> Happy to do that after you show how it helps revert NOLOCK.
>>>>
>>>> Actually I am not going to revert NOLOCK, but add optimization
>>>> to it if the patch fixes the packet stuck problem.
>>>>
>>> Fix is not optimization, right?
>>
>> For this patch, it is a fix.
>> In case you missed it, I do have a couple of idea to optimize the
>> lockless qdisc:
>>
>> 1. RFC patch to add lockless qdisc bypass optimization:
>>
>> https://patchwork.kernel.org/project/netdevbpf/patch/1616404156-11772-1-git-send-email-linyunsh...@huawei.com/
>>
>> 2. implement lockless enqueuing for lockless qdisc using the idea
>>   from Jason and Toke. And it has a noticable proformance increase with
>>   1-4 threads running using the below prototype based on ptr_ring.
>>
>> static inline int __ptr_ring_multi_produce(struct ptr_ring *r, void *ptr)
>> {
>>
>>int producer, next_producer;
>>
>>
>>do {
>>producer = READ_ONCE(r->producer);
>>if (unlikely(!r->size) || r->queue[producer])
>>return -ENOSPC;
>>next_producer = producer + 1;
>>if (unlikely(next_producer >= r->size))
>>next_producer = 0;
>>} while(cmpxchg_relaxed(>producer, producer, next_producer) != 
>> producer);
>>
>>/* Make sure the pointer we are storing points to a valid data. */
>>/* Pairs with the dependency ordering in __ptr_ring_consume. */
>>smp_wmb();
>>
>>WRITE_ONCE(r->queue[producer], ptr);
>>return 0;
>> }
>>
>> 3. Maybe it is possible to remove the netif_tx_lock for lockless qdisc
>>   too, because dev_hard_start_xmit is also in the protection of
>>   qdisc_run_begin()/qdisc_run_end()(if there is only one qdisc using
>>   a netdev queue, which is true for pfifo_fast, I believe).
>>
>> 4. Remove the qdisc->running seqcount operation for lockless qdisc, which
>>   is mainly used to do heuristic locking on q->busylock for locked qdisc.
>>
> 
> Sounds good. They can stand two months, cant they?
>>>
>>>> Is there any reason why you want to revert it?
>>>>
>>> I think you know Jiri's plan and it would be nice to wait a couple of
>>> months for it to complete.
>>
>> I am not sure I am aware of Jiri's plan.
>> Is there any link referring to the plan?
>>
> https://lore.kernel.org/lkml/eaff25bc-9b64-037e-b9bc-c06fc4a5a...@huawei.com/

I think there is some misunderstanding here.

As I understand it, Jiri and Juergen are from the same team (using
the suse.com mail server).
What Jiri said about "I am still planning to have Yunsheng Lin's
(CCing) fix [1] tested in the coming days." means that Juergen has
done the testing and provided a "Tested-by" tag.

So if this patch fixes the packet stuck problem, Jiri is ok with
the NOLOCK qdisc too.

Or do I misunderstand it again? Perhaps Jiri and Juergen can help to
clarify this?


> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-13 Thread Yunsheng Lin
On 2021/4/13 15:12, Hillf Danton wrote:
> On Tue, 13 Apr 2021 11:34:27 Yunsheng Lin wrote:
>> On 2021/4/13 11:26, Hillf Danton wrote:
>>> On Tue, 13 Apr 2021 10:56:42 Yunsheng Lin wrote:
>>>> On 2021/4/13 10:21, Hillf Danton wrote:
>>>>> On Mon, 12 Apr 2021 20:00:43  Yunsheng Lin wrote:
>>>>>>
>>>>>> Yes, the below patch seems to fix the data race described in
>>>>>> the commit log.
>>>>>> Then what is the difference between my patch and your patch below:)
>>>>>
>>>>> Hehe, this is one of the tough questions over a bounch of weeks.
>>>>>
>>>>> If a seqcount can detect the race between skb enqueue and dequeue then we
>>>>> cant see any excuse for not rolling back to the point without NOLOCK.
>>>>
>>>> I am not sure I understood what you meant above.
>>>>
>>>> As my understanding, the below patch is essentially the same as
>>>> your previous patch, the only difference I see is it uses qdisc->pad
>>>> instead of __QDISC_STATE_NEED_RESCHEDULE.
>>>>
>>>> So instead of proposing another patch, it would be better if you
>>>> comment on my patch, and make improvement upon that.
>>>>
>>> Happy to do that after you show how it helps revert NOLOCK.
>>
>> Actually I am not going to revert NOLOCK, but add optimization
>> to it if the patch fixes the packet stuck problem.
>>
> Fix is not optimization, right?

For this patch, it is a fix.
In case you missed it, I do have a couple of ideas to optimize the
lockless qdisc:

1. RFC patch to add lockless qdisc bypass optimization:

https://patchwork.kernel.org/project/netdevbpf/patch/1616404156-11772-1-git-send-email-linyunsh...@huawei.com/

2. Implement lockless enqueuing for lockless qdisc using the idea
   from Jason and Toke. It has a noticeable performance increase with
   1-4 threads running, using the below prototype based on ptr_ring.

static inline int __ptr_ring_multi_produce(struct ptr_ring *r, void *ptr)
{

int producer, next_producer;


do {
producer = READ_ONCE(r->producer);
if (unlikely(!r->size) || r->queue[producer])
return -ENOSPC;
next_producer = producer + 1;
if (unlikely(next_producer >= r->size))
next_producer = 0;
} while(cmpxchg_relaxed(>producer, producer, next_producer) != 
producer);

/* Make sure the pointer we are storing points to a valid data. */
/* Pairs with the dependency ordering in __ptr_ring_consume. */
smp_wmb();

WRITE_ONCE(r->queue[producer], ptr);
return 0;
}

3. Maybe it is possible to remove the netif_tx_lock for lockless qdisc
   too, because dev_hard_start_xmit() is also under the protection of
   qdisc_run_begin()/qdisc_run_end() (if there is only one qdisc using
   a netdev queue, which is true for pfifo_fast, I believe).

4. Remove the qdisc->running seqcount operation for lockless qdisc, which
   is mainly used to do heuristic locking on q->busylock for locked qdisc.

> 
>> Is there any reason why you want to revert it?
>>
> I think you know Jiri's plan and it would be nice to wait a couple of
> months for it to complete.

I am not sure I am aware of Jiri's plan.
Is there any link referring to the plan?

> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-12 Thread Yunsheng Lin
On 2021/4/13 11:26, Hillf Danton wrote:
> On Tue, 13 Apr 2021 10:56:42 Yunsheng Lin wrote:
>> On 2021/4/13 10:21, Hillf Danton wrote:
>>> On Mon, 12 Apr 2021 20:00:43  Yunsheng Lin wrote:
>>>>
>>>> Yes, the below patch seems to fix the data race described in
>>>> the commit log.
>>>> Then what is the difference between my patch and your patch below:)
>>>
>>> Hehe, this is one of the tough questions over a bounch of weeks.
>>>
>>> If a seqcount can detect the race between skb enqueue and dequeue then we
>>> cant see any excuse for not rolling back to the point without NOLOCK.
>>
>> I am not sure I understood what you meant above.
>>
>> As my understanding, the below patch is essentially the same as
>> your previous patch, the only difference I see is it uses qdisc->pad
>> instead of __QDISC_STATE_NEED_RESCHEDULE.
>>
>> So instead of proposing another patch, it would be better if you
>> comment on my patch, and make improvement upon that.
>>
> Happy to do that after you show how it helps revert NOLOCK.

Actually I am not going to revert NOLOCK, but to add optimizations
to it if the patch fixes the packet stuck problem.

Is there any reason why you want to revert it?

> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-12 Thread Yunsheng Lin
On 2021/4/13 10:21, Hillf Danton wrote:
> On Mon, 12 Apr 2021 20:00:43  Yunsheng Lin wrote:
>>
>> Yes, the below patch seems to fix the data race described in
>> the commit log.
>> Then what is the difference between my patch and your patch below:)
> 
> Hehe, this is one of the tough questions over a bounch of weeks.
> 
> If a seqcount can detect the race between skb enqueue and dequeue then we
> cant see any excuse for not rolling back to the point without NOLOCK.

I am not sure I understood what you meant above.

As I understand it, the below patch is essentially the same as
your previous patch; the only difference I see is that it uses
qdisc->pad instead of __QDISC_STATE_NEED_RESCHEDULE.

So instead of proposing another patch, it would be better if you
commented on my patch and made improvements upon that.

> 
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -632,6 +632,7 @@ static int pfifo_fast_enqueue(struct sk_
>   return qdisc_drop(skb, qdisc, to_free);
>   }
>  
> + qdisc->pad++;

As has been mentioned:
Doing the update in pfifo_fast_enqueue() unconditionally does not
seem to be performance friendly, which is something my patch
tries to avoid as much as possible.

>   qdisc_update_stats_at_enqueue(qdisc, pkt_len);
>   return NET_XMIT_SUCCESS;
>  }
> @@ -642,6 +643,7 @@ static struct sk_buff *pfifo_fast_dequeu
>   struct sk_buff *skb = NULL;
>   int band;
>  
> + qdisc->pad = 0;
>   for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
>   struct skb_array *q = band2list(priv, band);
>  
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -176,8 +176,12 @@ static inline bool qdisc_run_begin(struc
>  static inline void qdisc_run_end(struct Qdisc *qdisc)
>  {
>   write_seqcount_end(>running);
> - if (qdisc->flags & TCQ_F_NOLOCK)
> + if (qdisc->flags & TCQ_F_NOLOCK) {
>   spin_unlock(>seqlock);
> +
> + if (qdisc->pad != 0)
> + __netif_schedule(qdisc);
> + }
>  }
>  
>  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-12 Thread Yunsheng Lin
On 2021/4/12 15:28, Hillf Danton wrote:
> On Mon, 12 Apr 2021 11:37:24 Yunsheng Lin wrote:
>> On 2021/4/12 11:21, Hillf Danton wrote:
>>> On Mon, 12 Apr 2021 09:24:30  Yunsheng Lin wrote:
>>>> On 2021/4/9 17:09, Hillf Danton wrote:
>>>>> On Fri, 9 Apr 2021 07:31:03  Juergen Gross wrote:
>>>>>> On 25.03.21 04:13, Yunsheng Lin wrote:
>>>>>> I have a setup which is able to reproduce the issue quite reliably:
>>>>>>
>>>>>> In a Xen guest I'm mounting 8 NFS shares and run sysbench fileio on
>>>>>> each of them. The average latency reported by sysbench is well below
>>>>>> 1 msec, but at least once per hour I get latencies in the minute
>>>>>> range.
>>>>>>
>>>>>> With this patch I don't see these high latencies any longer (test
>>>>>> is running for more than 20 hours now).
>>>>>>
>>>>>> So you can add my:
>>>>>>
>>>>>> Tested-by: Juergen Gross 
>>>>>>
>>>>>
>>>>> If retry is allowed in the dequeue method then a simple seqcount can do 
>>>>> the
>>>>> work of serializing enqueuer and dequeuer. IIUC it was not attempted last 
>>>>> year.
>>>>
>>>> At the first glance, I do not think the below patch fix the data race
>>>
>>> Thanks for taking a look.
>>>
>>>> described in the commit log, as it does not handle the time window
>>>> between dequeuing and q->seqlock releasing, as below:
>>>>
>>> Yes the time window does exist.
>>>
>>>> The cpu1 may not see the qdisc->pad changed after pfifo_fast_dequeue(),
>>>> and cpu2 is not able to take the q->seqlock yet because cpu1 do not
>>>> release the q->seqlock.
>>>>
>>> It's now covered by extending the seqcount aperture a bit.
>>>
>>> --- x/net/sched/sch_generic.c
>>> +++ y/net/sched/sch_generic.c
>>> @@ -380,14 +380,23 @@ void __qdisc_run(struct Qdisc *q)
>>>  {
>>> int quota = dev_tx_weight;
>>> int packets;
>>> +   int seq;
>>> +
>>> +again:
>>> +   seq = READ_ONCE(q->pad);
>>> +   smp_rmb();
>>>  
>>> while (qdisc_restart(q, )) {
>>> quota -= packets;
>>> if (quota <= 0) {
>>> __netif_schedule(q);
>>> -   break;
>>> +   return;
>>> }
>>> }
>>> +
>>> +   smp_rmb();
>>> +   if (seq != READ_ONCE(q->pad))
>>> +   goto again;
>>
>> As my understanding, there is still time window between q->pad checking
>> above and q->seqlock releasing in qdisc_run_end().
>>
> Then extend the cover across q->seqlock on top of the flag you added.

Yes, the below patch seems to fix the data race described in
the commit log.
Then what is the difference between my patch and your patch below:)

> 
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -36,6 +36,7 @@ struct qdisc_rate_table {
>  enum qdisc_state_t {
>   __QDISC_STATE_SCHED,
>   __QDISC_STATE_DEACTIVATED,
> + __QDISC_STATE_NEED_RESCHEDULE,
>  };
>  
>  struct qdisc_size_table {
> @@ -176,8 +177,13 @@ static inline bool qdisc_run_begin(struc
>  static inline void qdisc_run_end(struct Qdisc *qdisc)
>  {
>   write_seqcount_end(>running);
> - if (qdisc->flags & TCQ_F_NOLOCK)
> + if (qdisc->flags & TCQ_F_NOLOCK) {
>   spin_unlock(>seqlock);
> +
> + if (test_and_clear_bit(__QDISC_STATE_NEED_RESCHEDULE,
> + >state))
> + __netif_schedule(qdisc);
> + }
>  }
>  
>  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -381,13 +381,21 @@ void __qdisc_run(struct Qdisc *q)
>   int quota = dev_tx_weight;
>   int packets;
>  
> + if (q->flags & TCQ_F_NOLOCK)
> + clear_bit(__QDISC_STATE_NEED_RESCHEDULE, >state);
> +again:
>   while (qdisc_restart(q, )) {
>   quota -= packets;
>   if (quota <= 0) {
>   __netif_schedule(q);
> - break;
> + return;
>   }
>   }
> +
> + if (q->flags & TCQ_F_NOLOCK)
> + if (test_and_clear_bit(__QDISC_STATE_NEED_RESCHEDULE,
> + >state))
> + goto again;
>  }
>  
>  unsigned long dev_trans_start(struct net_device *dev)
> @@ -632,6 +640,9 @@ static int pfifo_fast_enqueue(struct sk_
>   return qdisc_drop(skb, qdisc, to_free);
>   }
>  
> + if (qdisc->flags & TCQ_F_NOLOCK)
> + set_bit(__QDISC_STATE_NEED_RESCHEDULE, >state);

Doing set_bit() in pfifo_fast_enqueue() unconditionally does not
seem to be performance friendly, because it requires exclusive access
to the cache line of qdisc->state.
Perhaps do some performance testing?


> +
>   qdisc_update_stats_at_enqueue(qdisc, pkt_len);
>   return NET_XMIT_SUCCESS;
>  }
> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-11 Thread Yunsheng Lin
On 2021/4/12 11:21, Hillf Danton wrote:
> On Mon, 12 Apr 2021 09:24:30  Yunsheng Lin wrote:
>> On 2021/4/9 17:09, Hillf Danton wrote:
>>> On Fri, 9 Apr 2021 07:31:03  Juergen Gross wrote:
>>>> On 25.03.21 04:13, Yunsheng Lin wrote:
>>>> I have a setup which is able to reproduce the issue quite reliably:
>>>>
>>>> In a Xen guest I'm mounting 8 NFS shares and run sysbench fileio on
>>>> each of them. The average latency reported by sysbench is well below
>>>> 1 msec, but at least once per hour I get latencies in the minute
>>>> range.
>>>>
>>>> With this patch I don't see these high latencies any longer (test
>>>> is running for more than 20 hours now).
>>>>
>>>> So you can add my:
>>>>
>>>> Tested-by: Juergen Gross 
>>>>
>>>
>>> If retry is allowed in the dequeue method then a simple seqcount can do the
>>> work of serializing enqueuer and dequeuer. IIUC it was not attempted last 
>>> year.
>>
>> At the first glance, I do not think the below patch fix the data race
> 
> Thanks for taking a look.
> 
>> described in the commit log, as it does not handle the time window
>> between dequeuing and q->seqlock releasing, as below:
>>
> Yes the time window does exist.
> 
>> The cpu1 may not see the qdisc->pad changed after pfifo_fast_dequeue(),
>> and cpu2 is not able to take the q->seqlock yet because cpu1 do not
>> release the q->seqlock.
>>
> It's now covered by extending the seqcount aperture a bit.
> 
> --- x/net/sched/sch_generic.c
> +++ y/net/sched/sch_generic.c
> @@ -380,14 +380,23 @@ void __qdisc_run(struct Qdisc *q)
>  {
>   int quota = dev_tx_weight;
>   int packets;
> + int seq;
> +
> +again:
> + seq = READ_ONCE(q->pad);
> + smp_rmb();
>  
>   while (qdisc_restart(q, )) {
>   quota -= packets;
>   if (quota <= 0) {
>   __netif_schedule(q);
> - break;
> + return;
>   }
>   }
> +
> + smp_rmb();
> + if (seq != READ_ONCE(q->pad))
> + goto again;

As I understand it, there is still a time window between the q->pad
check above and the q->seqlock release in qdisc_run_end().

>  }
>  
>  unsigned long dev_trans_start(struct net_device *dev)
> @@ -632,6 +641,9 @@ static int pfifo_fast_enqueue(struct sk_
>   return qdisc_drop(skb, qdisc, to_free);
>   }
>  
> + qdisc->pad++;
> + smp_wmb();
> +
>   qdisc_update_stats_at_enqueue(qdisc, pkt_len);
>   return NET_XMIT_SUCCESS;
>  }
> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-11 Thread Yunsheng Lin
On 2021/4/9 17:09, Hillf Danton wrote:
> On Fri, 9 Apr 2021 07:31:03  Juergen Gross wrote:
>> On 25.03.21 04:13, Yunsheng Lin wrote:
>> I have a setup which is able to reproduce the issue quite reliably:
>>
>> In a Xen guest I'm mounting 8 NFS shares and run sysbench fileio on
>> each of them. The average latency reported by sysbench is well below
>> 1 msec, but at least once per hour I get latencies in the minute
>> range.
>>
>> With this patch I don't see these high latencies any longer (test
>> is running for more than 20 hours now).
>>
>> So you can add my:
>>
>> Tested-by: Juergen Gross 
>>
> 
> If retry is allowed in the dequeue method then a simple seqcount can do the
> work of serializing enqueuer and dequeuer. IIUC it was not attempted last 
> year.

At first glance, I do not think the below patch fixes the data race
described in the commit log, as it does not handle the time window
between dequeuing and the q->seqlock release, as below:

cpu1 may not see qdisc->pad changed after pfifo_fast_dequeue(), and
cpu2 is not able to take q->seqlock yet because cpu1 has not
released q->seqlock.

> 
> --- x/net/sched/sch_generic.c
> +++ y/net/sched/sch_generic.c
> @@ -632,6 +632,9 @@ static int pfifo_fast_enqueue(struct sk_
>   return qdisc_drop(skb, qdisc, to_free);
>   }
>  
> + qdisc->pad++;
> + smp_wmb();
> +
>   qdisc_update_stats_at_enqueue(qdisc, pkt_len);
>   return NET_XMIT_SUCCESS;
>  }
> @@ -641,6 +644,11 @@ static struct sk_buff *pfifo_fast_dequeu
>   struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
>   struct sk_buff *skb = NULL;
>   int band;
> + int seq;
> +
> +again:
> + seq = READ_ONCE(qdisc->pad);
> + smp_rmb();
>  
>   for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
>   struct skb_array *q = band2list(priv, band);
> @@ -652,10 +660,15 @@ static struct sk_buff *pfifo_fast_dequeu
>   }
>   if (likely(skb)) {
>   qdisc_update_stats_at_dequeue(qdisc, skb);
> - } else {
> - WRITE_ONCE(qdisc->empty, true);
> + return skb;
>   }
>  
> + smp_rmb();
> + if (seq != READ_ONCE(qdisc->pad))
> + goto again;
> +
> + WRITE_ONCE(qdisc->empty, true);
> +
>   return skb;
>  }
>  
> 
> .
> 



Re: [PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-04-11 Thread Yunsheng Lin
On 2021/4/9 13:31, Juergen Gross wrote:
> On 25.03.21 04:13, Yunsheng Lin wrote:
>> Lockless qdisc has below concurrent problem:
>>  cpu0 cpu1
>>   . .
>> q->enqueue .
>>   . .
>> qdisc_run_begin()  .
>>   . .
>> dequeue_skb()  .
>>   . .
>> sch_direct_xmit()  .
>>   . .
>>   .q->enqueue
>>   . qdisc_run_begin()
>>   .return and do nothing
>>   . .
>> qdisc_run_end().
>>
>> cpu1 enqueue a skb without calling __qdisc_run() because cpu0
>> has not released the lock yet and spin_trylock() return false
>> for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb
>> enqueued by cpu1 when calling dequeue_skb() because cpu1 may
>> enqueue the skb after cpu0 calling dequeue_skb() and before
>> cpu0 calling qdisc_run_end().
>>
>> Lockless qdisc has below another concurrent problem when
>> tx_action is involved:
>>
>> cpu0(serving tx_action) cpu1 cpu2
>>.   ..
>>.  q->enqueue.
>>.qdisc_run_begin()   .
>>.  dequeue_skb() .
>>.   .q->enqueue
>>.   ..
>>. sch_direct_xmit()  .
>>.   . qdisc_run_begin()
>>.   .   return and do nothing
>>.   ..
>>   clear __QDISC_STATE_SCHED..
>>   qdisc_run_begin()..
>>   return and do nothing..
>>.   ..
>>.qdisc_run_end() .
>>
>> This patch fixes the above data race by:
>> 1. Get the flag before doing spin_trylock().
>> 2. If the first spin_trylock() return false and the flag is not
>> set before the first spin_trylock(), Set the flag and retry
>> another spin_trylock() in case other CPU may not see the new
>> flag after it releases the lock.
>> 3. reschedule if the flags is set after the lock is released
>> at the end of qdisc_run_end().
>>
>> For tx_action case, the flags is also set when cpu1 is at the
>> end if qdisc_run_end(), so tx_action will be rescheduled
>> again to dequeue the skb enqueued by cpu2.
>>
>> Only clear the flag before retrying a dequeuing when dequeuing
>> returns NULL in order to reduce the overhead of the above double
>> spin_trylock() and __netif_schedule() calling.
>>
>> The performance impact of this patch, tested using pktgen and
>> dummy netdev with pfifo_fast qdisc attached:
>>
>>   threads  without+this_patch   with+this_patch  delta
>>  12.61Mpps2.60Mpps   -0.3%
>>  23.97Mpps3.82Mpps   -3.7%
>>  45.62Mpps5.59Mpps   -0.5%
>>  82.78Mpps2.77Mpps   -0.3%
>> 162.22Mpps2.22Mpps   -0.0%
>>
>> Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
>> Signed-off-by: Yunsheng Lin 
> 
> I have a setup which is able to reproduce the issue quite reliably:
> 
> In a Xen guest I'm mounting 8 NFS shares and run sysbench fileio on
> each of them. The average latency reported by sysbench is well below
> 1 msec, but at least once per hour I get latencies in the minute
> range.
> 
> With this patch I don't see these high latencies any longer (test
> is running for more than 20 hours now).
> 
> So you can add my:
> 
> Tested-by: Juergen Gross 

Hi, Juergen

Thanks for the testing.

With the simulated test case suggested by Michal, I still have some
potential issues to debug; hopefully I will send out a new version
this week.

Also, is it possible to run your testcase a bit longer? I think "72 hours"
would be enough to verify that it fixes the problem completely:)



> 
> 
> Juergen



Re: Packet gets stuck in NOLOCK pfifo_fast qdisc

2021-04-06 Thread Yunsheng Lin
On 2021/4/6 15:31, Michal Kubecek wrote:
> On Tue, Apr 06, 2021 at 10:46:29AM +0800, Yunsheng Lin wrote:
>> On 2021/4/6 9:49, Cong Wang wrote:
>>> On Sat, Apr 3, 2021 at 5:23 AM Jiri Kosina  wrote:
>>>>
>>>> I am still planning to have Yunsheng Lin's (CCing) fix [1] tested in the
>>>> coming days. If it works, then we can consider proceeding with it,
>>>> otherwise I am all for reverting the whole NOLOCK stuff.
>>>>
>>>> [1] 
>>>> https://lore.kernel.org/linux-can/1616641991-14847-1-git-send-email-linyunsh...@huawei.com/T/#u
>>>
>>> I personally prefer to just revert that bit, as it brings more troubles
>>> than gains. Even with Yunsheng's patch, there are still some issues.
>>> Essentially, I think the core qdisc scheduling code is not ready for
>>> lockless, just look at those NOLOCK checks in sch_generic.c. :-/
>>
>> I am also awared of the NOLOCK checks too:), and I am willing to
>> take care of it if that is possible.
>>
>> As the number of cores in a system is increasing, it is the trend
>> to become lockless, right? Even there is only one cpu involved, the
>> spinlock taking and releasing takes about 30ns on our arm64 system
>> when CONFIG_PREEMPT_VOLUNTARY is enable(ip forwarding testing).
> 
> I agree with the benefits but currently the situation is that we have
> a race condition affecting the default qdisc which is being hit in
> production and can cause serious trouble which is made worse by commit
> 1f3279ae0c13 ("tcp: avoid retransmits of TCP packets hanging in host
> queues") preventing the retransmits of the stuck packet being sent.
> 
> Perhaps rather than patching over current implementation which requires
> more and more complicated hacks to work around the fact that we cannot
> make the "queue is empty" check and leaving the critical section atomic,
> it would make sense to reimplement it in a way which would allow us
> making it atomic.

Yes, reimplementing it is also an option.
But what if the reimplementation has the same problem because we have
not found the root cause of this one? I think it is better to find the
root cause first.

> 
> Michal
> 
> 
> .
> 



Re: Packet gets stuck in NOLOCK pfifo_fast qdisc

2021-04-06 Thread Yunsheng Lin
On 2021/4/6 18:13, Juergen Gross wrote:
> On 06.04.21 09:06, Michal Kubecek wrote:
>> On Tue, Apr 06, 2021 at 08:55:41AM +0800, Yunsheng Lin wrote:
>>>
>>> Hi, Jiri
>>> Do you have a reproducer that can be shared here?
>>> With reproducer, I can debug and test it myself too.
>>
>> I'm afraid we are not aware of a simple reproducer. As mentioned in the
>> original discussion, the race window is extremely small and the other
>> thread has to do quite a lot in the meantime which is probably why, as
>> far as I know, this was never observed on real hardware, only in
>> virtualization environments. NFS may also be important as, IIUC, it can
>> often issue an RPC request from a different CPU right after a data
>> transfer. Perhaps you could cheat a bit and insert a random delay
>> between the empty queue check and releasing q->seqlock to make it more
>> likely to happen.
>>
>> Other than that, it's rather just "run this complex software in a xen VM
>> and wait".
> 
> Being the one who has managed to reproduce the issue I can share my
> setup, maybe you can setup something similar (we have seen the issue
> with this kind of setup on two different machines).
> 
> I'm using a physical machine with 72 cpus and 48 GB of memory. It is
> running Xen as virtualization platform.
> 
> Xen dom0 is limited to 40 vcpus and 32 GB of memory, the dom0 vcpus are
> limited to run on the first 40 physical cpus (no idea whether that
> matters, though).
> 
> In a guest with 16 vcpu and 8GB of memory I'm running 8 parallel
> sysbench instances in a loop, those instances are prepared via
> 
> sysbench --file-test-mode=rndrd --test=fileio prepare
> 
> and then started in a do while loop via:
> 
> sysbench --test=fileio --file-test-mode=rndrw --rand-seed=0 --max-time=300 
> --max-requests=0 run
> 
> Each instance is using a dedicated NFS mount to run on. The NFS
> server for the 8 mounts is running in dom0 of the same server, the
> data of the NFS shares is located in a RAM disk (size is a little bit
> above 16GB). The shares are mounted in the guest with:
> 
> mount -t nfs -o 
> rw,proto=tcp,nolock,nfsvers=3,rsize=65536,wsize=65536,nosharetransport 
> dom0:/ramdisk/share[1-8] /mnt[1-8]
> 
> The guests vcpus are limited to run on physical cpus 40-55, on the same
> physical cpus I have 16 small guests running eating up cpu time, each of
> those guests is pinned to one of the physical cpus 40-55.
> 
> That's basically it. All you need to do is to watch out for sysbench
> reporting maximum latencies above one second or so (in my setup there
> are latencies of several minutes at least once each hour of testing).
> 
> In case you'd like to have some more details about the setup don't
> hesitate to contact me directly. I can provide you with some scripts
> and config runes if you want.

The setup is rather complex, so I just tried Michal's suggestion using
the patch below:

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9fb0ad4..b691eda 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -207,6 +207,11 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
write_seqcount_end(&qdisc->running);
if (qdisc->flags & TCQ_F_NOLOCK) {
+   udelay(1);
+   udelay(1);
+   udelay(1);
+   udelay(1);
+   udelay(1);
spin_unlock(&qdisc->seqlock);

if (unlikely(test_bit(__QDISC_STATE_MISSED,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6d7f954..a83c520 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -630,6 +630,8 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct 
Qdisc *qdisc,
return qdisc_drop_cpu(skb, qdisc, to_free);
else
return qdisc_drop(skb, qdisc, to_free);
+   } else {
+   skb->enqueue_jiffies = jiffies;
}

qdisc_update_stats_at_enqueue(qdisc, pkt_len);
@@ -653,6 +655,13 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc 
*qdisc)
skb = __skb_array_consume(q);
}
if (likely(skb)) {
+   unsigned int delay_ms;
+
+   delay_ms = jiffies_to_msecs(jiffies - skb->enqueue_jiffies);
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -920,7 +920,7 @@ struct sk_buff {
*data;
unsigned inttruesize;
refcount_t  users;
-
+   unsigned long   enqueue_jiffies;
 #ifdef CONFIG_SKB_EXTENSIONS
/* only useable after checking ->active_extensions != 0 */
struc

Re: Packet gets stuck in NOLOCK pfifo_fast qdisc

2021-04-05 Thread Yunsheng Lin
On 2021/4/6 9:49, Cong Wang wrote:
> On Sat, Apr 3, 2021 at 5:23 AM Jiri Kosina  wrote:
>>
>> I am still planning to have Yunsheng Lin's (CCing) fix [1] tested in the
>> coming days. If it works, then we can consider proceeding with it,
>> otherwise I am all for reverting the whole NOLOCK stuff.
>>
>> [1] 
>> https://lore.kernel.org/linux-can/1616641991-14847-1-git-send-email-linyunsh...@huawei.com/T/#u
> 
> I personally prefer to just revert that bit, as it brings more troubles
> than gains. Even with Yunsheng's patch, there are still some issues.
> Essentially, I think the core qdisc scheduling code is not ready for
> lockless, just look at those NOLOCK checks in sch_generic.c. :-/

I am aware of the NOLOCK checks too:), and I am willing to take care
of them if that is possible.

As the number of cores in a system keeps increasing, going lockless is
the trend, right? Even with only one cpu involved, taking and releasing
the spinlock takes about 30ns on our arm64 system when
CONFIG_PREEMPT_VOLUNTARY is enabled (ip forwarding testing).

Currently I have three ideas to optimize the lockless qdisc:
1. implement the qdisc bypass for lockless qdisc too, see [1].

2. implement lockless enqueuing for lockless qdisc using the idea
   from Jason and Toke. It has a noticeable performance increase with
   1-4 threads running, using the prototype below based on ptr_ring
   (a possible consumer-side counterpart is sketched after this list).

static inline int __ptr_ring_multi_produce(struct ptr_ring *r, void *ptr)
{
	int producer, next_producer;

	do {
		producer = READ_ONCE(r->producer);
		if (unlikely(!r->size) || r->queue[producer])
			return -ENOSPC;

		next_producer = producer + 1;
		if (unlikely(next_producer >= r->size))
			next_producer = 0;
	} while (cmpxchg_relaxed(&r->producer, producer, next_producer) !=
		 producer);

	/* Make sure the pointer we are storing points to valid data.
	 * Pairs with the dependency ordering in __ptr_ring_consume.
	 */
	smp_wmb();

	WRITE_ONCE(r->queue[producer], ptr);
	return 0;
}

3. Maybe it is possible to remove the netif_tx_lock for lockless qdisc
   too, because dev_hard_start_xmit is also under the protection of
   qdisc_run_begin()/qdisc_run_end() (if there is only one qdisc using
   a netdev queue, which is true for pfifo_fast, I believe).


[1]. 
https://patchwork.kernel.org/project/netdevbpf/patch/1616404156-11772-1-git-send-email-linyunsh...@huawei.com/
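
For completeness, a possible single-consumer counterpart could look
roughly like the sketch below. The helper name __ptr_ring_single_consume()
is made up for illustration; it assumes dequeuing stays serialized by
q->seqlock as it is today, and it omits the consumer batching the real
ptr_ring does:

static inline void *__ptr_ring_single_consume(struct ptr_ring *r)
{
	void *ptr;

	if (unlikely(!r->size))
		return NULL;

	/* The read of the slot pairs with the smp_wmb() in
	 * __ptr_ring_multi_produce() above: if a non-NULL pointer is
	 * seen, the data it points to is guaranteed to be visible.
	 */
	ptr = READ_ONCE(r->queue[r->consumer_head]);
	if (!ptr)
		return NULL;

	/* Free the slot only after the pointer has been read, so a
	 * producer cannot reuse it while it is still being consumed.
	 */
	smp_store_release(&r->queue[r->consumer_head], NULL);

	if (unlikely(++r->consumer_head >= r->size))
		r->consumer_head = 0;

	return ptr;
}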

> 
> Thanks.
> 
> .
> 



Re: Packet gets stuck in NOLOCK pfifo_fast qdisc

2021-04-05 Thread Yunsheng Lin
On 2021/4/3 20:23, Jiri Kosina wrote:
> On Sat, 3 Apr 2021, Hillf Danton wrote:
> 
> Sure. Seems they crept in over time. I had some plans to write a
> lockless HTB implementation. But with fq+EDT with BPF it seems that
> it is no longer needed, we have a more generic/better solution.  So
> I dropped it. Also most folks should really be using fq, fq_codel,
> etc. by default anyways. Using pfifo_fast alone is not ideal IMO.

 Half a year later, we still have the NOLOCK implementation
 present, and pfifo_fast still does set the TCQ_F_NOLOCK flag on itself.

 And we've just been bitten by this very same race which appears to be
 still unfixed, with single packet being stuck in pfifo_fast qdisc
 basically indefinitely due to this very race that this whole thread began
 with back in 2019.

 Unless there are

(a) any nice ideas how to solve this in an elegant way without
(re-)introducing extra spinlock (Cong's fix) or

(b) any objections to revert as per the argumentation above

 I'll be happy to send a revert of the whole NOLOCK implementation next
 week.

>>> Jiri
>>>
>>
>> Feel free to revert it as the scorch wont end without a deluge.
> 
> I am still planning to have Yunsheng Lin's (CCing) fix [1] tested in the 
> coming days. If it works, then we can consider proceeding with it, 
> otherwise I am all for reverting the whole NOLOCK stuff.

Hi, Jiri
Do you have a reproducer that can be shared here?
With a reproducer, I can debug and test it myself too.

Thanks.

> 
> [1] 
> https://lore.kernel.org/linux-can/1616641991-14847-1-git-send-email-linyunsh...@huawei.com/T/#u
> 



[PATCH net v3] net: sched: fix packet stuck problem for lockless qdisc

2021-03-24 Thread Yunsheng Lin
Lockless qdisc has the following concurrency problem:

cpu0                       cpu1
  .                          .
q->enqueue                   .
  .                          .
qdisc_run_begin()            .
  .                          .
dequeue_skb()                .
  .                          .
sch_direct_xmit()            .
  .                          .
  .                       q->enqueue
  .                       qdisc_run_begin()
  .                       return and do nothing
  .                          .
qdisc_run_end()              .

cpu1 enqueues a skb without calling __qdisc_run() because cpu0
has not released the lock yet and spin_trylock() returns false
for cpu1 in qdisc_run_begin(), and cpu0 does not see the skb
enqueued by cpu1 when calling dequeue_skb() because cpu1 may
enqueue the skb after cpu0 calls dequeue_skb() and before
cpu0 calls qdisc_run_end().

Lockless qdisc has the following additional concurrency problem
when tx_action is involved:

cpu0(serving tx_action)     cpu1                     cpu2
  .                            .                        .
  .                         q->enqueue                  .
  .                         qdisc_run_begin()           .
  .                         dequeue_skb()               .
  .                            .                     q->enqueue
  .                            .                        .
  .                         sch_direct_xmit()           .
  .                            .                     qdisc_run_begin()
  .                            .                     return and do nothing
  .                            .                        .
clear __QDISC_STATE_SCHED      .                        .
qdisc_run_begin()              .                        .
return and do nothing          .                        .
  .                            .                        .
  .                         qdisc_run_end()             .

This patch fixes the above data race by:
1. Get the flag before doing spin_trylock().
2. If the first spin_trylock() returns false and the flag was not
   set before it, set the flag and retry another spin_trylock() in
   case the other CPU may not see the new flag after it releases the
   lock.
3. Reschedule if the flag is set after the lock is released
   at the end of qdisc_run_end().

For the tx_action case, the flag is also set when cpu1 is at the
end of qdisc_run_end(), so tx_action will be rescheduled
again to dequeue the skb enqueued by cpu2.

Only clear the flag before retrying a dequeue when dequeuing
returns NULL, in order to reduce the overhead of the above double
spin_trylock() and __netif_schedule() calls.

The performance impact of this patch, tested using pktgen and
dummy netdev with pfifo_fast qdisc attached:

 threads  without+this_patch   with+this_patch  delta
       1       2.61Mpps            2.60Mpps      -0.3%
       2       3.97Mpps            3.82Mpps      -3.7%
       4       5.62Mpps            5.59Mpps      -0.5%
       8       2.78Mpps            2.77Mpps      -0.3%
      16       2.22Mpps            2.22Mpps      -0.0%

Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Yunsheng Lin 
---
V3: fix a compile error and a few comment typos, remove the
__QDISC_STATE_DEACTIVATED checking, and update the
performance data.
V2: Avoid the overhead of fixing the data race as much as
possible.
---
 include/net/sch_generic.h | 38 +-
 net/sched/sch_generic.c   | 12 
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f7a6e14..e3f46eb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,7 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
__QDISC_STATE_SCHED,
__QDISC_STATE_DEACTIVATED,
+   __QDISC_STATE_NEED_RESCHEDULE,
 };
 
 struct qdisc_size_table {
@@ -159,8 +160,38 @@ static inline bool qdisc_is_empty(const struct Qdisc 
*qdisc)
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
if (qdisc->flags & TCQ_F_NOLOCK) {
+   bool dont_retry = test_bit(__QDISC_STATE_NEED_RESCHEDULE,
+  &qdisc->state);
+
+   if (spin_trylock(&qdisc->seqlock))
+   goto nolock_empty;
+
+   /* If the flag is set before doing the spin_trylock() and
+* the above spin_trylock() return false, it means other cpu
+* holding the lock will do dequeuing for us, or it will see
+* the flag set after releasing lock and reschedule the
+* net_tx_action() to do the dequeuing.
+*/
+   if (dont_retry)
+   return false;
+
+   /* We could do set_bit() before the first spin_trylock(),
+* and avoid doing the second spin_trylock() completely, then
+* we could have multiple cpus doing the set_bit(). Here use
+* dont_retry to avoid doing the set_bit() and the second
+* spin_trylock(), w

Re: [PATCH net v2] net: sched: fix packet stuck problem for lockless qdisc

2021-03-24 Thread Yunsheng Lin
On 2021/3/25 3:20, Cong Wang wrote:
> On Tue, Mar 23, 2021 at 7:24 PM Yunsheng Lin  wrote:
>> @@ -176,8 +207,23 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
>>  static inline void qdisc_run_end(struct Qdisc *qdisc)
>>  {
>> write_seqcount_end(&qdisc->running);
>> -   if (qdisc->flags & TCQ_F_NOLOCK)
>> +   if (qdisc->flags & TCQ_F_NOLOCK) {
>> spin_unlock(&qdisc->seqlock);
>> +
>> +   /* qdisc_run_end() is protected by RCU lock, and
>> +* qdisc reset will do a synchronize_net() after
>> +* setting __QDISC_STATE_DEACTIVATED, so testing
>> +* the below two bits separately should be fine.
> 
> Hmm, why synchronize_net() after setting this bit is fine? It could
> still be flipped right after you test RESCHEDULE bit.

That depends on when it will be flipped again.

As I see it:
1. __QDISC_STATE_DEACTIVATED is set during the dev_deactivate() process,
   which should also wait for all processing related to "test_bit(
   __QDISC_STATE_NEED_RESCHEDULE, &q->state)" to finish, by calling
   synchronize_net() and checking some_qdisc_is_busy().

2. It is cleared during the dev_activate() process.

And dev_deactivate() and dev_activate() are protected by the RTNL lock,
or serialized by linkwatch.
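
For reference, the ordering relied on here looks roughly like the
condensed sketch below (simplified from dev_deactivate_many() in
net/sched/sch_generic.c; details differ between kernel versions):

void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list)
		/* Swaps in noop_qdisc and sets __QDISC_STATE_DEACTIVATED. */
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);

	/* Wait for outstanding readers that may still see the old state. */
	synchronize_net();

	/* Wait for outstanding qdisc_run() calls. */
	list_for_each_entry(dev, head, close_list)
		while (some_qdisc_is_busy(dev))
			schedule_timeout_uninterruptible(1);
}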

> 
> 
>> +* For qdisc_run() in net_tx_action() case, we
>> +* really should provide rcu protection explicitly
>> +* for document purposes or PREEMPT_RCU.
>> +*/
>> +   if (unlikely(test_bit(__QDISC_STATE_NEED_RESCHEDULE,
>> + &qdisc->state) &&
>> +!test_bit(__QDISC_STATE_DEACTIVATED,
>> +  &qdisc->state)))
> 
> Why do you want to test __QDISC_STATE_DEACTIVATED bit at all?
> dev_deactivate_many() will wait for those scheduled but being
> deactivated, so what's the problem of scheduling it even with this bit?

The problem I tried to fix is:

  CPU0(calling dev_deactivate)     CPU1(calling qdisc_run_end)     CPU2(calling tx_action)
               .                      __netif_schedule()                      .
               .                      set __QDISC_STATE_SCHED                 .
               .                               .                              .
clear __QDISC_STATE_DEACTIVATED                .                              .
      synchronize_net()                        .                              .
               .                               .                              .
               .                               .                       clear __QDISC_STATE_SCHED
               .                               .                              .
some_qdisc_is_busy() return false              .                              .
               .                               .                              .
               .                               .                           qdisc_run()

some_qdisc_is_busy() checks if the qdisc is busy by checking
__QDISC_STATE_SCHED and spin_is_locked(&q->seqlock) for a lockless
qdisc, and some_qdisc_is_busy() returns false for CPU0 because CPU2
has cleared __QDISC_STATE_SCHED and has not taken qdisc->seqlock yet;
the qdisc is clearly still busy when qdisc_run() is run by CPU2 later.
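
In other words, based on the description above, the per-qdisc check
boils down to something like the sketch below for a lockless qdisc
(simplified; the helper name is made up for illustration):

static bool nolock_qdisc_is_busy(struct Qdisc *q)
{
	/* Both conditions are momentarily false in the window between
	 * net_tx_action() clearing __QDISC_STATE_SCHED and
	 * qdisc_run_begin() taking q->seqlock, which is exactly the
	 * window described above.
	 */
	return test_bit(__QDISC_STATE_SCHED, &q->state) ||
	       spin_is_locked(&q->seqlock);
}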

So you are right, testing __QDISC_STATE_DEACTIVATED does not completely
solve the above data race, and there are __netif_schedule() calls from
dev_requeue_skb() and __qdisc_run() too, which need the same fix.

So I will remove the __QDISC_STATE_DEACTIVATED testing from this patch
first, and deal with it later.

> 
> Thanks.
> 
> .
> 



Re: [Linuxarm] Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-23 Thread Yunsheng Lin
On 2021/3/24 9:49, Cong Wang wrote:
> On Sun, Mar 21, 2021 at 5:55 PM Yunsheng Lin  wrote:
>>
>> On 2021/3/20 2:15, Cong Wang wrote:
>>> On Thu, Mar 18, 2021 at 12:33 AM Yunsheng Lin  
>>> wrote:
>>>>
>>>> On 2021/3/17 21:45, Jason A. Donenfeld wrote:
>>>>> On 3/17/21, Toke Høiland-Jørgensen  wrote:
>>>>>> Cong Wang  writes:
>>>>>>
>>>>>>> On Mon, Mar 15, 2021 at 2:07 PM Jakub Kicinski  wrote:
>>>>>>>>
>>>>>>>> I thought pfifo was supposed to be "lockless" and this change
>>>>>>>> re-introduces a lock between producer and consumer, no?
>>>>>>>
>>>>>>> It has never been truly lockless, it uses two spinlocks in the ring
>>>>>>> buffer
>>>>>>> implementation, and it introduced a q->seqlock recently, with this patch
>>>>>>> now we have priv->lock, 4 locks in total. So our "lockless" qdisc ends
>>>>>>> up having more locks than others. ;) I don't think we are going to a
>>>>>>> right direction...
>>>>>>
>>>>>> Just a thought, have you guys considered adopting the lockless MSPC ring
>>>>>> buffer recently introduced into Wireguard in commit:
>>>>>>
>>>>>> 8b5553ace83c ("wireguard: queueing: get rid of per-peer ring buffers")
>>>>>>
>>>>>> Jason indicated he was willing to work on generalising it into a
>>>>>> reusable library if there was a use case for it. I haven't quite though
>>>>>> through the details of whether this would be such a use case, but
>>>>>> figured I'd at least mention it :)
>>>>>
>>>>> That offer definitely still stands. Generalization sounds like a lot of 
>>>>> fun.
>>>>>
>>>>> Keep in mind though that it's an eventually consistent queue, not an
>>>>> immediately consistent one, so that might not match all use cases. It
>>>>> works with wg because we always trigger the reader thread anew when it
>>>>> finishes, but that doesn't apply to everyone's queueing setup.
>>>>
>>>> Thanks for mentioning this.
>>>>
>>>> "multi-producer, single-consumer" seems to match the lockless qdisc's
>>>> paradigm too, for now concurrent enqueuing/dequeuing to the pfifo_fast's
>>>> queues() is not allowed, it is protected by producer_lock or consumer_lock.
>>>>
>>>> So it would be good to has lockless concurrent enqueuing, while dequeuing
>>>> can be protected by qdisc_lock() or q->seqlock, which meets the 
>>>> "multi-producer,
>>>> single-consumer" paradigm.
>>>
>>> I don't think so. Usually we have one queue for each CPU so we can expect
>>> each CPU has a lockless qdisc assigned, but we can not assume this in
>>> the code, so we still have to deal with multiple CPU's sharing a lockless 
>>> qdisc,
>>> and we usually enqueue and dequeue in process context, so it means we could
>>> have multiple producers and multiple consumers.
>>
>> For lockless qdisc, dequeuing is always within the qdisc_run_begin() and
>> qdisc_run_end(), so multiple consumers is protected with each other by
>> q->seqlock .
> 
> So are you saying you will never go lockless for lockless qdisc? I thought
> you really want to go lockless with Jason's proposal of MPMC ring buffer
> code.

I think we have different definitions of "lockless qdisc".

In my understanding, dequeuing is within qdisc_run_begin() and
qdisc_run_end(), so it is always protected by q->seqlock for a
lockless qdisc currently; by lockless qdisc I never mean lockless
dequeuing, and I am not proposing lockless dequeuing currently.

The current lockless qdisc for pfifo_fast only means there is no lock
for protection between dequeuing and enqueuing, which means
__qdisc_run() may be dequeuing a skb while another cpu is enqueuing
one.

But enqueuing is protected by producer_lock in skb_array_produce(),
so only one cpu can do the enqueuing at the same time; that is why I am
proposing to use Jason's proposal to enable multiple cpus to do
concurrent enqueuing without taking any lock.
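
For reference, the locked producer path mentioned here is roughly the
following (condensed from include/linux/ptr_ring.h; the exact upstream
code differs in details):

static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
	int ret;

	/* producer_lock serializes all concurrent enqueuers, which is
	 * what a lockless multi-producer scheme would avoid.
	 */
	spin_lock(&r->producer_lock);
	ret = __ptr_ring_produce(r, ptr);
	spin_unlock(&r->producer_lock);

	return ret;
}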

> 
>>
>> For enqueuing, multiple consumers is protected by producer_lock, see
>> pfifo_fast_enqueue() -> skb_array_produce() -> ptr_ring_produce().
> 
> I think you seriously misunderstand how we classify MPMC or MPSC,
> it is not about how we lock them, it is about whether we truly have
> a single or multiple consumers regardless of locks used, because the
> goal is to go lockless.

I think I am only relying on MPSC (multi-producer & single-consumer),
as explained above.

> 
>> I am not sure if lockless MSPC can work with the process context, but
>> even if not, the enqueuing is also protected by rcu_read_lock_bh(),
>> which provides some kind of atomicity, so that producer_lock can be
>> reomved when lockless MSPC is used.
> 
> 
> Not sure if I can even understand what you are saying here, Jason's
> code only disables preemption with busy wait, I can't see why it can
> not be used in the process context.

I am saying q->enqueue() is protected by rcu_read_lock_bh().
rcu_read_lock_bh() will disable preemption for us on most configurations;
otherwise it would break the netdev_xmit_more() interface too, since that
relies on the cpu not being preempted while using the per-cpu variable
(softnet_data.xmit.more).
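
For context, the enqueue path in question looks roughly like the
condensed sketch below (heavily simplified from __dev_queue_xmit();
illustration only):

static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc;

	/* BH (and with it preemption on most configurations) is disabled
	 * for the whole enqueue path, including q->enqueue().
	 */
	rcu_read_lock_bh();

	txq = netdev_core_pick_tx(dev, skb, sb_dev);
	q = rcu_dereference_bh(txq->qdisc);

	rc = __dev_xmit_skb(skb, q, dev, txq);	/* ends up calling q->enqueue() */

	rcu_read_unlock_bh();
	return rc;
}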

> 
> Thanks.
> 
> .
> 



[PATCH net v2] net: sched: fix packet stuck problem for lockless qdisc

2021-03-23 Thread Yunsheng Lin
Lockless qdisc has the following concurrency problem:

cpu0                       cpu1
  .                          .
q->enqueue                   .
  .                          .
qdisc_run_begin()            .
  .                          .
dequeue_skb()                .
  .                          .
sch_direct_xmit()            .
  .                          .
  .                       q->enqueue
  .                       qdisc_run_begin()
  .                       return and do nothing
  .                          .
qdisc_run_end()              .

cpu1 enqueues a skb without calling __qdisc_run() because cpu0
has not released the lock yet and spin_trylock() returns false
for cpu1 in qdisc_run_begin(), and cpu0 does not see the skb
enqueued by cpu1 when calling dequeue_skb() because cpu1 may
enqueue the skb after cpu0 calls dequeue_skb() and before
cpu0 calls qdisc_run_end().

Lockless qdisc has the following additional concurrency problem
when tx_action is involved:

cpu0(serving tx_action)     cpu1                     cpu2
  .                            .                        .
  .                         q->enqueue                  .
  .                         qdisc_run_begin()           .
  .                         dequeue_skb()               .
  .                            .                     q->enqueue
  .                            .                        .
  .                         sch_direct_xmit()           .
  .                            .                     qdisc_run_begin()
  .                            .                     return and do nothing
  .                            .                        .
clear __QDISC_STATE_SCHED      .                        .
qdisc_run_begin()              .                        .
return and do nothing          .                        .
  .                            .                        .
  .                         qdisc_run_end()             .

This patch fixes the above data race by:
1. Get the flag before doing spin_trylock().
2. If the first spin_trylock() returns false and the flag was not
   set before it, set the flag and retry another spin_trylock() in
   case the other CPU may not see the new flag after it releases the
   lock.
3. Reschedule if the flag is set after the lock is released
   at the end of qdisc_run_end().

For the tx_action case, the flag is also set when cpu1 is at the
end of qdisc_run_end(), so tx_action will be rescheduled
again to dequeue the skb enqueued by cpu2.

Only clear the flag before retrying a dequeue when dequeuing
returns NULL, in order to reduce the overhead of the above double
spin_trylock() and __netif_schedule() calls.

The performance impact of this patch, tested using pktgen and
dummy netdev with pfifo_fast qdisc attached:

 threads  without+this_patch   with+this_patch  delta
       1        2.6Mpps             2.6Mpps      +0.0%
       2        3.9Mpps             3.8Mpps      -2.5%
       4        5.6Mpps             5.6Mpps      -0.0%
       8        2.7Mpps             2.8Mpps      +3.7%
      16        2.2Mpps             2.2Mpps      +0.0%

Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Yunsheng Lin 
---
V2: Avoid the overhead of fixing the data race as much as
possible.
---
 include/net/sch_generic.h | 48 ++-
 net/sched/sch_generic.c   | 12 
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f7a6e14..09a755d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,7 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
__QDISC_STATE_SCHED,
__QDISC_STATE_DEACTIVATED,
+   __QDISC_STATE_NEED_RESCHEDULE,
 };
 
 struct qdisc_size_table {
@@ -159,12 +160,42 @@ static inline bool qdisc_is_empty(const struct Qdisc 
*qdisc)
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
if (qdisc->flags & TCQ_F_NOLOCK) {
+   bool dont_retry = test_bit(__QDISC_STATE_NEED_RESCHEDULE,
+  &qdisc->state);
+
+   if (spin_trylock(&qdisc->seqlock))
+   goto out;
+
+   /* If the flag is set before doing the spin_trylock() and
+* the above spin_trylock() return false, it means other cpu
+* holding the lock will do dequeuing for us, or it will see
+* the flag set after releasing lock and reschedule the
+* net_tx_action() to do the dequeuing.
+*/
+   if (dont_retry)
+   return false;
+
+   /* We could do set_bit() before the first spin_trylock(),
+* and avoid doing the second spin_trylock() completely, then
+* we could have multiple cpus doing the test_bit(). Here use
+* dont_retry to avoid the test_bit() and the second
+* spin_trylock(), which gives a 5% performance improvement over
+* doing the set_bit() before the first spin_trylock().
+   

Re: [RFC v3] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-23 Thread Yunsheng Lin
On 2021/3/23 14:37, Ahmad Fatoum wrote:
> Hi,
> 
> On 22.03.21 10:09, Yunsheng Lin wrote:
>> Currently pfifo_fast has both TCQ_F_CAN_BYPASS and TCQ_F_NOLOCK
>> flag set, but queue discipline by-pass does not work for lockless
>> qdisc because skb is always enqueued to qdisc even when the qdisc
>> is empty, see __dev_xmit_skb().
>>
>> This patch calls sch_direct_xmit() to transmit the skb directly
>> to the driver for empty lockless qdisc too, which aviod enqueuing
>> and dequeuing operation. qdisc->empty is set to false whenever a
>> skb is enqueued, see pfifo_fast_enqueue(), and is set to true when
>> skb dequeuing return NULL, see pfifo_fast_dequeue().
>>
>> There is a data race between enqueue/dequeue and qdisc->empty
>> setting, qdisc->empty is only used as a hint, so we need to call
>> sch_may_need_requeuing() to see if the queue is really empty and if
>> there is requeued skb, which has higher priority than the current
>> skb.
>>
>> The performance for ip_forward test increases about 10% with this
>> patch.
>>
>> Signed-off-by: Yunsheng Lin 
>> ---
>> Hi, Vladimir and Ahmad
>>  Please give it a test to see if there is any out of order
>> packet for this patch, which has removed the priv->lock added in
>> RFC v2.
> 
> Overnight test (10h, 64 mil frames) didn't see any out-of-order frames
> between 2 FlexCANs on a dual core machine:
> 
> Tested-by: Ahmad Fatoum 
> 
> No performance measurements taken.

Thanks for the testing.
And I have done the performance measurement.

L3 forwarding testing improves from 1.09Mpps to 1.21Mpps, still about
a 10% improvement.

pktgen + dummy netdev:

 threads  without+this_patch   with+this_patch  delta
       1       2.56Mpps            3.11Mpps      +21%
       2       3.76Mpps            4.31Mpps      +14%
       4       5.51Mpps            5.53Mpps      +0.3%
       8       2.81Mpps            2.72Mpps      -3%
      16       2.24Mpps            2.22Mpps      -0.8%

> 
>>




[RFC v3] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-22 Thread Yunsheng Lin
Currently pfifo_fast has both the TCQ_F_CAN_BYPASS and TCQ_F_NOLOCK
flags set, but queue discipline bypass does not work for lockless
qdisc because the skb is always enqueued to the qdisc even when the
qdisc is empty, see __dev_xmit_skb().

This patch calls sch_direct_xmit() to transmit the skb directly
to the driver for an empty lockless qdisc too, which avoids the
enqueue and dequeue operations. qdisc->empty is set to false whenever
a skb is enqueued, see pfifo_fast_enqueue(), and is set to true when
skb dequeuing returns NULL, see pfifo_fast_dequeue().

There is a data race between enqueue/dequeue and the setting of
qdisc->empty; qdisc->empty is only used as a hint, so we need to call
sch_may_need_requeuing() to see if the queue is really empty and if
there is a requeued skb, which has higher priority than the current
skb.

The performance for the ip_forward test increases about 10% with this
patch.

Signed-off-by: Yunsheng Lin 
---
Hi, Vladimir and Ahmad
Please give it a test to see if there are any out-of-order
packets with this patch, which has removed the priv->lock added in
RFC v2.

There is a data race as below:

        CPU1                          CPU2
qdisc_run_begin(q)                      .
        .                          q->enqueue()
sch_may_need_requeuing()                .
  return true                           .
        .                               .
        .                               .
q->enqueue()                            .

When the above happens, the skb enqueued by CPU1 is dequeued after the
skb enqueued by CPU2 because sch_may_need_requeuing() returns true.
Without qdisc bypass, CPU1 would have a better chance of queueing its
skb before CPU2.

This patch does not take care of the above data race, because I
view it as similar to the following:

Even when CPU1 and CPU2 write skbs at the same time to two sockets
that both lead to the same qdisc, there is no guarantee as to which
skb will hit the qdisc first, because there are a lot of factors like
interrupts/softirqs/cache misses/scheduling affecting that.

So I hope the above data race will not cause problems for Vladimir
and Ahmad.
---
 include/net/pkt_sched.h   |  1 +
 include/net/sch_generic.h |  1 -
 net/core/dev.c| 22 ++
 net/sched/sch_generic.c   | 11 +++
 4 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index f5c1bee..5715ddf 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -122,6 +122,7 @@ void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc);
 bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 struct net_device *dev, struct netdev_queue *txq,
 spinlock_t *root_lock, bool validate);
+bool sch_may_need_requeuing(struct Qdisc *q);
 
 void __qdisc_run(struct Qdisc *q);
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f7a6e14..e08cc77 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -161,7 +161,6 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
if (qdisc->flags & TCQ_F_NOLOCK) {
if (!spin_trylock(&qdisc->seqlock))
return false;
-   WRITE_ONCE(qdisc->empty, false);
} else if (qdisc_is_running(qdisc)) {
return false;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index be941ed..317180a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3796,9 +3796,31 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, 
struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
 
if (q->flags & TCQ_F_NOLOCK) {
+   if (q->flags & TCQ_F_CAN_BYPASS && READ_ONCE(q->empty) &&
+   qdisc_run_begin(q)) {
+   if (sch_may_need_requeuing(q)) {
+   rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+   __qdisc_run(q);
+   qdisc_run_end(q);
+
+   goto no_lock_out;
+   }
+
+   qdisc_bstats_cpu_update(q, skb);
+
+   if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
+   !READ_ONCE(q->empty))
+   __qdisc_run(q);
+
+   qdisc_run_end(q);
+   return NET_XMIT_SUCCESS;
+   }
+
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+   WRITE_ONCE(q->empty, false);
qdisc_run(q);
 
+no_lock_out:
if (unlikely(to_free))
kfree_skb_list(to_free);
return rc;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 44991ea..2145fdad 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c

Re: [Linuxarm] [PATCH net] net: sched: fix packet stuck problem for lockless qdisc

2021-03-21 Thread Yunsheng Lin
On 2021/3/20 3:45, Cong Wang wrote:
> On Fri, Mar 19, 2021 at 2:25 AM Yunsheng Lin  wrote:
>> I had done some performance test to see if there is value to
>> fix the packet stuck problem and support lockless qdisc bypass,
>> here is some result using pktgen in 'queue_xmit' mode on a dummy
>> device as Paolo Abeni had done in [1], and using pfifo_fast qdisc:
>>
>> threads  vanillalocked-qdiscvanilla+this_patch
>>1 2.6Mpps  2.9Mpps2.5Mpps
>>2 3.9Mpps  4.8Mpps3.6Mpps
>>4 5.6Mpps  3.0Mpps4.7Mpps
>>8 2.7Mpps  1.6Mpps2.8Mpps
>>162.2Mpps  1.3Mpps2.3Mpps
>>
>> locked-qdisc: test by removing the "TCQ_F_NOLOCK | TCQ_F_CPUSTATS".
> 
> I read this as this patch introduces somehow a performance
> regression for -net, as the lockless bypass patch you submitted is
> for -net-next.

Yes, right now there is a performance regression from fixing this bug,
but the problem is: if we cannot fix the above data race without
any performance regression, do you prefer to send this patch to
-net, or to -net-next together with the lockless bypass patch?

Any idea how to fix this with less performance regression?

> 
> Thanks.
> 
> .
> 



Re: [PATCH net] net: sched: fix packet stuck problem for lockless qdisc

2021-03-21 Thread Yunsheng Lin
On 2021/3/20 3:40, Cong Wang wrote:
> On Wed, Mar 17, 2021 at 11:52 PM Yunsheng Lin  wrote:
>>
>> Lockless qdisc has below concurrent problem:
>> cpu0  cpu1
>>   . .
>>  q->enqueue .
>>   . .
>>qdisc_run_begin().
>>   . .
>>  dequeue_skb()  .
>>   . .
>>sch_direct_xmit().
>>   . .
>>   .q->enqueue
>>   . qdisc_run_begin()
>>   .return and do nothing
>>   . .
>> qdisc_run_end() .
>>
>> cpu1 enqueue a skb without calling __qdisc_run() because cpu0
>> has not released the lock yet and spin_trylock() return false
>> for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb
>> enqueued by cpu1 when calling dequeue_skb() because cpu1 may
>> enqueue the skb after cpu0 calling dequeue_skb() and before
>> cpu0 calling qdisc_run_end().
>>
>> Lockless qdisc has another concurrent problem when tx_action
>> is involved:
>>
>> cpu0(serving tx_action) cpu1 cpu2
>>   .   ..
>>   .  q->enqueue.
>>   .qdisc_run_begin()   .
>>   .  dequeue_skb() .
>>   .   .q->enqueue
>>   .   ..
>>   . sch_direct_xmit()  .
>>   .   . qdisc_run_begin()
>>   .   .   return and do nothing
>>   .   ..
>> clear __QDISC_STATE_SCHED ..
>> qdisc_run_begin() ..
>> return and do nothing ..
>>   .   ..
>>   .  qdisc_run_begin() .
>>
>> This patch fixes the above data race by:
>> 1. Set a flag after spin_trylock() return false.
>> 2. Retry a spin_trylock() in case other CPU may not see the
>>new flag after it releases the lock.
>> 3. reschedule if the flag is set after the lock is released
>>at the end of qdisc_run_end().
>>
>> For tx_action case, the flags is also set when cpu1 is at the
>> end if qdisc_run_begin(), so tx_action will be rescheduled
>> again to dequeue the skb enqueued by cpu2.
>>
>> Also clear the flag before dequeuing in order to reduce the
>> overhead of the above process, and aviod doing the heavy
>> test_and_clear_bit() at the end of qdisc_run_end().
>>
>> Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
>> Signed-off-by: Yunsheng Lin 
>> ---
>> For those who has not been following the qdsic scheduling
>> discussion, there is packet stuck problem for lockless qdisc,
>> see [1], and I has done some cleanup and added some enhanced
>> features too, see [2] [3].
>> While I was doing the optimization for lockless qdisc, it
>> accurred to me that these optimization is useless if there is
>> still basic bug in lockless qdisc, even the bug is not easily
>> reproducible. So look through [1] again, I found that the data
>> race for tx action mentioned by Michael, and thought deep about
>> it and came up with this patch trying to fix it.
>>
>> So I am really appreciated some who still has the reproducer
>> can try this patch and report back.
>>
>> 1. 
>> https://lore.kernel.org/netdev/d102074f-7489-e35a-98cf-e2cad7efd...@netrounds.com/t/#ma7013a79b8c4d8e7c49015c724e481e6d5325b32
>> 2. 
>> https://patchwork.kernel.org/project/netdevbpf/patch/1615777818-13969-1-git-send-email-linyunsh...@huawei.com/
>> 3. 
>> https://patchwork.kernel.org/project/netdevbpf/patch/1615800610-34700-1-git-send-email-linyunsh...@huawei.com/
>> ---
>>  include/net/sch_generic.h | 23 ---
>>  net/sched/sch_generic.c   |  1 +
>>  2 files changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index f7a6e14..4220eab 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -36,6 +36,7 @@ struct qdisc_rate_table {
>>  enum qdisc_state_t {
>> __QDISC_STATE_SCHED,
>> __QDISC_STATE_DEACTIVATED,
>> +   __QDISC_STATE_NEED_RESC

Re: [Linuxarm] Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-21 Thread Yunsheng Lin
On 2021/3/20 3:03, Jason A. Donenfeld wrote:
> On Thu, Mar 18, 2021 at 1:33 AM Yunsheng Lin  wrote:
>>> That offer definitely still stands. Generalization sounds like a lot of fun.
>>>
>>> Keep in mind though that it's an eventually consistent queue, not an
>>> immediately consistent one, so that might not match all use cases. It
>>> works with wg because we always trigger the reader thread anew when it
>>> finishes, but that doesn't apply to everyone's queueing setup.
>>
>> Thanks for mentioning this.
>>
>> "multi-producer, single-consumer" seems to match the lockless qdisc's
>> paradigm too, for now concurrent enqueuing/dequeuing to the pfifo_fast's
>> queues() is not allowed, it is protected by producer_lock or consumer_lock.
> 
> The other thing is that if you've got memory for a ring buffer rather
> than a list queue, we worked on an MPMC ring structure for WireGuard a
> few years ago that we didn't wind up using in the end, but it lives
> here:
> https://git.zx2c4.com/wireguard-monolithic-historical/tree/src/mpmc_ptr_ring.h?h=tg/mpmc-benchmark

Thanks for mentioning that. It seems that is exactly what the
pfifo_fast qdisc needs for lockless multi-producer, because it
only needs the memory to store the skb pointer.

Does it have any limitations? More specifically, does it work in
process or softirq context? If not, how about a context with
rcu protection?




Re: [Linuxarm] Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-21 Thread Yunsheng Lin
On 2021/3/20 2:15, Cong Wang wrote:
> On Thu, Mar 18, 2021 at 12:33 AM Yunsheng Lin  wrote:
>>
>> On 2021/3/17 21:45, Jason A. Donenfeld wrote:
>>> On 3/17/21, Toke Høiland-Jørgensen  wrote:
>>>> Cong Wang  writes:
>>>>
>>>>> On Mon, Mar 15, 2021 at 2:07 PM Jakub Kicinski  wrote:
>>>>>>
>>>>>> I thought pfifo was supposed to be "lockless" and this change
>>>>>> re-introduces a lock between producer and consumer, no?
>>>>>
>>>>> It has never been truly lockless, it uses two spinlocks in the ring
>>>>> buffer
>>>>> implementation, and it introduced a q->seqlock recently, with this patch
>>>>> now we have priv->lock, 4 locks in total. So our "lockless" qdisc ends
>>>>> up having more locks than others. ;) I don't think we are going to a
>>>>> right direction...
>>>>
>>>> Just a thought, have you guys considered adopting the lockless MSPC ring
>>>> buffer recently introduced into Wireguard in commit:
>>>>
>>>> 8b5553ace83c ("wireguard: queueing: get rid of per-peer ring buffers")
>>>>
>>>> Jason indicated he was willing to work on generalising it into a
>>>> reusable library if there was a use case for it. I haven't quite though
>>>> through the details of whether this would be such a use case, but
>>>> figured I'd at least mention it :)
>>>
>>> That offer definitely still stands. Generalization sounds like a lot of fun.
>>>
>>> Keep in mind though that it's an eventually consistent queue, not an
>>> immediately consistent one, so that might not match all use cases. It
>>> works with wg because we always trigger the reader thread anew when it
>>> finishes, but that doesn't apply to everyone's queueing setup.
>>
>> Thanks for mentioning this.
>>
>> "multi-producer, single-consumer" seems to match the lockless qdisc's
>> paradigm too, for now concurrent enqueuing/dequeuing to the pfifo_fast's
>> queues() is not allowed, it is protected by producer_lock or consumer_lock.
>>
>> So it would be good to has lockless concurrent enqueuing, while dequeuing
>> can be protected by qdisc_lock() or q->seqlock, which meets the 
>> "multi-producer,
>> single-consumer" paradigm.
> 
> I don't think so. Usually we have one queue for each CPU so we can expect
> each CPU has a lockless qdisc assigned, but we can not assume this in
> the code, so we still have to deal with multiple CPU's sharing a lockless 
> qdisc,
> and we usually enqueue and dequeue in process context, so it means we could
> have multiple producers and multiple consumers.

For lockless qdisc, dequeuing is always within qdisc_run_begin() and
qdisc_run_end(), so multiple consumers are protected from each other by
q->seqlock.

For enqueuing, multiple producers are protected by producer_lock, see
pfifo_fast_enqueue() -> skb_array_produce() -> ptr_ring_produce().
I am not sure if lockless MPSC can work in process context, but
even if not, the enqueuing is also protected by rcu_read_lock_bh(),
which provides some kind of atomicity, so that producer_lock can be
removed when lockless MPSC is used.

> 
> On the other hand, I don't think the problems we have been fixing are the ring
> buffer implementation itself, they are about the high-level qdisc
> state transitions.
> 
> Thanks.
> 
> .
> 



Re: [Linuxarm] [PATCH net] net: sched: fix packet stuck problem for lockless qdisc

2021-03-19 Thread Yunsheng Lin
On 2021/3/18 14:53, Yunsheng Lin wrote:
> Lockless qdisc has below concurrent problem:
> cpu0  cpu1
>   . .
>  q->enqueue .
>   . .
>qdisc_run_begin().
>   . .
>  dequeue_skb()  .
>   . .
>sch_direct_xmit().
>   . .
>   .q->enqueue
>   . qdisc_run_begin()
>   .return and do nothing
>   . .
> qdisc_run_end() .
> 
> cpu1 enqueue a skb without calling __qdisc_run() because cpu0
> has not released the lock yet and spin_trylock() return false
> for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb
> enqueued by cpu1 when calling dequeue_skb() because cpu1 may
> enqueue the skb after cpu0 calling dequeue_skb() and before
> cpu0 calling qdisc_run_end().
> 
> Lockless qdisc has another concurrent problem when tx_action
> is involved:
> 
> cpu0(serving tx_action) cpu1 cpu2
>   .   ..
>   .  q->enqueue.
>   .qdisc_run_begin()   .
>   .  dequeue_skb() .
>   .   .q->enqueue
>   .   ..
>   . sch_direct_xmit()  .
>   .   . qdisc_run_begin()
>   .   .   return and do nothing
>   .   ..
> clear __QDISC_STATE_SCHED ..
> qdisc_run_begin() ..
> return and do nothing ..
>   .   ..
>   .  qdisc_run_begin() .
> 
> This patch fixes the above data race by:
> 1. Set a flag after spin_trylock() return false.
> 2. Retry a spin_trylock() in case other CPU may not see the
>new flag after it releases the lock.
> 3. reschedule if the flag is set after the lock is released
>at the end of qdisc_run_end().
> 
> For tx_action case, the flags is also set when cpu1 is at the
> end if qdisc_run_begin(), so tx_action will be rescheduled
> again to dequeue the skb enqueued by cpu2.
> 
> Also clear the flag before dequeuing in order to reduce the
> overhead of the above process, and aviod doing the heavy
> test_and_clear_bit() at the end of qdisc_run_end().
> 
> Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
> Signed-off-by: Yunsheng Lin 
> ---
> For those who has not been following the qdsic scheduling
> discussion, there is packet stuck problem for lockless qdisc,
> see [1], and I has done some cleanup and added some enhanced
> features too, see [2] [3].
> While I was doing the optimization for lockless qdisc, it
> accurred to me that these optimization is useless if there is
> still basic bug in lockless qdisc, even the bug is not easily
> reproducible. So look through [1] again, I found that the data
> race for tx action mentioned by Michael, and thought deep about
> it and came up with this patch trying to fix it.
> 
> So I am really appreciated some who still has the reproducer
> can try this patch and report back.

I have done some performance tests to see if there is value in fixing
the packet stuck problem and supporting lockless qdisc bypass; here
are some results using pktgen in 'queue_xmit' mode on a dummy device,
as Paolo Abeni had done in [1], using the pfifo_fast qdisc:

threads  vanilla   locked-qdisc   vanilla+this_patch
   1     2.6Mpps      2.9Mpps          2.5Mpps
   2     3.9Mpps      4.8Mpps          3.6Mpps
   4     5.6Mpps      3.0Mpps          4.7Mpps
   8     2.7Mpps      1.6Mpps          2.8Mpps
  16     2.2Mpps      1.3Mpps          2.3Mpps

locked-qdisc: test by removing the "TCQ_F_NOLOCK | TCQ_F_CPUSTATS".

And adding the lockless qdisc bypass patch and other optimizations on
top of this patch:

threads   patch_set_1   patch_set_2   patch_set_3
   1        2.5Mpps       3.0Mpps       3.0Mpps
   2        3.6Mpps       4.1Mpps       5.3Mpps
   4        4.7Mpps       4.6Mpps       5.1Mpps
   8        2.8Mpps       2.6Mpps       2.7Mpps
  16        2.3Mpps       2.2Mpps       2.2Mpps

patch_set_1: vanilla + this_patch
patch_set_2: vanilla + this_patch + lockless_qdisc_bypass_patch
patch_set_3: vanilla + this_patch + lockless_qdisc_bypass_patch +
 remove_seq_operation_for_lockless_qdisc_optimization +
 check_rc_before_calling_qdisc_run()_optimization +
 spin_trylock()_retry_optimizatio

Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-18 Thread Yunsheng Lin
On 2021/3/18 15:10, Ahmad Fatoum wrote:
> On 15.03.21 04:10, Yunsheng Lin wrote:
>> Currently pfifo_fast has both TCQ_F_CAN_BYPASS and TCQ_F_NOLOCK
>> flag set, but queue discipline by-pass does not work for lockless
>> qdisc because skb is always enqueued to qdisc even when the qdisc
>> is empty, see __dev_xmit_skb().
>>
>> This patch calls sch_direct_xmit() to transmit the skb directly
>> to the driver for empty lockless qdisc too, which aviod enqueuing
>> and dequeuing operation. qdisc->empty is set to false whenever a
>> skb is enqueued, see pfifo_fast_enqueue(), and is set to true when
>> skb dequeuing return NULL, see pfifo_fast_dequeue(), a spinlock is
>> added to avoid the race between enqueue/dequeue and qdisc->empty
>> setting.
>>
>> If there is requeued skb in q->gso_skb, and qdisc->empty is true,
>> do not allow bypassing requeued skb. enqueuing and dequeuing in
>> q->gso_skb is always protected by qdisc->seqlock, so is the access
>> of q->gso_skb by skb_queue_empty();
>>
>> Also, qdisc is scheduled at the end of qdisc_run_end() when q->empty
>> is false to avoid packet stuck problem.
>>
>> The performance for ip_forward test increases about 10% with this
>> patch.
>>
>> Signed-off-by: Yunsheng Lin 
>> ---
>> RFC V2: fix requeued skb out of order and data race problem.
> 
> cansequence didn't find any frame reordering with 2 FlexCAN's communicating
> with each other on a dual core i.MX6. Feel free to add:
> 
> Tested-by: Ahmad Fatoum 

Thanks for testing.
Actually I have a newer implementation that can get rid of the
priv->lock added in this patch.
I am not sending it out yet:
1. There is a packet stuck problem for lockless qdisc that I am trying
   to fix, see [1], and I prefer not to add more optimizations to
   lockless qdisc before we find out the real cause; it would make
   backporting the packet stuck fix harder, and optimization is
   useless if there is still a basic bug in lockless qdisc.
2. I am still not convinced that the lockless implementation is clearer
   than the priv->lock implementation; I still need to do some thinking
   and testing.


> 
>> ---
>>  include/net/pkt_sched.h   |  2 ++
>>  include/net/sch_generic.h |  7 +--
>>  net/core/dev.c| 14 ++
>>  net/sched/sch_generic.c   | 31 ++-
>>  4 files changed, 51 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
>> index f5c1bee..c760f6a 100644
>> --- a/include/net/pkt_sched.h
>> +++ b/include/net/pkt_sched.h
>> @@ -122,6 +122,8 @@ void qdisc_warn_nonwc(const char *txt, struct Qdisc 
>> *qdisc);
>>  bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
>>   struct net_device *dev, struct netdev_queue *txq,
>>   spinlock_t *root_lock, bool validate);
>> +bool sch_may_need_requeuing(struct sk_buff *skb, struct Qdisc *q,
>> +struct net_device *dev);
>>  
>>  void __qdisc_run(struct Qdisc *q);
>>  
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index 2d6eb60..6591356 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -161,7 +161,6 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
>>  if (qdisc->flags & TCQ_F_NOLOCK) {
>>  if (!spin_trylock(&qdisc->seqlock))
>>  return false;
>> -WRITE_ONCE(qdisc->empty, false);
>>  } else if (qdisc_is_running(qdisc)) {
>>  return false;
>>  }
>> @@ -176,8 +175,12 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
>>  static inline void qdisc_run_end(struct Qdisc *qdisc)
>>  {
>>  write_seqcount_end(&qdisc->running);
>> -if (qdisc->flags & TCQ_F_NOLOCK)
>>  spin_unlock(&qdisc->seqlock);
>> +if (qdisc->flags & TCQ_F_NOLOCK) {
>>  spin_unlock(&qdisc->seqlock);
>> +if (unlikely(!READ_ONCE(qdisc->empty)))
>> +__netif_schedule(qdisc);
>> +}
>>  }
>>  
>>  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 2bfdd52..8f4afb6 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3791,6 +3791,20 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, 
>> struct Qdisc *q,
>>  qdisc_calculate_pkt_len(skb, q);
>>  
>>  if (q->flags & TCQ_F_NOLOCK) {
>> +if (q->flags & TCQ_F_CAN_BYPASS && READ_ONCE(q->empty) &&
>> +   

Re: [Linuxarm] Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-18 Thread Yunsheng Lin
On 2021/3/17 21:45, Jason A. Donenfeld wrote:
> On 3/17/21, Toke Høiland-Jørgensen  wrote:
>> Cong Wang  writes:
>>
>>> On Mon, Mar 15, 2021 at 2:07 PM Jakub Kicinski  wrote:

 I thought pfifo was supposed to be "lockless" and this change
 re-introduces a lock between producer and consumer, no?
>>>
>>> It has never been truly lockless, it uses two spinlocks in the ring
>>> buffer
>>> implementation, and it introduced a q->seqlock recently, with this patch
>>> now we have priv->lock, 4 locks in total. So our "lockless" qdisc ends
>>> up having more locks than others. ;) I don't think we are going to a
>>> right direction...
>>
>> Just a thought, have you guys considered adopting the lockless MSPC ring
>> buffer recently introduced into Wireguard in commit:
>>
>> 8b5553ace83c ("wireguard: queueing: get rid of per-peer ring buffers")
>>
>> Jason indicated he was willing to work on generalising it into a
>> reusable library if there was a use case for it. I haven't quite though
>> through the details of whether this would be such a use case, but
>> figured I'd at least mention it :)
> 
> That offer definitely still stands. Generalization sounds like a lot of fun.
> 
> Keep in mind though that it's an eventually consistent queue, not an
> immediately consistent one, so that might not match all use cases. It
> works with wg because we always trigger the reader thread anew when it
> finishes, but that doesn't apply to everyone's queueing setup.

Thanks for mentioning this.

"multi-producer, single-consumer" seems to match the lockless qdisc's
paradigm too; for now, concurrent enqueuing/dequeuing to the pfifo_fast
queues is not allowed, as it is protected by producer_lock or
consumer_lock.

So it would be good to have lockless concurrent enqueuing, while
dequeuing can be protected by qdisc_lock() or q->seqlock, which meets
the "multi-producer, single-consumer" paradigm.

But right now lockless qdisc has a packet stuck problem, which I tried
to fix in [1].

If the packet stuck problem for lockless qdisc can be fixed, then we
can do more optimization on lockless qdisc, including the one you
mention:)

1.https://patchwork.kernel.org/project/netdevbpf/patch/1616050402-37023-1-git-send-email-linyunsh...@huawei.com/




[PATCH net] net: sched: fix packet stuck problem for lockless qdisc

2021-03-18 Thread Yunsheng Lin
Lockless qdisc has the following concurrency problem:

cpu0                       cpu1
  .                          .
q->enqueue                   .
  .                          .
qdisc_run_begin()            .
  .                          .
dequeue_skb()                .
  .                          .
sch_direct_xmit()            .
  .                          .
  .                       q->enqueue
  .                       qdisc_run_begin()
  .                       return and do nothing
  .                          .
qdisc_run_end()              .

cpu1 enqueues a skb without calling __qdisc_run() because cpu0
has not released the lock yet and spin_trylock() returns false
for cpu1 in qdisc_run_begin(), and cpu0 does not see the skb
enqueued by cpu1 when calling dequeue_skb() because cpu1 may
enqueue the skb after cpu0 calls dequeue_skb() and before
cpu0 calls qdisc_run_end().

Lockless qdisc has another concurrent problem when tx_action
is involved:

cpu0(serving tx_action)     cpu1                     cpu2
  .                            .                        .
  .                         q->enqueue                  .
  .                         qdisc_run_begin()           .
  .                         dequeue_skb()               .
  .                            .                     q->enqueue
  .                            .                        .
  .                         sch_direct_xmit()           .
  .                            .                     qdisc_run_begin()
  .                            .                     return and do nothing
  .                            .                        .
clear __QDISC_STATE_SCHED      .                        .
qdisc_run_begin()              .                        .
return and do nothing          .                        .
  .                            .                        .
  .                         qdisc_run_begin()           .

This patch fixes the above data race by:
1. Set a flag after spin_trylock() returns false.
2. Retry a spin_trylock() in case the other CPU may not see the
   new flag after it releases the lock.
3. Reschedule if the flag is set after the lock is released
   at the end of qdisc_run_end().

For the tx_action case, the flag is also set when cpu1 is at the
end of qdisc_run_begin(), so tx_action will be rescheduled
again to dequeue the skb enqueued by cpu2.

Also clear the flag before dequeuing in order to reduce the
overhead of the above process, and avoid doing the heavy
test_and_clear_bit() at the end of qdisc_run_end().

Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Yunsheng Lin 
---
For those who have not been following the qdisc scheduling
discussion: there is a packet stuck problem for lockless qdisc,
see [1], and I have done some cleanup and added some enhanced
features too, see [2] [3].
While I was doing the optimization for lockless qdisc, it
occurred to me that these optimizations are useless if there is
still a basic bug in lockless qdisc, even if the bug is not easily
reproducible. So, looking through [1] again, I found the data
race for tx_action mentioned by Michael, thought deeply about
it and came up with this patch trying to fix it.

So I would really appreciate it if someone who still has the
reproducer could try this patch and report back.

1. https://lore.kernel.org/netdev/d102074f-7489-e35a-98cf-e2cad7efd...@netrounds.com/t/#ma7013a79b8c4d8e7c49015c724e481e6d5325b32
2. https://patchwork.kernel.org/project/netdevbpf/patch/1615777818-13969-1-git-send-email-linyunsh...@huawei.com/
3. https://patchwork.kernel.org/project/netdevbpf/patch/1615800610-34700-1-git-send-email-linyunsh...@huawei.com/
---
 include/net/sch_generic.h | 23 ---
 net/sched/sch_generic.c   |  1 +
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f7a6e14..4220eab 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,7 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
__QDISC_STATE_SCHED,
__QDISC_STATE_DEACTIVATED,
+   __QDISC_STATE_NEED_RESCHEDULE,
 };
 
 struct qdisc_size_table {
@@ -159,8 +160,17 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
if (qdisc->flags & TCQ_F_NOLOCK) {
-   if (!spin_trylock(&qdisc->seqlock))
-   return false;
+   if (!spin_trylock(&qdisc->seqlock)) {
+   set_bit(__QDISC_STATE_NEED_RESCHEDULE,
+   &qdisc->state);
+
+   /* Retry again in case other CPU may not see the
+* new flags after it releases the lock at the
+* end of qdisc_run_end().
+*/
+   if (!spin_trylock(&qdisc->seqlock))
+   return false;
+   }
WRITE_ONCE(qdisc->empty, false);
} else if (qdisc_is_running(qdisc)) {
return false;
@@ -176,8 +
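
The qdisc_run_end() hunk is cut off above; a minimal sketch of what the
commit message describes (reschedule after releasing seqlock if the flag
was set), reconstructed here only as an illustration and not the literal
patch:

/* Sketch reconstructed from the commit message above, not the
 * literal hunk (which is truncated in this archive).
 */
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
        write_seqcount_end(&qdisc->running);
        if (qdisc->flags & TCQ_F_NOLOCK) {
                spin_unlock(&qdisc->seqlock);

                /* another CPU may have set the flag while we held
                 * seqlock; reschedule so its skb is not left stuck
                 */
                if (unlikely(test_bit(__QDISC_STATE_NEED_RESCHEDULE,
                                      &qdisc->state)))
                        __netif_schedule(qdisc);
        }
}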

Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-16 Thread Yunsheng Lin
On 2021/3/17 6:48, Cong Wang wrote:
> On Mon, Mar 15, 2021 at 2:07 PM Jakub Kicinski  wrote:
>>
>> I thought pfifo was supposed to be "lockless" and this change
>> re-introduces a lock between producer and consumer, no?
> 
> It has never been truly lockless, it uses two spinlocks in the ring buffer
> implementation, and it introduced a q->seqlock recently, with this patch
> now we have priv->lock, 4 locks in total. So our "lockless" qdisc ends
> up having more locks than others. ;) I don't think we are going to a
> right direction...

Yes, we have 4 locks in total, but the lockless qdisc only uses two locks
in this patch, which are priv->lock and q->seqlock.

The qdisc uses at least two locks anyway, qdisc_lock(q) and q->busylock,
which seem to have bigger contention when there is concurrent access to the
same qdisc.

If we want to reduce the total number of locks, we can use qdisc_lock(q)
for the lockless qdisc and remove q->seqlock :)

> 
> Thanks.
> 
> .
> 



Re: [PATCH net-next] net: sched: remove unnecessary lock protection for skb_bad_txq/gso_skb

2021-03-16 Thread Yunsheng Lin
On 2021/3/17 2:41, Cong Wang wrote:
> On Mon, Mar 15, 2021 at 2:29 AM Yunsheng Lin  wrote:
>>
>> Currently qdisc_lock(q) is taken before enqueuing and dequeuing
>> for lockless qdisc's skb_bad_txq/gso_skb queue, qdisc->seqlock is
>> also taken, which can provide the same protection as qdisc_lock(q).
>>
>> This patch removes the unnecessay qdisc_lock(q) protection for
>> lockless qdisc' skb_bad_txq/gso_skb queue.
>>
>> And dev_reset_queue() takes the qdisc->seqlock for lockless qdisc
>> besides taking the qdisc_lock(q) when doing the qdisc reset,
>> some_qdisc_is_busy() takes both qdisc->seqlock and qdisc_lock(q)
>> when checking qdisc status. It is unnecessary to take both lock
>> while the fast path only take one lock, so this patch also changes
>> it to only take qdisc_lock(q) for locked qdisc, and only take
>> qdisc->seqlock for lockless qdisc.
>>
>> Since qdisc->seqlock is taken for lockless qdisc when calling
>> qdisc_is_running() in some_qdisc_is_busy(), use qdisc->running
>> to decide if the lockless qdisc is running.
> 
> What's the benefit here? Since qdisc->q.lock is also per-qdisc,
> so there is no actual contention to take it when we already acquire
> q->seqlock, right?

Yes, there is no actual contention in taking qdisc->q.lock while
q->seqlock is acquired, so this is more of a cleanup or minor optimization.

> 
> Also, is ->seqlock supposed to be used for protecting skb_bad_txq
> etc.? From my understanding, it was introduced merely for replacing
> __QDISC_STATE_RUNNING. If you want to extend it, you probably
> have to rename it too.

How about just using qdisc->q.lock for the lockless qdisc too and removing
qdisc->seqlock completely?

> 
> Thanks.
> 
> .
> 



Re: [PATCH net-next] net: sched: remove unnecessary lock protection for skb_bad_txq/gso_skb

2021-03-16 Thread Yunsheng Lin
On 2021/3/17 2:43, Cong Wang wrote:
> On Mon, Mar 15, 2021 at 4:42 PM David Miller  wrote:
>>
>> From: Yunsheng Lin 
>> Date: Mon, 15 Mar 2021 17:30:10 +0800
>>
>>> Currently qdisc_lock(q) is taken before enqueuing and dequeuing
>>> for lockless qdisc's skb_bad_txq/gso_skb queue, qdisc->seqlock is
>>> also taken, which can provide the same protection as qdisc_lock(q).
>>>
>>> This patch removes the unnecessay qdisc_lock(q) protection for
>>> lockless qdisc' skb_bad_txq/gso_skb queue.
>>>
>>> And dev_reset_queue() takes the qdisc->seqlock for lockless qdisc
>>> besides taking the qdisc_lock(q) when doing the qdisc reset,
>>> some_qdisc_is_busy() takes both qdisc->seqlock and qdisc_lock(q)
>>> when checking qdisc status. It is unnecessary to take both lock
>>> while the fast path only take one lock, so this patch also changes
>>> it to only take qdisc_lock(q) for locked qdisc, and only take
>>> qdisc->seqlock for lockless qdisc.
>>>
>>> Since qdisc->seqlock is taken for lockless qdisc when calling
>>> qdisc_is_running() in some_qdisc_is_busy(), use qdisc->running
>>> to decide if the lockless qdisc is running.
>>>
>>> Signed-off-by: Yunsheng Lin 
>>
>> What about other things protected by this lock, such as statistics and qlen?
>>
>> This change looks too risky to me.
> 
> They are per-cpu for pfifo_fast which sets TCQ_F_CPUSTATS too.

Did you mean qdisc_lock(q) is protecting the per-cpu stats for
pfifo_fast too?

> 
> Thanks.
> 
> .
> 



Re: [PATCH net-next] net: sched: remove unnecessary lock protection for skb_bad_txq/gso_skb

2021-03-16 Thread Yunsheng Lin
On 2021/3/17 5:45, David Miller wrote:
> From: Yunsheng Lin 
> Date: Tue, 16 Mar 2021 10:40:56 +0800
> 
>> On 2021/3/16 7:41, David Miller wrote:
>>> From: Yunsheng Lin 
>>
>> At least for the fast path, taking two locks for lockless qdisc hurts
>> performance when handling requeued skb, especially if the lockless
>> qdisc supports TCQ_F_CAN_BYPASS.
> 
> The bad txq and gro skb cases are not "fast path", sorry

You are right, it is more of an exceptional data path, but it is still
ok to clean that up without obvious risk, right?
I am going to replace qdisc->seqlock with qdisc_lock(q) for the lockless
qdisc too and remove qdisc->seqlock, which makes more sense.


> 
> .
> 



Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-16 Thread Yunsheng Lin
On 2021/3/16 16:15, Eric Dumazet wrote:
> On Tue, Mar 16, 2021 at 1:35 AM Yunsheng Lin  wrote:
>>
>> On 2021/3/16 2:53, Jakub Kicinski wrote:
>>> On Mon, 15 Mar 2021 11:10:18 +0800 Yunsheng Lin wrote:
>>>> @@ -606,6 +623,11 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
>>>>   */
>>>>  struct pfifo_fast_priv {
>>>>  struct skb_array q[PFIFO_FAST_BANDS];
>>>> +
>>>> +/* protect against data race between enqueue/dequeue and
>>>> + * qdisc->empty setting
>>>> + */
>>>> +spinlock_t lock;
>>>>  };
>>>>
>>>>  static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
>>>> @@ -623,7 +645,10 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, 
>>>> struct Qdisc *qdisc,
>>>>  unsigned int pkt_len = qdisc_pkt_len(skb);
>>>>  int err;
>>>>
>>>> -err = skb_array_produce(q, skb);
>>>> +spin_lock(>lock);
>>>> +err = __ptr_ring_produce(>ring, skb);
>>>> +WRITE_ONCE(qdisc->empty, false);
>>>> +spin_unlock(>lock);
>>>>
>>>>  if (unlikely(err)) {
>>>>  if (qdisc_is_percpu_stats(qdisc))
>>>> @@ -642,6 +667,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc 
>>>> *qdisc)
>>>>  struct sk_buff *skb = NULL;
>>>>  int band;
>>>>
>>>> +spin_lock(>lock);
>>>>  for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
>>>>  struct skb_array *q = band2list(priv, band);
>>>>
>>>> @@ -655,6 +681,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc 
>>>> *qdisc)
>>>>  } else {
>>>>  WRITE_ONCE(qdisc->empty, true);
>>>>  }
>>>> +spin_unlock(>lock);
>>>>
>>>>  return skb;
>>>>  }
>>>
>>> I thought pfifo was supposed to be "lockless" and this change
>>> re-introduces a lock between producer and consumer, no?
>>
>> Yes, the lock breaks the "lockless" of the lockless qdisc for now
>> I do not how to solve the below data race locklessly:
>>
>> CPU1:   CPU2:
>>   dequeue skb.
>>   .  .
>>   . enqueue skb
>>   .  .
>>   .  WRITE_ONCE(qdisc->empty, false);
>>   .  .
>>   .  .
>> WRITE_ONCE(qdisc->empty, true);
> 
> 
> Maybe it is time to fully document/explain how this can possibly work.

I might be able to provide some documentation/explanation on how the lockless
qdisc works, as I have been looking through the code for the past few days.

By "lockless", I suppose it means there is no lock between enqueuing and
dequeuing: whoever grabs the qdisc->seqlock can dequeue the skb and send
it out after enqueuing the skb it is trying to send, while another CPU which
does not grab the qdisc->seqlock just enqueues its skb and returns, hoping the
CPU holding the qdisc->seqlock will dequeue its skb and send it out.

For the locked qdisc (the one without the TCQ_F_NOLOCK flag set), it holds
the qdisc_lock() when doing the enqueuing/dequeuing and sch_direct_xmit();
in sch_direct_xmit() it releases the qdisc_lock() when doing skb validation
and calling dev_hard_start_xmit() to send the skb to the driver, and takes the
qdisc_lock() again after calling dev_hard_start_xmit(), so another CPU may
grab the qdisc_lock() and repeat the above process during that qdisc_lock()
release period.

So the main difference between a lockless qdisc and a locked qdisc is whether
there is a single lock protecting both enqueuing and dequeuing. For example,
pfifo_fast uses ptr_ring to be lockless between enqueuing and dequeuing, but it
still needs producer_lock to protect concurrent enqueues, and consumer_lock to
protect concurrent dequeues. For other qdiscs that cannot provide this
locklessness between enqueuing and dequeuing, maybe we can implement the
locking inside the specific qdisc implementation, so that it can still claim
to be "lockless", like the locking added for pfifo_fast in this patch.

Ideally we can claim all qdiscs to be lockless as long as we make sure every
qdisc either uses a lockless implementation, or uses an internal lock to
protect between enqueuing and dequeuing, so that we can remove the locked
qdisc and will have le

Re: [Linuxarm] Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-15 Thread Yunsheng Lin
On 2021/3/16 8:35, Yunsheng Lin wrote:
> On 2021/3/16 2:53, Jakub Kicinski wrote:
>> On Mon, 15 Mar 2021 11:10:18 +0800 Yunsheng Lin wrote:
>>> @@ -606,6 +623,11 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
>>>   */
>>>  struct pfifo_fast_priv {
>>> struct skb_array q[PFIFO_FAST_BANDS];
>>> +
>>> +   /* protect against data race between enqueue/dequeue and
>>> +* qdisc->empty setting
>>> +*/
>>> +   spinlock_t lock;
>>>  };
>>>  
>>>  static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
>>> @@ -623,7 +645,10 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, 
>>> struct Qdisc *qdisc,
>>> unsigned int pkt_len = qdisc_pkt_len(skb);
>>> int err;
>>>  
>>> -   err = skb_array_produce(q, skb);
>>> +   spin_lock(>lock);
>>> +   err = __ptr_ring_produce(>ring, skb);
>>> +   WRITE_ONCE(qdisc->empty, false);
>>> +   spin_unlock(>lock);
>>>  
>>> if (unlikely(err)) {
>>> if (qdisc_is_percpu_stats(qdisc))
>>> @@ -642,6 +667,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc 
>>> *qdisc)
>>> struct sk_buff *skb = NULL;
>>> int band;
>>>  
>>> +   spin_lock(>lock);
>>> for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
>>> struct skb_array *q = band2list(priv, band);
>>>  
>>> @@ -655,6 +681,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc 
>>> *qdisc)
>>> } else {
>>> WRITE_ONCE(qdisc->empty, true);
>>> }
>>> +   spin_unlock(>lock);
>>>  
>>> return skb;
>>>  }
>>
>> I thought pfifo was supposed to be "lockless" and this change
>> re-introduces a lock between producer and consumer, no?
> 
> Yes, the lock breaks the "lockless" of the lockless qdisc for now
> I do not how to solve the below data race locklessly:
> 
>   CPU1:   CPU2:
>   dequeue skb  .
> .  .  
> . enqueue skb
> .  .
> .  WRITE_ONCE(qdisc->empty, false);
> .  .
> .  .
> WRITE_ONCE(qdisc->empty, true);
> 
> If the above happens, the qdisc->empty is true even if the qdisc has some
> skb, which may cuase out of order or packet stuck problem.
> 
> It seems we may need to update ptr_ring' status(empty or not) while
> enqueuing/dequeuing atomically in the ptr_ring implementation.
> 
> Any better idea?

It seems we can use __ptr_ring_empty() within the qdisc->seqlock protection,
because qdisc->seqlock clearly serves as r->consumer_lock there.
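
A minimal sketch of that idea (pfifo_fast_all_empty() is a made-up helper
name; this is not the final patch): since the dequeue path already holds
qdisc->seqlock, __ptr_ring_empty() can be checked safely there and used to
keep qdisc->empty accurate:

/* Sketch only: called with qdisc->seqlock held, so it acts as the
 * ptr_ring consumer lock and __ptr_ring_empty() is stable here.
 */
static bool pfifo_fast_all_empty(struct Qdisc *qdisc)
{
        struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
        int band;

        for (band = 0; band < PFIFO_FAST_BANDS; band++) {
                if (!__ptr_ring_empty(&band2list(priv, band)->ring))
                        return false;
        }

        return true;
}

/* then, at the end of pfifo_fast_dequeue(), still under qdisc->seqlock:
 *
 *      WRITE_ONCE(qdisc->empty, pfifo_fast_all_empty(qdisc));
 */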

> 
>>
>> .
>>



Re: [PATCH net-next] net: sched: remove unnecessary lock protection for skb_bad_txq/gso_skb

2021-03-15 Thread Yunsheng Lin
On 2021/3/16 7:41, David Miller wrote:
> From: Yunsheng Lin 
> Date: Mon, 15 Mar 2021 17:30:10 +0800
> 
>> Currently qdisc_lock(q) is taken before enqueuing and dequeuing
>> for lockless qdisc's skb_bad_txq/gso_skb queue, qdisc->seqlock is
>> also taken, which can provide the same protection as qdisc_lock(q).
>>
>> This patch removes the unnecessay qdisc_lock(q) protection for
>> lockless qdisc' skb_bad_txq/gso_skb queue.
>>
>> And dev_reset_queue() takes the qdisc->seqlock for lockless qdisc
>> besides taking the qdisc_lock(q) when doing the qdisc reset,
>> some_qdisc_is_busy() takes both qdisc->seqlock and qdisc_lock(q)
>> when checking qdisc status. It is unnecessary to take both lock
>> while the fast path only take one lock, so this patch also changes
>> it to only take qdisc_lock(q) for locked qdisc, and only take
>> qdisc->seqlock for lockless qdisc.
>>
>> Since qdisc->seqlock is taken for lockless qdisc when calling
>> qdisc_is_running() in some_qdisc_is_busy(), use qdisc->running
>> to decide if the lockless qdisc is running.
>>
>> Signed-off-by: Yunsheng Lin 
> 
> What about other things protected by this lock, such as statistics and qlen?
> 
> This change looks too risky to me.

Ok, if that is the case, maybe we should just remove qdisc->seqlock and use
qdisc_lock(q) for the lockless qdisc too, so that we do not need to worry
about "the lockless qdisc's other things protected by qdisc_lock(q)".

At least for the fast path, taking two locks for the lockless qdisc hurts
performance when handling requeued skbs, especially if the lockless
qdisc supports TCQ_F_CAN_BYPASS.

> 
> 
> .
> 



Re: [RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-15 Thread Yunsheng Lin
On 2021/3/16 2:53, Jakub Kicinski wrote:
> On Mon, 15 Mar 2021 11:10:18 +0800 Yunsheng Lin wrote:
>> @@ -606,6 +623,11 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
>>   */
>>  struct pfifo_fast_priv {
>>  struct skb_array q[PFIFO_FAST_BANDS];
>> +
>> +/* protect against data race between enqueue/dequeue and
>> + * qdisc->empty setting
>> + */
>> +spinlock_t lock;
>>  };
>>  
>>  static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
>> @@ -623,7 +645,10 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, 
>> struct Qdisc *qdisc,
>>  unsigned int pkt_len = qdisc_pkt_len(skb);
>>  int err;
>>  
>> -err = skb_array_produce(q, skb);
>> +spin_lock(>lock);
>> +err = __ptr_ring_produce(>ring, skb);
>> +WRITE_ONCE(qdisc->empty, false);
>> +spin_unlock(>lock);
>>  
>>  if (unlikely(err)) {
>>  if (qdisc_is_percpu_stats(qdisc))
>> @@ -642,6 +667,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc 
>> *qdisc)
>>  struct sk_buff *skb = NULL;
>>  int band;
>>  
>> +spin_lock(>lock);
>>  for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
>>  struct skb_array *q = band2list(priv, band);
>>  
>> @@ -655,6 +681,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc 
>> *qdisc)
>>  } else {
>>  WRITE_ONCE(qdisc->empty, true);
>>  }
>> +spin_unlock(>lock);
>>  
>>  return skb;
>>  }
> 
> I thought pfifo was supposed to be "lockless" and this change
> re-introduces a lock between producer and consumer, no?

Yes, the lock breaks the "lockless" of the lockless qdisc for now
I do not how to solve the below data race locklessly:

CPU1:   CPU2:
  dequeue skb.
  .  .  
  . enqueue skb
  .  .
  .  WRITE_ONCE(qdisc->empty, false);
  .  .
  .  .
WRITE_ONCE(qdisc->empty, true);

If the above happens, the qdisc->empty is true even if the qdisc has some
skb, which may cuase out of order or packet stuck problem.

It seems we may need to update ptr_ring' status(empty or not) while
enqueuing/dequeuing atomically in the ptr_ring implementation.

Any better idea?

> 
> .
> 



[PATCH net-next] net: sched: remove unnecessary lock protection for skb_bad_txq/gso_skb

2021-03-15 Thread Yunsheng Lin
Currently qdisc_lock(q) is taken before enqueuing and dequeuing
for the lockless qdisc's skb_bad_txq/gso_skb queue, and qdisc->seqlock is
also taken, which can provide the same protection as qdisc_lock(q).

This patch removes the unnecessary qdisc_lock(q) protection for the
lockless qdisc's skb_bad_txq/gso_skb queue.

Also, dev_reset_queue() takes the qdisc->seqlock for the lockless qdisc
besides taking the qdisc_lock(q) when doing the qdisc reset, and
some_qdisc_is_busy() takes both qdisc->seqlock and qdisc_lock(q)
when checking qdisc status. It is unnecessary to take both locks
while the fast path only takes one lock, so this patch also changes
it to only take qdisc_lock(q) for the locked qdisc, and only take
qdisc->seqlock for the lockless qdisc.

Since qdisc->seqlock is taken for the lockless qdisc when calling
qdisc_is_running() in some_qdisc_is_busy(), use qdisc->running
to decide if the lockless qdisc is running.

Signed-off-by: Yunsheng Lin 
---
 include/net/sch_generic.h |  2 --
 net/sched/sch_generic.c   | 72 +--
 2 files changed, 19 insertions(+), 55 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2d6eb60..0e497ed 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -139,8 +139,6 @@ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc)
 
 static inline bool qdisc_is_running(struct Qdisc *qdisc)
 {
-   if (qdisc->flags & TCQ_F_NOLOCK)
-   return spin_is_locked(&qdisc->seqlock);
return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 49eae93..a5f1e3c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(default_qdisc_ops);
 /* Main transmission queue. */
 
 /* Modifications to data participating in scheduling must be protected with
- * qdisc_lock(qdisc) spinlock.
+ * qdisc_lock(qdisc) or qdisc->seqlock spinlock.
  *
  * The idea is the following:
  * - enqueue, dequeue are serialized via qdisc root lock
@@ -51,14 +51,8 @@ EXPORT_SYMBOL(default_qdisc_ops);
 static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
 {
const struct netdev_queue *txq = q->dev_queue;
-   spinlock_t *lock = NULL;
struct sk_buff *skb;
 
-   if (q->flags & TCQ_F_NOLOCK) {
-   lock = qdisc_lock(q);
-   spin_lock(lock);
-   }
-
skb = skb_peek(&q->skb_bad_txq);
if (skb) {
/* check the reason of requeuing without tx lock first */
@@ -77,9 +71,6 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
}
}
 
-   if (lock)
-   spin_unlock(lock);
-
return skb;
 }
 
@@ -96,13 +87,6 @@ static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
 static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
 struct sk_buff *skb)
 {
-   spinlock_t *lock = NULL;
-
-   if (q->flags & TCQ_F_NOLOCK) {
-   lock = qdisc_lock(q);
-   spin_lock(lock);
-   }
-
__skb_queue_tail(&q->skb_bad_txq, skb);
 
if (qdisc_is_percpu_stats(q)) {
@@ -112,20 +96,10 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
qdisc_qstats_backlog_inc(q, skb);
q->q.qlen++;
}
-
-   if (lock)
-   spin_unlock(lock);
 }
 
 static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-   spinlock_t *lock = NULL;
-
-   if (q->flags & TCQ_F_NOLOCK) {
-   lock = qdisc_lock(q);
-   spin_lock(lock);
-   }
-
while (skb) {
struct sk_buff *next = skb->next;
 
@@ -144,8 +118,7 @@ static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 
skb = next;
}
-   if (lock)
-   spin_unlock(lock);
+
__netif_schedule(q);
 }
 
@@ -207,24 +180,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 
*packets = 1;
if (unlikely(!skb_queue_empty(&q->gso_skb))) {
-   spinlock_t *lock = NULL;
-
-   if (q->flags & TCQ_F_NOLOCK) {
-   lock = qdisc_lock(q);
-   spin_lock(lock);
-   }
 
skb = skb_peek(&q->gso_skb);
 
-   /* skb may be null if another cpu pulls gso_skb off in between
-* empty check and lock.
-*/
-   if (!skb) {
-   if (lock)
-   spin_unlock(lock);
-   goto validate;
-   }
-
/* skb in gso_skb were already validated */
*validate = false;
if (xfrm_offload(skb))
@@ -243,11 +201,10 @@ static struct sk_buff *dequeue_skb(struct

[RFC v2] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-14 Thread Yunsheng Lin
Currently pfifo_fast has both the TCQ_F_CAN_BYPASS and TCQ_F_NOLOCK
flags set, but queue discipline bypass does not work for the lockless
qdisc because the skb is always enqueued to the qdisc even when the qdisc
is empty, see __dev_xmit_skb().

This patch calls sch_direct_xmit() to transmit the skb directly
to the driver for an empty lockless qdisc too, which avoids the enqueuing
and dequeuing operations. qdisc->empty is set to false whenever a
skb is enqueued, see pfifo_fast_enqueue(), and is set to true when
skb dequeuing returns NULL, see pfifo_fast_dequeue(); a spinlock is
added to avoid the race between enqueue/dequeue and the qdisc->empty
setting.

If there is a requeued skb in q->gso_skb and qdisc->empty is true,
do not allow bypassing the requeued skb. Enqueuing and dequeuing in
q->gso_skb are always protected by qdisc->seqlock, and so is the access
to q->gso_skb by skb_queue_empty().

Also, the qdisc is scheduled at the end of qdisc_run_end() when q->empty
is false to avoid the packet stuck problem.

The performance for the ip_forward test increases by about 10% with this
patch.

Signed-off-by: Yunsheng Lin 
---
RFC V2: fix requeued skb out of order and data race problem.
---
 include/net/pkt_sched.h   |  2 ++
 include/net/sch_generic.h |  7 +--
 net/core/dev.c| 14 ++
 net/sched/sch_generic.c   | 31 ++-
 4 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index f5c1bee..c760f6a 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -122,6 +122,8 @@ void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc);
 bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 struct net_device *dev, struct netdev_queue *txq,
 spinlock_t *root_lock, bool validate);
+bool sch_may_need_requeuing(struct sk_buff *skb, struct Qdisc *q,
+   struct net_device *dev);
 
 void __qdisc_run(struct Qdisc *q);
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2d6eb60..6591356 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -161,7 +161,6 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
if (qdisc->flags & TCQ_F_NOLOCK) {
if (!spin_trylock(&qdisc->seqlock))
return false;
-   WRITE_ONCE(qdisc->empty, false);
} else if (qdisc_is_running(qdisc)) {
return false;
}
@@ -176,8 +175,12 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
write_seqcount_end(&qdisc->running);
-   if (qdisc->flags & TCQ_F_NOLOCK)
+   if (qdisc->flags & TCQ_F_NOLOCK) {
spin_unlock(&qdisc->seqlock);
+
+   if (unlikely(!READ_ONCE(qdisc->empty)))
+   __netif_schedule(qdisc);
+   }
 }
 
 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
diff --git a/net/core/dev.c b/net/core/dev.c
index 2bfdd52..8f4afb6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3791,6 +3791,20 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
 
if (q->flags & TCQ_F_NOLOCK) {
+   if (q->flags & TCQ_F_CAN_BYPASS && READ_ONCE(q->empty) &&
+   qdisc_run_begin(q)) {
+   qdisc_bstats_cpu_update(q, skb);
+
+   if (sch_may_need_requeuing(skb, q, dev))
+   __qdisc_run(q);
+   else if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
+!READ_ONCE(q->empty))
+   __qdisc_run(q);
+
+   qdisc_run_end(q);
+   return NET_XMIT_SUCCESS;
+   }
+
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
qdisc_run(q);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 49eae93..0df1462 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -273,6 +273,23 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
return skb;
 }
 
+bool sch_may_need_requeuing(struct sk_buff *skb, struct Qdisc *q,
+   struct net_device *dev)
+{
+   bool again = false;
+
if (likely(skb_queue_empty(&q->gso_skb)))
+   return false;
+
+   /* need validating before requeuing */
skb = validate_xmit_skb_list(skb, dev, &again);
+   if (unlikely(!skb))
+   return true;
+
+   dev_requeue_skb(skb, q);
+   return true;
+}
+
 /*
  * Transmit possibly several skbs, and handle the return status as
  * required. Owning running seqcount bit guarantees that
@@ -606,6 +623,11 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
  */
 struct pfifo_fast_p

Re: [PATCH RFC] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-14 Thread Yunsheng Lin
On 2021/3/14 18:15, Marc Kleine-Budde wrote:
> Cc += linux-...@vger.kernel.org
> 
> On 3/14/21 1:03 AM, Vladimir Oltean wrote:
>> On Sat, Mar 13, 2021 at 10:47:47AM +0800, Yunsheng Lin wrote:
>>> Currently pfifo_fast has both TCQ_F_CAN_BYPASS and TCQ_F_NOLOCK
>>> flag set, but queue discipline by-pass does not work for lockless
>>> qdisc because skb is always enqueued to qdisc even when the qdisc
>>> is empty, see __dev_xmit_skb().
>>>
>>> This patch calles sch_direct_xmit() to transmit the skb directly
>>> to the driver for empty lockless qdisc too, which aviod enqueuing
>>> and dequeuing operation. qdisc->empty is set to false whenever a
>>> skb is enqueued, and is set to true when skb dequeuing return NULL,
>>> see pfifo_fast_dequeue().
>>>
>>> Also, qdisc is scheduled at the end of qdisc_run_end() when q->empty
>>> is false to avoid packet stuck problem.
>>>
>>> The performance for ip_forward test increases about 10% with this
>>> patch.
>>>
>>> Signed-off-by: Yunsheng Lin 
>>> ---
>>
>> I can confirm the ~10% IP forwarding throughput improvement brought by
>> this patch, but as you might be aware, there was a previous attempt to
>> add qdisc bypass to pfifo_fast by Paolo Abeni:
>> https://lore.kernel.org/netdev/661cc33a-5f65-2769-cc1a-65791cb4b...@pengutronix.de/

Thanks for mentioning the previous attempt to add qdisc bypass to pfifo_fast.

>> It was reverted because TX reordering was observed with SocketCAN
>> (although, presumably it should also be seen with Ethernet and such).

When writing this patch, I was more focused on the packet stuck problem
when TCQ_F_CAN_BYPASS is added for the lockless qdisc.

Looking at flexcan_start_xmit() used by the CAN driver you mentioned,
it calls netif_stop_queue() to disable the queue when sending each skb, which
may cause another skb to be requeued, see dev_requeue_skb() called by
sch_direct_xmit(), and q->empty is still true when this happens, so another
cpu may send its skb directly, bypassing the requeued skb and causing an out
of order problem.

I will try to deal with the above requeued skb problem, and see if there are
other timing issues besides the requeued skb problem.

Thanks for the testing again.

> 
> Thanks for testing that, I just stumbled over this patch by accident.
> 
> Marc
> 



[PATCH RFC] net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc

2021-03-12 Thread Yunsheng Lin
Currently pfifo_fast has both the TCQ_F_CAN_BYPASS and TCQ_F_NOLOCK
flags set, but queue discipline bypass does not work for the lockless
qdisc because the skb is always enqueued to the qdisc even when the qdisc
is empty, see __dev_xmit_skb().

This patch calls sch_direct_xmit() to transmit the skb directly
to the driver for an empty lockless qdisc too, which avoids the enqueuing
and dequeuing operations. qdisc->empty is set to false whenever a
skb is enqueued, and is set to true when skb dequeuing returns NULL,
see pfifo_fast_dequeue().

Also, the qdisc is scheduled at the end of qdisc_run_end() when q->empty
is false to avoid the packet stuck problem.

The performance for the ip_forward test increases by about 10% with this
patch.

Signed-off-by: Yunsheng Lin 
---
 include/net/sch_generic.h |  7 +--
 net/core/dev.c| 11 +++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2d6eb60..6591356 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -161,7 +161,6 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
if (qdisc->flags & TCQ_F_NOLOCK) {
if (!spin_trylock(&qdisc->seqlock))
return false;
-   WRITE_ONCE(qdisc->empty, false);
} else if (qdisc_is_running(qdisc)) {
return false;
}
@@ -176,8 +175,12 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
write_seqcount_end(&qdisc->running);
-   if (qdisc->flags & TCQ_F_NOLOCK)
+   if (qdisc->flags & TCQ_F_NOLOCK) {
spin_unlock(&qdisc->seqlock);
+
+   if (unlikely(!READ_ONCE(qdisc->empty)))
+   __netif_schedule(qdisc);
+   }
 }
 
 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
diff --git a/net/core/dev.c b/net/core/dev.c
index 2bfdd52..fa8504d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3791,7 +3791,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
 
if (q->flags & TCQ_F_NOLOCK) {
+   if (q->flags & TCQ_F_CAN_BYPASS && READ_ONCE(q->empty) && qdisc_run_begin(q)) {
+   qdisc_bstats_cpu_update(q, skb);
+
+   if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && !READ_ONCE(q->empty))
+   __qdisc_run(q);
+
+   qdisc_run_end(q);
+   return NET_XMIT_SUCCESS;
+   }
+
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+   WRITE_ONCE(q->empty, false);
qdisc_run(q);
 
if (unlikely(to_free))
-- 
2.7.4



Re: [PATCH 4/4] net: hns3: double free 'skb'

2021-02-02 Thread Yunsheng Lin
On 2021/2/3 11:13, Wenjia Zhao wrote:
> net: hns3: double free 'skb'
> 
> The false branch of (tx_ret == NETDEV_TX_OK) free the skb. However, the
> kfree_skb(skb) in the out label will be execute when exits the function.
> So the skb has a double-free bugs.
> 
> Remove the kfree_skb(skb) at line 269

The freeing was added by the below patch:

commit 8f9eed1a8791 ("net: hns3: fix for skb leak when doing selftest")

which fixes a skb leak problem.

The kfree_skb(skb) in the out label corresponds to alloc_skb(),
and the kfree_skb(skb) removed in this patch corresponds to the
skb_get(skb) done before calling hns3_nic_net_xmit(), for the case
where hns3_nic_net_xmit() returns non-NETDEV_TX_OK.

So I do not think there is a double free of 'skb' here, unless
I am missing something?
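
A simplified sketch of the reference counting being described
(lb_run_test_sketch() is a made-up name, the flow is reduced to the xmit
loop, and hns3_nic_net_xmit(skb, ndev) is assumed to behave as in the
driver; this is not the literal driver code):

/* Simplified refcount sketch: the skb passed in comes from alloc_skb()
 * (refcount 1) and is reused for every loopback packet.
 */
static int lb_run_test_sketch(struct net_device *ndev, struct sk_buff *skb,
                              u32 pkt_num)
{
        u32 good_cnt = 0;
        u32 i;

        for (i = 0; i < pkt_num; i++) {
                skb_get(skb);           /* +1 ref, consumed by a successful xmit */
                if (hns3_nic_net_xmit(skb, ndev) == NETDEV_TX_OK)
                        good_cnt++;
                else
                        kfree_skb(skb); /* xmit failed: drop the skb_get() ref */
        }

        kfree_skb(skb);                 /* out label: drop the alloc_skb() ref */

        return good_cnt == pkt_num ? 0 : -EIO;
}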

> 
> Signed-off-by: Wenjia Zhao 
> ---
>  drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
> b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
> index 2622e04..1b926ff 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
> @@ -266,7 +266,6 @@ static int hns3_lp_run_test(struct net_device *ndev, enum 
> hnae3_loop mode)
>   if (tx_ret == NETDEV_TX_OK) {
>   good_cnt++;
>   } else {
> - kfree_skb(skb);
>   netdev_err(ndev, "hns3_lb_run_test xmit failed: %d\n",
>  tx_ret);
>   }
> 



Re: [PATCH net-next v2 3/3] xsk: build skb by page

2021-01-21 Thread Yunsheng Lin
On 2021/1/21 15:41, Magnus Karlsson wrote:
> On Wed, Jan 20, 2021 at 9:29 PM Alexander Lobakin  wrote:
>>
>> From: Xuan Zhuo 
>> Date: Wed, 20 Jan 2021 16:30:56 +0800
>>
>>> This patch is used to construct skb based on page to save memory copy
>>> overhead.
>>>
>>> This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
>>> network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
>>> directly construct skb. If this feature is not supported, it is still
>>> necessary to copy data to construct skb.
>>>
>>>  Performance Testing 
>>>
>>> The test environment is Aliyun ECS server.
>>> Test cmd:
>>> ```
>>> xdpsock -i eth0 -t  -S -s 
>>> ```
>>>
>>> Test result data:
>>>
>>> size64  512 10241500
>>> copy1916747 1775988 1600203 1440054
>>> page1974058 1953655 1945463 1904478
>>> percent 3.0%10.0%   21.58%  32.3%
>>>
>>> Signed-off-by: Xuan Zhuo 
>>> Reviewed-by: Dust Li 
>>> ---
>>>  net/xdp/xsk.c | 104 
>>> --
>>>  1 file changed, 86 insertions(+), 18 deletions(-)
>>
>> Now I like the result, thanks!
>>
>> But Patchwork still display your series incorrectly (messages 0 and 1
>> are missing). I'm concerning maintainers may not take this in such
>> form. Try to pass the folder's name, not folder/*.patch to
>> git send-email when sending, and don't use --in-reply-to when sending
>> a new iteration.
> 
> Xuan,
> 
> Please make the new submission of the patch set a v3 even though you
> did not change the code. Just so we can clearly see it is the new
> submission.
> 
>>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>>> index 8037b04..40bac11 100644
>>> --- a/net/xdp/xsk.c
>>> +++ b/net/xdp/xsk.c
>>> @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>>>   sock_wfree(skb);
>>>  }
>>>
>>> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>>> +   struct xdp_desc *desc)
>>> +{
>>> + u32 len, offset, copy, copied;
>>> + struct sk_buff *skb;
>>> + struct page *page;
>>> + void *buffer;
>>> + int err, i;
>>> + u64 addr;
>>> +
>>> + skb = sock_alloc_send_skb(>sk, 0, 1, );
>>> + if (unlikely(!skb))
>>> + return ERR_PTR(err);
>>> +
>>> + addr = desc->addr;
>>> + len = desc->len;
>>> +
>>> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
>>> + offset = offset_in_page(buffer);
>>> + addr = buffer - xs->pool->addrs;
>>> +
>>> + for (copied = 0, i = 0; copied < len; i++) {
>>> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
>>> +
>>> + get_page(page);
>>> +
>>> + copy = min_t(u32, PAGE_SIZE - offset, len - copied);
>>> +
>>> + skb_fill_page_desc(skb, i, page, offset, copy);
>>> +
>>> + copied += copy;
>>> + addr += copy;
>>> + offset = 0;
>>> + }
>>> +
>>> + skb->len += len;
>>> + skb->data_len += len;
>>> + skb->truesize += len;
>>> +
>>> + refcount_add(len, >sk.sk_wmem_alloc);
>>> +
>>> + return skb;
>>> +}
>>> +
>>> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>> +  struct xdp_desc *desc)
>>> +{
>>> + struct sk_buff *skb = NULL;

It seems the above init is unnecessary, as skb is always
set before being used.

>>> +
>>> + if (xs->dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
>>> + skb = xsk_build_skb_zerocopy(xs, desc);
>>> + if (IS_ERR(skb))
>>> + return skb;
>>> + } else {
>>> + void *buffer;
>>> + u32 len;
>>> + int err;
>>> +
>>> + len = desc->len;
>>> + skb = sock_alloc_send_skb(>sk, len, 1, );
>>> + if (unlikely(!skb))
>>> + return ERR_PTR(err);
>>> +
>>> + skb_put(skb, len);
>>> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
>>> + err = skb_store_bits(skb, 0, buffer, len);
>>> + if (unlikely(err)) {
>>> + kfree_skb(skb);
>>> + return ERR_PTR(err);
>>> + }
>>> + }
>>> +
>>> + skb->dev = xs->dev;
>>> + skb->priority = xs->sk.sk_priority;
>>> + skb->mark = xs->sk.sk_mark;
>>> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
>>> + skb->destructor = xsk_destruct_skb;
>>> +
>>> + return skb;
>>> +}
>>> +
>>>  static int xsk_generic_xmit(struct sock *sk)
>>>  {
>>>   struct xdp_sock *xs = xdp_sk(sk);
>>> @@ -446,43 +527,30 @@ static int xsk_generic_xmit(struct sock *sk)
>>>   goto out;
>>>
>>>   while (xskq_cons_peek_desc(xs->tx, , xs->pool)) {
>>> - char *buffer;
>>> - u64 addr;
>>> - u32 len;
>>> -
>>>   if (max_batch-- == 0) {
>>>   err = -EAGAIN;
>>>   goto out;
>>>   }
>>>

Re: [PATCH] powerpc: fix the allyesconfig build

2020-11-29 Thread Yunsheng Lin
On 2020/11/29 3:36, Jakub Kicinski wrote:
> On Sat, 28 Nov 2020 16:20:54 +1100 Stephen Rothwell wrote:
>> On Fri, 27 Nov 2020 17:56:42 -0800 Jakub Kicinski  wrote:
>>>
>>> What's the offending structure in hisilicon? I'd rather have a look
>>> packing structs with pointers in 'em sounds questionable.
>>>
>>> I only see these two:
>>>
>>> $ git grep packed drivers/net/ethernet/hisilicon/
>>> drivers/net/ethernet/hisilicon/hns/hnae.h:struct __packed hnae_desc {
>>> drivers/net/ethernet/hisilicon/hns3/hns3_enet.h:struct __packed hns3_desc { 
>>>  
>>
>> struct hclge_dbg_reg_type_info which is 28 bytes long due to the
>> included struct struct hclge_dbg_reg_common_msg (which is 12 bytes
>> long).  They are surrounded by #pragma pack(1)/pack().
>>
>> This forces the 2 pointers in each second array element of
>> hclge_dbg_reg_info[] to be 4 byte aligned (where pointers are 8 bytes
>> long on PPC64).
> 
> Ah! Thanks, I don't see a reason for these to be packed. 
> Looks  like an accident, there is no reason to pack anything 
> past struct hclge_dbg_reg_common_msg AFAICT.
> 
> Huawei folks, would you mind sending a fix if the analysis is correct?

Yes, will send a patch to fix that. Thanks for the analysis.
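
For reference, a minimal made-up example of the pattern Stephen describes
(the names below are illustrative, not the actual hclge definitions):
wrapping structs that contain pointers in #pragma pack(1) leaves the pointer
members at 4-byte offsets, which on PPC64 with CONFIG_RELOCATABLE shows up as
R_PPC64_UADDR64 relocations:

#include <linux/types.h>

#pragma pack(1)
struct dbg_reg_common_msg_example {     /* 12 bytes, like hclge_dbg_reg_common_msg */
        u32 offset;
        u32 cmd;
        u32 flags;
};

struct dbg_reg_type_info_example {
        struct dbg_reg_common_msg_example msg;  /* bytes 0..11 */
        const char *name;        /* offset 12: not 8-byte aligned on 64-bit */
        int (*dump)(void *priv); /* offset 20: also unaligned */
};
#pragma pack()

/* Packing only the inner 12-byte struct (and leaving the outer struct
 * unpacked) keeps the pointers naturally aligned.
 */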

> .
> 


Re: [PATCH] powerpc: fix the allyesconfig build

2020-11-27 Thread Yunsheng Lin
On 2020/11/28 9:56, Jakub Kicinski wrote:
> On Sat, 28 Nov 2020 12:28:19 +1100 Stephen Rothwell wrote:
>> There are 2 drivers that have arrays of packed structures that contain
>> pointers that end up at unaligned offsets.  These produce warnings in
>> the PowerPC allyesconfig build like this:
>>
>> WARNING: 148 bad relocations
>> ce56510b R_PPC64_UADDR64   .rodata+0x01c72378
>> ce565126 R_PPC64_UADDR64   .rodata+0x01c723c0
>>
>> They are not drivers that are used on PowerPC (I assume), so mark them
>> to not be built on PPC64 when CONFIG_RELOCATABLE is enabled.
> 
> 
> 
> What's the offending structure in hisilicon? I'd rather have a look
> packing structs with pointers in 'em sounds questionable.
> 
> I only see these two:
> 
> $ git grep packed drivers/net/ethernet/hisilicon/
> drivers/net/ethernet/hisilicon/hns/hnae.h:struct __packed hnae_desc {
> drivers/net/ethernet/hisilicon/hns3/hns3_enet.h:struct __packed hns3_desc {

I assmue "struct __packed hnae_desc" is the offending structure, because
flag_ipoffset field is defined as __le32 and is not 32 bit aligned.

struct __packed hnae_desc {
__le64 addr;//0
union {
struct {//64
union {
__le16 asid_bufnum_pid;
__le16 asid;
};
__le16 send_size;   //92
union {
__le32 flag_ipoffset;   //*108*
struct {
__u8 bn_pid;
__u8 ra_ri_cs_fe_vld;
__u8 ip_offset;
__u8 tse_vlan_snap_v6_sctp_nth;
};
};
__le16 mss;
__u8 l4_len;
__u8 reserved1;
__le16 paylen;
__u8 vmid;
__u8 qid;
__le32 reserved2[2];
} tx;

struct {
__le32 ipoff_bnum_pid_flag;
__le16 pkt_len;
__le16 size;
union {
__le32 vlan_pri_asid;
struct {
__le16 asid;
__le16 vlan_cfi_pri;
};
};
__le32 rss_hash;
__le32 reserved_1[2];
} rx;
};
};

> .
> 

