[PATCH][net-next,v2] virtio_net: Remove u64_stats_update_begin()/end() for stats fetch

2024-06-21 Thread Li RongQing
This code is fetching the stats, so u64_stats_update_begin()/end()
should not be used here. Moreover, the fetcher runs in the same
context as the updater of the stats, so no protection is needed at all.

Suggested-by: Jakub Kicinski 
Signed-off-by: Li RongQing 
---
 drivers/net/virtio_net.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 61a57d1..6604339 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2336,12 +2336,12 @@ static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue
if (!rq->packets_in_napi)
return;
 
-   u64_stats_update_begin(&rq->stats.syncp);
+   /* No protection is needed when fetching stats here: the fetcher
+* runs in the same NAPI context as the updater of the stats. */
dim_update_sample(rq->calls,
  u64_stats_read(&rq->stats.packets),
  u64_stats_read(&rq->stats.bytes),
  &cur_sample);
-   u64_stats_update_end(&rq->stats.syncp);
 
net_dim(&rq->dim, cur_sample);
rq->packets_in_napi = 0;
-- 
2.9.4
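
For readers following this thread: a minimal sketch of the u64_stats
contract the discussion hinges on (illustrative only; on 64-bit SMP
kernels the seqcount is compiled away and the begin/end calls are
no-ops):

	/* Writer side -- the context that increments the counters: */
	u64_stats_update_begin(&rq->stats.syncp);
	u64_stats_inc(&rq->stats.packets);
	u64_stats_add(&rq->stats.bytes, len);
	u64_stats_update_end(&rq->stats.syncp);

	/* Reader in ANOTHER context -- must retry around torn updates: */
	unsigned int start;
	u64 packets;

	do {
		start = u64_stats_fetch_begin(&rq->stats.syncp);
		packets = u64_stats_read(&rq->stats.packets);
	} while (u64_stats_fetch_retry(&rq->stats.syncp, start));

	/* A reader in the SAME context as the writer (the NAPI poll
	 * loop here) cannot race with itself, so a plain
	 * u64_stats_read() with no begin/retry is enough -- which is
	 * what the v2 patch above does.
	 */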




RE: Re: [PATCH] virtio_net: Use u64_stats_fetch_begin() for stats fetch

2024-06-20 Thread Li,Rongqing
> Did you by any chance use an automated tool of any sort to find this issue or
> generate the fix?
> 
> I don't think this is actually necessary here, you're in the same context
> as the updater of the stats, you don't need any protection.
> You can remove u64_stats_update_begin() / end() (in net-next, there's no bug).
> 
> I won't comment on implications of calling dim_update_sample() in a loop.
> 
> Please make sure you answer my "did you use a tool" question, I'm really
> curious.


I didn't use a tool; I found this while investigating a DIM-related issue.

Thanks





[PATCH] virtio_net: Use u64_stats_fetch_begin() for stats fetch

2024-06-18 Thread Li RongQing
This code is fetching the stats, so u64_stats_fetch_begin()
and u64_stats_fetch_retry() should be used.

Fixes: 6208799553a8 ("virtio-net: support rx netdim")
Signed-off-by: Li RongQing 
---
 drivers/net/virtio_net.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 61a57d1..b669e73 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2332,16 +2332,18 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
 static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue *rq)
 {
struct dim_sample cur_sample = {};
+   unsigned int start;
 
if (!rq->packets_in_napi)
return;
 
-   u64_stats_update_begin(&rq->stats.syncp);
-   dim_update_sample(rq->calls,
- u64_stats_read(&rq->stats.packets),
- u64_stats_read(&rq->stats.bytes),
- &cur_sample);
-   u64_stats_update_end(&rq->stats.syncp);
+   do {
+   start = u64_stats_fetch_begin(&rq->stats.syncp);
+   dim_update_sample(rq->calls,
+   u64_stats_read(&rq->stats.packets),
+   u64_stats_read(&rq->stats.bytes),
+   &cur_sample);
+   } while (u64_stats_fetch_retry(&rq->stats.syncp, start));
 
net_dim(&rq->dim, cur_sample);
rq->packets_in_napi = 0;
-- 
2.9.4




[PATCH][v2] igb: avoid premature Rx buffer reuse

2021-01-13 Thread Li RongQing
igb needs the same fix as commit 75aab4e10ae6a ("i40e: avoid
premature Rx buffer reuse"):

The page recycle code incorrectly relied on a page fragment never
being freed inside xdp_do_redirect(). This assumption means that
page fragments that are still in use by the stack/XDP redirect can
be reused and overwritten.

To avoid this, store the page count prior to invoking xdp_do_redirect().

Longer explanation:

Intel NICs have a recycle mechanism. The main idea is that a page is
split into two parts. One part is owned by the driver, one part might
be owned by someone else, such as the stack.

t0: Page is allocated, and put on the Rx ring
              +---------------
used by NIC ->| upper buffer
(rx_buffer)   +---------------
              | lower buffer
              +---------------
  page count  == USHRT_MAX
  rx_buffer->pagecnt_bias == USHRT_MAX

t1: Buffer is received, and passed to the stack (e.g.)
              +---------------
              | upper buff (skb)
              +---------------
used by NIC ->| lower buffer
(rx_buffer)   +---------------
  page count  == USHRT_MAX
  rx_buffer->pagecnt_bias == USHRT_MAX - 1

t2: Buffer is received, and redirected
              +---------------
              | upper buff (skb)
              +---------------
used by NIC ->| lower buffer
(rx_buffer)   +---------------

Now, prior to calling xdp_do_redirect():
  page count  == USHRT_MAX
  rx_buffer->pagecnt_bias == USHRT_MAX - 2

This means that the buffer *cannot* be flipped/reused, because the skb is
still using it.

The problem arises when xdp_do_redirect() actually frees the
segment. Then we get:
  page count  == USHRT_MAX - 1
  rx_buffer->pagecnt_bias == USHRT_MAX - 2

From a recycle perspective, the buffer can be flipped and reused,
which means that the skb data area is passed to the Rx HW ring!

To work around this, the page count is stored prior to calling
xdp_do_redirect().

Fixes: 9cbc948b5a20 ("igb: add XDP support")
Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 22 +++---
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 03f78fdb0dcd..3e0d903cf919 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8232,7 +8232,8 @@ static inline bool igb_page_is_reserved(struct page *page)
return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
-static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
-static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
+static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
+                                 int rx_buf_pgcnt)
 {
unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
struct page *page = rx_buffer->page;
@@ -8243,7 +8244,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
 
 #if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
-   if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
+   if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
return false;
 #else
 #define IGB_LAST_OFFSET \
@@ -8632,11 +8633,17 @@ static unsigned int igb_rx_offset(struct igb_ring *rx_ring)
 }
 
 static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
-  const unsigned int size)
+  const unsigned int size, int *rx_buf_pgcnt)
 {
struct igb_rx_buffer *rx_buffer;
 
rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
+   *rx_buf_pgcnt =
+#if (PAGE_SIZE < 8192)
+   page_count(rx_buffer->page);
+#else
+   0;
+#endif
prefetchw(rx_buffer->page);
 
/* we are reusing so sync this buffer for CPU use */
@@ -8652,9 +8659,9 @@ static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
 }
 
 static void igb_put_rx_buffer(struct igb_ring *rx_ring,
- struct igb_rx_buffer *rx_buffer)
+ struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt)
 {
-   if (igb_can_reuse_rx_page(rx_buffer)) {
+   if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) {
/* hand second half of page back to the ring */
igb_reuse_rx_page(rx_ring, rx_buffer);
} else {
@@ -8681,6 +8688,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
u16 cleaned_count = igb_desc_unused(rx_ring);
unsigned int xdp_xmit = 0;
struct xdp_buff xdp;
+   int rx_buf_pgcnt;
 
xdp.rxq = &rx_ring->xdp_rxq;
 
@@ -8711,7 +8719,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 */
dma_rmb();
 
-   rx_buffer = igb_get_rx_buffer(rx_ring, size);
+   rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
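
To make the refcount arithmetic above concrete, here is a minimal
sketch of the reuse decision (illustrative only; it mirrors the logic
this patch adds, with the surrounding driver code omitted):

	/* pagecnt_bias counts references the driver has handed out;
	 * rx_buf_pgcnt is page_count() sampled BEFORE xdp_do_redirect()
	 * gets a chance to free a fragment asynchronously.
	 */
	static bool can_reuse_half_page(int rx_buf_pgcnt,
	unsigned int pagecnt_bias)
	{
		/* More than one outstanding user (e.g. an skb still
		 * holding the other half) means flipping would corrupt
		 * live data.
		 */
		return (rx_buf_pgcnt - pagecnt_bias) <= 1;
	}

Sampling page_count() before the redirect closes the window in which
xdp_do_redirect() drops a reference and makes a still-referenced half
page look reusable.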

RE: [PATCH] igb: avoid premature Rx buffer reuse

2021-01-12 Thread Li,Rongqing


> -Original Message-
> From: Alexander Duyck [mailto:alexander.du...@gmail.com]
> Sent: Wednesday, January 13, 2021 5:23 AM
> To: Li,Rongqing 
> Cc: Netdev ; intel-wired-lan
> ; Björn Töpel 
> Subject: Re: [PATCH] igb: avoid premature Rx buffer reuse
> 
> Okay, this explanation makes much more sense. Could you please either
> include this explanation in your patch, or include a reference to this
> patch, as this explains clearly what the issue is while yours didn't and
> led to the confusion, as I was assuming the freeing was happening closer
> to the t0 case, and really the problem is t1.
> 
> Thanks.
> 
> - Alex


Ok, I will send V2

Thanks

-Li


RE: [PATCH] igb: avoid premature Rx buffer reuse

2021-01-11 Thread Li,Rongqing


> -Original Message-
> From: Alexander Duyck [mailto:alexander.du...@gmail.com]
> Sent: Tuesday, January 12, 2021 4:54 AM
> To: Li,Rongqing 
> Cc: Netdev ; intel-wired-lan
> ; Björn Töpel 
> Subject: Re: [PATCH] igb: avoid premature Rx buffer reuse
> 
> On Wed, Jan 6, 2021 at 7:53 PM Li RongQing  wrote:
> >
> > The page recycle code, incorrectly, relied on that a page fragment
> > could not be freed inside xdp_do_redirect(). This assumption leads to
> > that page fragments that are used by the stack/XDP redirect can be
> > reused and overwritten.
> >
> > To avoid this, store the page count prior invoking xdp_do_redirect().
> >
> > Fixes: 9cbc948b5a20 ("igb: add XDP support")
> > Signed-off-by: Li RongQing 
> > Cc: Björn Töpel 
> 
> I'm not sure what you are talking about here. We allow for a 0 to 1 count
> difference in the pagecount bias. The idea is the driver should be holding 
> onto
> at least one reference from the driver at all times.
> Are you saying that is not the case?
> 
> As far as the code itself we hold onto the page as long as our difference does
> not exceed 1. So specifically if the XDP call is freeing the page the page 
> itself
> should still be valid as the reference count shouldn't drop below 1, and in 
> that
> case the driver should be holding that one reference to the page.
> 
> When we perform our check we are performing it such at output of either 0 if
> the page is freed, or 1 if the page is not freed are acceptable for us to 
> allow
> reuse. The key bit is in igb_clean_rx_irq where we will flip the buffer for 
> the
> IGB_XDP_TX | IGB_XDP_REDIR case and just increment the pagecnt_bias
> indicating that the page was dropped in the non-flipped case.
> 
> Are you perhaps seeing a function that is returning an error and still 
> consuming
> the page? If so that might explain what you are seeing.
> However the bug would be in the other driver not this one. The
> xdp_do_redirect function is not supposed to free the page if it returns an 
> error.
> It is supposed to leave that up to the function that called xdp_do_redirect.
> 
> > ---
> >  drivers/net/ethernet/intel/igb/igb_main.c | 22 +++---
> >  1 file changed, 15 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c
> > b/drivers/net/ethernet/intel/igb/igb_main.c
> > index 03f78fdb0dcd..3e0d903cf919 100644
> > --- a/drivers/net/ethernet/intel/igb/igb_main.c
> > +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> > @@ -8232,7 +8232,8 @@ static inline bool igb_page_is_reserved(struct
> page *page)
> > return (page_to_nid(page) != numa_mem_id()) ||
> > page_is_pfmemalloc(page);  }
> >
> > -static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
> > +static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
> > +
> int
> > +rx_buf_pgcnt)
> >  {
> > unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
> > struct page *page = rx_buffer->page; @@ -8243,7 +8244,7 @@
> > static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
> >
> >  #if (PAGE_SIZE < 8192)
> > /* if we are only owner of page we can reuse it */
> > -   if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
> > +   if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
> > return false;
> >  #else
> I would need more info on the actual issue. If nothing else it might be 
> useful to
> have an example where you print out the page_ref_count versus the
> pagecnt_bias at a few points to verify exactly what is going on. As I said 
> before
> if the issue is the xdp_do_redirect returning an error and still consuming the
> page then the bug is elsewhere and not here.


This patch is the same as commit 75aab4e10ae6a4593a60f66d13de755d4e91f400:


commit 75aab4e10ae6a4593a60f66d13de755d4e91f400
Author: Björn Töpel 
Date:   Tue Aug 25 19:27:34 2020 +0200

i40e: avoid premature Rx buffer reuse

The page recycle code, incorrectly, relied on that a page fragment
could not be freed inside xdp_do_redirect(). This assumption leads to
that page fragments that are used by the stack/XDP redirect can be
reused and overwritten.

To avoid this, store the page count prior invoking xdp_do_redirect().

Longer explanation:

Intel NICs have a recycle mechanism. The main idea is that a page is
split into two parts. One part is owned by the driver, one part might
be owned by someone else, such as the stack.

t0: Page is allocated, and put on the Rx ring
  +---

[PATCH] igb: avoid premature Rx buffer reuse

2021-01-06 Thread Li RongQing
The page recycle code incorrectly relied on a page fragment never
being freed inside xdp_do_redirect(). This assumption means that
page fragments that are still in use by the stack/XDP redirect can
be reused and overwritten.

To avoid this, store the page count prior to invoking xdp_do_redirect().

Fixes: 9cbc948b5a20 ("igb: add XDP support")
Signed-off-by: Li RongQing 
Cc: Björn Töpel 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 22 +++---
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 03f78fdb0dcd..3e0d903cf919 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8232,7 +8232,8 @@ static inline bool igb_page_is_reserved(struct page *page)
return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
-static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
+static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
+                                 int rx_buf_pgcnt)
 {
unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
struct page *page = rx_buffer->page;
@@ -8243,7 +8244,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
 
 #if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
-   if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
+   if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
return false;
 #else
 #define IGB_LAST_OFFSET \
@@ -8632,11 +8633,17 @@ static unsigned int igb_rx_offset(struct igb_ring *rx_ring)
 }
 
 static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
-  const unsigned int size)
+  const unsigned int size, int *rx_buf_pgcnt)
 {
struct igb_rx_buffer *rx_buffer;
 
rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
+   *rx_buf_pgcnt =
+#if (PAGE_SIZE < 8192)
+   page_count(rx_buffer->page);
+#else
+   0;
+#endif
prefetchw(rx_buffer->page);
 
/* we are reusing so sync this buffer for CPU use */
@@ -8652,9 +8659,9 @@ static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
 }
 
 static void igb_put_rx_buffer(struct igb_ring *rx_ring,
- struct igb_rx_buffer *rx_buffer)
+ struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt)
 {
-   if (igb_can_reuse_rx_page(rx_buffer)) {
+   if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) {
/* hand second half of page back to the ring */
igb_reuse_rx_page(rx_ring, rx_buffer);
} else {
@@ -8681,6 +8688,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
u16 cleaned_count = igb_desc_unused(rx_ring);
unsigned int xdp_xmit = 0;
struct xdp_buff xdp;
+   int rx_buf_pgcnt;
 
xdp.rxq = &rx_ring->xdp_rxq;
 
@@ -8711,7 +8719,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 */
dma_rmb();
 
-   rx_buffer = igb_get_rx_buffer(rx_ring, size);
+   rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
 
/* retrieve a buffer from the ring */
if (!skb) {
@@ -8754,7 +8762,7 @@ static int igb_clean_rx_irq(struct igb_q_vector 
*q_vector, const int budget)
break;
}
 
-   igb_put_rx_buffer(rx_ring, rx_buffer);
+   igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt);
cleaned_count++;
 
/* fetch next buffer in frame if non-eop */
-- 
2.17.3



[PATCH][V2] libbpf: add support for canceling cached_cons advance

2020-11-23 Thread Li RongQing
Add a new function for returning descriptors the user received
after an xsk_ring_cons__peek call. After the application has
gotten a number of descriptors from a ring, it might not be able
to or want to process them all for various reasons. Therefore,
it would be useful to have an interface for returning or
cancelling a number of them so that they are returned to the ring.

This patch adds a new function called xsk_ring_cons__cancel that
performs this operation on nb descriptors counted from the end of
the batch of descriptors that was received through the peek call.

Signed-off-by: Li RongQing 
[ Magnus Karlsson: rewrote changelog ]
Cc: Magnus Karlsson 
---
diff with v1: fix the build error, and rewrote the changelog

 tools/lib/bpf/xsk.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
index 1069c46364ff..1719a327e5f9 100644
--- a/tools/lib/bpf/xsk.h
+++ b/tools/lib/bpf/xsk.h
@@ -153,6 +153,12 @@ static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons,
return entries;
 }
 
+static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons,
+size_t nb)
+{
+   cons->cached_cons -= nb;
+}
+
 static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb)
 {
/* Make sure data has been read before indicating we are done
-- 
2.17.3
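
A minimal usage sketch of the new helper (assuming an initialized
struct xsk_ring_cons named rx; process_descriptor() is a hypothetical
application callback):

	size_t rcvd, done = 0;
	__u32 idx_rx;

	rcvd = xsk_ring_cons__peek(&rx, 64, &idx_rx);
	while (done < rcvd) {
		const struct xdp_desc *desc =
			xsk_ring_cons__rx_desc(&rx, idx_rx + done);

		if (!process_descriptor(desc))	/* e.g. out of buffers */
			break;
		done++;
	}

	/* Hand the unprocessed tail back to the ring... */
	xsk_ring_cons__cancel(&rx, rcvd - done);
	/* ...and release only what was actually consumed. */
	xsk_ring_cons__release(&rx, done);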



RE: [PATCH] libbpf: add support for canceling cached_cons advance

2020-11-23 Thread Li,Rongqing


> -Original Message-
> From: Magnus Karlsson [mailto:magnus.karls...@gmail.com]
> Sent: Monday, November 23, 2020 5:40 PM
> To: Li,Rongqing 
> Cc: Network Development ; bpf
> 
> Subject: Re: [PATCH] libbpf: add support for canceling cached_cons advance
> 
> On Sun, Nov 22, 2020 at 2:21 PM Li RongQing  wrote:
> >
> > It is possible to fail receiving packets after calling
> > xsk_ring_cons__peek, at this condition, cached_cons has been advanced,
> > should be cancelled.
> 
> Thanks RongQing,
> 
> I have needed this myself in various situations, so I think we should add 
> this.
> But your motivation in the commit message is somewhat confusing. How about
> something like this?
> 
> Add a new function for returning descriptors the user received after an
> xsk_ring_cons__peek call. After the application has gotten a number of
> descriptors from a ring, it might not be able to or want to process them all 
> for
> various reasons. Therefore, it would be useful to have an interface for 
> returning
> or cancelling a number of them so that they are returned to the ring. This 
> patch
> adds a new function called xsk_ring_cons__cancel that performs this operation
> on nb descriptors counted from the end of the batch of descriptors that was
> received through the peek call.
> 
> Replace your commit message with this, fix the bug below, send a v2 and then I
> am happy to ack this.


Thank you very much
> 
> /Magnus
> 
> > Signed-off-by: Li RongQing 
> > ---
> >  tools/lib/bpf/xsk.h | 6 ++
> >  1 file changed, 6 insertions(+)
> >
> > diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index
> > 1069c46364ff..4128215c246b 100644
> > --- a/tools/lib/bpf/xsk.h
> > +++ b/tools/lib/bpf/xsk.h
> > @@ -153,6 +153,12 @@ static inline size_t xsk_ring_cons__peek(struct
> xsk_ring_cons *cons,
> > return entries;
> >  }
> >
> > +static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons,
> > +size_t nb) {
> > +   rx->cached_cons -= nb;
> 
> cons-> not rx->. Please make sure the v2 compiles and passes checkpatch.
> 

Sorry for the build error.
I will send V2

Thanks 

-Li


> > +}
> > +
> >  static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons,
> > size_t nb)  {
> > /* Make sure data has been read before indicating we are done
> > --
> > 2.17.3
> >


[PATCH] libbpf: add support for canceling cached_cons advance

2020-11-22 Thread Li RongQing
It is possible to fail to receive packets after calling
xsk_ring_cons__peek; in that case cached_cons has been
advanced and should be cancelled.

Signed-off-by: Li RongQing 
---
 tools/lib/bpf/xsk.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
index 1069c46364ff..4128215c246b 100644
--- a/tools/lib/bpf/xsk.h
+++ b/tools/lib/bpf/xsk.h
@@ -153,6 +153,12 @@ static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons,
return entries;
 }
 
+static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons,
+size_t nb)
+{
+   rx->cached_cons -= nb;
+}
+
 static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb)
 {
/* Make sure data has been read before indicating we are done
-- 
2.17.3



[PATCH][next][v2] iavf: use kvzalloc instead of kzalloc for rx/tx_bi buffer

2020-08-27 Thread Li RongQing
When changing the rx/tx ring size to 4096, kzalloc may fail due to
a temporary shortage of slab entries.

So use kvzalloc to allocate this memory, as there is no need for
this memory area to be physically contiguous.

Also pass __GFP_RETRY_MAYFAIL so the allocation uses kmalloc as
far as possible, which causes less TLB pressure than vmalloc,
as suggested by Eric Dumazet.

Signed-off-by: Li RongQing 
---
v2: __GFP_RETRY_MAYFAIL is used

 drivers/net/ethernet/intel/iavf/iavf_txrx.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index 256fa07d54d5..e7a1e9039cee 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -92,7 +92,7 @@ void iavf_clean_tx_ring(struct iavf_ring *tx_ring)
 void iavf_free_tx_resources(struct iavf_ring *tx_ring)
 {
iavf_clean_tx_ring(tx_ring);
-   kfree(tx_ring->tx_bi);
+   kvfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
 
if (tx_ring->desc) {
@@ -622,7 +622,8 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)
/* warn if we are about to overwrite the pointer */
WARN_ON(tx_ring->tx_bi);
bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count;
-   tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
+   tx_ring->tx_bi = kvzalloc(bi_size, GFP_KERNEL |
+ __GFP_RETRY_MAYFAIL);
if (!tx_ring->tx_bi)
goto err;
 
@@ -643,7 +644,7 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)
return 0;
 
 err:
-   kfree(tx_ring->tx_bi);
+   kvfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
return -ENOMEM;
 }
@@ -714,7 +715,7 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
 void iavf_free_rx_resources(struct iavf_ring *rx_ring)
 {
iavf_clean_rx_ring(rx_ring);
-   kfree(rx_ring->rx_bi);
+   kvfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
 
if (rx_ring->desc) {
@@ -738,7 +739,8 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
/* warn if we are about to overwrite the pointer */
WARN_ON(rx_ring->rx_bi);
bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
-   rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
+   rx_ring->rx_bi = kvzalloc(bi_size, GFP_KERNEL |
+ __GFP_RETRY_MAYFAIL);
if (!rx_ring->rx_bi)
goto err;
 
@@ -762,7 +764,7 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
 
return 0;
 err:
-   kfree(rx_ring->rx_bi);
+   kvfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
return -ENOMEM;
 }
-- 
2.16.2
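
For context, the fallback behaviour the patch relies on, as a
simplified illustration (the real implementation lives in mm/util.c
and handles more cases):

	/* Rough idea of kvzalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL):
	 * try physically contiguous memory first; the flag lets the page
	 * allocator run reclaim instead of bailing out immediately, so
	 * the kmalloc path succeeds more often. Only then fall back to
	 * vmalloc, which is virtually contiguous but costs TLB pressure.
	 */
	static void *ring_buf_zalloc(size_t size)
	{
		void *p = kzalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL |
	  __GFP_NOWARN);

		if (!p)
			p = vzalloc(size);
		return p;
	}

Buffers allocated this way must be freed with kvfree(), which
dispatches to kfree() or vfree() as appropriate -- hence the kfree()
to kvfree() conversions in the patch.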



RE: [PATCH] iavf: use kvzalloc instead of kzalloc for rx/tx_bi buffer

2020-08-27 Thread Li,Rongqing


> -Original Message-
> From: Eric Dumazet [mailto:eric.duma...@gmail.com]
> Sent: Thursday, August 27, 2020 5:55 PM
> To: Li,Rongqing ; Eric Dumazet
> ; netdev@vger.kernel.org;
> intel-wired-...@lists.osuosl.org
> Subject: Re: [PATCH] iavf: use kvzalloc instead of kzalloc for rx/tx_bi buffer
> 
> 
> 
> On 8/27/20 1:53 AM, Li,Rongqing wrote:
> >
> >
> >> -Original Message-
> >> From: Eric Dumazet [mailto:eric.duma...@gmail.com]
> >> Sent: Thursday, August 27, 2020 4:26 PM
> >> To: Li,Rongqing ; netdev@vger.kernel.org;
> >> intel-wired-...@lists.osuosl.org
> >> Subject: Re: [PATCH] iavf: use kvzalloc instead of kzalloc for
> >> rx/tx_bi buffer
> >>
> >>
> >>
> >> On 8/27/20 12:53 AM, Li RongQing wrote:
> >>> when changes the rx/tx ring to 4096, kzalloc may fail due to a
> >>> temporary shortage on slab entries.
> >>>
> >>> kvmalloc is used to allocate this memory as there is no need to have
> >>> this memory area physical continuously.
> >>>
> >>> Signed-off-by: Li RongQing 
> >>> ---
> >>
> >>
> >> Well, fallback to vmalloc() overhead because order-1 pages are not
> >> readily available when the NIC is setup (usually one time per boot)
> >> is adding TLB cost at run time, for billions of packets to come, maybe for
> months.
> >>
> >> Surely trying a bit harder to get order-1 pages is desirable.
> >>
> >>  __GFP_RETRY_MAYFAIL is supposed to help here.
> >
> > Could we add __GFP_RETRY_MAYFAIL to kvmalloc, to ensure the allocation
> success ?
> 
> __GFP_RETRY_MAYFAIL does not _ensure_ the allocation success.
> 
> The idea here is that for large allocations (bigger than PAGE_SIZE),
> kvmalloc_node() will not force __GFP_NORETRY, meaning that page allocator
> will not bailout immediately in case of memory pressure.
> 
> This gives a chance for page reclaims to happen, and eventually the high order
> page allocation will succeed under normal circumstances.
> 
> It is a trade-off, and only worth it for long living allocations.

Thanks, Eric;
I will change it as below, so that kmalloc is used most of the time, and the
allocation only falls back to vmalloc if reclaim fails under memory pressure.

@@ -622,7 +622,7 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)
/* warn if we are about to overwrite the pointer */
WARN_ON(tx_ring->tx_bi);
bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count;
-   tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
+   tx_ring->tx_bi = kvzalloc(bi_size, GFP_KERNEL|__GFP_RETRY_MAYFAIL);
if (!tx_ring->tx_bi)
goto err;
 
@@ -643,7 +643,7 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)


-Li


RE: [PATCH] iavf: use kvzalloc instead of kzalloc for rx/tx_bi buffer

2020-08-27 Thread Li,Rongqing


> -Original Message-
> From: Eric Dumazet [mailto:eric.duma...@gmail.com]
> Sent: Thursday, August 27, 2020 4:26 PM
> To: Li,Rongqing ; netdev@vger.kernel.org;
> intel-wired-...@lists.osuosl.org
> Subject: Re: [PATCH] iavf: use kvzalloc instead of kzalloc for rx/tx_bi buffer
> 
> 
> 
> On 8/27/20 12:53 AM, Li RongQing wrote:
> > when changes the rx/tx ring to 4096, kzalloc may fail due to a
> > temporary shortage on slab entries.
> >
> > kvmalloc is used to allocate this memory as there is no need to have
> > this memory area physical continuously.
> >
> > Signed-off-by: Li RongQing 
> > ---
> 
> 
> Well, fallback to vmalloc() overhead because order-1 pages are not readily
> available when the NIC is setup (usually one time per boot) is adding TLB cost
> at run time, for billions of packets to come, maybe for months.
> 
> Surely trying a bit harder to get order-1 pages is desirable.
> 
>  __GFP_RETRY_MAYFAIL is supposed to help here.

Could we add __GFP_RETRY_MAYFAIL to kvmalloc, to ensure the allocation succeeds?

> 

I see that lots of drivers are using vmalloc for this buffer; should we
change them to kmalloc?

grep "buffer_info =" drivers/net/ethernet/intel/ -rI|grep alloc

drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c:  tx_ring->tx_buffer_info = vmalloc(size);
drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c:  rx_ring->rx_buffer_info = vmalloc(size);
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c:  tx_ring->tx_buffer_info = vmalloc_node(size, ring_node);
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c:  tx_ring->tx_buffer_info = vmalloc(size);
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c:  rx_ring->rx_buffer_info = vmalloc_node(size, ring_node);
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c:  rx_ring->rx_buffer_info = vmalloc(size);
drivers/net/ethernet/intel/ixgb/ixgb_main.c:    txdr->buffer_info = vzalloc(size);
drivers/net/ethernet/intel/ixgb/ixgb_main.c:    rxdr->buffer_info = vzalloc(size);
drivers/net/ethernet/intel/e1000e/ethtool.c:    tx_ring->buffer_info = kcalloc(tx_ring->count,
drivers/net/ethernet/intel/e1000e/ethtool.c:    rx_ring->buffer_info = kcalloc(rx_ring->count,
drivers/net/ethernet/intel/e1000e/netdev.c: tx_ring->buffer_info = vzalloc(size);
drivers/net/ethernet/intel/e1000e/netdev.c: rx_ring->buffer_info = vzalloc(size);
drivers/net/ethernet/intel/igb/igb_main.c:  tx_ring->tx_buffer_info = vmalloc(size);
drivers/net/ethernet/intel/igb/igb_main.c:  rx_ring->rx_buffer_info = vmalloc(size);
drivers/net/ethernet/intel/e1000/e1000_ethtool.c:   txdr->buffer_info = kcalloc(txdr->count, sizeof(struct e1000_tx_buffer),
drivers/net/ethernet/intel/e1000/e1000_ethtool.c:   rxdr->buffer_info = kcalloc(rxdr->count, sizeof(struct e1000_rx_buffer),
drivers/net/ethernet/intel/e1000/e1000_main.c:  txdr->buffer_info = vzalloc(size);
drivers/net/ethernet/intel/e1000/e1000_main.c:  rxdr->buffer_info = vzalloc(size);
drivers/net/ethernet/intel/igc/igc_main.c:  tx_ring->tx_buffer_info = vzalloc(size);
drivers/net/ethernet/intel/igc/igc_main.c:  rx_ring->rx_buffer_info = vzalloc(size);
drivers/net/ethernet/intel/igbvf/netdev.c:  tx_ring->buffer_info = vzalloc(size);
drivers/net/ethernet/intel/igbvf/netdev.c:  rx_ring->buffer_info = vzalloc(size);


-Li


[PATCH] iavf: use kvzalloc instead of kzalloc for rx/tx_bi buffer

2020-08-27 Thread Li RongQing
When changing the rx/tx ring size to 4096, kzalloc may fail due to
a temporary shortage of slab entries.

kvzalloc is used to allocate this memory, as there is no need for
this memory area to be physically contiguous.

Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/iavf/iavf_txrx.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index 256fa07d54d5..f5a3e195ec54 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -92,7 +92,7 @@ void iavf_clean_tx_ring(struct iavf_ring *tx_ring)
 void iavf_free_tx_resources(struct iavf_ring *tx_ring)
 {
iavf_clean_tx_ring(tx_ring);
-   kfree(tx_ring->tx_bi);
+   kvfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
 
if (tx_ring->desc) {
@@ -622,7 +622,7 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)
/* warn if we are about to overwrite the pointer */
WARN_ON(tx_ring->tx_bi);
bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count;
-   tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
+   tx_ring->tx_bi = kvzalloc(bi_size, GFP_KERNEL);
if (!tx_ring->tx_bi)
goto err;
 
@@ -643,7 +643,7 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)
return 0;
 
 err:
-   kfree(tx_ring->tx_bi);
+   kvfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
return -ENOMEM;
 }
@@ -714,7 +714,7 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
 void iavf_free_rx_resources(struct iavf_ring *rx_ring)
 {
iavf_clean_rx_ring(rx_ring);
-   kfree(rx_ring->rx_bi);
+   kvfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
 
if (rx_ring->desc) {
@@ -738,7 +738,7 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
/* warn if we are about to overwrite the pointer */
WARN_ON(rx_ring->rx_bi);
bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
-   rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
+   rx_ring->rx_bi = kvzalloc(bi_size, GFP_KERNEL);
if (!rx_ring->rx_bi)
goto err;
 
@@ -762,7 +762,7 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
 
return 0;
 err:
-   kfree(rx_ring->rx_bi);
+   kvfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
return -ENOMEM;
 }
-- 
2.16.2



RE: [PATCH net 2/3] ixgbe: avoid premature Rx buffer reuse

2020-08-25 Thread Li,Rongqing


> -Original Message-
> From: Björn Töpel [mailto:bjorn.to...@gmail.com]
> Sent: Tuesday, August 25, 2020 5:16 PM
> To: jeffrey.t.kirs...@intel.com; intel-wired-...@lists.osuosl.org
> Cc: Björn Töpel ; magnus.karls...@intel.com;
> magnus.karls...@gmail.com; netdev@vger.kernel.org;
> maciej.fijalkow...@intel.com; piotr.raczyn...@intel.com;
> maciej.machnikow...@intel.com; Li,Rongqing 
> Subject: [PATCH net 2/3] ixgbe: avoid premature Rx buffer reuse
> 
> From: Björn Töpel 
> 
> The page recycle code, incorrectly, relied on that a page fragment could not 
> be
> freed inside xdp_do_redirect(). This assumption leads to that page fragments
> that are used by the stack/XDP redirect can be reused and overwritten.
> 
> To avoid this, store the page count prior invoking xdp_do_redirect().
> 
> Fixes: 6453073987ba ("ixgbe: add initial support for xdp redirect")
> Signed-off-by: Björn Töpel 

Reported-and-analyzed-by: Li RongQing 

Thanks

-Li



RE: [PATCH net 3/3] ice: avoid premature Rx buffer reuse

2020-08-25 Thread Li,Rongqing


> -Original Message-
> From: Björn Töpel [mailto:bjorn.to...@gmail.com]
> Sent: Tuesday, August 25, 2020 5:16 PM
> To: jeffrey.t.kirs...@intel.com; intel-wired-...@lists.osuosl.org
> Cc: Björn Töpel ; magnus.karls...@intel.com;
> magnus.karls...@gmail.com; netdev@vger.kernel.org;
> maciej.fijalkow...@intel.com; piotr.raczyn...@intel.com;
> maciej.machnikow...@intel.com; Li,Rongqing 
> Subject: [PATCH net 3/3] ice: avoid premature Rx buffer reuse
> 
> From: Björn Töpel 
> 
> The page recycle code, incorrectly, relied on that a page fragment could not 
> be
> freed inside xdp_do_redirect(). This assumption leads to that page fragments
> that are used by the stack/XDP redirect can be reused and overwritten.
> 
> To avoid this, store the page count prior invoking xdp_do_redirect().
> 
> Fixes: efc2214b6047 ("ice: Add support for XDP")
> Signed-off-by: Björn Töpel 


Reported-and-analyzed-by: Li RongQing 

Thanks

-Li



[PATCH][next] i40e: switch kvzalloc to allocate rx/tx_bi buffer

2020-08-21 Thread Li RongQing
when changes the rx/tx ring to 4096, rx/tx_bi needs an order
5 pages, and allocation maybe fail due to memory fragmentation
so switch to kvzalloc

 i40e 0000:1a:00.0 xgbe0: Changing Rx descriptor count from 512 to 4096
 ethtool: page allocation failure: order:5, mode:0x60c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null)
 ethtool cpuset=/ mems_allowed=0
 CPU: 34 PID: 47182 Comm: ethtool Kdump: loaded Tainted: G E 4.19 #1
 Hardware name: Inspur BJINSPURV2G5Y72-32A/SN6115M5, BIOS 3.0.12 03/29/2018
 Call Trace:
  dump_stack+0x66/0x8b
  warn_alloc+0xff/0x1a0
  __alloc_pages_slowpath+0xcc9/0xd00
  __alloc_pages_nodemask+0x25e/0x2a0
  kmalloc_order+0x14/0x40
  kmalloc_order_trace+0x1d/0xb0
  i40e_setup_rx_descriptors+0x47/0x1e0 [i40e]
  i40e_set_ringparam+0x25e/0x7c0 [i40e]
  dev_ethtool+0x1fa3/0x2920
  ? inet_ioctl+0xe0/0x250
  ? __rtnl_unlock+0x25/0x40
  ? netdev_run_todo+0x5e/0x2f0
  ? dev_ioctl+0xb3/0x560
  dev_ioctl+0xb3/0x560
  sock_do_ioctl+0xae/0x150
  ? sock_ioctl+0x1c6/0x310
  sock_ioctl+0x1c6/0x310
  ? do_vfs_ioctl+0xa4/0x630
  ? dlci_ioctl_set+0x30/0x30
  do_vfs_ioctl+0xa4/0x630
  ? handle_mm_fault+0xe6/0x240
  ? __do_page_fault+0x288/0x510
  ksys_ioctl+0x70/0x80
  __x64_sys_ioctl+0x16/0x20
  do_syscall_64+0x5b/0x1b0
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 10 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index a63548cb022d..1a817580b945 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3260,7 +3260,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
if (ring->vsi->type == I40E_VSI_MAIN)
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
 
-   kfree(ring->rx_bi);
+   kvfree(ring->rx_bi);
ring->xsk_umem = i40e_xsk_umem(ring);
if (ring->xsk_umem) {
ret = i40e_alloc_rx_bi_zc(ring);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 5f9fe55bb66d..4dc7d6e6b226 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -674,7 +674,7 @@ void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
 void i40e_free_tx_resources(struct i40e_ring *tx_ring)
 {
i40e_clean_tx_ring(tx_ring);
-   kfree(tx_ring->tx_bi);
+   kvfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
 
if (tx_ring->desc) {
@@ -1273,7 +1273,7 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
/* warn if we are about to overwrite the pointer */
WARN_ON(tx_ring->tx_bi);
bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
-   tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
+   tx_ring->tx_bi = kvzalloc(bi_size, GFP_KERNEL);
if (!tx_ring->tx_bi)
goto err;
 
@@ -1300,7 +1300,7 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
return 0;
 
 err:
-   kfree(tx_ring->tx_bi);
+   kvfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
return -ENOMEM;
 }
@@ -1309,7 +1309,7 @@ int i40e_alloc_rx_bi(struct i40e_ring *rx_ring)
 {
unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count;
 
-   rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL);
+   rx_ring->rx_bi = kvzalloc(sz, GFP_KERNEL);
return rx_ring->rx_bi ? 0 : -ENOMEM;
 }
 
@@ -1394,7 +1394,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
if (rx_ring->vsi->type == I40E_VSI_MAIN)
xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
rx_ring->xdp_prog = NULL;
-   kfree(rx_ring->rx_bi);
+   kvfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
 
if (rx_ring->desc) {
-- 
2.16.2



RE: RE: [Intel-wired-lan] [PATCH 0/2] intel/xdp fixes for flipping rx buffer

2020-08-19 Thread Li,Rongqing


> -Original Message-
> From: Björn Töpel [mailto:bjorn.to...@intel.com]
> Sent: August 19, 2020 14:45
> To: Li,Rongqing ; Björn Töpel
> 
> Cc: Netdev ; intel-wired-lan
> ; Karlsson, Magnus
> ; bpf ; Maciej Fijalkowski
> ; Piotr ; Maciej
> 
> Subject: Re: RE: [Intel-wired-lan] [PATCH 0/2] intel/xdp fixes for flipping rx buffer
> 
> On 2020-08-19 03:37, Li,Rongqing wrote:
> [...]
>  > Hi:
>  >
>  > Thanks for your explanation.
>  >
>  > But we can reproduce this bug
>  >
>  > We use ebpf to redirect only-Vxlan packets to non-zerocopy AF_XDP, First we
> see panic on tcp stack, in tcp_collapse: BUG_ON(offset < 0); it is very hard 
> to
> reproduce.
>  >
>  > Then we use the scp to do test, and has lots of vxlan packet at the same
> time, scp will be broken frequently.
>  >
> 
> Ok! Just so that I'm certain of your setup. You receive packets to an i40e 
> netdev
> where there's an XDP program. The program does XDP_PASS or XDP_REDIRECT
> to e.g. devmap for non-vxlan packets. However, vxlan packets are redirected to
> AF_XDP socket(s) in *copy-mode*. Am I understanding that correct?
> 
Similar to your description,

but the xdp program only redirects vxlan packets to the af_xdp socket;
other packets, like scp/ssh packets, go to the Linux kernel networking stack.


> I'm assuming this is an x86-64 with 4k page size, right? :-) The page 
> flipping is a
> bit different if the PAGE_SIZE is not 4k.
> 

We use a 4k page size, so page flipping is 4k; we did not change the i40e
driver, and we run the 4.19 stable kernel.

>  > With this fixes, scp has not been broken again, and kernel is not panic
> again  >
> 
> Let's dig into your scenario.
> 
> Are you saying the following:
> 
> Page A:
> +
> | "first skb" > Rx HW ring entry X 
> +
> | "second skb"> Rx HW ring entry X+1 (or X+n)
> +
> 

Like:

The first skb goes into a tcp socket rx queue.

The second skb is a vxlan packet; its data is copied to the af_xdp socket,
and the page of skb->data is released.

> This is a scenario that shouldn't be allowed, because there are now two users
> of the page. If that's the case, the refcounting is broken. Is that the case?
> 

True, it is broken for copy mode xsk

-Li

> Check out i40e_can_reuse_rx_page(). The idea with page flipping/reuse is that
> the page is only reused if there is only one user.
> 
>  > Seem your explanation is unable to solve my analysis:
>  >
>  > 1. first skb is not for xsk, and forwarded to another device
>  >or socket queue
> 
> The data for the "first skb" resides on a page:
> A:
> +
> | "first skb"
> +
> | to be reused
> +
> refcount >>1
> 
>  > 2. seconds skb is for xsk, copy data to xsk memory, and page
>  >of skb->data is released
> 
> Note that page B != page A.
> 
> B:
> +
> | to be reused/or used by the stack
> +
> | "second skb for xsk"
> +
> refcount >>1
> 
> data is copied to socket, page_frag_free() is called, and the page count is
> decreased. The driver will then check if the page can be reused. If not, it's 
> freed
> to the page allocator.
> 
>  > 3. rx_buff is reusable since only first skb is in it, but
>  >*_rx_buffer_flip will make that page_offset is set to
>  >first skb data
> 
> I'm having trouble grasping how this is possible. More than one user implies
> that it wont be reused. If this is possible, the recounting/reuse mechanism is
> broken, and that is what should be fixed.
> 
> The AF_XDP redirect should not have semantics different from, say, devmap
> redirect. It's just that the page_frag_free() is called earlier for AF_XDP, 
> instead
> of from i40e_clean_tx_irq() as the case for devmap/XDP_TX.
> 
>  > 4. then reuse rx buffer, first skb which still is living
>  >will be corrupted.
>  >
>  >
>  > The root cause is difference you said upper, so I only fixes for 
> non-zerocopy
> AF_XDP  >
> 
> I have only addressed non-zerocopy, so we're on the same page (pun
> intended) here!
> 
> 
> Björn
> 
>  > -Li


RE: [Intel-wired-lan] [PATCH 0/2] intel/xdp fixes for flipping rx buffer

2020-08-18 Thread Li,Rongqing


> -Original Message-
> From: Björn Töpel [mailto:bjorn.to...@gmail.com]
> Sent: August 18, 2020 22:05
> To: Li,Rongqing 
> Cc: Netdev ; intel-wired-lan
> ; Karlsson, Magnus
> ; Björn Töpel ; bpf
> ; Maciej Fijalkowski ;
> Piotr ; Maciej 
> Subject: Re: [Intel-wired-lan] [PATCH 0/2] intel/xdp fixes for flipping rx buffer
> 
> On Fri, 17 Jul 2020 at 08:24, Li RongQing  wrote:
> >
> > This fixes ice/i40e/ixgbe/ixgbevf_rx_buffer_flip in copy mode xdp that
> > can lead to data corruption.
> >
> > I split two patches, since i40e/xgbe/ixgbevf supports xsk receiving
> > from 4.18, put their fixes in a patch
> >
> 
> Li, sorry for the looong latency. I took a looong vacation. :-P
> 
> Thanks for taking a look at this, but I believe this is not a bug.
> 
> The Intel Ethernet drivers (obviously non-zerocopy AF_XDP -- "good ol'
> XDP") use a page reuse algorithm.
> 
> Basic idea is that a page is allocated from the page allocator
> (i40e_alloc_mapped_page()). The refcount is increased to USHRT_MAX. The
> page is split into two chunks (simplified). If there's one user of the page, 
> the
> page can be reused (flipped). If not, a new page needs to be allocated (with 
> the
> large refcount).
> 
> So, the idea is that usually the page can be reused (flipped), and the page 
> only
> needs to be "put" not "get" since the refcount was initally bumped to a large
> value.
> 
> All frames (except XDP_DROP which can be reused directly) "die" via
> page_frag_free() which decreases the page refcount, and frees the page if the
> refcount is zero.
> 
> Let's take some scenarios as examples:
> 
> 1. A frame is received in "vanilla" XDP (MEM_TYPE_PAGE_SHARED), and
>the XDP program verdict is XDP_TX. The frame will be placed on the
>HW Tx ring, and freed* (async) in i40e_clean_tx_irq:
> /* free the skb/XDP data */
> if (ring_is_xdp(tx_ring))
> xdp_return_frame(tx_buf->xdpf); // calls page_frag_free()
> 
> 2. A frame is passed to the stack, eventually it's freed* via
>skb_free_frag().
> 
> 3. A frame is passed to an AF_XDP socket. The data is copied to the
>socket data area, and the frame is directly freed*.
> 
> Not the * by the freed. Actually freeing here means calling page_frag_free(),
> which means decreasing the refcount. The page reuse algorithm makes sure
> that the buffers are not stale.
> 
> The only difference from XDP_TX and XDP_DIRECT to dev/cpumaps, compared
> to AF_XDP sockets is that the latter calls page_frag_free() directly, whereas
> the other does it asynchronous from the Tx clean up phase.
> 

Hi:

Thanks for your explanation.

But we can reproduce this bug

We use eBPF to redirect only vxlan packets to non-zerocopy AF_XDP. First we
saw a panic in the tcp stack, in tcp_collapse: BUG_ON(offset < 0); it is
very hard to reproduce.

Then we used scp to test, with lots of vxlan packets arriving at the same
time, and scp broke frequently.

With these fixes, scp has not broken again, and the kernel has not panicked again.

It seems your explanation does not address my analysis:

   1. the first skb is not for xsk, and is forwarded to another
  device or socket queue
   2. the second skb is for xsk; its data is copied to xsk memory,
  and the page of skb->data is released
   3. the rx_buff is reusable since only the first skb is in it,
  but *_rx_buffer_flip will set page_offset to the first
  skb's data
   4. the rx buffer is then reused, and the first skb, which is
  still live, will be corrupted


The root cause is the difference you mentioned above, so I only fix the
non-zerocopy AF_XDP case.

-Li
> Let me know if it's still not clear, but the bottom line is that none of these
> patches are needed.
> 
> 
> Thanks!
> Björn
> 
> 
> > Li RongQing (2):
> >   xdp: i40e: ixgbe: ixgbevf: not flip rx buffer for copy mode xdp
> >   ice/xdp: not adjust rx buffer for copy mode xdp
> >
> >  drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 5 -
> >  drivers/net/ethernet/intel/ice/ice_txrx.c | 5 -
> >  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 -
> >  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 5 -
> >  include/net/xdp.h | 3 +++
> >  net/xdp/xsk.c | 4 +++-
> >  6 files changed, 22 insertions(+), 5 deletions(-)
> >
> > --
> > 2.16.2
> >
> > ___
> > Intel-wired-lan mailing list
> > intel-wired-...@osuosl.org
> > https://lists.osuosl.org/mailman/listinfo/intel-wired-lan


[PATCH][v3] i40e: optimise prefetch page refcount

2020-08-18 Thread Li RongQing
Originally the refcount of the rx_buffer page was incremented here,
so prefetchw was needed. But after commit 1793668c3b8c
("i40e/i40evf: Update code to better handle incrementing page
count"), the refcount is no longer incremented every time, so
change prefetchw to prefetch.

Now the prefetch mainly serves page_address(), which accesses
struct page only when WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL is
defined; otherwise it computes the address from the page's
position, so we prefetch conditionally.

Jakub suggested to define prefetch_page_address in a common header

Reported-by: kernel test robot 
Suggested-by: Jakub Kicinski 
Signed-off-by: Li RongQing 
---
diff with v2: fix a build warning -Wvisibility 
diff with v1: create a common function prefetch_page_address
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +-
 include/linux/prefetch.h| 8 
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 62f5b2d35f63..5f9fe55bb66d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1953,7 +1953,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *rx_buffer;
 
rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
-   prefetchw(rx_buffer->page);
+   prefetch_page_address(rx_buffer->page);
 
/* we are reusing so sync this buffer for CPU use */
dma_sync_single_range_for_cpu(rx_ring->dev,
diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h
index 13eafebf3549..b83a3f944f28 100644
--- a/include/linux/prefetch.h
+++ b/include/linux/prefetch.h
@@ -15,6 +15,7 @@
 #include 
 #include 
 
+struct page;
 /*
prefetch(x) attempts to pre-emptively get the memory pointed to
by address "x" into the CPU L1 cache. 
@@ -62,4 +63,11 @@ static inline void prefetch_range(void *addr, size_t len)
 #endif
 }
 
+static inline void prefetch_page_address(struct page *page)
+{
+#if defined(WANT_PAGE_VIRTUAL) || defined(HASHED_PAGE_VIRTUAL)
+   prefetch(page);
+#endif
+}
+
 #endif
-- 
2.16.2
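
For context, a simplified view of why the prefetch only pays off when
one of those macros is defined (illustrative; not the exact mm code --
the real page_address() lives in include/linux/mm.h and mm/highmem.c):

	#if defined(WANT_PAGE_VIRTUAL)
	/* The virtual address is stored inside struct page, so the
	 * struct must actually be read -- prefetching it helps.
	 */
	static inline void *example_page_address(const struct page *page)
	{
		return page->virtual;
	}
	#else
	/* Common 64-bit case: pure arithmetic on the page's position
	 * in the memory map; struct page itself is never dereferenced,
	 * so a prefetch of it buys nothing. (HASHED_PAGE_VIRTUAL
	 * instead looks the address up in a hash table.)
	 */
	static inline void *example_page_address(const struct page *page)
	{
		return (void *)((page_to_pfn(page) << PAGE_SHIFT) +
	PAGE_OFFSET);
	}
	#endif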



[PATCH][v2] i40e: optimise prefetch page refcount

2020-07-31 Thread Li RongQing
Originally the refcount of the rx_buffer page was incremented here,
so prefetchw was needed. But after commit 1793668c3b8c
("i40e/i40evf: Update code to better handle incrementing page
count"), the refcount is no longer incremented every time, so
change prefetchw to prefetch.

Now the prefetch mainly serves page_address(), which accesses
struct page only when WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL is
defined; otherwise it computes the address from the page's
position, so we prefetch conditionally.

Jakub suggested to define prefetch_page_address in a common header

Suggested-by: Jakub Kicinski 
Signed-off-by: Li RongQing 
---
diff with v1: create a common function prefetch_page_address

 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +-
 include/linux/prefetch.h| 7 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 62f5b2d35f63..5f9fe55bb66d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1953,7 +1953,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *rx_buffer;
 
rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
-   prefetchw(rx_buffer->page);
+   prefetch_page_address(rx_buffer->page);
 
/* we are reusing so sync this buffer for CPU use */
dma_sync_single_range_for_cpu(rx_ring->dev,
diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h
index 13eafebf3549..1e0283982dd3 100644
--- a/include/linux/prefetch.h
+++ b/include/linux/prefetch.h
@@ -62,4 +62,11 @@ static inline void prefetch_range(void *addr, size_t len)
 #endif
 }
 
+static inline void prefetch_page_address(struct page *page)
+{
+#if defined(WANT_PAGE_VIRTUAL) || defined(HASHED_PAGE_VIRTUAL)
+   prefetch(page);
+#endif
+}
+
 #endif
-- 
2.16.2



RE: [net-next 2/6] i40e: prefetch struct page of Rx buffer conditionally

2020-07-29 Thread Li,Rongqing


> -Original Message-
> From: Jakub Kicinski [mailto:k...@kernel.org]
> Sent: July 30, 2020 5:20
> To: Li,Rongqing 
> Cc: Tony Nguyen ; da...@davemloft.net;
> netdev@vger.kernel.org; nhor...@redhat.com; sassm...@redhat.com;
> jeffrey.t.kirs...@intel.com; Andrew Bowers 
> Subject: Re: [net-next 2/6] i40e: prefetch struct page of Rx buffer conditionally
> 
> On Wed, 29 Jul 2020 06:20:47 + Li,Rongqing wrote:
> > > Looks like something that belongs in a common header not
> > > (potentially
> > > multiple) C sources.
> >
> > Not clear, how should I change?
> 
> Can you add something like:
> 
> static inline void prefetch_page_address(struct page *page) { #if
> defined(WANT_PAGE_VIRTUAL) || defined(HASHED_PAGE_VIRTUAL)
>   prefetch(page);
> #endif
> }
> 
> to mm.h or prefetch.h or i40e.h or some other header? That's preferred over
> adding #ifs directly in the source code.


Thanks, I will send a new version

-Li


RE: [net-next 2/6] i40e: prefetch struct page of Rx buffer conditionally

2020-07-28 Thread Li,Rongqing


> -Original Message-
> From: Jakub Kicinski [mailto:k...@kernel.org]
> Sent: July 29, 2020 4:14
> To: Tony Nguyen 
> Cc: da...@davemloft.net; Li,Rongqing ;
> netdev@vger.kernel.org; nhor...@redhat.com; sassm...@redhat.com;
> jeffrey.t.kirs...@intel.com; Andrew Bowers 
> Subject: Re: [net-next 2/6] i40e: prefetch struct page of Rx buffer conditionally
> 
> On Tue, 28 Jul 2020 12:08:38 -0700 Tony Nguyen wrote:
> > From: Li RongQing 
> >
> > page_address() accesses struct page only when WANT_PAGE_VIRTUAL or
> > HASHED_PAGE_VIRTUAL is defined, otherwise it returns address based on
> > offset, so we prefetch it conditionally
> >
> > Signed-off-by: Li RongQing 
> > Tested-by: Andrew Bowers 
> > Signed-off-by: Tony Nguyen 
> > ---
> >  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 ++
> >  1 file changed, 2 insertions(+)
> >
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > index 3e5c566ceb01..5d408fe26063 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > @@ -1953,7 +1953,9 @@ static struct i40e_rx_buffer
> *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
> > struct i40e_rx_buffer *rx_buffer;
> >
> > rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
> > +#if defined(WANT_PAGE_VIRTUAL) || defined(HASHED_PAGE_VIRTUAL)
> > prefetchw(rx_buffer->page);

Maybe I should change prefetchw to prefetch:

Originally the refcount of the rx_buffer page was incremented here,
so prefetchw was needed. But after commit 1793668c3b8c
("i40e/i40evf: Update code to better handle incrementing page
count"), the refcount is no longer incremented every time, so
prefetchw can become prefetch.

> > +#endif
> 
> Looks like something that belongs in a common header not (potentially
> multiple) C sources.

It's not clear to me; how should I change it?

-Li


[PATCH 1/2] xdp/i40e/ixgbe: not flip rx buffer for copy mode xdp

2020-07-24 Thread Li RongQing
i40e/ixgbe_rx_buffer_flip in copy-mode xdp can lead to data
corruption, via the following flow:

   1. the first skb is not for xsk, and is forwarded to another
  device or socket queue
   2. the second skb is for xsk; its data is copied to xsk memory,
  and the page of skb->data is released
   3. the rx_buff is reusable since only the first skb is in it,
  but *_rx_buffer_flip will set page_offset to the first
  skb's data
   4. the rx buffer is then reused, and the first skb, which is
  still live, will be corrupted.

So do not flip the rx buffer for copy-mode xdp.

Fixes: c497176cb2e4 ("xsk: add Rx receive functions and poll support")
Signed-off-by: Li RongQing 
Signed-off-by: Dongsheng Rong 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  5 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  5 -
 include/linux/filter.h| 11 +++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index b3836092c327..a8cea62fdbf5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2394,7 +2394,10 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 
if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
xdp_xmit |= xdp_res;
-   i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
+
+   if (xdp.rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ||
+   xdp_get_map_type_no_direct() != BPF_MAP_TYPE_XSKMAP)
+   i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
} else {
rx_buffer->pagecnt_bias++;
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a8bf941c5c29..e5607ad7ac4f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2351,7 +2351,10 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 
if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) {
xdp_xmit |= xdp_res;
-   ixgbe_rx_buffer_flip(rx_ring, rx_buffer, size);
+
+   if (xdp.rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ||
+   xdp_get_map_type_no_direct() != BPF_MAP_TYPE_XSKMAP)
+   ixgbe_rx_buffer_flip(rx_ring, rx_buffer, size);
} else {
rx_buffer->pagecnt_bias++;
}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 259377723603..3b3103814693 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -919,6 +919,17 @@ static inline void xdp_clear_return_frame_no_direct(void)
ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
 }
 
+static inline enum bpf_map_type xdp_get_map_type_no_direct(void)
+{
+   struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+   struct bpf_map *map = READ_ONCE(ri->map);
+
+   if (map)
+   return map->map_type;
+   else
+   return BPF_MAP_TYPE_UNSPEC;
+}
+
 static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
 unsigned int pktlen)
 {
-- 
2.16.2



[PATCH 2/2] ice/xdp: not adjust rx buffer for copy mode xdp

2020-07-24 Thread Li RongQing
ice_rx_buf_adjust_pg_offset in copy-mode xdp can lead to data
corruption, via the following flow:

   1. the first skb is not for xsk, and is forwarded to another
  device or socket queue
   2. the second skb is for xsk; its data is copied to xsk memory,
  and the page of skb->data is released
   3. the rx_buff is reusable since only the first skb is in it,
  but ice_rx_buf_adjust_pg_offset will set page_offset to
  the first skb's data
   4. the rx buffer is then reused, and the first skb, which is
  still live, will be corrupted.

So only adjust the rx buffer page offset when the xdp memory type is
MEM_TYPE_XSK_BUFF_POOL, or the map type is not BPF_MAP_TYPE_XSKMAP,
which means the memory is released immediately.

Fixes: 2d4238f55697 ("ice: Add support for AF_XDP")
Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/ice/ice_txrx.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index abdb137c8bb7..6ceb1a0c33ae 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -1169,7 +1169,10 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
goto construct_skb;
if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
xdp_xmit |= xdp_res;
-   ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz);
+
+   if (xdp.rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ||
+   xdp_get_map_type_no_direct() != BPF_MAP_TYPE_XSKMAP)
+   ice_rx_buf_adjust_pg_offset(rx_buf, 
xdp.frame_sz);
} else {
rx_buf->pagecnt_bias++;
}
-- 
2.16.2



RE: [Intel-wired-lan] [PATCH 1/2] xdp: i40e: ixgbe: ixgbevf: not flip rx buffer for copy mode xdp

2020-07-21 Thread Li,Rongqing


> -----Original Message-----
> From: Li,Rongqing
> Sent: July 21, 2020 9:43 AM
> To: 'Magnus Karlsson' 
> Cc: Network Development ; intel-wired-lan
> ; Karlsson, Magnus
> ; Björn Töpel 
> Subject: RE: [Intel-wired-lan] [PATCH 1/2] xdp: i40e: ixgbe: ixgbevf: not flip rx
> buffer for copy mode xdp
> 
> 
> 
> > -----Original Message-----
> > From: Magnus Karlsson [mailto:magnus.karls...@gmail.com]
> > Sent: July 20, 2020 3:21 PM
> > To: Li,Rongqing 
> > Cc: Network Development ; intel-wired-lan
> > ; Karlsson, Magnus
> > ; Björn Töpel 
> > Subject: Re: [Intel-wired-lan] [PATCH 1/2] xdp: i40e: ixgbe: ixgbevf: not
> > flip rx buffer for copy mode xdp
> >
> > On Fri, Jul 17, 2020 at 8:24 AM Li RongQing  wrote:
> > >
> > > i40e/ixgbe/ixgbevf_rx_buffer_flip in copy mode xdp can lead to data
> > > corruption, like the following flow:
> > >
> > >1. first skb is not for xsk, and forwarded to another device
> > >   or socket queue
> > >2. seconds skb is for xsk, copy data to xsk memory, and page
> > >   of skb->data is released
> > >3. rx_buff is reusable since only first skb is in it, but
> > >   *_rx_buffer_flip will make that page_offset is set to
> > >   first skb data
> > >4. then reuse rx buffer, first skb which still is living
> > >   will be corrupted.
> > > u32 type; /* enum xdp_mem_type, but known size type */
> > > u32 id;
> > > @@ -73,6 +75,7 @@ struct xdp_buff {
> > > struct xdp_rxq_info *rxq;
> > > struct xdp_txq_info *txq;
> > > u32 frame_sz; /* frame size to deduce data_hard_end/reserved
> > > tailroom*/
> > > +   u32 flags;
> >
> > RongQing,
> >
> > Sorry that I was not clear enough. Could you please submit the simple
> > patch you had, the one that only tests for the memory type.
> >
> > if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
> >   i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
> >
> > I do not think that adding a flags field in the xdp_mem_info to fix an
> > Intel driver problem will be hugely popular. The struct is also meant
> > to contain long lived information, not things that will frequently change.
> >
> 
> 
> Thank you Magnus
> 
> My original suggestion was wrong; it should be the following:
> 
> if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
>i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
> 
> 
> but I feel it is not enough to only check mem.type; it must also be
> ensured that the map_type is BPF_MAP_TYPE_XSKMAP, but that is not
> exposed.
> 
> For other map types, like BPF_MAP_TYPE_DEVMAP, with mem.type
> MEM_TYPE_PAGE_SHARED, not flipping the rx buffer will cause data corruption.
> 
> 
> -Li
> 
> 

How about this?

--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2394,7 +2394,10 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 
 		if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
 			xdp_xmit |= xdp_res;
-			i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
+
+			if (xdp.rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ||
+			    xdp_get_map_type() != BPF_MAP_TYPE_XSKMAP)
+				i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
 		} else {
 			rx_buffer->pagecnt_bias++;
 		}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 259377723603..94f4435a77f3 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -919,6 +919,17 @@ static inline void xdp_clear_return_frame_no_direct(void)
ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
 }
 
+static enum bpf_map_type xdp_get_map_type(void)
+{
+   struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+   struct bpf_map *map = READ_ONCE(ri->map);
+
+   if (map)
+   return map->map_type;
+   else
+   return BPF_MAP_TYPE_UNSPEC;
+}
+
 static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
 unsigned int pktlen)



RE: [Intel-wired-lan] [PATCH 1/2] xdp: i40e: ixgbe: ixgbevf: not flip rx buffer for copy mode xdp

2020-07-20 Thread Li,Rongqing


> -----Original Message-----
> From: Magnus Karlsson [mailto:magnus.karls...@gmail.com]
> Sent: July 20, 2020 3:21 PM
> To: Li,Rongqing 
> Cc: Network Development ; intel-wired-lan
> ; Karlsson, Magnus
> ; Björn Töpel 
> Subject: Re: [Intel-wired-lan] [PATCH 1/2] xdp: i40e: ixgbe: ixgbevf: not flip rx
> buffer for copy mode xdp
> 
> On Fri, Jul 17, 2020 at 8:24 AM Li RongQing  wrote:
> >
> > i40e/ixgbe/ixgbevf_rx_buffer_flip in copy mode xdp can lead to data
> > corruption, like the following flow:
> >
> >1. first skb is not for xsk, and forwarded to another device
> >   or socket queue
> >2. seconds skb is for xsk, copy data to xsk memory, and page
> >   of skb->data is released
> >3. rx_buff is reusable since only first skb is in it, but
> >   *_rx_buffer_flip will make that page_offset is set to
> >   first skb data
> >4. then reuse rx buffer, first skb which still is living
> >   will be corrupted.
> > u32 type; /* enum xdp_mem_type, but known size type */
> > u32 id;
> > @@ -73,6 +75,7 @@ struct xdp_buff {
> > struct xdp_rxq_info *rxq;
> > struct xdp_txq_info *txq;
> > u32 frame_sz; /* frame size to deduce data_hard_end/reserved
> > tailroom*/
> > +   u32 flags;
> 
> RongQing,
> 
> Sorry that I was not clear enough. Could you please submit the simple patch
> you had, the one that only tests for the memory type.
> 
> if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
>   i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
> 
> I do not think that adding a flags field in the xdp_mem_info to fix an Intel 
> driver
> problem will be hugely popular. The struct is also meant to contain long lived
> information, not things that will frequently change.
> 


Thank you Magnus

My original suggestion was wrong; it should be the following:

if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
   i40e_rx_buffer_flip(rx_ring, rx_buffer, size);


but I feel it is not enough to only check mem.type; it must also be
ensured that the map_type is BPF_MAP_TYPE_XSKMAP, but that is not
exposed.

For other map types, like BPF_MAP_TYPE_DEVMAP, with mem.type
MEM_TYPE_PAGE_SHARED, not flipping the rx buffer will cause data corruption.


-Li 





[PATCH 1/2] xdp: i40e: ixgbe: ixgbevf: not flip rx buffer for copy mode xdp

2020-07-16 Thread Li RongQing
i40e/ixgbe/ixgbevf_rx_buffer_flip in copy mode xdp can lead to
data corruption, as in the following flow:

   1. the first skb is not for xsk and is forwarded to another device
      or socket queue
   2. the second skb is for xsk; its data is copied to xsk memory, and
      the page backing skb->data is released
   3. the rx_buff is reusable since only the first skb is in it, but
      *_rx_buffer_flip will set page_offset to the first skb's data
   4. when the rx buffer is then reused, the first skb, which is still
      live, will be corrupted

so add a flags field to the xdp struct to report the status of xdp's
data; the driver then knows whether to flip the rx buffer (a sketch of
the intended flow follows)
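
A minimal sketch of the intended life cycle of the proposed flag
(abbreviated; the identifiers are the ones introduced by this patch):

	/* Per received descriptor:
	 *   1. the driver clears the flags before running the XDP program;
	 *   2. on a copy-mode redirect into an XSKMAP, the AF_XDP receive
	 *      path frees the buffer and sets XDP_DATA_RELEASED;
	 *   3. the driver flips the buffer only when the flag is unset.
	 */
	xdp.flags = 0;			/* driver, before running the XDP program */
	/* ... __xsk_rcv() may do: xdp->flags |= XDP_DATA_RELEASED; ... */
	if (!(xdp.flags & XDP_DATA_RELEASED))
		i40e_rx_buffer_flip(rx_ring, rx_buffer, size);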

Fixes: c497176cb2e4 ("xsk: add Rx receive functions and poll support")
Signed-off-by: Li RongQing 
Signed-off-by: Dongsheng Rong 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 5 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 -
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 5 -
 include/net/xdp.h | 3 +++
 net/xdp/xsk.c | 4 +++-
 5 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index b3836092c327..51fa6f86f917 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2376,6 +2376,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 
 		/* retrieve a buffer from the ring */
 		if (!skb) {
+			xdp.flags = 0;
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
 			xdp.data_meta = xdp.data;
@@ -2394,7 +2395,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 
 		if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
 			xdp_xmit |= xdp_res;
-			i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
+
+			if (!(xdp.flags & XDP_DATA_RELEASED))
+				i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
 		} else {
 			rx_buffer->pagecnt_bias++;
 		}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a8bf941c5c29..9e44a7e1d91c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2333,6 +2333,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 
 		/* retrieve a buffer from the ring */
 		if (!skb) {
+			xdp.flags = 0;
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
 			xdp.data_meta = xdp.data;
@@ -2351,7 +2352,9 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 
 		if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) {
 			xdp_xmit |= xdp_res;
-			ixgbe_rx_buffer_flip(rx_ring, rx_buffer, size);
+
+			if (!(xdp.flags & XDP_DATA_RELEASED))
+				ixgbe_rx_buffer_flip(rx_ring, rx_buffer, size);
 		} else {
 			rx_buffer->pagecnt_bias++;
 		}
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index a39e2cb384dd..1c1a8b6a5dcf 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -1168,6 +1168,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 
 		/* retrieve a buffer from the ring */
 		if (!skb) {
+			xdp.flags = 0;
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
 			xdp.data_meta = xdp.data;
@@ -1184,7 +1185,9 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		if (IS_ERR(skb)) {
 			if (PTR_ERR(skb) == -IXGBEVF_XDP_TX) {
 				xdp_xmit = true;
-				ixgbevf_rx_buffer_flip(rx_ring, rx_buffer,
+
+				if (!(xdp.flags & XDP_DATA_RELEASED))
+					ixgbevf_rx_buffer_flip(rx_ring, rx_buffer,
 						       size);
 			} else {
 				rx_buffer->pagecnt_bias++;
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 609f819ed08b..6b32a01ade19 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h

[PATCH 2/2] ice/xdp: not adjust rx buffer for copy mode xdp

2020-07-16 Thread Li RongQing
ice_rx_buf_adjust_pg_offset in copy mode xdp can lead to data
corruption, as in the following flow:

   1. the first skb is not for xsk and is forwarded to another device
      or socket queue
   2. the second skb is for xsk; its data is copied to xsk memory, and
      the page backing skb->data is released
   3. the rx_buff is reusable since only the first skb is in it, but
      ice_rx_buf_adjust_pg_offset will set page_offset to the first
      skb's data
   4. when the rx buffer is then reused, the first skb, which is still
      live, will be corrupted

so adjust the rx buffer page offset only when the xdp data has not
been released

Fixes: 2d4238f55697 ("ice: Add support for AF_XDP")
Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/ice/ice_txrx.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index abdb137c8bb7..2c58daf4d0d1 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -1147,6 +1147,7 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
 			goto construct_skb;
 		}
 
+		xdp.flags = 0;
 		xdp.data = page_address(rx_buf->page) + rx_buf->page_offset;
 		xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring);
 		xdp.data_meta = xdp.data;
@@ -1169,7 +1170,9 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
 			goto construct_skb;
 		if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
 			xdp_xmit |= xdp_res;
-			ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz);
+
+			if (!(xdp.flags & XDP_DATA_RELEASED))
+				ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz);
 		} else {
 			rx_buf->pagecnt_bias++;
 		}
-- 
2.16.2



[PATCH 0/2] intel/xdp fixes for flipping rx buffer

2020-07-16 Thread Li RongQing
This fixes ice/i40e/ixgbe/ixgbevf_rx_buffer_flip in
copy mode xdp, which can lead to data corruption.

I split this into two patches: since i40e/ixgbe/ixgbevf have supported
xsk receive since 4.18, their fixes are grouped in one patch

Li RongQing (2):
  xdp: i40e: ixgbe: ixgbevf: not flip rx buffer for copy mode xdp
  ice/xdp: not adjust rx buffer for copy mode xdp

 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 5 -
 drivers/net/ethernet/intel/ice/ice_txrx.c | 5 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 -
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 5 -
 include/net/xdp.h | 3 +++
 net/xdp/xsk.c | 4 +++-
 6 files changed, 22 insertions(+), 5 deletions(-)

-- 
2.16.2



RE: [Intel-wired-lan] [bug ?] i40e_rx_buffer_flip should not be called for redirected xsk copy mode

2020-07-15 Thread Li,Rongqing
> > > >
> > > > Thank you RongQing for reporting this. I will take a look at it
> > > > and produce a patch.
> > > >
> > > > /Magnus
> > >
> >
> > Ping
> 
> My apologies RongQing, but it is taking longer than expected due to key people
> being on vacation during this summer period. We are debating whether the
> simple fix you provided covers all cases.
> Hopefully it does, but we just want to make sure. The fix is needed in four
> drivers: the ones you mention plus ice.
> 
> /Magnus

If my RFC patch is suitable for this bug, I will resend it.

Should I send four patches, one per driver, or one patch covering all
of them?


Thanks

-Li




RE: [Intel-wired-lan] [bug ?] i40e_rx_buffer_flip should not be called for redirected xsk copy mode

2020-07-14 Thread Li,Rongqing


> -----Original Message-----
> From: Li,Rongqing
> Sent: July 6, 2020 2:38 PM
> To: 'Magnus Karlsson' 
> Cc: intel-wired-lan ; Björn Töpel
> ; Karlsson, Magnus ;
> Netdev 
> Subject: RE: [Intel-wired-lan] [bug ?] i40e_rx_buffer_flip should not be called
> for redirected xsk copy mode
> 
> 
> 
> > -----Original Message-----
> > From: Magnus Karlsson [mailto:magnus.karls...@gmail.com]
> > Sent: July 6, 2020 2:13 PM
> > To: Li,Rongqing 
> > Cc: intel-wired-lan ; Björn Töpel
> > ; Karlsson, Magnus ;
> > Netdev 
> > Subject: Re: [Intel-wired-lan] [bug ?] i40e_rx_buffer_flip should not be
> > called for redirected xsk copy mode
> >
> > Thank you RongQing for reporting this. I will take a look at it and
> > produce a patch.
> >
> > /Magnus
> 

Ping


-Li


RE: [Intel-wired-lan] [bug ?] i40e_rx_buffer_flip should not be called for redirected xsk copy mode

2020-07-05 Thread Li,Rongqing


> -----Original Message-----
> From: Magnus Karlsson [mailto:magnus.karls...@gmail.com]
> Sent: July 6, 2020 2:13 PM
> To: Li,Rongqing 
> Cc: intel-wired-lan ; Björn Töpel
> ; Karlsson, Magnus ;
> Netdev 
> Subject: Re: [Intel-wired-lan] [bug ?] i40e_rx_buffer_flip should not be called for
> redirected xsk copy mode
> 
> Thank you RongQing for reporting this. I will take a look at it and produce a
> patch.
> 
> /Magnus

Thanks, and ixgbevf/ixgbe have the same issue.

drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

I have an RFC; it may not be better since it adds a new variable:
http://patchwork.ozlabs.org/project/netdev/patch/1593763926-24292-1-git-send-email-lirongq...@baidu.com/

Or check the memory type:

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index b3836092c327..f41664e0c84e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2394,7 +2394,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 
 		if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
 			xdp_xmit |= xdp_res;
-			i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
+
+			if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
+				i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
 		} else {
 			rx_buffer->pagecnt_bias++;
 		}
Either way, I would like to see your patch.

-Li RongQing


[PATCH][RFC] i40e: not flip rx buffer for copy mode xdp

2020-07-03 Thread Li RongQing
i40e_rx_buffer_flip in copy mode xdp can lead to data corruption,
as in the following flow:

   1. the first skb is not for xsk and is forwarded to another device
      or socket queue
   2. the second skb is for xsk; its data is copied to xsk memory, and
      the page backing skb->data is released
   3. the rx_buff is reusable since only the first skb is in it, but
      i40e_rx_buffer_flip will set page_offset to the first skb's data
   4. when the rx buffer is then reused, the first skb, which is still
      live, will be corrupted

so add a flags field to the xdp struct to report the status of xdp's data

Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 5 -
 include/net/xdp.h   | 3 +++
 net/xdp/xsk.c   | 4 +++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index b3836092c327..51fa6f86f917 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2376,6 +2376,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 
 		/* retrieve a buffer from the ring */
 		if (!skb) {
+			xdp.flags = 0;
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
 			xdp.data_meta = xdp.data;
@@ -2394,7 +2395,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 
 		if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
 			xdp_xmit |= xdp_res;
-			i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
+
+			if (!(xdp.flags & XDP_DATA_RELEASED))
+				i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
 		} else {
 			rx_buffer->pagecnt_bias++;
 		}
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 609f819ed08b..6241e1efbcc7 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -47,6 +47,8 @@ enum xdp_mem_type {
 #define XDP_XMIT_FLUSH (1U << 0)   /* doorbell signal consumer */
 #define XDP_XMIT_FLAGS_MASKXDP_XMIT_FLUSH
 
+#define XDP_DATA_RELEASED (1U << 1)
+
 struct xdp_mem_info {
u32 type; /* enum xdp_mem_type, but known size type */
u32 id;
@@ -73,6 +75,7 @@ struct xdp_buff {
struct xdp_rxq_info *rxq;
struct xdp_txq_info *txq;
u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
+   u32 flags;
 };
 
 /* Reserve memory area at end-of data area.
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b6c0f08bd80d..2c4c5c16660b 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -172,8 +172,10 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
xsk_buff_free(xsk_xdp);
return err;
}
-   if (explicit_free)
+   if (explicit_free) {
xdp_return_buff(xdp);
+   xdp->flags |= XDP_DATA_RELEASED;
+   }
return 0;
 }
 
-- 
2.16.2



[PATCH][net-next] i40e: prefetch struct page of rx buffer conditionally

2020-07-01 Thread Li RongQing
page_address() accesses struct page only when WANT_PAGE_VIRTUAL
or HASHED_PAGE_VIRTUAL is defined; otherwise it returns an address
computed from the offset, so we prefetch it conditionally
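
A minimal user-space model of the distinction (illustrative only, not
the kernel implementation):

#include <stddef.h>

struct page_meta { void *virtual; };	/* stands in for struct page */

static char lowmem[16 * 4096];

/* default case: the address is pure arithmetic on the page index, so
 * the metadata object is never dereferenced and prefetching it does
 * not help this path */
static void *addr_by_offset(size_t pfn)
{
	return &lowmem[pfn * 4096];
}

/* WANT_PAGE_VIRTUAL-like case: reads a pointer stored in the metadata,
 * so prefetching the metadata ahead of time can hide the cache miss */
static void *addr_by_virtual(const struct page_meta *m)
{
	return m->virtual;
}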

Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index f9555c847f73..b3836092c327 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1953,7 +1953,9 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *rx_buffer;
 
rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+#if defined(WANT_PAGE_VIRTUAL) || defined(HASHED_PAGE_VIRTUAL)
prefetchw(rx_buffer->page);
+#endif
 
/* we are reusing so sync this buffer for CPU use */
dma_sync_single_range_for_cpu(rx_ring->dev,
-- 
2.16.2



[PATCH] i40e: not compute affinity_mask for IRQ

2020-07-01 Thread Li RongQing
After commit 759dc4a7e605 ("i40e: initialize our affinity_mask
based on cpu_possible_mask"), the NAPI IRQ affinity_mask is bound
to all possible cpus, not a fixed cpu

Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5d807c8004f8..a63548cb022d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11156,11 +11156,10 @@ static int i40e_init_msix(struct i40e_pf *pf)
  * i40e_vsi_alloc_q_vector - Allocate memory for a single interrupt vector
  * @vsi: the VSI being configured
  * @v_idx: index of the vector in the vsi struct
- * @cpu: cpu to be used on affinity_mask
  *
  * We allocate one q_vector.  If allocation fails we return -ENOMEM.
  **/
-static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx, int cpu)
+static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx)
 {
struct i40e_q_vector *q_vector;
 
@@ -11193,7 +11192,7 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx, int cpu)
 static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi)
 {
struct i40e_pf *pf = vsi->back;
-   int err, v_idx, num_q_vectors, current_cpu;
+   int err, v_idx, num_q_vectors;
 
/* if not MSIX, give the one vector only to the LAN VSI */
if (pf->flags & I40E_FLAG_MSIX_ENABLED)
@@ -11203,15 +11202,10 @@ static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi)
else
return -EINVAL;
 
-   current_cpu = cpumask_first(cpu_online_mask);
-
for (v_idx = 0; v_idx < num_q_vectors; v_idx++) {
-   err = i40e_vsi_alloc_q_vector(vsi, v_idx, current_cpu);
+   err = i40e_vsi_alloc_q_vector(vsi, v_idx);
if (err)
goto err_out;
-   current_cpu = cpumask_next(current_cpu, cpu_online_mask);
-   if (unlikely(current_cpu >= nr_cpu_ids))
-   current_cpu = cpumask_first(cpu_online_mask);
}
 
return 0;
-- 
2.16.2



[PATCH] xdp: fix xsk_generic_xmit errno

2020-06-10 Thread Li RongQing
propagate the sock_alloc_send_skb error code instead of setting it
to EAGAIN unconditionally when skb allocation fails; otherwise user
space may loop unnecessarily
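
A hedged user-space sketch of why the exact errno matters (the helper
name and retry policy are illustrative, not from any library):

#include <errno.h>
#include <sys/socket.h>

/* Kick the kernel to transmit from an AF_XDP TX ring. If every
 * allocation failure were folded into EAGAIN, this loop would spin
 * on errors that are not transient. */
static int kick_tx(int xsk_fd)
{
	for (;;) {
		if (sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0) == 0)
			return 0;
		if (errno == EAGAIN || errno == EBUSY || errno == EINTR)
			continue;	/* transient: try again */
		return -errno;		/* real failure: stop looping */
	}
}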

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Li RongQing 
---
 net/xdp/xsk.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b6c0f08bd80d..1ba3ea262c15 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -353,7 +353,6 @@ static int xsk_generic_xmit(struct sock *sk)
len = desc.len;
skb = sock_alloc_send_skb(sk, len, 1, &err);
if (unlikely(!skb)) {
-   err = -EAGAIN;
goto out;
}
 
-- 
2.16.2



RE: [PATCH] i40e: fix wrong index in i40e_xsk_umem_dma_map

2020-06-02 Thread Li,Rongqing


> -----Original Message-----
> From: Björn Töpel [mailto:bjorn.to...@gmail.com]
> Sent: June 2, 2020 7:27 PM
> To: Li,Rongqing ; intel-wired-lan
> ; Netdev 
> Cc: bpf ; Karlsson, Magnus
> 
> Subject: Re: [PATCH] i40e: fix wrong index in i40e_xsk_umem_dma_map
> 
> On Tue, 2 Jun 2020 at 11:20, Li RongQing  wrote:
> >
> 
> Li, thanks for the patch! Good catch!
> 
> Please add a proper description for the patch. The fix should be added to the
> stable branches (5.7 and earlier). Note that this code was recently removed in
> favor of the new AF_XDP buffer allocation scheme.
> 
> 

Ok

-LiRongQing 

> Björn
> 
> > Fixes: 0a714186d3c0 "(i40e: add AF_XDP zero-copy Rx support)"
> > Signed-off-by: Li RongQing 
> > ---
> >  drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > index 0b7d29192b2c..c926438118ea 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > @@ -37,9 +37,9 @@ static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
> >
> >  out_unmap:
> > 	for (j = 0; j < i; j++) {
> > -		dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
> > +		dma_unmap_page_attrs(dev, umem->pages[j].dma, PAGE_SIZE,
> > 				     DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
> > -		umem->pages[i].dma = 0;
> > +		umem->pages[j].dma = 0;
> > }
> >
> > return -1;
> > --
> > 2.16.2
> >


[PATCH][v2] i40e: fix wrong index in i40e_xsk_umem_dma_map

2020-06-02 Thread Li RongQing
The DMA mappings should be unmapped in the rollback path from
umem->pages[0].dma through umem->pages[i-1].dma, which is the
last address that was mapped
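
A minimal sketch of the generic map/rollback pattern the fix restores
(struct page_dma and the map_one/unmap_one helpers are placeholders):

struct page_dma;
int map_one(struct page_dma *p);
void unmap_one(struct page_dma *p);

static int map_all(struct page_dma *pages, int n)
{
	int i, j;

	for (i = 0; i < n; i++) {
		if (map_one(&pages[i]) < 0)
			goto out_unmap;
	}
	return 0;

out_unmap:
	for (j = 0; j < i; j++)
		unmap_one(&pages[j]);	/* index with j: entry i was never mapped */
	return -1;
}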

Fixes: 0a714186d3c0 ("i40e: add AF_XDP zero-copy Rx support")
Signed-off-by: Li RongQing 
---
diff with v1: add description

 drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 0b7d29192b2c..c926438118ea 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -37,9 +37,9 @@ static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
 
 out_unmap:
for (j = 0; j < i; j++) {
-   dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
+   dma_unmap_page_attrs(dev, umem->pages[j].dma, PAGE_SIZE,
 DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
-   umem->pages[i].dma = 0;
+   umem->pages[j].dma = 0;
}
 
return -1;
-- 
2.16.2



[PATCH] i40e: fix wrong index in i40e_xsk_umem_dma_map

2020-06-02 Thread Li RongQing
Fixes: 0a714186d3c0 ("i40e: add AF_XDP zero-copy Rx support")
Signed-off-by: Li RongQing 
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 0b7d29192b2c..c926438118ea 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -37,9 +37,9 @@ static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
 
 out_unmap:
for (j = 0; j < i; j++) {
-   dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
+   dma_unmap_page_attrs(dev, umem->pages[j].dma, PAGE_SIZE,
 DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
-   umem->pages[i].dma = 0;
+   umem->pages[j].dma = 0;
}
 
return -1;
-- 
2.16.2



RE: [PATCH] openvswitch: change type of UPCALL_PID attribute to NLA_UNSPEC

2019-09-25 Thread Li,Rongqing


> -----Original Message-----
> From: Pravin Shelar [mailto:pshe...@ovn.org]
> Sent: September 26, 2019 5:03 AM
> To: Li,Rongqing 
> Cc: Linux Kernel Network Developers 
> Subject: Re: [PATCH] openvswitch: change type of UPCALL_PID attribute to NLA_UNSPEC
> 
> On Tue, Sep 24, 2019 at 4:11 AM Li RongQing  wrote:
> >
> > The userspace openvswitch patch ("dpif-linux: Implement the API functions
> > to allow multiple handler threads read upcall")
> > changed this attribute's type from U32 to UNSPEC, but left the kernel
> > unchanged
> >
> > and after kernel commit 6e237d099fac ("netlink: Relax attr validation for
> > fixed length types"), this bug is exposed by the below warning
> >
> > [   57.215841] netlink: 'ovs-vswitchd': attribute type 5 has an invalid length.
> >
> > Signed-off-by: Li RongQing 
> 
> Acked-by: Pravin B Shelar 
> 

Add a Fixes tag:

Fixes: 5cd667b0a456 ("openvswitch: Allow each vport to have an array of 'port_id's")

-LI

> Thanks,
> Pravin.


[PATCH] openvswitch: change type of UPCALL_PID attribute to NLA_UNSPEC

2019-09-24 Thread Li RongQing
The userspace openvswitch patch ("dpif-linux: Implement the API
functions to allow multiple handler threads read upcall")
changed this attribute's type from U32 to UNSPEC, but left the
kernel unchanged

and after kernel commit 6e237d099fac ("netlink: Relax attr validation
for fixed length types"), this bug is exposed by the below warning

[   57.215841] netlink: 'ovs-vswitchd': attribute type 5 has an invalid length.
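
For context, a minimal sketch (with made-up attribute names) of how the
two policy types validate differently:

enum { EX_ATTR_UNSPEC, EX_ATTR_FIXED, EX_ATTR_BLOB, __EX_ATTR_MAX };

static const struct nla_policy example_policy[__EX_ATTR_MAX] = {
	[EX_ATTR_FIXED] = { .type = NLA_U32 },    /* validator requires len == sizeof(u32) */
	[EX_ATTR_BLOB]  = { .type = NLA_UNSPEC }, /* variable length, no fixed-size check */
};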

Signed-off-by: Li RongQing 
---
 net/openvswitch/datapath.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index dde9d762edee..f30e406fbec5 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2294,7 +2294,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
[OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
-   [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+   [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC },
[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
[OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 },
-- 
2.16.2



RE: [PATCH][net-next] net: drop_monitor: change the stats variable to u64 in net_dm_stats_put

2019-08-22 Thread Li,Rongqing


> -----Original Message-----
> From: Ido Schimmel [mailto:ido...@idosch.org]
> Sent: August 22, 2019 8:00 PM
> To: Li,Rongqing 
> Cc: netdev@vger.kernel.org; ido...@mellanox.com
> Subject: Re: [PATCH][net-next] net: drop_monitor: change the stats variable to
> u64 in net_dm_stats_put
> 
> On Thu, Aug 22, 2019 at 02:22:33PM +0800, Li RongQing wrote:
> > only the element drop of struct net_dm_stats is used, so simplify it
> > to u64
> 
> Thanks for the patch, but I don't really see the value here. The struct 
> allows for
> easy extensions in the future. What do you gain from this change? We merely
> read stats and report them to user space, so I guess it's not about
> performance either.
> 

I think using a u64 avoids the memset() call and the stats.dropped
dereference.

If the struct is kept for future extension, then keep it.

-RongQing


[PATCH][net-next] net: drop_monitor: change the stats variable to u64 in net_dm_stats_put

2019-08-21 Thread Li RongQing
only the 'dropped' element of struct net_dm_stats is used, so simplify it to a u64

Signed-off-by: Li RongQing 
---
 net/core/drop_monitor.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index bfc024024aa3..ed10a40cf629 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -1329,11 +1329,11 @@ static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info)
return rc;
 }
 
-static void net_dm_stats_read(struct net_dm_stats *stats)
+static void net_dm_stats_read(u64 *stats)
 {
int cpu;
 
-   memset(stats, 0, sizeof(*stats));
+   *stats = 0;
for_each_possible_cpu(cpu) {
struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
struct net_dm_stats *cpu_stats = &data->stats;
@@ -1345,14 +1345,14 @@ static void net_dm_stats_read(struct net_dm_stats *stats)
dropped = cpu_stats->dropped;
} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
 
-   stats->dropped += dropped;
+   *stats += dropped;
}
 }
 
 static int net_dm_stats_put(struct sk_buff *msg)
 {
-   struct net_dm_stats stats;
struct nlattr *attr;
+   u64 stats;
 
net_dm_stats_read(&stats);
 
@@ -1361,7 +1361,7 @@ static int net_dm_stats_put(struct sk_buff *msg)
return -EMSGSIZE;
 
if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
- stats.dropped, NET_DM_ATTR_PAD))
+ stats, NET_DM_ATTR_PAD))
goto nla_put_failure;
 
nla_nest_end(msg, attr);
-- 
2.16.2



[PATCH][V2] net: fix __ip_mc_inc_group usage

2019-08-19 Thread Li RongQing
in ip_mc_inc_group, a memory allocation flag, not the mcast mode, is
expected by __ip_mc_inc_group

there is a similar issue in __ip_mc_join_group: both the mcast mode and
a gfp_t are needed there, so use ip_mc_inc_group(...)

Fixes: 9fb20801dab4 ("net: Fix ip_mc_{dec,inc}_group allocation context")
Signed-off-by: Li RongQing 
Signed-off-by: Florian Fainelli 
Signed-off-by: Zhang Yu 
---
v1-->v2:
fixes "Fixes tag"
use ip_mc_inc_group in __ip_mc_join_group

 net/ipv4/igmp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 180f6896b98b..480d0b22db1a 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(__ip_mc_inc_group);
 
 void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 {
-   __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+   __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
 }
 EXPORT_SYMBOL(ip_mc_inc_group);
 
@@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
iml->sflist = NULL;
iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
-   __ip_mc_inc_group(in_dev, addr, mode);
+   ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
err = 0;
 done:
return err;
-- 
2.16.2



RE: [PATCH] net: Fix __ip_mc_inc_group argument 3 input

2019-08-19 Thread Li,Rongqing
> 
> 
> On 8/19/2019 7:25 PM, Li RongQing wrote:
> > It expects gfp_t, but got unsigned int mode
> >
> > Fixes: 6e2059b53f98 ("ipv4/igmp: init group mode as INCLUDE when join
> > source group")
> > Signed-off-by: Li RongQing 
> > Signed-off-by: Zhang Yu 
> 
> You have identified a problem, but I don't think it came from this commit,
> rather from:
> 
> 9fb20801dab4 ("net: Fix ip_mc_{dec,inc}_group allocation context")
> 
> see below for details.
> 
> > ---
> >  net/ipv4/igmp.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index
> > 180f6896b98b..b8352d716253 100644
> > --- a/net/ipv4/igmp.c
> > +++ b/net/ipv4/igmp.c
> > @@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(__ip_mc_inc_group);
> >
> >  void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)  {
> > -   __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
> > +   __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
> 
> That part looks fine.
> 
> >  }
> >  EXPORT_SYMBOL(ip_mc_inc_group);
> >
> > @@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk,
> struct ip_mreqn *imr,
> > iml->sflist = NULL;
> > iml->sfmode = mode;
> > rcu_assign_pointer(inet->mc_list, iml);
> > -   __ip_mc_inc_group(in_dev, addr, mode);
> > +   __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
> 
> But here, we probably want to pass both mode and gfp_t and use
> ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL):
> 
> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index
> 180f6896b98b..2459b5e3fd98 100644
> --- a/net/ipv4/igmp.c
> +++ b/net/ipv4/igmp.c
> @@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk,
> struct ip_mreqn *imr,
> iml->sflist = NULL;
> iml->sfmode = mode;
> rcu_assign_pointer(inet->mc_list, iml);
> -   __ip_mc_inc_group(in_dev, addr, mode);
> +   ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
> err = 0;
>  done:
> return err;
> 
> What do you think?


You are right, I will send V2

Thanks

-RongQing




[PATCH] net: fix icmp_socket_deliver argument 2 input

2019-08-19 Thread Li RongQing
it expects an unsigned int in host byte order, but got a __be32
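
A small user-space illustration of the byte-order conversion:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t gw_be = htonl(0xc0a80001);	/* 192.168.0.1 as carried in the header */

	printf("raw   : 0x%08x\n", gw_be);	  /* byte-swapped on little-endian hosts */
	printf("ntohl : 0x%08x\n", ntohl(gw_be)); /* 0xc0a80001 on every host */
	return 0;
}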

Signed-off-by: Li RongQing 
Signed-off-by: Zhang Yu 
---
 net/ipv4/icmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1510e951f451..bf7b5d45de99 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -902,7 +902,7 @@ static bool icmp_redirect(struct sk_buff *skb)
return false;
}
 
-   icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+   icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway));
return true;
 }
 
-- 
2.16.2



[PATCH] net: Fix __ip_mc_inc_group argument 3 input

2019-08-19 Thread Li RongQing
It expects a gfp_t, but got the unsigned int mode

Fixes: 6e2059b53f98 ("ipv4/igmp: init group mode as INCLUDE when join source 
group")
Signed-off-by: Li RongQing 
Signed-off-by: Zhang Yu 
---
 net/ipv4/igmp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 180f6896b98b..b8352d716253 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(__ip_mc_inc_group);
 
 void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 {
-   __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+   __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
 }
 EXPORT_SYMBOL(ip_mc_inc_group);
 
@@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
iml->sflist = NULL;
iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
-   __ip_mc_inc_group(in_dev, addr, mode);
+   __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
err = 0;
 done:
return err;
-- 
2.16.2



[PATCH][net-next] net: remove empty inet_exit_net

2019-08-19 Thread Li RongQing
Pointer members of an object with static storage duration, if not
explicitly initialized, will be initialized to a NULL pointer. The
net namespace API checks that this pointer is not NULL before using
it, so it is safe to remove the function.
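
A minimal sketch of the convention that makes the empty callback
removable (the struct below is a stand-in for pernet_operations):

struct ops_sketch {
	int  (*init)(void);
	void (*exit)(void);	/* left NULL by static initialization */
};

static struct ops_sketch my_ops;	/* .exit never set, stays NULL */

static void run_exit(const struct ops_sketch *o)
{
	if (o->exit)		/* a NULL callback is simply skipped */
		o->exit();
}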

Signed-off-by: Li RongQing 
---
 net/ipv4/af_inet.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ed2301ef872e..70f92aaca411 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1845,13 +1845,8 @@ static __net_init int inet_init_net(struct net *net)
return 0;
 }
 
-static __net_exit void inet_exit_net(struct net *net)
-{
-}
-
 static __net_initdata struct pernet_operations af_inet_ops = {
.init = inet_init_net,
-   .exit = inet_exit_net,
 };
 
 static int __init init_inet_pernet_ops(void)
-- 
2.16.2



[PATCH][net-next] net: remove unused parameter from skb_checksum_try_convert

2019-07-04 Thread Li RongQing
the 'check' parameter is never used

Signed-off-by: Li RongQing 
---
 include/linux/skbuff.h | 8 +++-
 net/ipv4/gre_demux.c   | 2 +-
 net/ipv4/udp.c | 3 +--
 net/ipv6/udp.c | 3 +--
 4 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c922ac8a8bd6..f0b5adeb644d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3914,18 +3914,16 @@ static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
 }
 
-static inline void __skb_checksum_convert(struct sk_buff *skb,
- __sum16 check, __wsum pseudo)
+static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
 {
skb->csum = ~pseudo;
skb->ip_summed = CHECKSUM_COMPLETE;
 }
 
-#define skb_checksum_try_convert(skb, proto, check, compute_pseudo)\
+#define skb_checksum_try_convert(skb, proto, compute_pseudo)   \
 do {   \
if (__skb_checksum_convert_check(skb))  \
-   __skb_checksum_convert(skb, check,  \
-  compute_pseudo(skb, proto)); \
+   __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \
 } while (0)
 
 static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 293acfb36376..44bfeecac33e 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -83,7 +83,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
if (!skb_checksum_simple_validate(skb)) {
-   skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+   skb_checksum_try_convert(skb, IPPROTO_GRE,
 null_compute_pseudo);
} else if (csum_err) {
*csum_err = true;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1b971bd95786..c21862ba9c02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2224,8 +2224,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int ret;
 
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
-   skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
-inet_compute_pseudo);
+   skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
 
ret = udp_queue_rcv_skb(sk, skb);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 66ca5a4b17c4..4406e059da68 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -826,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int ret;
 
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
-   skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
-ip6_compute_pseudo);
+   skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);
 
ret = udpv6_queue_rcv_skb(sk, skb);
 
-- 
2.16.2



RE: [PATCH] xfrm: use list_for_each_entry_safe in xfrm_policy_flush

2019-07-01 Thread Li,Rongqing


> -----Original Message-----
> From: Florian Westphal [mailto:f...@strlen.de]
> Sent: July 1, 2019 5:04 PM
> To: Li,Rongqing 
> Cc: netdev@vger.kernel.org
> Subject: Re: [PATCH] xfrm: use list_for_each_entry_safe in xfrm_policy_flush
> 
> Li RongQing  wrote:
> > The iterated pol maybe be freed since it is not protected by RCU or
> > spinlock when put it, lead to UAF, so use _safe function to iterate
> > over it against removal
> >
> > Signed-off-by: Li RongQing 
> > ---
> >  net/xfrm/xfrm_policy.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index
> > 3235562f6588..87d770dab1f5 100644
> > --- a/net/xfrm/xfrm_policy.c
> > +++ b/net/xfrm/xfrm_policy.c
> > @@ -1772,7 +1772,7 @@ xfrm_policy_flush_secctx_check(struct net *net,
> > u8 type, bool task_valid)  int xfrm_policy_flush(struct net *net, u8
> > type, bool task_valid)  {
> > int dir, err = 0, cnt = 0;
> > -   struct xfrm_policy *pol;
> > +   struct xfrm_policy *pol, *tmp;
> >
> > spin_lock_bh(&net->xfrm.xfrm_policy_lock);
> >
> > @@ -1781,7 +1781,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
> > goto out;
> >
> >  again:
> > -   list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
> > +	list_for_each_entry_safe(pol, tmp, &net->xfrm.policy_all, walk.all) {
> > dir = xfrm_policy_id2dir(pol->index);
> > if (pol->walk.dead ||
> > dir >= XFRM_POLICY_MAX ||
> 
> This function drops the lock, but after re-acquire jumps to the 'again'
> label, so I do not see the UAF as the entire loop gets restarted.

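A sketch of the restart pattern being described (pol_needs_delete and
do_delete are placeholders for the flush logic):

bool pol_needs_delete(struct xfrm_policy *pol);	/* placeholder */
void do_delete(struct xfrm_policy *pol);	/* placeholder */

static void flush_sketch(struct net *net)
{
	struct xfrm_policy *pol;

again:
	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
		if (pol_needs_delete(pol)) {
			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
			do_delete(pol);
			goto again;	/* never continue from a stale cursor */
		}
	}
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
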
You are right, sorry for the noise

-Li


[PATCH] xfrm: use list_for_each_entry_safe in xfrm_policy_flush

2019-07-01 Thread Li RongQing
The iterated pol may be freed, since it is not protected by RCU or a
spinlock when it is put, leading to a use-after-free; so use the _safe
variant to iterate safely against removal

Signed-off-by: Li RongQing 
---
 net/xfrm/xfrm_policy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 3235562f6588..87d770dab1f5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1772,7 +1772,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
 int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 {
int dir, err = 0, cnt = 0;
-   struct xfrm_policy *pol;
+   struct xfrm_policy *pol, *tmp;
 
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 
@@ -1781,7 +1781,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
goto out;
 
 again:
-   list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+   list_for_each_entry_safe(pol, tmp, &net->xfrm.policy_all, walk.all) {
dir = xfrm_policy_id2dir(pol->index);
if (pol->walk.dead ||
dir >= XFRM_POLICY_MAX ||
-- 
2.16.2



[PATCH][net-next] netns: restore ops before calling ops_exit_list

2019-06-20 Thread Li RongQing
ops has already been iterated back to the first element when pre_exit
is called, so it needs to be restored from saved_ops, not saved into
saved_ops
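
An abbreviated sketch of the surrounding setup_net() error path (helper
names as in net/core/net_namespace.c), showing why the cursor must be
restored rather than re-saved:

	list_for_each_entry(ops, &pernet_list, list) {
		error = ops_init(ops, net);
		if (error < 0)
			goto out_undo;
	}
	return 0;

out_undo:
	/* walk backwards from the op whose init failed */
	saved_ops = ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_pre_exit_list(ops, &net_exit_list);

	synchronize_rcu();

	/* ops is exhausted now; restore it before the second reverse walk */
	ops = saved_ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_exit_list(ops, &net_exit_list);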

Fixes: d7d99872c144 ("netns: add pre_exit method to struct pernet_operations")
Signed-off-by: Li RongQing 
---
 net/core/net_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 89dc99a28978..198ce503ae73 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -345,7 +345,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 
synchronize_rcu();
 
-   saved_ops = ops;
+   ops = saved_ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
 
-- 
2.16.2



[PATCH][net-next] net: remove empty netlink_tap_exit_net

2019-06-13 Thread Li RongQing
Pointer members of an object with static storage duration, if not
explicitly initialized, will be initialized to a NULL pointer. The
net namespace API checks that this pointer is not NULL before using
it, so it is safe to remove the function.

Signed-off-by: Li RongQing 
---
 net/netlink/af_netlink.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 718a97d5f1fd..9a2cd648d9c6 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -245,13 +245,8 @@ static __net_init int netlink_tap_init_net(struct net *net)
return 0;
 }
 
-static void __net_exit netlink_tap_exit_net(struct net *net)
-{
-}
-
 static struct pernet_operations netlink_tap_net_ops = {
.init = netlink_tap_init_net,
-   .exit = netlink_tap_exit_net,
.id   = &netlink_tap_net_id,
.size = sizeof(struct netlink_tap_net),
 };
-- 
2.16.2



[PATCH] xfrm: remove empty xfrmi_init_net

2019-06-13 Thread Li RongQing
Pointer members of an object with static storage duration, if not
explicitly initialized, will be initialized to a NULL pointer. The
net namespace API checks that this pointer is not NULL before using
it, so it is safe to remove the function.

Signed-off-by: Li RongQing 
---
 net/xfrm/xfrm_interface.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 6be8c7df15bb..2c9c4108a426 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -782,11 +782,6 @@ static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
unregister_netdevice_many(&list);
 }
 
-static int __net_init xfrmi_init_net(struct net *net)
-{
-   return 0;
-}
-
 static void __net_exit xfrmi_exit_net(struct net *net)
 {
struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
@@ -797,7 +792,6 @@ static void __net_exit xfrmi_exit_net(struct net *net)
 }
 
 static struct pernet_operations xfrmi_net_ops = {
-   .init = xfrmi_init_net,
.exit = xfrmi_exit_net,
.id   = &xfrmi_net_id,
.size = sizeof(struct xfrmi_net),
-- 
2.16.2



RE: [PATCH][v2] net: ethtool: not call vzalloc for zero sized memory request

2019-03-29 Thread Li,Rongqing


> -----Original Message-----
> From: Michal Kubecek [mailto:mkube...@suse.cz]
> Sent: March 29, 2019 5:29 PM
> To: Li,Rongqing 
> Cc: netdev@vger.kernel.org
> Subject: Re: [PATCH][v2] net: ethtool: not call vzalloc for zero sized memory
> request
> 
> On Fri, Mar 29, 2019 at 09:18:02AM +0800, Li RongQing wrote:
> > NULL or ZERO_SIZE_PTR will be returned for a zero sized memory request,
> > and dereferencing them will lead to a segfault
> >
> > so it is unnecessary to call vzalloc for a zero sized memory request and
> > not call functions which may dereference the NULL allocated memory
> >
> > this also fixes a possible memory leak if phy_ethtool_get_stats
> > returns an error; memory should be freed before exit
> >
> > Signed-off-by: Li RongQing 
> > Reviewed-by: Wang Li 
> > ---
> > v1->v2: not call get_ethtool_stats if n_stats is 0
> >
> >  net/core/ethtool.c | 46
> > ++
> >  1 file changed, 30 insertions(+), 16 deletions(-)
> 
> This looks correct, I have just one minor comment below.
> 
> > diff --git a/net/core/ethtool.c b/net/core/ethtool.c index
> > b1eb32419732..36ed619faf36 100644
> > --- a/net/core/ethtool.c
> > +++ b/net/core/ethtool.c
> > @@ -1797,11 +1797,16 @@ static int ethtool_get_strings(struct net_device
> *dev, void __user *useraddr)
> > WARN_ON_ONCE(!ret);
> >
> > gstrings.len = ret;
> > -   data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
> > -   if (gstrings.len && !data)
> > -   return -ENOMEM;
> >
> > -   __ethtool_get_strings(dev, gstrings.string_set, data);
> > +   if (gstrings.len) {
> > +   data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
> > +   if (!data)
> > +   return -ENOMEM;
> > +
> > +   __ethtool_get_strings(dev, gstrings.string_set, data);
> > +   } else {
> > +   data = NULL;
> > +   }
> >
> > ret = -EFAULT;
> > if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
> 
> If gstrings.len is zero, there is no need to set data pointer as it's not 
> going to be
> used so that you could omit the else branch. It's the same in the other two
> functions.
> 
I think it is a must:
data is an uninitialized stack variable, and once it is passed into
vfree() it will cause a panic

thanks

-RongQing

> Michal


[PATCH][v2] net: ethtool: not call vzalloc for zero sized memory request

2019-03-28 Thread Li RongQing
NULL or ZERO_SIZE_PTR will be returned for a zero sized memory
request, and dereferencing them will lead to a segfault

so it is unnecessary to call vzalloc for a zero sized memory
request, and the functions which may dereference the NULL
allocation should not be called

this also fixes a possible memory leak: if phy_ethtool_get_stats
returns an error, the memory should be freed before exit
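
A condensed sketch of the shape this change gives each handler
(device_like and fill_stats are placeholders):

struct device_like;
void fill_stats(struct device_like *dev, u64 *data);

static int get_stats_sketch(struct device_like *dev, unsigned int n_stats,
			    u64 **out)
{
	u64 *data = NULL;

	if (n_stats) {
		data = vzalloc(array_size(n_stats, sizeof(u64)));
		if (!data)
			return -ENOMEM;
		fill_stats(dev, data);	/* only called when a buffer exists */
	}
	*out = data;	/* NULL when n_stats == 0; must not be dereferenced */
	return 0;
}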

Signed-off-by: Li RongQing 
Reviewed-by: Wang Li 
---
v1->v2: not call get_ethtool_stats if n_stats is 0 

 net/core/ethtool.c | 46 ++
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index b1eb32419732..36ed619faf36 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1797,11 +1797,16 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
WARN_ON_ONCE(!ret);
 
gstrings.len = ret;
-   data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
-   if (gstrings.len && !data)
-   return -ENOMEM;
 
-   __ethtool_get_strings(dev, gstrings.string_set, data);
+   if (gstrings.len) {
+   data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
+   if (!data)
+   return -ENOMEM;
+
+   __ethtool_get_strings(dev, gstrings.string_set, data);
+   } else {
+   data = NULL;
+   }
 
ret = -EFAULT;
if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
@@ -1897,11 +1902,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
return -EFAULT;
 
stats.n_stats = n_stats;
-   data = vzalloc(array_size(n_stats, sizeof(u64)));
-   if (n_stats && !data)
-   return -ENOMEM;
 
-   ops->get_ethtool_stats(dev, &stats, data);
+   if (n_stats) {
+   data = vzalloc(array_size(n_stats, sizeof(u64)));
+   if (!data)
+   return -ENOMEM;
+   ops->get_ethtool_stats(dev, &stats, data);
+   } else {
+   data = NULL;
+   }
 
ret = -EFAULT;
if (copy_to_user(useraddr, &stats, sizeof(stats)))
@@ -1941,16 +1950,21 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
return -EFAULT;
 
stats.n_stats = n_stats;
-   data = vzalloc(array_size(n_stats, sizeof(u64)));
-   if (n_stats && !data)
-   return -ENOMEM;
 
-   if (dev->phydev && !ops->get_ethtool_phy_stats) {
-   ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
-   if (ret < 0)
-   return ret;
+   if (n_stats) {
+   data = vzalloc(array_size(n_stats, sizeof(u64)));
+   if (!data)
+   return -ENOMEM;
+
+   if (dev->phydev && !ops->get_ethtool_phy_stats) {
+   ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+   if (ret < 0)
+   goto out;
+   } else {
+   ops->get_ethtool_phy_stats(dev, &stats, data);
+   }
} else {
-   ops->get_ethtool_phy_stats(dev, &stats, data);
+   data = NULL;
}
 
ret = -EFAULT;
-- 
2.16.2



RE: RE: [PATCH] net: ethtool: not call vzalloc for zero sized memory request

2019-03-28 Thread Li,Rongqing


> -----Original Message-----
> From: Michal Kubecek [mailto:mkube...@suse.cz]
> Sent: March 28, 2019 6:26 PM
> To: Li,Rongqing 
> Cc: netdev@vger.kernel.org
> Subject: Re: RE: [PATCH] net: ethtool: not call vzalloc for zero sized memory
> request
> 
> On Thu, Mar 28, 2019 at 09:51:56AM +, Li,Rongqing wrote:
> > > -----Original Message-----
> > > From: Michal Kubecek [mailto:mkube...@suse.cz]
> > > Sent: March 28, 2019 5:09 PM
> > > To: Li,Rongqing 
> > > Cc: netdev@vger.kernel.org
> > > Subject: Re: [PATCH] net: ethtool: not call vzalloc for zero sized memory
> > > request
> > >
> > > On Thu, Mar 28, 2019 at 02:01:09PM +0800, Li RongQing wrote:
> > > > NULL or ZERO_SIZE_PTR will be returned for a zero sized memory
> > > > request, and dereferencing them will lead to a segfault
> > > >
> > > > so it is unnecessary to call vzalloc for a zero sized memory request
> > > > and not call __ethtool_get_strings which always uses the allocated
> > > > memory
> > > >
> > > > this also fixes a possible memory leak if phy_ethtool_get_stats
> > > > returns an error; memory should be freed before exit
> > > >
> > > > Signed-off-by: Li RongQing 
> > > > Reviewed-by: Wang Li 
> > > > ---
> > > >  net/core/ethtool.c | 37 ++---
> > > >  1 file changed, 26 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/net/core/ethtool.c b/net/core/ethtool.c index
> > > > b1eb32419732..3e971a36e37c 100644
> > > > --- a/net/core/ethtool.c
> > > > +++ b/net/core/ethtool.c
> > > ...
> > > > @@ -1897,9 +1902,14 @@ static int ethtool_get_stats(struct
> > > > net_device
> > > *dev, void __user *useraddr)
> > > > return -EFAULT;
> > > >
> > > > stats.n_stats = n_stats;
> > > > -   data = vzalloc(array_size(n_stats, sizeof(u64)));
> > > > -   if (n_stats && !data)
> > > > -   return -ENOMEM;
> > > > +
> > > > +   if (n_stats) {
> > > > +   data = vzalloc(array_size(n_stats, sizeof(u64)));
> > > > +   if (!data)
> > > > +   return -ENOMEM;
> > > > +   } else {
> > > > +   data = NULL;
> > > > +   }
> > > >
> > > > ops->get_ethtool_stats(dev, &stats, data);
> > > >
> > >
> > > You avoid the vzalloc() call here but you still pass null data
> > > pointer to device's
> > > get_ethtool_stats() handler which seems to contradict what the
> > > commit message says. Is it really what you want? (The same applies
> > > to
> > > ethtool_get_phy_stats() below.)
> > >
> >
> >
> > I kept it deliberately
> >
> > ops->get_ethtool_stats(dev, &stats, data) has three parameters;
> > if n_stats is 0 [we assume the 0 returned by ops->get_sset_count is
> > correct, or get_sset_count should be fixed] and data is NULL,
> > get_ethtool_stats() might still store data into its second
> > parameter, stats, even though I did not find a driver that does so
> 
> stats is a local variable declared as
> 
>   struct ethtool_stats stats;
> 
> where struct ethtool_stats which looks like
> 
> struct ethtool_stats {
>   __u32   cmd;
>   __u32   n_stats;
>   __u64   data[0];
> };
> 
> so that storing anything to stats.data would result in stack corruption.
> This is why ethtool_ops::get_ethtool_stats() has the third parameter telling 
> it
> where to put the statistics.
> 
> There isn't much point touching cmd and n_stats should be set to the same
> value as we got from ethtool_ops::get_sset_counts earlier, i.e.
> zero in our case.
> 

OK, I will send a v2 that does not call ops->get_ethtool_stats if n_stats is 0

Thanks

-RongQing

> Michal


RE: [PATCH] net: ethtool: not call vzalloc for zero sized memory request

2019-03-28 Thread Li,Rongqing


> -----Original Message-----
> From: Michal Kubecek [mailto:mkube...@suse.cz]
> Sent: March 28, 2019 5:09 PM
> To: Li,Rongqing 
> Cc: netdev@vger.kernel.org
> Subject: Re: [PATCH] net: ethtool: not call vzalloc for zero sized memory request
> 
> On Thu, Mar 28, 2019 at 02:01:09PM +0800, Li RongQing wrote:
> > NULL or ZERO_SIZE_PTR will be returned for a zero sized memory request,
> > and dereferencing them will lead to a segfault
> >
> > so it is unnecessary to call vzalloc for a zero sized memory request and
> > not call __ethtool_get_strings which always uses the allocated memory
> >
> > this also fixes a possible memory leak if phy_ethtool_get_stats
> > returns an error; memory should be freed before exit
> >
> > Signed-off-by: Li RongQing 
> > Reviewed-by: Wang Li 
> > ---
> >  net/core/ethtool.c | 37 ++---
> >  1 file changed, 26 insertions(+), 11 deletions(-)
> >
> > diff --git a/net/core/ethtool.c b/net/core/ethtool.c index
> > b1eb32419732..3e971a36e37c 100644
> > --- a/net/core/ethtool.c
> > +++ b/net/core/ethtool.c
> ...
> > @@ -1897,9 +1902,14 @@ static int ethtool_get_stats(struct net_device
> *dev, void __user *useraddr)
> > return -EFAULT;
> >
> > stats.n_stats = n_stats;
> > -   data = vzalloc(array_size(n_stats, sizeof(u64)));
> > -   if (n_stats && !data)
> > -   return -ENOMEM;
> > +
> > +   if (n_stats) {
> > +   data = vzalloc(array_size(n_stats, sizeof(u64)));
> > +   if (!data)
> > +   return -ENOMEM;
> > +   } else {
> > +   data = NULL;
> > +   }
> >
> > ops->get_ethtool_stats(dev, &stats, data);
> >
> 
> You avoid the vzalloc() call here but you still pass null data pointer to 
> device's
> get_ethtool_stats() handler which seems to contradict what the commit
> message says. Is it really what you want? (The same applies to
> ethtool_get_phy_stats() below.)
> 


I kept it deliberately

ops->get_ethtool_stats(dev, &stats, data) has three parameters;
if n_stats is 0 [we assume the 0 returned by ops->get_sset_count is
correct, or get_sset_count should be fixed] and data is NULL,
get_ethtool_stats() might still store data into its second parameter,
stats, even though I did not find a driver that does so


but __ethtool_get_strings is different: there is only one place to
store data, so if the count is 0, __ethtool_get_strings does not need
to be called
-RongQing



> Michal
> 
> > @@ -1941,14 +1951,19 @@ static int ethtool_get_phy_stats(struct
> net_device *dev, void __user *useraddr)
> > return -EFAULT;
> >
> > stats.n_stats = n_stats;
> > -   data = vzalloc(array_size(n_stats, sizeof(u64)));
> > -   if (n_stats && !data)
> > -   return -ENOMEM;
> > +
> > +   if (n_stats) {
> > +   data = vzalloc(array_size(n_stats, sizeof(u64)));
> > +   if (!data)
> > +   return -ENOMEM;
> > +   } else {
> > +   data = NULL;
> > +   }
> >
> > if (dev->phydev && !ops->get_ethtool_phy_stats) {
> > ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
> > if (ret < 0)
> > -   return ret;
> > +   goto out;
> > } else {
> > ops->get_ethtool_phy_stats(dev, &stats, data);
> > }
> > --
> > 2.16.2
> >


[PATCH] net: ethtool: not call vzalloc for zero sized memory request

2019-03-27 Thread Li RongQing
NULL or ZERO_SIZE_PTR will be returned for zero sized memory
request, and derefencing them will lead to a segfault

so it is unnecessory to call vzalloc for zero sized memory
request and not call __ethtool_get_strings which always uses
the allocated memory

this also fixes a possible memory leak if phy_ethtool_get_stats
returns error, memory should be freed before exit

Signed-off-by: Li RongQing 
Reviewed-by: Wang Li 
---
 net/core/ethtool.c | 37 ++---
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index b1eb32419732..3e971a36e37c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1797,11 +1797,16 @@ static int ethtool_get_strings(struct net_device *dev, 
void __user *useraddr)
WARN_ON_ONCE(!ret);
 
gstrings.len = ret;
-   data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
-   if (gstrings.len && !data)
-   return -ENOMEM;
 
-   __ethtool_get_strings(dev, gstrings.string_set, data);
+   if (gstrings.len) {
+   data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
+   if (!data)
+   return -ENOMEM;
+
+   __ethtool_get_strings(dev, gstrings.string_set, data);
+   } else {
+   data = NULL;
+   }
 
ret = -EFAULT;
if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
@@ -1897,9 +1902,14 @@ static int ethtool_get_stats(struct net_device *dev, 
void __user *useraddr)
return -EFAULT;
 
stats.n_stats = n_stats;
-   data = vzalloc(array_size(n_stats, sizeof(u64)));
-   if (n_stats && !data)
-   return -ENOMEM;
+
+   if (n_stats) {
+   data = vzalloc(array_size(n_stats, sizeof(u64)));
+   if (!data)
+   return -ENOMEM;
+   } else {
+   data = NULL;
+   }
 
ops->get_ethtool_stats(dev, &stats, data);
 
@@ -1941,14 +1951,19 @@ static int ethtool_get_phy_stats(struct net_device 
*dev, void __user *useraddr)
return -EFAULT;
 
stats.n_stats = n_stats;
-   data = vzalloc(array_size(n_stats, sizeof(u64)));
-   if (n_stats && !data)
-   return -ENOMEM;
+
+   if (n_stats) {
+   data = vzalloc(array_size(n_stats, sizeof(u64)));
+   if (!data)
+   return -ENOMEM;
+   } else {
+   data = NULL;
+   }
 
if (dev->phydev && !ops->get_ethtool_phy_stats) {
ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
if (ret < 0)
-   return ret;
+   goto out;
} else {
ops->get_ethtool_phy_stats(dev, &stats, data);
}
-- 
2.16.2



[PATCH] connector: fix unsafe usage of ->real_parent

2019-03-05 Thread Li RongQing
yte region [88a027698000, 88a02769af80)
[  748.171114] The buggy address belongs to the page:
[  748.177055] page:ea00809da600 count:1 mapcount:0 
mapping:888107d01e00 index:0x0 compound_mapcount: 0
[  748.189136] flags: 0x57c0008100(slab|head)
[  748.194688] raw: 0057c0008100 ea00809a3200 00030003 
888107d01e00
[  748.204424] raw:  00020002 0001 

[  748.214146] page dumped because: kasan: bad access detected
[  748.220976]
[  748.223197] Memory state around the buggy address:
[  748.229128]  88a027698780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  748.238271]  88a027698800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  748.247414] >88a027698880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  748.256564]^
[  748.264267]  88a027698900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  748.273493]  88a027698980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
fb
[  748.282630] 
==

Fixes: b086ff87251b4a4 ("connector: add parent pid and tgid to coredump and 
exit events")
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 drivers/connector/cn_proc.c | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index ed5e42461094..ad48fd52cb53 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -250,6 +250,7 @@ void proc_coredump_connector(struct task_struct *task)
 {
struct cn_msg *msg;
struct proc_event *ev;
+   struct task_struct *parent;
__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
 
if (atomic_read(&proc_event_num_listeners) < 1)
@@ -262,8 +263,14 @@ void proc_coredump_connector(struct task_struct *task)
ev->what = PROC_EVENT_COREDUMP;
ev->event_data.coredump.process_pid = task->pid;
ev->event_data.coredump.process_tgid = task->tgid;
-   ev->event_data.coredump.parent_pid = task->real_parent->pid;
-   ev->event_data.coredump.parent_tgid = task->real_parent->tgid;
+
+   rcu_read_lock();
+   if (pid_alive(task)) {
+   parent = rcu_dereference(task->real_parent);
+   ev->event_data.coredump.parent_pid = parent->pid;
+   ev->event_data.coredump.parent_tgid = parent->tgid;
+   }
+   rcu_read_unlock();
 
memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
msg->ack = 0; /* not used */
@@ -276,6 +283,7 @@ void proc_exit_connector(struct task_struct *task)
 {
struct cn_msg *msg;
struct proc_event *ev;
+   struct task_struct *parent;
__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
 
if (atomic_read(&proc_event_num_listeners) < 1)
@@ -290,8 +298,14 @@ void proc_exit_connector(struct task_struct *task)
ev->event_data.exit.process_tgid = task->tgid;
ev->event_data.exit.exit_code = task->exit_code;
ev->event_data.exit.exit_signal = task->exit_signal;
-   ev->event_data.exit.parent_pid = task->real_parent->pid;
-   ev->event_data.exit.parent_tgid = task->real_parent->tgid;
+
+   rcu_read_lock();
+   if (pid_alive(task)) {
+   parent = rcu_dereference(task->real_parent);
+   ev->event_data.exit.parent_pid = parent->pid;
+   ev->event_data.exit.parent_tgid = parent->tgid;
+   }
+   rcu_read_unlock();
 
memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
msg->ack = 0; /* not used */
-- 
2.16.2



[PATCH][net-next] ethtool: Use explicit designated initializers for .cmd

2019-02-27 Thread Li RongQing
Initialize the .cmd member by using a designated struct
initializer. This fixes warning of missing field initializers,
and makes code a little easier to read.

Signed-off-by: Li RongQing 
---
 net/core/ethtool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 47558ffae423..d4918ffddda8 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1717,7 +1717,7 @@ static noinline_for_stack int ethtool_set_channels(struct 
net_device *dev,
 
 static int ethtool_get_pauseparam(struct net_device *dev, void __user 
*useraddr)
 {
-   struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
+   struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM };
 
if (!dev->ethtool_ops->get_pauseparam)
return -EOPNOTSUPP;
@@ -2503,7 +2503,7 @@ static int set_phy_tunable(struct net_device *dev, void 
__user *useraddr)
 
 static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)
 {
-   struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM };
+   struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM };
int rc;
 
if (!dev->ethtool_ops->get_fecparam)
-- 
2.16.2



[PATCH][net-next] net: Use RCU_POINTER_INITIALIZER() to init static variable

2019-02-24 Thread Li RongQing
This pointer is RCU protected, so proper primitives should be used.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/sched/sch_generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 38e5add14fab..7bcee8b8f803 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -559,7 +559,7 @@ struct Qdisc_ops noop_qdisc_ops __read_mostly = {
 };
 
 static struct netdev_queue noop_netdev_queue = {
-   .qdisc  =   &noop_qdisc,
+   RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
.qdisc_sleeping =   &noop_qdisc,
 };
 
-- 
2.16.2



[PATCH][net-next] ipv6: sanitize RCU usage on fib6_next

2019-02-22 Thread Li RongQing
using rcu_assign_pointer when setting, which has a memory
barrier to ensure the initialization is seen first.

using rcu_dereference when dereference this pointer

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/ipv6/ip6_fib.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6613d8dbb0e5..b73d40d68178 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1143,7 +1143,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
fib6_info *rt,
 
atomic_inc(&rt->fib6_ref);
rcu_assign_pointer(rt->fib6_node, fn);
-   rt->fib6_next = iter->fib6_next;
+   rcu_assign_pointer(rt->fib6_next, iter->fib6_next);
rcu_assign_pointer(*ins, rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
@@ -1761,7 +1761,9 @@ static void fib6_del_route(struct fib6_table *table, 
struct fib6_node *fn,
RT6_TRACE("fib6_del_route\n");
 
/* Unlink it */
-   *rtp = rt->fib6_next;
+   *rtp = rcu_dereference_protected(rt->fib6_next,
+   lockdep_is_held(&rt->fib6_table->tb6_lock));
+
rt->fib6_node = NULL;
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
-- 
2.16.2



[PATCH][net-next] net: Use RCU_INIT_POINTER() to set sk_wq

2019-02-22 Thread Li RongQing
This pointer is RCU protected, so proper primitives should be used.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/core/sock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index f4b8b78535f8..f5d82f3fa474 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1865,7 +1865,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
 */
sk_refcnt_debug_inc(newsk);
sk_set_socket(newsk, NULL);
-   newsk->sk_wq = NULL;
+   RCU_INIT_POINTER(newsk->sk_wq, NULL);
 
if (newsk->sk_prot->sockets_allocated)
sk_sockets_allocated_inc(newsk);
@@ -2828,11 +2828,11 @@ void sock_init_data(struct socket *sock, struct sock 
*sk)
 
if (sock) {
sk->sk_type =   sock->type;
-   sk->sk_wq   =   sock->wq;
+   RCU_INIT_POINTER(sk->sk_wq, sock->wq);
sock->sk=   sk;
sk->sk_uid  =   SOCK_INODE(sock)->i_uid;
} else {
-   sk->sk_wq   =   NULL;
+   RCU_INIT_POINTER(sk->sk_wq, NULL);
sk->sk_uid  =   make_kuid(sock_net(sk)->user_ns, 0);
}
 
-- 
2.16.2



[PATCH][net-next] bridge: remove redundant check on err in br_multicast_ipv4_rcv

2019-02-18 Thread Li RongQing
br_ip4_multicast_mrd_rcv only returns 0 or -ENOMSG, never any
other negative value

Signed-off-by: Li RongQing 
---
 net/bridge/br_multicast.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 4a048fd1cbea..fe9f2d8ca2c1 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1615,12 +1615,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
if (ip_hdr(skb)->protocol == IPPROTO_PIM)
br_multicast_pim(br, port, skb);
} else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) {
-   err = br_ip4_multicast_mrd_rcv(br, port, skb);
-
-   if (err < 0 && err != -ENOMSG) {
-   br_multicast_err_count(br, port, skb->protocol);
-   return err;
-   }
+   br_ip4_multicast_mrd_rcv(br, port, skb);
}
 
return 0;
-- 
2.16.2



[PATCH] net: remove unneeded switch fall-through

2019-02-18 Thread Li RongQing
This case block is terminated by a return, so a switch fall-through
comment is not needed

Signed-off-by: Li RongQing 
---
 net/ipv4/igmp.c| 1 -
 net/ipv6/mcast_snoop.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b448cf32296c..6c2febc39dca 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1541,7 +1541,6 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb)
case IGMP_HOST_LEAVE_MESSAGE:
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
-   /* fall through */
return 0;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
return ip_mc_check_igmp_reportv3(skb);
diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c
index 55e2ac179f28..75d1be0e 100644
--- a/net/ipv6/mcast_snoop.c
+++ b/net/ipv6/mcast_snoop.c
@@ -128,7 +128,6 @@ static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
switch (mld->mld_type) {
case ICMPV6_MGM_REDUCTION:
case ICMPV6_MGM_REPORT:
-   /* fall through */
return 0;
case ICMPV6_MLD2_REPORT:
return ipv6_mc_check_mld_reportv2(skb);
-- 
2.16.2



[PATCH] ipv6: propagate genlmsg_reply return code

2019-02-11 Thread Li RongQing
genlmsg_reply can fail, so propagate its return code

Fixes: 915d7e5e593 ("ipv6: sr: add code base for control plane support of 
SR-IPv6")
Signed-off-by: Li RongQing 
---
 net/ipv6/seg6.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 8d0ba757a46c..9b2f272ca164 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -221,9 +221,7 @@ static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct 
genl_info *info)
rcu_read_unlock();
 
genlmsg_end(msg, hdr);
-   genlmsg_reply(msg, info);
-
-   return 0;
+   return genlmsg_reply(msg, info);
 
 nla_put_failure:
rcu_read_unlock();
-- 
2.16.2



[PATCH][net-next] devlink: use direct return of genlmsg_reply

2019-02-11 Thread Li RongQing
This removes a redundant check.

Signed-off-by: Li RongQing 
---
 net/core/devlink.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index e6a015b8ac9b..76a9d287dbec 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4355,11 +4355,8 @@ static int devlink_fmsg_snd(struct devlink_fmsg *fmsg,
err = -EMSGSIZE;
goto nla_put_failure;
}
-   err = genlmsg_reply(skb, info);
-   if (err)
-   return err;
 
-   return 0;
+   return genlmsg_reply(skb, info);
 
 nla_put_failure:
nlmsg_free(skb);
-- 
2.16.2



[PATCH][net-next] tun: remove unnecessary check in tun_flow_update

2018-12-06 Thread Li RongQing
The caller has guaranteed that rxhash is not zero.

Signed-off-by: Li RongQing 
---
 drivers/net/tun.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d0745dc81976..6760b86547df 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -529,10 +529,7 @@ static void tun_flow_update(struct tun_struct *tun, u32 
rxhash,
unsigned long delay = tun->ageing_time;
u16 queue_index = tfile->queue_index;
 
-   if (!rxhash)
-   return;
-   else
-   head = &tun->flows[tun_hashfn(rxhash)];
+   head = &tun->flows[tun_hashfn(rxhash)];
 
rcu_read_lock();
 
-- 
2.16.2



[PATCH][net-next] tun: align write-heavy flow entry members to a cache line

2018-12-06 Thread Li RongQing
The tun flow entry 'updated' field is written on every received
packet. Thus, if a flow is receiving packets through a particular
flow entry, it'll cause false sharing with all the others who have
looked it up, so move it into its own cache line.

Also, update the 'queue_index' and 'updated' fields only when they
actually change, to reduce cache false sharing.

Signed-off-by: Zhang Yu 
Signed-off-by: Wang Li 
Signed-off-by: Li RongQing 
---
 drivers/net/tun.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 835c73f42ae7..d0745dc81976 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -201,7 +201,7 @@ struct tun_flow_entry {
u32 rxhash;
u32 rps_rxhash;
int queue_index;
-   unsigned long updated;
+   unsigned long updated cacheline_aligned_in_smp;
 };
 
 #define TUN_NUM_FLOW_ENTRIES 1024
@@ -539,8 +539,10 @@ static void tun_flow_update(struct tun_struct *tun, u32 
rxhash,
e = tun_flow_find(head, rxhash);
if (likely(e)) {
/* TODO: keep queueing to old queue until it's empty? */
-   e->queue_index = queue_index;
-   e->updated = jiffies;
+   if (e->queue_index != queue_index)
+   e->queue_index = queue_index;
+   if (e->updated != jiffies)
+   e->updated = jiffies;
sock_rps_record_flow_hash(e->rps_rxhash);
} else {
spin_lock_bh(&tun->lock);
-- 
2.16.2



Re: [PATCH][net-next] vhost:net: allocate 32KB memory instead of 32K pages when page frag refill

2018-11-22 Thread Li,Rongqing

On 2018/11/23 10:04 AM, Li RongQing wrote:
> >when page frag refills, 32K pages, 128MB memory is asked, it hardly 
> >succeeds when system has memory stress


> Looking at get_order(), it seems we get 3 after get_order(32768) since it 
> accepts the size of block.

You are right, I misunderstood.

Please drop this patch, sorry for the noise

-Q


Re: [PATCH] net: fix the per task frag allocator size

2018-11-22 Thread Li,Rongqing

> get_order(8) returns zero here if I understood it correctly.


You are right, I misunderstood.

Please drop this patch, sorry for the noise
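
For the record, a minimal sketch of what get_order() actually computes,
assuming PAGE_SIZE == 4096 (the checks are illustrative only):

#include <linux/build_bug.h>
#include <asm/page.h>	/* get_order() */

static inline void get_order_examples(void)
{
	BUILD_BUG_ON(get_order(8) != 0);	/* 8 bytes fit in one 4KB page */
	BUILD_BUG_ON(get_order(4096) != 0);
	BUILD_BUG_ON(get_order(4097) != 1);	/* needs two pages */
	BUILD_BUG_ON(get_order(32768) != 3);	/* 4096 << 3 == 32768, 32KB */
}

So get_order() takes a size in bytes: get_order(32768) asks for order 3,
i.e. eight pages (32KB), not 32K pages.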

-Q






[PATCH][net-next] vhost:net: allocate 32KB memory instead of 32K pages when page frag refill

2018-11-22 Thread Li RongQing
When the page frag refills, 32K pages, i.e. 128MB of memory, would be
asked for; that hardly ever succeeds when the system is under memory
stress.

Such a large memory size would also cause the reference bias to
underflow and throw the page refcount into chaos, since the reference
bias would be decremented below zero before the allocated memory is
used up.

So 32KB of memory is the safe choice; meanwhile, remove an unnecessary
check.

Fixes: e4dab1e6ea64 ("vhost_net: mitigate page reference counting during page 
frag refill")
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 drivers/vhost/net.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d919284f103b..b933a4a8e4ba 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -641,7 +641,7 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t 
total_len)
   !vhost_vq_avail_empty(vq->dev, vq);
 }
 
-#define SKB_FRAG_PAGE_ORDER get_order(32768)
+#define SKB_FRAG_PAGE_ORDER3
 
 static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
   struct page_frag *pfrag, gfp_t gfp)
@@ -654,17 +654,17 @@ static bool vhost_net_page_frag_refill(struct vhost_net 
*net, unsigned int sz,
 
pfrag->offset = 0;
net->refcnt_bias = 0;
-   if (SKB_FRAG_PAGE_ORDER) {
-   /* Avoid direct reclaim but allow kswapd to wake */
-   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP | __GFP_NOWARN |
- __GFP_NORETRY,
- SKB_FRAG_PAGE_ORDER);
-   if (likely(pfrag->page)) {
-   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
-   goto done;
-   }
+
+   /* Avoid direct reclaim but allow kswapd to wake */
+   pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY,
+ SKB_FRAG_PAGE_ORDER);
+   if (likely(pfrag->page)) {
+   pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
+   goto done;
}
+
pfrag->page = alloc_page(gfp);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE;
-- 
2.16.2



[PATCH] net: fix the per task frag allocator size

2018-11-22 Thread Li RongQing
When filling the task frag, 32K pages, i.e. 128MB of memory, would be
asked for; that hardly ever succeeds when the system is under memory
stress.

And commit 5640f7685831 ("net: use a per task frag allocator")
said it wants 32768 bytes, not 32768 pages:

 "(up to 32768 bytes per frag, thats order-3 pages on x86)"

Fixes: 5640f7685831e ("net: use a per task frag allocator")
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/core/sock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 6d7e189e3cd9..e3cbefeedf5c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2186,8 +2186,8 @@ static void sk_leave_memory_pressure(struct sock *sk)
}
 }
 
-/* On 32bit arches, an skb frag is limited to 2^15 */
-#define SKB_FRAG_PAGE_ORDERget_order(32768)
+/* On 32bit arches, an skb frag is limited to 2^15 bytes*/
+#define SKB_FRAG_PAGE_ORDERget_order(8)
 
 /**
  * skb_page_frag_refill - check that a page_frag contains enough room
-- 
2.16.2



[PATCH][net-next] net: slightly optimize eth_type_trans

2018-11-12 Thread Li RongQing
A netperf UDP stream test shows that eth_type_trans takes a noticeable
amount of CPU, so adjust the MAC address check order: first check
whether the destination is the device address, and only check whether
it is a multicast address when it is not the device address.

After this change:
For unicast where the skb dst MAC is the device MAC (most of the
time), this saves one comparison.
For unicast where the skb dst MAC is not the device MAC, nothing
changes.
For multicast, it adds one comparison.

Before:
1.03%  [kernel]  [k] eth_type_trans

After:
0.78%  [kernel]  [k] eth_type_trans

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/ethernet/eth.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index fd8faa0dfa61..1c88f5c5d5b1 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -165,15 +165,17 @@ __be16 eth_type_trans(struct sk_buff *skb, struct 
net_device *dev)
eth = (struct ethhdr *)skb->data;
skb_pull_inline(skb, ETH_HLEN);
 
-   if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
-   if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
-   skb->pkt_type = PACKET_BROADCAST;
-   else
-   skb->pkt_type = PACKET_MULTICAST;
+   if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
+  dev->dev_addr))) {
+   if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
+   if (ether_addr_equal_64bits(eth->h_dest, 
dev->broadcast))
+   skb->pkt_type = PACKET_BROADCAST;
+   else
+   skb->pkt_type = PACKET_MULTICAST;
+   } else {
+   skb->pkt_type = PACKET_OTHERHOST;
+   }
}
-   else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
-  dev->dev_addr)))
-   skb->pkt_type = PACKET_OTHERHOST;
 
/*
 * Some variants of DSA tagging don't have an ethertype field
-- 
2.16.2



[PATCH][net-next][v2] net: remove BUG_ON from __pskb_pull_tail

2018-11-12 Thread Li RongQing
If list is a NULL pointer, the following access of list will
trigger a panic, which is the same as the BUG_ON

Signed-off-by: Li RongQing 
---
 net/core/skbuff.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 396fcb3baad0..d69503d66021 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1925,8 +1925,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
struct sk_buff *insp = NULL;
 
do {
-   BUG_ON(!list);
-
if (list->len <= eat) {
/* Eaten as whole. */
eat -= list->len;
-- 
2.16.2



[PATCH][xfrm-next] xfrm6: remove BUG_ON from xfrm6_dst_ifdown

2018-11-12 Thread Li RongQing
If loopback_idev is a NULL pointer, the following access of
loopback_idev will trigger a panic, which is the same as the BUG_ON

Signed-off-by: Li RongQing 
---
 net/ipv6/xfrm6_policy.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index d35bcf92969c..769f8f78d3b8 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -262,7 +262,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct 
net_device *dev,
if (xdst->u.rt6.rt6i_idev->dev == dev) {
struct inet6_dev *loopback_idev =
in6_dev_get(dev_net(dev)->loopback_dev);
-   BUG_ON(!loopback_idev);
 
do {
in6_dev_put(xdst->u.rt6.rt6i_idev);
-- 
2.16.2



[PATCH][net-next] net: remove BUG_ON from __pskb_pull_tail

2018-11-12 Thread Li RongQing
If list is a NULL pointer, the following access of list will
trigger a panic, which is the same as the BUG_ON

Signed-off-by: Li RongQing 
---
 net/core/skbuff.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 396fcb3baad0..cd668b52f96f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1925,7 +1925,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
struct sk_buff *insp = NULL;
 
do {
-   BUG_ON(!list);
 
if (list->len <= eat) {
/* Eaten as whole. */
-- 
2.16.2



Re: [PATCH][RFC] udp: cache sock to avoid searching it twice

2018-11-11 Thread Li,Rongqing
> >   return pp;
> >   }
>
> What if 'pp' is NULL?
>
> Aside from that, this replace a lookup with 2 atomic ops, and only when
> such lookup is amortized on multiple aggregated packets: I'm unsure if
> it's worthy and I don't understand how that improves RR tests (where
> the socket can't see multiple, consecutive skbs, AFAIK).
>
> Cheers,
>
> Paolo
>

If we do not release the socket in udp_gro_complete, we can avoid a
UDP socket lookup when IP early demux runs again; that may be more
worthwhile.

I tested UDP_STREAM and found no difference; both can reach the NIC's
limit, 10G. So I will do more RR tests.

-RongQing


Re: [PATCH][RFC] udp: cache sock to avoid searching it twice

2018-11-11 Thread Li,Rongqing

On Sat, Nov 10, 2018 at 1:29 AM Eric Dumazet  wrote:
>
>
>
> On 11/08/2018 10:21 PM, Li RongQing wrote:
> > GRO for UDP needs to lookup socket twice, first is in gro receive,
> > second is gro complete, so if store sock to skb to avoid looking up
> > twice, this can give small performance boost
> >
> > netperf -t UDP_RR -l 10
> >
> > Before:
> >   Rate per sec: 28746.01
> > After:
> >   Rate per sec: 29401.67
> >
> > Signed-off-by: Li RongQing 
> > ---
> >  net/ipv4/udp_offload.c | 18 +-
> >  1 file changed, 17 insertions(+), 1 deletion(-)
> >
> > diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> > index 0646d61f4fa8..429570112a33 100644
> > --- a/net/ipv4/udp_offload.c
> > +++ b/net/ipv4/udp_offload.c
> > @@ -408,6 +408,11 @@ struct sk_buff *udp_gro_receive(struct list_head 
> > *head, struct sk_buff *skb,
> >
> >   if (udp_sk(sk)->gro_enabled) {
> >   pp = call_gro_receive(udp_gro_receive_segment, head, skb);
> > +
> > + if (!IS_ERR(pp) && NAPI_GRO_CB(pp)->count > 1) {
> > + sock_hold(sk);
> > + pp->sk = sk;
>
>
> You also have to set pp->destructor to sock_edemux
>
> flush_gro_hash -> kfree_skb()
>
> If there is no destructor, the reference on pp->sk will never be released.
>
>

OK, thanks.

Does it need to reset sk in udp_gro_complete? IP early demux will look
up the UDP socket again; if we can keep the reference, we can avoid
looking up the socket again.
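
A minimal sketch of what Eric is asking for (same shape as the hunk
above, not the final patch; sock_edemux releases the reference taken
on pp->sk whenever the skb is freed):

	if (udp_sk(sk)->gro_enabled) {
		pp = call_gro_receive(udp_gro_receive_segment, head, skb);

		if (!IS_ERR(pp) && NAPI_GRO_CB(pp)->count > 1) {
			sock_hold(sk);
			pp->sk = sk;
			pp->destructor = sock_edemux;	/* drops the ref on free */
		}
		rcu_read_unlock();
		return pp;
	}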


-RongQing
>
>
> > + }
> >   rcu_read_unlock();
> >   return pp;
> >   }
> > @@ -444,6 +449,10 @@ struct sk_buff *udp_gro_receive(struct list_head 
> > *head, struct sk_buff *skb,
> >   skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
> >   pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
> >
> > + if (!IS_ERR(pp) && NAPI_GRO_CB(pp)->count > 1) {
> > + sock_hold(sk);
> > + pp->sk = sk;
> > + }
> >  out_unlock:
> >   rcu_read_unlock();
> >   skb_gro_flush_final(skb, pp, flush);
> > @@ -502,7 +511,9 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
> >   uh->len = newlen;
> >
> >   rcu_read_lock();
> > - sk = (*lookup)(skb, uh->source, uh->dest);
> > + sk = skb->sk;
> > + if (!sk)
> > + sk = (*lookup)(skb, uh->source, uh->dest);
> >   if (sk && udp_sk(sk)->gro_enabled) {
> >   err = udp_gro_complete_segment(skb);
> >   } else if (sk && udp_sk(sk)->gro_complete) {
> > @@ -516,6 +527,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
> >   err = udp_sk(sk)->gro_complete(sk, skb,
> >   nhoff + sizeof(struct udphdr));
> >   }
> > +
> > + if (skb->sk) {
> > + sock_put(skb->sk);
> > + skb->sk = NULL;
> > + }
> >   rcu_read_unlock();
> >
> >   if (skb->remcsum_offload)
> >


[PATCH][net-next] net: tcp: remove BUG_ON from tcp_v4_err

2018-11-09 Thread Li RongQing
If skb is a NULL pointer, the following access of skb's
skb_mstamp_ns will trigger a panic, which is the same as the BUG_ON

Signed-off-by: Li RongQing 
---
 net/ipv4/tcp_ipv4.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a336787d75e5..5424a4077c27 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -542,7 +542,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 
skb = tcp_rtx_queue_head(sk);
-   BUG_ON(!skb);
 
tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
-- 
2.16.2



[PATCH][RFC] udp: cache sock to avoid searching it twice

2018-11-08 Thread Li RongQing
GRO for UDP needs to look up the socket twice: first in gro receive,
and again in gro complete. Storing the sock in the skb avoids the
second lookup, which gives a small performance boost.

netperf -t UDP_RR -l 10

Before:
Rate per sec: 28746.01
After:
Rate per sec: 29401.67

Signed-off-by: Li RongQing 
---
 net/ipv4/udp_offload.c | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0646d61f4fa8..429570112a33 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -408,6 +408,11 @@ struct sk_buff *udp_gro_receive(struct list_head *head, 
struct sk_buff *skb,
 
if (udp_sk(sk)->gro_enabled) {
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+
+   if (!IS_ERR(pp) && NAPI_GRO_CB(pp)->count > 1) {
+   sock_hold(sk);
+   pp->sk = sk;
+   }
rcu_read_unlock();
return pp;
}
@@ -444,6 +449,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, 
struct sk_buff *skb,
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
 
+   if (!IS_ERR(pp) && NAPI_GRO_CB(pp)->count > 1) {
+   sock_hold(sk);
+   pp->sk = sk;
+   }
 out_unlock:
rcu_read_unlock();
skb_gro_flush_final(skb, pp, flush);
@@ -502,7 +511,9 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
uh->len = newlen;
 
rcu_read_lock();
-   sk = (*lookup)(skb, uh->source, uh->dest);
+   sk = skb->sk;
+   if (!sk)
+   sk = (*lookup)(skb, uh->source, uh->dest);
if (sk && udp_sk(sk)->gro_enabled) {
err = udp_gro_complete_segment(skb);
} else if (sk && udp_sk(sk)->gro_complete) {
@@ -516,6 +527,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr));
}
+
+   if (skb->sk) {
+   sock_put(skb->sk);
+   skb->sk = NULL;
+   }
rcu_read_unlock();
 
if (skb->remcsum_offload)
-- 
2.16.2



[PATCH][net-next] openvswitch: remove BUG_ON from get_dpdev

2018-11-08 Thread Li RongQing
If local is a NULL pointer, the following access of local's
dev will trigger a panic, which is the same as the BUG_ON

Signed-off-by: Li RongQing 
---
 net/openvswitch/vport-netdev.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 2e5e7a41d8ef..9bec22e3e9e8 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -84,7 +84,6 @@ static struct net_device *get_dpdev(const struct datapath *dp)
struct vport *local;
 
local = ovs_vport_ovsl(dp, OVSP_LOCAL);
-   BUG_ON(!local);
return local->dev;
 }
 
-- 
2.16.2



[PATCH][net-next][v2] net/ipv6: compute anycast address hash only if dev is null

2018-11-07 Thread Li RongQing
Avoid computing the hash value if dev is not NULL, since the
hash value is not used in that case.

Signed-off-by: Li RongQing 
---
 net/ipv6/anycast.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 94999058e110..cca3b3603c42 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -433,7 +433,6 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, 
const struct in6_addr *ad
 bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
 const struct in6_addr *addr)
 {
-   unsigned int hash = inet6_acaddr_hash(net, addr);
struct net_device *nh_dev;
struct ifacaddr6 *aca;
bool found = false;
@@ -441,7 +440,9 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
rcu_read_lock();
if (dev)
found = ipv6_chk_acast_dev(dev, addr);
-   else
+   else {
+   unsigned int hash = inet6_acaddr_hash(net, addr);
+
hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash],
 aca_addr_lst) {
nh_dev = fib6_info_nh_dev(aca->aca_rt);
@@ -452,6 +453,7 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
break;
}
}
+   }
rcu_read_unlock();
return found;
 }
-- 
2.16.2



[PATCH][net-next] net/ipv6: compute anycast address hash only if dev is null

2018-11-07 Thread Li RongQing
Avoid computing the hash value if dev is not NULL, since the
hash value is not used in that case.

Signed-off-by: Li RongQing 
---
 net/ipv6/anycast.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 94999058e110..a20e344486cb 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -433,15 +433,16 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, 
const struct in6_addr *ad
 bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
 const struct in6_addr *addr)
 {
-   unsigned int hash = inet6_acaddr_hash(net, addr);
struct net_device *nh_dev;
struct ifacaddr6 *aca;
bool found = false;
+   unsigned int hash;
 
rcu_read_lock();
if (dev)
found = ipv6_chk_acast_dev(dev, addr);
-   else
+   else {
+   hash = inet6_acaddr_hash(net, addr);
hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash],
 aca_addr_lst) {
nh_dev = fib6_info_nh_dev(aca->aca_rt);
@@ -452,6 +453,7 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
break;
}
}
+   }
rcu_read_unlock();
return found;
 }
-- 
2.16.2



[net-next][PATCH] net/ipv4: fix a net leak

2018-10-24 Thread Li RongQing
Put the net reference when an invalid ifindex is passed in; otherwise it
will be leaked.

Fixes: 5fcd266a9f64 ("net/ipv4: Add support for dumping addresses for a specific 
device")
Cc: David Ahern 
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/ipv4/devinet.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 63d5b58fbfdb..fd0c5a47e742 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1775,8 +1775,10 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct 
netlink_callback *cb)
 
if (fillargs.ifindex) {
dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
-   if (!dev)
+   if (!dev) {
+   put_net(tgt_net);
return -ENODEV;
+   }
 
in_dev = __in_dev_get_rtnl(dev);
if (in_dev) {
-- 
2.16.2



[PATCH][ipsec-next] xfrm: use correct size to initialise sp->ovec

2018-10-06 Thread Li RongQing
This place wants to initialize the whole array, not a single element,
so it should be sizeof(array) instead of sizeof(element).

For now the array has only one element, so there is no actual error
as long as XFRM_MAX_OFFLOAD_DEPTH is 1.

Signed-off-by: Li RongQing 
---
 net/xfrm/xfrm_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index be3520e429c9..684c0bc01e2c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -131,7 +131,7 @@ struct sec_path *secpath_dup(struct sec_path *src)
sp->len = 0;
sp->olen = 0;
 
-   memset(sp->ovec, 0, sizeof(sp->ovec[XFRM_MAX_OFFLOAD_DEPTH]));
+   memset(sp->ovec, 0, sizeof(sp->ovec));
 
if (src) {
int i;
-- 
2.16.2



[PATCH][ipsec-next] xfrm: remove unnecessary check in xfrmi_get_stats64

2018-10-06 Thread Li RongQing
If the tstats of a device are not allocated, the device was not
registered correctly and cannot be used.

Signed-off-by: Li RongQing 
---
 net/xfrm/xfrm_interface.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index dc5b20bf29cf..abafd49cc65d 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -561,9 +561,6 @@ static void xfrmi_get_stats64(struct net_device *dev,
 {
int cpu;
 
-   if (!dev->tstats)
-   return;
-
for_each_possible_cpu(cpu) {
struct pcpu_sw_netstats *stats;
struct pcpu_sw_netstats tmp;
-- 
2.16.2



Re: [PATCH][net-next] ipv6: drop container_of when convert dst to rt6_info

2018-09-30 Thread Li RongQing
>
> I don't understand why you are doing this? It is not going to be
> faster (or safer) than container_of. container_of provides the
> same functionality and is safe against position of the member
> in the structure.
>

In fact, most places convert dst to rt6_info directly, and only a few
places use container_of:


net/ipv6/ip6_output.c:  struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
net/ipv6/route.c:   const struct rt6_info *rt = (struct rt6_info *)dst;
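
The two forms are interchangeable here only because dst sits at offset
0 in struct rt6_info; a hedged sketch (the helper name is made up, not
from the tree):

static inline struct rt6_info *to_rt6_info(struct dst_entry *dst)
{
	/* the direct cast is valid only while dst stays the first member */
	BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0);
	return (struct rt6_info *)dst;	/* same result as container_of() */
}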

-Li


Re: [PATCH][net-next] ipv6: drop container_of when convert dst to rt6_info

2018-09-30 Thread Li RongQing
> + BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0);
> +

Please drop this patch, thanks, since a BUILD_BUG_ON has already
been added in ip6_fib.h:

include/net/ip6_fib.h:  BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0);

-Li


[PATCH] xfrm: fix gro_cells leak when remove virtual xfrm interfaces

2018-09-30 Thread Li RongQing
The device's gro_cells has been initialized; it should be destroyed
when the device is freed, otherwise it will be leaked.

Fixes: f203b76d78092faf2 ("xfrm: Add virtual xfrm interfaces")
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/xfrm/xfrm_interface.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 4b4ef4f662d9..9cc6e72bc802 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -116,6 +116,9 @@ static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct 
xfrm_if *xi)
 
 static void xfrmi_dev_free(struct net_device *dev)
 {
+   struct xfrm_if *xi = netdev_priv(dev);
+
+   gro_cells_destroy(&xi->gro_cells);
free_percpu(dev->tstats);
 }
 
-- 
2.16.2



[PATCH][net-next] ipv6: drop container_of when convert dst to rt6_info

2018-09-29 Thread Li RongQing
We can save the container_of computation and cast dst directly,
since dst is always the first member of struct rt6_info.

Add a BUILD_BUG_ON() to catch any change that could break this
assertion.

Signed-off-by: Li RongQing 
---
 include/net/ip6_route.h | 4 +++-
 net/ipv6/route.c| 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 7b9c82de11cc..1f09298634cb 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -194,8 +194,10 @@ static inline const struct rt6_info *skb_rt6_info(const 
struct sk_buff *skb)
const struct dst_entry *dst = skb_dst(skb);
const struct rt6_info *rt6 = NULL;
 
+   BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0);
+
if (dst)
-   rt6 = container_of(dst, struct rt6_info, dst);
+   rt6 = (struct rt6_info *)dst;
 
return rt6;
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d28f83e01593..3fb8034fc2d0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -217,7 +217,7 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct 
dst_entry *dst,
  struct sk_buff *skb,
  const void *daddr)
 {
-   const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
+   const struct rt6_info *rt = (struct rt6_info *)dst;
 
return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
 }
@@ -2187,7 +2187,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry 
*dst, u32 cookie)
struct fib6_info *from;
struct rt6_info *rt;
 
-   rt = container_of(dst, struct rt6_info, dst);
+   rt = (struct rt6_info *)dst;
 
rcu_read_lock();
 
@@ -4911,7 +4911,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh,
}
 
 
-   rt = container_of(dst, struct rt6_info, dst);
+   rt = (struct rt6_info *)dst;
if (rt->dst.error) {
err = rt->dst.error;
ip6_rt_put(rt);
-- 
2.16.2



[PATCH][net-next] net: drop container_of in dst_cache_get_ip4

2018-09-29 Thread Li RongQing
We can save the container_of computation and return dst directly,
since dst is always the first member of struct rtable; any breakage
will be caught by the BUILD_BUG_ON in route.h:

include/net/route.h:BUILD_BUG_ON(offsetof(struct rtable, dst) != 0);

Signed-off-by: Li RongQing 
---
 net/core/dst_cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 64cef977484a..0753838480fd 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -87,7 +87,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, 
__be32 *saddr)
return NULL;
 
*saddr = idst->in_saddr.s_addr;
-   return container_of(dst, struct rtable, dst);
+   return (struct rtable *)dst;
 }
 EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
 
-- 
2.16.2



Re: [PATCH][next-next][v2] netlink: avoid to allocate full skb when sending to many devices

2018-09-20 Thread Li,Rongqing
> Subject: Re: [PATCH][next-next][v2] netlink: avoid to allocate full skb when
> sending to many devices
> 
> 
> 
> On 09/20/2018 06:43 AM, Eric Dumazet wrote:
> >
> 

Sorry, I should have CCed you.

> > And lastly this patch looks way too complicated to me.
> > You probably can write something much simpler.
> 

But it should not introduce a performance penalty.

> Something like :
> 
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index
> 930d17fa906c9ebf1cf7b6031ce0a22f9f66c0e4..e0a81beb4f37751421dbbe794c
> cf3d5a46bdf900 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -278,22 +278,26 @@ static bool netlink_filter_tap(const struct sk_buff
> *skb)
> return false;
>  }
> 
> -static int __netlink_deliver_tap_skb(struct sk_buff *skb,
> +static int __netlink_deliver_tap_skb(struct sk_buff **pskb,
>  struct net_device *dev)  {
> -   struct sk_buff *nskb;
> +   struct sk_buff *nskb, *skb = *pskb;
> struct sock *sk = skb->sk;
> int ret = -ENOMEM;
> 
> if (!net_eq(dev_net(dev), sock_net(sk)))
> return 0;
> 
> -   dev_hold(dev);
> -
> -   if (is_vmalloc_addr(skb->head))
> +   if (is_vmalloc_addr(skb->head)) {
> nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
> -   else
> -   nskb = skb_clone(skb, GFP_ATOMIC);
> +   if (!nskb)
> +   return -ENOMEM;
> +   consume_skb(skb);

The original skb cannot be freed, since it is still used in
__netlink_sendskb after being delivered to the tap

> +   skb = nskb;
> +   *pskb = skb;
> +   }
> +   dev_hold(dev);
> +   nskb = skb_clone(skb, GFP_ATOMIC);

Since the original skb cannot be freed, the skb_clone will lead to a leak.

> if (nskb) {
> nskb->dev = dev;
> nskb->protocol = htons((u16) sk->sk_protocol); @@ -318,7 
> +322,7
> @@ static void __netlink_deliver_tap(struct sk_buff *skb, struct
> netlink_tap_net *n
> return;
> 
> list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
> -   ret = __netlink_deliver_tap_skb(skb, tmp->dev);
> +   ret = __netlink_deliver_tap_skb(&skb, tmp->dev);
> if (unlikely(ret))
> break;
> }
> 


The change below seems simple, but it adds one extra skb allocation
and free:

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e3a0538ec0be..b9631137f0fe 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -290,10 +290,8 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
dev_hold(dev);
 
-   if (is_vmalloc_addr(skb->head))
-   nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
-   else
-   nskb = skb_clone(skb, GFP_ATOMIC);
+   nskb = skb_clone(skb, GFP_ATOMIC);
+
if (nskb) {
nskb->dev = dev;
nskb->protocol = htons((u16) sk->sk_protocol);
@@ -317,11 +315,20 @@ static void __netlink_deliver_tap(struct sk_buff *skb, 
struct netlink_tap_net *n
+   bool alloc = false;
+
	if (!netlink_filter_tap(skb))
		return;
 
+   if (is_vmalloc_addr(skb->head)) {
+   skb = netlink_to_full_skb(skb, GFP_ATOMIC);
+   if (!skb)
+  return;
+   alloc = true;
+   }
+
list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
+
ret = __netlink_deliver_tap_skb(skb, tmp->dev);
if (unlikely(ret))
break;
}
+
+   if (alloc)
+   consume_skb(skb);
 }

-Q


[PATCH][next-next][v2] netlink: avoid to allocate full skb when sending to many devices

2018-09-20 Thread Li RongQing
If skb->head is a vmalloc address, a full allocation of the skb is
required when it is delivered; if there are many tap devices, the
full allocation will be done for every device.

Now, if the head is a vmalloc address, allocate one new skb whose
data is not vmalloc memory, and use that newly allocated skb to clone
and send, to avoid doing the full allocation every time.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/netlink/af_netlink.c | 37 ++---
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e3a0538ec0be..a5b1bf706526 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -279,21 +279,25 @@ static bool netlink_filter_tap(const struct sk_buff *skb)
 }
 
 static int __netlink_deliver_tap_skb(struct sk_buff *skb,
-struct net_device *dev)
+struct net_device *dev, bool alloc, bool last)
 {
struct sk_buff *nskb;
struct sock *sk = skb->sk;
int ret = -ENOMEM;
 
-   if (!net_eq(dev_net(dev), sock_net(sk)))
+   if (!net_eq(dev_net(dev), sock_net(sk))) {
+   if (last && alloc)
+   consume_skb(skb);
return 0;
+   }
 
dev_hold(dev);
 
-   if (is_vmalloc_addr(skb->head))
-   nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
+   if (unlikely(last && alloc))
+   nskb = skb;
else
nskb = skb_clone(skb, GFP_ATOMIC);
+
if (nskb) {
nskb->dev = dev;
nskb->protocol = htons((u16) sk->sk_protocol);
@@ -303,6 +307,8 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
ret = dev_queue_xmit(nskb);
if (unlikely(ret > 0))
ret = net_xmit_errno(ret);
+   } else if (alloc) {
+   kfree_skb(skb);
}
 
dev_put(dev);
@@ -311,16 +317,33 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
 static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net 
*nn)
 {
+   struct netlink_tap *tmp, *next;
+   bool alloc = false;
int ret;
-   struct netlink_tap *tmp;
 
if (!netlink_filter_tap(skb))
return;
 
-   list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
-   ret = __netlink_deliver_tap_skb(skb, tmp->dev);
+   tmp = list_first_or_null_rcu(&nn->netlink_tap_all,
+   struct netlink_tap, list);
+   if (!tmp)
+   return;
+
+   if (is_vmalloc_addr(skb->head)) {
+   skb = netlink_to_full_skb(skb, GFP_ATOMIC);
+   if (!skb)
+   return;
+   alloc = true;
+   }
+
+   while (tmp) {
+   next = list_next_or_null_rcu(&nn->netlink_tap_all, &tmp->list,
+   struct netlink_tap, list);
+
+   ret = __netlink_deliver_tap_skb(skb, tmp->dev, alloc, !next);
if (unlikely(ret))
break;
+   tmp = next;
}
 }
 
-- 
2.16.2



Re: [PATCH][net-next] netlink: avoid to allocate full skb when sending to many devices

2018-09-18 Thread Li,Rongqing
> On 09/17/2018 10:26 PM, Li RongQing wrote:
> > if skb->head is vmalloc address, when this skb is delivered, full
> > allocation for this skb is required, if there are many devices, the
> > ---
> >  net/netlink/af_netlink.c | 14 --
> >  1 file changed, 8 insertions(+), 6 deletions(-)
> >
> >
> 
> This looks very broken to me.
> 
> Only the original skb (given as an argument to __netlink_deliver_tap()) is
> guaranteed to not disappear while the loop is performed.
> 
> (There is no skb_clone() after the first netlink_to_full_skb())
> 

Thank you;

I will rework it

-RongQing


[PATCH][net-next] netlink: avoid to allocate full skb when sending to many devices

2018-09-17 Thread Li RongQing
If skb->head is a vmalloc address, a full allocation of the skb is
required when it is delivered; if there are many devices, the full
allocation will be done for every device.

Now reuse the skb allocated for the first device when iterating over
the other devices, reducing full allocations and speeding up delivery.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/netlink/af_netlink.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e3a0538ec0be..095b99e3c1fb 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -278,11 +278,11 @@ static bool netlink_filter_tap(const struct sk_buff *skb)
return false;
 }
 
-static int __netlink_deliver_tap_skb(struct sk_buff *skb,
+static int __netlink_deliver_tap_skb(struct sk_buff **skb,
 struct net_device *dev)
 {
struct sk_buff *nskb;
-   struct sock *sk = skb->sk;
+   struct sock *sk = (*skb)->sk;
int ret = -ENOMEM;
 
if (!net_eq(dev_net(dev), sock_net(sk)))
@@ -290,10 +290,12 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
dev_hold(dev);
 
-   if (is_vmalloc_addr(skb->head))
-   nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
+   if (is_vmalloc_addr((*skb)->head)) {
+   nskb = netlink_to_full_skb(*skb, GFP_ATOMIC);
+   *skb = nskb;
+   }
else
-   nskb = skb_clone(skb, GFP_ATOMIC);
+   nskb = skb_clone(*skb, GFP_ATOMIC);
if (nskb) {
nskb->dev = dev;
nskb->protocol = htons((u16) sk->sk_protocol);
@@ -318,7 +320,7 @@ static void __netlink_deliver_tap(struct sk_buff *skb, 
struct netlink_tap_net *n
return;
 
list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
-   ret = __netlink_deliver_tap_skb(skb, tmp->dev);
+   ret = __netlink_deliver_tap_skb(&skb, tmp->dev);
if (unlikely(ret))
break;
}
-- 
2.16.2



[PATCH][net-next] veth: rename pcpu_vstats as pcpu_lstats

2018-09-17 Thread Li RongQing
struct pcpu_vstats and struct pcpu_lstats have the same members and
usage, and pcpu_lstats is used in many files, so rename pcpu_vstats
to pcpu_lstats to remove the duplicate definition.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 drivers/net/veth.c| 22 --
 include/linux/netdevice.h |  1 -
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index bc8faf13a731..aeecb5892e26 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -37,12 +37,6 @@
 #define VETH_XDP_TXBIT(0)
 #define VETH_XDP_REDIR BIT(1)
 
-struct pcpu_vstats {
-   u64 packets;
-   u64 bytes;
-   struct u64_stats_sync   syncp;
-};
-
 struct veth_rq {
struct napi_struct  xdp_napi;
struct net_device   *dev;
@@ -217,7 +211,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct 
net_device *dev)
 
skb_tx_timestamp(skb);
if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
-   struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+   struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats);
 
u64_stats_update_begin(&stats->syncp);
stats->bytes += length;
@@ -236,7 +230,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct 
net_device *dev)
return NETDEV_TX_OK;
 }
 
-static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
+static u64 veth_stats_one(struct pcpu_lstats *result, struct net_device *dev)
 {
struct veth_priv *priv = netdev_priv(dev);
int cpu;
@@ -244,7 +238,7 @@ static u64 veth_stats_one(struct pcpu_vstats *result, 
struct net_device *dev)
result->packets = 0;
result->bytes = 0;
for_each_possible_cpu(cpu) {
-   struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu);
+   struct pcpu_lstats *stats = per_cpu_ptr(dev->lstats, cpu);
u64 packets, bytes;
unsigned int start;
 
@@ -264,7 +258,7 @@ static void veth_get_stats64(struct net_device *dev,
 {
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
-   struct pcpu_vstats one;
+   struct pcpu_lstats one;
 
tot->tx_dropped = veth_stats_one(&one, dev);
tot->tx_bytes = one.bytes;
@@ -830,13 +824,13 @@ static int veth_dev_init(struct net_device *dev)
 {
int err;
 
-   dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats);
-   if (!dev->vstats)
+   dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
+   if (!dev->lstats)
return -ENOMEM;
 
err = veth_alloc_queues(dev);
if (err) {
-   free_percpu(dev->vstats);
+   free_percpu(dev->lstats);
return err;
}
 
@@ -846,7 +840,7 @@ static int veth_dev_init(struct net_device *dev)
 static void veth_dev_free(struct net_device *dev)
 {
veth_free_queues(dev);
-   free_percpu(dev->vstats);
+   free_percpu(dev->lstats);
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index baed5d5088c5..1cbbf77a685f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2000,7 +2000,6 @@ struct net_device {
struct pcpu_lstats __percpu *lstats;
struct pcpu_sw_netstats __percpu*tstats;
struct pcpu_dstats __percpu *dstats;
-   struct pcpu_vstats __percpu *vstats;
};
 
 #if IS_ENABLED(CONFIG_GARP)
-- 
2.16.2



[PATCH][net-next] net: move definition of pcpu_lstats to header file

2018-09-14 Thread Li RongQing
pcpu_lstats is defined in several files, so unify the definitions
into one and move it to a header file.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 drivers/net/loopback.c|  6 --
 drivers/net/nlmon.c   |  6 --
 drivers/net/vsockmon.c| 14 --
 include/linux/netdevice.h |  6 ++
 4 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 30612497643c..a7207fa7e451 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -59,12 +59,6 @@
 #include 
 #include 
 
-struct pcpu_lstats {
-   u64 packets;
-   u64 bytes;
-   struct u64_stats_sync   syncp;
-};
-
 /* The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
index 4b22955de191..dd0db7534cb3 100644
--- a/drivers/net/nlmon.c
+++ b/drivers/net/nlmon.c
@@ -6,12 +6,6 @@
 #include 
 #include 
 
-struct pcpu_lstats {
-   u64 packets;
-   u64 bytes;
-   struct u64_stats_sync syncp;
-};
-
 static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev)
 {
int len = skb->len;
diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c
index c28bdce14fd5..7bad5c95551f 100644
--- a/drivers/net/vsockmon.c
+++ b/drivers/net/vsockmon.c
@@ -11,12 +11,6 @@
 #define DEFAULT_MTU (VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + \
 sizeof(struct af_vsockmon_hdr))
 
-struct pcpu_lstats {
-   u64 rx_packets;
-   u64 rx_bytes;
-   struct u64_stats_sync syncp;
-};
-
 static int vsockmon_dev_init(struct net_device *dev)
 {
dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
@@ -56,8 +50,8 @@ static netdev_tx_t vsockmon_xmit(struct sk_buff *skb, struct 
net_device *dev)
struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats);
 
u64_stats_update_begin(&stats->syncp);
-   stats->rx_bytes += len;
-   stats->rx_packets++;
+   stats->bytes += len;
+   stats->packets++;
u64_stats_update_end(&stats->syncp);
 
dev_kfree_skb(skb);
@@ -80,8 +74,8 @@ vsockmon_get_stats64(struct net_device *dev, struct 
rtnl_link_stats64 *stats)
 
do {
start = u64_stats_fetch_begin_irq(&vstats->syncp);
-   tbytes = vstats->rx_bytes;
-   tpackets = vstats->rx_packets;
+   tbytes = vstats->bytes;
+   tpackets = vstats->packets;
} while (u64_stats_fetch_retry_irq(&vstats->syncp, start));
 
packets += tpackets;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e2b3bd750c98..baed5d5088c5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2382,6 +2382,12 @@ struct pcpu_sw_netstats {
struct u64_stats_sync   syncp;
 };
 
+struct pcpu_lstats {
+   u64 packets;
+   u64 bytes;
+   struct u64_stats_sync syncp;
+};
+
 #define __netdev_alloc_pcpu_stats(type, gfp)   \
 ({ \
typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
-- 
2.16.2


