Re: [PATCH] atl1c: optimize rx loop

2021-03-21 Thread Sieng Piaw Liew
On Fri, Mar 19, 2021 at 05:15:35AM +0100, Willy Tarreau wrote:
> On Fri, Mar 19, 2021 at 12:04:47PM +0800, Sieng Piaw Liew wrote:
> > Remove this trivial bit of inefficiency from the rx receive loop,
> > results in increase of a few Mbps in iperf3. Tested on Intel Core2
> > platform.
> > 
> > Signed-off-by: Sieng Piaw Liew 
> > ---
> >  drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 4 +---
> >  1 file changed, 1 insertion(+), 3 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 
> > b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
> > index 3f65f2b370c5..b995f9a0479c 100644
> > --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
> > +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
> > @@ -1796,9 +1796,7 @@ static void atl1c_clean_rx_irq(struct atl1c_adapter 
> > *adapter,
> > struct atl1c_recv_ret_status *rrs;
> > struct atl1c_buffer *buffer_info;
> >  
> > -   while (1) {
> > -   if (*work_done >= work_to_do)
> > -   break;
> > +   while (*work_done < work_to_do) {
> 
> It should not change anything, or only based on the compiler's optimization
> and should not result in a measurable difference because what it does is
> exactly the same. Have you really compared the compiled output code to
> explain the difference ? I strongly suspect you'll find no difference at
> all.
> 
> Thus for me it's certainly not an optimization, it could be qualified as
> a cleanup to improve code readability however.
> 
> Willy

You're right. Objdump and diff showed no difference.

Regards,
Sieng Piaw


[PATCH] atl1c: use napi_alloc_skb

2021-03-18 Thread Sieng Piaw Liew
Using napi_alloc_skb in NAPI context avoids enable/disable IRQs, which
increases iperf3 result by a few Mbps. Since napi_alloc_skb() uses
NET_IP_ALIGN, convert other alloc methods to the same padding. Tested
on Intel Core2 and AMD K10 platforms.

Signed-off-by: Sieng Piaw Liew 
---
 .../net/ethernet/atheros/atl1c/atl1c_main.c   | 28 +++
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 
b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 3f65f2b370c5..66325ba5b3a1 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -47,7 +47,7 @@ static void atl1c_down(struct atl1c_adapter *adapter);
 static int atl1c_reset_mac(struct atl1c_hw *hw);
 static void atl1c_reset_dma_ring(struct atl1c_adapter *adapter);
 static int atl1c_configure(struct atl1c_adapter *adapter);
-static int atl1c_alloc_rx_buffer(struct atl1c_adapter *adapter);
+static int atl1c_alloc_rx_buffer(struct atl1c_adapter *adapter, bool 
napi_mode);
 
 
 static const u32 atl1c_default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
@@ -470,7 +470,7 @@ static void atl1c_set_rxbufsize(struct atl1c_adapter 
*adapter,
adapter->rx_buffer_len = mtu > AT_RX_BUF_SIZE ?
roundup(mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN, 8) : 
AT_RX_BUF_SIZE;
 
-   head_size = SKB_DATA_ALIGN(adapter->rx_buffer_len + NET_SKB_PAD) +
+   head_size = SKB_DATA_ALIGN(adapter->rx_buffer_len + NET_SKB_PAD + 
NET_IP_ALIGN) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
adapter->rx_frag_size = roundup_pow_of_two(head_size);
 }
@@ -1434,7 +1434,7 @@ static int atl1c_configure(struct atl1c_adapter *adapter)
atl1c_set_multi(netdev);
atl1c_restore_vlan(adapter);
 
-   num = atl1c_alloc_rx_buffer(adapter);
+   num = atl1c_alloc_rx_buffer(adapter, false);
if (unlikely(num == 0))
return -ENOMEM;
 
@@ -1650,14 +1650,20 @@ static inline void atl1c_rx_checksum(struct 
atl1c_adapter *adapter,
skb_checksum_none_assert(skb);
 }
 
-static struct sk_buff *atl1c_alloc_skb(struct atl1c_adapter *adapter)
+static struct sk_buff *atl1c_alloc_skb(struct atl1c_adapter *adapter,
+  bool napi_mode)
 {
struct sk_buff *skb;
struct page *page;
 
-   if (adapter->rx_frag_size > PAGE_SIZE)
-   return netdev_alloc_skb(adapter->netdev,
-   adapter->rx_buffer_len);
+   if (adapter->rx_frag_size > PAGE_SIZE) {
+   if (likely(napi_mode))
+   return napi_alloc_skb(&adapter->napi,
+ adapter->rx_buffer_len);
+   else
+   return netdev_alloc_skb_ip_align(adapter->netdev,
+
adapter->rx_buffer_len);
+   }
 
page = adapter->rx_page;
if (!page) {
@@ -1670,7 +1676,7 @@ static struct sk_buff *atl1c_alloc_skb(struct 
atl1c_adapter *adapter)
skb = build_skb(page_address(page) + adapter->rx_page_offset,
adapter->rx_frag_size);
if (likely(skb)) {
-   skb_reserve(skb, NET_SKB_PAD);
+   skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
adapter->rx_page_offset += adapter->rx_frag_size;
if (adapter->rx_page_offset >= PAGE_SIZE)
adapter->rx_page = NULL;
@@ -1680,7 +1686,7 @@ static struct sk_buff *atl1c_alloc_skb(struct 
atl1c_adapter *adapter)
return skb;
 }
 
-static int atl1c_alloc_rx_buffer(struct atl1c_adapter *adapter)
+static int atl1c_alloc_rx_buffer(struct atl1c_adapter *adapter, bool napi_mode)
 {
	struct atl1c_rfd_ring *rfd_ring = &adapter->rfd_ring;
struct pci_dev *pdev = adapter->pdev;
@@ -1701,7 +1707,7 @@ static int atl1c_alloc_rx_buffer(struct atl1c_adapter 
*adapter)
while (next_info->flags & ATL1C_BUFFER_FREE) {
rfd_desc = ATL1C_RFD_DESC(rfd_ring, rfd_next_to_use);
 
-   skb = atl1c_alloc_skb(adapter);
+   skb = atl1c_alloc_skb(adapter, napi_mode);
if (unlikely(!skb)) {
if (netif_msg_rx_err(adapter))
	dev_warn(&pdev->dev, "alloc rx buffer 
failed\n");
@@ -1857,7 +1863,7 @@ static void atl1c_clean_rx_irq(struct atl1c_adapter 
*adapter,
count++;
}
if (count)
-   atl1c_alloc_rx_buffer(adapter);
+   atl1c_alloc_rx_buffer(adapter, true);
 }
 
 /**
-- 
2.17.1



[PATCH] atl1c: optimize rx loop

2021-03-18 Thread Sieng Piaw Liew
Remove this trivial bit of inefficiency from the rx receive loop,
results in increase of a few Mbps in iperf3. Tested on Intel Core2
platform.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 
b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 3f65f2b370c5..b995f9a0479c 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -1796,9 +1796,7 @@ static void atl1c_clean_rx_irq(struct atl1c_adapter 
*adapter,
struct atl1c_recv_ret_status *rrs;
struct atl1c_buffer *buffer_info;
 
-   while (1) {
-   if (*work_done >= work_to_do)
-   break;
+   while (*work_done < work_to_do) {
rrs = ATL1C_RRD_DESC(rrd_ring, rrd_ring->next_to_clean);
if (likely(RRS_RXD_IS_VALID(rrs->word3))) {
rfd_num = (rrs->word0 >> RRS_RX_RFD_CNT_SHIFT) &
-- 
2.17.1



[PATCH net-next] atl1c: switch to napi_gro_receive

2021-03-18 Thread Sieng Piaw Liew
Changing to napi_gro_receive() improves efficiency significantly. Tested
on Intel Core2-based motherboards and iperf3.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 
b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 3f65f2b370c5..3e440c2dc68a 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -1851,7 +1851,7 @@ static void atl1c_clean_rx_irq(struct atl1c_adapter 
*adapter,
vlan = le16_to_cpu(vlan);
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan);
}
-   netif_receive_skb(skb);
+   napi_gro_receive(&adapter->napi, skb);
 
(*work_done)++;
count++;
-- 
2.17.1



[PATCH net] atl1c: switch to napi_gro_receive

2021-02-21 Thread Sieng Piaw Liew
Changing to napi_gro_receive() improves efficiency significantly. Tested
on Intel Core2-based motherboards and iperf3.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 
b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 3f65f2b370c5..3e440c2dc68a 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -1851,7 +1851,7 @@ static void atl1c_clean_rx_irq(struct atl1c_adapter 
*adapter,
vlan = le16_to_cpu(vlan);
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan);
}
-   netif_receive_skb(skb);
+   napi_gro_receive(&adapter->napi, skb);
 
(*work_done)++;
count++;
-- 
2.17.1



[PATCH net] bcm63xx_enet: fix sporadic kernel panic

2021-02-21 Thread Sieng Piaw Liew
In ndo_stop functions, netdev_completed_queue() is called during forced
tx reclaim, after netdev_reset_queue(). This may trigger kernel panic if
there is any tx skb left.

This patch moves netdev_reset_queue() to after tx reclaim, so BQL can
complete successfully then reset.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index fd8767213165..977f097fc7bf 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -1192,7 +1192,6 @@ static int bcm_enet_stop(struct net_device *dev)
	kdev = &priv->pdev->dev;
 
netif_stop_queue(dev);
-   netdev_reset_queue(dev);
	napi_disable(&priv->napi);
if (priv->has_phy)
phy_stop(dev->phydev);
@@ -1231,6 +1230,9 @@ static int bcm_enet_stop(struct net_device *dev)
if (priv->has_phy)
phy_disconnect(dev->phydev);
 
+   /* reset BQL after forced tx reclaim to prevent kernel panic */
+   netdev_reset_queue(dev);
+
return 0;
 }
 
@@ -2343,7 +2345,6 @@ static int bcm_enetsw_stop(struct net_device *dev)
 
del_timer_sync(>swphy_poll);
netif_stop_queue(dev);
-   netdev_reset_queue(dev);
napi_disable(>napi);
del_timer_sync(>rx_timeout);
 
@@ -2371,6 +2372,9 @@ static int bcm_enetsw_stop(struct net_device *dev)
free_irq(priv->irq_tx, dev);
free_irq(priv->irq_rx, dev);
 
+   /* reset BQL after forced tx reclaim to prevent kernel panic */
+   netdev_reset_queue(dev);
+
return 0;
 }
 
-- 
2.17.1



[PATCH net-next v3 7/7] bcm63xx_enet: improve rx loop

2021-01-06 Thread Sieng Piaw Liew
Use existing rx processed count to track against budget, thereby making
budget decrement operation redundant.

rx_desc_count can be calculated outside the rx loop, making the loop a
bit smaller.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index c11491429ed2..fd8767213165 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -339,7 +339,6 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
priv->rx_curr_desc++;
if (priv->rx_curr_desc == priv->rx_ring_size)
priv->rx_curr_desc = 0;
-   priv->rx_desc_count--;
 
/* if the packet does not have start of packet _and_
 * end of packet flag set, then just recycle it */
@@ -404,9 +403,10 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
dev->stats.rx_bytes += len;
	list_add_tail(&skb->list, &rx_list);
 
-   } while (--budget > 0);
+   } while (processed < budget);
 
netif_receive_skb_list(_list);
+   priv->rx_desc_count -= processed;
 
if (processed || !priv->rx_desc_count) {
bcm_enet_refill_rx(dev, true);
-- 
2.17.1



[PATCH net-next v3 3/7] bcm63xx_enet: add xmit_more support

2021-01-06 Thread Sieng Piaw Liew
Support bulking hardware TX queue by using netdev_xmit_more().

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 90f8214b4d22..21744dae30ce 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -634,7 +634,8 @@ bcm_enet_start_xmit(struct sk_buff *skb, struct net_device 
*dev)
netdev_sent_queue(dev, skb->len);
 
/* kick tx dma */
-   enet_dmac_writel(priv, priv->dma_chan_en_mask,
+   if (!netdev_xmit_more() || !priv->tx_desc_count)
+   enet_dmac_writel(priv, priv->dma_chan_en_mask,
 ENETDMAC_CHANCFG, priv->tx_chan);
 
/* stop queue if no more desc available */
-- 
2.17.1



[PATCH net-next v3 6/7] bcm63xx_enet: convert to build_skb

2021-01-06 Thread Sieng Piaw Liew
We can increase the efficiency of rx path by using buffers to receive
packets then build SKBs around them just before passing into the network
stack. In contrast, preallocating SKBs too early reduces CPU cache
efficiency.

Check if we're in NAPI context when refilling RX. Normally we're almost
always running in NAPI context. Dispatch to napi_alloc_frag directly
instead of relying on netdev_alloc_frag which does the same but
with the overhead of local_bh_disable/enable.

Tested on BCM6328 320 MHz and iperf3 -M 512 to measure packet/sec
performance. Included netif_receive_skb_list and NET_IP_ALIGN
optimizations.

Before:
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-10.00  sec  49.9 MBytes  41.9 Mbits/sec  197 sender
[  4]   0.00-10.00  sec  49.3 MBytes  41.3 Mbits/secreceiver

After:
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-30.00  sec   171 MBytes  47.8 Mbits/sec  272 sender
[  4]   0.00-30.00  sec   170 MBytes  47.6 Mbits/secreceiver

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 111 ++-
 drivers/net/ethernet/broadcom/bcm63xx_enet.h |  14 ++-
 2 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index e34b05b10e43..c11491429ed2 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -220,7 +220,7 @@ static void bcm_enet_mdio_write_mii(struct net_device *dev, 
int mii_id,
 /*
  * refill rx queue
  */
-static int bcm_enet_refill_rx(struct net_device *dev)
+static int bcm_enet_refill_rx(struct net_device *dev, bool napi_mode)
 {
struct bcm_enet_priv *priv;
 
@@ -228,29 +228,29 @@ static int bcm_enet_refill_rx(struct net_device *dev)
 
while (priv->rx_desc_count < priv->rx_ring_size) {
struct bcm_enet_desc *desc;
-   struct sk_buff *skb;
-   dma_addr_t p;
int desc_idx;
u32 len_stat;
 
desc_idx = priv->rx_dirty_desc;
	desc = &priv->rx_desc_cpu[desc_idx];
 
-   if (!priv->rx_skb[desc_idx]) {
-   if (priv->enet_is_sw)
-   skb = netdev_alloc_skb_ip_align(dev, 
priv->rx_skb_size);
+   if (!priv->rx_buf[desc_idx]) {
+   void *buf;
+
+   if (likely(napi_mode))
+   buf = napi_alloc_frag(priv->rx_frag_size);
else
-   skb = netdev_alloc_skb(dev, priv->rx_skb_size);
-   if (!skb)
+   buf = netdev_alloc_frag(priv->rx_frag_size);
+   if (unlikely(!buf))
break;
-   priv->rx_skb[desc_idx] = skb;
-   p = dma_map_single(>pdev->dev, skb->data,
-  priv->rx_skb_size,
-  DMA_FROM_DEVICE);
-   desc->address = p;
+   priv->rx_buf[desc_idx] = buf;
+   desc->address = dma_map_single(>pdev->dev,
+  buf + 
priv->rx_buf_offset,
+  priv->rx_buf_size,
+  DMA_FROM_DEVICE);
}
 
-   len_stat = priv->rx_skb_size << DMADESC_LENGTH_SHIFT;
+   len_stat = priv->rx_buf_size << DMADESC_LENGTH_SHIFT;
len_stat |= DMADESC_OWNER_MASK;
if (priv->rx_dirty_desc == priv->rx_ring_size - 1) {
len_stat |= (DMADESC_WRAP_MASK >> priv->dma_desc_shift);
@@ -290,7 +290,7 @@ static void bcm_enet_refill_rx_timer(struct timer_list *t)
struct net_device *dev = priv->net_dev;
 
	spin_lock(&priv->rx_lock);
-   bcm_enet_refill_rx(dev);
+   bcm_enet_refill_rx(dev, false);
	spin_unlock(&priv->rx_lock);
 }
 
@@ -320,6 +320,7 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
int desc_idx;
u32 len_stat;
unsigned int len;
+   void *buf;
 
desc_idx = priv->rx_curr_desc;
desc = >rx_desc_cpu[desc_idx];
@@ -365,16 +366,14 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
}
 
/* valid packet */
-   skb = priv->rx_skb[desc_idx];
+   buf = priv->rx_buf[desc_idx];
len = (len_stat & DMADESC_LENGTH_MASK) >> DMADESC_LENGTH_SHIFT;
/* don't include FCS */
len -= 4;
 
  

[PATCH net-next v3 5/7] bcm63xx_enet: consolidate rx SKB ring cleanup code

2021-01-06 Thread Sieng Piaw Liew
The rx SKB ring use the same code for cleanup at various points.
Combine them into a function to reduce lines of code.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 72 ++--
 1 file changed, 22 insertions(+), 50 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 96d56c3e2cc9..e34b05b10e43 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -860,6 +860,24 @@ static void bcm_enet_adjust_link(struct net_device *dev)
priv->pause_tx ? "tx" : "off");
 }
 
+static void bcm_enet_free_rx_skb_ring(struct device *kdev, struct 
bcm_enet_priv *priv)
+{
+   int i;
+
+   for (i = 0; i < priv->rx_ring_size; i++) {
+   struct bcm_enet_desc *desc;
+
+   if (!priv->rx_skb[i])
+   continue;
+
+   desc = >rx_desc_cpu[i];
+   dma_unmap_single(kdev, desc->address, priv->rx_skb_size,
+DMA_FROM_DEVICE);
+   kfree_skb(priv->rx_skb[i]);
+   }
+   kfree(priv->rx_skb);
+}
+
 /*
  * open callback, allocate dma rings & buffers and start rx operation
  */
@@ -1084,18 +1102,7 @@ static int bcm_enet_open(struct net_device *dev)
return 0;
 
 out:
-   for (i = 0; i < priv->rx_ring_size; i++) {
-   struct bcm_enet_desc *desc;
-
-   if (!priv->rx_skb[i])
-   continue;
-
-   desc = >rx_desc_cpu[i];
-   dma_unmap_single(kdev, desc->address, priv->rx_skb_size,
-DMA_FROM_DEVICE);
-   kfree_skb(priv->rx_skb[i]);
-   }
-   kfree(priv->rx_skb);
+   bcm_enet_free_rx_skb_ring(kdev, priv);
 
 out_free_tx_skb:
kfree(priv->tx_skb);
@@ -1174,7 +1181,6 @@ static int bcm_enet_stop(struct net_device *dev)
 {
struct bcm_enet_priv *priv;
struct device *kdev;
-   int i;
 
priv = netdev_priv(dev);
kdev = >pdev->dev;
@@ -1203,20 +1209,9 @@ static int bcm_enet_stop(struct net_device *dev)
bcm_enet_tx_reclaim(dev, 1);
 
/* free the rx skb ring */
-   for (i = 0; i < priv->rx_ring_size; i++) {
-   struct bcm_enet_desc *desc;
-
-   if (!priv->rx_skb[i])
-   continue;
-
-   desc = >rx_desc_cpu[i];
-   dma_unmap_single(kdev, desc->address, priv->rx_skb_size,
-DMA_FROM_DEVICE);
-   kfree_skb(priv->rx_skb[i]);
-   }
+   bcm_enet_free_rx_skb_ring(kdev, priv);
 
/* free remaining allocated memory */
-   kfree(priv->rx_skb);
kfree(priv->tx_skb);
dma_free_coherent(kdev, priv->rx_desc_alloc_size,
  priv->rx_desc_cpu, priv->rx_desc_dma);
@@ -2303,18 +2298,7 @@ static int bcm_enetsw_open(struct net_device *dev)
return 0;
 
 out:
-   for (i = 0; i < priv->rx_ring_size; i++) {
-   struct bcm_enet_desc *desc;
-
-   if (!priv->rx_skb[i])
-   continue;
-
-   desc = >rx_desc_cpu[i];
-   dma_unmap_single(kdev, desc->address, priv->rx_skb_size,
-DMA_FROM_DEVICE);
-   kfree_skb(priv->rx_skb[i]);
-   }
-   kfree(priv->rx_skb);
+   bcm_enet_free_rx_skb_ring(kdev, priv);
 
 out_free_tx_skb:
kfree(priv->tx_skb);
@@ -2343,7 +2327,6 @@ static int bcm_enetsw_stop(struct net_device *dev)
 {
struct bcm_enet_priv *priv;
struct device *kdev;
-   int i;
 
priv = netdev_priv(dev);
kdev = >pdev->dev;
@@ -2366,20 +2349,9 @@ static int bcm_enetsw_stop(struct net_device *dev)
bcm_enet_tx_reclaim(dev, 1);
 
/* free the rx skb ring */
-   for (i = 0; i < priv->rx_ring_size; i++) {
-   struct bcm_enet_desc *desc;
-
-   if (!priv->rx_skb[i])
-   continue;
-
-   desc = >rx_desc_cpu[i];
-   dma_unmap_single(kdev, desc->address, priv->rx_skb_size,
-DMA_FROM_DEVICE);
-   kfree_skb(priv->rx_skb[i]);
-   }
+   bcm_enet_free_rx_skb_ring(kdev, priv);
 
/* free remaining allocated memory */
-   kfree(priv->rx_skb);
kfree(priv->tx_skb);
dma_free_coherent(kdev, priv->rx_desc_alloc_size,
  priv->rx_desc_cpu, priv->rx_desc_dma);
-- 
2.17.1



[PATCH net-next v3 4/7] bcm63xx_enet: alloc rx skb with NET_IP_ALIGN

2021-01-06 Thread Sieng Piaw Liew
Use netdev_alloc_skb_ip_align on newer SoCs with integrated switch
(enetsw) when refilling RX. Increases packet processing performance
by 30% (with netif_receive_skb_list).

Non-enetsw SoCs cannot function with the extra pad so continue to use
the regular netdev_alloc_skb.

Tested on BCM6328 320 MHz and iperf3 -M 512 to measure packet/sec
performance.

Before:
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-30.00 sec 120 MBytes 33.7 Mbits/sec 277 sender
[ 4] 0.00-30.00 sec 120 MBytes 33.5 Mbits/sec receiver

After (+netif_receive_skb_list):
[ 4] 0.00-30.00 sec 155 MBytes 43.3 Mbits/sec 354 sender
[ 4] 0.00-30.00 sec 154 MBytes 43.1 Mbits/sec receiver

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 21744dae30ce..96d56c3e2cc9 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -237,7 +237,10 @@ static int bcm_enet_refill_rx(struct net_device *dev)
desc = >rx_desc_cpu[desc_idx];
 
if (!priv->rx_skb[desc_idx]) {
-   skb = netdev_alloc_skb(dev, priv->rx_skb_size);
+   if (priv->enet_is_sw)
+   skb = netdev_alloc_skb_ip_align(dev, 
priv->rx_skb_size);
+   else
+   skb = netdev_alloc_skb(dev, priv->rx_skb_size);
if (!skb)
break;
priv->rx_skb[desc_idx] = skb;
-- 
2.17.1



[PATCH net-next v3 2/7] bcm63xx_enet: add BQL support

2021-01-06 Thread Sieng Piaw Liew
Add Byte Queue Limits support to reduce/remove bufferbloat in
bcm63xx_enet.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index b82b7805c36a..90f8214b4d22 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -417,9 +417,11 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
 static int bcm_enet_tx_reclaim(struct net_device *dev, int force)
 {
struct bcm_enet_priv *priv;
+   unsigned int bytes;
int released;
 
priv = netdev_priv(dev);
+   bytes = 0;
released = 0;
 
while (priv->tx_desc_count < priv->tx_ring_size) {
@@ -456,10 +458,13 @@ static int bcm_enet_tx_reclaim(struct net_device *dev, 
int force)
if (desc->len_stat & DMADESC_UNDER_MASK)
dev->stats.tx_errors++;
 
+   bytes += skb->len;
dev_kfree_skb(skb);
released++;
}
 
+   netdev_completed_queue(dev, released, bytes);
+
if (netif_queue_stopped(dev) && released)
netif_wake_queue(dev);
 
@@ -626,6 +631,8 @@ bcm_enet_start_xmit(struct sk_buff *skb, struct net_device 
*dev)
desc->len_stat = len_stat;
wmb();
 
+   netdev_sent_queue(dev, skb->len);
+
/* kick tx dma */
enet_dmac_writel(priv, priv->dma_chan_en_mask,
 ENETDMAC_CHANCFG, priv->tx_chan);
@@ -1169,6 +1176,7 @@ static int bcm_enet_stop(struct net_device *dev)
kdev = >pdev->dev;
 
netif_stop_queue(dev);
+   netdev_reset_queue(dev);
napi_disable(>napi);
if (priv->has_phy)
phy_stop(dev->phydev);
@@ -2338,6 +2346,7 @@ static int bcm_enetsw_stop(struct net_device *dev)
 
del_timer_sync(>swphy_poll);
netif_stop_queue(dev);
+   netdev_reset_queue(dev);
napi_disable(>napi);
del_timer_sync(>rx_timeout);
 
-- 
2.17.1



[PATCH net-next v3 1/7] bcm63xx_enet: batch process rx path

2021-01-06 Thread Sieng Piaw Liew
Use netif_receive_skb_list to batch process rx skb.
Tested on BCM6328 320 MHz using iperf3 -M 512, increasing performance
by 12.5%.

Before:
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-30.00  sec   120 MBytes  33.7 Mbits/sec  277 sender
[  4]   0.00-30.00  sec   120 MBytes  33.5 Mbits/secreceiver

After:
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-30.00  sec   136 MBytes  37.9 Mbits/sec  203 sender
[  4]   0.00-30.00  sec   135 MBytes  37.7 Mbits/secreceiver

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 916824cca3fd..b82b7805c36a 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -297,10 +297,12 @@ static void bcm_enet_refill_rx_timer(struct timer_list *t)
 static int bcm_enet_receive_queue(struct net_device *dev, int budget)
 {
struct bcm_enet_priv *priv;
+   struct list_head rx_list;
struct device *kdev;
int processed;
 
priv = netdev_priv(dev);
+   INIT_LIST_HEAD(&rx_list);
kdev = >pdev->dev;
processed = 0;
 
@@ -391,10 +393,12 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
skb->protocol = eth_type_trans(skb, dev);
dev->stats.rx_packets++;
dev->stats.rx_bytes += len;
-   netif_receive_skb(skb);
+   list_add_tail(&skb->list, &rx_list);
 
} while (--budget > 0);
 
+   netif_receive_skb_list(&rx_list);
+
if (processed || !priv->rx_desc_count) {
bcm_enet_refill_rx(dev);
 
-- 
2.17.1



[PATCH net-next v3 0/7] bcm63xx_enet: major makeover of driver

2021-01-06 Thread Sieng Piaw Liew
This patch series aim to improve the bcm63xx_enet driver by integrating the
latest networking features, i.e. batched rx processing, BQL, build_skb,
etc.

The newer enetsw SoCs are found to be able to do unaligned rx DMA by adding
NET_IP_ALIGN padding which, combined with these patches, improved packet
processing performance by ~50% on BCM6328.

Older non-enetsw SoCs still benefit mainly from rx batching. Performance
improvement of ~30% is observed on BCM6333.

The BCM63xx SoCs are designed for routers. As such, having BQL is
beneficial as well as trivial to add.

v3:
* Simplify xmit_more patch by not moving around the code needlessly.
* Fix indentation in xmit_more patch.
* Fix indentation in build_skb patch.
* Split rx ring cleanup patch from build_skb patch and precede build_skb
  patch for better understanding, as suggested by Florian Fainelli.

v2:
* Add xmit_more support and rx loop improvisation patches.
* Moved BQL netdev_reset_queue() to bcm_enet_stop()/bcm_enetsw_stop()
  functions as suggested by Florian Fainelli.
* Improved commit messages.

Sieng Piaw Liew (7):
  bcm63xx_enet: batch process rx path
  bcm63xx_enet: add BQL support
  bcm63xx_enet: add xmit_more support
  bcm63xx_enet: alloc rx skb with NET_IP_ALIGN
  bcm63xx_enet: consolidate rx SKB ring cleanup code
  bcm63xx_enet: convert to build_skb
  bcm63xx_enet: improve rx loop

 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 186 +--
 drivers/net/ethernet/broadcom/bcm63xx_enet.h |  14 +-
 2 files changed, 103 insertions(+), 97 deletions(-)

-- 
2.17.1



[PATCH net-next v2 6/6] bcm63xx_enet: improve rx loop

2020-12-24 Thread Sieng Piaw Liew
Use existing rx processed count to track against budget, thereby making
budget decrement operation redundant.

rx_desc_count can be calculated outside the rx loop, making the loop a
bit smaller.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 8c2e97311a2c..5ff0d39be2b2 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -339,7 +339,6 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
priv->rx_curr_desc++;
if (priv->rx_curr_desc == priv->rx_ring_size)
priv->rx_curr_desc = 0;
-   priv->rx_desc_count--;
 
/* if the packet does not have start of packet _and_
 * end of packet flag set, then just recycle it */
@@ -404,9 +403,10 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
dev->stats.rx_bytes += len;
list_add_tail(>list, _list);
 
-   } while (--budget > 0);
+   } while (processed < budget);
 
netif_receive_skb_list(_list);
+   priv->rx_desc_count -= processed;
 
if (processed || !priv->rx_desc_count) {
bcm_enet_refill_rx(dev, true);
-- 
2.17.1



[PATCH net-next v2 5/6] bcm63xx_enet: convert to build_skb

2020-12-24 Thread Sieng Piaw Liew
We can increase the efficiency of rx path by using buffers to receive
packets then build SKBs around them just before passing into the network
stack. In contrast, preallocating SKBs too early reduces CPU cache
efficiency.

Check if we're in NAPI context when refilling RX. Normally we're almost
always running in NAPI context. Dispatch to napi_alloc_frag directly
instead of relying on netdev_alloc_frag which does the same but
with the overhead of local_bh_disable/enable.

Tested on BCM6328 320 MHz and iperf3 -M 512 to measure packet/sec
performance. Included netif_receive_skb_list and NET_IP_ALIGN
optimizations.

Before:
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-10.00  sec  49.9 MBytes  41.9 Mbits/sec  197 sender
[  4]   0.00-10.00  sec  49.3 MBytes  41.3 Mbits/secreceiver

After:
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-30.00  sec   171 MBytes  47.8 Mbits/sec  272 sender
[  4]   0.00-30.00  sec   170 MBytes  47.6 Mbits/secreceiver

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 157 +--
 drivers/net/ethernet/broadcom/bcm63xx_enet.h |  14 +-
 2 files changed, 80 insertions(+), 91 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 51976ed87d2d..8c2e97311a2c 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -220,7 +220,7 @@ static void bcm_enet_mdio_write_mii(struct net_device *dev, 
int mii_id,
 /*
  * refill rx queue
  */
-static int bcm_enet_refill_rx(struct net_device *dev)
+static int bcm_enet_refill_rx(struct net_device *dev, bool napi_mode)
 {
struct bcm_enet_priv *priv;
 
@@ -228,29 +228,29 @@ static int bcm_enet_refill_rx(struct net_device *dev)
 
while (priv->rx_desc_count < priv->rx_ring_size) {
struct bcm_enet_desc *desc;
-   struct sk_buff *skb;
-   dma_addr_t p;
int desc_idx;
u32 len_stat;
 
desc_idx = priv->rx_dirty_desc;
desc = >rx_desc_cpu[desc_idx];
 
-   if (!priv->rx_skb[desc_idx]) {
-   if (priv->enet_is_sw)
-   skb = netdev_alloc_skb_ip_align(dev, 
priv->rx_skb_size);
+   if (!priv->rx_buf[desc_idx]) {
+   void *buf;
+
+   if (likely(napi_mode))
+   buf = napi_alloc_frag(priv->rx_frag_size);
else
-   skb = netdev_alloc_skb(dev, priv->rx_skb_size);
-   if (!skb)
+   buf = netdev_alloc_frag(priv->rx_frag_size);
+   if (unlikely(!buf))
break;
-   priv->rx_skb[desc_idx] = skb;
-   p = dma_map_single(>pdev->dev, skb->data,
-  priv->rx_skb_size,
-  DMA_FROM_DEVICE);
-   desc->address = p;
+   priv->rx_buf[desc_idx] = buf;
+   desc->address = dma_map_single(>pdev->dev,
+  buf + 
priv->rx_buf_offset,
+  priv->rx_buf_size,
+  DMA_FROM_DEVICE);
}
 
-   len_stat = priv->rx_skb_size << DMADESC_LENGTH_SHIFT;
+   len_stat = priv->rx_buf_size << DMADESC_LENGTH_SHIFT;
len_stat |= DMADESC_OWNER_MASK;
if (priv->rx_dirty_desc == priv->rx_ring_size - 1) {
len_stat |= (DMADESC_WRAP_MASK >> priv->dma_desc_shift);
@@ -290,7 +290,7 @@ static void bcm_enet_refill_rx_timer(struct timer_list *t)
struct net_device *dev = priv->net_dev;
 
spin_lock(>rx_lock);
-   bcm_enet_refill_rx(dev);
+   bcm_enet_refill_rx(dev, false);
spin_unlock(>rx_lock);
 }
 
@@ -320,6 +320,7 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
int desc_idx;
u32 len_stat;
unsigned int len;
+   void *buf;
 
desc_idx = priv->rx_curr_desc;
desc = >rx_desc_cpu[desc_idx];
@@ -365,16 +366,14 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
}
 
/* valid packet */
-   skb = priv->rx_skb[desc_idx];
+   buf = priv->rx_buf[desc_idx];
len = (len_stat & DMADESC_LENGTH_MASK) >> DMADESC_LENGTH_SHIFT;
/* don't include FCS */
len -= 4;
 
  

[PATCH net-next v2 3/6] bcm63xx_enet: add xmit_more support

2020-12-24 Thread Sieng Piaw Liew
Support bulking hardware TX queue by using netdev_xmit_more().

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 90f8214b4d22..452968f168ed 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -633,14 +633,17 @@ bcm_enet_start_xmit(struct sk_buff *skb, struct 
net_device *dev)
 
netdev_sent_queue(dev, skb->len);
 
-   /* kick tx dma */
-   enet_dmac_writel(priv, priv->dma_chan_en_mask,
-ENETDMAC_CHANCFG, priv->tx_chan);
-
/* stop queue if no more desc available */
if (!priv->tx_desc_count)
netif_stop_queue(dev);
 
+   /* kick tx dma */
+if (!netdev_xmit_more() || !priv->tx_desc_count)
+enet_dmac_writel(priv, priv->dma_chan_en_mask,
+ ENETDMAC_CHANCFG, priv->tx_chan);
+
+
+
dev->stats.tx_bytes += skb->len;
dev->stats.tx_packets++;
ret = NETDEV_TX_OK;
-- 
2.17.1



[PATCH net-next v2 2/6] bcm63xx_enet: add BQL support

2020-12-24 Thread Sieng Piaw Liew
Add Byte Queue Limits support to reduce/remove bufferbloat in
bcm63xx_enet.

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index b82b7805c36a..90f8214b4d22 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -417,9 +417,11 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
 static int bcm_enet_tx_reclaim(struct net_device *dev, int force)
 {
struct bcm_enet_priv *priv;
+   unsigned int bytes;
int released;
 
priv = netdev_priv(dev);
+   bytes = 0;
released = 0;
 
while (priv->tx_desc_count < priv->tx_ring_size) {
@@ -456,10 +458,13 @@ static int bcm_enet_tx_reclaim(struct net_device *dev, 
int force)
if (desc->len_stat & DMADESC_UNDER_MASK)
dev->stats.tx_errors++;
 
+   bytes += skb->len;
dev_kfree_skb(skb);
released++;
}
 
+   netdev_completed_queue(dev, released, bytes);
+
if (netif_queue_stopped(dev) && released)
netif_wake_queue(dev);
 
@@ -626,6 +631,8 @@ bcm_enet_start_xmit(struct sk_buff *skb, struct net_device 
*dev)
desc->len_stat = len_stat;
wmb();
 
+   netdev_sent_queue(dev, skb->len);
+
/* kick tx dma */
enet_dmac_writel(priv, priv->dma_chan_en_mask,
 ENETDMAC_CHANCFG, priv->tx_chan);
@@ -1169,6 +1176,7 @@ static int bcm_enet_stop(struct net_device *dev)
	kdev = &priv->pdev->dev;
 
netif_stop_queue(dev);
+   netdev_reset_queue(dev);
	napi_disable(&priv->napi);
if (priv->has_phy)
phy_stop(dev->phydev);
@@ -2338,6 +2346,7 @@ static int bcm_enetsw_stop(struct net_device *dev)
 
	del_timer_sync(&priv->swphy_poll);
netif_stop_queue(dev);
+   netdev_reset_queue(dev);
	napi_disable(&priv->napi);
	del_timer_sync(&priv->rx_timeout);
 
-- 
2.17.1



[PATCH net-next v2 1/6] bcm63xx_enet: batch process rx path

2020-12-24 Thread Sieng Piaw Liew
Use netif_receive_skb_list to batch process rx skb.
Tested on BCM6328 320 MHz using iperf3 -M 512, increasing performance
by 12.5%.

Before:
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-30.00  sec   120 MBytes  33.7 Mbits/sec  277 sender
[  4]   0.00-30.00  sec   120 MBytes  33.5 Mbits/secreceiver

After:
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-30.00  sec   136 MBytes  37.9 Mbits/sec  203 sender
[  4]   0.00-30.00  sec   135 MBytes  37.7 Mbits/secreceiver

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 916824cca3fd..b82b7805c36a 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -297,10 +297,12 @@ static void bcm_enet_refill_rx_timer(struct timer_list *t)
 static int bcm_enet_receive_queue(struct net_device *dev, int budget)
 {
struct bcm_enet_priv *priv;
+   struct list_head rx_list;
struct device *kdev;
int processed;
 
priv = netdev_priv(dev);
+   INIT_LIST_HEAD(&rx_list);
	kdev = &priv->pdev->dev;
processed = 0;
 
@@ -391,10 +393,12 @@ static int bcm_enet_receive_queue(struct net_device *dev, 
int budget)
skb->protocol = eth_type_trans(skb, dev);
dev->stats.rx_packets++;
dev->stats.rx_bytes += len;
-   netif_receive_skb(skb);
+   list_add_tail(&skb->list, &rx_list);
 
} while (--budget > 0);
 
+   netif_receive_skb_list(&rx_list);
+
if (processed || !priv->rx_desc_count) {
bcm_enet_refill_rx(dev);
 
-- 
2.17.1



[PATCH net-next v2 0/6] bcm63xx_enet: major makeover of driver

2020-12-24 Thread Sieng Piaw Liew
This patch series aim to improve the bcm63xx_enet driver by integrating the
latest networking features, i.e. batched rx processing, BQL, build_skb, etc.

The newer enetsw SoCs are found to be able to do unaligned rx DMA by adding
NET_IP_ALIGN padding which, combined with these patches, improved packet
processing performance by ~50% on BCM6328.

Older non-enetsw SoCs still benefit mainly from rx batching. Performance
improvement of ~30% is observed on BCM6333.

The BCM63xx SoCs are designed for routers. As such, having BQL is beneficial
as well as trivial to add.

v2:
* Add xmit_more support and rx loop improvisation patches.
* Moved BQL netdev_reset_queue() to bcm_enet_stop()/bcm_enetsw_stop()
  functions as suggested by Florian Fainelli.
* Improved commit messages.

Sieng Piaw Liew (6):
  bcm63xx_enet: batch process rx path
  bcm63xx_enet: add BQL support
  bcm63xx_enet: add xmit_more support
  bcm63xx_enet: alloc rx skb with NET_IP_ALIGN
  bcm63xx_enet: convert to build_skb
  bcm63xx_enet: improve rx loop

 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 184 ++-
 drivers/net/ethernet/broadcom/bcm63xx_enet.h |  14 +-
 2 files changed, 103 insertions(+), 95 deletions(-)

-- 
2.17.1



[PATCH net-next v2 4/6] bcm63xx_enet: alloc rx skb with NET_IP_ALIGN

2020-12-24 Thread Sieng Piaw Liew
Use netdev_alloc_skb_ip_align on newer SoCs with integrated switch
(enetsw) when refilling RX. Increases packet processing performance
by 30% (with netif_receive_skb_list).

Non-enetsw SoCs cannot function with the extra pad so continue to use
the regular netdev_alloc_skb.

Tested on BCM6328 320 MHz and iperf3 -M 512 to measure packet/sec
performance.

Before:
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-30.00 sec 120 MBytes 33.7 Mbits/sec 277 sender
[ 4] 0.00-30.00 sec 120 MBytes 33.5 Mbits/sec receiver

After (+netif_receive_skb_list):
[ 4] 0.00-30.00 sec 155 MBytes 43.3 Mbits/sec 354 sender
[ 4] 0.00-30.00 sec 154 MBytes 43.1 Mbits/sec receiver

Signed-off-by: Sieng Piaw Liew 
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 452968f168ed..51976ed87d2d 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -237,7 +237,10 @@ static int bcm_enet_refill_rx(struct net_device *dev)
	desc = &priv->rx_desc_cpu[desc_idx];
 
if (!priv->rx_skb[desc_idx]) {
-   skb = netdev_alloc_skb(dev, priv->rx_skb_size);
+   if (priv->enet_is_sw)
+   skb = netdev_alloc_skb_ip_align(dev, 
priv->rx_skb_size);
+   else
+   skb = netdev_alloc_skb(dev, priv->rx_skb_size);
if (!skb)
break;
priv->rx_skb[desc_idx] = skb;
-- 
2.17.1



[PATCH v2] mtd: spi-nor: macronix: enable 4-bit BP support for MX25L6405D

2020-12-07 Thread Sieng Piaw Liew
Enable 4-bit Block Protect support for MX256405D and its variants using
the same ID.

Tested on Innacom W3400V6 router with MX25L6406E chip.
https://github.com/openwrt/openwrt/pull/3501

Signed-off-by: Sieng Piaw Liew 
---
Changes in v2:
- Add SPI_NOR_HAS_LOCK which SPI_NOR_4BIT_BP required.

 drivers/mtd/spi-nor/macronix.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/spi-nor/macronix.c b/drivers/mtd/spi-nor/macronix.c
index 9203abaac229..033ede381673 100644
--- a/drivers/mtd/spi-nor/macronix.c
+++ b/drivers/mtd/spi-nor/macronix.c
@@ -42,7 +42,9 @@ static const struct flash_info macronix_parts[] = {
{ "mx25l1606e",  INFO(0xc22015, 0, 64 * 1024,  32, SECT_4K) },
{ "mx25l3205d",  INFO(0xc22016, 0, 64 * 1024,  64, SECT_4K) },
{ "mx25l3255e",  INFO(0xc29e16, 0, 64 * 1024,  64, SECT_4K) },
-   { "mx25l6405d",  INFO(0xc22017, 0, 64 * 1024, 128, SECT_4K) },
+   { "mx25l6405d",  INFO(0xc22017, 0, 64 * 1024, 128,
+ SECT_4K | SPI_NOR_HAS_LOCK |
+ SPI_NOR_4BIT_BP) },
{ "mx25u2033e",  INFO(0xc22532, 0, 64 * 1024,   4, SECT_4K) },
{ "mx25u3235f",  INFO(0xc22536, 0, 64 * 1024,  64,
  SECT_4K | SPI_NOR_DUAL_READ |
-- 
2.17.1



Re: [PATCH] mtd: spi-nor: macronix: enable 4-bit BP support for MX25L6405D

2020-12-07 Thread Sieng Piaw Liew
On Mon, Dec 07, 2020 at 05:53:20PM +, tudor.amba...@microchip.com wrote:
> Hi, Sieng,
> 
> On 12/7/20 4:46 AM, Sieng Piaw Liew wrote:
> > EXTERNAL EMAIL: Do not click links or open attachments unless you know the 
> > content is safe
> > 
> > Enable 4-bit Block Protect support for MX256405D and its variants using
> > the same ID.
> > 
> > Tested on Innacom W3400V6 router with MX25L6406E chip.
> 
> :) What kind of tests did you exactly make?

OpenWrt cannot write into spi-nor after first boot.
After hacking 4-bit BP support into OpenWrt's kernel v5.4, writing works.

> 
> > https://github.com/openwrt/openwrt/pull/3501
> > 
> > Signed-off-by: Sieng Piaw Liew 
> > ---
> >  drivers/mtd/spi-nor/macronix.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/mtd/spi-nor/macronix.c b/drivers/mtd/spi-nor/macronix.c
> > index 9203abaac229..7aa8b1ee9daa 100644
> > --- a/drivers/mtd/spi-nor/macronix.c
> > +++ b/drivers/mtd/spi-nor/macronix.c
> > @@ -42,7 +42,8 @@ static const struct flash_info macronix_parts[] = {
> > { "mx25l1606e",  INFO(0xc22015, 0, 64 * 1024,  32, SECT_4K) },
> > { "mx25l3205d",  INFO(0xc22016, 0, 64 * 1024,  64, SECT_4K) },
> > { "mx25l3255e",  INFO(0xc29e16, 0, 64 * 1024,  64, SECT_4K) },
> > -   { "mx25l6405d",  INFO(0xc22017, 0, 64 * 1024, 128, SECT_4K) },
> > +   { "mx25l6405d",  INFO(0xc22017, 0, 64 * 1024, 128,
> > + SECT_4K | SPI_NOR_4BIT_BP) },
> 
> I assume this won't work because it misses the SPI_NOR_HAS_LOCK flag.
> 
> Cheers,
> ta

Yes, I'll have v2 patch sent shortly.

> 
> > { "mx25u2033e",  INFO(0xc22532, 0, 64 * 1024,   4, SECT_4K) },
> > { "mx25u3235f",  INFO(0xc22536, 0, 64 * 1024,  64,
> >   SECT_4K | SPI_NOR_DUAL_READ |
> > --
> > 2.17.1
> > 
> 


[PATCH] mtd: spi-nor: macronix: enable 4-bit BP support for MX25L6405D

2020-12-06 Thread Sieng Piaw Liew
Enable 4-bit Block Protect support for MX256405D and its variants using
the same ID.

Tested on Innacom W3400V6 router with MX25L6406E chip.
https://github.com/openwrt/openwrt/pull/3501

Signed-off-by: Sieng Piaw Liew 
---
 drivers/mtd/spi-nor/macronix.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/spi-nor/macronix.c b/drivers/mtd/spi-nor/macronix.c
index 9203abaac229..7aa8b1ee9daa 100644
--- a/drivers/mtd/spi-nor/macronix.c
+++ b/drivers/mtd/spi-nor/macronix.c
@@ -42,7 +42,8 @@ static const struct flash_info macronix_parts[] = {
{ "mx25l1606e",  INFO(0xc22015, 0, 64 * 1024,  32, SECT_4K) },
{ "mx25l3205d",  INFO(0xc22016, 0, 64 * 1024,  64, SECT_4K) },
{ "mx25l3255e",  INFO(0xc29e16, 0, 64 * 1024,  64, SECT_4K) },
-   { "mx25l6405d",  INFO(0xc22017, 0, 64 * 1024, 128, SECT_4K) },
+   { "mx25l6405d",  INFO(0xc22017, 0, 64 * 1024, 128,
+ SECT_4K | SPI_NOR_4BIT_BP) },
{ "mx25u2033e",  INFO(0xc22532, 0, 64 * 1024,   4, SECT_4K) },
{ "mx25u3235f",  INFO(0xc22536, 0, 64 * 1024,  64,
  SECT_4K | SPI_NOR_DUAL_READ |
-- 
2.17.1