On Thursday 11 December 2014 19:42:27 Ding Tianhong wrote:
> v9:
> - There are no tx completion interrupts to free DMAed Tx packets, which
>   means we rely on new tx packets arriving to run the destructors of
>   completed packets, which open up space in their sockets' send queues.
>   Sometimes we don't get such new packets, causing Tx to stall; a single
>   UDP transmitter is a good example of this situation. So we need a
>   cleanup workqueue to reclaim completed packets; the workqueue only
>   frees packets that have already been pending for several jiffies.
>   Also apply some formatting cleanups.

You must have missed the reply in which David Miller explained why you
can't call skb_orphan and rely on an occasional cleanup of the queue:
orphaning the skb releases the socket's send-buffer accounting right
away, which removes all backpressure on a fast sender. Please use
something like the patch below:

- drop the broken skb_orphan call
- drop the workqueue
- batch cleanup based on tx_coalesce_frames/usecs for better throughput
- use a reasonable default tx timeout (200us, could be shortened
  based on measurements; see the timing note below) with a range timer
- fix the napi poll function return value
- use a lockless queue for cleanup (sketched below, before the patch)
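
A quick sanity check on the 200us default: at gigabit line rate, a
1500-byte frame takes about 1500 * 8 / 10^9 s = 12us on the wire, so 16
full-size frames take roughly 192us, which rounds up to the 200us used
below. The timer is programmed as a range timer with a soft expiry at
half that, so it may fire anywhere in the [100us, 200us] window and the
kernel can merge it with other timer interrupts.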

Signed-off-by: Arnd Bergmann <[email protected]>

Feel free to fold this into your patch rather than keeping it separate.
Completely untested, probably contains some bugs. Please ask if you
have questions about this.
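
As background on the lockless cleanup: this is the usual single-producer/
single-consumer scheme, where start_xmit only ever advances tx_head, the
napi poll reclaim only ever advances tx_tail, and each side derives the
ring occupancy from the difference of the two indices, ordered by the
smp_wmb()/smp_rmb() pairs you see in the patch. Here is a standalone
userspace sketch of just the counting (my own illustration, not driver
code; N, NEXT and ring_count are made-up names, and it uses the
canonical "& (N - 1)" wrap rather than the patch's exact arithmetic):

	#include <stdio.h>

	#define N 64				/* ring size, a power of two like TX_DESC_NUM */
	#define NEXT(i) (((i) + 1) & (N - 1))	/* analogous to TX_NEXT() */

	static unsigned int head, tail;		/* each written by exactly one side */

	/* Ring occupancy. "Full" is N - 1 rather than N so it can always be
	 * told apart from "empty" (0); this is why the xmit path stops the
	 * queue once TX_DESC_NUM - 1 descriptors are outstanding. */
	static unsigned int ring_count(void)
	{
		return (head - tail) & (N - 1);
	}

	int main(void)
	{
		int i;

		for (i = 0; i < 10; i++)	/* producer queues 10 frames */
			head = NEXT(head);
		printf("in flight: %u\n", ring_count());	/* prints 10 */

		for (i = 0; i < 4; i++)		/* consumer reclaims 4 */
			tail = NEXT(tail);
		printf("in flight: %u\n", ring_count());	/* prints 6 */
		return 0;
	}

In the driver, that same difference is what gates netif_stop_queue()/
netif_wake_queue() and decides when xmit kicks the cleanup.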

diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
index 9d37b670db6a..d85c5287654e 100644
--- a/drivers/net/ethernet/hisilicon/hip04_eth.c
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -12,6 +12,7 @@
 #include <linux/etherdevice.h>
 #include <linux/platform_device.h>
 #include <linux/interrupt.h>
+#include <linux/ktime.h>
 #include <linux/of_address.h>
 #include <linux/phy.h>
 #include <linux/of_mdio.h>
@@ -157,9 +158,11 @@ struct hip04_priv {
        struct sk_buff *tx_skb[TX_DESC_NUM];
        dma_addr_t tx_phys[TX_DESC_NUM];
        unsigned int tx_head;
-       unsigned int tx_tail;
-       int tx_count;
-       unsigned long last_tx;
+
+       /* FIXME: make these adjustable through ethtool */
+       int tx_coalesce_frames;
+       int tx_coalesce_usecs;
+       struct hrtimer tx_coalesce_timer;
 
        unsigned char *rx_buf[RX_DESC_NUM];
        dma_addr_t rx_phys[RX_DESC_NUM];
@@ -171,10 +174,15 @@ struct hip04_priv {
        struct regmap *map;
        struct work_struct tx_timeout_task;
 
-       struct workqueue_struct *wq;
-       struct delayed_work tx_clean_task;
+       /* written only by tx cleanup */
+       unsigned int tx_tail ____cacheline_aligned_in_smp;
 };
 
+static inline unsigned int tx_count(unsigned int head, unsigned int tail)
+{
+       return (head - tail) % (TX_DESC_NUM - 1);
+}
+
 static void hip04_config_port(struct net_device *ndev, u32 speed, u32 duplex)
 {
        struct hip04_priv *priv = netdev_priv(ndev);
@@ -351,18 +359,21 @@ static int hip04_set_mac_address(struct net_device *ndev, void *addr)
        return 0;
 }
 
-static void hip04_tx_reclaim(struct net_device *ndev, bool force)
+static int hip04_tx_reclaim(struct net_device *ndev, bool force)
 {
        struct hip04_priv *priv = netdev_priv(ndev);
        unsigned tx_tail = priv->tx_tail;
        struct tx_desc *desc;
        unsigned int bytes_compl = 0, pkts_compl = 0;
+       unsigned int count;
 
-       if (priv->tx_count == 0)
+       smp_rmb();
+       count = tx_count(ACCESS_ONCE(priv->tx_head), tx_tail);
+       if (count == 0)
                goto out;
 
-       while ((tx_tail != priv->tx_head) || (priv->tx_count == TX_DESC_NUM)) {
-               desc = &priv->tx_desc[priv->tx_tail];
+       while (count) {
+               desc = &priv->tx_desc[tx_tail];
                if (desc->send_addr != 0) {
                        if (force)
                                desc->send_addr = 0;
@@ -381,57 +392,37 @@ static void hip04_tx_reclaim(struct net_device *ndev, bool force)
                dev_kfree_skb(priv->tx_skb[tx_tail]);
                priv->tx_skb[tx_tail] = NULL;
                tx_tail = TX_NEXT(tx_tail);
-               priv->tx_count--;
-
-               if (priv->tx_count <= 0)
-                       break;
+               count--;
        }
 
        priv->tx_tail = tx_tail;
+       smp_wmb(); /* Ensure tx_tail visible to xmit */
 
-       /* Ensure tx_tail & tx_count visible to xmit */
-       smp_mb();
 out:
-
        if (pkts_compl || bytes_compl)
                netdev_completed_queue(ndev, pkts_compl, bytes_compl);
 
-       if (unlikely(netif_queue_stopped(ndev)) &&
-           (priv->tx_count < TX_DESC_NUM))
+       if (unlikely(netif_queue_stopped(ndev)) && (count < (TX_DESC_NUM - 1)))
                netif_wake_queue(ndev);
-}
 
-static void hip04_tx_clean_monitor(struct work_struct *work)
-{
-       struct hip04_priv *priv = container_of(work, struct hip04_priv,
-                                              tx_clean_task.work);
-       struct net_device *ndev = priv->ndev;
-       int delta_in_ticks = msecs_to_jiffies(1000);
-
-       if (!time_in_range(jiffies, priv->last_tx,
-                          priv->last_tx + delta_in_ticks)) {
-               netif_tx_lock(ndev);
-               hip04_tx_reclaim(ndev, false);
-               netif_tx_unlock(ndev);
-       }
-       queue_delayed_work(priv->wq, &priv->tx_clean_task, delta_in_ticks);
+       return count;
 }
 
 static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
        struct hip04_priv *priv = netdev_priv(ndev);
        struct net_device_stats *stats = &ndev->stats;
-       unsigned int tx_head = priv->tx_head;
+       unsigned int tx_head = priv->tx_head, count;
        struct tx_desc *desc = &priv->tx_desc[tx_head];
        dma_addr_t phys;
 
-       if (priv->tx_count >= TX_DESC_NUM) {
+       smp_rmb();
+       count = tx_count(tx_head, ACCESS_ONCE(priv->tx_tail));
+       if (count == (TX_DESC_NUM - 1)) {
                netif_stop_queue(ndev);
                return NETDEV_TX_BUSY;
        }
 
-       hip04_tx_reclaim(ndev, false);
-
        phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
        if (dma_mapping_error(&ndev->dev, phys)) {
                dev_kfree_skb(skb);
@@ -447,20 +438,33 @@ static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
        desc->wb_addr = cpu_to_be32(phys);
        skb_tx_timestamp(skb);
 
-       /* Don't wait up for transmitted skbs to be freed. */
-       skb_orphan(skb);
-
+       /* FIXME: eliminate this mmio access if xmit_more is set */
        hip04_set_xmit_desc(priv, phys);
        priv->tx_head = TX_NEXT(tx_head);
+       count++;
        netdev_sent_queue(ndev, skb->len);
 
        stats->tx_bytes += skb->len;
        stats->tx_packets++;
-       priv->tx_count++;
-       priv->last_tx = jiffies;
 
-       /* Ensure tx_head & tx_count update visible to tx reclaim */
-       smp_mb();
+       /* Ensure tx_head update visible to tx reclaim */
+       smp_wmb();
+
+       /* queue is getting full, better start cleaning up now */
+       if (count >= priv->tx_coalesce_frames) {
+               if (napi_schedule_prep(&priv->napi)) {
+                       /* disable rx interrupt and timer */
+                       priv->reg_inten &= ~(RCV_INT);
+                       writel_relaxed(DEF_INT_MASK & ~RCV_INT,
+                                      priv->base + PPE_INTEN);
+                       hrtimer_cancel(&priv->tx_coalesce_timer);
+                       __napi_schedule(&priv->napi);
+               }
+       } else if (!hrtimer_is_queued(&priv->tx_coalesce_timer)) {
+               /* cleanup not pending yet, start a new timer */
+               hrtimer_start_expires(&priv->tx_coalesce_timer,
+                                     HRTIMER_MODE_REL);
+       }
 
        return NETDEV_TX_OK;
 }
@@ -477,6 +481,7 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
        bool last = false;
        dma_addr_t phys;
        int rx = 0;
+       int tx_remaining;
        u16 len;
        u32 err;
 
@@ -513,11 +518,11 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 
                buf = netdev_alloc_frag(priv->rx_buf_size);
                if (!buf)
-                       return -ENOMEM;
+                       goto done;
                phys = dma_map_single(&ndev->dev, buf,
                                      RX_BUF_SIZE, DMA_FROM_DEVICE);
                if (dma_mapping_error(&ndev->dev, phys))
-                       return -EIO;
+                       goto done;
                priv->rx_buf[priv->rx_head] = buf;
                priv->rx_phys[priv->rx_head] = phys;
                hip04_set_recv_desc(priv, phys);
@@ -537,6 +542,11 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
        }
        napi_complete(napi);
 done:
+       /* clean up tx descriptors and start a new timer if necessary */
+       tx_remaining = hip04_tx_reclaim(ndev, false);
+       if (rx < budget && tx_remaining)
+               hrtimer_start_expires(&priv->tx_coalesce_timer,
+                                     HRTIMER_MODE_REL);
+
        return rx;
 }
 
@@ -547,6 +557,9 @@ static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
        struct net_device_stats *stats = &ndev->stats;
        u32 ists = readl_relaxed(priv->base + PPE_INTSTS);
 
+       if (!ists)
+               return IRQ_NONE;
+
        writel_relaxed(DEF_INT_MASK, priv->base + PPE_RINT);
 
        if (unlikely(ists & DEF_INT_ERR)) {
@@ -560,16 +573,32 @@ static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
                }
        }
 
-       if (ists & RCV_INT) {
+       if (ists & RCV_INT && napi_schedule_prep(&priv->napi)) {
                /* disable rx interrupt */
                priv->reg_inten &= ~(RCV_INT);
-               writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
-               napi_schedule(&priv->napi);
+               writel_relaxed(DEF_INT_MASK & ~RCV_INT, priv->base + PPE_INTEN);
+               hrtimer_cancel(&priv->tx_coalesce_timer);
+               __napi_schedule(&priv->napi);
        }
 
        return IRQ_HANDLED;
 }
 
+static enum hrtimer_restart tx_done(struct hrtimer *hrtimer)
+{
+       struct hip04_priv *priv;
+       priv = container_of(hrtimer, struct hip04_priv, tx_coalesce_timer);
+
+       if (napi_schedule_prep(&priv->napi)) {
+               /* disable rx interrupt */
+               priv->reg_inten &= ~(RCV_INT);
+               writel_relaxed(DEF_INT_MASK & ~RCV_INT, priv->base + PPE_INTEN);
+               __napi_schedule(&priv->napi);
+       }
+
+       return HRTIMER_NORESTART;
+}
+
 static void hip04_adjust_link(struct net_device *ndev)
 {
        struct hip04_priv *priv = netdev_priv(ndev);
@@ -589,7 +618,6 @@ static int hip04_mac_open(struct net_device *ndev)
        priv->rx_head = 0;
        priv->tx_head = 0;
        priv->tx_tail = 0;
-       priv->tx_count = 0;
        hip04_reset_ppe(priv);
 
        for (i = 0; i < RX_DESC_NUM; i++) {
@@ -612,9 +640,6 @@ static int hip04_mac_open(struct net_device *ndev)
        hip04_mac_enable(ndev);
        napi_enable(&priv->napi);
 
-       INIT_DELAYED_WORK(&priv->tx_clean_task, hip04_tx_clean_monitor);
-       queue_delayed_work(priv->wq, &priv->tx_clean_task, 0);
-
        return 0;
 }
 
@@ -623,8 +648,6 @@ static int hip04_mac_stop(struct net_device *ndev)
        struct hip04_priv *priv = netdev_priv(ndev);
        int i;
 
-       cancel_delayed_work_sync(&priv->tx_clean_task);
-
        napi_disable(&priv->napi);
        netif_stop_queue(ndev);
        hip04_mac_disable(ndev);
@@ -725,6 +748,7 @@ static int hip04_mac_probe(struct platform_device *pdev)
        struct hip04_priv *priv;
        struct resource *res;
        unsigned int irq;
+       ktime_t txtime;
        int ret;
 
        ndev = alloc_etherdev(sizeof(struct hip04_priv));
@@ -751,6 +775,21 @@ static int hip04_mac_probe(struct platform_device *pdev)
        priv->port = arg.args[0];
        priv->chan = arg.args[1] * RX_DESC_NUM;
 
+       hrtimer_init(&priv->tx_coalesce_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+       /*
+        * BQL will try to keep the TX queue as short as possible, but it can't
+        * be faster than tx_coalesce_usecs, so we need a fast timeout here,
+        * but also long enough to gather up enough frames to ensure we don't
+        * get more interrupts than necessary.
+        * 200us is enough for 16 frames of 1500 bytes at gigabit ethernet rate
+        */
+       priv->tx_coalesce_frames = TX_DESC_NUM * 3 / 4;
+       priv->tx_coalesce_usecs = 200;
+       /* allow timer to fire after half the time at the earliest */
+       txtime = ktime_set(0, priv->tx_coalesce_usecs * NSEC_PER_USEC / 2);
+       hrtimer_set_expires_range(&priv->tx_coalesce_timer, txtime, txtime);
+
        priv->map = syscon_node_to_regmap(arg.np);
        if (IS_ERR(priv->map)) {
                dev_warn(d, "no syscon hisilicon,hip04-ppe\n");
@@ -788,12 +827,6 @@ static int hip04_mac_probe(struct platform_device *pdev)
                }
        }
 
-       priv->wq = create_singlethread_workqueue(ndev->name);
-       if (!priv->wq) {
-               ret = -ENOMEM;
-               goto init_fail;
-       }
-
        INIT_WORK(&priv->tx_timeout_task, hip04_tx_timeout_task);
 
        ether_setup(ndev);
@@ -848,8 +881,6 @@ static int hip04_remove(struct platform_device *pdev)
        free_irq(ndev->irq, ndev);
        of_node_put(priv->phy_node);
        cancel_work_sync(&priv->tx_timeout_task);
-       if (priv->wq)
-               destroy_workqueue(priv->wq);
        free_netdev(ndev);
 
        return 0;
