On Mon, 12 Mar 2012 14:44:27 +0200
<horia.gea...@freescale.com> wrote:

> This patch replaces the back-half implementation of talitos crypto engine from
> tasklet to NAPI. The decision to do this was based on improved performance
> (around 7%).
> A similiar patch (not posted yet) was tested for caam crypto engine, with
> 10-15% improvement over tasklet.
> 
> Since having crypto engines use the net softirq is probably not acceptable,
> I would like to hear your comments on what options do I have to make this
> upstreamable.
> Besides current approach, I am considering the following:
> - defining a new softirq for crypto engines, having a higher priority than the
> NET_RX_SOFTIRQ

but the priority just has to be equal to NET_RX_SOFTIRQ to get the
net--crypto performance balance, not necessarily higher IIRC.

> - using tasklet_hi_schedule instead of tasklet_schedule

this has done close to nothing for performance in my experience -
but there is no other tasklet competing for the slice in my IPSec
fwding tests either.

> Let me know if any of these two fits better or if something else is preferred.

Herbert/Dave,

Is it ok for a crypto driver to depend on NET?  If not, how should
the NAPI-style flow be abstracted out of NET?

Thanks,

Kim

> Thank you
> 
> From 20f30ef6fdfe641f1c30f94320891715ffee33a2 Mon Sep 17 00:00:00 2001
> From: Sandeep Malik <sandeep.ma...@freescale.com>
> Date: Sat, 12 Jun 2010 14:08:47 +0800
> Subject: [RFC,PATCH] crypto: talitos - Replace the tasklet implementation 
> with NAPI
> 
> This patch updates the current tasklet implement to NAPI so as
> the system is more balanced in the terms that the packet submission
> and the packet forwarding after being processed can be done at
> the same priority.
> 
> Signed-off-by: Sandeep Malik <sandeep.ma...@freescale.com>
> Signed-off-by: Horia Geanta <horia.gea...@freescale.com>
> ---
>  drivers/crypto/Kconfig   |    2 +-
>  drivers/crypto/talitos.c |  145 
> +++++++++++++++++++++++++++++++++-------------
>  drivers/crypto/talitos.h |    4 +-
>  3 files changed, 109 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> index e707979..682096b 100644
> --- a/drivers/crypto/Kconfig
> +++ b/drivers/crypto/Kconfig
> @@ -218,7 +218,7 @@ config CRYPTO_DEV_TALITOS
>       select CRYPTO_ALGAPI
>       select CRYPTO_AUTHENC
>       select HW_RANDOM
> -     depends on FSL_SOC
> +     depends on FSL_SOC && NET
>       help
>         Say 'Y' here to use the Freescale Security Engine (SEC)
>         to offload cryptographic algorithm computation.
> diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
> index dc641c7..f368579 100644
> --- a/drivers/crypto/talitos.c
> +++ b/drivers/crypto/talitos.c
> @@ -1,7 +1,7 @@
>  /*
>   * talitos - Freescale Integrated Security Engine (SEC) device driver
>   *
> - * Copyright (c) 2008-2011 Freescale Semiconductor, Inc.
> + * Copyright (c) 2008-2012 Freescale Semiconductor, Inc.
>   *
>   * Scatterlist Crypto API glue code copied from files with the following:
>   * Copyright (c) 2006-2007 Herbert Xu <herb...@gondor.apana.org.au>
> @@ -37,6 +37,7 @@
>  #include <linux/io.h>
>  #include <linux/spinlock.h>
>  #include <linux/rtnetlink.h>
> +#include <linux/netdevice.h>
>  #include <linux/slab.h>
>  
>  #include <crypto/algapi.h>
> @@ -121,6 +122,7 @@ struct talitos_channel {
>  struct talitos_private {
>       struct device *dev;
>       struct platform_device *ofdev;
> +     struct net_device __percpu *netdev;
>       void __iomem *reg;
>       int irq[2];
>  
> @@ -145,8 +147,8 @@ struct talitos_private {
>       /* next channel to be assigned next incoming descriptor */
>       atomic_t last_chan ____cacheline_aligned;
>  
> -     /* request callback tasklet */
> -     struct tasklet_struct done_task[2];
> +     /* request callback napi */
> +     struct napi_struct __percpu *done_task[2];
>  
>       /* list of registered algorithms */
>       struct list_head alg_list;
> @@ -349,17 +351,18 @@ static int talitos_submit(struct device *dev, int ch, 
> struct talitos_desc *desc,
>  /*
>   * process what was done, notify callback of error if not
>   */
> -static void flush_channel(struct device *dev, int ch, int error, int 
> reset_ch)
> +static int flush_channel(struct device *dev, int ch, int error, int reset_ch,
> +                      int weight)
>  {
>       struct talitos_private *priv = dev_get_drvdata(dev);
>       struct talitos_request *request, saved_req;
>       unsigned long flags;
> -     int tail, status;
> +     int tail, status, count = 0;
>  
>       spin_lock_irqsave(&priv->chan[ch].tail_lock, flags);
>  
>       tail = priv->chan[ch].tail;
> -     while (priv->chan[ch].fifo[tail].desc) {
> +     while (priv->chan[ch].fifo[tail].desc && (count < weight)) {
>               request = &priv->chan[ch].fifo[tail];
>  
>               /* descriptors with their done bits set don't get the error */
> @@ -396,43 +399,55 @@ static void flush_channel(struct device *dev, int ch, 
> int error, int reset_ch)
>                                  status);
>               /* channel may resume processing in single desc error case */
>               if (error && !reset_ch && status == error)
> -                     return;
> +                     return 0;
>               spin_lock_irqsave(&priv->chan[ch].tail_lock, flags);
>               tail = priv->chan[ch].tail;
> +             count++;
>       }
>  
>       spin_unlock_irqrestore(&priv->chan[ch].tail_lock, flags);
> +
> +     return count;
>  }
>  
>  /*
>   * process completed requests for channels that have done status
>   */
> -#define DEF_TALITOS_DONE(name, ch_done_mask)                         \
> -static void talitos_done_##name(unsigned long data)                  \
> +#define DEF_TALITOS_DONE(name, ch_done_mask, num_ch)                 \
> +static int talitos_done_##name(struct napi_struct *napi, int budget) \
>  {                                                                    \
> -     struct device *dev = (struct device *)data;                     \
> +     struct device *dev = &napi->dev->dev;                           \
>       struct talitos_private *priv = dev_get_drvdata(dev);            \
> +     int budget_per_ch, work_done = 0;                               \
>                                                                       \
> +     budget_per_ch = budget / num_ch;                                \
>       if (ch_done_mask & 1)                                           \
> -             flush_channel(dev, 0, 0, 0);                            \
> +             work_done += flush_channel(dev, 0, 0, 0, budget_per_ch);\
>       if (priv->num_channels == 1)                                    \
>               goto out;                                               \
>       if (ch_done_mask & (1 << 2))                                    \
> -             flush_channel(dev, 1, 0, 0);                            \
> +             work_done += flush_channel(dev, 1, 0, 0, budget_per_ch);\
>       if (ch_done_mask & (1 << 4))                                    \
> -             flush_channel(dev, 2, 0, 0);                            \
> +             work_done += flush_channel(dev, 2, 0, 0, budget_per_ch);\
>       if (ch_done_mask & (1 << 6))                                    \
> -             flush_channel(dev, 3, 0, 0);                            \
> +             work_done += flush_channel(dev, 3, 0, 0, budget_per_ch);\
>                                                                       \
>  out:                                                                 \
> -     /* At this point, all completed channels have been processed */ \
> -     /* Unmask done interrupts for channels completed later on. */   \
> -     setbits32(priv->reg + TALITOS_IMR, ch_done_mask);               \
> -     setbits32(priv->reg + TALITOS_IMR_LO, TALITOS_IMR_LO_INIT);     \
> +     if (work_done < budget) {                                       \
> +             napi_complete(napi);                                    \
> +             /* At this point, all completed channels have been */   \
> +             /* processed. Unmask done interrupts for channels */    \
> +             /* completed later on. */                               \
> +             setbits32(priv->reg + TALITOS_IMR, ch_done_mask);       \
> +             setbits32(priv->reg + TALITOS_IMR_LO,                   \
> +                       TALITOS_IMR_LO_INIT);                         \
> +     }                                                               \
> +                                                                     \
> +     return work_done;                                               \
>  }
> -DEF_TALITOS_DONE(4ch, TALITOS_ISR_4CHDONE)
> -DEF_TALITOS_DONE(ch0_2, TALITOS_ISR_CH_0_2_DONE)
> -DEF_TALITOS_DONE(ch1_3, TALITOS_ISR_CH_1_3_DONE)
> +DEF_TALITOS_DONE(4ch, TALITOS_ISR_4CHDONE, 4)
> +DEF_TALITOS_DONE(ch0_2, TALITOS_ISR_CH_0_2_DONE, 2)
> +DEF_TALITOS_DONE(ch1_3, TALITOS_ISR_CH_1_3_DONE, 2)
>  
>  /*
>   * locate current (offending) descriptor
> @@ -582,7 +597,7 @@ static void talitos_error(struct device *dev, u32 isr, 
> u32 isr_lo)
>               if (v_lo & TALITOS_CCPSR_LO_SRL)
>                       dev_err(dev, "scatter return/length error\n");
>  
> -             flush_channel(dev, ch, error, reset_ch);
> +             flush_channel(dev, ch, error, reset_ch, priv->fifo_len);
>  
>               if (reset_ch) {
>                       reset_channel(dev, ch);
> @@ -606,14 +621,14 @@ static void talitos_error(struct device *dev, u32 isr, 
> u32 isr_lo)
>  
>               /* purge request queues */
>               for (ch = 0; ch < priv->num_channels; ch++)
> -                     flush_channel(dev, ch, -EIO, 1);
> +                     flush_channel(dev, ch, -EIO, 1, priv->fifo_len);
>  
>               /* reset and reinitialize the device */
>               init_device(dev);
>       }
>  }
>  
> -#define DEF_TALITOS_INTERRUPT(name, ch_done_mask, ch_err_mask, tlet)        \
> +#define DEF_TALITOS_INTERRUPT(name, ch_done_mask, ch_err_mask, sirq)        \
>  static irqreturn_t talitos_interrupt_##name(int irq, void *data)            \
>  {                                                                           \
>       struct device *dev = data;                                             \
> @@ -633,7 +648,8 @@ static irqreturn_t talitos_interrupt_##name(int irq, void 
> *data)         \
>                       /* mask further done interrupts. */                    \
>                       clrbits32(priv->reg + TALITOS_IMR, ch_done_mask);      \
>                       /* done_task will unmask done interrupts at exit */    \
> -                     tasklet_schedule(&priv->done_task[tlet]);              \
> +                     napi_schedule(per_cpu_ptr(priv->done_task[sirq],       \
> +                                               smp_processor_id()));        \
>               }                                                              \
>                                                                              \
>       return (isr & (ch_done_mask | ch_err_mask) || isr_lo) ? IRQ_HANDLED :  \
> @@ -2555,7 +2571,7 @@ static int talitos_remove(struct platform_device *ofdev)
>       struct device *dev = &ofdev->dev;
>       struct talitos_private *priv = dev_get_drvdata(dev);
>       struct talitos_crypto_alg *t_alg, *n;
> -     int i;
> +     int i, j;
>  
>       list_for_each_entry_safe(t_alg, n, &priv->alg_list, entry) {
>               switch (t_alg->algt.type) {
> @@ -2574,25 +2590,32 @@ static int talitos_remove(struct platform_device 
> *ofdev)
>       if (hw_supports(dev, DESC_HDR_SEL0_RNG))
>               talitos_unregister_rng(dev);
>  
> -     for (i = 0; i < priv->num_channels; i++)
> -             kfree(priv->chan[i].fifo);
> -
> -     kfree(priv->chan);
> -
>       for (i = 0; i < 2; i++)
>               if (priv->irq[i]) {
>                       free_irq(priv->irq[i], dev);
>                       irq_dispose_mapping(priv->irq[i]);
> +
> +                     for_each_possible_cpu(j) {
> +                             napi_disable(per_cpu_ptr(priv->done_task[i],
> +                                                      j));
> +                             netif_napi_del(per_cpu_ptr(priv->done_task[i],
> +                                                        j));
> +                     }
> +
> +                     free_percpu(priv->done_task[i]);
>               }
>  
> -     tasklet_kill(&priv->done_task[0]);
> -     if (priv->irq[1])
> -             tasklet_kill(&priv->done_task[1]);
> +     for (i = 0; i < priv->num_channels; i++)
> +             kfree(priv->chan[i].fifo);
> +
> +     kfree(priv->chan);
>  
>       iounmap(priv->reg);
>  
>       dev_set_drvdata(dev, NULL);
>  
> +     free_percpu(priv->netdev);
> +
>       kfree(priv);
>  
>       return 0;
> @@ -2718,19 +2741,61 @@ static int talitos_probe(struct platform_device 
> *ofdev)
>       dev_set_drvdata(dev, priv);
>  
>       priv->ofdev = ofdev;
> +     priv->dev = dev;
> +
> +     priv->netdev = alloc_percpu(struct net_device);
> +     if (!priv->netdev) {
> +             dev_err(dev, "failed to allocate netdevice\n");
> +             err = -ENOMEM;
> +             goto err_out;
> +     }
> +
> +     for_each_possible_cpu(i) {
> +             err = init_dummy_netdev(per_cpu_ptr(priv->netdev, i));
> +             if (err) {
> +                     dev_err(dev, "failed to initialize dummy netdevice\n");
> +                     goto err_out;
> +             }
> +             (per_cpu_ptr(priv->netdev, i))->dev = *dev;
> +     }
>  
>       err = talitos_probe_irq(ofdev);
>       if (err)
>               goto err_out;
>  
> +     priv->done_task[0] = alloc_percpu(struct napi_struct);
> +     if (!priv->done_task[0]) {
> +             dev_err(dev, "failed to allocate napi for 1st irq\n");
> +             err = -ENOMEM;
> +             goto err_out;
> +     }
> +
>       if (!priv->irq[1]) {
> -             tasklet_init(&priv->done_task[0], talitos_done_4ch,
> -                          (unsigned long)dev);
> +             for_each_possible_cpu(i) {
> +                     netif_napi_add(per_cpu_ptr(priv->netdev, i),
> +                                    per_cpu_ptr(priv->done_task[0], i),
> +                                    talitos_done_4ch, TALITOS_NAPI_WEIGHT);
> +                     napi_enable(per_cpu_ptr(priv->done_task[0], i));
> +             }
>       } else {
> -             tasklet_init(&priv->done_task[0], talitos_done_ch0_2,
> -                          (unsigned long)dev);
> -             tasklet_init(&priv->done_task[1], talitos_done_ch1_3,
> -                          (unsigned long)dev);
> +             priv->done_task[1] = alloc_percpu(struct napi_struct);
> +             if (!priv->done_task[1]) {
> +                     dev_err(dev, "failed to allocate napi for 2nd irq\n");
> +                     err = -ENOMEM;
> +                     goto err_out;
> +             }
> +
> +             for_each_possible_cpu(i) {
> +                     netif_napi_add(per_cpu_ptr(priv->netdev, i),
> +                                    per_cpu_ptr(priv->done_task[0], i),
> +                                    talitos_done_ch0_2, TALITOS_NAPI_WEIGHT);
> +                     napi_enable(per_cpu_ptr(priv->done_task[0], i));
> +
> +                     netif_napi_add(per_cpu_ptr(priv->netdev, i),
> +                                    per_cpu_ptr(priv->done_task[1], i),
> +                                    talitos_done_ch1_3, TALITOS_NAPI_WEIGHT);
> +                     napi_enable(per_cpu_ptr(priv->done_task[1], i));
> +             }
>       }
>  
>       INIT_LIST_HEAD(&priv->alg_list);
> diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
> index 3c17395..ba62abc 100644
> --- a/drivers/crypto/talitos.h
> +++ b/drivers/crypto/talitos.h
> @@ -1,7 +1,7 @@
>  /*
>   * Freescale SEC (talitos) device register and descriptor header defines
>   *
> - * Copyright (c) 2006-2011 Freescale Semiconductor, Inc.
> + * Copyright (c) 2006-2012 Freescale Semiconductor, Inc.
>   *
>   * Redistribution and use in source and binary forms, with or without
>   * modification, are permitted provided that the following conditions
> @@ -28,6 +28,8 @@
>   *
>   */
>  
> +#define TALITOS_NAPI_WEIGHT     12
> +
>  /*
>   * TALITOS_xxx_LO addresses point to the low data bits (32-63) of the 
> register
>   */
> -- 
> 1.7.3.4
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
  • Hi, horia.geanta
    • balancing crypto with NAPI flow vs. tasklet (was: Re: Hi... Kim Phillips

Reply via email to