Re: [PATCH v2] crypto: arm/chacha20 - faster 8-bit rotations and other optimizations

2018-09-03 Thread Herbert Xu
On Sat, Sep 01, 2018 at 12:17:07AM -0700, Eric Biggers wrote:
> From: Eric Biggers 
> 
> Optimize ChaCha20 NEON performance by:
> 
> - Implementing the 8-bit rotations using the 'vtbl.8' instruction.
> - Streamlining the part that adds the original state and XORs the data.
> - Making some other small tweaks.
> 
> On ARM Cortex-A7, these optimizations improve ChaCha20 performance from
> about 12.08 cycles per byte to about 11.37 -- a 5.9% improvement.
> 
> There is a tradeoff involved with the 'vtbl.8' rotation method since
> there is at least one CPU (Cortex-A53) where it's not fastest.  But it
> seems to be a better default; see the added comment.  Overall, this
> patch reduces Cortex-A53 performance by less than 0.5%.
> 
> Signed-off-by: Eric Biggers 
> ---
>  arch/arm/crypto/chacha20-neon-core.S | 277 ++-
>  1 file changed, 143 insertions(+), 134 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH v2] crypto: arm/chacha20 - faster 8-bit rotations and other optimizations

2018-09-01 Thread Eric Biggers
From: Eric Biggers 

Optimize ChaCha20 NEON performance by:

- Implementing the 8-bit rotations using the 'vtbl.8' instruction.
- Streamlining the part that adds the original state and XORs the data.
- Making some other small tweaks.

On ARM Cortex-A7, these optimizations improve ChaCha20 performance from
about 12.08 cycles per byte to about 11.37 -- a 5.9% improvement.

There is a tradeoff involved with the 'vtbl.8' rotation method since
there is at least one CPU (Cortex-A53) where it's not fastest.  But it
seems to be a better default; see the added comment.  Overall, this
patch reduces Cortex-A53 performance by less than 0.5%.
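As an illustration only (not part of the patch, which is pure assembly), the
two rotation strategies look roughly like the following with NEON intrinsics;
the helper names and the standalone-C framing are made up for the example:

	#include <arm_neon.h>

	/* Illustrative sketch, not from the patch: rotate each 32-bit lane
	 * left by 8 by permuting bytes, the same trick as the vtbl.8 pair
	 * with the {3, 0, 1, 2, 7, 4, 5, 6} index table. */
	static uint32x4_t rol32_by8_vtbl(uint32x4_t x)
	{
		static const uint8_t rol8_tbl[8] = { 3, 0, 1, 2, 7, 4, 5, 6 };
		uint8x8_t tbl = vld1_u8(rol8_tbl);
		uint8x8_t lo  = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(x)), tbl);
		uint8x8_t hi  = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(x)), tbl);

		return vcombine_u32(vreinterpret_u32_u8(lo),
				    vreinterpret_u32_u8(hi));
	}

	/* The vshl.u32 + vsri.u32 approach used for the rotation amounts
	 * that aren't a multiple of 8, shown here for the 12-bit rotation. */
	static uint32x4_t rol32_by12_sri(uint32x4_t x)
	{
		return vsriq_n_u32(vshlq_n_u32(x, 12), x, 20);
	}

Loading rol8_tbl once up front corresponds to the one-time vld1.8 of
.Lrol8_table into d10 in the assembly below.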

Signed-off-by: Eric Biggers 
---
 arch/arm/crypto/chacha20-neon-core.S | 277 ++-
 1 file changed, 143 insertions(+), 134 deletions(-)
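For orientation when reading the rotl32 comments in the diff below, here is a
plain-C sketch of one ChaCha20 quarter round (illustrative only; rol32() and
the function name are just for the example), showing where the 16-, 12-, 8-,
and 7-bit rotations come from:

	#include <stdint.h>

	/* Hypothetical helper: 32-bit rotate left (n in 1..31 here). */
	static inline uint32_t rol32(uint32_t x, unsigned int n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/* One ChaCha20 quarter round over four state words. */
	static void chacha20_quarter_round(uint32_t *a, uint32_t *b,
					   uint32_t *c, uint32_t *d)
	{
		*a += *b;  *d = rol32(*d ^ *a, 16);
		*c += *d;  *b = rol32(*b ^ *c, 12);
		*a += *b;  *d = rol32(*d ^ *a, 8);
		*c += *d;  *b = rol32(*b ^ *c, 7);
	}

The "x0 += x1, x3 = rotl32(x3 ^ x0, ...)" comments in the diff are the
vectorized form of these same steps.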

diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
index 451a849ad5186..50e7b98968189 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -18,6 +18,34 @@
  * (at your option) any later version.
  */
 
+ /*
+  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
+  *
+  * (a)  vshl.u32 + vsri.u32   (needs temporary register)
+  * (b)  vshl.u32 + vshr.u32 + vorr(needs temporary register)
+  * (c)  vrev32.16 (16-bit rotations only)
+  * (d)  vtbl.8 + vtbl.8   (multiple of 8 bits rotations only,
+  * needs index vector)
+  *
+  * ChaCha20 has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit
+  * rotations, the only choices are (a) and (b).  We use (a) since it takes
+  * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
+  *
+  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+  * and doesn't need a temporary register.
+  *
+  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
+  * is twice as fast as (a), even when doing (a) on multiple registers
+  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
+  * parallelizes better when temporary registers are scarce.
+  *
+  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+  * (a), so the need to load the rotation table actually makes the vtbl method
+  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
+  * seems to be a good compromise to get a more significant speed boost on some
+  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+  */
+
 #include <linux/linkage.h>
 
.text
@@ -46,7 +74,9 @@ ENTRY(chacha20_block_xor_neon)
	vmov		q10, q2
	vmov		q11, q3
 
+   adr ip, .Lrol8_table
mov r3, #10
+   vld1.8  {d10}, [ip, :64]
 
 .Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@@ -62,9 +92,9 @@ ENTRY(chacha20_block_xor_neon)
 
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
-	veor		q4, q3, q0
-	vshl.u32	q3, q4, #8
-	vsri.u32	q3, q4, #24
+	veor		q3, q3, q0
+   vtbl.8  d6, {d6}, d10
+   vtbl.8  d7, {d7}, d10
 
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
@@ -92,9 +122,9 @@ ENTRY(chacha20_block_xor_neon)
 
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
-	veor		q4, q3, q0
-	vshl.u32	q3, q4, #8
-	vsri.u32	q3, q4, #24
+	veor		q3, q3, q0
+   vtbl.8  d6, {d6}, d10
+   vtbl.8  d7, {d7}, d10
 
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
@@ -139,13 +169,17 @@ ENTRY(chacha20_block_xor_neon)
bx  lr
 ENDPROC(chacha20_block_xor_neon)
 
+   .align  4
+.Lctrinc:  .word   0, 1, 2, 3
+.Lrol8_table:  .byte   3, 0, 1, 2, 7, 4, 5, 6
+
.align  5
 ENTRY(chacha20_4block_xor_neon)
-	push		{r4-r6, lr}
-   mov ip, sp  // preserve the stack pointer
-   sub r3, sp, #0x20   // allocate a 32 byte buffer
-   bic r3, r3, #0x1f   // aligned to 32 bytes
-   mov sp, r3
+	push		{r4-r5}
+   mov r4, sp  // preserve the stack pointer
+   sub ip, sp, #0x20   // allocate a 32 byte buffer
+   bic ip, ip, #0x1f   // aligned to 32 bytes
+   mov sp, ip
 
// r0: Input state matrix, s
// r1: 4 data blocks output, o
@@ -155,25 +189,24 @@ ENTRY(chacha20_4block_xor_neon)
// This function encrypts four consecutive ChaCha20 blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
-   // requires no word