On Fri, 2024-09-13 at 10:32 +0200, Niels Möller wrote:
> Below is a patch eliminating a few of the instructions used for
> indexing, and the T1 register. Would you like to try it out on real
> hardware? I would expect performance to be unchanged or very
> marginally improved, but given your experience I can't rule out that
> some tweaks to the align directives are needed to rule out a
> regression.
If there is one thing I've come to expect, it is strange, unexpected
regressions. This is the performance diff from your patch, compared
against current master:

alg/mode                min             avg             max
----------------------  --------------  --------------  --------------
sha256 update           460.75 (-8.08)  460.92 (-8.03)  461.01 (-8.00)
hmac-sha256 64 bytes    127.22 (-0.03)  127.87 (+0.33)  128.47 (+0.50)
hmac-sha256 256 bytes   271.76 (-1.55)  272.96 (-0.82)  274.35 (-0.49)
hmac-sha256 1024 bytes  389.79 (-4.70)  391.25 (-4.48)  393.18 (-4.04)
hmac-sha256 4096 bytes  437.11 (-6.70)  437.52 (-6.88)  438.21 (-6.71)
hmac-sha256 single msg  452.83 (-1.55)  453.06 (-6.64)  453.22 (-7.47)

Playing around with nops and aligns, I've only been able to gain back a
small and inconsistent amount of performance (roughly +1 on this patch
at best) -- nothing that seems to indicate a pattern.

Incidentally, I'll be setting up the deeper test environment for some
colleagues early next week, so by using this code as an example I may
be able to find a more optimal code arrangement.

> With the freed up volatile registers, one could potentially use r11
> and r12 for additional constants TC32 and TC48, but since that would
> eliminate only a single instruction (replacing the two "addi INPUT,
> INPUT, 32" with a single "addi INPUT, INPUT, 64") from the main block
> loop, I doubt that's worth the effort.

Considering the strange, unexpected results coming from alignment, it
could somehow yield better performance. I'll look into this in the
upcoming testing as well.
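For reference, the delta columns in a table like the one above are just "patched value (patched minus baseline)" per statistic. A minimal sketch of that formatting, assuming a hypothetical layout of the harness output (the function names and tuple layout are illustrative, not the actual benchmark tooling; the sample numbers are the sha256 update row):

```python
def fmt_cell(patched, baseline):
    """Render 'patched (delta)' with an explicit sign on the delta."""
    return f"{patched:.2f} ({patched - baseline:+.2f})"

def compare_row(name, patched, baseline):
    """One table row: algorithm name plus min/avg/max cells."""
    cells = "  ".join(fmt_cell(p, b) for p, b in zip(patched, baseline))
    return f"{name:<22}  {cells}"

# Sample: the sha256 update row, patched vs. baseline (min, avg, max).
print(compare_row("sha256 update",
                  (460.75, 460.92, 461.01),
                  (468.83, 468.95, 469.01)))
```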
> 
> Regards,
> /Niels
> 
> diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
> index 75666deb..a8c5ee44 100644
> --- a/powerpc64/p8/sha256-compress-n.asm
> +++ b/powerpc64/p8/sha256-compress-n.asm
> @@ -40,11 +40,10 @@ define(`NUMBLOCKS', `r5')
>  define(`INPUT', `r6')
>  
>  define(`T0', `r7')
> -define(`T1', `r8')
> -define(`TK', `r9')
> -define(`COUNT', `r10')
> +define(`TK', `r8')
> +define(`COUNT', `r9')
>  define(`TC0', `0')	C Index instructions allow literal 0 instead of a GPR
> -define(`TC16', `r11')
> +define(`TC16', `r10')
>  
>  C State registers
>  define(`VSA', `v0')
> @@ -182,27 +181,25 @@ define(`LOAD', `
>  ')
>  
>  define(`DOLOADS', `
> -	IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
> +	IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T0)')
>  	LOAD(0, TC0)
>  	vsldoi IV(1), IV(0), IV(0), 4
>  	vsldoi IV(2), IV(0), IV(0), 8
>  	vsldoi IV(3), IV(0), IV(0), 12
> -	addi INPUT, INPUT, 16
> -	LOAD(4, TC0)
> +	LOAD(4, TC16)
>  	vsldoi IV(5), IV(4), IV(4), 4
>  	vsldoi IV(6), IV(4), IV(4), 8
>  	vsldoi IV(7), IV(4), IV(4), 12
> -	addi INPUT, INPUT, 16
> +	addi INPUT, INPUT, 32
>  	LOAD(8, TC0)
>  	vsldoi IV(9), IV(8), IV(8), 4
>  	vsldoi IV(10), IV(8), IV(8), 8
>  	vsldoi IV(11), IV(8), IV(8), 12
> -	addi INPUT, INPUT, 16
> -	LOAD(12, TC0)
> +	LOAD(12, TC16)
>  	vsldoi IV(13), IV(12), IV(12), 4
>  	vsldoi IV(14), IV(12), IV(12), 8
>  	vsldoi IV(15), IV(12), IV(12), 12
> -	addi INPUT, INPUT, 16
> +	addi INPUT, INPUT, 32
>  ')
>  
>  .text
> @@ -215,30 +212,24 @@ PROLOGUE(_nettle_sha256_compress_n)
>  
>  	ALIGN(16) C Appears necessary for optimal stores
>  	li TC16, 16
> -	li T0, -16
> -	li T1, -32
> -	stvx v20, T0, SP
> -	stvx v21, T1, SP
> +	subi T0, SP, 32
> +	stvx v20, TC16, T0
> +	stvx v21, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	stvx v22, T0, SP
> -	stvx v23, T1, SP
> +	stvx v22, TC16, T0
> +	stvx v23, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	stvx v24, T0, SP
> -	stvx v25, T1, SP
> +	stvx v24, TC16, T0
> +	stvx v25, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	stvx v26, T0, SP
> -	stvx v27, T1, SP
> +	stvx v26, TC16, T0
> +	stvx v27, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	stvx v28, T0, SP
> -	stvx v29, T1, SP
> +	stvx v28, TC16, T0
> +	stvx v29, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	stvx v30, T0, SP
> -	stvx v31, T1, SP
> +	stvx v30, TC16, T0
> +	stvx v31, TC0, T0
>  
>  	ALIGN(16) C Appears necessary for optimal loads
>  
> @@ -313,30 +304,24 @@ PROLOGUE(_nettle_sha256_compress_n)
>  
>  	C Restore nonvolatile registers
> -	li T0, -16
> -	li T1, -32
> -	lvx v20, T0, SP
> -	lvx v21, T1, SP
> +	subi T0, SP, 32
> +	lvx v20, TC16, T0
> +	lvx v21, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	lvx v22, T0, SP
> -	lvx v23, T1, SP
> +	lvx v22, TC16, T0
> +	lvx v23, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	lvx v24, T0, SP
> -	lvx v25, T1, SP
> +	lvx v24, TC16, T0
> +	lvx v25, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	lvx v26, T0, SP
> -	lvx v27, T1, SP
> +	lvx v26, TC16, T0
> +	lvx v27, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	lvx v28, T0, SP
> -	lvx v29, T1, SP
> +	lvx v28, TC16, T0
> +	lvx v29, TC0, T0
>  	subi T0, T0, 32
> -	subi T1, T1, 32
> -	lvx v30, T0, SP
> -	lvx v31, T1, SP
> +	lvx v30, TC16, T0
> +	lvx v31, TC0, T0
>  
>  .done:
>  	mr r3, INPUT

_______________________________________________
nettle-bugs mailing list -- [email protected]
To unsubscribe send an email to [email protected]
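[Editorial aside: the register save/restore rewrite in the patch above is address-preserving. The old code kept two offset registers, T0 = -16 and T1 = -32, both indexed off SP; the new code keeps a single moving base T0 = SP - 32 and indexes it with the constants 16 and 0 (TC16 and TC0). A quick sanity sketch of the address arithmetic, in plain Python rather than the actual assembler, with an arbitrary SP value:]

```python
SP = 0x1000  # arbitrary stack pointer value for the simulation

def old_addresses():
    """Old scheme: T0 = -16, T1 = -32, both pairs indexed off SP."""
    t0, t1, addrs = -16, -32, []
    for _ in range(6):                 # six pairs: v20/v21 ... v30/v31
        addrs += [SP + t0, SP + t1]    # stvx vN, T0, SP / stvx vN+1, T1, SP
        t0 -= 32                       # subi T0, T0, 32
        t1 -= 32                       # subi T1, T1, 32
    return addrs

def new_addresses():
    """New scheme: one moving base T0 = SP - 32, indexed by 16 and 0."""
    t0, addrs = SP - 32, []
    for _ in range(6):
        addrs += [t0 + 16, t0 + 0]     # stvx vN, TC16, T0 / stvx vN+1, TC0, T0
        t0 -= 32                       # subi T0, T0, 32
    return addrs

assert old_addresses() == new_addresses()
print("old and new store sequences hit the same 12 stack slots")
```

The same equivalence holds for the lvx restore sequence, which mirrors the stores.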
