Originally, the 16 input words were loaded with 16 individual vector load
instructions. This has a side effect where the last three loads would
overread 1/2/3 extra words.

Fix the overread by replacing unnecessary overlapped reads with shifts.
As a consequence, the constant registers for 4,8,12 can be removed, and
also gain about 1~2% in performance.

Signed-off-by: Eric Richter <[email protected]>
---

So this was supposed to be a simple update, removing those eight
unnecessary instructions. However, removing the GPR store instructions
introduced a performance regression of roughly -5% -- more performance lost
than gained in the previous version.

After a lot of trial-and-error, I've landed on this version, which includes
yet another align, this time before the register storage section. I tried
placing it in other locations, as well as changing the align value,
eventually landing on this as the most performant version that I could
find through brute force. Moving the load-immediate for TC16 to before the
register save section marginally improved performance (<1%), and in a less
technical opinion appears better than doing:

  ALIGN(16)
  li  TC16, 16
  ALIGN(16)

  lxv4wx
  lxv4wx

When I have more time, I still intend to do a deeper dive and try to find
out exactly why these aligns are helping/hurting performance. At minimum
I'd like to send a patch for better comments to document the logic, so
future implementations can learn from it.

v2:
 - removed store/restore of now unused r14/r15
 - moved the initial load of TC16 to be with other load immediates
 - changed .align to use the ALIGN macro
 - added an ALIGN before the register saving stores
 - added comments to ALIGN


 powerpc64/p8/sha256-compress-n.asm | 44 +++++++++++-------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/powerpc64/p8/sha256-compress-n.asm 
b/powerpc64/p8/sha256-compress-n.asm
index e08ae132..75666deb 100644
--- a/powerpc64/p8/sha256-compress-n.asm
+++ b/powerpc64/p8/sha256-compress-n.asm
@@ -44,10 +44,7 @@ define(`T1', `r8')
 define(`TK', `r9')
 define(`COUNT', `r10')
 define(`TC0', `0')     C Index instructions allow literal 0 instead of a GPR
-define(`TC4', `r11')
-define(`TC8', `r12')
-define(`TC12', `r14')
-define(`TC16', `r15')
+define(`TC16', `r11')
 
 C State registers
 define(`VSA', `v0')
@@ -187,24 +184,24 @@ define(`LOAD', `
 define(`DOLOADS', `
        IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
        LOAD(0, TC0)
-       LOAD(1, TC4)
-       LOAD(2, TC8)
-       LOAD(3, TC12)
+       vsldoi  IV(1), IV(0), IV(0), 4
+       vsldoi  IV(2), IV(0), IV(0), 8
+       vsldoi  IV(3), IV(0), IV(0), 12
        addi    INPUT, INPUT, 16
        LOAD(4, TC0)
-       LOAD(5, TC4)
-       LOAD(6, TC8)
-       LOAD(7, TC12)
+       vsldoi  IV(5), IV(4), IV(4), 4
+       vsldoi  IV(6), IV(4), IV(4), 8
+       vsldoi  IV(7), IV(4), IV(4), 12
        addi    INPUT, INPUT, 16
        LOAD(8, TC0)
-       LOAD(9, TC4)
-       LOAD(10, TC8)
-       LOAD(11, TC12)
+       vsldoi  IV(9), IV(8), IV(8), 4
+       vsldoi  IV(10), IV(8), IV(8), 8
+       vsldoi  IV(11), IV(8), IV(8), 12
        addi    INPUT, INPUT, 16
        LOAD(12, TC0)
-       LOAD(13, TC4)
-       LOAD(14, TC8)
-       LOAD(15, TC12)
+       vsldoi  IV(13), IV(12), IV(12), 4
+       vsldoi  IV(14), IV(12), IV(12), 8
+       vsldoi  IV(15), IV(12), IV(12), 12
        addi    INPUT, INPUT, 16
 ')
 
@@ -216,6 +213,8 @@ PROLOGUE(_nettle_sha256_compress_n)
 
        C Store non-volatile registers
 
+       ALIGN(16)       C Appears necessary for optimal stores
+       li      TC16, 16
        li      T0, -16
        li      T1, -32
        stvx    v20, T0, SP
@@ -240,15 +239,8 @@ PROLOGUE(_nettle_sha256_compress_n)
        subi    T1, T1, 32
        stvx    v30, T0, SP
        stvx    v31, T1, SP
-       subi    T0, T0, 32
-       subi    T1, T1, 32
-       stdx    r14, T0, SP
-       stdx    r15, T1, SP
 
-       li      TC4, 4
-       li      TC8, 8
-       li      TC12, 12
-       li      TC16, 16
+       ALIGN(16)       C Appears necessary for optimal loads
 
        C Load state values
        lxvw4x  VSR(VSA), 0, STATE      C VSA contains A,B,C,D
@@ -345,10 +337,6 @@ PROLOGUE(_nettle_sha256_compress_n)
        subi    T1, T1, 32
        lvx     v30, T0, SP
        lvx     v31, T1, SP
-       subi    T0, T0, 32
-       subi    T1, T1, 32
-       ldx     r14, T0, SP
-       ldx     r15, T1, SP
 
 .done:
        mr r3, INPUT
-- 
2.46.0

_______________________________________________
nettle-bugs mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to