The last patch follows the C implementation, but I just figured out a decent
way to do the big-endian byte swap of the output words with vperm.
---
 powerpc64/p7/chacha-core-internal.asm | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/powerpc64/p7/chacha-core-internal.asm
b/powerpc64/p7/chacha-core-internal.asm
index 33c721c1..76ca0d45 100644
--- a/powerpc64/p7/chacha-core-internal.asm
+++ b/powerpc64/p7/chacha-core-internal.asm
@@ -53,6 +53,10 @@ define(`S1', `v9')
 define(`S2', `v10')
 define(`S3', `v11')

+C Big-endian only: vperm control vector used by LE_SWAP32, and the scratch register used to build it
+define(`LE_MASK',  `v12')
+define(`LE_TEMP', `v13')
+
 C QROUND(X0, X1, X2, X3)
 define(`QROUND', `
  C x0 += x1, x3 ^= x0, x3 lrot 16
@@ -77,10 +81,18 @@ define(`QROUND', `
  vrlw $2, $2, ROT7
 ')

+C LE_SWAP32(X0, X1, X2, X3): emitted only under IF_BE; permutes each argument vector in place through LE_MASK
+define(`LE_SWAP32', `IF_BE(`
+ vperm   $1, $1, $1, LE_MASK
+ vperm   $2, $2, $2, LE_MASK
+ vperm   $3, $3, $3, LE_MASK
+ vperm   $4, $4, $4, LE_MASK
+')')
+
  .text
- .align 4
  C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)

+define(`FUNC_ALIGN', `5')
 PROLOGUE(_nettle_chacha_core)

  li r6, 0x10 C set up some...
@@ -91,6 +103,12 @@ PROLOGUE(_nettle_chacha_core)
  vspltisw ROT12, 12
  vspltisw ROT8, 8
  vspltisw ROT7, 7
+IF_BE(`
+ li       r9, 0
+ lvsl     LE_MASK, r9, r9
+ vspltisb LE_TEMP, 0x03
+ vxor     LE_MASK, LE_MASK, LE_TEMP
+')

  lxvw4x VSR(X0), 0, SRC
  lxvw4x VSR(X1), r6, SRC
@@ -131,6 +149,8 @@ PROLOGUE(_nettle_chacha_core)
  vadduwm X2, X2, S2
  vadduwm X3, X3, S3

+ LE_SWAP32(X0, X1, X2, X3)
+
  stxvw4x VSR(X0), 0, DST
  stxvw4x VSR(X1), r6, DST
  stxvw4x VSR(X2), r7, DST

-- 
2.17.1
_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

Reply via email to