[email protected] (Niels Möller) writes:

> It could likely be sped up further by processing 2, 3 or 4 blocks in
> parallel.

I've given 2 blocks in parallel a try, but not quite working yet. My
work-in-progress code below.

When I test it on the gcc112 machine, it fails with an illegal
instruction (SIGILL) on this line, close to function entry:

  .globl _nettle_chacha_2core
  .type _nettle_chacha_2core,%function
  .align 5
  _nettle_chacha_2core:
  addis 2,12,(.TOC.-_nettle_chacha_2core)@ha
  addi 2,2,(.TOC.-_nettle_chacha_2core)@l
  .localentry _nettle_chacha_2core, .-_nettle_chacha_2core
  
  
          li      r8, 0x30
          vspltisw v1, 1
  =>      vextractuw v1, v1, 0

I don't understand, from the manual, what's wrong with this. The
intention of this piece of code is just to construct the value {1, 0, 0,
0} in one of the vector registers. Maybe there's a better way to do
that?

Regards,
/Niels

C powerpc64/p7/chacha-core-internal.asm

ifelse(`
   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program.  If
   not, see http://www.gnu.org/licenses/.
')

C Register usage:

C Arguments
define(`DST', `r3')
define(`SRC', `r4')
define(`ROUNDS', `r5')

C State, even elements in X, odd elements in Y
define(`X0', `v0')
define(`X1', `v1')
define(`X2', `v2')
define(`X3', `v3')
define(`Y0', `v4')
define(`Y1', `v5')
define(`Y2', `v6')
define(`Y3', `v7')

C Rotation amounts, splatted across all four word elements
define(`ROT16', `v8')
define(`ROT12', `v9')
define(`ROT8',  `v10')
define(`ROT7',  `v11')

C Original input state
define(`S0', `v12')
define(`S1', `v13')
define(`S2', `v14')
C S3p1 is the second block's row 3 (incremented counter); T0 is scratch.
define(`S3', `v15')
define(`S3p1', `v16')
define(`T0', `v17')

        .text
        C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)

define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_chacha_2core)

        li      r8, 0x30        C offset for x3

        C Build {1,0,...,0} (the counter increment) with base vector ops
        C only.  The original vextractuw is an ISA 3.0 (POWER9)
        C instruction and raises SIGILL on POWER7/POWER8 (e.g. gcc112).
        vspltisw X1, 1          C {1,1,...,1}
        vspltisw Y0, 0          C {0,0,...,0}
        vsldoi  X1, X1, Y0, 12  C {1,0,...,0}

        lxvw4x  VSR(X3), r8, SRC

        C Carry out of the low counter word: carry iff word 0 is
        C 0xffffffff.  vnegw is also ISA 3.0; an all-ones splat gives
        C the same masked result, since the vand below keeps word 0 only.
        vspltisw X0, -1         C {-1,-1,...,-1}
        vcmpequw Y3, X3, X0
        vand    Y3, Y3, X1      C Counter carry out, in word 0
        C Rotate right one word so the carry lands on the counter's high
        C word (element 1); the original shift of 4 rotated it into
        C element 3.  NOTE(review): assumes lxvw4x puts x[12] in word 0;
        C verify element order on little-endian.
        vsldoi  Y3, Y3, Y3, 12
        vor     Y3, Y3, X1      C {1, carry, 0, 0}

.Lshared_entry:
        C Row 3 of the second block: counter incremented by one.
        vadduwm Y3, Y3, X3

        li      r6, 0x10        C set up some...
        li      r7, 0x20        C ...useful...
        lxvw4x  VSR(X0), 0, SRC
        lxvw4x  VSR(X1), r6, SRC
        lxvw4x  VSR(X2), r7, SRC

        C Save input state for the final feed-forward additions.
        vor     S0, X0, X0
        vor     S1, X1, X1
        vor     S2, X2, X2
        vor     S3, X3, X3      C was "vor S3, S3, X3"; S3 was uninitialized
        vor     S3p1, Y3, Y3

        C Interleave the two blocks: even state words in X, odd in Y.
        C Rows 0-2 are identical for both blocks; row 3 must merge X3
        C (first block) with Y3 (second block), via T0 since Y3 is both
        C source and destination.  NOTE(review): vmrgew/vmrgow are ISA
        C 2.07 (POWER8) -- worth checking against this file's p7 target.
        vmrgow  Y0, X0, X0      C  1  1  3  3
        vmrgew  X0, X0, X0      C  0  0  2  2
        vmrgow  Y1, X1, X1      C  5  5  7  7
        vmrgew  X1, X1, X1      C  4  4  6  6
        vmrgow  Y2, X2, X2      C  9  9 11 11
        vmrgew  X2, X2, X2      C  8  8 10 10
        vmrgew  T0, X3, Y3      C 12 12 14 14
        vmrgow  Y3, X3, Y3      C 13 13 15 15
        vor     X3, T0, T0

        vspltisw ROT16, -16     C -16 instead of 16 actually works!
        vspltisw ROT12, 12
        vspltisw ROT8, 8
        vspltisw ROT7, 7

        srdi    ROUNDS, ROUNDS, 1
        mtctr   ROUNDS
.Loop:
C Register layout (A is first block, B is second block)
C
C X0:  A0  B0  A2  B2  Y0:  A1  B1  A3  B3
C X1:  A4  B4  A6  B6  Y1:  A5  B5  A7  B7
C X2:  A8  B8 A10 B10  Y2:  A9  B9 A11 B11
C X3: A12 B12 A14 B14  Y3: A13 B13 A15 B15
        vadduwm X0, X0, X1
         vadduwm Y0, Y0, Y1
        vxor    X3, X3, X0
         vxor   Y3, Y3, Y0
        vrlw    X3, X3, ROT16
         vrlw   Y3, Y3, ROT16

        vadduwm X2, X2, X3
         vadduwm Y2, Y2, Y3
        vxor    X1, X1, X2
         vxor   Y1, Y1, Y2
        vrlw    X1, X1, ROT12
         vrlw   Y1, Y1, ROT12

        vadduwm X0, X0, X1
         vadduwm Y0, Y0, Y1
        vxor    X3, X3, X0
         vxor   Y3, Y3, Y0
        vrlw    X3, X3, ROT8
         vrlw   Y3, Y3, ROT8

        vadduwm X2, X2, X3
         vadduwm Y2, Y2, Y3
        vxor    X1, X1, X2
         vxor   Y1, Y1, Y2
        vrlw    X1, X1, ROT7
         vrlw   Y1, Y1, ROT7

        C Set up the diagonal round.  The d-row paired with X0 is Y3
        C (words 15/13), so Y3 -- not X3 -- must be swapped, matching
        C the layout comment below (the original rotated X3).
        vsldoi  X1, X1, X1, 8
        vsldoi  X2, X2, X2, 8
        vsldoi  Y2, Y2, Y2, 8
        vsldoi  Y3, Y3, Y3, 8

C Register layout:
C X0:  A0  B0  A2  B2  Y0:  A1  B1  A3  B3
C Y1:  A5  B5  A7  B7  X1:  A6  B6  A4  B4 (X1 swapped)
C X2: A10 B10  A8  B8  Y2: A11 B11  A9  B9 (X2, Y2 swapped)
C Y3: A15 B15 A13 B13  X3: A12 B12 A14 B14 (Y3 swapped)

        vadduwm X0, X0, Y1
         vadduwm Y0, Y0, X1
        vxor    Y3, Y3, X0
         vxor   X3, X3, Y0
        vrlw    Y3, Y3, ROT16
         vrlw   X3, X3, ROT16

        vadduwm X2, X2, Y3
         vadduwm Y2, Y2, X3
        vxor    Y1, Y1, X2
         vxor   X1, X1, Y2
        vrlw    Y1, Y1, ROT12
         vrlw   X1, X1, ROT12

        vadduwm X0, X0, Y1
         vadduwm Y0, Y0, X1     C was "vadduwm Y0, Y0, Y1"; Y0 pairs with X1
        vxor    Y3, Y3, X0
         vxor   X3, X3, Y0
        vrlw    Y3, Y3, ROT8
         vrlw   X3, X3, ROT8

        vadduwm X2, X2, Y3
         vadduwm Y2, Y2, X3
        vxor    Y1, Y1, X2
         vxor   X1, X1, Y2
        vrlw    Y1, Y1, ROT7
         vrlw   X1, X1, ROT7

        C Undo the swaps, back to the column-round layout.
        vsldoi  X1, X1, X1, 8
        vsldoi  X2, X2, X2, 8
        vsldoi  Y2, Y2, Y2, 8
        vsldoi  Y3, Y3, Y3, 8

        bdnz    .Loop

        C De-interleave by element parity: vmrgew collects the first
        C block's words, vmrgow the second block's.  The original
        C vmrghw/vmrglw merge by halves and would mix the two blocks.
        vmrgew  T0, X0, Y0      C A0 A1 A2 A3
        vmrgow  Y0, X0, Y0      C B0 B1 B2 B3

        vmrgew  X0, X1, Y1
        vmrgow  Y1, X1, Y1

        vmrgew  X1, X2, Y2
        vmrgow  Y2, X2, Y2

        vmrgew  X2, X3, Y3
        vmrgow  Y3, X3, Y3

        C Feed-forward: add the saved input state; the second block's
        C row 3 uses the incremented counter, S3p1.
        vadduwm T0, T0, S0
        vadduwm Y0, Y0, S0
        vadduwm X0, X0, S1
        vadduwm Y1, Y1, S1
        vadduwm X1, X1, S2
        vadduwm Y2, Y2, S2
        vadduwm X2, X2, S3
        vadduwm Y3, Y3, S3p1

        C First output block.
        stxvw4x VSR(T0), 0, DST
        stxvw4x VSR(X0), r6, DST
        stxvw4x VSR(X1), r7, DST
        stxvw4x VSR(X2), r8, DST

        addi    DST, DST, 64

        C Second output block; the original stored T0/X0/X1/X2 again,
        C duplicating the first block.
        stxvw4x VSR(Y0), 0, DST
        stxvw4x VSR(Y1), r6, DST
        stxvw4x VSR(Y2), r7, DST
        stxvw4x VSR(Y3), r8, DST
        blr
EPILOGUE(_nettle_chacha_2core)

define(`FUNC_ALIGN', `5')
C Variant with a plain 32-bit counter: no carry into word 1, so the
C increment is just {1,0,...,0}.
PROLOGUE(_nettle_chacha_2core32)
        li      r8, 0x30        C offset for x3
        C Build {1,0,...,0} without vextractuw, which is ISA 3.0
        C (POWER9) only and raises SIGILL on POWER7/POWER8.  X0 is free
        C scratch here; .Lshared_entry reloads it from SRC.
        vspltisw Y3, 1          C {1,1,...,1}
        vspltisw X0, 0          C {0,0,...,0}
        vsldoi  Y3, Y3, X0, 12  C {1,0,...,0}
        lxvw4x  VSR(X3), r8, SRC
        b       .Lshared_entry
EPILOGUE(_nettle_chacha_2core32)

        .data
        .align 4
C Constant {1,0,0,0} (counter increment).  Currently unreferenced by the
C code above; kept as an alternative to constructing the value in
C registers (would need TOC-relative addressing to load).
.Lcount1:
        .int 1,0,0,0

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

Reply via email to