On Raspberry Pi 3B+ (Cortex-A53 @ 1.4 GHz):
Before:
 aes128         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     39.58 ns/B     24.10 MiB/s         - c/B
        ECB dec |     39.57 ns/B     24.10 MiB/s         - c/B
After:
        ECB enc |     15.24 ns/B     62.57 MiB/s         - c/B
        ECB dec |     15.68 ns/B     60.80 MiB/s         - c/B

Passes the Nettle regression tests (little-endian only, though).

Does not use pre-rotated tables (as in AES_SMALL), which reduces the
d-cache footprint from 4.25 KiB to 1 KiB (enc) / 1.25 KiB (dec);
it is completely unrolled, which increases the i-cache footprint
from 948 bytes to 4416 bytes (enc) / 4032 bytes (dec).
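
To illustrate the trade-off, a minimal C sketch (hypothetical helper
names, not code from this patch): the four-table variant keeps three
extra byte-rotated copies of T0 hot in the d-cache, while the
single-table variant recomputes the rotations, which the ARM barrel
shifter provides for free on the second operand of eor.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n)
{
  return (x >> n) | (x << (32 - n));
}

/* Classic four-table round column: the tables alone occupy 4 KiB. */
uint32_t
column_4tab(const uint32_t T0[256], const uint32_t T1[256],
            const uint32_t T2[256], const uint32_t T3[256],
            uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  return T0[a & 0xff] ^ T1[(b >> 8) & 0xff]
    ^ T2[(c >> 16) & 0xff] ^ T3[d >> 24];
}

/* Single-table round column: 1 KiB of table data; the rotations map
   onto "eor ..., RTx, ror #n" instructions, as in the assembly below. */
uint32_t
column_1tab(const uint32_t T0[256],
            uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  return T0[a & 0xff] ^ ror32(T0[(b >> 8) & 0xff], 24)
    ^ ror32(T0[(c >> 16) & 0xff], 16) ^ ror32(T0[d >> 24], 8);
}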

As it completely replaces the current implementation, I have just
attached the new files (I will post the final version as a patch).

P.S. Yes, I tried converting the macros to m4: complete failure (no
named parameters, problems with more than 9 arguments, weird expansion
rules), so I fell back to good ol' gas macros. Sorry.

P.P.S. With this change, gcm/neon, and (to-be-published)
chacha_blocks/neon, gnutls-cli --benchmark-ciphers gives:
Before:
Checking cipher-MAC combinations, payload size: 16384
             AES-128-GCM 13.56 MB/sec
       CHACHA20-POLY1305 68.26 MB/sec
        AES-128-CBC-SHA1 16.72 MB/sec
      AES-128-CBC-SHA256 15.07 MB/sec
After:
             AES-128-GCM 35.32 MB/sec
       CHACHA20-POLY1305 94.94 MB/sec
        AES-128-CBC-SHA1 27.53 MB/sec
      AES-128-CBC-SHA256 23.30 MB/sec
C arm/v6/aes-decrypt-internal.asm

ifelse(<
rijndael-arm.S  -  ARM assembly implementation of AES cipher

Copyright (C) 2013 Jussi Kivilinna <[email protected]>

This file is part of Libgcrypt.

Libgcrypt is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.

Libgcrypt is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this program; if not, see <http://www.gnu.org/licenses/>.
>)

.text
        .file "aes-decrypt-internal.asm"
        .arch armv6
        .syntax unified
        .arm

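C With KEYSCHEDULE_REVERSED set, the decryption subkeys are stored in
C the order they are used, so CTX can simply be walked forward with
C post-incremented loads; otherwise every round indexes the key
C schedule by its round number.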
define(<KEYSCHEDULE_REVERSED>,<yes>)
define(<IF_KEYSCHEDULE_REVERSED>,<ifelse(
KEYSCHEDULE_REVERSED,yes,<$1>,
KEYSCHEDULE_REVERSED,no,<$2>)>)

C register macros
define(<PARAM_ROUNDS>, <r0>)
define(<PARAM_LENGTH>, <r3>)

define(<FRAME_ROUNDS>,<[sp, #0]>)
define(<FRAME_LENGTH>,<[sp, #4]>)
define(<FRAME_DST>,<[sp, #(48+0)]>)
define(<FRAME_SRC>,<[sp, #(48+4)]>)
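C The prologue pushes 12 registers (48 bytes); dst and src, passed on
C the stack, are therefore reached at sp+48 and sp+52 after the push.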

define(<CTX>, <%r1>)
define(<RTAB>, <%r2>)
define(<RMASK>, <%ip>)

define(<RA>, <%r4>)
define(<RB>, <%r5>)
define(<RC>, <%r6>)
define(<RD>, <%r7>)

define(<RNA>, <%r8>)
define(<RNB>, <%r9>)
define(<RNC>, <%r10>)
define(<RND>, <%r11>)

define(<RT0>, <%r0>)
define(<RT1>, <%r3>)
define(<RT2>, <%lr>)

C helper macros
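C ldr_unaligned_le/str_unaligned_le access a little-endian 32-bit
C word one byte at a time, for pointers that may not be 4-byte
C aligned.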
.macro ldr_unaligned_le rout rsrc offs rtmp
        ldrb \rout, [\rsrc, #((\offs) + 0)]
        ldrb \rtmp, [\rsrc, #((\offs) + 1)]
        orr \rout, \rout, \rtmp, lsl #8
        ldrb \rtmp, [\rsrc, #((\offs) + 2)]
        orr \rout, \rout, \rtmp, lsl #16
        ldrb \rtmp, [\rsrc, #((\offs) + 3)]
        orr \rout, \rout, \rtmp, lsl #24
.endm

.macro str_unaligned_le rin rdst offs rtmp0 rtmp1
        mov \rtmp0, \rin, lsr #8
        strb \rin, [\rdst, #((\offs) + 0)]
        mov \rtmp1, \rin, lsr #16
        strb \rtmp0, [\rdst, #((\offs) + 1)]
        mov \rtmp0, \rin, lsr #24
        strb \rtmp1, [\rdst, #((\offs) + 2)]
        strb \rtmp0, [\rdst, #((\offs) + 3)]
.endm

C ***********************************************************************
C ARM assembly implementation of the AES cipher
C ***********************************************************************

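C preload_first_key fetches the first word of the next round key
C while the current round is still executing; dummy has the same
C signature and is passed when no preload is wanted.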
.macro preload_first_key round ra
IF_KEYSCHEDULE_REVERSED(<
        ldr \ra, [CTX], #+4
>,<
        ldr \ra, [CTX, #(((\round) * 16) + 0 * 4)]
>)
.endm
.macro dummy round ra
.endm
.macro addroundkey ra rb rc rd rna rnb rnc rnd preload_key
IF_KEYSCHEDULE_REVERSED(<
        ldm CTX!, {\rna, \rnb, \rnc, \rnd}
>,<
        ldm CTX, {\rna, \rnb, \rnc, \rnd}
>)
        eor \ra, \rna
        eor \rb, \rnb
        eor \rc, \rnc
        \preload_key 1, \rna
        eor \rd, \rnd
.endm

.macro addroundkey_dec round ra rb rc rd rna rnb rnc rnd
IF_KEYSCHEDULE_REVERSED(<
        addroundkey \ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd,preload_first_key
>,<
        ldr \rna, [CTX, #(((\round) * 16) + 0 * 4)]
        ldr \rnb, [CTX, #(((\round) * 16) + 1 * 4)]
        eor \ra, \rna
        ldr \rnc, [CTX, #(((\round) * 16) + 2 * 4)]
        eor \rb, \rnb
        ldr \rnd, [CTX, #(((\round) * 16) + 3 * 4)]
        eor \rc, \rnc
        preload_first_key (\round) - 1, \rna
        eor \rd, \rnd
>)
.endm

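C do_decround computes one inner decryption round: ra..rd hold the
C current state columns, rna..rnd accumulate the next state, and the
C table loads are interleaved with the address computations to hide
C load-use latency.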
.macro do_decround next_r ra rb rc rd rna rnb rnc rnd preload_key
IF_KEYSCHEDULE_REVERSED(<
        ldr \rnb, [CTX], #+4
>,<
        ldr \rnb, [CTX, #(((\next_r) * 16) + 1 * 4)]
>)

        and RT0, RMASK, \ra, lsl#2
IF_KEYSCHEDULE_REVERSED(<
        ldr \rnc, [CTX], #+4
>,<
        ldr \rnc, [CTX, #(((\next_r) * 16) + 2 * 4)]
>)
        and RT1, RMASK, \ra, lsr#(8 - 2)
IF_KEYSCHEDULE_REVERSED(<
        ldr \rnd, [CTX], #+4
>,<
        ldr \rnd, [CTX, #(((\next_r) * 16) + 3 * 4)]
>)
        and RT2, RMASK, \ra, lsr#(16 - 2)
        ldr RT0, [RTAB, RT0]
        and \ra,  RMASK, \ra, lsr#(24 - 2)

        ldr RT1, [RTAB, RT1]
        eor \rna, \rna, RT0
        ldr RT2, [RTAB, RT2]
        and RT0, RMASK, \rb, lsl#2
        ldr \ra,  [RTAB, \ra]

        eor \rnb, \rnb, RT1, ror #24
        and RT1, RMASK, \rb, lsr#(8 - 2)
        eor \rnc, \rnc, RT2, ror #16
        and RT2, RMASK, \rb, lsr#(16 - 2)
        eor \rnd, \rnd, \ra, ror #8
        ldr RT0, [RTAB, RT0]
        and \rb,  RMASK, \rb, lsr#(24 - 2)

        ldr RT1, [RTAB, RT1]
        eor \rnb, \rnb, RT0
        ldr RT2, [RTAB, RT2]
        and RT0, RMASK, \rc, lsl#2
        ldr \rb,  [RTAB, \rb]

        eor \rnc, \rnc, RT1, ror #24
        and RT1, RMASK, \rc, lsr#(8 - 2)
        eor \rnd, \rnd, RT2, ror #16
        and RT2, RMASK, \rc, lsr#(16 - 2)
        eor \rna, \rna, \rb, ror #8
        ldr RT0, [RTAB, RT0]
        and \rc,  RMASK, \rc, lsr#(24 - 2)

        ldr RT1, [RTAB, RT1]
        eor \rnc, \rnc, RT0
        ldr RT2, [RTAB, RT2]
        and RT0, RMASK, \rd, lsl#2
        ldr \rc,  [RTAB, \rc]

        eor \rnd, \rnd, RT1, ror #24
        and RT1, RMASK, \rd, lsr#(8 - 2)
        eor \rna, \rna, RT2, ror #16
        and RT2, RMASK, \rd, lsr#(16 - 2)
        eor \rnb, \rnb, \rc, ror #8
        ldr RT0, [RTAB, RT0]
        and \rd,  RMASK, \rd, lsr#(24 - 2)

        ldr RT1, [RTAB, RT1]
        eor \rnd, \rnd, RT0
        ldr RT2, [RTAB, RT2]
        eor \rna, \rna, RT1, ror #24
        ldr \rd,  [RTAB, \rd]

        eor \rnb, \rnb, RT2, ror #16
        \preload_key (\next_r) - 1, \ra
        eor \rnc, \rnc, \rd, ror #8
.endm

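C do_lastdecround is the final round: byte-wide inverse S-box lookups
C (RMASK is a plain 0xff here), assembled with orr since each lookup
C contributes a single byte of the result.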
.macro do_lastdecround ra rb rc rd rna rnb rnc rnd
        and RT0, RMASK, \ra
        and RT1, RMASK, \ra, lsr#8
        and RT2, RMASK, \ra, lsr#16
        ldrb \rna, [RTAB, RT0]
        mov \ra,  \ra, lsr#24
        ldrb \rnb, [RTAB, RT1]
        and RT0, RMASK, \rb
        ldrb \rnc, [RTAB, RT2]
        mov \rnb, \rnb, ror #24
        ldrb \rnd, [RTAB, \ra]
        and RT1, RMASK, \rb, lsr#8
        mov \rnc, \rnc, ror #16
        and RT2, RMASK, \rb, lsr#16
        mov \rnd, \rnd, ror #8
        ldrb RT0, [RTAB, RT0]
        mov \rb,  \rb, lsr#24
        ldrb RT1, [RTAB, RT1]

        orr \rnb, \rnb, RT0
        ldrb RT2, [RTAB, RT2]
        and RT0, RMASK, \rc
        ldrb \rb,  [RTAB, \rb]
        orr \rnc, \rnc, RT1, ror #24
        and RT1, RMASK, \rc, lsr#8
        orr \rnd, \rnd, RT2, ror #16
        and RT2, RMASK, \rc, lsr#16
        orr \rna, \rna, \rb, ror #8
        ldrb RT0, [RTAB, RT0]
        mov \rc,  \rc, lsr#24
        ldrb RT1, [RTAB, RT1]

        orr \rnc, \rnc, RT0
        ldrb RT2, [RTAB, RT2]
        and RT0, RMASK, \rd
        ldrb \rc,  [RTAB, \rc]
        orr \rnd, \rnd, RT1, ror #24
        and RT1, RMASK, \rd, lsr#8
        orr \rna, \rna, RT2, ror #16
        ldrb RT0, [RTAB, RT0]
        and RT2, RMASK, \rd, lsr#16
        ldrb RT1, [RTAB, RT1]
        orr \rnb, \rnb, \rc, ror #8
        ldrb RT2, [RTAB, RT2]
        mov \rd,  \rd, lsr#24
        ldrb \rd,  [RTAB, \rd]

        orr \rnd, \rnd, RT0
        orr \rna, \rna, RT1, ror #24
        orr \rnb, \rnb, RT2, ror #16
        orr \rnc, \rnc, \rd, ror #8
.endm

.macro firstdecround round ra rb rc rd rna rnb rnc rnd
        addroundkey_dec ((\round) + 1), \ra, \rb, \rc, \rd, \rna, \rnb, \rnc, \rnd
        do_decround \round, \ra, \rb, \rc, \rd, \rna, \rnb, \rnc, \rnd, preload_first_key
.endm

.macro decround round ra rb rc rd rna rnb rnc rnd preload_key
        do_decround \round, \ra, \rb, \rc, \rd, \rna, \rnb, \rnc, \rnd, \preload_key
.endm

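C set_last_round_rmask has the signature of a preload callback, but
C instead of loading a key word it resets RMASK to an unshifted 0xff
C byte mask for the final round's ldrb lookups.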
.macro set_last_round_rmask _ __
        mov RMASK, #0xff
.endm

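C lastdecround steps RTAB back to the byte-wide inverse S-box at the
C start of the table struct, performs the final substitution round and
C adds the last round key.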
.macro lastdecround round ra rb rc rd rna rnb rnc rnd
        sub RTAB, #AES_TABLE0
        do_lastdecround \ra, \rb, \rc, \rd, \rna, \rnb, \rnc, \rnd
        addroundkey \rna, \rnb, \rnc, \rnd, \ra, \rb, \rc, \rd, dummy
        add RTAB, #AES_TABLE0
.endm

        C _aes_decrypt(unsigned rounds, const uint32_t *keys,
        C              const struct aes_table *T,
        C              size_t length, uint8_t *dst,
        C              uint8_t *src)
        C r0 rounds
        C r1 ctx
        C r2 table
        C r3 length
        C [sp, #0] dst
        C [sp, #4] src

PROLOGUE(_nettle_aes_decrypt)
        .cfi_startproc
        teq     PARAM_LENGTH, #0
        bxeq    lr

        push {r0,r3,%r4-%r11, %ip, %lr}
        .cfi_adjust_cfa_offset 48
        .cfi_rel_offset r0, 0   C PARAM_ROUNDS
        .cfi_rel_offset r3, 4   C PARAM_LENGTH
        .cfi_rel_offset r4, 8
        .cfi_rel_offset r5, 12
        .cfi_rel_offset r6, 16
        .cfi_rel_offset r7, 20
        .cfi_rel_offset r8, 24
        .cfi_rel_offset r9, 28
        .cfi_rel_offset r10, 32
        .cfi_rel_offset r11, 36
        .cfi_rel_offset ip, 40
        .cfi_rel_offset lr, 44
        add     RTAB, RTAB, #AES_TABLE0
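        C RTAB now points at the word-wide round table, AES_TABLE0
        C bytes into struct aes_table; the byte-wide S-box at the
        C start of the struct is only needed for the last round.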
        C read input block
.Lblock_loop:
        ldr     RT0, FRAME_SRC

ifelse(V6,V6,<
        ldr     RA, [RT0]
        ldr     RB, [RT0, #4]
        ldr     RC, [RT0, #8]
        ldr     RD, [RT0, #12]
  IF_BE(<
        rev     RA, RA
        rev     RB, RB
        rev     RC, RC
        rev     RD, RD
  >)
>,<
  IF_LE(<
        C test if src is unaligned
        tst     RT0, #3
        beq     1f
  >)

        C unaligned load
        ldr_unaligned_le RA, RT0, 0, RNA
        ldr_unaligned_le RB, RT0, 4, RNB
        ldr_unaligned_le RC, RT0, 8, RNA
        ldr_unaligned_le RD, RT0, 12, RNB
  IF_LE(<
        b       2f
.ltorg
1:
        C aligned load
        ldm     RT0, {RA, RB, RC, RD}
2:
  >)
>)
        add     RT0, RT0, #16
        mov     RMASK, #0xff
        str     RT0, FRAME_SRC
        ldr     RT1, FRAME_ROUNDS
        mov     RMASK, RMASK, lsl#2     C byte mask, pre-shifted for 4-byte table entries

        cmp     RT1, #12
        bge     .Ldec_256

        firstdecround 9, RA, RB, RC, RD, RNA, RNB, RNC, RND
.Ldec_tail:
        decround 8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        decround 7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        decround 6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        decround 5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        decround 4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        decround 3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        decround 2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        decround 1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask
        lastdecround 0, RNA, RNB, RNC, RND, RA, RB, RC, RD

IF_KEYSCHEDULE_REVERSED(<
        ldr     RT1, FRAME_ROUNDS
>)
        ldr     RT0, FRAME_DST
IF_KEYSCHEDULE_REVERSED(<
        add     RT1, 1
>)
        ldr     RT2, FRAME_LENGTH
IF_KEYSCHEDULE_REVERSED(<
        sub     CTX, CTX, RT1, lsl#4
>)

        C store output block

ifelse(V6,V6,<
  IF_BE(<
        rev     RA, RA
        rev     RB, RB
        rev     RC, RC
        rev     RD, RD
  >)
        str RA, [RT0]
        str RB, [RT0, #4]
        str RC, [RT0, #8]
        str RD, [RT0, #12]
>,<
  IF_LE(<
        C test if dst is unaligned
        tst     RT0, #3
        beq     1f
  >)

        C unaligned store
        str_unaligned_le RA, RT0, 0, RNA, RNB
        str_unaligned_le RB, RT0, 4, RNA, RNB
        str_unaligned_le RC, RT0, 8, RNA, RNB
        str_unaligned_le RD, RT0, 12, RNA, RNB
  IF_LE(<
        b       2f
.ltorg
1:
        C aligned store
        C write output block
        stm     RT0, {RA, RB, RC, RD}
2:
  >)
>)
        add     RT0, RT0, #16
        subs    RT2, RT2, #16
        str     RT0, FRAME_DST
        str     RT2, FRAME_LENGTH
        bhi     .Lblock_loop
        .cfi_remember_state
        pop {%r0,%r3, %r4-%r11,%ip,%pc}
        .cfi_restore_state

.ltorg
.Ldec_256:
        beq .Ldec_192

        firstdecround 13, RA, RB, RC, RD, RNA, RNB, RNC, RND
        decround 12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        decround 11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        decround 10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        decround 9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key

        b .Ldec_tail

.ltorg
.Ldec_192:
        firstdecround 11, RA, RB, RC, RD, RNA, RNB, RNC, RND
        decround 10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        decround 9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key

        b .Ldec_tail
        .cfi_endproc
EPILOGUE(_nettle_aes_decrypt)
C arm/v6/aes-encrypt-internal.asm

ifelse(<
rijndael-arm.S  -  ARM assembly implementation of AES cipher

Copyright (C) 2013 Jussi Kivilinna <[email protected]>

This file is part of Libgcrypt.

Libgcrypt is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.

Libgcrypt is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this program; if not, see <http://www.gnu.org/licenses/>.
>)

.text
.arch armv6
.syntax unified
.arm

C register macros
define(<PARAM_ROUNDS>, <r0>)
define(<PARAM_LENGTH>, <r3>)

define(<FRAME_ROUNDS>,<[sp, #0]>)
define(<FRAME_LENGTH>,<[sp, #4]>)
define(<FRAME_DST>,<[sp, #(48+0)]>)
define(<FRAME_SRC>,<[sp, #(48+4)]>)

define(<CTX>, <%r1>)
define(<RTAB>, <%r2>)
define(<RMASK>, <%ip>)

define(<RA>, <%r4>)
define(<RB>, <%r5>)
define(<RC>, <%r6>)
define(<RD>, <%r7>)

define(<RNA>, <%r8>)
define(<RNB>, <%r9>)
define(<RNC>, <%r10>)
define(<RND>, <%r11>)

define(<RT0>, <%r0>)
define(<RT1>, <%r3>)
define(<RT2>, <%lr>)

C helper macros
.macro ldr_unaligned_le rout rsrc offs rtmp
        ldrb \rout, [\rsrc, #((\offs) + 0)]
        ldrb \rtmp, [\rsrc, #((\offs) + 1)]
        orr \rout, \rout, \rtmp, lsl #8
        ldrb \rtmp, [\rsrc, #((\offs) + 2)]
        orr \rout, \rout, \rtmp, lsl #16
        ldrb \rtmp, [\rsrc, #((\offs) + 3)]
        orr \rout, \rout, \rtmp, lsl #24
.endm

.macro str_unaligned_le rin rdst offs rtmp0 rtmp1
        mov \rtmp0, \rin, lsr #8
        strb \rin, [\rdst, #((\offs) + 0)]
        mov \rtmp1, \rin, lsr #16
        strb \rtmp0, [\rdst, #((\offs) + 1)]
        mov \rtmp0, \rin, lsr #24
        strb \rtmp1, [\rdst, #((\offs) + 2)]
        strb \rtmp0, [\rdst, #((\offs) + 3)]
.endm

C ***********************************************************************
C ARM assembly implementation of the AES cipher
C ***********************************************************************

.macro preload_first_key round ra
        ldr \ra, [CTX, #(((\round) * 16) + 0 * 4)]
.endm
.macro dummy round ra
.endm
.macro addroundkey ra rb rc rd rna rnb rnc rnd preload_key
        ldm CTX, {\rna, \rnb, \rnc, \rnd}
        eor \ra, \rna
        eor \rb, \rnb
        eor \rc, \rnc
        \preload_key 1, \rna
        eor \rd, \rnd
.endm

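C do_encround is the encryption counterpart of do_decround in
C aes-decrypt-internal.asm; the column mixing runs in the opposite
C direction (ra feeds rna/rnd/rnc/rnb), matching ShiftRows rather
C than InvShiftRows.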
.macro do_encround next_r ra rb rc rd rna rnb rnc rnd preload_key
        ldr \rnb, [CTX, #(((\next_r) * 16) + 1 * 4)]

        and RT0, RMASK, \ra, lsl#2
        ldr \rnc, [CTX, #(((\next_r) * 16) + 2 * 4)]
        and RT1, RMASK, \ra, lsr#(8 - 2)
        ldr \rnd, [CTX, #(((\next_r) * 16) + 3 * 4)]
        and RT2, RMASK, \ra, lsr#(16 - 2)
        ldr RT0, [RTAB, RT0]
        and \ra,  RMASK, \ra, lsr#(24 - 2)

        ldr RT1, [RTAB, RT1]
        eor \rna, \rna, RT0
        ldr RT2, [RTAB, RT2]
        and RT0, RMASK, \rd, lsl#2
        ldr \ra,  [RTAB, \ra]

        eor \rnd, \rnd, RT1, ror #24
        and RT1, RMASK, \rd, lsr#(8 - 2)
        eor \rnc, \rnc, RT2, ror #16
        and RT2, RMASK, \rd, lsr#(16 - 2)
        eor \rnb, \rnb, \ra, ror #8
        ldr RT0, [RTAB, RT0]
        and \rd,  RMASK, \rd, lsr#(24 - 2)

        ldr RT1, [RTAB, RT1]
        eor \rnd, \rnd, RT0
        ldr RT2, [RTAB, RT2]
        and RT0, RMASK, \rc, lsl#2
        ldr \rd,  [RTAB, \rd]

        eor \rnc, \rnc, RT1, ror #24
        and RT1, RMASK, \rc, lsr#(8 - 2)
        eor \rnb, \rnb, RT2, ror #16
        and RT2, RMASK, \rc, lsr#(16 - 2)
        eor \rna, \rna, \rd, ror #8
        ldr RT0, [RTAB, RT0]
        and \rc,  RMASK, \rc, lsr#(24 - 2)

        ldr RT1, [RTAB, RT1]
        eor \rnc, \rnc, RT0
        ldr RT2, [RTAB, RT2]
        and RT0, RMASK, \rb, lsl#2
        ldr \rc,  [RTAB, \rc]

        eor \rnb, \rnb, RT1, ror #24
        and RT1, RMASK, \rb, lsr#(8 - 2)
        eor \rna, \rna, RT2, ror #16
        and RT2, RMASK, \rb, lsr#(16 - 2)
        eor \rnd, \rnd, \rc, ror #8
        ldr RT0, [RTAB, RT0]
        and \rb,  RMASK, \rb, lsr#(24 - 2)

        ldr RT1, [RTAB, RT1]
        eor \rnb, \rnb, RT0
        ldr RT2, [RTAB, RT2]
        eor \rna, \rna, RT1, ror #24
        ldr \rb,  [RTAB, \rb]

        eor \rnd, \rnd, RT2, ror #16
        \preload_key (\next_r) + 1, \ra
        eor \rnc, \rnc, \rb, ror #8
.endm

.macro do_lastencround ra rb rc rd rna rnb rnc rnd
        and RT0, RMASK, \ra, lsl#2
        and RT1, RMASK, \ra, lsr#(8 - 2)
        and RT2, RMASK, \ra, lsr#(16 - 2)
        ldrb \rna, [RTAB, RT0]
        and \ra,  RMASK, \ra, lsr#(24 - 2)
        ldrb \rnd, [RTAB, RT1]
        and RT0, RMASK, \rd, lsl#2
        ldrb \rnc, [RTAB, RT2]
        mov \rnd, \rnd, ror #24
        ldrb \rnb, [RTAB, \ra]
        and RT1, RMASK, \rd, lsr#(8 - 2)
        mov \rnc, \rnc, ror #16
        and RT2, RMASK, \rd, lsr#(16 - 2)
        mov \rnb, \rnb, ror #8
        ldrb RT0, [RTAB, RT0]
        and \rd,  RMASK, \rd, lsr#(24 - 2)
        ldrb RT1, [RTAB, RT1]

        orr \rnd, \rnd, RT0
        ldrb RT2, [RTAB, RT2]
        and RT0, RMASK, \rc, lsl#2
        ldrb \rd,  [RTAB, \rd]
        orr \rnc, \rnc, RT1, ror #24
        and RT1, RMASK, \rc, lsr#(8 - 2)
        orr \rnb, \rnb, RT2, ror #16
        and RT2, RMASK, \rc, lsr#(16 - 2)
        orr \rna, \rna, \rd, ror #8
        ldrb RT0, [RTAB, RT0]
        and \rc,  RMASK, \rc, lsr#(24 - 2)
        ldrb RT1, [RTAB, RT1]

        orr \rnc, \rnc, RT0
        ldrb RT2, [RTAB, RT2]
        and RT0, RMASK, \rb, lsl#2
        ldrb \rc,  [RTAB, \rc]
        orr \rnb, \rnb, RT1, ror #24
        and RT1, RMASK, \rb, lsr#(8 - 2)
        orr \rna, \rna, RT2, ror #16
        ldrb RT0, [RTAB, RT0]
        and RT2, RMASK, \rb, lsr#(16 - 2)
        ldrb RT1, [RTAB, RT1]
        orr \rnd, \rnd, \rc, ror #8
        ldrb RT2, [RTAB, RT2]
        and \rb,  RMASK, \rb, lsr#(24 - 2)
        ldrb \rb,  [RTAB, \rb]

        orr \rnb, \rnb, RT0
        orr \rna, \rna, RT1, ror #24
        orr \rnd, \rnd, RT2, ror #16
        orr \rnc, \rnc, \rb, ror #8
.endm

.macro firstencround round ra rb rc rd rna rnb rnc rnd
        addroundkey \ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd,preload_first_key
        do_encround (\round) + 1,\ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd,preload_first_key
.endm

.macro encround round ra rb rc rd rna rnb rnc rnd preload_key
        do_encround (\round) + 1,\ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd,\preload_key
.endm

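C lastencround advances CTX to the last round key and offsets RTAB by
C one byte, so that the word-scaled ldrb lookups pick the plain S-box
C byte out of each 32-bit table entry.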
.macro lastencround round ra rb rc rd rna rnb rnc rnd
        add CTX, #(((\round) + 1) * 16)
        add RTAB, #1
        do_lastencround \ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd
        addroundkey \rna,\rnb,\rnc,\rnd,\ra,\rb,\rc,\rd,dummy
        sub CTX, #(((\round) + 1) * 16)
        sub RTAB, #1
.endm

        C _aes_encrypt(unsigned rounds, const uint32_t *keys,
        C              const struct aes_table *T,
        C              size_t length, uint8_t *dst,
        C              uint8_t *src)
        C r0 rounds
        C r1 ctx
        C r2 table
        C r3 length
        C [sp, #0] dst
        C [sp, #4] src

PROLOGUE(_nettle_aes_encrypt)
        .cfi_startproc
        teq     PARAM_LENGTH, #0
        bxeq    lr

        push {r0,r3,%r4-%r11, %ip, %lr}
        .cfi_adjust_cfa_offset 48
        .cfi_rel_offset r0, 0   C PARAM_ROUNDS
        .cfi_rel_offset r3, 4   C PARAM_LENGTH
        .cfi_rel_offset r4, 8
        .cfi_rel_offset r5, 12
        .cfi_rel_offset r6, 16
        .cfi_rel_offset r7, 20
        .cfi_rel_offset r8, 24
        .cfi_rel_offset r9, 28
        .cfi_rel_offset r10, 32
        .cfi_rel_offset r11, 36
        .cfi_rel_offset ip, 40
        .cfi_rel_offset lr, 44
        add     RTAB, RTAB, #AES_TABLE0
        C read input block
.Lblock_loop:
        ldr     RT0, FRAME_SRC

ifelse(V6,V6,<
        ldr     RA, [RT0]
        ldr     RB, [RT0, #4]
        ldr     RC, [RT0, #8]
        ldr     RD, [RT0, #12]
  IF_BE(<
        rev     RA, RA
        rev     RB, RB
        rev     RC, RC
        rev     RD, RD
  >)
>,<
  IF_LE(<
        C test if src is unaligned
        tst     RT0, #3
        beq     1f
  >)
        C unaligned load
        ldr_unaligned_le RA, RT0, 0, RNA
        ldr_unaligned_le RB, RT0, 4, RNB
        ldr_unaligned_le RC, RT0, 8, RNA
        ldr_unaligned_le RD, RT0, 12, RNB
  IF_LE(<
        b       2f
.ltorg
1:
        C aligned load
        ldm     RT0, {RA, RB, RC, RD}
2:
  >)
>)
        add     RT0, RT0, #16

        mov     RMASK, #0xff
        str     RT0, FRAME_SRC
        mov     RMASK, RMASK, lsl#2     C byte mask, pre-shifted for 4-byte table entries

        firstencround 0, RA, RB, RC, RD, RNA, RNB, RNC, RND
        encround 1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        encround 2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        encround 3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        encround 4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        encround 5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        encround 6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        encround 7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key

        ldr     RT0, FRAME_ROUNDS
        cmp     RT0, #12
        bge     .Lenc_not_128

        encround 8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy
        lastencround 9, RNA, RNB, RNC, RND, RA, RB, RC, RD

.Lenc_done:
        ldr     RT0, FRAME_DST
        ldr     RT1, FRAME_LENGTH

        C store output block
ifelse(V6,V6,<
  IF_BE(<
        rev     RA, RA
        rev     RB, RB
        rev     RC, RC
        rev     RD, RD
  >)
        str RA, [RT0]
        str RB, [RT0, #4]
        str RC, [RT0, #8]
        str RD, [RT0, #12]
>,<
  IF_LE(<
        C test if dst is unaligned
        tst     RT0, #3
        beq     1f
  >)

        C unaligned store
        str_unaligned_le RA, RT0,  0, RNA, RNB
        str_unaligned_le RB, RT0,  4, RNA, RNB
        str_unaligned_le RC, RT0,  8, RNA, RNB
        str_unaligned_le RD, RT0, 12, RNA, RNB
  IF_LE(<
        b       2f
.ltorg
1:
        C aligned store
        C write output block
        stm     RT0, {RA, RB, RC, RD}
2:
  >)
>)
        add     RT0, RT0, #16
        subs    RT1, RT1, #16
        str     RT0, FRAME_DST
        str     RT1, FRAME_LENGTH
        bhi     .Lblock_loop
        .cfi_remember_state
        pop {%r0,%r3, %r4-%r11,%ip,%pc}
        .cfi_restore_state

.ltorg
.Lenc_not_128:
        beq .Lenc_192

        encround 8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        encround 9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        encround 10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        encround 11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        encround 12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy
        lastencround 13, RNA, RNB, RNC, RND, RA, RB, RC, RD

        b .Lenc_done

.ltorg
.Lenc_192:
        encround 8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
        encround 9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
        encround 10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy
        lastencround 11, RNA, RNB, RNC, RND, RA, RB, RC, RD

        b .Lenc_done
        .cfi_endproc
EPILOGUE(_nettle_aes_encrypt)