On a Raspberry Pi 3B+ (Cortex-A53 @ 1.4 GHz):

Before:
aes128  | nanosecs/byte  mebibytes/sec  cycles/byte
ECB enc |   39.58 ns/B     24.10 MiB/s       - c/B
ECB dec |   39.57 ns/B     24.10 MiB/s       - c/B

After:
ECB enc |   15.24 ns/B     62.57 MiB/s       - c/B
ECB dec |   15.68 ns/B     60.80 MiB/s       - c/B
Passes the Nettle regression tests (little-endian only, though).
Like AES_SMALL, it does not use pre-rotated tables, which reduces the
d-cache footprint from 4.25K to 1K (enc) / 1.25K (dec). The rounds are
completely unrolled, which increases the i-cache footprint from 948b
to 4416b (enc) / 4032b (dec).
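To illustrate the tradeoff, here is a rough C sketch (my reading of a
generic T-table layout, not Nettle's actual code): with pre-rotated
tables each byte position indexes its own 1K table, while here a single
256-entry word table is reused for all four positions and the rotations
are applied explicitly; on ARM those rotations are free, since they ride
on the second operand of the eor instructions, as in the assembly below.

  #include <stdint.h>

  /* Rotate right; n must be in 1..31 here (we use 8, 16, 24). */
  static inline uint32_t
  ror32(uint32_t x, unsigned n)
  {
    return (x >> n) | (x << (32 - n));
  }

  /* One round-table column, single-table style: T is a 256-entry
     word table (1K) reused for all four byte positions; the rotations
     that four pre-rotated tables would bake in are done explicitly. */
  static uint32_t
  aes_column(const uint32_t T[256],
             uint32_t a, uint32_t b, uint32_t c, uint32_t d)
  {
    return T[a & 0xff]
      ^ ror32(T[(b >> 8) & 0xff], 24)
      ^ ror32(T[(c >> 16) & 0xff], 16)
      ^ ror32(T[d >> 24], 8);
  }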
Since it completely replaces the current implementation, I've just
attached the new files (I'll post the final version as a patch).
P.S. Yes, I tried converting the macros to m4: complete failure (no
named parameters, problems with more than 9 arguments, weird expansion
rules), so I fell back to good ol' gas macros. Sorry.
P.P.S. With this change, plus gcm/neon and the (to-be-published)
chacha_blocks/neon, gnutls-cli --benchmark-ciphers gives:
Before:
Checking cipher-MAC combinations, payload size: 16384
AES-128-GCM 13.56 MB/sec
CHACHA20-POLY1305 68.26 MB/sec
AES-128-CBC-SHA1 16.72 MB/sec
AES-128-CBC-SHA256 15.07 MB/sec
After:
AES-128-GCM 35.32 MB/sec
CHACHA20-POLY1305 94.94 MB/sec
AES-128-CBC-SHA1 27.53 MB/sec
AES-128-CBC-SHA256 23.30 MB/sec
C arm/v6/aes-decrypt-internal.asm
ifelse(<
rijndael-arm.S - ARM assembly implementation of AES cipher
Copyright (C) 2013 Jussi Kivilinna <[email protected]>
This file is part of Libgcrypt.
Libgcrypt is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
Libgcrypt is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this program; if not, see <http://www.gnu.org/licenses/>.
>)
.text
.file "aes-decrypt-internal.asm"
.arch armv6
.syntax unified
.arm
define(<KEYSCHEDULE_REVERSED>,<yes>)
define(<IF_KEYSCHEDULE_REVERSED>,<ifelse(
KEYSCHEDULE_REVERSED,yes,<$1>,
KEYSCHEDULE_REVERSED,no,<$2>)>)
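C With a reversed key schedule the round keys are simply consumed in
C order with post-increment loads; otherwise each load addresses its
C round key by explicit offset from CTX.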
C register macros
define(<PARAM_ROUNDS>, <r0>)
define(<PARAM_LENGTH>, <r3>)
define(<FRAME_ROUNDS>,<[sp, #0]>)
define(<FRAME_LENGTH>,<[sp, #4]>)
define(<FRAME_DST>,<[sp, #(48+0)]>)
define(<FRAME_SRC>,<[sp, #(48+4)]>)
define(<CTX>, <%r1>)
define(<RTAB>, <%r2>)
define(<RMASK>, <%ip>)
define(<RA>, <%r4>)
define(<RB>, <%r5>)
define(<RC>, <%r6>)
define(<RD>, <%r7>)
define(<RNA>, <%r8>)
define(<RNB>, <%r9>)
define(<RNC>, <%r10>)
define(<RND>, <%r11>)
define(<RT0>, <%r0>)
define(<RT1>, <%r3>)
define(<RT2>, <%lr>)
C helper macros
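C Load a little-endian 32-bit word from a possibly unaligned address,
C one byte at a time.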
.macro ldr_unaligned_le rout rsrc offs rtmp
ldrb \rout, [\rsrc, #((\offs) + 0)]
ldrb \rtmp, [\rsrc, #((\offs) + 1)]
orr \rout, \rout, \rtmp, lsl #8
ldrb \rtmp, [\rsrc, #((\offs) + 2)]
orr \rout, \rout, \rtmp, lsl #16
ldrb \rtmp, [\rsrc, #((\offs) + 3)]
orr \rout, \rout, \rtmp, lsl #24
.endm
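C Store a 32-bit word to a possibly unaligned address, little-endian,
C one byte at a time.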
.macro str_unaligned_le rin rdst offs rtmp0 rtmp1
mov \rtmp0, \rin, lsr #8
strb \rin, [\rdst, #((\offs) + 0)]
mov \rtmp1, \rin, lsr #16
strb \rtmp0, [\rdst, #((\offs) + 1)]
mov \rtmp0, \rin, lsr #24
strb \rtmp1, [\rdst, #((\offs) + 2)]
strb \rtmp0, [\rdst, #((\offs) + 3)]
.endm
C ***********************************************************************
C ARM assembly implementation of the AES cipher
C ***********************************************************************
.macro preload_first_key round ra
IF_KEYSCHEDULE_REVERSED(<
ldr \ra, [CTX], #+4
>,<
ldr \ra, [CTX, #(((\round) * 16) + 0 * 4)]
>)
.endm
.macro dummy round ra
.endm
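C Xor the four round-key words into the state; the preload_key hook
C optionally starts fetching the first key word of the following round.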
.macro addroundkey ra rb rc rd rna rnb rnc rnd preload_key
IF_KEYSCHEDULE_REVERSED(<
ldm CTX!, {\rna, \rnb, \rnc, \rnd}
>,<
ldm CTX, {\rna, \rnb, \rnc, \rnd}
>)
eor \ra, \rna
eor \rb, \rnb
eor \rc, \rnc
\preload_key 1, \rna
eor \rd, \rnd
.endm
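C Same as addroundkey, but with the round key addressed by round
C number when the key schedule is not stored reversed.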
.macro addroundkey_dec round ra rb rc rd rna rnb rnc rnd
IF_KEYSCHEDULE_REVERSED(<
addroundkey \ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd,preload_first_key
>,<
ldr \rna, [CTX, #(((\round) * 16) + 0 * 4)]
ldr \rnb, [CTX, #(((\round) * 16) + 1 * 4)]
eor \ra, \rna
ldr \rnc, [CTX, #(((\round) * 16) + 2 * 4)]
eor \rb, \rnb
ldr \rnd, [CTX, #(((\round) * 16) + 3 * 4)]
eor \rc, \rnc
preload_first_key (\round) - 1, \rna
eor \rd, \rnd
>)
.endm
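C One full table-driven decryption round. RMASK is 0xff << 2, so each
C masked state byte (hence the lsl#2 / lsr#(n - 2) shifts) directly
C indexes the 32-bit table entries, and the per-column rotations are
C folded into the second operand of the eor instructions.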
.macro do_decround next_r ra rb rc rd rna rnb rnc rnd preload_key
IF_KEYSCHEDULE_REVERSED(<
ldr \rnb, [CTX], #+4
>,<
ldr \rnb, [CTX, #(((\next_r) * 16) + 1 * 4)]
>)
and RT0, RMASK, \ra, lsl#2
IF_KEYSCHEDULE_REVERSED(<
ldr \rnc, [CTX], #+4
>,<
ldr \rnc, [CTX, #(((\next_r) * 16) + 2 * 4)]
>)
and RT1, RMASK, \ra, lsr#(8 - 2)
IF_KEYSCHEDULE_REVERSED(<
ldr \rnd, [CTX], #+4
>,<
ldr \rnd, [CTX, #(((\next_r) * 16) + 3 * 4)]
>)
and RT2, RMASK, \ra, lsr#(16 - 2)
ldr RT0, [RTAB, RT0]
and \ra, RMASK, \ra, lsr#(24 - 2)
ldr RT1, [RTAB, RT1]
eor \rna, \rna, RT0
ldr RT2, [RTAB, RT2]
and RT0, RMASK, \rb, lsl#2
ldr \ra, [RTAB, \ra]
eor \rnb, \rnb, RT1, ror #24
and RT1, RMASK, \rb, lsr#(8 - 2)
eor \rnc, \rnc, RT2, ror #16
and RT2, RMASK, \rb, lsr#(16 - 2)
eor \rnd, \rnd, \ra, ror #8
ldr RT0, [RTAB, RT0]
and \rb, RMASK, \rb, lsr#(24 - 2)
ldr RT1, [RTAB, RT1]
eor \rnb, \rnb, RT0
ldr RT2, [RTAB, RT2]
and RT0, RMASK, \rc, lsl#2
ldr \rb, [RTAB, \rb]
eor \rnc, \rnc, RT1, ror #24
and RT1, RMASK, \rc, lsr#(8 - 2)
eor \rnd, \rnd, RT2, ror #16
and RT2, RMASK, \rc, lsr#(16 - 2)
eor \rna, \rna, \rb, ror #8
ldr RT0, [RTAB, RT0]
and \rc, RMASK, \rc, lsr#(24 - 2)
ldr RT1, [RTAB, RT1]
eor \rnc, \rnc, RT0
ldr RT2, [RTAB, RT2]
and RT0, RMASK, \rd, lsl#2
ldr \rc, [RTAB, \rc]
eor \rnd, \rnd, RT1, ror #24
and RT1, RMASK, \rd, lsr#(8 - 2)
eor \rna, \rna, RT2, ror #16
and RT2, RMASK, \rd, lsr#(16 - 2)
eor \rnb, \rnb, \rc, ror #8
ldr RT0, [RTAB, RT0]
and \rd, RMASK, \rd, lsr#(24 - 2)
ldr RT1, [RTAB, RT1]
eor \rnd, \rnd, RT0
ldr RT2, [RTAB, RT2]
eor \rna, \rna, RT1, ror #24
ldr \rd, [RTAB, \rd]
eor \rnb, \rnb, RT2, ror #16
\preload_key (\next_r) - 1, \ra
eor \rnc, \rnc, \rd, ror #8
.endm
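C Final decryption round: RMASK has been reset to plain 0xff and
C lastdecround rebases RTAB onto the byte-wide inverse sbox, so these
C are single-byte lookups, recombined with orr.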
.macro do_lastdecround ra rb rc rd rna rnb rnc rnd
and RT0, RMASK, \ra
and RT1, RMASK, \ra, lsr#8
and RT2, RMASK, \ra, lsr#16
ldrb \rna, [RTAB, RT0]
mov \ra, \ra, lsr#24
ldrb \rnb, [RTAB, RT1]
and RT0, RMASK, \rb
ldrb \rnc, [RTAB, RT2]
mov \rnb, \rnb, ror #24
ldrb \rnd, [RTAB, \ra]
and RT1, RMASK, \rb, lsr#8
mov \rnc, \rnc, ror #16
and RT2, RMASK, \rb, lsr#16
mov \rnd, \rnd, ror #8
ldrb RT0, [RTAB, RT0]
mov \rb, \rb, lsr#24
ldrb RT1, [RTAB, RT1]
orr \rnb, \rnb, RT0
ldrb RT2, [RTAB, RT2]
and RT0, RMASK, \rc
ldrb \rb, [RTAB, \rb]
orr \rnc, \rnc, RT1, ror #24
and RT1, RMASK, \rc, lsr#8
orr \rnd, \rnd, RT2, ror #16
and RT2, RMASK, \rc, lsr#16
orr \rna, \rna, \rb, ror #8
ldrb RT0, [RTAB, RT0]
mov \rc, \rc, lsr#24
ldrb RT1, [RTAB, RT1]
orr \rnc, \rnc, RT0
ldrb RT2, [RTAB, RT2]
and RT0, RMASK, \rd
ldrb \rc, [RTAB, \rc]
orr \rnd, \rnd, RT1, ror #24
and RT1, RMASK, \rd, lsr#8
orr \rna, \rna, RT2, ror #16
ldrb RT0, [RTAB, RT0]
and RT2, RMASK, \rd, lsr#16
ldrb RT1, [RTAB, RT1]
orr \rnb, \rnb, \rc, ror #8
ldrb RT2, [RTAB, RT2]
mov \rd, \rd, lsr#24
ldrb \rd, [RTAB, \rd]
orr \rnd, \rnd, RT0
orr \rna, \rna, RT1, ror #24
orr \rnb, \rnb, RT2, ror #16
orr \rnc, \rnc, \rd, ror #8
.endm
.macro firstdecround round ra rb rc rd rna rnb rnc rnd
addroundkey_dec ((\round) + 1), \ra, \rb, \rc, \rd, \rna, \rnb, \rnc, \rnd
do_decround \round, \ra, \rb, \rc, \rd, \rna, \rnb, \rnc, \rnd, preload_first_key
.endm
.macro decround round ra rb rc rd rna rnb rnc rnd preload_key
do_decround \round, \ra, \rb, \rc, \rd, \rna, \rnb, \rnc, \rnd, \preload_key
.endm
.macro set_last_round_rmask _ __
mov RMASK, #0xff
.endm
.macro lastdecround round ra rb rc rd rna rnb rnc rnd
sub RTAB, #AES_TABLE0
do_lastdecround \ra, \rb, \rc, \rd, \rna, \rnb, \rnc, \rnd
addroundkey \rna, \rnb, \rnc, \rnd, \ra, \rb, \rc, \rd, dummy
add RTAB, #AES_TABLE0
.endm
C _aes_decrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
C r0 rounds
C r1 ctx
C r2 table
C r3 length
C [sp, #0] dst
C [sp, #4] src
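C rounds and length are pushed together with the saved registers (48
C bytes in total), making them reachable as FRAME_ROUNDS/FRAME_LENGTH,
C while dst and src stay in the caller's argument area at sp+48.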
PROLOGUE(_nettle_aes_decrypt)
.cfi_startproc
teq PARAM_LENGTH, #0
bxeq lr
push {r0,r3,%r4-%r11, %ip, %lr}
.cfi_adjust_cfa_offset 48
.cfi_rel_offset r0, 0 C PARAM_ROUNDS
.cfi_rel_offset r3, 4 C PARAM_LENGTH
.cfi_rel_offset r4, 8
.cfi_rel_offset r5, 12
.cfi_rel_offset r6, 16
.cfi_rel_offset r7, 20
.cfi_rel_offset r8, 24
.cfi_rel_offset r9, 28
.cfi_rel_offset r10, 32
.cfi_rel_offset r11, 36
.cfi_rel_offset ip, 40
.cfi_rel_offset lr, 44
add RTAB, RTAB, #AES_TABLE0
C read input block
.Lblock_loop:
ldr RT0, FRAME_SRC
ifelse(V6,V6,<
ldr RA, [RT0]
ldr RB, [RT0, #4]
ldr RC, [RT0, #8]
ldr RD, [RT0, #12]
IF_BE(<
rev RA, RA
rev RB, RB
rev RC, RC
rev RD, RD
>)
>,<
IF_LE(<
C test if src is unaligned
tst RT0, #3
beq 1f
>)
C unaligned load
ldr_unaligned_le RA, RT0, 0, RNA
ldr_unaligned_le RB, RT0, 4, RNB
ldr_unaligned_le RC, RT0, 8, RNA
ldr_unaligned_le RD, RT0, 12, RNB
IF_LE(<
b 2f
.ltorg
1:
C aligned load
ldm RT0, {RA, RB, RC, RD}
2:
>)
>)
add RT0, RT0, #16
mov RMASK, #0xff
str RT0, FRAME_SRC
ldr RT1, FRAME_ROUNDS
mov RMASK, RMASK, lsl#2; C byte mask
cmp RT1, #12
bge .Ldec_256
firstdecround 9, RA, RB, RC, RD, RNA, RNB, RNC, RND
.Ldec_tail:
decround 8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
decround 7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
decround 6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
decround 5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
decround 4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
decround 3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
decround 2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
decround 1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask
lastdecround 0, RNA, RNB, RNC, RND, RA, RB, RC, RD
IF_KEYSCHEDULE_REVERSED(<
ldr RT1, FRAME_ROUNDS
>)
ldr RT0, FRAME_DST
IF_KEYSCHEDULE_REVERSED(<
add RT1, 1
>)
ldr RT2, FRAME_LENGTH
IF_KEYSCHEDULE_REVERSED(<
sub CTX, CTX, RT1, lsl#4
>)
C store output block
ifelse(V6,V6,<
IF_BE(<
rev RA, RA
rev RB, RB
rev RC, RC
rev RD, RD
>)
str RA, [RT0]
str RB, [RT0, #4]
str RC, [RT0, #8]
str RD, [RT0, #12]
>,<
IF_LE(<
C test if dst is unaligned
tst RT0, #3
beq 1f
>)
C unaligned store
str_unaligned_le RA, RT0, 0, RNA, RNB
str_unaligned_le RB, RT0, 4, RNA, RNB
str_unaligned_le RC, RT0, 8, RNA, RNB
str_unaligned_le RD, RT0, 12, RNA, RNB
IF_LE(<
b 2f
.ltorg
1:
C aligned store
C write output block
stm RT0, {RA, RB, RC, RD}
2:
>)
>)
add RT0, RT0, #16
subs RT2, RT2, #16
str RT0, FRAME_DST
str RT2, FRAME_LENGTH
bhi .Lblock_loop
.cfi_remember_state
pop {%r0,%r3, %r4-%r11,%ip,%pc}
.cfi_restore_state
.ltorg
.Ldec_256:
beq .Ldec_192
firstdecround 13, RA, RB, RC, RD, RNA, RNB, RNC, RND
decround 12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
decround 11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
decround 10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
decround 9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
b .Ldec_tail
.ltorg
.Ldec_192:
firstdecround 11, RA, RB, RC, RD, RNA, RNB, RNC, RND
decround 10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
decround 9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
b .Ldec_tail
.cfi_endproc
EPILOGUE(_nettle_aes_decrypt)
C arm/v6/aes-encrypt-internal.asm
ifelse(<
rijndael-arm.S - ARM assembly implementation of AES cipher
Copyright (C) 2013 Jussi Kivilinna <[email protected]>
This file is part of Libgcrypt.
Libgcrypt is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
Libgcrypt is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this program; if not, see <http://www.gnu.org/licenses/>.
>)
.text
.arch armv6
.syntax unified
.arm
C register macros
define(<PARAM_ROUNDS>, <r0>)
define(<PARAM_LENGTH>, <r3>)
define(<FRAME_ROUNDS>,<[sp, #0]>)
define(<FRAME_LENGTH>,<[sp, #4]>)
define(<FRAME_DST>,<[sp, #(48+0)]>)
define(<FRAME_SRC>,<[sp, #(48+4)]>)
define(<CTX>, <%r1>)
define(<RTAB>, <%r2>)
define(<RMASK>, <%ip>)
define(<RA>, <%r4>)
define(<RB>, <%r5>)
define(<RC>, <%r6>)
define(<RD>, <%r7>)
define(<RNA>, <%r8>)
define(<RNB>, <%r9>)
define(<RNC>, <%r10>)
define(<RND>, <%r11>)
define(<RT0>, <%r0>)
define(<RT1>, <%r3>)
define(<RT2>, <%lr>)
C helper macros
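C Same byte-wise unaligned load/store helpers as in
C aes-decrypt-internal.asm.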
.macro ldr_unaligned_le rout rsrc offs rtmp
ldrb \rout, [\rsrc, #((\offs) + 0)]
ldrb \rtmp, [\rsrc, #((\offs) + 1)]
orr \rout, \rout, \rtmp, lsl #8
ldrb \rtmp, [\rsrc, #((\offs) + 2)]
orr \rout, \rout, \rtmp, lsl #16
ldrb \rtmp, [\rsrc, #((\offs) + 3)]
orr \rout, \rout, \rtmp, lsl #24
.endm
.macro str_unaligned_le rin rdst offs rtmp0 rtmp1
mov \rtmp0, \rin, lsr #8
strb \rin, [\rdst, #((\offs) + 0)]
mov \rtmp1, \rin, lsr #16
strb \rtmp0, [\rdst, #((\offs) + 1)]
mov \rtmp0, \rin, lsr #24
strb \rtmp1, [\rdst, #((\offs) + 2)]
strb \rtmp0, [\rdst, #((\offs) + 3)]
.endm
C ***********************************************************************
C ARM assembly implementation of the AES cipher
C ***********************************************************************
.macro preload_first_key round ra
ldr \ra, [CTX, #(((\round) * 16) + 0 * 4)]
.endm
.macro dummy round ra
.endm
.macro addroundkey ra rb rc rd rna rnb rnc rnd preload_key
ldm CTX, {\rna, \rnb, \rnc, \rnd}
eor \ra, \rna
eor \rb, \rnb
eor \rc, \rnc
\preload_key 1, \rna
eor \rd, \rnd
.endm
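C One full table-driven encryption round; same structure as the
C decryption rounds, with the byte-to-column scatter going in the
C opposite direction (ShiftRows instead of InvShiftRows).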
.macro do_encround next_r ra rb rc rd rna rnb rnc rnd preload_key
ldr \rnb, [CTX, #(((\next_r) * 16) + 1 * 4)]
and RT0, RMASK, \ra, lsl#2
ldr \rnc, [CTX, #(((\next_r) * 16) + 2 * 4)]
and RT1, RMASK, \ra, lsr#(8 - 2)
ldr \rnd, [CTX, #(((\next_r) * 16) + 3 * 4)]
and RT2, RMASK, \ra, lsr#(16 - 2)
ldr RT0, [RTAB, RT0]
and \ra, RMASK, \ra, lsr#(24 - 2)
ldr RT1, [RTAB, RT1]
eor \rna, \rna, RT0
ldr RT2, [RTAB, RT2]
and RT0, RMASK, \rd, lsl#2
ldr \ra, [RTAB, \ra]
eor \rnd, \rnd, RT1, ror #24
and RT1, RMASK, \rd, lsr#(8 - 2)
eor \rnc, \rnc, RT2, ror #16
and RT2, RMASK, \rd, lsr#(16 - 2)
eor \rnb, \rnb, \ra, ror #8
ldr RT0, [RTAB, RT0]
and \rd, RMASK, \rd, lsr#(24 - 2)
ldr RT1, [RTAB, RT1]
eor \rnd, \rnd, RT0
ldr RT2, [RTAB, RT2]
and RT0, RMASK, \rc, lsl#2
ldr \rd, [RTAB, \rd]
eor \rnc, \rnc, RT1, ror #24
and RT1, RMASK, \rc, lsr#(8 - 2)
eor \rnb, \rnb, RT2, ror #16
and RT2, RMASK, \rc, lsr#(16 - 2)
eor \rna, \rna, \rd, ror #8
ldr RT0, [RTAB, RT0]
and \rc, RMASK, \rc, lsr#(24 - 2)
ldr RT1, [RTAB, RT1]
eor \rnc, \rnc, RT0
ldr RT2, [RTAB, RT2]
and RT0, RMASK, \rb, lsl#2
ldr \rc, [RTAB, \rc]
eor \rnb, \rnb, RT1, ror #24
and RT1, RMASK, \rb, lsr#(8 - 2)
eor \rna, \rna, RT2, ror #16
and RT2, RMASK, \rb, lsr#(16 - 2)
eor \rnd, \rnd, \rc, ror #8
ldr RT0, [RTAB, RT0]
and \rb, RMASK, \rb, lsr#(24 - 2)
ldr RT1, [RTAB, RT1]
eor \rnb, \rnb, RT0
ldr RT2, [RTAB, RT2]
eor \rna, \rna, RT1, ror #24
ldr \rb, [RTAB, \rb]
eor \rnd, \rnd, RT2, ror #16
\preload_key (\next_r) + 1, \ra
eor \rnc, \rnc, \rb, ror #8
.endm
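C Final encryption round: byte-wide lookups; lastencround biases RTAB
C by 1 so each ldrb picks the plain sbox byte out of the corresponding
C 32-bit table entry.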
.macro do_lastencround ra rb rc rd rna rnb rnc rnd
and RT0, RMASK, \ra, lsl#2
and RT1, RMASK, \ra, lsr#(8 - 2)
and RT2, RMASK, \ra, lsr#(16 - 2)
ldrb \rna, [RTAB, RT0]
and \ra, RMASK, \ra, lsr#(24 - 2)
ldrb \rnd, [RTAB, RT1]
and RT0, RMASK, \rd, lsl#2
ldrb \rnc, [RTAB, RT2]
mov \rnd, \rnd, ror #24
ldrb \rnb, [RTAB, \ra]
and RT1, RMASK, \rd, lsr#(8 - 2)
mov \rnc, \rnc, ror #16
and RT2, RMASK, \rd, lsr#(16 - 2)
mov \rnb, \rnb, ror #8
ldrb RT0, [RTAB, RT0]
and \rd, RMASK, \rd, lsr#(24 - 2)
ldrb RT1, [RTAB, RT1]
orr \rnd, \rnd, RT0
ldrb RT2, [RTAB, RT2]
and RT0, RMASK, \rc, lsl#2
ldrb \rd, [RTAB, \rd]
orr \rnc, \rnc, RT1, ror #24
and RT1, RMASK, \rc, lsr#(8 - 2)
orr \rnb, \rnb, RT2, ror #16
and RT2, RMASK, \rc, lsr#(16 - 2)
orr \rna, \rna, \rd, ror #8
ldrb RT0, [RTAB, RT0]
and \rc, RMASK, \rc, lsr#(24 - 2)
ldrb RT1, [RTAB, RT1]
orr \rnc, \rnc, RT0
ldrb RT2, [RTAB, RT2]
and RT0, RMASK, \rb, lsl#2
ldrb \rc, [RTAB, \rc]
orr \rnb, \rnb, RT1, ror #24
and RT1, RMASK, \rb, lsr#(8 - 2)
orr \rna, \rna, RT2, ror #16
ldrb RT0, [RTAB, RT0]
and RT2, RMASK, \rb, lsr#(16 - 2)
ldrb RT1, [RTAB, RT1]
orr \rnd, \rnd, \rc, ror #8
ldrb RT2, [RTAB, RT2]
and \rb, RMASK, \rb, lsr#(24 - 2)
ldrb \rb, [RTAB, \rb]
orr \rnb, \rnb, RT0
orr \rna, \rna, RT1, ror #24
orr \rnd, \rnd, RT2, ror #16
orr \rnc, \rnc, \rb, ror #8
.endm
.macro firstencround round ra rb rc rd rna rnb rnc rnd
addroundkey \ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd,preload_first_key
do_encround (\round) + 1,\ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd,preload_first_key
.endm
.macro encround round ra rb rc rd rna rnb rnc rnd preload_key
do_encround (\round) + 1,\ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd,\preload_key
.endm
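C Point CTX at the last round key and apply the RTAB sbox-lane bias;
C both adjustments are undone afterwards so the next block starts from
C a clean state.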
.macro lastencround round ra rb rc rd rna rnb rnc rnd
add CTX, #(((\round) + 1) * 16)
add RTAB, #1
do_lastencround \ra,\rb,\rc,\rd,\rna,\rnb,\rnc,\rnd
addroundkey \rna,\rnb,\rnc,\rnd,\ra,\rb,\rc,\rd,dummy
sub CTX, #(((\round) + 1) * 16)
sub RTAB, #1
.endm
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
C r0 rounds
C r1 ctx
C r2 table
C r3 length
C [sp, #0] dst
C [sp, #4] src
PROLOGUE(_nettle_aes_encrypt)
.cfi_startproc
teq PARAM_LENGTH, #0
bxeq lr
push {r0,r3,%r4-%r11, %ip, %lr}
.cfi_adjust_cfa_offset 48
.cfi_rel_offset r0, 0 C PARAM_ROUNDS
.cfi_rel_offset r3, 4 C PARAM_LENGTH
.cfi_rel_offset r4, 8
.cfi_rel_offset r5, 12
.cfi_rel_offset r6, 16
.cfi_rel_offset r7, 20
.cfi_rel_offset r8, 24
.cfi_rel_offset r9, 28
.cfi_rel_offset r10, 32
.cfi_rel_offset r11, 36
.cfi_rel_offset ip, 40
.cfi_rel_offset lr, 44
add RTAB, RTAB, #AES_TABLE0
C read input block
.Lblock_loop:
ldr RT0, FRAME_SRC
ifelse(V6,V6,<
ldr RA, [RT0]
ldr RB, [RT0, #4]
ldr RC, [RT0, #8]
ldr RD, [RT0, #12]
IF_BE(<
rev RA, RA
rev RB, RB
rev RC, RC
rev RD, RD
>)
>,<
IF_LE(<
C test if src is unaligned
tst RT0, #3
beq 1f
>)
C unaligned load
ldr_unaligned_le RA, RT0, 0, RNA
ldr_unaligned_le RB, RT0, 4, RNB
ldr_unaligned_le RC, RT0, 8, RNA
ldr_unaligned_le RD, RT0, 12, RNB
IF_LE(<
b 2f
.ltorg
1:
C aligned load
ldm RT0, {RA, RB, RC, RD}
2:
>)
>)
add RT0, RT0, #16
mov RMASK, #0xff
str RT0, FRAME_SRC
mov RMASK, RMASK, lsl#2; C byte mask
firstencround 0, RA, RB, RC, RD, RNA, RNB, RNC, RND
encround 1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
encround 2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
encround 3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
encround 4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
encround 5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
encround 6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
encround 7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
ldr RT0, FRAME_ROUNDS
cmp RT0, #12
bge .Lenc_not_128
encround 8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy
lastencround 9, RNA, RNB, RNC, RND, RA, RB, RC, RD
.Lenc_done:
ldr RT0, FRAME_DST
ldr RT1, FRAME_LENGTH
C store output block
ifelse(V6,V6,<
IF_BE(<
rev RA, RA
rev RB, RB
rev RC, RC
rev RD, RD
>)
str RA, [RT0]
str RB, [RT0, #4]
str RC, [RT0, #8]
str RD, [RT0, #12]
>,<
IF_LE(<
C test if dst is unaligned
tst RT0, #3
beq 1f
>)
C unaligned store
str_unaligned_le RA, RT0, 0, RNA, RNB
str_unaligned_le RB, RT0, 4, RNA, RNB
str_unaligned_le RC, RT0, 8, RNA, RNB
str_unaligned_le RD, RT0, 12, RNA, RNB
IF_LE(<
b 2f
.ltorg
1:
C aligned store
C write output block
stm RT0, {RA, RB, RC, RD}
2:
>)
>)
add RT0, RT0, #16
subs RT1, RT1, #16
str RT0, FRAME_DST
str RT1, FRAME_LENGTH
bhi .Lblock_loop
.cfi_remember_state
pop {%r0,%r3, %r4-%r11,%ip,%pc}
.cfi_restore_state
.ltorg
.Lenc_not_128:
beq .Lenc_192
encround 8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
encround 9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
encround 10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
encround 11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
encround 12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy
lastencround 13, RNA, RNB, RNC, RND, RA, RB, RC, RD
b .Lenc_done
.ltorg
.Lenc_192:
encround 8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key
encround 9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key
encround 10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy
lastencround 11, RNA, RNB, RNC, RND, RA, RB, RC, RD
b .Lenc_done
.cfi_endproc
EPILOGUE(_nettle_aes_encrypt)