For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
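
(Not part of the patch itself — a minimal C sketch of the final-round idea,
assuming the little-endian crypto_ft_tab layout in which byte 1 of each 32-bit
entry holds the plain S-box output; e.g. the first entry 0xa56363c6 stores
0x63 == S(0) at that offset, which is what the 'crypto_ft_tab + 1' table base
in the diff below relies on. The helper names are hypothetical. For
decryption, the same byte-load pattern indexes the 256-byte inverse S-box
directly, with a stride of 1 instead of 4.)

    #include <stdint.h>

    /*
     * Illustrative only: fetch the plain S-box output for 'x' by doing a
     * byte load from the 32-bit forward table, at byte offset 1 of entry x.
     * In the assembly this is the 'ldrb ..., [tt, ...]' path with tt set to
     * crypto_ft_tab + 1 and the index pre-scaled by 4.
     */
    static inline uint8_t sbox_via_ft_tab(const uint32_t *ft_tab, uint8_t x)
    {
            const uint8_t *ltab = (const uint8_t *)ft_tab + 1;

            return ltab[4 * x];
    }

    /* One 32-bit output column of the final encryption round:
     * SubBytes + ShiftRows + AddRoundKey, no MixColumns. */
    static uint32_t final_round_column(const uint32_t *ft_tab, uint32_t rk,
                                       uint32_t c0, uint32_t c1,
                                       uint32_t c2, uint32_t c3)
    {
            return rk ^
                   sbox_via_ft_tab(ft_tab, c0 & 0xff) ^
                   ((uint32_t)sbox_via_ft_tab(ft_tab, (c1 >> 8) & 0xff) << 8) ^
                   ((uint32_t)sbox_via_ft_tab(ft_tab, (c2 >> 16) & 0xff) << 16) ^
                   ((uint32_t)sbox_via_ft_tab(ft_tab, c3 >> 24) << 24);
    }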

This should significantly reduce the D-cache footprint of our code,
making it more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, and avoiding it therefore improves the
shareability of this code.

Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
---
 arch/arm64/crypto/aes-cipher-core.S | 152 ++++++++++++++------
 1 file changed, 107 insertions(+), 45 deletions(-)

diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
index f2f9cc519309..6d2445d603cc 100644
--- a/arch/arm64/crypto/aes-cipher-core.S
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -10,6 +10,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
        .text
 
@@ -17,94 +18,155 @@
        out             .req    x1
        in              .req    x2
        rounds          .req    x3
-       tt              .req    x4
-       lt              .req    x2
+       tt              .req    x2
 
-       .macro          __pair, enc, reg0, reg1, in0, in1e, in1d, shift
+       .macro          __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift
+       .ifc            \op\shift, b0
+       ubfiz           \reg0, \in0, #2, #8
+       ubfiz           \reg1, \in1e, #2, #8
+       .else
        ubfx            \reg0, \in0, #\shift, #8
-       .if             \enc
        ubfx            \reg1, \in1e, #\shift, #8
-       .else
-       ubfx            \reg1, \in1d, #\shift, #8
        .endif
+
+       /*
+        * AArch64 cannot do byte size indexed loads from a table containing
+        * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
+        * valid instruction. So perform the shift explicitly first for the
+        * high bytes (the low byte is shifted implicitly by using ubfiz rather
+        * than ubfx above)
+        */
+       .ifnc           \op, b
        ldr             \reg0, [tt, \reg0, uxtw #2]
        ldr             \reg1, [tt, \reg1, uxtw #2]
+       .else
+       .if             \shift > 0
+       lsl             \reg0, \reg0, #2
+       lsl             \reg1, \reg1, #2
+       .endif
+       ldrb            \reg0, [tt, \reg0, uxtw]
+       ldrb            \reg1, [tt, \reg1, uxtw]
+       .endif
        .endm
 
-       .macro          __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc
+       .macro          __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift
+       ubfx            \reg0, \in0, #\shift, #8
+       ubfx            \reg1, \in1d, #\shift, #8
+       ldr\op          \reg0, [tt, \reg0, uxtw #\sz]
+       ldr\op          \reg1, [tt, \reg1, uxtw #\sz]
+       .endm
+
+       .macro          __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
        ldp             \out0, \out1, [rk], #8
 
-       __pair          \enc, w13, w14, \in0, \in1, \in3, 0
-       __pair          \enc, w15, w16, \in1, \in2, \in0, 8
-       __pair          \enc, w17, w18, \in2, \in3, \in1, 16
-       __pair          \enc, \t0, \t1, \in3, \in0, \in2, 24
-
-       eor             \out0, \out0, w13
-       eor             \out1, \out1, w14
-       eor             \out0, \out0, w15, ror #24
-       eor             \out1, \out1, w16, ror #24
-       eor             \out0, \out0, w17, ror #16
-       eor             \out1, \out1, w18, ror #16
+       __pair\enc      \sz, \op, w12, w13, \in0, \in1, \in3, 0
+       __pair\enc      \sz, \op, w14, w15, \in1, \in2, \in0, 8
+       __pair\enc      \sz, \op, w16, w17, \in2, \in3, \in1, 16
+       __pair\enc      \sz, \op, \t0, \t1, \in3, \in0, \in2, 24
+
+       eor             \out0, \out0, w12
+       eor             \out1, \out1, w13
+       eor             \out0, \out0, w14, ror #24
+       eor             \out1, \out1, w15, ror #24
+       eor             \out0, \out0, w16, ror #16
+       eor             \out1, \out1, w17, ror #16
        eor             \out0, \out0, \t0, ror #8
        eor             \out1, \out1, \t1, ror #8
        .endm
 
-       .macro          fround, out0, out1, out2, out3, in0, in1, in2, in3
-       __hround        \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
-       __hround        \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+       .macro          fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+       __hround        \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
+       __hround        \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
        .endm
 
-       .macro          iround, out0, out1, out2, out3, in0, in1, in2, in3
-       __hround        \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
-       __hround        \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+       .macro          iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+       __hround        \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
+       __hround        \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
        .endm
 
-       .macro          do_crypt, round, ttab, ltab
-       ldp             w5, w6, [in]
-       ldp             w7, w8, [in, #8]
-       ldp             w9, w10, [rk], #16
-       ldp             w11, w12, [rk, #-8]
+       .macro          do_crypt, round, ttab, ltab, bsz
+       ldp             w4, w5, [in]
+       ldp             w6, w7, [in, #8]
+       ldp             w8, w9, [rk], #16
+       ldp             w10, w11, [rk, #-8]
 
+CPU_BE(        rev             w4, w4          )
 CPU_BE(        rev             w5, w5          )
 CPU_BE(        rev             w6, w6          )
 CPU_BE(        rev             w7, w7          )
-CPU_BE(        rev             w8, w8          )
 
+       eor             w4, w4, w8
        eor             w5, w5, w9
        eor             w6, w6, w10
        eor             w7, w7, w11
-       eor             w8, w8, w12
 
        adr_l           tt, \ttab
-       adr_l           lt, \ltab
 
        tbnz            rounds, #1, 1f
 
-0:     \round          w9, w10, w11, w12, w5, w6, w7, w8
-       \round          w5, w6, w7, w8, w9, w10, w11, w12
+0:     \round          w8, w9, w10, w11, w4, w5, w6, w7
+       \round          w4, w5, w6, w7, w8, w9, w10, w11
 
 1:     subs            rounds, rounds, #4
-       \round          w9, w10, w11, w12, w5, w6, w7, w8
-       csel            tt, tt, lt, hi
-       \round          w5, w6, w7, w8, w9, w10, w11, w12
-       b.hi            0b
-
+       \round          w8, w9, w10, w11, w4, w5, w6, w7
+       b.ls            3f
+2:     \round          w4, w5, w6, w7, w8, w9, w10, w11
+       b               0b
+3:     adr_l           tt, \ltab
+       \round          w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b
+
+CPU_BE(        rev             w4, w4          )
 CPU_BE(        rev             w5, w5          )
 CPU_BE(        rev             w6, w6          )
 CPU_BE(        rev             w7, w7          )
-CPU_BE(        rev             w8, w8          )
 
-       stp             w5, w6, [out]
-       stp             w7, w8, [out, #8]
+       stp             w4, w5, [out]
+       stp             w6, w7, [out, #8]
        ret
        .endm
 
-       .align          5
+       .align          L1_CACHE_SHIFT
+       .type           __aes_arm64_inverse_sbox, %object
+__aes_arm64_inverse_sbox:
+       .byte           0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+       .byte           0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+       .byte           0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+       .byte           0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+       .byte           0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+       .byte           0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+       .byte           0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+       .byte           0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+       .byte           0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+       .byte           0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+       .byte           0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+       .byte           0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+       .byte           0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+       .byte           0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+       .byte           0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+       .byte           0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+       .byte           0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+       .byte           0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+       .byte           0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+       .byte           0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+       .byte           0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+       .byte           0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+       .byte           0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+       .byte           0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+       .byte           0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+       .byte           0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+       .byte           0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+       .byte           0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+       .byte           0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+       .byte           0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+       .byte           0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+       .byte           0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+       .size           __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox
+
 ENTRY(__aes_arm64_encrypt)
-       do_crypt        fround, crypto_ft_tab, crypto_fl_tab
+       do_crypt        fround, crypto_ft_tab, crypto_ft_tab + 1, 2
 ENDPROC(__aes_arm64_encrypt)
 
        .align          5
 ENTRY(__aes_arm64_decrypt)
-       do_crypt        iround, crypto_it_tab, crypto_il_tab
+       do_crypt        iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0
 ENDPROC(__aes_arm64_decrypt)
-- 
2.9.3
