For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox exported by the generic AES driver, which is 4x smaller than
its expanded inverse lookup table.
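To illustrate the idea (this snippet is not part of the patch: the
register choices are arbitrary, and the '+ 1' offset assumes the
little-endian layout { 2*S[x], S[x], S[x], 3*S[x] } of each 32-bit
forward table entry), a single final round byte substitution then
amounts to roughly:

        // encryption: byte 1 of every 32-bit crypto_ft_tab entry is the
        // plain Sbox output, so do byte loads from crypto_ft_tab + 1
        adr_l   x2, crypto_ft_tab + 1
        ubfiz   w12, w4, #2, #8                 // w12 = (w4 & 0xff) * 4
        ldrb    w12, [x2, w12, uxtw]            // w12 = Sbox[w4 & 0xff]

        // decryption: index the 256 byte inverse Sbox directly
        adr_l   x2, crypto_aes_inv_sbox
        ubfx    w13, w5, #0, #8                 // w13 = w5 & 0xff
        ldrb    w13, [x2, w13, uxtw]            // w13 = InvSbox[w5 & 0xff]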

This significantly reduces the Dcache footprint of our code, and does
not introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines. It
also frees up register x18, which is not available as a scratch register
on all platforms; avoiding it improves the shareability of this
code.
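
One wrinkle worth spelling out (see also the comment added to __pair
below): AArch64 has no byte load with a scaled register index, so only
the state byte that already sits in bits 7:0 can use the ldrb form
shown above. For the other byte positions, the final round falls back
to a 32-bit load with a scaled index and discards the high bytes
afterwards. Roughly, for the little-endian case (again illustrative
only, not part of the patch):

        ubfx    w12, w4, #8, #8                 // byte taken from bits 15:8
        ldr     w12, [x2, w12, uxtw #2]         // 32-bit load at tt + 4 * index
        uxtb    w12, w12                        // keep Sbox[index] only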

Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
---
 arch/arm64/crypto/aes-cipher-core.S | 155 ++++++++++++++------
 1 file changed, 108 insertions(+), 47 deletions(-)

diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
index bbe5dd96135c..fe807f164d83 100644
--- a/arch/arm64/crypto/aes-cipher-core.S
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -18,99 +18,160 @@
        out             .req    x1
        in              .req    x2
        rounds          .req    x3
-       tt              .req    x4
-       lt              .req    x2
+       tt              .req    x2
 
-       .macro          __pair, enc, reg0, reg1, in0, in1e, in1d, shift
+       .macro          __ubf1, reg0, reg1, in0, in1e, in1d, sz, shift
        ubfx            \reg0, \in0, #\shift, #8
-       .if             \enc
        ubfx            \reg1, \in1e, #\shift, #8
-       .else
+       .endm
+
+       .macro          __ubf0, reg0, reg1, in0, in1e, in1d, sz, shift
+       ubfx            \reg0, \in0, #\shift, #8
        ubfx            \reg1, \in1d, #\shift, #8
+       .endm
+
+       .macro          __ubf1b, reg0, reg1, in0, in1e, in1d, sz, shift
+       .if             \shift == 0 && \sz > 0
+       ubfiz           \reg0, \in0, #\sz, #8
+       ubfiz           \reg1, \in1e, #\sz, #8
+       .else
+       __ubf1          \reg0, \reg1, \in0, \in1e, \in1d, \sz, \shift
+       .endif
+       .endm
+
+       .macro          __ubf0b, reg0, reg1, in0, in1e, in1d, sz, shift
+       .if             \shift == 0 && \sz > 0
+       ubfiz           \reg0, \in0, #\sz, #8
+       ubfiz           \reg1, \in1d, #\sz, #8
+       .else
+       __ubf0          \reg0, \reg1, \in0, \in1e, \in1d, \sz, \shift
        .endif
+       .endm
+
+       /*
+        * AArch64 cannot do byte size indexed loads from a table containing
+        * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
+        * valid instruction.
+        *
+        * For shift == 0, we can simply fold the size shift of the index
+        * into the ubfx instruction, by switching to ubfiz and using \sz as
+        * the destination offset.
+        * For shift > 0, we perform a 32-bit wide load instead, which does
+        * allow an index shift of 2, and discard the high bytes later using
+        * uxtb or lsl #24.
+        */
+       .macro          __pair, enc, sz, op, reg0, reg1, in0, in1e, in1d, shift
+       __ubf\enc\op    \reg0, \reg1, \in0, \in1e, \in1d, \sz, \shift
+       .ifnc           \op\sz, b2
+       ldr\op          \reg0, [tt, \reg0, uxtw #\sz]
+       ldr\op          \reg1, [tt, \reg1, uxtw #\sz]
+       .elseif         \shift == 0
+       ldrb            \reg0, [tt, \reg0, uxtw]
+       ldrb            \reg1, [tt, \reg1, uxtw]
+       .else
        ldr             \reg0, [tt, \reg0, uxtw #2]
        ldr             \reg1, [tt, \reg1, uxtw #2]
+       .endif
        .endm
 
-       .macro          __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc
+       .macro          __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
        ldp             \out0, \out1, [rk], #8
 
-       __pair          \enc, w13, w14, \in0, \in1, \in3, 0
-       __pair          \enc, w15, w16, \in1, \in2, \in0, 8
-       __pair          \enc, w17, w18, \in2, \in3, \in1, 16
-       __pair          \enc, \t0, \t1, \in3, \in0, \in2, 24
-
-       eor             \out0, \out0, w13
-       eor             \out1, \out1, w14
-       eor             \out0, \out0, w15, ror #24
-       eor             \out1, \out1, w16, ror #24
-       eor             \out0, \out0, w17, ror #16
-       eor             \out1, \out1, w18, ror #16
-       eor             \out0, \out0, \t0, ror #8
-       eor             \out1, \out1, \t1, ror #8
+       __pair          \enc, \sz, \op, w12, w13, \in0, \in1, \in3, 0
+       __pair          \enc, \sz, \op, w14, w15, \in3, \in0, \in2, 24
+       __pair          \enc, \sz, \op, w16, w17, \in2, \in3, \in1, 16
+       __pair          \enc, \sz, \op, \t0, \t1, \in1, \in2, \in0, 8
+
+       eor             \out0, \out0, w12
+       eor             \out1, \out1, w13
+
+       .ifnc           \op\sz, b2
+       eor             \out0, \out0, w14, ror #8
+       eor             \out1, \out1, w15, ror #8
+       .else
+CPU_BE(        lsr             w14, w14, #24           )
+CPU_BE(        lsr             w15, w15, #24           )
+
+       eor             \out0, \out0, w14, lsl #24
+       eor             \out1, \out1, w15, lsl #24
+
+CPU_LE(        uxtb            w16, w16                )
+CPU_LE(        uxtb            w17, w17                )
+CPU_LE(        uxtb            \t0, \t0                )
+CPU_LE(        uxtb            \t1, \t1                )
+
+CPU_BE(        lsr             w16, w16, #24           )
+CPU_BE(        lsr             w17, w17, #24           )
+CPU_BE(        lsr             \t0, \t0, #24           )
+CPU_BE(        lsr             \t1, \t1, #24           )
+       .endif
+
+       eor             \out0, \out0, w16, ror #16
+       eor             \out1, \out1, w17, ror #16
+       eor             \out0, \out0, \t0, ror #24
+       eor             \out1, \out1, \t1, ror #24
        .endm
 
-       .macro          fround, out0, out1, out2, out3, in0, in1, in2, in3
-       __hround        \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
-       __hround        \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+       .macro          fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+       __hround        \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
+       __hround        \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
        .endm
 
-       .macro          iround, out0, out1, out2, out3, in0, in1, in2, in3
-       __hround        \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
-       __hround        \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+       .macro          iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+       __hround        \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
+       __hround        \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
        .endm
 
-       .macro          do_crypt, round, ttab, ltab
-       ldp             w5, w6, [in]
-       ldp             w7, w8, [in, #8]
-       ldp             w9, w10, [rk], #16
-       ldp             w11, w12, [rk, #-8]
+       .macro          do_crypt, round, ttab, ltab, bsz
+       ldp             w4, w5, [in]
+       ldp             w6, w7, [in, #8]
+       ldp             w8, w9, [rk], #16
+       ldp             w10, w11, [rk, #-8]
 
+CPU_BE(        rev             w4, w4          )
 CPU_BE(        rev             w5, w5          )
 CPU_BE(        rev             w6, w6          )
 CPU_BE(        rev             w7, w7          )
-CPU_BE(        rev             w8, w8          )
 
+       eor             w4, w4, w8
        eor             w5, w5, w9
        eor             w6, w6, w10
        eor             w7, w7, w11
-       eor             w8, w8, w12
 
        adr_l           tt, \ttab
-       adr_l           lt, \ltab
 
        tbnz            rounds, #1, 1f
 
-0:     \round          w9, w10, w11, w12, w5, w6, w7, w8
-       \round          w5, w6, w7, w8, w9, w10, w11, w12
+0:     \round          w8, w9, w10, w11, w4, w5, w6, w7
+       \round          w4, w5, w6, w7, w8, w9, w10, w11
 
 1:     subs            rounds, rounds, #4
-       \round          w9, w10, w11, w12, w5, w6, w7, w8
-       csel            tt, tt, lt, hi
-       \round          w5, w6, w7, w8, w9, w10, w11, w12
-       b.hi            0b
-
+       \round          w8, w9, w10, w11, w4, w5, w6, w7
+       b.ls            3f
+2:     \round          w4, w5, w6, w7, w8, w9, w10, w11
+       b               0b
+3:     adr_l           tt, \ltab
+       \round          w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b
+
+CPU_BE(        rev             w4, w4          )
 CPU_BE(        rev             w5, w5          )
 CPU_BE(        rev             w6, w6          )
 CPU_BE(        rev             w7, w7          )
-CPU_BE(        rev             w8, w8          )
 
-       stp             w5, w6, [out]
-       stp             w7, w8, [out, #8]
+       stp             w4, w5, [out]
+       stp             w6, w7, [out, #8]
        ret
        .endm
 
        .align                  7
        aes_table_reduced       crypto_ft_tab
-       aes_table_reduced       crypto_fl_tab
        aes_table_reduced       crypto_it_tab
-       aes_table_reduced       crypto_il_tab
 
 ENTRY(__aes_arm64_encrypt)
-       do_crypt        fround, crypto_ft_tab, crypto_fl_tab
+       do_crypt        fround, crypto_ft_tab, crypto_ft_tab + 1, 2
 ENDPROC(__aes_arm64_encrypt)
 
        .align          5
 ENTRY(__aes_arm64_decrypt)
-       do_crypt        iround, crypto_it_tab, crypto_il_tab
+       do_crypt        iround, crypto_it_tab, crypto_aes_inv_sbox, 0
 ENDPROC(__aes_arm64_decrypt)
-- 
2.9.3
