gcm-aes-ce - add NEON yield support

Ard Biesheuvel Fri, 18 Dec 2020 09:04:51 -0800

GCM mode is typically used for IPsec, but is now also used for software
kTLS, which means that we may end up operating on much larger inputs.


So let's wire up conditional NEON yield support for this driver, to ensure
that it does not cause scheduling blackouts when used with large inputs.

Given that GCM executes at 1-2 cycles per bytes and operates on 64 byte
chunks, doing a yield check every iteration should limit the scheduling
(or softirq) latency to < 200 cycles, which is a very conservative upper
bound.

Signed-off-by: Ard Biesheuvel <a...@kernel.org>
---
 arch/arm64/crypto/ghash-ce-core.S | 115 ++++++++++++--------
 1 file changed, 67 insertions(+), 48 deletions(-)

diff --git a/arch/arm64/crypto/ghash-ce-core.S 
b/arch/arm64/crypto/ghash-ce-core.S
index 7868330dd54e..533cfb810232 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -435,14 +435,23 @@ SYM_FUNC_END(pmull_ghash_update_p8)
 
        .align          6
        .macro          pmull_gcm_do_crypt, enc
-       stp             x29, x30, [sp, #-32]!
-       mov             x29, sp
-       str             x19, [sp, #24]
+       frame_push      8
 
-       load_round_keys x7, x6, x8
+       mov             x19, x0
+       mov             x20, x1
+       mov             x21, x2
+       mov             x22, x3
+       mov             x23, x4
+       mov             x24, x5
+       mov             x25, x6
+       mov             x26, x7
 
-       ld1             {SHASH.2d}, [x3], #16
-       ld1             {HH.2d-HH4.2d}, [x3]
+.Lrestart\enc:
+       load_round_keys x26, x25, x8
+
+       add             x0, x22, #16
+       ld1             {SHASH.2d}, [x22]
+       ld1             {HH.2d-HH4.2d}, [x0]
 
        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
@@ -452,23 +461,23 @@ SYM_FUNC_END(pmull_ghash_update_p8)
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b
 
-       ld1             {XL.2d}, [x4]
+       ld1             {XL.2d}, [x23]
 
-       cbz             x0, 3f                          // tag only?
+       cbz             x19, 3f                         // tag only?
 
-       ldr             w8, [x5, #12]                   // load lower counter
+       ldr             w8, [x24, #12]                  // load lower counter
 CPU_LE(        rev             w8, w8          )
 
 0:     mov             w9, #4                          // max blocks per round
-       add             x10, x0, #0xf
+       add             x10, x19, #0xf
        lsr             x10, x10, #4                    // remaining blocks
 
-       subs            x0, x0, #64
+       subs            x19, x19, #64
        csel            w9, w10, w9, mi
        add             w8, w8, w9
 
        bmi             1f
-       ld1             {INP0.16b-INP3.16b}, [x2], #64
+       ld1             {INP0.16b-INP3.16b}, [x21], #64
        .subsection     1
        /*
         * Populate the four input registers right to left with up to 63 bytes
@@ -487,30 +496,30 @@ CPU_LE(   rev             w8, w8          )
         * input size is < 16 bytes)
         */
 1:     mov             x15, #16
-       ands            x19, x0, #0xf
-       csel            x19, x19, x15, ne
+       ands            x0, x19, #0xf
+       csel            x0, x0, x15, ne
        adr_l           x17, .Lpermute_table + 16
 
-       sub             x11, x15, x19
+       sub             x11, x15, x0
        add             x12, x17, x11
        sub             x17, x17, x11
        ld1             {T1.16b}, [x12]
-       sub             x10, x1, x11
-       sub             x11, x2, x11
+       sub             x10, x20, x11
+       sub             x11, x21, x11
 
-       cmp             x0, #-16
+       cmp             x19, #-16
        csel            x14, x15, xzr, gt
-       cmp             x0, #-32
+       cmp             x19, #-32
        csel            x15, x15, xzr, gt
-       cmp             x0, #-48
-       csel            x16, x19, xzr, gt
-       csel            x1, x1, x10, gt
-       csel            x2, x2, x11, gt
-
-       ld1             {INP0.16b}, [x2], x14
-       ld1             {INP1.16b}, [x2], x15
-       ld1             {INP2.16b}, [x2], x16
-       ld1             {INP3.16b}, [x2]
+       cmp             x19, #-48
+       csel            x16, x0, xzr, gt
+       csel            x20, x20, x10, gt
+       csel            x21, x21, x11, gt
+
+       ld1             {INP0.16b}, [x21], x14
+       ld1             {INP1.16b}, [x21], x15
+       ld1             {INP2.16b}, [x21], x16
+       ld1             {INP3.16b}, [x21]
        tbl             INP3.16b, {INP3.16b}, T1.16b
        b               2f
        .previous
@@ -521,14 +530,24 @@ CPU_LE(   rev             w8, w8          )
 
        bl              pmull_gcm_enc_4x
 
-       tbnz            x0, #63, 6f
-       st1             {INP0.16b-INP3.16b}, [x1], #64
+       tbnz            x19, #63, 6f
+       st1             {INP0.16b-INP3.16b}, [x20], #64
        .if             \enc == 1
        bl              pmull_gcm_ghash_4x
        .endif
-       bne             0b
 
-3:     ldp             x19, x10, [sp, #24]
+       beq             3f
+
+       if_will_cond_yield_neon
+CPU_LE(        rev             w8, w8          )
+       str             w8, [x24, #12]                  // store lower counter
+       st1             {XL.2d}, [x23]
+       do_cond_yield_neon
+       b               .Lrestart\enc
+       endif_yield_neon
+       b               0b
+
+3:     ldr             x10, [sp, #.Lframe_local_offset]
        cbz             x10, 5f                         // output tag?
 
        ld1             {INP3.16b}, [x10]               // load lengths[]
@@ -536,10 +555,10 @@ CPU_LE(   rev             w8, w8          )
        bl              pmull_gcm_ghash_4x
 
        mov             w11, #(0x1 << 24)               // BE '1U'
-       ld1             {KS0.16b}, [x5]
+       ld1             {KS0.16b}, [x24]
        mov             KS0.s[3], w11
 
-       enc_block       KS0, x7, x6, x12
+       enc_block       KS0, x26, x25, x12
 
        ext             XL.16b, XL.16b, XL.16b, #8
        rev64           XL.16b, XL.16b
@@ -548,7 +567,7 @@ CPU_LE(     rev             w8, w8          )
        .if             \enc == 1
        st1             {XL.16b}, [x10]                 // store tag
        .else
-       ldp             x11, x12, [sp, #40]             // load tag pointer and 
authsize
+       ldp             x11, x12, [sp, #.Lframe_local_offset + 8] // load tag 
pointer and authsize
        adr_l           x17, .Lpermute_table
        ld1             {KS0.16b}, [x11]                // load supplied tag
        add             x17, x17, x12
@@ -561,33 +580,33 @@ CPU_LE(   rev             w8, w8          )
        smov            w0, v0.b[0]                     // return b0
        .endif
 
-4:     ldp             x29, x30, [sp], #32
+4:     frame_pop
        ret
 
 5:
 CPU_LE(        rev             w8, w8          )
-       str             w8, [x5, #12]                   // store lower counter
-       st1             {XL.2d}, [x4]
+       str             w8, [x24, #12]                  // store lower counter
+       st1             {XL.2d}, [x23]
        b               4b
 
 6:     ld1             {T1.16b-T2.16b}, [x17], #32     // permute vectors
-       sub             x17, x17, x19, lsl #1
+       sub             x17, x17, x0, lsl #1
 
        cmp             w9, #1
        beq             7f
        .subsection     1
-7:     ld1             {INP2.16b}, [x1]
+7:     ld1             {INP2.16b}, [x20]
        tbx             INP2.16b, {INP3.16b}, T1.16b
        mov             INP3.16b, INP2.16b
        b               8f
        .previous
 
-       st1             {INP0.16b}, [x1], x14
-       st1             {INP1.16b}, [x1], x15
-       st1             {INP2.16b}, [x1], x16
+       st1             {INP0.16b}, [x20], x14
+       st1             {INP1.16b}, [x20], x15
+       st1             {INP2.16b}, [x20], x16
        tbl             INP3.16b, {INP3.16b}, T1.16b
        tbx             INP3.16b, {INP2.16b}, T2.16b
-8:     st1             {INP3.16b}, [x1]
+8:     st1             {INP3.16b}, [x20]
 
        .if             \enc == 1
        ld1             {T1.16b}, [x17]
@@ -699,7 +718,7 @@ SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
 SYM_FUNC_END(pmull_gcm_ghash_4x)
 
 SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
-       ld1             {KS0.16b}, [x5]                 // load upper counter
+       ld1             {KS0.16b}, [x24]                // load upper counter
        sub             w10, w8, #4
        sub             w11, w8, #3
        sub             w12, w8, #2
@@ -716,13 +735,13 @@ SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
        ins             KS2.s[3], w12
        ins             KS3.s[3], w13
 
-       add             x10, x6, #96                    // round key pointer
+       add             x10, x25, #96                   // round key pointer
        ld1             {K6.4s-K7.4s}, [x10], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
 
-       tbnz            x7, #2, .Lnot128
+       tbnz            x26, #2, .Lnot128
        .subsection     1
 .Lnot128:
        ld1             {K8.4s-K9.4s}, [x10], #32
@@ -733,7 +752,7 @@ SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
        .irp            key, K8, K9
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
-       tbz             x7, #1, .Lout192
+       tbz             x26, #1, .Lout192
        b               .Lout256
        .previous
 
-- 
2.17.1

[RFC PATCH 3/5] crypto: arm64/gcm-aes-ce - add NEON yield support

Reply via email to