CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: rin Date: Mon Aug 7 00:58:35 UTC 2023 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: sys/crypto/{aes,chacha}/arch/arm/arm_neon.h: Sync (whitespace fix) No binary changes. To generate a diff of this commit: cvs rdiff -u -r1.11 -r1.12 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/arm_neon.h diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.11 src/sys/crypto/aes/arch/arm/arm_neon.h:1.12 --- src/sys/crypto/aes/arch/arm/arm_neon.h:1.11 Mon Sep 7 18:06:13 2020 +++ src/sys/crypto/aes/arch/arm/arm_neon.h Mon Aug 7 00:58:35 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.11 2020/09/07 18:06:13 jakllsch Exp $ */ +/* $NetBSD: arm_neon.h,v 1.12 2023/08/07 00:58:35 rin Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -232,7 +232,7 @@ static __inline uint32_t vgetq_lane_u32(uint32x4_t __v, uint8_t __i) { #ifdef __aarch64__ - return __v[__neon_laneq_index(__v,__i)]; + return __v[__neon_laneq_index(__v, __i)]; #else return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); #endif
CVS commit: src/sys/crypto/aes/arch/arm
Module Name: src Committed By: rin Date: Mon Aug 7 00:58:35 UTC 2023 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: sys/crypto/{aes,chacha}/arch/arm/arm_neon.h: Sync (whitespace fix) No binary changes. To generate a diff of this commit: cvs rdiff -u -r1.11 -r1.12 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sun Jun 26 17:52:54 UTC 2022 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_subr.c Log Message: arm/aes_neon: Fix formatting of self-test failure message. Discovered by code inspection. Remarkably, a combination of errors made this fail to be a stack buffer overrun. Verified by booting with ARMv8.0-AES disabled and with the self-test artificially made to fail. To generate a diff of this commit: cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/aes/arch/arm/aes_neon_subr.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.7 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.8 --- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.7 Sun Aug 9 02:48:38 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c Sun Jun 26 17:52:54 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_subr.c,v 1.7 2020/08/09 02:48:38 riastradh Exp $ */ +/* $NetBSD: aes_neon_subr.c,v 1.8 2022/06/26 17:52:54 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -27,7 +27,7 @@ */ #include -__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.7 2020/08/09 02:48:38 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.8 2022/06/26 17:52:54 riastradh Exp $"); #ifdef _KERNEL #include @@ -183,11 +183,11 @@ aes_neon_xts_update_selftest(void) for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { storeblock(t, aes_neon_xts_update(loadblock(cases[i].in))); if (memcmp(t, cases[i].out, 16)) { - char buf[33]; + char buf[3*16 + 1]; unsigned j; for (j = 0; j < 16; j++) { -snprintf(buf + 2*j, sizeof(buf) - 2*j, +snprintf(buf + 3*j, sizeof(buf) - 3*j, " %02hhx", t[j]); } printf("%s %u: %s\n", __func__, i, buf);
CVS commit: src/sys/crypto/aes/arch/arm
Module Name: src Committed By: riastradh Date: Sun Jun 26 17:52:54 UTC 2022 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_subr.c Log Message: arm/aes_neon: Fix formatting of self-test failure message. Discovered by code inspection. Remarkably, a combination of errors made this fail to be a stack buffer overrun. Verified by booting with ARMv8.0-AES disabled and with the self-test artificially made to fail. To generate a diff of this commit: cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/aes/arch/arm/aes_neon_subr.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: rin Date: Sat Nov 21 08:09:21 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon.c Log Message: Fix build with clang for earmv7hf; loadroundkey() is used only for __aarch64__. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon.c diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.5 src/sys/crypto/aes/arch/arm/aes_neon.c:1.6 --- src/sys/crypto/aes/arch/arm/aes_neon.c:1.5 Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon.c Sat Nov 21 08:09:21 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $ */ +/* $NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -39,7 +39,7 @@ */ #include -__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $"); #include @@ -196,11 +196,13 @@ inv = VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0 inva = VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01, 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03); +#ifdef __aarch64__ static inline uint8x16_t loadroundkey(const void *rkp) { return vld1q_u8(rkp); } +#endif static inline void storeroundkey(void *rkp, uint8x16_t rk)
CVS commit: src/sys/crypto/aes/arch/arm
Module Name: src Committed By: rin Date: Sat Nov 21 08:09:21 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon.c Log Message: Fix build with clang for earmv7hf; loadroundkey() is used only for __aarch64__. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Thu Sep 10 11:31:04 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Gather mc_forward/backward so we can load 256 bits at once. To generate a diff of this commit: cvs rdiff -u -r1.10 -r1.11 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.10 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.11 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.10 Thu Sep 10 11:30:28 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Thu Sep 10 11:31:03 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $") +RCSID("$NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $") .fpu neon @@ -54,36 +54,26 @@ inva: .byte 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03 END(inva) - .type mc_forward,_ASM_TYPE_OBJECT -mc_forward: - .byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04 /* 0 */ + .type mc,_ASM_TYPE_OBJECT +mc: + .byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04 /* 0 forward */ .byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C - - .byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08 /* 1 */ + .byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 /* 0 backward */ + .byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E + .byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08 /* 1 forward */ .byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00 - - .byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C /* 2 */ + .byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02 /* 1 backward */ + .byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A + .byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C /* 2 forward */ 
.byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04 - + .byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E /* 2 backward */ + .byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 .Lmc_forward_3: - .byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00 /* 3 */ + .byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00 /* 3 forward */ .byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08 -END(mc_forward) - - .type mc_backward,_ASM_TYPE_OBJECT -mc_backward: - .byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 /* 0 */ - .byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E - - .byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02 /* 1 */ - .byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A - - .byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E /* 2 */ - .byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 - - .byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A /* 3 */ + .byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A /* 3 backward */ .byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02 -END(mc_backward) +END(mc) .type sr,_ASM_TYPE_OBJECT sr: @@ -210,8 +200,7 @@ ENTRY(aes_neon_enc1) /* * r3: rmod4 - * r4: mc_forward - * r5: mc_backward + * r4: mc * r6,r8,r10,ip: temporaries * q0={d0-d1}: x/ak/A * q1={d2-d3}: 0x0f0f... 
@@ -225,8 +214,8 @@ ENTRY(aes_neon_enc1) * q9={d18-d19}: sb2[1] * q10={d20-d21}: inv * q11={d22-d23}: inva - * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_backward[rmod4] - * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_forward[rmod4] + * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc[rmod4].backward + * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc[rmod4].forward * q14={d28-d29}: rk/A2/A2_B_D * q15={d30-d31}: A2_B/sr[rmod4] */ @@ -254,9 +243,8 @@ ENTRY(aes_neon_enc1) vld1.8 {q8-q9}, [r6 :256] /* q8 = sb2[0], q9 = sb2[1] */ vld1.8 {q10-q11}, [r8 :256] /* q10 = inv, q11 = inva */ - /* (r4, r5) := (_forward[0], _backward[0]) */ - add r4, ip, #(mc_forward - .Lconstants) - add r5, ip, #(mc_backward - .Lconstants) + /* r4 := mc */ + add r4, ip, #(mc - .Lconstants) /* (q2, q3) := (lo, hi) */ vshr.u8 q3, q0, #4 @@ -291,13 +279,11 @@ ENTRY(aes_neon_enc1) vtbl.8 d25, {q8}, d5 vtbl.8 d26, {q9}, d6 vtbl.8 d27, {q9}, d7 + add r6, r4, r3, lsl #5 /* r6 := [rmod4] */ veor q14, q12, q13 - /* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */ - add r6, r4, r3, lsl #4 - add r8, r5, r3, lsl #4 - vld1.8 {q12}, [r6 :128] - vld1.8 {q13}, [r8 :128] + /* (q12, q13) := (mc[rmod4].forward, mc[rmod4].backward) */ + vld1.8 {q12-q13}, [r6 :256] /* q15 := A2_B = A2 + A(mcf) */ vtbl.8 d30, {q0}, d24 @@ -474,7 +460,7 @@ ENTRY(aes_neon_dec1) add r8, ip, #(.Lmc_forward_3 - .Lconstants) vld1.8 {q6-q7}, [r4 :256] /* q6 := dsbb[0], q7 := dsbb[1] */ vld1.8 {q10-q11}, [r6 :256] /* q10 := inv, q11 := inva */ - vld1.8 {q15}, [r8 :128] /* q15 := mc_forward[3] */ + vld1.8 {q15}, [r8 :128] /* q15 := mc[3].forward */ /* (q2, q3) := (lo, hi) */ vshr.u8 q3, q0, #4
CVS commit: src/sys/crypto/aes/arch/arm
Module Name: src Committed By: riastradh Date: Thu Sep 10 11:31:04 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Gather mc_forward/backward so we can load 256 bits at once. To generate a diff of this commit: cvs rdiff -u -r1.10 -r1.11 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Thu Sep 10 11:30:28 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Hoist dsbd/dsbe address calculation out of loop. To generate a diff of this commit: cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.9 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.10 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.9 Thu Sep 10 11:30:08 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Thu Sep 10 11:30:28 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.9 2020/09/10 11:30:08 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_neon_32.S,v 1.9 2020/09/10 11:30:08 riastradh Exp $") +RCSID("$NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $") .fpu neon @@ -431,6 +431,9 @@ ENTRY(aes_neon_dec1) /* * r3: 3 & ~(nrounds - 1) + * r4: dsbd + * r5: dsbe + * r6,r8,r10,ip: temporaries * q0={d0-d1}: x/ak * q1={d2-d3}: 0x0f0f... * q2={d4-d5}: lo/k/j/io @@ -488,6 +491,10 @@ ENTRY(aes_neon_dec1) add r4, ip, #(dsb9 - .Lconstants) vld1.8 {q4-q5}, [r4 :256] /* q4 := dsb9[0], q5 := dsb9[1] */ + /* r4 := dsbd, r5 := dsbe */ + add r4, ip, #(dsbd - .Lconstants) + add r5, ip, #(dsbe - .Lconstants) + /* q0 := rk[0] + diptlo(lo) + dipthi(hi) */ veor q0, q14, q2 veor q0, q0, q3 @@ -496,7 +503,6 @@ ENTRY(aes_neon_dec1) _ALIGN_TEXT 1: /* load dsbd */ - add r4, ip, #(dsbd - .Lconstants) vld1.8 {q8-q9}, [r4 :256] /* q8 := dsbd[0], q9 := dsbd[1] */ vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */ @@ -522,8 +528,7 @@ ENTRY(aes_neon_dec1) veor q0, q0, q13 /* load dsbe */ - add r4, ip, #(dsbe - .Lconstants) - vld1.8 {q8-q9}, [r4 :256]! 
/* q8 := dsbe[0], q9 := dsbe[1] */ + vld1.8 {q8-q9}, [r5 :256] /* q8 := dsbe[0], q9 := dsbe[1] */ /* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */ vtbl.8 d28, {q0}, d30
CVS commit: src/sys/crypto/aes/arch/arm
Module Name: src Committed By: riastradh Date: Thu Sep 10 11:30:28 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Hoist dsbd/dsbe address calculation out of loop. To generate a diff of this commit: cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name: src Committed By: riastradh Date: Thu Sep 10 11:30:08 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Tweak register usage. - Call r12 by its usual name, ip. - No need for r7 or r11=fp at the moment. To generate a diff of this commit: cvs rdiff -u -r1.8 -r1.9 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Thu Sep 10 11:30:08 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Tweak register usage. - Call r12 by its usual name, ip. - No need for r7 or r11=fp at the moment. To generate a diff of this commit: cvs rdiff -u -r1.8 -r1.9 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.8 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.9 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.8 Thu Sep 10 11:29:43 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Thu Sep 10 11:30:08 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.8 2020/09/10 11:29:43 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.9 2020/09/10 11:30:08 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_neon_32.S,v 1.8 2020/09/10 11:29:43 riastradh Exp $") +RCSID("$NetBSD: aes_neon_32.S,v 1.9 2020/09/10 11:30:08 riastradh Exp $") .fpu neon @@ -205,14 +205,14 @@ ENTRY(aes_neon_enc1) vldr d1, [sp] /* d1 := x hi */ ldr r1, [sp, #8] /* r1 := nrounds */ #endif - push {r4, r5, r6, r7, r8, r10, r11, lr} + push {r4, r5, r6, r8, r10, lr} vpush {d8-d15} /* * r3: rmod4 * r4: mc_forward * r5: mc_backward - * r6,r7,r8,r10,r11,r12: temporaries + * r6,r8,r10,ip: temporaries * q0={d0-d1}: x/ak/A * q1={d2-d3}: 0x0f0f... * q2={d4-d5}: lo/k/j/io @@ -231,32 +231,32 @@ ENTRY(aes_neon_enc1) * q15={d30-d31}: A2_B/sr[rmod4] */ - /* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */ - ldr r12, .Lconstants_addr - adr r11, .Lconstants_addr + /* ip := .Lconstants - .Lconstants_addr, r10 := .Lconstants_addr */ + ldr ip, .Lconstants_addr + adr r10, .Lconstants_addr vld1.8 {q14}, [r0 :128]! 
/* q14 = *rk++ */ movw r3, #0 vmov.i8 q1, #0x0f - /* r12 := .Lconstants */ - add r12, r12, r11 + /* ip := .Lconstants */ + add ip, ip, r10 /* (q4, q5) := (iptlo, ipthi) */ - add r6, r12, #(ipt - .Lconstants) + add r6, ip, #(ipt - .Lconstants) vld1.8 {q4-q5}, [r6 :256] /* load the rest of the constants */ - add r4, r12, #(sb1 - .Lconstants) - add r6, r12, #(sb2 - .Lconstants) - add r8, r12, #(.Linv_inva - .Lconstants) + add r4, ip, #(sb1 - .Lconstants) + add r6, ip, #(sb2 - .Lconstants) + add r8, ip, #(.Linv_inva - .Lconstants) vld1.8 {q6-q7}, [r4 :256] /* q6 = sb1[0], q7 = sb1[1] */ vld1.8 {q8-q9}, [r6 :256] /* q8 = sb2[0], q9 = sb2[1] */ vld1.8 {q10-q11}, [r8 :256] /* q10 = inv, q11 = inva */ /* (r4, r5) := (_forward[0], _backward[0]) */ - add r4, r12, #(mc_forward - .Lconstants) - add r5, r12, #(mc_backward - .Lconstants) + add r4, ip, #(mc_forward - .Lconstants) + add r5, ip, #(mc_backward - .Lconstants) /* (q2, q3) := (lo, hi) */ vshr.u8 q3, q0, #4 @@ -295,9 +295,9 @@ ENTRY(aes_neon_enc1) /* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */ add r6, r4, r3, lsl #4 - add r7, r5, r3, lsl #4 + add r8, r5, r3, lsl #4 vld1.8 {q12}, [r6 :128] - vld1.8 {q13}, [r7 :128] + vld1.8 {q13}, [r8 :128] /* q15 := A2_B = A2 + A(mcf) */ vtbl.8 d30, {q0}, d24 @@ -365,8 +365,8 @@ ENTRY(aes_neon_enc1) bne 1b /* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */ - add r8, r12, #(sr - .Lconstants) - add r6, r12, #(sbo - .Lconstants) + add r8, ip, #(sr - .Lconstants) + add r6, ip, #(sbo - .Lconstants) add r8, r8, r3, lsl #4 vld1.8 {q6-q7}, [r6 :256] vld1.8 {q15}, [r8 :128] @@ -388,7 +388,7 @@ ENTRY(aes_neon_enc1) vtbl.8 d1, {q2}, d31 vpop {d8-d15} - pop {r4, r5, r6, r7, r8, r10, r11, lr} + pop {r4, r5, r6, r8, r10, lr} #ifdef __SOFTFP__ #ifdef __ARM_BIG_ENDIAN vmov r1, r0, d0 @@ -426,7 +426,7 @@ ENTRY(aes_neon_dec1) vldr d1, [sp] /* d1 := x hi */ ldr r1, [sp, #8] /* r1 := nrounds */ #endif - push {r4, r5, r6, r7, r8, r10, r11, lr} + push {r4, r5, r6, r8, r10, lr} vpush {d8-d15} /* 
@@ -449,26 +449,26 @@ ENTRY(aes_neon_dec1) * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)] */ - /* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */ - ldr r12, .Lconstants_addr - adr r11, .Lconstants_addr + /* ip := .Lconstants - .Lconstants_addr, r10 := .Lconstants_addr */ + ldr ip, .Lconstants_addr + adr r10, .Lconstants_addr vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */ rsb r3, r1, #0 /* r3 := ~(x - 1) = -x */ vmov.i8 q1, #0x0f and r3, r3, #3 /* r3 := 3 & ~(x - 1) */ - /* r12 := .Lconstants */ - add r12, r12, r11 + /* ip := .Lconstants */ + add ip, ip, r10 /* (q4, q5) := (diptlo, dipthi) */ - add r6, r12, #(dipt - .Lconstants) + add r6, ip, #(dipt - .Lconstants) vld1.8 {q4-q5}, [r6 :256] /* load the rest of the constants */ - add r4, r12, #(dsbb - .Lconstants) - add r6, r12, #(.Linv_inva - .Lconstants) - add r8, r12, #(.Lmc_forward_3 -
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Thu Sep 10 11:29:43 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Write vtbl with {qN} rather than {d(2N)-d(2N+1)}. Cosmetic; no functional change. To generate a diff of this commit: cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.7 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.8 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.7 Thu Sep 10 11:29:02 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Thu Sep 10 11:29:43 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.8 2020/09/10 11:29:43 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $") +RCSID("$NetBSD: aes_neon_32.S,v 1.8 2020/09/10 11:29:43 riastradh Exp $") .fpu neon @@ -264,10 +264,10 @@ ENTRY(aes_neon_enc1) vand q3, q3, q1 /* q3 := (x >> 4) & 0x0f0f... */ /* (q2, q3) := (iptlo(lo), ipthi(hi)) */ - vtbl.8 d4, {d8-d9}, d4 - vtbl.8 d5, {d8-d9}, d5 - vtbl.8 d6, {d10-d11}, d6 - vtbl.8 d7, {d10-d11}, d7 + vtbl.8 d4, {q4}, d4 + vtbl.8 d5, {q4}, d5 + vtbl.8 d6, {q5}, d6 + vtbl.8 d7, {q5}, d7 /* q0 := rk[0] + iptlo(lo) + ipthi(hi) */ veor q0, q14, q2 @@ -279,18 +279,18 @@ ENTRY(aes_neon_enc1) 1: vld1.8 {q14}, [r0 :128]! 
/* q14 = *rk++ */ /* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */ - vtbl.8 d24, {d12-d13}, d4 - vtbl.8 d25, {d12-d13}, d5 - vtbl.8 d26, {d14-d15}, d6 - vtbl.8 d27, {d14-d15}, d7 + vtbl.8 d24, {q6}, d4 + vtbl.8 d25, {q6}, d5 + vtbl.8 d26, {q7}, d6 + vtbl.8 d27, {q7}, d7 veor q0, q14, q12 veor q0, q0, q13 /* q14 := A2 = sb2_0[io] + sb2_1[jo] */ - vtbl.8 d24, {d16-d17}, d4 - vtbl.8 d25, {d16-d17}, d5 - vtbl.8 d26, {d18-d19}, d6 - vtbl.8 d27, {d18-d19}, d7 + vtbl.8 d24, {q8}, d4 + vtbl.8 d25, {q8}, d5 + vtbl.8 d26, {q9}, d6 + vtbl.8 d27, {q9}, d7 veor q14, q12, q13 /* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */ @@ -300,18 +300,18 @@ ENTRY(aes_neon_enc1) vld1.8 {q13}, [r7 :128] /* q15 := A2_B = A2 + A(mcf) */ - vtbl.8 d30, {d0-d1}, d24 - vtbl.8 d31, {d0-d1}, d25 + vtbl.8 d30, {q0}, d24 + vtbl.8 d31, {q0}, d25 veor q15, q15, q14 /* q14 := A2_B_D = A2_B + A(mcb) */ - vtbl.8 d28, {d0-d1}, d26 - vtbl.8 d29, {d0-d1}, d27 + vtbl.8 d28, {q0}, d26 + vtbl.8 d29, {q0}, d27 veor q14, q14, q15 /* q0 := x = A2_B_D + A2_B(mcf) */ - vtbl.8 d0, {d30-d31}, d24 - vtbl.8 d1, {d30-d31}, d25 + vtbl.8 d0, {q15}, d24 + vtbl.8 d1, {q15}, d25 veor q0, q0, q14 2: /* @@ -324,19 +324,19 @@ ENTRY(aes_neon_enc1) vand q3, q3, q1 /* q3 := (x >> 4) & 0x0f0f... 
*/ /* q0 := a/k */ - vtbl.8 d0, {d22-d23}, d4 - vtbl.8 d1, {d22-d23}, d5 + vtbl.8 d0, {q11}, d4 + vtbl.8 d1, {q11}, d5 /* q2 := j = i + k */ veor q2, q3, q2 /* q12 := ir = 1/i */ - vtbl.8 d24, {d20-d21}, d6 - vtbl.8 d25, {d20-d21}, d7 + vtbl.8 d24, {q10}, d6 + vtbl.8 d25, {q10}, d7 /* q13 := jr = 1/j */ - vtbl.8 d26, {d20-d21}, d4 - vtbl.8 d27, {d20-d21}, d5 + vtbl.8 d26, {q10}, d4 + vtbl.8 d27, {q10}, d5 /* q12 := iak = 1/i + a/k */ veor q12, q12, q0 @@ -345,12 +345,12 @@ ENTRY(aes_neon_enc1) veor q13, q13, q0 /* q12 := iakr = 1/(1/i + a/k) */ - vtbl.8 d24, {d20-d21}, d24 - vtbl.8 d25, {d20-d21}, d25 + vtbl.8 d24, {q10}, d24 + vtbl.8 d25, {q10}, d25 /* q13 := jakr = 1/(1/j + a/k) */ - vtbl.8 d26, {d20-d21}, d26 - vtbl.8 d27, {d20-d21}, d27 + vtbl.8 d26, {q10}, d26 + vtbl.8 d27, {q10}, d27 /* q2 := io = j + 1/(1/i + a/k) */ veor q2, q2, q12 @@ -374,18 +374,18 @@ ENTRY(aes_neon_enc1) vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */ /* (q2, q3) := (sbo_0(io), sbo_1(jo)) */ - vtbl.8 d4, {d12-d13}, d4 - vtbl.8 d5, {d12-d13}, d5 - vtbl.8 d6, {d14-d15}, d6 - vtbl.8 d7, {d14-d15}, d7 + vtbl.8 d4, {q6}, d4 + vtbl.8 d5, {q6}, d5 + vtbl.8 d6, {q7}, d6 + vtbl.8 d7, {q7}, d7 /* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */ veor q2, q2, q14 veor q2, q2, q3 /* q0 := x(sr[rmod4]) */ - vtbl.8 d0, {d4-d5}, d30 - vtbl.8 d1, {d4-d5}, d31 + vtbl.8 d0, {q2}, d30 + vtbl.8 d1, {q2}, d31 vpop {d8-d15} pop {r4, r5, r6, r7, r8, r10, r11, lr} @@ -479,10 +479,10 @@ ENTRY(aes_neon_dec1) vand q3, q3, q1 /* q3 := (x >> 4) & 0x0f0f... */ /* (q2, q3) := (diptlo(lo), dipthi(hi)) */ - vtbl.8 d4, {d8-d9}, d4 - vtbl.8 d5, {d8-d9}, d5 - vtbl.8 d6, {d10-d11}, d6 - vtbl.8 d7, {d10-d11}, d7 + vtbl.8 d4, {q4}, d4 + vtbl.8 d5, {q4}, d5 + vtbl.8 d6, {q5}, d6 + vtbl.8 d7, {q5}, d7 /* load dsb9 */ add r4, r12, #(dsb9 - .Lconstants) @@ -502,22 +502,22 @@ ENTRY(aes_neon_dec1) vld1.8 {q14}, [r0 :128]! /* q14 = *rk++ */ /* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo)
CVS commit: src/sys/crypto/aes/arch/arm
Module Name: src Committed By: riastradh Date: Thu Sep 10 11:29:43 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Write vtbl with {qN} rather than {d(2N)-d(2N+1)}. Cosmetic; no functional change. To generate a diff of this commit: cvs rdiff -u -r1.7 -r1.8 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name: src Committed By: riastradh Date: Thu Sep 10 11:29:02 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Issue 256-bit loads rather than pairs of 128-bit loads. Not sure why I didn't realize you could do this before! Saves some temporary registers that can now be allocated to shave off a few cycles. To generate a diff of this commit: cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Thu Sep 10 11:29:02 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: aes neon: Issue 256-bit loads rather than pairs of 128-bit loads. Not sure why I didn't realize you could do this before! Saves some temporary registers that can now be allocated to shave off a few cycles. To generate a diff of this commit: cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.7 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6 Sun Aug 16 18:02:03 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Thu Sep 10 11:29:02 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $") +RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $") .fpu neon @@ -38,9 +38,10 @@ RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020 .long .Lconstants - . 
.section .rodata - .p2align 4 + .p2align 5 .Lconstants: +.Linv_inva: /* inv and inva must be consecutive */ .type inv,_ASM_TYPE_OBJECT inv: .byte 0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E @@ -99,125 +100,85 @@ sr: .byte 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03 END(sr) - .type iptlo,_ASM_TYPE_OBJECT -iptlo: - .byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2 + .type ipt,_ASM_TYPE_OBJECT +ipt: + .byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2 /* lo */ .byte 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA -END(iptlo) - - .type ipthi,_ASM_TYPE_OBJECT -ipthi: - .byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C + .byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C /* hi */ .byte 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD -END(ipthi) +END(ipt) - .type sb1_0,_ASM_TYPE_OBJECT -sb1_0: - .byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 + .type sb1,_ASM_TYPE_OBJECT +sb1: + .byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 /* 0 */ .byte 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5 -END(sb1_0) - - .type sb1_1,_ASM_TYPE_OBJECT -sb1_1: - .byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 + .byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 /* 1 */ .byte 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B -END(sb1_1) +END(sb1) - .type sb2_0,_ASM_TYPE_OBJECT -sb2_0: - .byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 + .type sb2,_ASM_TYPE_OBJECT +sb2: + .byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 /* 0 */ .byte 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E -END(sb2_0) - - .type sb2_1,_ASM_TYPE_OBJECT -sb2_1: - .byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 + .byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 /* 1 */ .byte 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2 -END(sb2_1) +END(sb2) - .type sbo_0,_ASM_TYPE_OBJECT -sbo_0: - .byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 + .type sbo,_ASM_TYPE_OBJECT +sbo: + .byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 /* 0 */ .byte 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15 -END(sbo_0) - - .type sbo_1,_ASM_TYPE_OBJECT -sbo_1: - .byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF + .byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF /* 1 */ 
.byte 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E -END(sbo_1) +END(sbo) - .type diptlo,_ASM_TYPE_OBJECT -diptlo: - .byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F + .type dipt,_ASM_TYPE_OBJECT +dipt: + .byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F /* lo */ .byte 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15 -END(diptlo) - - .type dipthi,_ASM_TYPE_OBJECT -dipthi: - .byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86 + .byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86 /* hi */ .byte 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12 -END(dipthi) +END(dipt) - .type dsb9_0,_ASM_TYPE_OBJECT -dsb9_0: - .byte 0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85 + .type dsb9,_ASM_TYPE_OBJECT +dsb9: + .byte 0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85 /* 0 */ .byte 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA -END(dsb9_0) - - .type dsb9_1,_ASM_TYPE_OBJECT -dsb9_1: - .byte 0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0 + .byte 0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0 /* 1 */ .byte 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72 -END(dsb9_1) +END(dsb9) - .type dsbd_0,_ASM_TYPE_OBJECT -dsbd_0: - .byte 0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D + .type dsbd,_ASM_TYPE_OBJECT +dsbd: + .byte 0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D /* 0 */ .byte 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5 -END(dsbd_0) - - .type dsbd_1,_ASM_TYPE_OBJECT -dsbd_1: - .byte 0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C + .byte 0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C /* 1 */ .byte 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29 -END(dsbd_1) +END(dsbd) - .type dsbb_0,_ASM_TYPE_OBJECT -dsbb_0: - .byte 0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Sep 8 23:58:09 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: aesarmv8: Reallocate registers to shave off unnecessary MOV. To generate a diff of this commit: cvs rdiff -u -r1.14 -r1.15 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.14 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.15 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.14 Tue Sep 8 23:57:43 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Tue Sep 8 23:58:09 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $") +RCSID("$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $") .arch_extension aes @@ -917,13 +917,12 @@ END(aesarmv8_cbcmac_update1) ENTRY(aesarmv8_ccm_enc1) stp fp, lr, [sp, #-16]! 
/* push stack frame */ mov fp, sp - ld1 {v0.16b, v1.16b}, [x4] /* q0 := auth, q2 := ctr (be) */ - mov v2.16b, v1.16b + ld1 {v0.16b-v1.16b}, [x4] /* q0 := auth, q1 := ctr (be) */ adrl x11, ctr32_inc /* x11 := _inc */ ld1 {v5.4s}, [x11] /* q5 := (0,0,0,1) (host-endian) */ mov x9, x0 /* x9 := enckey */ mov x10, x3 /* x10 := nbytes */ - rev32 v2.16b, v2.16b /* q2 := ctr (host-endian) */ + rev32 v2.16b, v1.16b /* q2 := ctr (host-endian) */ _ALIGN_TEXT 1: ld1 {v3.16b}, [x1], #0x10 /* q3 := plaintext block */ add v2.4s, v2.4s, v5.4s /* increment ctr (32-bit) */ @@ -937,9 +936,8 @@ ENTRY(aesarmv8_ccm_enc1) subs x10, x10, #0x10 /* count down bytes */ st1 {v3.16b}, [x2], #0x10 /* store ciphertext block */ b.ne 1b /* repeat if more blocks */ - rev32 v2.16b, v2.16b /* q2 := ctr (big-endian) */ - mov v1.16b, v2.16b /* store updated auth/ctr */ - st1 {v0.16b-v1.16b}, [x4] + rev32 v1.16b, v2.16b /* q1 := ctr (big-endian) */ + st1 {v0.16b-v1.16b}, [x4] /* store updated auth/ctr */ ldp fp, lr, [sp], #16 /* pop stack frame */ ret END(aesarmv8_ccm_enc1)
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Sep 8 23:57:43 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: aesarmv8: Issue two 4-register ld/st, not four 2-register ld/st. To generate a diff of this commit: cvs rdiff -u -r1.13 -r1.14 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.13 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.14 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.13 Tue Sep 8 23:57:13 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Tue Sep 8 23:57:43 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.13 2020/09/08 23:57:13 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_armv8_64.S,v 1.13 2020/09/08 23:57:13 riastradh Exp $") +RCSID("$NetBSD: aes_armv8_64.S,v 1.14 2020/09/08 23:57:43 riastradh Exp $") .arch_extension aes @@ -693,10 +693,8 @@ ENTRY(aesarmv8_xts_enc8) mov v30.16b, v31.16b /* q30 := tweak[6] */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ /* q31 := tweak[7] */ - ld1 {v0.16b,v1.16b}, [x1], #0x20 /* q[i] := ptxt[i] */ - ld1 {v2.16b,v3.16b}, [x1], #0x20 - ld1 {v4.16b,v5.16b}, [x1], #0x20 - ld1 {v6.16b,v7.16b}, [x1], #0x20 + ld1 {v0.16b-v3.16b}, [x1], #0x40 /* q[i] := ptxt[i] */ + ld1 {v4.16b-v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v24.16b /* q[i] := ptxt[i] ^ tweak[i] */ eor v1.16b, v1.16b, v25.16b eor v2.16b, v2.16b, v26.16b @@ -716,10 +714,8 @@ ENTRY(aesarmv8_xts_enc8) eor v5.16b, v5.16b, v29.16b eor v6.16b, v6.16b, v30.16b eor v7.16b, v7.16b, v31.16b - st1 {v0.16b,v1.16b}, [x2], #0x20 /* store ciphertext blocks */ - st1 {v2.16b,v3.16b}, [x2], #0x20 - st1 {v4.16b,v5.16b}, [x2], #0x20 - st1 {v6.16b,v7.16b}, [x2], #0x20 + 
st1 {v0.16b-v3.16b}, [x2], #0x40 /* store ciphertext blocks */ + st1 {v4.16b-v7.16b}, [x2], #0x40 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ subs x10, x10, #0x80 /* count down nbytes */ b.ne 1b /* repeat if more block groups */
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Sep 8 23:57:43 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: aesarmv8: Issue two 4-register ld/st, not four 2-register ld/st. To generate a diff of this commit: cvs rdiff -u -r1.13 -r1.14 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Sep 8 23:57:13 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: aesarmv8: Adapt aes_armv8_64.S to big-endian. Patch mainly from (and tested by) jakllsch@ with minor tweaks by me. To generate a diff of this commit: cvs rdiff -u -r1.12 -r1.13 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Sep 8 23:58:09 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: aesarmv8: Reallocate registers to shave off unnecessary MOV. To generate a diff of this commit: cvs rdiff -u -r1.14 -r1.15 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Sep 8 23:57:13 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: aesarmv8: Adapt aes_armv8_64.S to big-endian. Patch mainly from (and tested by) jakllsch@ with minor tweaks by me. To generate a diff of this commit: cvs rdiff -u -r1.12 -r1.13 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.12 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.13 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.12 Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Tue Sep 8 23:57:13 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.13 2020/09/08 23:57:13 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $") +RCSID("$NetBSD: aes_armv8_64.S,v 1.13 2020/09/08 23:57:13 riastradh Exp $") .arch_extension aes @@ -114,11 +114,11 @@ END(unshiftrows_rotword_3) * Standard ABI calling convention. */ ENTRY(aesarmv8_setenckey128) - ldr q1, [x1] /* q1 := master key */ + ld1 {v1.16b}, [x1] /* q1 := master key */ adrl x4, unshiftrows_rotword_3 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ - ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */ + ld1 {v16.16b}, [x4] /* q16 := unshiftrows_rotword_3 table */ str q1, [x0], #0x10 /* store master key as first round key */ mov x2, #10 /* round count */ @@ -171,14 +171,14 @@ END(aesarmv8_setenckey128) * Standard ABI calling convention. 
*/ ENTRY(aesarmv8_setenckey192) - ldr q1, [x1], #0x10 /* q1 := master key[0:128) */ - ldr d2, [x1] /* d2 := master key[128:192) */ + ld1 {v1.16b}, [x1], #0x10 /* q1 := master key[0:128) */ + ld1 {v2.8b}, [x1] /* d2 := master key[128:192) */ adrl x4, unshiftrows_rotword_1 adrl x5, unshiftrows_rotword_3 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ - ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */ - ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */ + ld1 {v16.16b}, [x4] /* q16 := unshiftrows_rotword_1 */ + ld1 {v17.16b}, [x5] /* q17 := unshiftrows_rotword_3 */ str q1, [x0], #0x10 /* store master key[0:128) as round key */ mov x2, #12 /* round count */ @@ -351,13 +351,13 @@ END(aesarmv8_setenckey192) */ ENTRY(aesarmv8_setenckey256) /* q1 := key[0:128), q2 := key[128:256) */ - ldp q1, q2, [x1], #0x20 + ld1 {v1.16b-v2.16b}, [x1], #0x20 adrl x4, unshiftrows_rotword_3 adrl x5, unshiftrows_3 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ - ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */ - ldr q17, [x5] /* q17 := unshiftrows_3 */ + ld1 {v16.16b}, [x4] /* q16 := unshiftrows_rotword_3 */ + ld1 {v17.16b}, [x5] /* q17 := unshiftrows_3 */ /* store master key as first two round keys */ stp q1, q2, [x0], #0x20 @@ -461,9 +461,9 @@ END(aesarmv8_enctodec) ENTRY(aesarmv8_enc) stp fp, lr, [sp, #-16]! /* push stack frame */ mov fp, sp - ldr q0, [x1] /* q0 := ptxt */ + ld1 {v0.16b}, [x1] /* q0 := ptxt */ bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ - str q0, [x2] /* store ctxt */ + st1 {v0.16b}, [x2] /* store ctxt */ ldp fp, lr, [sp], #16 /* pop stack frame */ ret END(aesarmv8_enc) @@ -479,9 +479,9 @@ END(aesarmv8_enc) ENTRY(aesarmv8_dec) stp fp, lr, [sp, #-16]! 
/* push stack frame */ mov fp, sp - ldr q0, [x1] /* q0 := ctxt */ + ld1 {v0.16b}, [x1] /* q0 := ctxt */ bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */ - str q0, [x2] /* store ptxt */ + st1 {v0.16b}, [x2] /* store ptxt */ ldp fp, lr, [sp], #16 /* pop stack frame */ ret END(aesarmv8_dec) @@ -503,17 +503,17 @@ ENTRY(aesarmv8_cbc_enc) mov fp, sp mov x9, x0 /* x9 := enckey */ mov x10, x3 /* x10 := nbytes */ - ldr q0, [x4] /* q0 := chaining value */ + ld1 {v0.16b}, [x4] /* q0 := chaining value */ _ALIGN_TEXT -1: ldr q1, [x1], #0x10 /* q1 := plaintext block */ +1: ld1 {v1.16b}, [x1], #0x10 /* q1 := plaintext block */ eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */ mov x0, x9 /* x0 := enckey */ mov x3, x5 /* x3 := nrounds */ bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ subs x10, x10, #0x10 /* count down nbytes */ - str q0, [x2], #0x10 /* store ciphertext block */ + st1 {v0.16b}, [x2], #0x10 /* store ciphertext block */ b.ne 1b /* repeat if x10 is nonzero */ - str q0, [x4] /* store chaining value */ + st1 {v0.16b}, [x4] /* store chaining value */ ldp fp, lr, [sp], #16 /* pop stack frame */ 2: ret END(aesarmv8_cbc_enc) @@ -533,18 +533,21 @@ END(aesarmv8_cbc_enc) ENTRY(aesarmv8_cbc_dec1) stp fp, lr, [sp, #-16]! /* push stack frame */ mov fp, sp - ldr q24, [x4] /* q24 := iv */ + ld1 {v24.16b}, [x4] /* q24 := iv */ mov x9, x0 /* x9 := enckey */
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sun Aug 16 18:02:03 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S files.aesneon Log Message: Fix AES NEON code for big-endian softfp ARM. ...which is how the kernel runs. Switch to using __SOFTFP__ for consistency with how it gets exposed to C, although I'm not sure how to get it defined automagically in the toolchain for .S files so that's set manually in files.aesneon for now. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon_32.S cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/files.aesneon Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.5 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.6 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.5 Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Sun Aug 16 18:02:03 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.5 2020/08/08 14:47:01 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -28,7 +28,7 @@ #include -RCSID("$NetBSD: aes_neon_32.S,v 1.5 2020/08/08 14:47:01 riastradh Exp $") +RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $") .fpu neon @@ -228,15 +228,19 @@ END(dsbo_1) * aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0, * unsigned nrounds@r1) * - * With -mfloat-abi=soft(fp) (here spelled `#ifdef _KERNEL'): + * With -mfloat-abi=soft(fp) (i.e., __SOFTFP__): * * uint8x16_t@(r0,r1,r2,r3) * aes_neon_enc1(const struct aesenc *enc@r0, * uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8]) */ ENTRY(aes_neon_enc1) -#ifdef _KERNEL +#ifdef __SOFTFP__ +#ifdef __ARM_BIG_ENDIAN + vmov d0, r3, r2 /* d0 := x lo */ +#else vmov d0, r2, r3 /* d0 := x lo */ +#endif vldr d1, [sp] /* d1 := x hi */ ldr r1, [sp, #8] /* r1 := nrounds */ #endif @@ -434,10 +438,15 @@ ENTRY(aes_neon_enc1) vpop {d8-d15} pop {r4, r5, r6, r7, r8, r10, r11, lr} -#ifdef _KERNEL +#ifdef __SOFTFP__ +#ifdef __ARM_BIG_ENDIAN + vmov r1, r0, d0 + vmov r3, r2, d1 +#else vmov r0, r1, d0 vmov r2, r3, d1 #endif +#endif bx lr END(aes_neon_enc1) @@ -457,8 +466,12 @@ END(aes_neon_enc1) * uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8]) */ ENTRY(aes_neon_dec1) -#ifdef _KERNEL +#ifdef __SOFTFP__ +#ifdef __ARM_BIG_ENDIAN + vmov d0, r3, r2 /* d0 := x lo */ +#else vmov d0, r2, r3 /* d0 := x lo */ +#endif vldr d1, [sp] /* d1 := x hi */ ldr r1, [sp, #8] /* r1 := nrounds */ #endif @@ -669,9 +682,14 @@ ENTRY(aes_neon_dec1) vpop {d8-d15} pop {r4, r5, r6, r7, r8, r10, r11, lr} -#ifdef _KERNEL +#ifdef __SOFTFP__ +#ifdef __ARM_BIG_ENDIAN + vmov r1, r0, d0 + vmov r3, r2, d1 +#else vmov r0, r1, d0 vmov r2, r3, d1 #endif +#endif bx lr END(aes_neon_dec1) Index: src/sys/crypto/aes/arch/arm/files.aesneon diff -u src/sys/crypto/aes/arch/arm/files.aesneon:1.3 src/sys/crypto/aes/arch/arm/files.aesneon:1.4 --- src/sys/crypto/aes/arch/arm/files.aesneon:1.3 Tue Jun 30 17:03:13 2020 +++ src/sys/crypto/aes/arch/arm/files.aesneon Sun Aug 16 18:02:03 2020 @@ -1,4 +1,4 @@ -# $NetBSD: 
files.aesneon,v 1.3 2020/06/30 17:03:13 riastradh Exp $ +# $NetBSD: files.aesneon,v 1.4 2020/08/16 18:02:03 riastradh Exp $ ifdef aarch64 makeoptions aes "COPTS.aes_neon.c"+="-march=armv8-a" @@ -8,6 +8,8 @@ makeoptions aes "COPTS.aes_neon.c"+="-mf makeoptions aes "COPTS.aes_neon_subr.c"+="-mfloat-abi=softfp -mfpu=neon" endif +makeoptions aes "AOPTS.aes_neon_32.S"+="-D__SOFTFP__" + file crypto/aes/arch/arm/aes_neon.c aes & (cpu_cortex | aarch64) file crypto/aes/arch/arm/aes_neon_impl.c aes & (cpu_cortex | aarch64) file crypto/aes/arch/arm/aes_neon_subr.c aes & (cpu_cortex | aarch64)
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sun Aug 16 18:02:03 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S files.aesneon Log Message: Fix AES NEON code for big-endian softfp ARM. ...which is how the kernel runs. Switch to using __SOFTFP__ for consistency with how it gets exposed to C, although I'm not sure how to get it defined automagically in the toolchain for .S files so that's set manually in files.aesneon for now. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon_32.S cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/files.aesneon Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sun Aug 9 02:00:57 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_subr.c Log Message: Nix outdated comment. I implemented this parallelism a couple weeks ago. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon_subr.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sun Aug 9 02:00:57 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_subr.c Log Message: Nix outdated comment. I implemented this parallelism a couple weeks ago. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_neon_subr.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.5 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.6 --- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.5 Sat Aug 8 14:47:01 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c Sun Aug 9 02:00:57 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_subr.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $ */ +/* $NetBSD: aes_neon_subr.c,v 1.6 2020/08/09 02:00:57 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -27,7 +27,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.6 2020/08/09 02:00:57 riastradh Exp $"); #ifdef _KERNEL #include @@ -287,12 +287,6 @@ aes_neon_cbcmac_update1(const struct aes storeblock(auth0, auth); } -/* - * XXX On aarch64, we have enough registers that we should be able to - * pipeline two simultaneous vpaes computations in an `aes_neon_enc2' - * function, which should substantially improve CCM throughput. - */ - void aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jul 28 20:11:09 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon.c aes_neon_impl.h aes_neon_subr.c arm_neon.h Log Message: Draft 2x vectorized neon vpaes for aarch64. Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in cgd tests, for parallelizable operations like CBC decryption; same improvement should probably carry over to rpi4 CPU which lacks ARMv8.0-AES. To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c \ src/sys/crypto/aes/arch/arm/aes_neon_subr.c cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon.c diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.3 src/sys/crypto/aes/arch/arm/aes_neon.c:1.4 --- src/sys/crypto/aes/arch/arm/aes_neon.c:1.3 Tue Jun 30 20:32:11 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon.c Tue Jul 28 20:11:09 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */ +/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -39,7 +39,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $"); #include @@ -589,6 +589,59 @@ aes_neon_enc1(const struct aesenc *enc, return vqtbl1q_u8(x, sr[rmod4]); } +uint8x16x2_t +aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds) +{ + const uint32_t *rk32 = enc->aese_aes.aes_rk; + uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; + uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; + uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0]; + uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1]; + uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0]; + uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1]; + uint8x16_t x0 = x.val[0], x1 = x.val[1]; + uint8x16_t io0, jo0, io1, jo1; + unsigned rmod4 = 0; + + x0 = aes_schedule_transform(x0, ipt); + x1 = aes_schedule_transform(x1, ipt); + x0 ^= loadroundkey(rk32); + x1 ^= loadroundkey(rk32); + for (;;) { + uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0; + uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1; + + subbytes(&io0, &jo0, x0, inv_, inva_); + subbytes(&io1, &jo1, x1, inv_, inva_); + + rk32 += 4; + rmod4 = (rmod4 + 1) % 4; + if (--nrounds == 0) + break; + + A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0); + A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1); + A_0 ^= loadroundkey(rk32); + A_1 ^= loadroundkey(rk32); + A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0); + A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1); + A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]); + A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]); + A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]); + A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]); + x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]); + x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]); + } + x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0); + x1 = vqtbl1q_u8(sbo[0], io1) ^ 
vqtbl1q_u8(sbo[1], jo1); + x0 ^= loadroundkey(rk32); + x1 ^= loadroundkey(rk32); + return (uint8x16x2_t) { .val = { + [0] = vqtbl1q_u8(x0, sr[rmod4]), + [1] = vqtbl1q_u8(x1, sr[rmod4]), + } }; +} + uint8x16_t aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) { @@ -628,4 +681,60 @@ aes_neon_dec1(const struct aesdec *dec, return vqtbl1q_u8(x, sr[i]); } +uint8x16x2_t +aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds) +{ + const uint32_t *rk32 = dec->aesd_aes.aes_rk; + unsigned i = 3 & ~(nrounds - 1); + uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; + uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; + uint8x16_t x0 = x.val[0], x1 = x.val[1]; + uint8x16_t io0, jo0, io1, jo1, mc; + + x0 = aes_schedule_transform(x0, dipt); + x1 = aes_schedule_transform(x1, dipt); + x0 ^= loadroundkey(rk32); + x1 ^= loadroundkey(rk32); + rk32 += 4; + + mc = mc_forward[3]; + for (;;) { + subbytes(&io0, &jo0, x0, inv_, inva_); + subbytes(&io1, &jo1, x1, inv_, inva_); + if (--nrounds == 0) + break; + + x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0); + x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1); + x0 ^= loadroundkey(rk32); + x1 ^= loadroundkey(rk32); + rk32 += 4;/* next round key */ + + x0 = vqtbl1q_u8(x0, mc); + x1 = vqtbl1q_u8(x1, mc); + x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0); + x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1); + + x0 = vqtbl1q_u8(x0, mc); + x1 = vqtbl1q_u8(x1, mc); + x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0); + x1 ^= vqtbl1q_u8(dsbb[0],
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jul 28 20:11:09 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon.c aes_neon_impl.h aes_neon_subr.c arm_neon.h Log Message: Draft 2x vectorized neon vpaes for aarch64. Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in cgd tests, for parallelizable operations like CBC decryption; same improvement should probably carry over to rpi4 CPU which lacks ARMv8.0-AES. To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c \ src/sys/crypto/aes/arch/arm/aes_neon_subr.c cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jul 27 20:54:12 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Issue aese/aesmc and aesd/aesimc in pairs. Advised by the aarch64 optimization guide; increases cgd throughput by about 10%. To generate a diff of this commit: cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jul 27 20:54:12 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Issue aese/aesmc and aesd/aesimc in pairs. Advised by the aarch64 optimization guide; increases cgd throughput by about 10%. To generate a diff of this commit: cvs rdiff -u -r1.9 -r1.10 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.10 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.9 Mon Jul 27 20:53:22 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Mon Jul 27 20:54:11 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.9 2020/07/27 20:53:22 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.10 2020/07/27 20:54:11 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -1041,15 +1041,18 @@ END(ctr32_inc) .type aesarmv8_enc1,@function aesarmv8_enc1: ldr q16, [x0], #0x10 /* load round key */ - b 2f + sub x3, x3, #1 _ALIGN_TEXT -1: /* q0 := MixColumns(q0) */ +1: /* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0 */ + aese v0.16b, v16.16b aesmc v0.16b, v0.16b -2: subs x3, x3, #1 + ldr q16, [x0], #0x10 + subs x3, x3, #1 + b.ne 1b /* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */ aese v0.16b, v16.16b - ldr q16, [x0], #0x10 /* load next round key */ - b.ne 1b + ldr q16, [x0] /* load last round key */ + /* q0 := AddRoundKey_q16(q0) */ eor v0.16b, v0.16b, v16.16b ret END(aesarmv8_enc1) @@ -1067,17 +1070,21 @@ END(aesarmv8_enc1) .type aesarmv8_enc2,@function aesarmv8_enc2: ldr q16, [x0], #0x10 /* load round key */ - b 2f + sub x3, x3, #1 _ALIGN_TEXT -1: /* q[i] := MixColumns(q[i]) */ +1: /* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i] */ + aese v0.16b, v16.16b aesmc v0.16b, v0.16b + aese v1.16b, v16.16b aesmc v1.16b, v1.16b -2: subs x3, x3, #1 + ldr q16, [x0], #0x10 /* load next round key */ + subs x3, x3, #1 + b.ne 1b /* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */ aese v0.16b, v16.16b aese v1.16b, v16.16b - ldr q16, [x0], #0x10 /* load next round key */ - b.ne 1b + ldr q16, [x0] /* load last round key */ + /* q[i] := AddRoundKey_q16(q[i]) */ eor v0.16b, v0.16b, v16.16b eor v1.16b, v1.16b, v16.16b ret @@ -1097,18 +1104,28 @@ END(aesarmv8_enc2) .type aesarmv8_enc8,@function aesarmv8_enc8: ldr q16, [x0], #0x10 /* load round key */ - b 2f + sub x3, x3, #1 _ALIGN_TEXT -1: /* q[i] := MixColumns(q[i]) */ +1: /* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i] */ + aese v0.16b, v16.16b aesmc v0.16b, v0.16b + aese v1.16b, v16.16b aesmc v1.16b, v1.16b + aese v2.16b, v16.16b aesmc v2.16b, v2.16b + aese v3.16b, v16.16b aesmc v3.16b, v3.16b + aese v4.16b, v16.16b aesmc v4.16b, v4.16b + aese v5.16b, v16.16b aesmc v5.16b, v5.16b + aese v6.16b, v16.16b aesmc v6.16b, v6.16b + aese v7.16b, v16.16b aesmc 
v7.16b, v7.16b -2: subs x3, x3, #1 + ldr q16, [x0], #0x10 /* load next round key */ + subs x3, x3, #1 + b.ne 1b /* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */ aese v0.16b, v16.16b aese v1.16b, v16.16b @@ -1118,9 +1135,9 @@ aesarmv8_enc8: aese v5.16b, v16.16b aese v6.16b, v16.16b aese v7.16b, v16.16b - ldr q16, [x0], #0x10 /* load next round key */ - b.ne 1b - eor v0.16b, v0.16b, v16.16b /* AddRoundKey */ + ldr q16, [x0] /* load last round key */ + /* q[i] := AddRoundKey_q16(q[i]) */ + eor v0.16b, v0.16b, v16.16b eor v1.16b, v1.16b, v16.16b eor v2.16b, v2.16b, v16.16b eor v3.16b, v3.16b, v16.16b @@ -1144,15 +1161,19 @@ END(aesarmv8_enc8) .type aesarmv8_dec1,@function aesarmv8_dec1: ldr q16, [x0], #0x10 /* load round key */ - b 2f + sub x3, x3, #1 _ALIGN_TEXT -1: /* q0 := InMixColumns(q0) */ - aesimc v0.16b, v0.16b -2: subs x3, x3, #1 - /* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */ +1: /* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */ aesd v0.16b, v16.16b + /* q0 := InMixColumns(q0) */ + aesimc v0.16b, v0.16b ldr q16, [x0], #0x10 /* load next round key */ + subs x3, x3, #1 b.ne 1b + /* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */ + aesd v0.16b, v16.16b + ldr q16, [x0] /* load last round key */ + /* q0 := AddRoundKey_q16(q0) */ eor v0.16b, v0.16b, v16.16b ret END(aesarmv8_dec1) @@ -1171,18 +1192,29 @@ END(aesarmv8_dec1) .type aesarmv8_dec8,@function aesarmv8_dec8: ldr q16, [x0], #0x10 /* load round key */ - b 2f + sub x3, x3, #1 _ALIGN_TEXT -1: /* q[i] := InMixColumns(q[i]) */ +1: /* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */ + aesd v0.16b, v16.16b + /* q[i] := InMixColumns(q[i]) */ aesimc v0.16b, v0.16b + aesd v1.16b, v16.16b aesimc v1.16b, v1.16b + aesd v2.16b, v16.16b aesimc v2.16b, v2.16b + aesd v3.16b, v16.16b aesimc v3.16b,
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jul 27 20:52:11 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: PIC for aes_neon_32.S. Without this, tests/sys/crypto/aes/t_aes fails to start on armv7 because of R_ARM_ABS32 relocations in a nonwritable text segment for a PIE -- which atf quietly ignores in the final report! Yikes. To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon_32.S diff -u src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.1 src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.2 --- src/sys/crypto/aes/arch/arm/aes_neon_32.S:1.1 Mon Jun 29 23:57:56 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_32.S Mon Jul 27 20:52:10 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_32.S,v 1.1 2020/06/29 23:57:56 riastradh Exp $ */ +/* $NetBSD: aes_neon_32.S,v 1.2 2020/07/27 20:52:10 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -30,8 +30,14 @@ .fpu neon + .text + .p2align 2 +.Lconstants_addr: + .long .Lconstants - . + .section .rodata .p2align 4 +.Lconstants: .type inv,_ASM_TYPE_OBJECT inv: @@ -239,7 +245,7 @@ ENTRY(aes_neon_enc1) * r3: rmod4 * r4: mc_forward * r5: mc_backward - * r6,r7,r8,r10,r11: temporaries + * r6,r7,r8,r10,r11,r12: temporaries * q0={d0-d1}: x/ak/A * q1={d2-d3}: 0x0f0f... * q2={d4-d5}: lo/k/j/io @@ -258,23 +264,30 @@ ENTRY(aes_neon_enc1) * q15={d30-d31}: A2_B/sr[rmod4] */ + /* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */ + ldr r12, .Lconstants_addr + adr r11, .Lconstants_addr + vld1.64 {d28-d29}, [r0 :128]! 
/* q14 = *rk++ */ movw r3, #0 vmov.i8 q1, #0x0f + /* r12 := .Lconstants */ + add r12, r12, r11 + /* (q4, q5) := (iptlo, ipthi) */ - ldr r6, =iptlo - ldr r7, =ipthi + add r6, r12, #(iptlo - .Lconstants) + add r7, r12, #(ipthi - .Lconstants) vld1.64 {d8-d9}, [r6 :128] vld1.64 {d10-d11}, [r7 :128] /* load the rest of the constants */ - ldr r4, =sb1_0 - ldr r5, =sb1_1 - ldr r6, =sb2_0 - ldr r7, =sb2_1 - ldr r8, =inv - ldr r10, =inva + add r4, r12, #(sb1_0 - .Lconstants) + add r5, r12, #(sb1_1 - .Lconstants) + add r6, r12, #(sb2_0 - .Lconstants) + add r7, r12, #(sb2_1 - .Lconstants) + add r8, r12, #(inv - .Lconstants) + add r10, r12, #(inva - .Lconstants) vld1.64 {d12-d13}, [r4 :128] /* q6 = sb1[0] */ vld1.64 {d14-d15}, [r5 :128] /* q7 = sb1[1] */ vld1.64 {d16-d17}, [r6 :128] /* q8 = sb2[0] */ @@ -283,8 +296,8 @@ ENTRY(aes_neon_enc1) vld1.64 {d22-d23}, [r10 :128] /* q11 = inva */ /* (r4, r5) := (_forward[0], _backward[0]) */ - ldr r4, =mc_forward - ldr r5, =mc_backward + add r4, r12, #(mc_forward - .Lconstants) + add r5, r12, #(mc_backward - .Lconstants) /* (q2, q3) := (lo, hi) */ vshr.u8 q3, q0, #4 @@ -392,9 +405,9 @@ ENTRY(aes_neon_enc1) bne 1b /* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */ - ldr r8, =sr - ldr r6, =sbo_0 - ldr r7, =sbo_1 + add r8, r12, #(sr - .Lconstants) + add r6, r12, #(sbo_0 - .Lconstants) + add r7, r12, #(sbo_1 - .Lconstants) add r8, r8, r3, lsl #4 vld1.64 {d12-d13}, [r6 :128] vld1.64 {d14-d15}, [r7 :128] @@ -469,23 +482,30 @@ ENTRY(aes_neon_dec1) * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)] */ + /* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */ + ldr r12, .Lconstants_addr + adr r11, .Lconstants_addr + vld1.64 {d28-d29}, [r0 :128]! 
/* q14 = *rk++ */ rsb r3, r1, #0 /* r3 := ~(x - 1) = -x */ vmov.i8 q1, #0x0f and r3, r3, #3 /* r3 := 3 & ~(x - 1) */ + /* r12 := .Lconstants */ + add r12, r12, r11 + /* (q4, q5) := (diptlo, dipthi) */ - ldr r6, =diptlo - ldr r7, =dipthi + add r6, r12, #(diptlo - .Lconstants) + add r7, r12, #(dipthi - .Lconstants) vld1.64 {d8-d9}, [r6 :128] vld1.64 {d10-d11}, [r7 :128] /* load the rest of the constants */ - ldr r4, =dsbb_0 - ldr r5, =dsbb_1 - ldr r6, =inv - ldr r7, =inva - ldr r8, =.Lmc_forward_3 + add r4, r12, #(dsbb_0 - .Lconstants) + add r5, r12, #(dsbb_1 - .Lconstants) + add r6, r12, #(inv - .Lconstants) + add r7, r12, #(inva - .Lconstants) + add r8, r12, #(.Lmc_forward_3 - .Lconstants) vld1.64 {d12-d13}, [r4 :128] /* q6 := dsbb[0] */ vld1.64 {d14-d15}, [r5 :128] /* q7 := dsbb[1] */ vld1.64 {d20-d21}, [r6 :128] /* q10 := inv */ @@ -504,8 +524,8 @@ ENTRY(aes_neon_dec1) vtbl.8 d7, {d10-d11}, d7 /* load dsb9 */ - ldr r4, =dsb9_0 - ldr r5, =dsb9_1 + add r4, r12, #(dsb9_0 - .Lconstants) + add r5, r12, #(dsb9_1 - .Lconstants) vld1.64 {d8-d9}, [r4 :128] /* q4 := dsb9[0] */ vld1.64 {d10-d11}, [r5 :128] /* q5 := dsb9[1] */ @@ -516,7 +536,7 @@ ENTRY(aes_neon_dec1) b 2f 1: /* load dsbd */ - ldr r4, =dsbd_0 + add r4, r12, #(dsbd_0 - .Lconstants) vld1.64 {d16-d17}, [r4 :128]! /* q8 := dsbd[0] */ vld1.64 {d18-d19}, [r4 :128] /* q9 := dsbd[1] */ @@ -543,7 +563,7
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jul 27 20:52:11 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: PIC for aes_neon_32.S. Without this, tests/sys/crypto/aes/t_aes fails to start on armv7 because of R_ARM_ABS32 relocations in a nonwritable text segment for a PIE -- which atf quietly ignores in the final report! Yikes. To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sat Jul 25 22:43:01 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: Add 32-bit load, store, and shift intrinsics. vld1q_u32 vst1q_u32 vshlq_n_u32 vshrq_n_u32 To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sat Jul 25 22:42:31 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: Fix missing clang big-endian case. To generate a diff of this commit: cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sat Jul 25 22:42:31 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: Fix missing clang big-endian case. To generate a diff of this commit: cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/arm_neon.h diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.4 src/sys/crypto/aes/arch/arm/arm_neon.h:1.5 --- src/sys/crypto/aes/arch/arm/arm_neon.h:1.4 Sat Jul 25 22:36:06 2020 +++ src/sys/crypto/aes/arch/arm/arm_neon.h Sat Jul 25 22:42:31 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.4 2020/07/25 22:36:06 riastradh Exp $ */ +/* $NetBSD: arm_neon.h,v 1.5 2020/07/25 22:42:31 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -237,7 +237,12 @@ vld1q_u8(const uint8_t *__p8) return (uint8x16_t)__builtin_neon_vld1v16qi(__p); #endif #elif defined(__clang__) - return (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); + uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); +#endif + return __v; #endif } @@ -442,7 +447,7 @@ vst1q_u8(uint8_t *__p8, uint8x16_t __v) #elif defined(__clang__) #ifndef __LITTLE_ENDIAN__ __v = __builtin_shufflevector(__v, __v, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); #endif __builtin_neon_vst1q_v(__p8, __v, 48); #endif
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sat Jul 25 22:43:01 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: Add 32-bit load, store, and shift intrinsics. vld1q_u32 vst1q_u32 vshlq_n_u32 vshrq_n_u32 To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/arm_neon.h diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.5 src/sys/crypto/aes/arch/arm/arm_neon.h:1.6 --- src/sys/crypto/aes/arch/arm/arm_neon.h:1.5 Sat Jul 25 22:42:31 2020 +++ src/sys/crypto/aes/arch/arm/arm_neon.h Sat Jul 25 22:43:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.5 2020/07/25 22:42:31 riastradh Exp $ */ +/* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -222,6 +222,30 @@ vgetq_lane_u32(uint32x4_t __v, uint8_t _ #endif _INTRINSATTR +static __inline uint32x4_t +vld1q_u32(const uint32_t *__p32) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + const __builtin_aarch64_simd_si *__p = + (const __builtin_aarch64_simd_si *)__p32; + + return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); +#else + const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; + + return (uint32x4_t)__builtin_neon_vld1v4si(__p); +#endif +#elif defined(__clang__) + uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, 3,2,1,0); +#endif + return __v; +#endif +} + +_INTRINSATTR static __inline uint8x16_t vld1q_u8(const uint8_t *__p8) { @@ -383,6 +407,38 @@ vsetq_lane_u64(uint64_t __x, uint64x2_t #if defined(__GNUC__) && !defined(__clang__) _INTRINSATTR +static __inline uint32x4_t +vshlq_n_u32(uint32x4_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, 
__bits); +#else + return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshlq_n_u32(__v, __bits) \ + (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR +static __inline uint32x4_t +vshrq_n_u32(uint32x4_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); +#else + return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); +#endif +} +#elif defined(__clang__) +#define vshrq_n_u8(__v, __bits) \ + (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +_INTRINSATTR static __inline uint8x16_t vshrq_n_u8(uint8x16_t __v, uint8_t __bits) { @@ -432,6 +488,28 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t _INTRINSATTR static __inline void +vst1q_u32(uint32_t *__p32, uint32x4_t __v) +{ +#if defined(__GNUC__) && !defined(__clang__) +#ifdef __aarch64__ + __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; + + __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); +#else + __builtin_neon_si *__p = (__builtin_neon_si *)__p32; + + __builtin_neon_vst1v4si(__p, (int32x4_t)__v); +#endif +#elif defined(__clang__) +#ifndef __LITTLE_ENDIAN__ + __v = __builtin_shufflevector(__v, __v, 3,2,1,0); +#endif + __builtin_neon_vst1q_v(__p32, __v, 50); +#endif +} + +_INTRINSATTR +static __inline void vst1q_u8(uint8_t *__p8, uint8x16_t __v) { #if defined(__GNUC__) && !defined(__clang__)
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sat Jul 25 22:36:06 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon.h aes_neon_impl.c aes_neon_subr.c arm_neon.h Log Message: Implement AES-CCM with NEON. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_neon.h \ src/sys/crypto/aes/arch/arm/aes_neon_subr.c cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_impl.c \ src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon.h diff -u src/sys/crypto/aes/arch/arm/aes_neon.h:1.2 src/sys/crypto/aes/arch/arm/aes_neon.h:1.3 --- src/sys/crypto/aes/arch/arm/aes_neon.h:1.2 Sat Jul 25 22:12:57 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon.h Sat Jul 25 22:36:06 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon.h,v 1.2 2020/07/25 22:12:57 riastradh Exp $ */ +/* $NetBSD: aes_neon.h,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -59,6 +59,12 @@ void aes_neon_xts_enc(const struct aesen uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); void aes_neon_xts_dec(const struct aesdec *, const uint8_t[static 16], uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_neon_cbcmac_update1(const struct aesenc *, const uint8_t[static 16], +size_t, uint8_t[static 16], uint32_t); +void aes_neon_ccm_enc1(const struct aesenc *, const uint8_t[static 16], +uint8_t[static 16], size_t, uint8_t[static 32], uint32_t); +void aes_neon_ccm_dec1(const struct aesenc *, const uint8_t[static 16], +uint8_t[static 16], size_t, uint8_t[static 32], uint32_t); int aes_neon_selftest(void); Index: src/sys/crypto/aes/arch/arm/aes_neon_subr.c diff -u src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.2 src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.3 --- src/sys/crypto/aes/arch/arm/aes_neon_subr.c:1.2 Tue Jun 30 20:32:11 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c Sat Jul 25 22:36:06 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon_subr.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $ */ +/* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -27,7 +27,9 @@ */ #include -__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $"); + +#include #ifdef _KERNEL #include @@ -213,6 +215,89 @@ aes_neon_xts_dec(const struct aesdec *de storeblock(tweak, t); } +void +aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], +size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) +{ + uint8x16_t auth; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + auth = loadblock(auth0); + for (; nbytes; nbytes -= 16, in += 16) + auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds); + storeblock(auth0, auth); +} + +/* + * XXX On aarch64, we have enough registers that we should be able to + * pipeline two simultaneous vpaes computations in an `aes_neon_enc2' + * function, which should substantially improve CCM throughput. + */ + +#if _BYTE_ORDER == _LITTLE_ENDIAN +#define vbetoh32q_u8 vrev32q_u8 +#define vhtobe32q_u8 vrev32q_u8 +#elif _BYTE_ORDER == _BIG_ENDIAN +#define vbetoh32q_u8(x) (x) +#define vhtobe32q_u8(x) (x) +#else +#error what kind of endian are you anyway +#endif + +void +aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], +uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], +uint32_t nrounds) +{ + const uint32x4_t ctr32_inc = {0, 0, 0, 1}; + uint8x16_t auth, ptxt, ctr_be; + uint32x4_t ctr; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + auth = loadblock(authctr); + ctr_be = loadblock(authctr + 16); + ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + ptxt = loadblock(in); + auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); + ctr = vaddq_u32(ctr, ctr32_inc); + ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); + storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds)); + } + storeblock(authctr, auth); + storeblock(authctr + 16, ctr_be); +} + +void 
+aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], +uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], +uint32_t nrounds) +{ + const uint32x4_t ctr32_inc = {0, 0, 0, 1}; + uint8x16_t auth, ctr_be, ptxt; + uint32x4_t ctr; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + auth = loadblock(authctr); + ctr_be = loadblock(authctr + 16); + ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + ctr = vaddq_u32(ctr, ctr32_inc); + ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); + ptxt = loadblock(in) ^ aes_neon_enc1(enc,
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sat Jul 25 22:36:06 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon.h aes_neon_impl.c aes_neon_subr.c arm_neon.h Log Message: Implement AES-CCM with NEON. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_neon.h \ src/sys/crypto/aes/arch/arm/aes_neon_subr.c cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_impl.c \ src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sat Jul 25 22:32:09 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Invert some loops to save a branch instruction on every iteration. To generate a diff of this commit: cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.6 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.7 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.6 Wed Jul 22 06:15:21 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Sat Jul 25 22:32:09 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.6 2020/07/22 06:15:21 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.7 2020/07/25 22:32:09 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -437,13 +437,13 @@ END(aesarmv8_setenckey256) */ ENTRY(aesarmv8_enctodec) ldr q0, [x0, x2, lsl #4] /* load last round key */ -1: str q0, [x1], #0x10 /* store round key */ + b 2f +1: aesimc v0.16b, v0.16b /* convert encryption to decryption */ +2: str q0, [x1], #0x10 /* store round key */ subs x2, x2, #1 /* count down round */ ldr q0, [x0, x2, lsl #4] /* load previous round key */ - b.eq 2f /* stop if this is the last one */ - aesimc v0.16b, v0.16b /* convert encryption to decryption */ - b 1b -2: str q0, [x1] /* store first round key verbatim */ + b.ne 1b /* repeat if there's more */ + str q0, [x1] /* store first round key verbatim */ ret END(aesarmv8_enctodec) @@ -536,17 +536,17 @@ ENTRY(aesarmv8_cbc_dec1) add x2, x2, x3 /* x2 := pointer past end of out */ ldr q0, [x1, #-0x10]! 
/* q0 := last ciphertext block */ str q0, [x4] /* update iv */ -1: mov x0, x9 /* x0 := enckey */ - mov x3, x5 /* x3 := nrounds */ - bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */ - subs x10, x10, #0x10 /* count down nbytes */ - b.eq 2f /* stop if this is the first block */ - ldr q31, [x1, #-0x10]! /* q31 := chaining value */ + b 2f +1: ldr q31, [x1, #-0x10]! /* q31 := chaining value */ eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */ str q0, [x2, #-0x10]! /* store plaintext block */ mov v0.16b, v31.16b /* move cv = ciphertext block */ - b 1b -2: eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */ +2: mov x0, x9 /* x0 := enckey */ + mov x3, x5 /* x3 := nrounds */ + bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */ + subs x10, x10, #0x10 /* count down nbytes */ + b.ne 1b /* repeat if more blocks */ + eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */ str q0, [x2, #-0x10]! /* store first plaintext block */ ldp fp, lr, [sp], #16 /* pop stack frame */ ret @@ -573,7 +573,11 @@ ENTRY(aesarmv8_cbc_dec8) add x2, x2, x3 /* x2 := pointer past end of out */ ldp q6, q7, [x1, #-0x20]! /* q6, q7 := last ciphertext blocks */ str q7, [x4] /* update iv */ -1: ldp q4, q5, [x1, #-0x20]! + b 2f +1: ldp q6, q7, [x1, #-0x20]! + eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */ + stp q0, q1, [x2, #-0x20]! +2: ldp q4, q5, [x1, #-0x20]! ldp q2, q3, [x1, #-0x20]! ldp q0, q1, [x1, #-0x20]! mov v31.16b, v6.16b /* q[24+i] := cv[i], 0
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Sat Jul 25 22:32:09 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Invert some loops to save a branch instruction on every iteration. To generate a diff of this commit: cvs rdiff -u -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: ryo Date: Thu Jul 23 11:33:01 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: fix build with llvm/clang. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/arm_neon.h diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.2 src/sys/crypto/aes/arch/arm/arm_neon.h:1.3 --- src/sys/crypto/aes/arch/arm/arm_neon.h:1.2 Tue Jun 30 21:24:00 2020 +++ src/sys/crypto/aes/arch/arm/arm_neon.h Thu Jul 23 11:33:01 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.2 2020/06/30 21:24:00 riastradh Exp $ */ +/* $NetBSD: arm_neon.h,v 1.3 2020/07/23 11:33:01 ryo Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -65,7 +65,7 @@ typedef struct { uint8x8_t val[2]; } uin #elif defined(__clang__) #define _INTRINSATTR \ - __attribute__((__always_inline__, __nodebug)) + __attribute__((__always_inline__, __nodebug__)) typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: ryo Date: Thu Jul 23 11:33:01 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: fix build with llvm/clang. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Wed Jul 22 06:15:21 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Fix register name in comment. Some time ago I reallocated the registers to avoid inadvertently clobbering the callee-saves v9, but neglected to update the comment. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Wed Jul 22 06:15:21 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Fix register name in comment. Some time ago I reallocated the registers to avoid inadvertently clobbering the callee-saves v9, but neglected to update the comment. To generate a diff of this commit: cvs rdiff -u -r1.5 -r1.6 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.5 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.6 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.5 Sun Jul 19 07:32:43 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Wed Jul 22 06:15:21 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.5 2020/07/19 07:32:43 ryo Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.6 2020/07/22 06:15:21 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -827,7 +827,7 @@ aesarmv8_xts_mulx: * carried into x^128 = x^7 + x^2 + x + 1. */ adrl x0, xtscarry - cmlt v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */ + cmlt v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */ ldr q0, [x0] /* q0 := xtscarry */ ext v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */ shl v31.2d, v31.2d, #1 /* shift */
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: ryo Date: Sun Jul 19 07:32:43 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: fix build with clang/llvm. clang aarch64 assembler doesn't accept optional number of lanes of vector register. (but ARMARM says that an assembler must accept it) To generate a diff of this commit: cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: ryo Date: Sun Jul 19 07:32:43 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: fix build with clang/llvm. clang aarch64 assembler doesn't accept optional number of lanes of vector register. (but ARMARM says that an assembler must accept it) To generate a diff of this commit: cvs rdiff -u -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.4 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.5 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.4 Tue Jun 30 23:06:02 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Sun Jul 19 07:32:43 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.5 2020/07/19 07:32:43 ryo Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -238,8 +238,8 @@ ENTRY(aesarmv8_setenckey192) */ /* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */ - dup v1.4s, v5.4s[3] - mov v1.4s[0], v5.4s[2] + dup v1.4s, v5.s[3] + mov v1.s[0], v5.s[2] /* * v6.4s := (0, 0, rklo[0], rklo[1]) @@ -257,7 +257,7 @@ ENTRY(aesarmv8_setenckey192) * and v5.4s = (rk[2], rk[3], xxx, xxx). Set * v2.4s := (rk[0], rk[1], rk[2], rk[3]) */ - mov v2.2d[1], v5.2d[0] + mov v2.d[1], v5.d[0] /* store two round keys */ stp q2, q3, [x0], #0x20 @@ -325,7 +325,7 @@ ENTRY(aesarmv8_setenckey192) ext v5.16b, v0.16b, v4.16b, #12 /* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */ - dup v2.4s, v1.4s[3] + dup v2.4s, v1.s[3] /* * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jun 30 23:06:02 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Reallocate registers to avoid abusing callee-saves registers, v8-v15. Forgot to consult the AAPCS before committing this before -- oops! While here, take advantage of the 32 aarch64 simd registers to avoid all stack spills. To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.4 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3 Tue Jun 30 21:53:39 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Tue Jun 30 23:06:02 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -116,7 +116,7 @@ ENTRY(aesarmv8_setenckey128) adrl x4, unshiftrows_rotword_3 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ - ldr q8, [x4] /* q8 := unshiftrows_rotword_3 table */ + ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */ str q1, [x0], #0x10 /* store master key as first round key */ mov x2, #10 /* round count */ @@ -136,7 +136,7 @@ ENTRY(aesarmv8_setenckey128) /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ ld1r {v4.4s}, [x3], #4 - tbl v3.16b, {v3.16b}, v8.16b + tbl v3.16b, {v3.16b}, v16.16b eor v3.16b, v3.16b, v4.16b /* @@ -175,8 +175,8 @@ ENTRY(aesarmv8_setenckey192) adrl x4, unshiftrows_rotword_1 adrl x5, unshiftrows_rotword_3 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ - ldr q8, [x4] /* q8 := unshiftrows_rotword_1 */ - ldr q9, [x5] /* q9 := unshiftrows_rotword_3 */ + ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */ + ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */ str q1, [x0], #0x10 /* store master key[0:128) as round key */ mov x2, #12 /* round count */ @@ -197,7 +197,7 @@ ENTRY(aesarmv8_setenckey192) /* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */ ld1r {v4.4s}, [x3], #4 - tbl v3.16b, {v3.16b}, v8.16b + tbl v3.16b, {v3.16b}, v16.16b eor v3.16b, v3.16b, v4.16b /* @@ -269,8 +269,8 @@ ENTRY(aesarmv8_setenckey192) * q2 = rk * q3 = nrk * v5.4s = (rk[2], rk[3], nrk[0], nrk[1]) - * q8 = unshiftrows_rotword_1 - * q9 = unshiftrows_rotword_3 + * q16 = unshiftrows_rotword_1 + * q17 = unshiftrows_rotword_3 * * We have to compute, in q1: * @@ -294,7 +294,7 @@ ENTRY(aesarmv8_setenckey192) /* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */ ld1r {v4.4s}, [x3], #4 - tbl v1.16b, {v1.16b}, v9.16b + tbl v1.16b, {v1.16b}, v17.16b eor v1.16b, v1.16b, v4.16b /* @@ -354,8 +354,8 @@ ENTRY(aesarmv8_setenckey256) adrl x4, unshiftrows_rotword_3 adrl x5, unshiftrows_3 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ - ldr q8, [x4] /* q8 := unshiftrows_rotword_3 */ - ldr q9, [x5] /* q9 := unshiftrows_3 */ + ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */ + ldr q17, [x5] /* 
q17 := unshiftrows_3 */ /* store master key as first two round keys */ stp q1, q2, [x0], #0x20 @@ -376,7 +376,7 @@ ENTRY(aesarmv8_setenckey256) /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ ld1r {v4.4s}, [x3], #4 - tbl v3.16b, {v3.16b}, v8.16b + tbl v3.16b, {v3.16b}, v16.16b eor v3.16b, v3.16b, v4.16b /* @@ -402,7 +402,7 @@ ENTRY(aesarmv8_setenckey256) aese v3.16b, v0.16b /* v3.4s[i] := SubBytes(rk[3]) */ - tbl v3.16b, {v3.16b}, v9.16b + tbl v3.16b, {v3.16b}, v17.16b /* * v5.4s := (0,prk[0],prk[1],prk[2]) @@ -458,9 +458,9 @@ END(aesarmv8_enctodec) ENTRY(aesarmv8_enc) stp fp, lr, [sp, #-16]! /* push stack frame */ mov fp, sp - ldr q0, [x1] /* q0 := block */ - bl aesarmv8_enc1 - str q0, [x2] /* store block */ + ldr q0, [x1] /* q0 := ptxt */ + bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ + str q0, [x2] /* store ctxt */ ldp fp, lr, [sp], #16 /* pop stack frame */ ret END(aesarmv8_enc) @@ -476,9 +476,9 @@ END(aesarmv8_enc) ENTRY(aesarmv8_dec) stp fp, lr, [sp, #-16]! /* push stack frame */ mov fp, sp - ldr q0, [x1] /* q0 := block */ - bl aesarmv8_dec1 - str q0, [x2] /* store block */ + ldr q0, [x1] /* q0 := ctxt */ + bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */ + str q0, [x2] /* store ptxt */ ldp fp, lr, [sp], #16 /* pop stack frame */ ret END(aesarmv8_dec) @@ -505,7 +505,7 @@ ENTRY(aesarmv8_cbc_enc) eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */ mov x0, x9 /* x0 := enckey */ mov x3, x5 /* x3 := nrounds */ - bl aesarmv8_enc1 /* q0 := ciphertext block */ + bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ subs x10, x10, #0x10 /* count down nbytes */ str q0, [x2], #0x10 /* store ciphertext block */ b.ne 1b /* repeat if x10 is nonzero */ @@
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jun 30 23:06:02 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Reallocate registers to avoid abusing callee-saves registers, v8-v15. Forgot to consult the AAPCS before committing this before -- oops! While here, take advantage of the 32 aarch64 simd registers to avoid all stack spills. To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jun 30 21:53:39 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Use `.arch_extension aes' for aese/aesmc/aesd/aesimc. Unlike `.arch_extension crypto', this works with clang; both work with gas, so we'll go with this. Clang still can't handle aes_armv8_64.S yet -- it gets confused by dup and mov on lanes, but this makes progress. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jun 30 21:53:39 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_armv8_64.S Log Message: Use `.arch_extension aes' for aese/aesmc/aesd/aesimc. Unlike `.arch_extension crypto', this works with clang; both work with gas, so we'll go with this. Clang still can't handle aes_armv8_64.S yet -- it gets confused by dup and mov on lanes, but this makes progress. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/aes_armv8_64.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_armv8_64.S diff -u src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.2 src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.3 --- src/sys/crypto/aes/arch/arm/aes_armv8_64.S:1.2 Tue Jun 30 21:41:03 2020 +++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S Tue Jun 30 21:53:39 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_armv8_64.S,v 1.2 2020/06/30 21:41:03 riastradh Exp $ */ +/* $NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -28,7 +28,7 @@ #include - .arch_extension crypto + .arch_extension aes /* * uint32_t rcon[10]
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jun 30 21:24:00 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: Tweak clang neon intrinsics so they build. (this file is still a kludge) To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jun 30 21:24:00 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: arm_neon.h Log Message: Tweak clang neon intrinsics so they build. (this file is still a kludge) To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/arm_neon.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/arm_neon.h diff -u src/sys/crypto/aes/arch/arm/arm_neon.h:1.1 src/sys/crypto/aes/arch/arm/arm_neon.h:1.2 --- src/sys/crypto/aes/arch/arm/arm_neon.h:1.1 Mon Jun 29 23:56:31 2020 +++ src/sys/crypto/aes/arch/arm/arm_neon.h Tue Jun 30 21:24:00 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: arm_neon.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ +/* $NetBSD: arm_neon.h,v 1.2 2020/06/30 21:24:00 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -73,6 +73,8 @@ typedef __attribute__((neon_vector_type( typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; + +typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; typedef struct { uint8x8_t val[2]; } uint8x8x2_t; #ifdef __LITTLE_ENDIAN__ @@ -118,11 +120,11 @@ vdupq_n_u8(uint8_t __x) }; } +#if defined(__GNUC__) && !defined(__clang__) _INTRINSATTR static __inline uint32x4_t vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) { -#if defined(__GNUC__) && !defined(__clang__) #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) return __builtin_shuffle(__hi, __lo, (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); @@ -130,25 +132,31 @@ vextq_u32(uint32x4_t __lo, uint32x4_t __ return __builtin_shuffle(__lo, __hi, (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); #endif +} #elif defined(__clang__) #ifdef __LITTLE_ENDIAN__ - return __builtin_neon_vextq_v((int8x16_t)__lo, (int8x16_t)__hi, 
__i, - 50); -#else - uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo, 3, 2, 1, 0); - uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi, 3, 2, 1, 0); - uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, - (int8x16_t)__hi_r, __i, 50); - return __builtin_shufflevector(__r, __r, 3, 2, 1, 0); -#endif +#define vextq_u32(__lo, __hi, __i) \ + (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ + (int8x16_t)(__hi), (__i), 50) +#else +#define vextq_u32(__lo, __hi, __i) ( \ +{ \ + uint32x4_t __tlo = (__lo); \ + uint32x4_t __thi = (__hi); \ + uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ + uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ + uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ + (int8x16_t)__hi_r, __i, 50); \ + __builtin_shufflevector(__r, __r, 3,2,1,0); \ +}) +#endif /* __LITTLE_ENDIAN__ */ #endif -} +#if defined(__GNUC__) && !defined(__clang__) _INTRINSATTR static __inline uint8x16_t vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) { -#if defined(__GNUC__) && !defined(__clang__) #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) return __builtin_shuffle(__hi, __lo, (uint8x16_t) { @@ -166,38 +174,45 @@ vextq_u8(uint8x16_t __lo, uint8x16_t __h __i + 12, __i + 13, __i + 14, __i + 15, }); #endif +} #elif defined(__clang__) #ifdef __LITTLE_ENDIAN__ - return __builtin_neon_vextq_v((int8x16_t)__lo, (int8x16_t)__hi, __i, - 48); -#else - uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, - (int8x16_t)__hi_r, __i, 50); - return __builtin_shufflevector(__r, __r, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); -#endif +#define vextq_u8(__lo, __hi, __i) \ + (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ + (int8x16_t)(__hi), (__i), 48) 
+#else +#define vextq_u8(__lo, __hi, __i) ( \ +{ \ + uint8x16_t __tlo = (__lo); \ + uint8x16_t __thi = (__hi); \ + uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ + uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ + uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ + (int8x16_t)__hi_r, (__i), 48); \ + return __builtin_shufflevector(__r, __r, \ + 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ +}) +#endif /* __LITTLE_ENDIAN */ #endif -} +#if defined(__GNUC__) && !defined(__clang__) _INTRINSATTR static __inline
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jun 30 17:03:14 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: files.aesneon Log Message: Limit aes_neon to cpu_cortex | aarch64. We won't use it on any other systems, and it doesn't build without NEON anyway. Verified earmv7hf GENERIC, aarch64 GENERIC64, and earmv6 RPI2 all build with this. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/files.aesneon Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Tue Jun 30 17:03:14 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: files.aesneon Log Message: Limit aes_neon to cpu_cortex | aarch64. We won't use it on any other systems, and it doesn't build without NEON anyway. Verified earmv7hf GENERIC, aarch64 GENERIC64, and earmv6 RPI2 all build with this. To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/sys/crypto/aes/arch/arm/files.aesneon Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/files.aesneon diff -u src/sys/crypto/aes/arch/arm/files.aesneon:1.2 src/sys/crypto/aes/arch/arm/files.aesneon:1.3 --- src/sys/crypto/aes/arch/arm/files.aesneon:1.2 Mon Jun 29 23:57:56 2020 +++ src/sys/crypto/aes/arch/arm/files.aesneon Tue Jun 30 17:03:13 2020 @@ -1,4 +1,4 @@ -# $NetBSD: files.aesneon,v 1.2 2020/06/29 23:57:56 riastradh Exp $ +# $NetBSD: files.aesneon,v 1.3 2020/06/30 17:03:13 riastradh Exp $ ifdef aarch64 makeoptions aes "COPTS.aes_neon.c"+="-march=armv8-a" @@ -8,10 +8,8 @@ makeoptions aes "COPTS.aes_neon.c"+="-mf makeoptions aes "COPTS.aes_neon_subr.c"+="-mfloat-abi=softfp -mfpu=neon" endif -file crypto/aes/arch/arm/aes_neon.c aes -file crypto/aes/arch/arm/aes_neon_impl.c aes -file crypto/aes/arch/arm/aes_neon_subr.c aes +file crypto/aes/arch/arm/aes_neon.c aes & (cpu_cortex | aarch64) +file crypto/aes/arch/arm/aes_neon_impl.c aes & (cpu_cortex | aarch64) +file crypto/aes/arch/arm/aes_neon_subr.c aes & (cpu_cortex | aarch64) -ifndef aarch64 -file crypto/aes/arch/arm/aes_neon_32.S aes -endif +file crypto/aes/arch/arm/aes_neon_32.S aes & cpu_cortex & !aarch64
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jun 29 23:57:56 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon.c files.aesneon Added Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: Provide hand-written AES NEON assembly for arm32. gcc does a lousy job at compiling 128-bit NEON intrinsics on arm32; hand-writing it made it about 12x faster, by avoiding a zillion loads and stores to spill everything and the kitchen sink onto the stack. (But gcc does fine on aarch64, presumably because it has twice as many registers and doesn't have to deal with q2=d4/d5 overlapping.) To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon.c \ src/sys/crypto/aes/arch/arm/files.aesneon cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. Modified files: Index: src/sys/crypto/aes/arch/arm/aes_neon.c diff -u src/sys/crypto/aes/arch/arm/aes_neon.c:1.1 src/sys/crypto/aes/arch/arm/aes_neon.c:1.2 --- src/sys/crypto/aes/arch/arm/aes_neon.c:1.1 Mon Jun 29 23:56:31 2020 +++ src/sys/crypto/aes/arch/arm/aes_neon.c Mon Jun 29 23:57:56 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ +/* $NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. 
@@ -39,7 +39,7 @@ */ #include -__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.1 2020/06/29 23:56:31 riastradh Exp $"); +__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.2 2020/06/29 23:57:56 riastradh Exp $"); #include @@ -47,6 +47,12 @@ __KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v #include "aes_neon_impl.h" +#ifdef __aarch64__ +#define __aarch64_used +#else +#define __aarch64_used __unused +#endif + static const uint8x16_t mc_forward[4] = { {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04, @@ -58,7 +64,7 @@ mc_forward[4] = { {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00, 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08}, }, -mc_backward[4] = { +mc_backward[4] __aarch64_used = { {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06, 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E}, {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02, @@ -68,7 +74,7 @@ mc_backward[4] = { {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A, 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02}, }, -ipt[2] = { +ipt[2] __aarch64_used = { {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2, 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA}, {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C, @@ -80,55 +86,55 @@ opt[2] = { {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01, 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1}, }, -dipt[2] = { +dipt[2] __aarch64_used = { {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F, 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15}, {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86, 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12}, }, -sb1[2] = { +sb1[2] __aarch64_used = { {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1, 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5}, {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36, 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B}, }, -sb2[2] = { +sb2[2] __aarch64_used = { {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2, 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E}, {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69, 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2}, }, -sbo[2] = { +sbo[2] __aarch64_used = { {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0, 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15}, {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF, 
0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E}, }, -dsb9[2] = { +dsb9[2] __aarch64_used = { {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85, 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA}, {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0, 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72}, }, -dsbd[2] = { +dsbd[2] __aarch64_used = { {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D, 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5}, {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C, 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29}, }, -dsbb[2] = { +dsbb[2] __aarch64_used = { {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0, 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60}, {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1, 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3}, }, -dsbe[2] = { +dsbe[2] __aarch64_used = { {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46, 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22}, {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C, 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94}, }, -dsbo[2] = { +dsbo[2] __aarch64_used = { {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13, 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7}, {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12, @@ -164,7 +170,7 @@ deskew[2] = { {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F, 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28}, }, -sr[4] = { +sr[4] __aarch64_used = { {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}, {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03, @@ -533,6 +539,14 @@
CVS commit: src/sys/crypto/aes/arch/arm
Module Name:src Committed By: riastradh Date: Mon Jun 29 23:57:56 UTC 2020 Modified Files: src/sys/crypto/aes/arch/arm: aes_neon.c files.aesneon Added Files: src/sys/crypto/aes/arch/arm: aes_neon_32.S Log Message: Provide hand-written AES NEON assembly for arm32. gcc does a lousy job at compiling 128-bit NEON intrinsics on arm32; hand-writing it made it about 12x faster, by avoiding a zillion loads and stores to spill everything and the kitchen sink onto the stack. (But gcc does fine on aarch64, presumably because it has twice as many registers and doesn't have to deal with q2=d4/d5 overlapping.) To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon.c \ src/sys/crypto/aes/arch/arm/files.aesneon cvs rdiff -u -r0 -r1.1 src/sys/crypto/aes/arch/arm/aes_neon_32.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.