The branch stable/15 has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=28a0f0c845d1d54e0f785d101828d53a5938e3ec

commit 28a0f0c845d1d54e0f785d101828d53a5938e3ec
Author:     Robert Clausecker <[email protected]>
AuthorDate: 2025-10-10 17:40:49 +0000
Commit:     Robert Clausecker <[email protected]>
CommitDate: 2025-11-23 01:12:35 +0000

    lib/libmd: import md5 amd64 kernels
    
    Differential Revision:  https://reviews.freebsd.org/D45670
    Reviewed by:    imp
    Approved by:    markj (mentor)
    MFC after:      1 month
    
    (cherry picked from commit d92e987421001c365216b039f8c3303939c195f7)
---
 lib/libmd/Makefile            |   7 +
 lib/libmd/amd64/md5block.S    | 363 ++++++++++++++++++++++++++++++++++++++++++
 lib/libmd/amd64/md5dispatch.c |  41 +++++
 3 files changed, 411 insertions(+)

diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile
index 59a519a882af..c4ab767c8b2f 100644
--- a/lib/libmd/Makefile
+++ b/lib/libmd/Makefile
@@ -117,6 +117,13 @@ USE_ASM_SOURCES:=0
 .endif
 
 .if ${USE_ASM_SOURCES} != 0
+.if exists(${MACHINE_ARCH}/md5block.S)
+SRCS+= md5block.S
+CFLAGS+= -DMD5_ASM
+.if exists(${MACHINE_ARCH}/md5dispatch.c)
+SRCS+=  md5dispatch.c
+.endif
+.endif
 .if exists(${MACHINE_ARCH}/sha1block.S)
 SRCS+= sha1block.S
 CFLAGS+= -DSHA1_ASM
diff --git a/lib/libmd/amd64/md5block.S b/lib/libmd/amd64/md5block.S
new file mode 100644
index 000000000000..0dd594dd5dc2
--- /dev/null
+++ b/lib/libmd/amd64/md5block.S
@@ -0,0 +1,363 @@
+/*-
+ * Copyright (c) 2024, 2025 Robert Clausecker <[email protected]>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/asm.h>
+
+/* apply the round keys to the four round functions */
+.macro allrounds       rfn0, rfn1, rfn2, rfn3
+       \rfn0    0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
+       \rfn0    4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
+       \rfn0    8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+       \rfn0   12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+
+       \rfn1   16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+       \rfn1   20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+       \rfn1   24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+       \rfn1   28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+
+       \rfn2   32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+       \rfn2   36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+       \rfn2   40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+       \rfn2   44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+
+       \rfn3   48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+       \rfn3   52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+       \rfn3   56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+       \rfn3   60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+.endm
+
+       // md5block(MD5_CTX, buf, len)
+ENTRY(_libmd_md5block_baseline)
+.macro round   a, b, c, d, f, k, m, s
+       \f      %ebp, \b, \c, \d
+       add     $\k, \a                 // a + k[i]
+       add     ((\m)%16*4)(%rsi), \a   // a + k[i] + m[g]
+       add     %ebp, \a                // a + k[i] + m[g] + f
+       rol     $\s, \a
+       add     \b, \a
+.endm
+
+       // f = b ? c : d
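+       // (bitwise select: ((c ^ d) & b) ^ d picks c where b has 1-bits
+       // and d where b has 0-bits)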
+.macro f0      f, b, c, d
+       mov     \c, \f
+       xor     \d, \f
+       and     \b, \f
+       xor     \d, \f
+.endm
+
+       // f = d ? b : c
+.macro f1      f, b, c, d
+       mov     \c, \f
+       xor     \b, \f
+       and     \d, \f
+       xor     \c, \f
+.endm
+
+       // f = b ^ c ^ d
+.macro f2      f, b, c, d
+       mov     \c, \f
+       xor     \d, \f
+       xor     \b, \f
+.endm
+
+       // f = c ^ (b | ~d)
+.macro f3      f, b, c, d
+       mov     $-1, \f
+       xor     \d, \f
+       or      \b, \f
+       xor     \c, \f
+.endm
+
+       // do 4 rounds
+.macro rounds  f, p, q, s0, s1, s2, s3, k0, k1, k2, k3
+       round   %eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0
+       round   %edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1
+       round   %ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2
+       round   %ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3
+.endm
+
+       // do 4 rounds with f0, f1, f2, f3
+.macro rounds0 i, k0, k1, k2, k3
+       rounds  f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3
+.endm
+
+.macro rounds1 i, k0, k1, k2, k3
+       rounds  f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3
+.endm
+
+.macro rounds2 i, k0, k1, k2, k3
+       rounds  f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3
+.endm
+
+.macro rounds3 i, k0, k1, k2, k3
+       rounds  f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3
+.endm
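+
+       // The message word index follows the MD5 schedule: g = i,
+       // (5i + 1) mod 16, (3i + 5) mod 16 and 7i mod 16 for the four
+       // round groups; the round macro reduces the index mod 16 itself.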
+
+       push    %rbx
+       push    %rbp
+       push    %r12
+
+       and     $~63, %rdx              // length in blocks
+       lea     (%rsi, %rdx, 1), %r12   // end pointer
+
+       mov     (%rdi), %eax            // a
+       mov     4(%rdi), %ebx           // b
+       mov     8(%rdi), %ecx           // c
+       mov     12(%rdi), %edx          // d
+
+       cmp     %rsi, %r12              // any data to process?
+       je      .Lend
+
+       .balign 16
+.Lloop:        mov     %eax, %r8d
+       mov     %ebx, %r9d
+       mov     %ecx, %r10d
+       mov     %edx, %r11d
+
+       allrounds       rounds0, rounds1, rounds2, rounds3
+
+       add     %r8d, %eax
+       add     %r9d, %ebx
+       add     %r10d, %ecx
+       add     %r11d, %edx
+
+       add     $64, %rsi
+       cmp     %rsi, %r12
+       jne     .Lloop
+
+       mov     %eax, (%rdi)
+       mov     %ebx, 4(%rdi)
+       mov     %ecx, 8(%rdi)
+       mov     %edx, 12(%rdi)
+
+.Lend: pop     %r12
+       pop     %rbp
+       pop     %rbx
+       ret
+END(_libmd_md5block_baseline)
+
+       /*
+        * An implementation leveraging the ANDN instruction
+        * from BMI1 to shorten some dependency chains.
+        */
+ENTRY(_libmd_md5block_bmi1)
+       // special-cased round 1
+       // f1 = d ? b : c = (d & b) + (~d & c)
+.macro round1  a, b, c, d, k, m, s
+       andn    \c, \d, %edi            // ~d & c
+       add     $\k, \a                 // a + k[i]
+       mov     \d, %ebp
+       add     ((\m)%16*4)(%rsi), \a   // a + k[i] + m[g]
+       and     \b, %ebp                // d & b
+       add     %edi, \a                // a + k[i] + m[g] + (~d & c)
+       add     %ebp, \a                // a + k[i] + m[g] + (~d & c) + (d & b)
+       rol     $\s, \a
+       add     \b, \a
+.endm
+
+       // special-cased round 3
+       // f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d)
+.macro round3  a, b, c, d, k, m, s
+       andn    \d, \b, %ebp
+       add     $\k - 1, \a             // a + k[i] - 1
+       add     ((\m)%16*4)(%rsi), \a   // a + k[i] + m[g]
+       xor     \c, %ebp
+       sub     %ebp, \a                // a + k[i] + m[g] + f
+       rol     $\s, \a
+       add     \b, \a
+.endm
+
+       .purgem rounds1
+.macro rounds1 i, k0, k1, k2, k3
+       round1  %eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1,  5
+       round1  %edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6,  9
+       round1  %ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14
+       round1  %ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20
+.endm
+
+       .purgem rounds3
+.macro rounds3 i, k0, k1, k2, k3
+       round3  %eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0,  6
+       round3  %edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10
+       round3  %ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15
+       round3  %ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21
+.endm
+
+       push    %rbx
+       push    %rbp
+       push    %r12
+
+       and     $~63, %rdx              // length in blocks
+       lea     (%rsi, %rdx, 1), %r12   // end pointer
+
+       mov     (%rdi), %eax            // a
+       mov     4(%rdi), %ebx           // b
+       mov     8(%rdi), %ecx           // c
+       mov     12(%rdi), %edx          // d
+
+       cmp     %rsi, %r12              // any data to process?
+       je      0f
+
+       push    %rdi
+
+       .balign 16
+1:     mov     %eax, %r8d
+       mov     %ebx, %r9d
+       mov     %ecx, %r10d
+       mov     %edx, %r11d
+
+       allrounds       rounds0, rounds1, rounds2, rounds3
+
+       add     %r8d, %eax
+       add     %r9d, %ebx
+       add     %r10d, %ecx
+       add     %r11d, %edx
+
+       add     $64, %rsi
+       cmp     %rsi, %r12
+       jne     1b
+
+       pop     %rdi
+       mov     %eax, (%rdi)
+       mov     %ebx, 4(%rdi)
+       mov     %ecx, 8(%rdi)
+       mov     %edx, 12(%rdi)
+
+0:     pop     %r12
+       pop     %rbp
+       pop     %rbx
+       ret
+END(_libmd_md5block_bmi1)
+
+#ifndef _KERNEL
+       /*
+        * An implementation leveraging AVX-512 for its VPTERNLOGD
+        * instruction.  We're using only XMM registers here,
+        * avoiding costly thermal licensing.
+        */
+ENTRY(_libmd_md5block_avx512)
+.macro vround          a, b, c, d, f, i, m, mi, s
+       vmovdqa         \b, %xmm4
+       vpternlogd      $\f, \d, \c, %xmm4
+       vpaddd          4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i]
+.if    \mi != 0
+       vpshufd         $0x55 * \mi, %xmm5, %xmm5       // broadcast to each dword
+.endif
+       vpaddd          %xmm5, \a, \a           // a + k[i] + m[g]
+       vpaddd          %xmm4, \a, \a           // a + k[i] + m[g] + f
+       vprold          $\s, \a, \a
+       vpaddd          \b, \a, \a
+.endm
+
+.macro vrounds         f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3
+       vround          %xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0
+       vround          %xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1
+       vround          %xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2
+       vround          %xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3
+.endm
+
+/*
+ * d c b f0 f1 f2 f3
+ * 0 0 0  0  0  0  1
+ * 1 0 0  1  0  1  0
+ * 0 1 0  0  1  1  0
+ * 1 1 0  1  0  0  1
+ * 0 0 1  0  0  1  1
+ * 1 0 1  0  1  0  1
+ * 0 1 1  1  1  0  0
+ * 1 1 1  1  1  1  0
+ */
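+
+/*
+ * vpternlogd evaluates an arbitrary three-input boolean function: with the
+ * operand order used here, bit ((b << 2) | (c << 1) | d) of the immediate
+ * gives the result for that input combination.  Reading each column of the
+ * table above from the bottom row up therefore yields the immediates
+ * 0xca, 0xe4, 0x96 and 0x39 used below.
+ */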
+
+.macro vrounds0        i, m
+       vrounds         0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22
+.endm
+
+.macro vrounds1        i, m0, i0, m1, i1, m2, i2, m3, i3
+       vrounds         0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20
+.endm
+
+.macro vrounds2        i, m0, i0, m1, i1, m2, i2, m3, i3
+       vrounds         0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23
+.endm
+
+.macro vrounds3        i, m0, i0, m1, i1, m2, i2, m3, i3
+       vrounds         0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21
+.endm
+
+       and             $~63, %rdx              // length in blocks
+       add             %rsi, %rdx              // end pointer
+
+       vmovd           (%rdi), %xmm0           // a
+       vmovd           4(%rdi), %xmm1          // b
+       vmovd           8(%rdi), %xmm2          // c
+       vmovd           12(%rdi), %xmm3         // d
+
+       lea             keys(%rip), %rax
+
+       cmp             %rsi, %rdx              // any data to process?
+       je              0f
+
+       .balign         16
+1:     vmovdqu         0*4(%rsi), %xmm8        // message words
+       vmovdqu         4*4(%rsi), %xmm9
+       vmovdqu         8*4(%rsi), %xmm10
+       vmovdqu         12*4(%rsi), %xmm11
+
+       vmovdqa         %xmm0, %xmm12           // stash old state variables
+       vmovdqa         %xmm1, %xmm13
+       vmovdqa         %xmm2, %xmm14
+       vmovdqa         %xmm3, %xmm15
+
+       vrounds0         0, %xmm8
+       vrounds0         4, %xmm9
+       vrounds0         8, %xmm10
+       vrounds0        12, %xmm11
+
+       vrounds1        16,  %xmm8, 1,  %xmm9, 2, %xmm10, 3,  %xmm8, 0
+       vrounds1        20,  %xmm9, 1, %xmm10, 2, %xmm11, 3,  %xmm9, 0
+       vrounds1        24, %xmm10, 1, %xmm11, 2,  %xmm8, 3, %xmm10, 0
+       vrounds1        28, %xmm11, 1,  %xmm8, 2,  %xmm9, 3, %xmm11, 0
+
+       vrounds2        32,  %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2
+       vrounds2        36,  %xmm8, 1,  %xmm9, 0,  %xmm9, 3, %xmm10, 2
+       vrounds2        40, %xmm11, 1,  %xmm8, 0,  %xmm8, 3,  %xmm9, 2
+       vrounds2        44, %xmm10, 1, %xmm11, 0, %xmm11, 3,  %xmm8, 2
+
+       vrounds3        48,  %xmm8, 0,  %xmm9, 3, %xmm11, 2,  %xmm9, 1
+       vrounds3        52, %xmm11, 0,  %xmm8, 3, %xmm10, 2,  %xmm8, 1
+       vrounds3        56, %xmm10, 0, %xmm11, 3,  %xmm9, 2, %xmm11, 1
+       vrounds3        60,  %xmm9, 0, %xmm10, 3,  %xmm8, 2, %xmm10, 1
+
+       vpaddd          %xmm12, %xmm0, %xmm0
+       vpaddd          %xmm13, %xmm1, %xmm1
+       vpaddd          %xmm14, %xmm2, %xmm2
+       vpaddd          %xmm15, %xmm3, %xmm3
+
+       add             $64, %rsi
+       cmp             %rsi, %rdx
+       jne             1b
+
+       vmovd           %xmm0, (%rdi)
+       vmovd           %xmm1, 4(%rdi)
+       vmovd           %xmm2, 8(%rdi)
+       vmovd           %xmm3, 12(%rdi)
+
+0:     ret
+END(_libmd_md5block_avx512)
+
+       // round keys, for use in md5block_avx512
+       .section        .rodata
+       .balign         16
+
+.macro putkeys         i, a, b, c, d
+       .4byte          \a, \b, \c, \d
+.endm
+
+keys:  allrounds       putkeys, putkeys, putkeys, putkeys
+       .size           keys, .-keys
+#endif /* !defined(_KERNEL) */
+
+       .section .note.GNU-stack,"",%progbits
diff --git a/lib/libmd/amd64/md5dispatch.c b/lib/libmd/amd64/md5dispatch.c
new file mode 100644
index 000000000000..dd2131c5a57c
--- /dev/null
+++ b/lib/libmd/amd64/md5dispatch.c
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <[email protected]>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <sys/types.h>
+#include <sys/md5.h>
+
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+#include <stdint.h>
+#include <string.h>
+#include <x86/ifunc.h>
+
+extern void _libmd_md5block_baseline(MD5_CTX *, const void *, size_t);
+extern void _libmd_md5block_bmi1(MD5_CTX *, const void *, size_t);
+extern void _libmd_md5block_avx512(MD5_CTX *, const void *, size_t);
+
+DEFINE_UIFUNC(, void, _libmd_md5block, (MD5_CTX *, const void *, size_t))
+{
+       if ((cpu_stdext_feature & (CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL))
+           == (CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL)) {
+               u_int regs[4];
+               char cpu_vendor[12];
+
+               do_cpuid(0, regs);
+               ((u_int *)&cpu_vendor)[0] = regs[1];
+               ((u_int *)&cpu_vendor)[1] = regs[3];
+               ((u_int *)&cpu_vendor)[2] = regs[2];
+
+               /* the AVX-512 kernel performs poorly on AMD */
+               if (memcmp(cpu_vendor, AMD_VENDOR_ID, sizeof(cpu_vendor)) != 0)
+                       return (_libmd_md5block_avx512);
+       }
+
+       if (cpu_stdext_feature & CPUID_STDEXT_BMI1)
+               return (_libmd_md5block_bmi1);
+       else
+               return (_libmd_md5block_baseline);
+}
