From 0c5cb2fe3982248c99dd45f63cf3adbd9ffe7f3b Mon Sep 17 00:00:00 2001
From: Sam Russell <sam.h.russell@gmail.com>
Date: Mon, 16 Dec 2024 16:13:45 +0100
Subject: [PATCH] crc: Add PCLMUL implementation

* lib/crc-x86_64-pclmul.c: Implement CRC32 with PCLMUL intrinsics
* lib/crc-x86_64.h: Add header for CRC32 with PCLMUL instrinsics
* lib/crc.c: Use PCLMUL implementation if available
* m4/crc-x86_64.m4: Detect PCLMUL and build accordingly
* modules/crc-x86_64: New module for crc-x86_64
---
 ChangeLog               |  10 ++
 lib/crc-x86_64-pclmul.c | 202 ++++++++++++++++++++++++++++++++++++++++
 lib/crc-x86_64.h        |  37 ++++++++
 lib/crc.c               |  32 ++++++-
 m4/crc-x86_64.m4        |  41 ++++++++
 modules/crc-x86_64      |  26 ++++++
 6 files changed, 347 insertions(+), 1 deletion(-)
 create mode 100644 lib/crc-x86_64-pclmul.c
 create mode 100644 lib/crc-x86_64.h
 create mode 100644 m4/crc-x86_64.m4
 create mode 100644 modules/crc-x86_64

diff --git a/ChangeLog b/ChangeLog
index 9d412f9cbc..339e5590ae 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2024-12-16  Sam Russell  <sam.h.russell@gmail.com>
+
+    crc: Add PCLMUL implementation
+
+    * lib/crc-x86_64-pclmul.c: Implement CRC32 with PCLMUL intrinsics
+	* lib/crc-x86_64.h: Add header for CRC32 with PCLMUL instrinsics
+    * lib/crc.c: Use PCLMUL implementation if available
+    * m4/crc-x86_64.m4: Detect PCLMUL and build accordingly
+    * modules/crc-x86_64: New module for crc-x86_64
+
 2024-11-20  Paul Eggert  <eggert@cs.ucla.edu>
 
 	openat: don’t close (-1)
diff --git a/lib/crc-x86_64-pclmul.c b/lib/crc-x86_64-pclmul.c
new file mode 100644
index 0000000000..e8ee0d9320
--- /dev/null
+++ b/lib/crc-x86_64-pclmul.c
@@ -0,0 +1,202 @@
+/* crc-x86_64.c -- CRC32 implementation using SSE/AVX1
+   Copyright (C) 2005-2006, 2009-2024 Free Software Foundation, Inc.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Sam Russell.  */
+
+#include <config.h>
+
+#include "x86intrin.h"
+#include "crc.h"
+#include "crc-x86_64.h"
+
+#include <string.h>
+
+#pragma GCC push_options
+#pragma GCC target("pclmul,avx")
+uint32_t
+crc32_update_no_xor_pclmul (uint32_t crc, const void *buf, size_t len)
+  {
+    const __m128i* data = buf;
+    __m128i* datarw;
+    size_t bytes_remaining = len;
+    __m128i in256[4] = {0};
+    __m128i xor_crc;
+    __m128i in1 = _mm_setzero_si128();
+    __m128i in2 = _mm_setzero_si128();
+    __m128i in3 = _mm_setzero_si128();
+    __m128i in4 = _mm_setzero_si128();
+    __m128i in5 = _mm_setzero_si128();
+    __m128i in6 = _mm_setzero_si128();
+    __m128i in7 = _mm_setzero_si128();
+    __m128i in8 = _mm_setzero_si128();
+    __m128i final_buf[12] = {0};
+    __m128i fold_high;
+    __m128i fold_low;
+
+    /* These constants are calculated as T-1 mod P
+       0x8F352D95 = (544 - 1) mod P etc
+       This method is described in Fast CRC Computation for Generic Polynomials
+       Using PCLMULQDQ Instruction (Gopal et al.) */
+    __m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
+    __m128i shift160_shift96 = _mm_set_epi64x(0xCCAA009E, 0xAE689191);
+    __m128i shift96_shift64 = _mm_set_epi64x(0xB8BC6765, 0xCCAA009E);
+    /* Mu is defined as x^64 / P(x)
+       P(x) is the generator polynomial for this CRC32 implementation */
+    __m128i mu_poly = _mm_set_epi64x(0x1DB710641, 0x1F7011641);
+
+    if (bytes_remaining >= 128)
+      {
+        /* Here we fold 4x 128bit words forward by 512 bits */
+        /* First load up our initial state */
+        in1 = _mm_loadu_si128(data);
+        in2 = _mm_loadu_si128(data + 1);
+        in3 = _mm_loadu_si128(data + 2);
+        in4 = _mm_loadu_si128(data + 3);
+
+        /* Initialise with incoming CRC */
+        xor_crc = _mm_set_epi32 (0, 0, 0, crc);
+        in1 = _mm_xor_si128 (in1, xor_crc);
+
+        while (bytes_remaining >= 128)
+          {
+            /* Load up second set of 512 bits*/
+            in5 = _mm_loadu_si128(data + 4);
+            in6 = _mm_loadu_si128(data + 5);
+            in7 = _mm_loadu_si128(data + 6);
+            in8 = _mm_loadu_si128(data + 7);
+
+            /* We shift the high QWORD forward 544 bits and the
+               low QWORD 480 bits */
+            fold_high = _mm_clmulepi64_si128(in1, shift544_shift480, 0x11);
+            fold_low = _mm_clmulepi64_si128(in1, shift544_shift480, 0x00);
+            in1 = _mm_xor_si128(in5, fold_high);
+            in1 = _mm_xor_si128(in1, fold_low);
+            fold_high = _mm_clmulepi64_si128(in2, shift544_shift480, 0x11);
+            fold_low = _mm_clmulepi64_si128(in2, shift544_shift480, 0x00);
+            in2 = _mm_xor_si128(in6, fold_high);
+            in2 = _mm_xor_si128(in2, fold_low);
+            fold_high = _mm_clmulepi64_si128(in3, shift544_shift480, 0x11);
+            fold_low = _mm_clmulepi64_si128(in3, shift544_shift480, 0x00);
+            in3 = _mm_xor_si128(in7, fold_high);
+            in3 = _mm_xor_si128(in3, fold_low);
+            fold_high = _mm_clmulepi64_si128(in4, shift544_shift480, 0x11);
+            fold_low = _mm_clmulepi64_si128(in4, shift544_shift480, 0x00);
+            in4 = _mm_xor_si128(in8, fold_high);
+            in4 = _mm_xor_si128(in4, fold_low);
+
+            bytes_remaining -= 64;
+            data += 4;
+          }
+
+        _mm_storeu_si128(final_buf, in1);
+        _mm_storeu_si128(final_buf + 1, in2);
+        _mm_storeu_si128(final_buf + 2, in3);
+        _mm_storeu_si128(final_buf + 3, in4);
+
+        /* Move everything to final_buf because it is RW */
+        memcpy (final_buf + 4, data + 4, bytes_remaining - 64);
+        datarw = final_buf;
+      }
+    else
+      {
+        /* Move everything to final_buf because it is RW */
+        /* XOR in previous CRC value */
+        memcpy (final_buf, data, bytes_remaining);
+        in1 = _mm_loadu_si128(final_buf);
+        xor_crc = _mm_set_epi32 (0, 0, 0, crc);
+        in1 = _mm_xor_si128 (in1, xor_crc);
+        _mm_storeu_si128(final_buf, in1);
+        datarw = final_buf;
+      }
+
+    while (bytes_remaining >= 32) {
+        /* Do 128-bit folds as above */
+        in1 = _mm_loadu_si128(datarw);
+        in2 = _mm_loadu_si128(datarw + 1);
+
+        /* We shift the high QWORD forward 160 bits and the
+           low QWORD 96 bits */
+        fold_high = _mm_clmulepi64_si128(in1, shift160_shift96, 0x11);
+        fold_low = _mm_clmulepi64_si128(in1, shift160_shift96, 0x00);
+        in2 = _mm_xor_si128(in2, fold_high);
+        in2 = _mm_xor_si128(in2, fold_low);
+
+        _mm_storeu_si128(datarw + 1, in2);
+        bytes_remaining -= 16;
+        datarw += 1;
+    }
+
+    /* We have 16-31 bytes here
+       If we have 17-31 then we do another special case 128-bit fold
+       the padding trick works because we're effectively padding 0s on the front
+       which means with little endian we're shifting the number higher
+       we'll also use this step to pick the <16 byte case */
+
+    if (bytes_remaining != 16){
+        /* Pad remainder and fold 128-bits
+           We're reading in up to 32 bytes here = 256 bits
+           This is inefficient so we only actually want to hit this on the actual end of data
+           If we're reading a multiple of 32 bytes in the loop then this will never get hit */
+
+        /* Read in at an offset so we get the shift for free */
+        memcpy(((char*) in256) + (32 - bytes_remaining), datarw, bytes_remaining);
+        in1 = _mm_loadu_si128(in256);
+        in2 = _mm_loadu_si128(in256 + 1);
+
+        /* Now we fold in1 onto in2 */
+        shift160_shift96 = _mm_set_epi64x(0x0ccaa009e, 0x1751997d0);
+        fold_high = _mm_clmulepi64_si128(in1, shift160_shift96, 0x11);
+        fold_low = _mm_clmulepi64_si128(in1, shift160_shift96, 0x00);
+        in2 = _mm_xor_si128(in2, fold_high);
+        in1 = _mm_xor_si128(in2, fold_low);
+    }
+    else {
+        in1 = _mm_loadu_si128(datarw);
+    }
+
+    /* We now have 16 bytes and fold as normal */
+
+    in2 = _mm_and_si128(_mm_srli_si128(in1, 8), _mm_set_epi64x(0, 0xffffffff));
+    in3 = _mm_and_si128(_mm_srli_si128(in1, 12), _mm_set_epi64x(0, 0xffffffff));
+    in1 = _mm_and_si128(in1, _mm_set_epi64x(0, 0xffffffffffffffff));
+    /* Multiply first 64 bits against shift96 */
+    in1 = _mm_clmulepi64_si128(shift96_shift64, in1, 0x00);
+    /* First 32 bits go on in2 */
+    in2 = _mm_xor_si128(in2, _mm_and_si128(in1, _mm_set_epi64x(0, 0xffffffff)));
+    /* Next 64 bits go on in3 */
+    in3 = _mm_xor_si128(in3, _mm_srli_si128(in1, 4));
+    /* Then shift 64 bits from here */
+    in1 = _mm_clmulepi64_si128(shift96_shift64, in2, 0x01);
+    in1 = _mm_xor_si128(in1, in3);
+
+    /* This is the Barrett reduction */
+    /* Take the bottom 32 bits */
+    in2 = _mm_and_si128(in1, _mm_set_epi64x(0, 0xffffffff));
+    /* Multiply by mu */
+    in2 = _mm_clmulepi64_si128(mu_poly, in2, 0x00);
+    /* Take the bottom 32 bits of the result */
+    in2 = _mm_and_si128(in2, _mm_set_epi64x(0, 0xffffffff));
+    /* Multiply by P(x) */
+    in2 = _mm_clmulepi64_si128(mu_poly, in2, 0x01);
+    /* XOR against input */
+    in1 = _mm_xor_si128(in1, in2);
+    /* Take bits 32-63 as the CRC*/
+    in1 = _mm_srli_si128(in1, 4);
+    crc = _mm_cvtsi128_si32(in1);
+
+    return crc;
+  }
+#pragma GCC pop_options
diff --git a/lib/crc-x86_64.h b/lib/crc-x86_64.h
new file mode 100644
index 0000000000..f079975fed
--- /dev/null
+++ b/lib/crc-x86_64.h
@@ -0,0 +1,37 @@
+/* crc-x86_64.h -- CRC32 implementation using SSE/AVX1
+   Copyright (C) 2005-2006, 2009-2024 Free Software Foundation, Inc.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Sam Russell.  */
+
+#ifndef CRC_X86_64_H
+#define CRC_X86_64_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t
+crc32_update_no_xor_pclmul (uint32_t crc, const void *buf, size_t len)
+  _GL_ATTRIBUTE_PURE;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CRC_X86_64_H */
diff --git a/lib/crc.c b/lib/crc.c
index 53f26486a2..14887e60c3 100644
--- a/lib/crc.c
+++ b/lib/crc.c
@@ -21,9 +21,15 @@
 #include "crc.h"
 #include "endian.h"
 
+#ifdef GL_CRC_PCLMUL
+#include "crc-x86_64.h"
+static bool pclmul_enabled = false;
+static bool pclmul_checked = false;
+#endif
+
 #include <string.h>
 
-#ifdef GL_CRC_SLICE_BY_8
+#if GL_CRC_SLICE_BY_8
 #include "crc-sliceby8.h"
 
 /*
@@ -110,6 +116,18 @@ crc32_update_no_xor (uint32_t crc, const char *buf, size_t len)
 {
   size_t n, slice_alignment;
 
+#ifdef GL_CRC_PCLMUL
+  if (!pclmul_checked)
+    {
+      pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul") &&
+                        0 < __builtin_cpu_supports ("avx"));
+      pclmul_checked = true;
+    }
+
+  if (pclmul_enabled && len >= 16)
+    return crc32_update_no_xor_pclmul(crc, buf, len);
+#endif
+
   slice_alignment = (len & (-8));
 
   for (n = 0; n < slice_alignment; n += 8)
@@ -180,6 +198,18 @@ crc32_update_no_xor (uint32_t crc, const char *buf, size_t len)
 {
   size_t n;
 
+#ifdef GL_CRC_PCLMUL
+  if (!pclmul_checked)
+    {
+      pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul") &&
+                        0 < __builtin_cpu_supports ("avx"));
+      pclmul_checked = true;
+    }
+
+  if (pclmul_enabled && len >= 16)
+    return crc32_update_no_xor_pclmul(crc, buf, len);
+#endif
+
   for (n = 0; n < len; n++)
     crc = crc32_table[(crc ^ buf[n]) & 0xff] ^ (crc >> 8);
 
diff --git a/m4/crc-x86_64.m4 b/m4/crc-x86_64.m4
new file mode 100644
index 0000000000..f80cf930d7
--- /dev/null
+++ b/m4/crc-x86_64.m4
@@ -0,0 +1,41 @@
+# crc-x86_64.m4
+# serial 1
+dnl Copyright (C) 2024 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+dnl This file is offered as-is, without any warranty.
+
+AC_DEFUN([gl_CRC_PCLMUL],
+[
+  ac_save_CFLAGS=$CFLAGS
+  CFLAGS="-mavx -mpclmul $CFLAGS"
+  AC_MSG_CHECKING([if pclmul intrinsic exists])
+  AC_CACHE_VAL([gl_cv_crc_pclmul],[
+  AC_LINK_IFELSE(
+    [AC_LANG_SOURCE([[
+      #include <x86intrin.h>
+
+      int
+      main (void)
+      {
+        __m128i a, b;
+        a = _mm_clmulepi64_si128 (a, b, 0x00);
+        a = _mm_shuffle_epi8 (a, b);
+        return __builtin_cpu_supports ("pclmul");
+      }
+    ]])
+    ],[
+      gl_cv_crc_pclmul=yes
+    ],[
+      gl_cv_crc_pclmul=no
+    ])])
+  AC_MSG_RESULT([$gl_cv_crc_pclmul])
+  if test $gl_cv_crc_pclmul = yes; then
+    AC_DEFINE([GL_CRC_PCLMUL], [1],
+              [CRC32 calculation by pclmul hardware instruction enabled])
+  fi
+  AM_CONDITIONAL([GL_CRC_PCLMUL],
+                [test $gl_cv_crc_pclmul = yes])
+  CFLAGS=$ac_save_CFLAGS
+])
diff --git a/modules/crc-x86_64 b/modules/crc-x86_64
new file mode 100644
index 0000000000..5d51c28eaa
--- /dev/null
+++ b/modules/crc-x86_64
@@ -0,0 +1,26 @@
+Description:
+CRC32 implementation using x86_64 specific optimizations
+
+Files:
+lib/crc-x86_64.h
+lib/crc-x86_64-pclmul.c
+m4/crc-x86_64.m4
+
+Depends-on:
+stdint
+crc
+
+configure.ac:
+AC_REQUIRE([gl_CRC_PCLMUL])
+
+Makefile.am:
+lib_SOURCES += crc-x86_64-pclmul.c
+
+Include:
+"crc.h"
+
+License:
+LGPL
+
+Maintainer:
+All
-- 
2.43.0