This is an automated email from the ASF dual-hosted git repository.

stevel pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new c1de2db0d2b HADOOP-19724. [RISC-V] Add rv64 Zbc (CLMUL) bulk CRC32 (CRC32C not optimized) (#8031)
c1de2db0d2b is described below

commit c1de2db0d2b2d7b9a034d835b54b98ce8d353e3a
Author: Peter Pan <[email protected]>
AuthorDate: Fri Feb 6 22:08:57 2026 +0800

    HADOOP-19724. [RISC-V] Add rv64 Zbc (CLMUL) bulk CRC32 (CRC32C not optimized) (#8031)
    
    
    Contributed by ptroc <[email protected]>
    Co-authored-by: gong-flying <[email protected]>
---
 .../src/org/apache/hadoop/util/bulk_crc32_riscv.c  | 234 +++++++++++++++++++--
 1 file changed, 220 insertions(+), 14 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_riscv.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_riscv.c
index 5955e4b7f96..b8ea0cf0658 100644
--- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_riscv.c
+++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_riscv.c
@@ -16,24 +16,230 @@
  * limitations under the License.
  */
 
+#include <assert.h>
+#include <stddef.h>  // for size_t
+#include <stdio.h>
+#include <string.h>
+
+#include "bulk_crc32.h"
+#include "gcc_optimizations.h"
+
+/**
+ * Hardware-accelerated CRC32 calculation using the RISC-V Zbc extension.
+ * Uses carry-less multiply instructions (clmul/clmulh) for CRC32 (zlib
+ * polynomial).
+ */
+
+typedef void (*crc_pipelined_func_t)(uint32_t *, uint32_t *, uint32_t *,
+                                     const uint8_t *, size_t, int);
+extern crc_pipelined_func_t pipelined_crc32_zlib_func;
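+// pipelined_crc32_zlib_func is defined in bulk_crc32.c and defaults to
+// the generic software path; the constructor at the bottom of this file
+// repoints it at the Zbc implementation when the extension is detected.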
+
+#if defined(__riscv) && (__riscv_xlen == 64)
+
+/**
+ * Precomputed constants for CRC32 (zlib polynomial) reduction using
+ * carry-less multiplication. They are derived from the reflected
+ * polynomial 0xEDB88320 and drive the 16-byte fold loop (R3/R4), the
+ * final 96-to-64-bit fold (R5), and the Barrett reduction back to
+ * 32 bits (RU and the full polynomial).
+ */
+#define RV_CRC32_CONST_R3 0x01751997d0ULL
+#define RV_CRC32_CONST_R4 0x00ccaa009eULL
+#define RV_CRC32_CONST_R5 0x0163cd6124ULL
+#define RV_CRC32_MASK32 0x00000000FFFFFFFFULL
+#define RV_CRC32_POLY_TRUE_LE_FULL 0x01DB710641ULL
+#define RV_CRC32_CONST_RU 0x01F7011641ULL
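+// These values match the standard fold/reduce constants for the
+// reflected CRC-32 polynomial (the k3/k4/k5, mu and P' set used by
+// other carry-less-multiply CRC32 implementations), in the
+// bit-reflected form expected by 64-bit clmul.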
+
+/**
+ * Performs a 64-bit carry-less multiplication (clmul) of two values.
+ * This instruction is part of the RISC-V Zbc extension.
+ */
+static inline uint64_t rv_clmul(uint64_t a, uint64_t b) {
+  uint64_t r;
+  __asm__ volatile(
+      ".option push\n\t"
+      ".option arch, +zbc\n\t"
+      "clmul %0, %1, %2\n\t"
+      ".option pop\n\t"
+      : "=r"(r)
+      : "r"(a), "r"(b));
+  return r;
+}
+
+/**
+ * Performs the high 64 bits of a 64-bit carry-less multiplication (clmulh).
+ * This instruction is part of the RISC-V Zbc extension.
+ */
+static inline uint64_t rv_clmulh(uint64_t a, uint64_t b) {
+  uint64_t r;
+  __asm__ volatile(
+      ".option push\n\t"
+      ".option arch, +zbc\n\t"
+      "clmulh %0, %1, %2\n\t"
+      ".option pop\n\t"
+      : "=r"(r)
+      : "r"(a), "r"(b));
+  return r;
+}
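+// Together, rv_clmul and rv_clmulh give the low and high halves of the
+// full 128-bit carry-less product of two 64-bit operands.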
+
+/**
+ * Fallback bitwise implementation of CRC32 (zlib) for small data chunks
+ * or to handle misaligned data at the beginning/end of a buffer.
+ */
+static inline uint32_t rv_crc32_zlib_bitwise(uint32_t crc, const uint8_t *buf,
+                                             size_t len) {
+  uint32_t c = crc;
+  for (size_t i = 0; i < len; ++i) {
+    c ^= buf[i];
+    for (int k = 0; k < 8; ++k) {
+      uint32_t mask = -(int32_t)(c & 1);
+      c = (c >> 1) ^ (0xEDB88320U & mask);  // reflected polynomial
+    }
+  }
+  return c;
+}
+
 /**
- * RISC-V CRC32 hardware acceleration (placeholder)
+ * Hardware-accelerated CRC32 (zlib) calculation using RISC-V Zbc
+ * carry-less multiplication instructions.
+ */
+static uint32_t rv_crc32_zlib_clmul(uint32_t crc, const uint8_t *buf,
+                                    size_t len) {
+  const uint8_t *p = buf;
+  size_t n = len;
+
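+  // The folding path needs a 16-byte seed plus data to fold, and 32
+  // bytes guarantees the seed load stays in bounds even after a
+  // worst-case 15-byte alignment prologue; below that, folding is
+  // unlikely to beat the bitwise loop.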
+  if (n < 32) {
+    return rv_crc32_zlib_bitwise(crc, p, n);
+  }
+
+  // Handle misaligned data at the start. This is considered unlikely
+  // in typical Hadoop usage but necessary for correctness.
+  uintptr_t mis = (uintptr_t)p & 0xF;
+  if (unlikely(mis)) {
+    size_t pre = 16 - mis;
+    if (pre > n) pre = n;
+    crc = rv_crc32_zlib_bitwise(crc, p, pre);
+    p += pre;
+    n -= pre;
+  }
+
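+  // Seed the 128-bit folding state with the first 16 bytes; for a
+  // reflected CRC the incoming checksum is XORed into the low lane.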
+  uint64_t x0 = *(const uint64_t *)(const void *)(p + 0);
+  uint64_t x1 = *(const uint64_t *)(const void *)(p + 8);
+  x0 ^= (uint64_t)crc;
+  p += 16;
+  n -= 16;
+
+  const uint64_t C1 = RV_CRC32_CONST_R3;
+  const uint64_t C2 = RV_CRC32_CONST_R4;
+
+  // Main loop: process 16 bytes of aligned data per iteration using
+  // carry-less multiplication for high-performance folding.
+  while (likely(n >= 16)) {
+    uint64_t tL = rv_clmul(C2, x1);
+    uint64_t tH = rv_clmulh(C2, x1);
+    uint64_t yL = rv_clmul(C1, x0);
+    uint64_t yH = rv_clmulh(C1, x0);
+    x0 = yL ^ tL;
+    x1 = yH ^ tH;
+
+    uint64_t d0 = *(const uint64_t *)(const void *)(p + 0);
+    uint64_t d1 = *(const uint64_t *)(const void *)(p + 8);
+    x0 ^= d0;
+    x1 ^= d1;
+    p += 16;
+    n -= 16;
+  }
+
+  // Final reduction and folding of the remaining 16 bytes in the pipeline.
+  {
+    uint64_t tH = rv_clmulh(x0, C2);
+    uint64_t tL = rv_clmul(x0, C2);
+    x0 = x1 ^ tL;
+    x1 = tH;
+  }
+
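+  // After the last fold the remainder fits in 96 bits: the low 32 bits
+  // stay in lo, and the upper 64 bits are gathered into t2.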
+  uint64_t hi = x1;
+  uint64_t lo = x0;
+  uint64_t t2 = (lo >> 32) | (hi << 32);
+  lo &= RV_CRC32_MASK32;
+
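+  // Fold from 96 to 64 bits: multiply the low 32 bits by R5
+  // (x^64 mod P) and XOR into the upper 64 bits.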
+  lo = rv_clmul(RV_CRC32_CONST_R5, lo) ^ t2;
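+  // Barrett reduction from 64 to 32 bits: estimate the quotient with RU
+  // (~x^64 / P), multiply back by the polynomial and XOR; the 32-bit
+  // remainder lands in the upper half of lo.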
+  uint64_t tmp = lo;
+  lo &= RV_CRC32_MASK32;
+  lo = rv_clmul(lo, RV_CRC32_CONST_RU);
+  lo &= RV_CRC32_MASK32;
+  lo = rv_clmul(lo, RV_CRC32_POLY_TRUE_LE_FULL) ^ tmp;
+
+  uint32_t c = (uint32_t)(lo >> 32);
+
+  // Handle any remaining bytes (less than 16) using bitwise fallback.
+  if (n) {
+    c = rv_crc32_zlib_bitwise(c, p, n);
+  }
+  return c;
+}
+
+/**
+ * Pipelined version of hardware-accelerated CRC32 calculation using
+ * RISC-V Zbc carry-less multiply instructions.
  *
- * Phase 1: provide a RISC-V-specific compilation unit that currently makes
- * no runtime changes and falls back to the generic software path in
- * bulk_crc32.c. Future work will add Zbc-based acceleration and runtime
- * dispatch.
+ *   crc1, crc2, crc3 : Pointers holding the initial checksum for each
+ *           block on entry; on return they hold the updated checksums.
+ *   p_buf : The base address of the data buffer. The buffer should be
+ *           at least as big as block_size * num_blocks.
+ *   block_size : The size of each block in bytes.
+ *   num_blocks : The number of blocks to work on. Valid values are 1, 2, or 3.
+ *                A value of 0 is treated as a no-op. Any other value will
+ *                trigger an assertion in debug builds.
  */
+static void pipelined_crc32_zlib(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3,
+                                 const uint8_t *p_buf, size_t block_size,
+                                 int num_blocks) {
+  const uint8_t *p1 = p_buf;
+  const uint8_t *p2 = p_buf + block_size;
+  const uint8_t *p3 = p_buf + 2 * block_size;
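+  // Blocks are handled in descending order so the switch's fall-through
+  // chain processes exactly num_blocks blocks.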
 
-#include <assert.h>
-#include <stddef.h> // for size_t
+  switch (num_blocks) {
+    case 3:
+      *crc3 = rv_crc32_zlib_clmul(*crc3, p3, block_size);
+      // fall through
+    case 2:
+      *crc2 = rv_crc32_zlib_clmul(*crc2, p2, block_size);
+      // fall through
+    case 1:
+      *crc1 = rv_crc32_zlib_clmul(*crc1, p1, block_size);
+      break;
+    case 0:
+      return;
+    default:
+      assert(0 && "BUG: Invalid number of checksum blocks");
+  }
+}
 
-#include "bulk_crc32.h"
-#include "gcc_optimizations.h"
+#endif  // __riscv && __riscv_xlen==64
 
-/* Constructor hook reserved for future HW capability detection and
- * function-pointer dispatch. Intentionally a no-op for the initial phase. */
-void __attribute__((constructor)) init_riscv_crc_support(void)
-{
-  /* No-op: keep using the default software implementations. */
+/**
+ * On library load, detect hardware CRC support and set the CRC function
+ * pointers accordingly.
+ */
+void __attribute__((constructor)) init_cpu_support_flag(void) {
+#if defined(__riscv) && (__riscv_xlen == 64)
+  // Check whether the CPU supports the Zbc extension by scanning the
+  // /proc/cpuinfo "isa" (or "extensions") line for the substring "zbc".
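+  // This assumes a Linux-style /proc/cpuinfo; if the file cannot be
+  // opened or no matching line is found, the generic software path
+  // simply remains in use.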
+  FILE *f = fopen("/proc/cpuinfo", "r");
+  if (f) {
+    char line[256];
+    int has_zbc = 0;
+    while (fgets(line, sizeof(line), f)) {
+      if ((strstr(line, "isa") || strstr(line, "extensions")) &&
+          strstr(line, "zbc")) {
+        has_zbc = 1;
+        break;
+      }
+    }
+    fclose(f);
+    if (has_zbc) {
+      pipelined_crc32_zlib_func = pipelined_crc32_zlib;
+    }
+  }
+#endif
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
