Re: [PR] Add RISC-V Zvbc vector CRC32C acceleration (brpc)

via GitHub Fri, 05 Jun 2026 21:40:29 -0700


Felix-Gong commented on code in PR #3332:
URL: https://github.com/apache/brpc/pull/3332#discussion_r3366641854



##########
src/butil/crc32c.cc:
##########
@@ -604,8 +609,195 @@ static bool isZbc() {
   }();
   return zbc_supported;
 }
+
+#if defined(__riscv_zvbc)
+// Hardware-accelerated CRC32C using RISC-V Zvbc vector carry-less multiplication.
+// Uses RVV vclmul/vclmulh to process 2 lanes per vector operation (VLEN=128).
+// With VLEN=128, each vector register holds 2 x 64-bit elements.
+// 4 lanes are processed using 2 vector register pairs per clmul step.
+static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) {
+  crc ^= 0xFFFFFFFF;
+
+  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
+  size_t n = len;
+
+  if (n < 64) {
+    return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+  }
+
+  // Align to 16-byte boundary
+  uintptr_t mis = (uintptr_t)p & 0xF;
+  if (mis) {
+    size_t pre = 16 - mis;
+    if (pre > n) pre = n;
+    crc = rv_crc32c_bitwise(crc, p, pre);
+    p += pre;
+    n -= pre;
+    if (n < 64) {
+      return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+    }
+  }
+
+  // Set up RVV for 64-bit elements: vl = min(VLEN/64, 2) = 2 for VLEN=128
+  size_t vl = __riscv_vsetvl_e64m1(2);
+
+  // Construct fold constant vectors: {k1, k2} and {k3, k4}
+  // Each element gets the appropriate constant for its position:
+  // element 0 (lo half) uses k1/k3, element 1 (hi half) uses k2/k4
+  uint64_t k12_arr[2] = { crc32c_fold_const[0], crc32c_fold_const[1] };
+  uint64_t k34_arr[2] = { crc32c_fold_const[2], crc32c_fold_const[3] };
+  vuint64m1_t k12_vec = __riscv_vle64_v_u64m1(k12_arr, vl);  // {k1, k2}
+  vuint64m1_t k34_vec = __riscv_vle64_v_u64m1(k34_arr, vl);  // {k3, k4}
+
+  // Load first 64 bytes into 4 vector registers.
+  // Each vector = one 128-bit lane: {lo_64, hi_64}
+  vuint64m1_t lane1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
+  vuint64m1_t lane2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
+  vuint64m1_t lane3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
+  vuint64m1_t lane4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);

Review Comment:
   Fixed. All vector loads now go through `memcpy` into a `uint64_t[2]` staging 
buffer before `vle64`, avoiding the `uint8_t*`-to-`uint64_t*` cast.



##########
src/butil/crc32c.cc:
##########
@@ -604,8 +609,195 @@ static bool isZbc() {
   }();
   return zbc_supported;
 }
+
+#if defined(__riscv_zvbc)
+// Hardware-accelerated CRC32C using RISC-V Zvbc vector carry-less multiplication.
+// Uses RVV vclmul/vclmulh to process 2 lanes per vector operation (VLEN=128).
+// With VLEN=128, each vector register holds 2 x 64-bit elements.
+// 4 lanes are processed using 2 vector register pairs per clmul step.
+static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) {
+  crc ^= 0xFFFFFFFF;
+
+  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
+  size_t n = len;
+
+  if (n < 64) {
+    return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+  }
+
+  // Align to 16-byte boundary
+  uintptr_t mis = (uintptr_t)p & 0xF;
+  if (mis) {
+    size_t pre = 16 - mis;
+    if (pre > n) pre = n;
+    crc = rv_crc32c_bitwise(crc, p, pre);
+    p += pre;
+    n -= pre;
+    if (n < 64) {
+      return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+    }
+  }
+
+  // Set up RVV for 64-bit elements: vl = min(VLEN/64, 2) = 2 for VLEN=128
+  size_t vl = __riscv_vsetvl_e64m1(2);
+
+  // Construct fold constant vectors: {k1, k2} and {k3, k4}
+  // Each element gets the appropriate constant for its position:
+  // element 0 (lo half) uses k1/k3, element 1 (hi half) uses k2/k4
+  uint64_t k12_arr[2] = { crc32c_fold_const[0], crc32c_fold_const[1] };
+  uint64_t k34_arr[2] = { crc32c_fold_const[2], crc32c_fold_const[3] };
+  vuint64m1_t k12_vec = __riscv_vle64_v_u64m1(k12_arr, vl);  // {k1, k2}
+  vuint64m1_t k34_vec = __riscv_vle64_v_u64m1(k34_arr, vl);  // {k3, k4}
+
+  // Load first 64 bytes into 4 vector registers.
+  // Each vector = one 128-bit lane: {lo_64, hi_64}
+  vuint64m1_t lane1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
+  vuint64m1_t lane2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
+  vuint64m1_t lane3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
+  vuint64m1_t lane4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);
+
+  // XOR CRC into element 0 of first lane
+  uint64_t tmp[2];
+  __riscv_vse64_v_u64m1(tmp, lane1, vl);
+  tmp[0] ^= (uint64_t)crc;
+  lane1 = __riscv_vle64_v_u64m1(tmp, vl);
+
+  p += 64;
+  n -= 64;
+
+  // Main loop: fold 64 bytes per iteration using vector carry-less multiply.
+  //
+  // For each 128-bit lane {lo, hi}, the fold computes:
+  //   new_lo = clmul(lo, k1) ^ clmul(hi, k2) ^ data_lo
+  //   new_hi = clmulh(lo, k1) ^ clmulh(hi, k2) ^ data_hi
+  //
+  // With k12_vec = {k1, k2} and element-wise vclmul:
+  //   vclmul(lane, k12_vec)  = {clmul(lo, k1), clmul(hi, k2)}   (lo halves of products)
+  //   vclmulh(lane, k12_vec) = {clmulh(lo, k1), clmulh(hi, k2)} (hi halves of products)
+  //
+  // The 128-bit XOR of (lo*k1) and (hi*k2) decomposes element-wise:
+  //   new_lo = clmul(lo,k1) ^ clmul(hi,k2) = vclmul[0] ^ vclmul[1]
+  //   new_hi = clmulh(lo,k1) ^ clmulh(hi,k2) = vclmulh[0] ^ vclmulh[1]
+  //
+  // So we need to XOR across elements. With VLEN=128 (2 elements), we use
+  // scalar extraction for the cross-element XOR since there's no vector
+  // permute instruction for just 2 elements that's more efficient.
+  while (n >= 64) {
+    vuint64m1_t d1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
+    vuint64m1_t d2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
+    vuint64m1_t d3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
+    vuint64m1_t d4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);
+
+    // Fold each lane using vector clmul with {k1, k2}
+    uint64_t lo_r[2], hi_r[2], d_r[2];
+
+    // Lane 1
+    __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane1, k12_vec, vl), 
vl);
+    __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane1, k12_vec, vl), 
vl);
+    __riscv_vse64_v_u64m1(d_r, d1, vl);
+    d_r[0] ^= lo_r[0] ^ lo_r[1];
+    d_r[1] ^= hi_r[0] ^ hi_r[1];
+    lane1 = __riscv_vle64_v_u64m1(d_r, vl);
+
+    // Lane 2
+    __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane2, k12_vec, vl), 
vl);
+    __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane2, k12_vec, vl), 
vl);
+    __riscv_vse64_v_u64m1(d_r, d2, vl);
+    d_r[0] ^= lo_r[0] ^ lo_r[1];
+    d_r[1] ^= hi_r[0] ^ hi_r[1];
+    lane2 = __riscv_vle64_v_u64m1(d_r, vl);
+
+    // Lane 3
+    __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane3, k12_vec, vl), 
vl);
+    __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane3, k12_vec, vl), 
vl);
+    __riscv_vse64_v_u64m1(d_r, d3, vl);
+    d_r[0] ^= lo_r[0] ^ lo_r[1];
+    d_r[1] ^= hi_r[0] ^ hi_r[1];
+    lane3 = __riscv_vle64_v_u64m1(d_r, vl);
+
+    // Lane 4
+    __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane4, k12_vec, vl), 
vl);
+    __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane4, k12_vec, vl), 
vl);
+    __riscv_vse64_v_u64m1(d_r, d4, vl);
+    d_r[0] ^= lo_r[0] ^ lo_r[1];
+    d_r[1] ^= hi_r[0] ^ hi_r[1];
+    lane4 = __riscv_vle64_v_u64m1(d_r, vl);
+
+    p += 64;
+    n -= 64;
+  }
+
+  // Reduce 4 lanes to 1 using {k3, k4}
+  // Same fold pattern: fold lane_a into lane_b
+  #define FOLD_INTO(dst, src) do { \
+    uint64_t _lo[2], _hi[2], _d[2]; \
+    __riscv_vse64_v_u64m1(_lo, __riscv_vclmul_vv_u64m1(src, k34_vec, vl), vl); 
\
+    __riscv_vse64_v_u64m1(_hi, __riscv_vclmulh_vv_u64m1(src, k34_vec, vl), 
vl); \
+    __riscv_vse64_v_u64m1(_d, dst, vl); \
+    _d[0] ^= _lo[0] ^ _lo[1]; \
+    _d[1] ^= _hi[0] ^ _hi[1]; \
+    dst = __riscv_vle64_v_u64m1(_d, vl); \
+  } while(0)
+
+  FOLD_INTO(lane2, lane1);  // lane2 = fold(lane1) ^ lane2
+  FOLD_INTO(lane3, lane2);  // lane3 = fold(lane2) ^ lane3
+  FOLD_INTO(lane4, lane3);  // lane4 = fold(lane3) ^ lane4
+  #undef FOLD_INTO
+
+  // Extract final 128-bit state from vector register
+  uint64_t final_state[2];
+  __riscv_vse64_v_u64m1(final_state, lane4, vl);
+  uint64_t x0 = final_state[0];
+  uint64_t x1 = final_state[1];
+
+  // Barrett reduction: 128-bit -> 32-bit CRC (scalar)
+  uint64_t t4 = rv_clmul(x0, RV_CRC32C_CONST_1);
+  uint64_t t3 = rv_clmulh(x0, RV_CRC32C_CONST_1);
+  uint64_t t1 = x1 ^ t4;
+  t4 = t1 & RV_CRC32_MASK32;
+  t1 >>= 32;
+  uint64_t t0 = rv_clmul(t4, RV_CRC32C_CONST_0);
+  t3 = (t3 << 32) ^ t1 ^ t0;
+
+  t4 = t3 & RV_CRC32_MASK32;
+  t4 = rv_clmul(t4, RV_CRC32C_CONST_QUO);
+  t4 &= RV_CRC32_MASK32;
+  t4 = rv_clmul(t4, RV_CRC32C_CONST_POLY);
+  t4 ^= t3;
+
+  uint32_t c = (uint32_t)((t4 >> 32) & RV_CRC32_MASK32);
+  if (n) {
+    c = rv_crc32c_bitwise(c, p, n);
+  }
+  return c ^ 0xFFFFFFFF;
+}
+
+// Runtime detection: check if RISC-V CPU supports Zvbc extension
+static bool isZvbc() {
+  static const bool zvbc_supported = []() {
+    FILE* f = fopen("/proc/cpuinfo", "r");
+    if (!f) return false;
+    bool supported = false;
+    char line[1024];
+    while (fgets(line, sizeof(line), f)) {
+      if (strstr(line, "isa") || strstr(line, "hart isa")) {
+        char* colon = strchr(line, ':');
+        if (colon) {
+          if (strstr(colon, "_zvbc") || strstr(colon, "zvbc")) {
+            supported = true;

Review Comment:
   Fixed. Both `isZbc()` and `isZvbc()` now only match `_zbc` and `_zvbc` (with 
underscore prefix), removing the bare `zbc`/`zvbc` substring checks that could 
cause false positives.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Add RISC-V Zvbc vector CRC32C acceleration (brpc)

Reply via email to