From: Tang ShiHao <[email protected]> Implement the rte_acl_classify function for the RISC-V architecture using the RISC-V Vector Extension (RVV) instruction set. Verified with the dpdk-test-acl and acl_autotest applications on the RISC-V architecture.
Performance improvements measured with dpdk-test-acl: - 100 rules / 100 traces: 57.3 → 48.6 cycles/pkt (~1.18x) - 1k rules / 1k traces: 13.6 → 8.3 cycles/pkt (~1.64x) - 10k rules / 1M traces: 36.6 → 27.7 cycles/pkt (~1.32x) Throughput improvement up to ~64%. This patch is co-developed with Gong Xiaofei. Signed-off-by: gong-flying <[email protected]> Signed-off-by: Tang ShiHao <[email protected]> --- app/test-acl/main.c | 4 + lib/acl/acl.h | 4 + lib/acl/acl_run_rvv.c | 19 ++++ lib/acl/acl_run_rvv.h | 210 ++++++++++++++++++++++++++++++++++++++++++ lib/acl/meson.build | 4 +- lib/acl/rte_acl.c | 43 +++++++++ lib/acl/rte_acl.h | 1 + 7 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 lib/acl/acl_run_rvv.c create mode 100644 lib/acl/acl_run_rvv.h diff --git a/app/test-acl/main.c b/app/test-acl/main.c index 3a791b3ccf..ad0bc89644 100644 --- a/app/test-acl/main.c +++ b/app/test-acl/main.c @@ -97,6 +97,10 @@ static const struct acl_alg acl_alg[] = { .name = "avx512x32", .alg = RTE_ACL_CLASSIFY_AVX512X32, }, + { + .name = "rvv", + .alg = RTE_ACL_CLASSIFY_RVV, + }, }; static struct { diff --git a/lib/acl/acl.h b/lib/acl/acl.h index 9c85a3d58a..af202a84ed 100644 --- a/lib/acl/acl.h +++ b/lib/acl/acl.h @@ -226,6 +226,10 @@ int rte_acl_classify_altivec(const struct rte_acl_ctx *ctx, const uint8_t **data, uint32_t *results, uint32_t num, uint32_t categories); +int +rte_acl_classify_rvv(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/lib/acl/acl_run_rvv.c b/lib/acl/acl_run_rvv.c new file mode 100644 index 0000000000..2b53e28213 --- /dev/null +++ b/lib/acl/acl_run_rvv.c @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2026 Institute of Software Chinese Academy of Sciences (ISCAS) + */ + +#if defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V) + +#include "acl_run_rvv.h" + +int +rte_acl_classify_rvv(const struct 
rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + if (num >= 4) + return search_rvv_4(ctx, data, results, num, categories); + else + return rte_acl_classify_scalar(ctx, data, results, num, categories); +} + +#endif diff --git a/lib/acl/acl_run_rvv.h b/lib/acl/acl_run_rvv.h new file mode 100644 index 0000000000..ed21ce2ba6 --- /dev/null +++ b/lib/acl/acl_run_rvv.h @@ -0,0 +1,210 @@ +#include <stdalign.h> + +#include "acl_run.h" + +#include <riscv_vector.h> + + +static const uint8_t idx_const[16] = { + 0, 0, 0, 0, 4, 4, 4, 4, + 8, 8, 8, 8, 12, 12, 12, 12 +}; + +/* + * Resolve priority for multiple results (scalar version). + * This consists comparing the priority of the current traversal with the + * running set of results for the packet. + * For each result, keep a running array of the result (rule number) and + * its priority for each category. + */ +static inline void +resolve_priority_rvv(uint64_t transition, int n, + const struct rte_acl_ctx *ctx, + struct parms *parms, + const struct rte_acl_match_results *p, + uint32_t categories) +{ + uint32_t x; + + for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) { + + int32_t *saved_results = (int32_t *)&parms[n].cmplt->results[x]; + int32_t *saved_priority = (int32_t *)&parms[n].cmplt->priority[x]; + + const int32_t *cur_results = (const int32_t *)&p[transition].results[x]; + const int32_t *cur_priority = (const int32_t *)&p[transition].priority[x]; + + size_t vl = __riscv_vsetvl_e32m1(RTE_ACL_RESULTS_MULTIPLIER); + + /* load current trie results / priority */ + vint32m1_t v_results = __riscv_vle32_v_i32m1(cur_results, vl); + vint32m1_t v_priority = __riscv_vle32_v_i32m1(cur_priority, vl); + + if (parms[n].cmplt->count != ctx->num_tries) { + + /* load running best */ + vint32m1_t v_results1 = __riscv_vle32_v_i32m1(saved_results, vl); + vint32m1_t v_priority1 = __riscv_vle32_v_i32m1(saved_priority, vl); + + /* selector = priority1 > priority */ + vbool32_t mask 
= __riscv_vmsgt_vv_i32m1_b32(v_priority1, v_priority, vl); + + /* results = mask ? results1 : results */ + v_results = __riscv_vmerge_vvm_i32m1(v_results, v_results1, mask, vl); + v_priority = __riscv_vmerge_vvm_i32m1(v_priority, v_priority1, mask, vl); + } + + /* store back running best */ + __riscv_vse32_v_i32m1(saved_results, v_results, vl); + __riscv_vse32_v_i32m1(saved_priority, v_priority, vl); + } +} + +static inline vuint32m1_t +transition4_rvv(vuint32m1_t next_input, + const uint64_t *trans, + uint64_t transitions[4]) +{ + size_t vl = 4; + + vuint64m2_t vtr = __riscv_vle64_v_u64m2(transitions, vl); + + vuint32m1_t lo = __riscv_vnsrl_wx_u32m1(vtr, 0, vl); + vuint32m1_t hi = __riscv_vnsrl_wx_u32m1(vtr, 32, vl); + + vuint32m1_t addr = + __riscv_vxor_vv_u32m1(lo, __riscv_vand_vx_u32m1(lo, ~RTE_ACL_NODE_INDEX, vl), vl); + + vuint32m1_t node_type = + __riscv_vand_vx_u32m1(lo, ~RTE_ACL_NODE_INDEX, vl); + + vbool32_t m_dfa = + __riscv_vmseq_vx_u32m1_b32(node_type, 0, vl); + + vuint32m1_t input = + __riscv_vand_vx_u32m1(next_input, 0xff, vl); + + /* ---------------- DFA ---------------- */ + + vuint32m1_t grp = + __riscv_vsrl_vx_u32m1(input, 6, vl); + + vuint32m1_t shift = + __riscv_vmul_vx_u32m1(grp, RTE_ACL_DFA_GR64_BIT, vl); + + vuint32m1_t dfa_base = + __riscv_vsrl_vv_u32m1(hi, shift, vl); + + vuint32m1_t dfa_x = + __riscv_vsub_vv_u32m1(input, + __riscv_vand_vx_u32m1(dfa_base, UINT8_MAX, vl), + vl); + + /* ---------------- QRANGE ---------------- */ + vuint8m1_t mask = __riscv_vle8_v_u8m1(idx_const, 16); + + vuint8m1_t in = + __riscv_vrgather_vv_u8m1( + __riscv_vreinterpret_v_u32m1_u8m1(next_input), + mask, + 16); + + vint8m1_t in_s8 = + __riscv_vreinterpret_v_u8m1_i8m1(in); + + vuint8m1_t ranges_u8 = + __riscv_vreinterpret_v_u32m1_u8m1(hi); + + vint8m1_t ranges_s8 = + __riscv_vreinterpret_v_u8m1_i8m1(ranges_u8); + + vbool8_t cmp = + __riscv_vmsgt_vv_i8m1_b8(in_s8, ranges_s8, 16); + int32_t q_1 = __riscv_vcpop_m_b8(cmp, 4); + int32_t q_2 = __riscv_vcpop_m_b8(cmp, 8); + 
int32_t q_3 = __riscv_vcpop_m_b8(cmp, 12); + int32_t q_4 = __riscv_vcpop_m_b8(cmp, 16); + uint32_t q_scalar[4] = {q_1, q_2 - q_1, q_3 - q_2, q_4 - q_3}; + vuint32m1_t q_x = __riscv_vle32_v_u32m1(q_scalar, 4); + + + vuint32m1_t x = + __riscv_vmerge_vvm_u32m1(q_x, dfa_x, m_dfa, vl); + + addr = __riscv_vadd_vv_u32m1(addr, x, vl); + + vuint64m2_t addr64 = + __riscv_vwmulu_vx_u64m2(addr, sizeof(uint64_t), vl); + vuint64m2_t next = + __riscv_vloxei64_v_u64m2(trans, addr64, vl); + + __riscv_vse64_v_u64m2(transitions, next, vl); + + return __riscv_vsrl_vx_u32m1(next_input, 8, vl); +} + +/* + * Check for any match in 4 transitions + */ +static __rte_always_inline uint32_t +check_any_match_x4(uint64_t val[]) +{ + return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH; +} + +static __rte_always_inline void +acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, uint64_t transitions[]) +{ + while (check_any_match_x4(transitions)) { + transitions[0] = acl_match_check(transitions[0], slot, ctx, + parms, flows, resolve_priority_rvv); + transitions[1] = acl_match_check(transitions[1], slot + 1, ctx, + parms, flows, resolve_priority_rvv); + transitions[2] = acl_match_check(transitions[2], slot + 2, ctx, + parms, flows, resolve_priority_rvv); + transitions[3] = acl_match_check(transitions[3], slot + 3, ctx, + parms, flows, resolve_priority_rvv); + } +} + +static inline int +search_rvv_4(const struct rte_acl_ctx *ctx, + const uint8_t **data, + uint32_t *results, + int total_packets, + uint32_t categories) +{ + struct acl_flow_data flows; + uint64_t index_array[4]; + struct completion cmplt[4]; + struct parms parms[4]; + vuint32m1_t input; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, + results, total_packets, + categories, ctx->trans_table); + + for (int i = 0; i < 4; i++) + index_array[i] = + acl_start_next_trie(&flows, parms, i, ctx); + + acl_match_check_x4(0, ctx, parms, &flows, index_array); + + while 
(flows.started > 0) { + input = __riscv_vmv_v_x_u32m1(GET_NEXT_4BYTES(parms, 0), 4); + input = __riscv_vslide1down_vx_u32m1( + input, GET_NEXT_4BYTES(parms, 1), 4); + input = __riscv_vslide1down_vx_u32m1( + input, GET_NEXT_4BYTES(parms, 2), 4); + input = __riscv_vslide1down_vx_u32m1( + input, GET_NEXT_4BYTES(parms, 3), 4); + + input = transition4_rvv(input, flows.trans, index_array); + input = transition4_rvv(input, flows.trans, index_array); + input = transition4_rvv(input, flows.trans, index_array); + input = transition4_rvv(input, flows.trans, index_array); + acl_match_check_x4(0, ctx, parms, &flows, index_array); + } + return 0; +} diff --git a/lib/acl/meson.build b/lib/acl/meson.build index 87e9f25f8e..2d2b8d46c6 100644 --- a/lib/acl/meson.build +++ b/lib/acl/meson.build @@ -25,4 +25,6 @@ elif dpdk_conf.has('RTE_ARCH_ARM') sources += files('acl_run_neon.c') elif dpdk_conf.has('RTE_ARCH_PPC_64') sources += files('acl_run_altivec.c') -endif +elif dpdk_conf.has('RTE_ARCH_RISCV') + sources += files('acl_run_rvv.c') +endif diff --git a/lib/acl/rte_acl.c b/lib/acl/rte_acl.c index 3f2b194206..8fc54e8037 100644 --- a/lib/acl/rte_acl.c +++ b/lib/acl/rte_acl.c @@ -8,6 +8,10 @@ #include <rte_acl.h> #include <rte_tailq.h> +#if defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V) +#include <riscv_vector.h> +#endif + #include "acl.h" #include "acl_log.h" @@ -94,6 +98,18 @@ rte_acl_classify_altivec(__rte_unused const struct rte_acl_ctx *ctx, } #endif +#if !defined(RTE_ARCH_RISCV) || !defined(RTE_RISCV_FEATURE_V) +int +rte_acl_classify_rvv(__rte_unused const struct rte_acl_ctx *ctx, + __rte_unused const uint8_t **data, + __rte_unused uint32_t *results, + __rte_unused uint32_t num, + __rte_unused uint32_t categories) +{ + return -ENOTSUP; +} +#endif + static const rte_acl_classify_t classify_fns[] = { [RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar, [RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar, @@ -103,6 +119,7 @@ static const rte_acl_classify_t classify_fns[] = { 
[RTE_ACL_CLASSIFY_ALTIVEC] = rte_acl_classify_altivec, [RTE_ACL_CLASSIFY_AVX512X16] = rte_acl_classify_avx512x16, [RTE_ACL_CLASSIFY_AVX512X32] = rte_acl_classify_avx512x32, + [RTE_ACL_CLASSIFY_RVV] = rte_acl_classify_rvv, }; /* @@ -202,6 +219,28 @@ acl_check_alg_x86(enum rte_acl_classify_alg alg) return -EINVAL; } + +/* + * Helper function for acl_check_alg. + * Check support for RISCV specific classify methods. + */ +static int +acl_check_alg_rvv(enum rte_acl_classify_alg alg) +{ + if (alg == RTE_ACL_CLASSIFY_RVV) { +#if defined(RTE_RISCV_FEATURE_V) + if (__riscv_vsetvl_e32m1(RTE_ACL_RESULTS_MULTIPLIER) >= + RTE_ACL_RESULTS_MULTIPLIER && + __riscv_vsetvl_e32m1(4) >= 4) + return 0; +#endif + return -ENOTSUP; + } + + return -EINVAL; +} + + /* * Check if input alg is supported by given platform/binary. * Note that both conditions should be met: @@ -221,6 +260,8 @@ acl_check_alg(enum rte_acl_classify_alg alg) case RTE_ACL_CLASSIFY_AVX2: case RTE_ACL_CLASSIFY_SSE: return acl_check_alg_x86(alg); + case RTE_ACL_CLASSIFY_RVV: + return acl_check_alg_rvv(alg); /* scalar method is supported on all platforms */ case RTE_ACL_CLASSIFY_SCALAR: return 0; @@ -249,6 +290,8 @@ acl_get_best_alg(void) RTE_ACL_CLASSIFY_AVX512X16, RTE_ACL_CLASSIFY_AVX2, RTE_ACL_CLASSIFY_SSE, +#elif defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V) + RTE_ACL_CLASSIFY_RVV, #endif RTE_ACL_CLASSIFY_SCALAR, }; diff --git a/lib/acl/rte_acl.h b/lib/acl/rte_acl.h index 0db4600cbe..0e9d09511d 100644 --- a/lib/acl/rte_acl.h +++ b/lib/acl/rte_acl.h @@ -303,6 +303,7 @@ enum rte_acl_classify_alg { RTE_ACL_CLASSIFY_ALTIVEC = 5, /**< requires ALTIVEC support. */ RTE_ACL_CLASSIFY_AVX512X16 = 6, /**< requires AVX512 support. */ RTE_ACL_CLASSIFY_AVX512X32 = 7, /**< requires AVX512 support. */ + RTE_ACL_CLASSIFY_RVV = 8, /**< requires RVV support. */ }; /** -- 2.43.0

