> From: Tang ShiHao <[email protected]>
> 
> Implement the rte_acl classify function for the RISC-V architecture
> using the RISC-V Vector Extension (RVV) instruction set.
> Verified with the testacl and acl_autotest applications on the RISC-V architecture.
> 
> Performance improvements measured with dpdk-test-acl:
> 
> - 100 rules / 100 traces: 57.3 → 48.6 cycles/pkt (~1.18x)
> - 1k rules / 1k traces: 13.6 → 8.3 cycles/pkt (~1.64x)
> - 10k rules / 1M traces: 36.6 → 27.7 cycles/pkt (~1.32x)
> 
> Throughput improvement up to ~64%.
> 
> This patch was co-developed with Gong Xiaofei.
> 
> Signed-off-by: gong-flying <[email protected]>
> Signed-off-by: Tang ShiHao <[email protected]>

There is already a patch acked by me:
https://patchwork.dpdk.org/project/dpdk/patch/[email protected]/
that provides identical functionality.
Could you please explain why a new one is required?
BTW, if you did spot any issues with the patch above, please
provide your comments.
Thanks
Konstantin

> ---
>  app/test-acl/main.c   |   4 +
>  lib/acl/acl.h         |   4 +
>  lib/acl/acl_run_rvv.c |  19 ++++
>  lib/acl/acl_run_rvv.h | 210 ++++++++++++++++++++++++++++++++++++++++++
>  lib/acl/meson.build   |   4 +-
>  lib/acl/rte_acl.c     |  43 +++++++++
>  lib/acl/rte_acl.h     |   1 +
>  7 files changed, 284 insertions(+), 1 deletion(-)
>  create mode 100644 lib/acl/acl_run_rvv.c
>  create mode 100644 lib/acl/acl_run_rvv.h
> 
> diff --git a/app/test-acl/main.c b/app/test-acl/main.c
> index 3a791b3ccf..ad0bc89644 100644
> --- a/app/test-acl/main.c
> +++ b/app/test-acl/main.c
> @@ -97,6 +97,10 @@ static const struct acl_alg acl_alg[] = {
>               .name = "avx512x32",
>               .alg = RTE_ACL_CLASSIFY_AVX512X32,
>       },
> +     {
> +             .name = "rvv",
> +             .alg = RTE_ACL_CLASSIFY_RVV,
> +     },
>  };
> 
>  static struct {
> diff --git a/lib/acl/acl.h b/lib/acl/acl.h
> index 9c85a3d58a..af202a84ed 100644
> --- a/lib/acl/acl.h
> +++ b/lib/acl/acl.h
> @@ -226,6 +226,10 @@ int
>  rte_acl_classify_altivec(const struct rte_acl_ctx *ctx, const uint8_t **data,
>       uint32_t *results, uint32_t num, uint32_t categories);
> 
> +int
> +rte_acl_classify_rvv(const struct rte_acl_ctx *ctx, const uint8_t **data,
> +     uint32_t *results, uint32_t num, uint32_t categories);
> +
>  #ifdef __cplusplus
>  }
>  #endif /* __cplusplus */
> diff --git a/lib/acl/acl_run_rvv.c b/lib/acl/acl_run_rvv.c
> new file mode 100644
> index 0000000000..2b53e28213
> --- /dev/null
> +++ b/lib/acl/acl_run_rvv.c
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2026 Institute of Software Chinese Academy of Sciences 
> (ISCAS)
> + */
> +
> +#if defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V)
> +
> +#include "acl_run_rvv.h"
> +
> +int
> +rte_acl_classify_rvv(const struct rte_acl_ctx *ctx, const uint8_t **data,
> +                   uint32_t *results, uint32_t num, uint32_t categories)
> +{
> +     if (num >= 4)
> +             return search_rvv_4(ctx, data, results, num, categories);
> +     else
> +             return rte_acl_classify_scalar(ctx, data, results, num, 
> categories);
> +}
> +
> +#endif
> diff --git a/lib/acl/acl_run_rvv.h b/lib/acl/acl_run_rvv.h
> new file mode 100644
> index 0000000000..ed21ce2ba6
> --- /dev/null
> +++ b/lib/acl/acl_run_rvv.h
> @@ -0,0 +1,210 @@
> +#include <stdalign.h>
> +
> +#include "acl_run.h"
> +
> +#include <riscv_vector.h>
> +
> +
> +static const uint8_t idx_const[16] = {
> +     0, 0, 0, 0, 4, 4, 4, 4,
> +     8, 8, 8, 8, 12, 12, 12, 12
> +};
> +
> +/*
> + * Resolve priority for multiple results (scalar version).
> + * This consists comparing the priority of the current traversal with the
> + * running set of results for the packet.
> + * For each result, keep a running array of the result (rule number) and
> + * its priority for each category.
> + */
> +static inline void
> +resolve_priority_rvv(uint64_t transition, int n,
> +                                             const struct rte_acl_ctx *ctx,
> +                                             struct parms *parms,
> +                                             const struct
> rte_acl_match_results *p,
> +                                             uint32_t categories)
> +{
> +     uint32_t x;
> +
> +     for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
> +
> +             int32_t *saved_results  = (int32_t 
> *)&parms[n].cmplt->results[x];
> +             int32_t *saved_priority = (int32_t 
> *)&parms[n].cmplt->priority[x];
> +
> +             const int32_t *cur_results  = (const int32_t
> *)&p[transition].results[x];
> +             const int32_t *cur_priority = (const int32_t
> *)&p[transition].priority[x];
> +
> +             size_t vl = __riscv_vsetvl_e32m1(RTE_ACL_RESULTS_MULTIPLIER);
> +
> +             /* load current trie results / priority */
> +             vint32m1_t v_results  = __riscv_vle32_v_i32m1(cur_results, vl);
> +             vint32m1_t v_priority = __riscv_vle32_v_i32m1(cur_priority, vl);
> +
> +             if (parms[n].cmplt->count != ctx->num_tries) {
> +
> +                     /* load running best */
> +                     vint32m1_t v_results1  =
> __riscv_vle32_v_i32m1(saved_results, vl);
> +                     vint32m1_t v_priority1 =
> __riscv_vle32_v_i32m1(saved_priority, vl);
> +
> +                     /* selector = priority1 > priority */
> +                     vbool32_t mask =
> __riscv_vmsgt_vv_i32m1_b32(v_priority1, v_priority, vl);
> +
> +                     /* results = mask ? results1 : results */
> +                     v_results  = __riscv_vmerge_vvm_i32m1(v_results,
> v_results1, mask, vl);
> +                     v_priority = __riscv_vmerge_vvm_i32m1(v_priority,
> v_priority1, mask, vl);
> +             }
> +
> +             /* store back running best */
> +             __riscv_vse32_v_i32m1(saved_results,  v_results,  vl);
> +             __riscv_vse32_v_i32m1(saved_priority, v_priority, vl);
> +     }
> +}
> +
> +vuint32m1_t
> +transition4_rvv(vuint32m1_t next_input,
> +                             const uint64_t *trans,
> +                             uint64_t transitions[4])
> +{
> +     size_t vl = 4;
> +
> +     vuint64m2_t vtr = __riscv_vle64_v_u64m2(transitions, vl);
> +
> +     vuint32m1_t lo = __riscv_vnsrl_wx_u32m1(vtr, 0, vl);
> +     vuint32m1_t hi = __riscv_vnsrl_wx_u32m1(vtr, 32, vl);
> +
> +     vuint32m1_t addr =
> +             __riscv_vxor_vv_u32m1(lo, __riscv_vand_vx_u32m1(lo,
> ~RTE_ACL_NODE_INDEX, vl), vl);
> +
> +     vuint32m1_t node_type =
> +             __riscv_vand_vx_u32m1(lo, ~RTE_ACL_NODE_INDEX, vl);
> +
> +     vbool32_t m_dfa =
> +             __riscv_vmseq_vx_u32m1_b32(node_type, 0, vl);
> +
> +     vuint32m1_t input =
> +             __riscv_vand_vx_u32m1(next_input, 0xff, vl);
> +
> +     /* ---------------- DFA ---------------- */
> +
> +     vuint32m1_t grp =
> +             __riscv_vsrl_vx_u32m1(input, 6, vl);
> +
> +     vuint32m1_t shift =
> +             __riscv_vmul_vx_u32m1(grp, RTE_ACL_DFA_GR64_BIT, vl);
> +
> +     vuint32m1_t dfa_base =
> +             __riscv_vsrl_vv_u32m1(hi, shift, vl);
> +
> +     vuint32m1_t dfa_x =
> +             __riscv_vsub_vv_u32m1(input,
> +                     __riscv_vand_vx_u32m1(dfa_base, UINT8_MAX, vl),
> +                     vl);
> +
> +     /* ---------------- QRANGE ---------------- */
> +     vuint8m1_t mask = __riscv_vle8_v_u8m1(idx_const, 16);
> +
> +     vuint8m1_t in =
> +             __riscv_vrgather_vv_u8m1(
> +                     __riscv_vreinterpret_v_u32m1_u8m1(next_input),
> +                     mask,
> +                     16);
> +
> +     vint8m1_t in_s8 =
> +             __riscv_vreinterpret_v_u8m1_i8m1(in);
> +
> +     vuint8m1_t ranges_u8 =
> +             __riscv_vreinterpret_v_u32m1_u8m1(hi);
> +
> +     vint8m1_t ranges_s8 =
> +             __riscv_vreinterpret_v_u8m1_i8m1(ranges_u8);
> +
> +     vbool8_t cmp =
> +             __riscv_vmsgt_vv_i8m1_b8(in_s8, ranges_s8, 16);
> +     int32_t q_1 = __riscv_vcpop_m_b8(cmp, 4);
> +     int32_t q_2 = __riscv_vcpop_m_b8(cmp, 8);
> +     int32_t q_3 = __riscv_vcpop_m_b8(cmp, 12);
> +     int32_t q_4 = __riscv_vcpop_m_b8(cmp, 16);
> +     uint32_t q_scalar[4] = {q_1, q_2 - q_1, q_3 - q_2, q_4 - q_3};
> +     vuint32m1_t q_x = __riscv_vle32_v_u32m1(q_scalar, 4);
> +
> +
> +     vuint32m1_t x =
> +             __riscv_vmerge_vvm_u32m1(q_x, dfa_x, m_dfa, vl);
> +
> +     addr = __riscv_vadd_vv_u32m1(addr, x, vl);
> +
> +     vuint64m2_t addr64 =
> +             __riscv_vwmulu_vx_u64m2(addr, sizeof(uint64_t), vl);
> +     vuint64m2_t next =
> +             __riscv_vloxei64_v_u64m2(trans, addr64, vl);
> +
> +     __riscv_vse64_v_u64m2(transitions, next, vl);
> +
> +     return __riscv_vsrl_vx_u32m1(next_input, 8, vl);
> +}
> +
> +/*
> + * Check for any match in 4 transitions
> + */
> +static __rte_always_inline uint32_t
> +check_any_match_x4(uint64_t val[])
> +{
> +     return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH;
> +}
> +
> +static __rte_always_inline void
> +acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms 
> *parms,
> +                     struct acl_flow_data *flows, uint64_t transitions[])
> +{
> +     while (check_any_match_x4(transitions)) {
> +             transitions[0] = acl_match_check(transitions[0], slot, ctx,
> +                     parms, flows, resolve_priority_rvv);
> +             transitions[1] = acl_match_check(transitions[1], slot + 1, ctx,
> +                     parms, flows, resolve_priority_rvv);
> +             transitions[2] = acl_match_check(transitions[2], slot + 2, ctx,
> +                     parms, flows, resolve_priority_rvv);
> +             transitions[3] = acl_match_check(transitions[3], slot + 3, ctx,
> +                     parms, flows, resolve_priority_rvv);
> +     }
> +}
> +
> +static inline int
> +search_rvv_4(const struct rte_acl_ctx *ctx,
> +                     const uint8_t **data,
> +                     uint32_t *results,
> +                     int total_packets,
> +                     uint32_t categories)
> +{
> +     struct acl_flow_data flows;
> +     uint64_t index_array[4];
> +     struct completion cmplt[4];
> +     struct parms parms[4];
> +     vuint32m1_t input;
> +
> +     acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data,
> +                             results, total_packets,
> +                             categories, ctx->trans_table);
> +
> +     for (int i = 0; i < 4; i++)
> +             index_array[i] =
> +                     acl_start_next_trie(&flows, parms, i, ctx);
> +
> +     acl_match_check_x4(0, ctx, parms, &flows, index_array);
> +
> +     while (flows.started > 0) {
> +             input = __riscv_vmv_v_x_u32m1(GET_NEXT_4BYTES(parms, 0),
> 4);
> +             input = __riscv_vslide1down_vx_u32m1(
> +                             input, GET_NEXT_4BYTES(parms, 1), 4);
> +             input = __riscv_vslide1down_vx_u32m1(
> +                             input, GET_NEXT_4BYTES(parms, 2), 4);
> +             input = __riscv_vslide1down_vx_u32m1(
> +                             input, GET_NEXT_4BYTES(parms, 3), 4);
> +
> +             input = transition4_rvv(input, flows.trans, index_array);
> +             input = transition4_rvv(input, flows.trans, index_array);
> +             input = transition4_rvv(input, flows.trans, index_array);
> +             input = transition4_rvv(input, flows.trans, index_array);
> +             acl_match_check_x4(0, ctx, parms, &flows, index_array);
> +     }
> +     return 0;
> +}
> diff --git a/lib/acl/meson.build b/lib/acl/meson.build
> index 87e9f25f8e..2d2b8d46c6 100644
> --- a/lib/acl/meson.build
> +++ b/lib/acl/meson.build
> @@ -25,4 +25,6 @@ elif dpdk_conf.has('RTE_ARCH_ARM')
>      sources += files('acl_run_neon.c')
>  elif dpdk_conf.has('RTE_ARCH_PPC_64')
>      sources += files('acl_run_altivec.c')
> -endif
> +elif dpdk_conf.has('RTE_ARCH_RISCV')
> +    sources += files('acl_run_rvv.c')
> +endif
> \ No newline at end of file
> diff --git a/lib/acl/rte_acl.c b/lib/acl/rte_acl.c
> index 3f2b194206..8fc54e8037 100644
> --- a/lib/acl/rte_acl.c
> +++ b/lib/acl/rte_acl.c
> @@ -8,6 +8,10 @@
>  #include <rte_acl.h>
>  #include <rte_tailq.h>
> 
> +#if defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V)
> +#include <riscv_vector.h>
> +#endif
> +
>  #include "acl.h"
>  #include "acl_log.h"
> 
> @@ -94,6 +98,18 @@ rte_acl_classify_altivec(__rte_unused const struct
> rte_acl_ctx *ctx,
>  }
>  #endif
> 
> +#ifndef RTE_ARCH_RISCV
> +int
> +rte_acl_classify_rvv(__rte_unused const struct rte_acl_ctx *ctx,
> +     __rte_unused const uint8_t **data,
> +     __rte_unused uint32_t *results,
> +     __rte_unused uint32_t num,
> +     __rte_unused uint32_t categories)
> +{
> +     return -ENOTSUP;
> +}
> +#endif
> +
>  static const rte_acl_classify_t classify_fns[] = {
>       [RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
>       [RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
> @@ -103,6 +119,7 @@ static const rte_acl_classify_t classify_fns[] = {
>       [RTE_ACL_CLASSIFY_ALTIVEC] = rte_acl_classify_altivec,
>       [RTE_ACL_CLASSIFY_AVX512X16] = rte_acl_classify_avx512x16,
>       [RTE_ACL_CLASSIFY_AVX512X32] = rte_acl_classify_avx512x32,
> +     [RTE_ACL_CLASSIFY_RVV] = rte_acl_classify_rvv,
>  };
> 
>  /*
> @@ -202,6 +219,28 @@ acl_check_alg_x86(enum rte_acl_classify_alg alg)
>       return -EINVAL;
>  }
> 
> +
> +/*
> + * Helper function for acl_check_alg.
> + * Check support for RISCV specific classify methods.
> + */
> +static int
> +acl_check_alg_rvv(enum rte_acl_classify_alg alg)
> +{
> +     if (alg == RTE_ACL_CLASSIFY_RVV) {
> +#if defined(RTE_RISCV_FEATURE_V)
> +             if (__riscv_vsetvl_e32m1(RTE_ACL_RESULTS_MULTIPLIER) >=
> +                             RTE_ACL_RESULTS_MULTIPLIER &&
> +                             __riscv_vsetvl_e32m1(4) >= 4)
> +                     return 0;
> +#endif
> +             return -ENOTSUP;
> +     }
> +
> +     return -EINVAL;
> +}
> +
> +
>  /*
>   * Check if input alg is supported by given platform/binary.
>   * Note that both conditions should be met:
> @@ -221,6 +260,8 @@ acl_check_alg(enum rte_acl_classify_alg alg)
>       case RTE_ACL_CLASSIFY_AVX2:
>       case RTE_ACL_CLASSIFY_SSE:
>               return acl_check_alg_x86(alg);
> +     case RTE_ACL_CLASSIFY_RVV:
> +             return acl_check_alg_rvv(alg);
>       /* scalar method is supported on all platforms */
>       case RTE_ACL_CLASSIFY_SCALAR:
>               return 0;
> @@ -249,6 +290,8 @@ acl_get_best_alg(void)
>               RTE_ACL_CLASSIFY_AVX512X16,
>               RTE_ACL_CLASSIFY_AVX2,
>               RTE_ACL_CLASSIFY_SSE,
> +#elif defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V)
> +             RTE_ACL_CLASSIFY_RVV,
>  #endif
>               RTE_ACL_CLASSIFY_SCALAR,
>       };
> diff --git a/lib/acl/rte_acl.h b/lib/acl/rte_acl.h
> index 0db4600cbe..0e9d09511d 100644
> --- a/lib/acl/rte_acl.h
> +++ b/lib/acl/rte_acl.h
> @@ -303,6 +303,7 @@ enum rte_acl_classify_alg {
>       RTE_ACL_CLASSIFY_ALTIVEC = 5,    /**< requires ALTIVEC support. */
>       RTE_ACL_CLASSIFY_AVX512X16 = 6,  /**< requires AVX512 support. */
>       RTE_ACL_CLASSIFY_AVX512X32 = 7,  /**< requires AVX512 support. */
> +     RTE_ACL_CLASSIFY_RVV = 8,  /**< requires RVV support. */
>  };
> 
>  /**
> --
> 2.43.0
> 

Reply via email to