Add support for asynchronous batch enqueue/dequeue
of pointers to/from the NPA pool.

Signed-off-by: Ashwin Sekhar T K <[email protected]>
---
 drivers/mempool/cnxk/cn10k_mempool_ops.c | 258 ++++++++++++++++++++++-
 drivers/mempool/cnxk/cnxk_mempool.c      |  19 +-
 drivers/mempool/cnxk/cnxk_mempool.h      |   3 +-
 drivers/mempool/cnxk/cnxk_mempool_ops.c  |  28 +++
 4 files changed, 287 insertions(+), 21 deletions(-)

diff --git a/drivers/mempool/cnxk/cn10k_mempool_ops.c b/drivers/mempool/cnxk/cn10k_mempool_ops.c
index fc7592fd94..131abc0723 100644
--- a/drivers/mempool/cnxk/cn10k_mempool_ops.c
+++ b/drivers/mempool/cnxk/cn10k_mempool_ops.c
@@ -7,11 +7,239 @@
 #include "roc_api.h"
 #include "cnxk_mempool.h"
 
+#define BATCH_ALLOC_SZ ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS /* objs[] length */
+
+enum batch_op_status { /* 0/1 values are relied on by "status = !rc" */
+       BATCH_ALLOC_OP_NOT_ISSUED = 0, /* No batch alloc issued on objs[] */
+       BATCH_ALLOC_OP_ISSUED = 1, /* Issued; objs[] not yet extracted */
+       BATCH_ALLOC_OP_DONE /* Extracted; sz pointers remain in objs[] */
+};
+
+struct batch_op_mem { /* One slot per lcore, indexed by rte_lcore_id() */
+       unsigned int sz; /* Pointers left in objs[] after an extract */
+       enum batch_op_status status; /* State of the batch alloc on objs[] */
+       uint64_t objs[BATCH_ALLOC_SZ] __rte_aligned(ROC_ALIGN);
+};
+
+struct batch_op_data { /* Per-pool state, one instance per mempool */
+       uint64_t lmt_addr; /* roc_idev_lmt_base_addr_get() value, used by enq */
+       struct batch_op_mem mem[RTE_MAX_LCORE] __rte_aligned(ROC_ALIGN);
+};
+
+static struct batch_op_data **batch_op_data; /* Indexed by aura number */
+
+#define BATCH_OP_DATA_GET(pool_id)                                             \
+       batch_op_data[roc_npa_aura_handle_to_aura(pool_id)] /* per-aura slot */
+
+#define BATCH_OP_DATA_SET(pool_id, op_data)                                    \
+       do { /* Store op_data in the table slot of pool_id's aura */           \
+               uint64_t aura = roc_npa_aura_handle_to_aura(pool_id);          \
+               batch_op_data[aura] = op_data;                                 \
+       } while (0)
+
+/* Allocate the zeroed table holding one pointer-sized slot per pool. */
+int
+cn10k_mempool_lf_init(void)
+{
+       const unsigned int nb_pools = roc_idev_npa_maxpools_get();
+       const unsigned int tbl_sz = nb_pools * sizeof(uintptr_t);
+
+       batch_op_data = rte_zmalloc(NULL, tbl_sz, ROC_ALIGN);
+       if (batch_op_data != NULL)
+               return 0;
+
+       /* Allocation failed; the table pointer stays NULL. */
+       return -1;
+}
+
+/* Release the per-pool pointer table, if any, and clear the reference. */
+void
+cn10k_mempool_lf_fini(void)
+{
+       if (batch_op_data != NULL) {
+               rte_free(batch_op_data);
+               batch_op_data = NULL;
+       }
+}
+
+/* Allocate this pool's batch op state and publish it in the pool table. */
+static int
+batch_op_init(struct rte_mempool *mp)
+{
+       struct batch_op_data *pool_data;
+       int lcore;
+
+       RTE_ASSERT(BATCH_OP_DATA_GET(mp->pool_id) == NULL);
+       pool_data = rte_zmalloc(NULL, sizeof(*pool_data), ROC_ALIGN);
+       if (!pool_data)
+               return -1;
+
+       /* Every per-lcore slot starts empty with no batch alloc issued. */
+       for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+               pool_data->mem[lcore].status = BATCH_ALLOC_OP_NOT_ISSUED;
+               pool_data->mem[lcore].sz = 0;
+       }
+       pool_data->lmt_addr = roc_idev_lmt_base_addr_get();
+       BATCH_OP_DATA_SET(mp->pool_id, pool_data);
+       return 0;
+}
+
+/* Give pointers held in the per-lcore slots back, then free the state. */
+static void
+batch_op_fini(struct rte_mempool *mp)
+{
+       struct batch_op_data *pool_data = BATCH_OP_DATA_GET(mp->pool_id);
+       struct batch_op_mem *slot;
+       int lcore;
+
+       rte_wmb();
+       for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+               slot = &pool_data->mem[lcore];
+               /* Collect the result of a batch alloc that was only issued. */
+               if (slot->status == BATCH_ALLOC_OP_ISSUED) {
+                       slot->sz = roc_npa_aura_batch_alloc_extract(
+                               slot->objs, slot->objs, BATCH_ALLOC_SZ);
+                       slot->status = BATCH_ALLOC_OP_DONE;
+               }
+               /* Bulk-free whatever the slot still holds to mp->pool_id. */
+               if (slot->status == BATCH_ALLOC_OP_DONE) {
+                       roc_npa_aura_op_bulk_free(mp->pool_id, slot->objs,
+                                                 slot->sz, 1);
+                       slot->status = BATCH_ALLOC_OP_NOT_ISSUED;
+               }
+       }
+       rte_free(pool_data);
+       BATCH_OP_DATA_SET(mp->pool_id, NULL);
+}
+
+/* Free n object pointers back to the pool's aura; always returns 0. */
+static int __rte_hot
+cn10k_mempool_enq(struct rte_mempool *mp, void *const *obj_table,
+                 unsigned int n)
+{
+       const uint64_t *objs = (const uint64_t *)obj_table;
+       uint64_t lmt_addr = 0, lmt_id = 0;
+       struct batch_op_data *op_data;
+
+       /* Ensure mbuf init changes are written before the free pointers
+        * are enqueued to the stack.
+        */
+       rte_io_wmb();
+
+       if (n != 1) {
+               op_data = BATCH_OP_DATA_GET(mp->pool_id);
+               lmt_addr = op_data->lmt_addr;
+               ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+               roc_npa_aura_op_batch_free(mp->pool_id, objs, n, 1, lmt_addr,
+                                          lmt_id);
+       } else {
+               roc_npa_aura_op_free(mp->pool_id, 1, objs[0]);
+       }
+       return 0;
+}
+
+/* Dequeue n pointers using this lcore's batch alloc slot; all or nothing. */
+static int __rte_hot
+cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+       struct batch_op_data *op_data;
+       struct batch_op_mem *mem;
+       unsigned int count = 0;
+       int tid, rc, retry = 4;
+       bool loop = true;
+
+       op_data = BATCH_OP_DATA_GET(mp->pool_id);
+       tid = rte_lcore_id();
+       mem = &op_data->mem[tid];
+
+       /* Issue batch alloc */
+       if (mem->status == BATCH_ALLOC_OP_NOT_ISSUED) {
+               rc = roc_npa_aura_batch_alloc_issue(mp->pool_id, mem->objs,
+                                                   BATCH_ALLOC_SZ, 0, 1);
+               /* If issue fails, fall back to the default (non-batch) alloc */
+               if (unlikely(rc))
+                       return cnxk_mempool_deq(mp, obj_table, n);
+               mem->status = BATCH_ALLOC_OP_ISSUED;
+       }
+
+       while (loop) {
+               unsigned int cur_sz;
+
+               if (mem->status == BATCH_ALLOC_OP_ISSUED) {
+                       mem->sz = roc_npa_aura_batch_alloc_extract(
+                               mem->objs, mem->objs, BATCH_ALLOC_SZ);
+
+                       /* If partial alloc reduce the retry count */
+                       retry -= (mem->sz != BATCH_ALLOC_SZ);
+                       /* Break the loop if retry count exhausted */
+                       loop = !!retry;
+                       mem->status = BATCH_ALLOC_OP_DONE;
+               }
+
+               cur_sz = n - count;
+               if (cur_sz > mem->sz)
+                       cur_sz = mem->sz;
+
+               /* Dequeue the pointers from the tail of the extracted set */
+               memcpy(&obj_table[count], &mem->objs[mem->sz - cur_sz],
+                      cur_sz * sizeof(uintptr_t));
+               mem->sz -= cur_sz;
+               count += cur_sz;
+
+               /* Break loop if the required pointers have been dequeued */
+               loop &= (count != n);
+
+               /* Issue next batch alloc if pointers are exhausted */
+               if (mem->sz == 0) {
+                       rc = roc_npa_aura_batch_alloc_issue(
+                               mp->pool_id, mem->objs, BATCH_ALLOC_SZ, 0, 1);
+                       /* On failure stop; !rc maps to ISSUED/NOT_ISSUED */
+                       loop &= !rc;
+                       mem->status = !rc;
+               }
+       }
+
+       if (unlikely(count != n)) {
+               /* No partial alloc allowed. Free up allocated pointers */
+               cn10k_mempool_enq(mp, obj_table, count);
+               return -ENOENT;
+       }
+
+       return 0;
+}
+
+/* Count pool objects: per-lcore batch alloc slots plus the common count. */
+static unsigned int
+cn10k_mempool_get_count(const struct rte_mempool *mp)
+{
+       struct batch_op_data *pool_data = BATCH_OP_DATA_GET(mp->pool_id);
+       unsigned int total = 0;
+       int lcore;
+
+       rte_wmb();
+       for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+               struct batch_op_mem *slot = &pool_data->mem[lcore];
+
+               /* Issued, not yet extracted: count taken from objs[]. */
+               if (slot->status == BATCH_ALLOC_OP_ISSUED)
+                       total += roc_npa_aura_batch_alloc_count(
+                               slot->objs, BATCH_ALLOC_SZ);
+
+               /* Already extracted: the remaining count is tracked in sz. */
+               if (slot->status == BATCH_ALLOC_OP_DONE)
+                       total += slot->sz;
+       }
+
+       total += cnxk_mempool_get_count(mp);
+       return total;
+}
+
 static int
 cn10k_mempool_alloc(struct rte_mempool *mp)
 {
        uint32_t block_size;
        size_t padding;
+       int rc;
 
        block_size = mp->elt_size + mp->header_size + mp->trailer_size;
        /* Align header size to ROC_ALIGN */
@@ -29,16 +257,36 @@ cn10k_mempool_alloc(struct rte_mempool *mp)
                block_size += padding;
        }
 
-       return cnxk_mempool_alloc(mp);
+       rc = cnxk_mempool_alloc(mp);
+       if (rc)
+               return rc;
+
+       rc = batch_op_init(mp);
+       if (rc) {
+               plt_err("Failed to init batch alloc mem rc=%d", rc);
+               goto error;
+       }
+
+       return 0;
+error:
+       cnxk_mempool_free(mp);
+       return rc;
+}
+
+static void
+cn10k_mempool_free(struct rte_mempool *mp)
+{
+       batch_op_fini(mp); /* Runs first; it frees pointers into mp->pool_id */
+       cnxk_mempool_free(mp);
 }
 
 static struct rte_mempool_ops cn10k_mempool_ops = {
        .name = "cn10k_mempool_ops",
        .alloc = cn10k_mempool_alloc,
-       .free = cnxk_mempool_free,
-       .enqueue = cnxk_mempool_enq,
-       .dequeue = cnxk_mempool_deq,
-       .get_count = cnxk_mempool_get_count,
+       .free = cn10k_mempool_free, /* batch_op_fini() + cnxk_mempool_free() */
+       .enqueue = cn10k_mempool_enq, /* single free or LMT batch free */
+       .dequeue = cn10k_mempool_deq, /* per-lcore batch alloc */
+       .get_count = cn10k_mempool_get_count, /* includes per-lcore slots */
        .calc_mem_size = cnxk_mempool_calc_mem_size,
        .populate = cnxk_mempool_populate,
 };
diff --git a/drivers/mempool/cnxk/cnxk_mempool.c b/drivers/mempool/cnxk/cnxk_mempool.c
index c24497a6e5..1bbe384fe7 100644
--- a/drivers/mempool/cnxk/cnxk_mempool.c
+++ b/drivers/mempool/cnxk/cnxk_mempool.c
@@ -14,14 +14,11 @@
 #include <rte_pci.h>
 
 #include "roc_api.h"
-#include "cnxk_mempool.h"
 
 #define CNXK_NPA_DEV_NAME       RTE_STR(cnxk_npa_dev_)
 #define CNXK_NPA_DEV_NAME_LEN   (sizeof(CNXK_NPA_DEV_NAME) + PCI_PRI_STR_SIZE)
 #define CNXK_NPA_MAX_POOLS_PARAM "max_pools"
 
-uintptr_t *cnxk_mempool_internal_data;
-
 static inline uint32_t
 npa_aura_size_to_u32(uint8_t val)
 {
@@ -82,33 +79,25 @@ static int
 npa_init(struct rte_pci_device *pci_dev)
 {
        char name[CNXK_NPA_DEV_NAME_LEN];
-       size_t idata_offset, idata_sz;
        const struct rte_memzone *mz;
        struct roc_npa *dev;
-       int rc, maxpools;
+       int rc;
 
        rc = plt_init();
        if (rc < 0)
                goto error;
 
-       maxpools = parse_aura_size(pci_dev->device.devargs);
-       /* Add the space for per-pool internal data pointers to memzone len */
-       idata_offset = RTE_ALIGN_CEIL(sizeof(*dev), ROC_ALIGN);
-       idata_sz = maxpools * sizeof(uintptr_t);
-
        rc = -ENOMEM;
        mz = rte_memzone_reserve_aligned(npa_dev_to_name(pci_dev, name),
-                                        idata_offset + idata_sz, SOCKET_ID_ANY,
-                                        0, RTE_CACHE_LINE_SIZE);
+                                        sizeof(*dev), SOCKET_ID_ANY, 0,
+                                        RTE_CACHE_LINE_SIZE);
        if (mz == NULL)
                goto error;
 
        dev = mz->addr;
        dev->pci_dev = pci_dev;
-       cnxk_mempool_internal_data = (uintptr_t *)(mz->addr_64 + idata_offset);
-       memset(cnxk_mempool_internal_data, 0, idata_sz);
 
-       roc_idev_npa_maxpools_set(maxpools);
+       roc_idev_npa_maxpools_set(parse_aura_size(pci_dev->device.devargs));
        rc = roc_npa_dev_init(dev);
        if (rc)
                goto mz_free;
diff --git a/drivers/mempool/cnxk/cnxk_mempool.h b/drivers/mempool/cnxk/cnxk_mempool.h
index 8f226f861c..6e54346e6a 100644
--- a/drivers/mempool/cnxk/cnxk_mempool.h
+++ b/drivers/mempool/cnxk/cnxk_mempool.h
@@ -23,6 +23,7 @@ int __rte_hot cnxk_mempool_enq(struct rte_mempool *mp, void *const *obj_table,
 int __rte_hot cnxk_mempool_deq(struct rte_mempool *mp, void **obj_table,
                               unsigned int n);
 
-extern uintptr_t *cnxk_mempool_internal_data;
+int cn10k_mempool_lf_init(void);  /* 0 on success, -1 if allocation fails */
+void cn10k_mempool_lf_fini(void); /* Frees what cn10k_mempool_lf_init set up */
 
 #endif
diff --git a/drivers/mempool/cnxk/cnxk_mempool_ops.c b/drivers/mempool/cnxk/cnxk_mempool_ops.c
index 29a4c12208..18f125c7ac 100644
--- a/drivers/mempool/cnxk/cnxk_mempool_ops.c
+++ b/drivers/mempool/cnxk/cnxk_mempool_ops.c
@@ -2,6 +2,7 @@
  * Copyright(C) 2021 Marvell.
  */
 
+#include <rte_mbuf_pool_ops.h>
 #include <rte_mempool.h>
 
 #include "roc_api.h"
@@ -171,3 +172,30 @@ cnxk_mempool_populate(struct rte_mempool *mp, unsigned int max_objs,
                mp, RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ, max_objs, vaddr, iova,
                len, obj_cb, obj_cb_arg);
 }
+
+/* Select the platform mbuf pool ops by SoC model; set up cn10k state. */
+static int
+cnxk_mempool_lf_init(void)
+{
+       if (!roc_model_is_cn10k()) {
+               rte_mbuf_set_platform_mempool_ops("cn9k_mempool_ops");
+               return 0;
+       }
+
+       /* cn10k additionally needs its driver-level init to succeed. */
+       rte_mbuf_set_platform_mempool_ops("cn10k_mempool_ops");
+       return cn10k_mempool_lf_init();
+}
+
+static void
+cnxk_mempool_lf_fini(void)
+{
+       if (roc_model_is_cn10k()) /* Only the cn10k path has an lf_init */
+               cn10k_mempool_lf_fini();
+}
+
+RTE_INIT(cnxk_mempool_ops_init)
+{ /* Register this file's LF init/fini callbacks */
+       roc_npa_lf_init_cb_register(cnxk_mempool_lf_init);
+       roc_npa_lf_fini_cb_register(cnxk_mempool_lf_fini);
+}
-- 
2.29.2

Reply via email to