Implement the dmadev fast path for the AMD AE4DMA PMD.

This commit adds:
 - copy enqueue (rte_dma_copy): write an AE4DMA descriptor for a
   memory-to-memory transfer; on RTE_DMA_OP_FLAG_SUBMIT the doorbell
   is rung immediately.
 - submit (rte_dma_submit): advance the per-queue write_idx
   register to expose pending descriptors to the hardware.
 - completion (rte_dma_completed / rte_dma_completed_status):
   completion is detected via the hardware's per-queue read_idx
   register, which the engine advances as it processes descriptors.
   The descriptor status / err_code bytes are read only to classify
   each drained slot as success or failure, and HW error codes are
   translated to the dmadev RTE_DMA_STATUS_* enumeration.
 - burst capacity (rte_dma_burst_capacity): report the number of
   free descriptor slots, taking into account the one slot reserved
   to distinguish full from empty on the power-of-two ring.

The fast path entry points are wired through fp_obj in
ae4dma_dmadev_create(). The fill capability is not advertised;
fp_obj->fill is left zero-initialised.

Signed-off-by: Raghavendra Ningoji <[email protected]>
---
 doc/guides/dmadevs/ae4dma.rst      |  22 +++
 drivers/dma/ae4dma/ae4dma_dmadev.c | 288 +++++++++++++++++++++++++++++
 2 files changed, 310 insertions(+)

diff --git a/doc/guides/dmadevs/ae4dma.rst b/doc/guides/dmadevs/ae4dma.rst
index a85c1d92ca..37a2096ccf 100644
--- a/doc/guides/dmadevs/ae4dma.rst
+++ b/doc/guides/dmadevs/ae4dma.rst
@@ -51,3 +51,25 @@ On probe the PMD performs the following steps for each PCI function:
   IOVA-contiguous memory, programs the queue base address and ring
   depth into the per-queue registers, and enables the queue.
 * Interrupts are masked; completion is polled by the application.
+
+Usage
+-----
+
+Once a dmadev has been started, copies are submitted with
+``rte_dma_copy()`` and completions are reaped with ``rte_dma_completed()``
+or ``rte_dma_completed_status()``. See the
+:ref:`Enqueue / Dequeue API <dmadev_enqueue_dequeue>` section of the
+dmadev library documentation for details.
+
+Limitations
+-----------
+
+* Only memory-to-memory copies are supported. Fill, scatter-gather and
+  any other operation types are not advertised in
+  ``rte_dma_info::dev_capa``.
+* The maximum number of descriptors per virtual channel is fixed by
+  hardware at 32. The PMD rounds the requested ring size up to a
+  power of two and clamps it to 32.
+* Only a single virtual channel per dmadev is supported; use the 16
+  per-PCI-function dmadevs to obtain channel-level parallelism.
+* Interrupt-driven completion is not supported.
diff --git a/drivers/dma/ae4dma/ae4dma_dmadev.c b/drivers/dma/ae4dma/ae4dma_dmadev.c
index dfda723c13..0f223fc40c 100644
--- a/drivers/dma/ae4dma/ae4dma_dmadev.c
+++ b/drivers/dma/ae4dma/ae4dma_dmadev.c
@@ -167,6 +167,73 @@ ae4dma_dev_close(struct rte_dma_dev *dev)
        cmd_q->qbase_phys_addr = 0;
        return 0;
 }
+
+/*
+ * Ring the doorbell: publish next_write through the queue's write_idx
+ * register so the hardware sees every descriptor enqueued so far, then
+ * add the number of newly exposed descriptors to stats.submitted.
+ */
+static inline void
+__submit(struct ae4dma_dmadev *ae4dma)
+{
+       struct ae4dma_cmd_queue *q = &ae4dma->cmd_q;
+       const uint16_t ring_size = q->qcfg.nb_desc;
+       const uint16_t tail = q->next_write;
+
+       AE4DMA_WRITE_REG(&q->hwq_regs->write_idx, tail);
+
+       if (ring_size != 0) {
+               /* Ring distance travelled since the previous doorbell. */
+               const uint16_t newly_exposed =
+                       (uint16_t)((tail - q->last_write + ring_size) % ring_size);
+
+               q->stats.submitted += newly_exposed;
+       }
+
+       q->last_write = tail;
+}
+
+/*
+ * dmadev submit hook: ring the doorbell for the single command queue.
+ * The vchan argument is ignored. Always returns 0.
+ */
+static int
+ae4dma_submit(void *dev_private, uint16_t vchan __rte_unused)
+{
+       __submit((struct ae4dma_dmadev *)dev_private);
+
+       return 0;
+}
+
+/*
+ * Fill the next free ring slot with a memory-to-memory copy descriptor.
+ *
+ * Returns the index of the ring slot that was written, -EINVAL when the
+ * ring has no descriptors configured, or -ENOSPC when the ring is full.
+ * When RTE_DMA_OP_FLAG_SUBMIT is set in flags the doorbell is rung
+ * before returning.
+ */
+static inline int
+__write_desc_copy(void *dev_private, rte_iova_t src, rte_iova_t dst,
+               uint32_t len, uint64_t flags)
+{
+       struct ae4dma_dmadev *ae4dma = dev_private;
+       struct ae4dma_cmd_queue *q = &ae4dma->cmd_q;
+       const uint16_t ring_size = q->qcfg.nb_desc;
+       const uint16_t slot = q->next_write;
+       struct ae4dma_desc *desc;
+
+       if (ring_size == 0)
+               return -EINVAL;
+
+       /* One slot always stays unused so a full ring never looks empty. */
+       if ((uint32_t)q->ring_buff_count >= (uint32_t)(ring_size - 1))
+               return -ENOSPC;
+
+       /* Start from an all-zero descriptor, then set the copy fields. */
+       desc = &q->qbase_desc[slot];
+       memset(desc, 0, sizeof(*desc));
+       desc->length = len;
+       desc->src_hi = upper_32_bits(src);
+       desc->src_lo = lower_32_bits(src);
+       desc->dst_hi = upper_32_bits(dst);
+       desc->dst_lo = lower_32_bits(dst);
+
+       /* Account for the new descriptor and advance the producer index. */
+       q->next_write = (uint16_t)((slot + 1) % ring_size);
+       q->ring_buff_count++;
+
+       if (flags & RTE_DMA_OP_FLAG_SUBMIT)
+               __submit(ae4dma);
+
+       return slot;
+}
+
+/*
+ * dmadev copy hook: queue one memory-to-memory copy on the device's
+ * command queue. The vchan argument is ignored. Returns whatever
+ * __write_desc_copy() returns (ring slot index or negative errno).
+ */
+static int
+ae4dma_enqueue_copy(void *dev_private, uint16_t vchan __rte_unused,
+               rte_iova_t src, rte_iova_t dst, uint32_t length, uint64_t flags)
+{
+       const int slot_or_err =
+               __write_desc_copy(dev_private, src, dst, length, flags);
+
+       return slot_or_err;
+}
+
 /* Dump DMA device info. */
 static int
 ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f)
@@ -197,6 +264,220 @@ ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f)
                cmd_q->stats.errors);
        return 0;
 }
+
+/*
+ * Map an AE4DMA hardware error code onto the dmadev status enumeration.
+ * Codes without a dedicated dmadev equivalent (including any value not
+ * listed below) become RTE_DMA_STATUS_ERROR_UNKNOWN.
+ */
+static inline enum rte_dma_status_code
+__translate_status_ae4dma_to_dma(enum ae4dma_dma_err status)
+{
+       enum rte_dma_status_code code = RTE_DMA_STATUS_ERROR_UNKNOWN;
+
+       AE4DMA_PMD_DEBUG("ae4dma desc status = %d", status);
+
+       switch (status) {
+       case AE4DMA_DMA_ERR_NO_ERR:
+               code = RTE_DMA_STATUS_SUCCESSFUL;
+               break;
+       case AE4DMA_DMA_ERR_INV_LEN:
+               code = RTE_DMA_STATUS_INVALID_LENGTH;
+               break;
+       case AE4DMA_DMA_ERR_INV_SRC:
+               code = RTE_DMA_STATUS_INVALID_SRC_ADDR;
+               break;
+       case AE4DMA_DMA_ERR_INV_DST:
+               code = RTE_DMA_STATUS_INVALID_DST_ADDR;
+               break;
+       case AE4DMA_DMA_ERR_INV_ALIGN:
+               /* "POISION" (sic) is the spelling of the DPDK public enum. */
+               code = RTE_DMA_STATUS_DATA_POISION;
+               break;
+       case AE4DMA_DMA_ERR_INV_HEADER:
+       case AE4DMA_DMA_ERR_INV_STATUS:
+       default:
+               /* Keep the RTE_DMA_STATUS_ERROR_UNKNOWN preset. */
+               break;
+       }
+
+       return code;
+}
+
+/*
+ * Scan HW queue for completed descriptors (non-blocking).
+ *
+ * The AE4DMA engine signals completion by advancing the per-queue
+ * `read_idx` register; it does not (reliably) write a status value
+ * back into the descriptor. We therefore use the HW `read_idx`
+ * register as the source of truth and only inspect the descriptor's
+ * `dw1.status` / `dw1.err_code` bytes to classify each completion as
+ * success or failure.
+ *
+ * The error code of every drained descriptor is stored in
+ * cmd_q->status[], indexed by its position within this call, so the
+ * caller can translate it afterwards. A descriptor flagged
+ * AE4DMA_DMA_DESC_ERROR whose err_code is still zero is stored as
+ * AE4DMA_DMA_ERR_INV_STATUS, so that a slot counted in failed_count
+ * is never recorded as AE4DMA_DMA_ERR_NO_ERR.
+ *
+ * Index arithmetic masks with (nb_desc - 1) and therefore relies on
+ * the ring size being a power of two.
+ *
+ * @param cmd_q
+ *   The AE4DMA command queue.
+ * @param max_ops
+ *   Maximum descriptors to process this call.
+ * @param[out] failed_count
+ *   Number of completed descriptors that did not report success.
+ * @return
+ *   Number of descriptors completed (success + failure), <= max_ops.
+ */
+static inline uint16_t
+ae4dma_scan_hwq(struct ae4dma_cmd_queue *cmd_q, uint16_t max_ops,
+               uint16_t *failed_count)
+{
+       volatile struct ae4dma_desc *hw_desc;
+       uint16_t events_count = 0, fails = 0;
+       uint16_t tail;
+       uint16_t nb = cmd_q->qcfg.nb_desc;
+       uint16_t mask;
+       uint16_t hw_read_idx;
+       uint16_t in_flight;
+       uint16_t scan_cap;
+
+       /* Every early return below reports zero failures. */
+       *failed_count = 0;
+
+       if (nb == 0 || cmd_q->ring_buff_count == 0)
+               return 0;
+       mask = nb - 1;
+
+       hw_read_idx = (uint16_t)(AE4DMA_READ_REG(&cmd_q->hwq_regs->read_idx) &
+                       mask);
+       tail = cmd_q->next_read;
+
+       /*
+        * Descriptors completed since our last visit live in the
+        * half-open ring range [tail, hw_read_idx). If HW hasn't
+        * moved we have nothing to do.
+        */
+       in_flight = (uint16_t)((hw_read_idx - tail) & mask);
+       if (in_flight == 0)
+               return 0;
+
+       /*
+        * Never drain more than the caller asked for, than the ring can
+        * hold, than HW has completed, or than software has outstanding.
+        */
+       scan_cap = max_ops;
+       if (scan_cap > AE4DMA_DESCRIPTORS_PER_CMDQ)
+               scan_cap = AE4DMA_DESCRIPTORS_PER_CMDQ;
+       if (scan_cap > in_flight)
+               scan_cap = in_flight;
+       if (scan_cap > cmd_q->ring_buff_count)
+               scan_cap = (uint16_t)cmd_q->ring_buff_count;
+
+       while (events_count < scan_cap) {
+               uint8_t hw_status;
+               uint8_t hw_err;
+               enum ae4dma_dma_err err;
+
+               hw_desc = &cmd_q->qbase_desc[tail];
+               hw_status = hw_desc->dw1.status;
+               hw_err = hw_desc->dw1.err_code;
+               err = (enum ae4dma_dma_err)hw_err;
+
+               /*
+                * read_idx advancing is the definitive completion
+                * signal. The per-descriptor status byte is informational
+                * and may not yet be written when we observe it:
+                *
+                *   AE4DMA_DMA_DESC_ERROR (4)
+                *     Hard failure - err_code names the precise cause.
+                *   AE4DMA_DMA_DESC_COMPLETED (3) or 0
+                *     Success.
+                *   AE4DMA_DMA_DESC_VALIDATED (1) / _PROCESSED (2)
+                *     Benign race: HW had not finished updating the
+                *     status byte at the instant we read it. Since
+                *     read_idx has moved past this slot, treat it as
+                *     success unless err_code says otherwise.
+                *
+                * A non-zero err_code is treated as a failure regardless
+                * of the observed status value.
+                */
+               if (hw_status == AE4DMA_DMA_DESC_ERROR ||
+                               hw_err != AE4DMA_DMA_ERR_NO_ERR) {
+                       fails++;
+                       AE4DMA_PMD_WARN("Desc failed: status=%u err=%u",
+                                       hw_status, hw_err);
+                       /*
+                        * Status says ERROR but no cause was latched:
+                        * record a non-success code so the slot is not
+                        * later translated to a successful completion.
+                        */
+                       if (err == AE4DMA_DMA_ERR_NO_ERR)
+                               err = AE4DMA_DMA_ERR_INV_STATUS;
+               }
+               cmd_q->status[events_count] = err;
+               cmd_q->ring_buff_count--;
+               events_count++;
+               tail = (tail + 1) & mask;
+       }
+
+       cmd_q->stats.completed += events_count;
+       cmd_q->stats.errors += fails;
+       cmd_q->next_read = tail;
+       *failed_count = fails;
+       return events_count;
+}
+
+/*
+ * dmadev completed hook.
+ *
+ * Drains up to max_ops finished descriptors from the command queue and
+ * returns how many of them succeeded. *has_error is set to true when at
+ * least one drained descriptor failed, false otherwise. When anything
+ * was drained and last_idx is non-NULL, *last_idx receives the ring
+ * slot just before the queue's updated next_read position.
+ *
+ * NOTE(review): failed descriptors are drained together with the
+ * successful ones, so *last_idx may refer to a failed slot - confirm
+ * this matches what dmadev callers expect.
+ */
+static uint16_t
+ae4dma_completed(void *dev_private, uint16_t vchan __rte_unused,
+               const uint16_t max_ops, uint16_t *last_idx, bool *has_error)
+{
+       struct ae4dma_dmadev *ae4dma = dev_private;
+       struct ae4dma_cmd_queue *q = &ae4dma->cmd_q;
+       const uint16_t ring_size = q->qcfg.nb_desc;
+       uint16_t failed = 0;
+       uint16_t drained;
+
+       drained = ae4dma_scan_hwq(q, max_ops, &failed);
+       if (drained > max_ops)
+               drained = max_ops;
+
+       if (last_idx != NULL && drained > 0)
+               *last_idx = (uint16_t)((q->next_read - 1 + ring_size) % ring_size);
+
+       *has_error = (failed != 0);
+
+       return (uint16_t)(drained - failed);
+}
+
+/*
+ * dmadev completed_status hook.
+ *
+ * Drains up to max_ops finished descriptors and writes one dmadev
+ * status code per drained descriptor into status[]. Returns the number
+ * of descriptors drained (successes and failures alike). When anything
+ * was drained and last_idx is non-NULL, *last_idx receives the ring
+ * slot just before the queue's updated next_read position.
+ */
+static uint16_t
+ae4dma_completed_status(void *dev_private, uint16_t vchan __rte_unused,
+               uint16_t max_ops, uint16_t *last_idx,
+               enum rte_dma_status_code *status)
+{
+       struct ae4dma_dmadev *ae4dma = dev_private;
+       struct ae4dma_cmd_queue *q = &ae4dma->cmd_q;
+       const uint16_t ring_size = q->qcfg.nb_desc;
+       uint16_t failed = 0;
+       uint16_t drained;
+       uint16_t i = 0;
+
+       drained = ae4dma_scan_hwq(q, max_ops, &failed);
+       if (drained > max_ops)
+               drained = max_ops;
+
+       if (last_idx != NULL && drained > 0)
+               *last_idx = (uint16_t)((q->next_read - 1 + ring_size) % ring_size);
+
+       /* Fast path: nothing failed, so skip the per-slot translation. */
+       if (likely(failed == 0)) {
+               while (i < drained)
+                       status[i++] = RTE_DMA_STATUS_SUCCESSFUL;
+               return drained;
+       }
+
+       /* At least one failure: translate the code saved for each slot. */
+       for (; i < drained; i++)
+               status[i] = __translate_status_ae4dma_to_dma(q->status[i]);
+
+       return drained;
+}
+
+/*
+ * dmadev burst_capacity hook: number of descriptors that can still be
+ * enqueued. Returns 0 when the ring size is below 2 or not a power of
+ * two. One slot is always held back, matching the enqueue path.
+ */
+static uint16_t
+ae4dma_burst_capacity(const void *dev_private, uint16_t vchan __rte_unused)
+{
+       const struct ae4dma_dmadev *ae4dma = dev_private;
+       const struct ae4dma_cmd_queue *q = &ae4dma->cmd_q;
+       const uint16_t ring_size = q->qcfg.nb_desc;
+       uint16_t usable;
+       uint16_t in_use;
+
+       if (ring_size < 2 || !rte_is_power_of_2(ring_size))
+               return 0;
+
+       /* ring_size - 1 is both the index mask and the usable slot count. */
+       usable = ring_size - 1;
+       in_use = (uint16_t)((q->next_write - q->next_read) & usable);
+
+       return (in_use >= usable) ? 0 : (uint16_t)(usable - in_use);
+}
+
 /* Retrieve the generic stats of a DMA device. */
 static int
 ae4dma_stats_get(const struct rte_dma_dev *dev, uint16_t vchan __rte_unused,
@@ -357,6 +638,13 @@ ae4dma_dmadev_create(const char *name, struct rte_pci_device *dev, uint8_t qn)
        dmadev->fp_obj->dev_private = dmadev->data->dev_private;
        dmadev->dev_ops = &ae4dma_dmadev_ops;
 
+       dmadev->fp_obj->burst_capacity = ae4dma_burst_capacity;
+       dmadev->fp_obj->completed = ae4dma_completed;
+       dmadev->fp_obj->completed_status = ae4dma_completed_status;
+       dmadev->fp_obj->copy = ae4dma_enqueue_copy;
+       dmadev->fp_obj->submit = ae4dma_submit;
+       /* fill capability not advertised: leave fp_obj->fill as zero-initialised. */
+
        ae4dma = dmadev->data->dev_private;
        ae4dma->dmadev = dmadev;
        ae4dma->pci = dev;
-- 
2.34.1

Reply via email to