On 6/26/2026 2:47 AM, Raghavendra Ningoji wrote: > Implement the dmadev fast path for the AMD AE4DMA PMD. > > This commit adds: > - copy enqueue (rte_dma_copy): write an AE4DMA descriptor for a > memory-to-memory transfer; on RTE_DMA_OP_FLAG_SUBMIT the doorbell > is rung immediately. > - submit (rte_dma_submit): advance the per-queue write_idx > register to expose pending descriptors to the hardware. > - completion (rte_dma_completed / rte_dma_completed_status): > completion is detected via the hardware's per-queue read_idx > register, which the engine advances as it processes descriptors. > The descriptor status / err_code bytes are read only to classify > each drained slot as success or failure, and HW error codes are > translated to the dmadev RTE_DMA_STATUS_* enumeration. > - burst capacity (rte_dma_burst_capacity): report the number of > free descriptor slots, taking into account the one slot reserved > to distinguish full from empty on the power-of-two ring.
I don't think it's necessary to write in such detail because the ops implemented are defined by the framework. If needed, you can supplement by explaining what special features this driver has. > > The fast path entry points are wired through fp_obj in > ae4dma_dmadev_create(). The fill capability is not advertised; > fp_obj->fill is left zero-initialised. > > Signed-off-by: Raghavendra Ningoji <[email protected]> > --- > doc/guides/dmadevs/ae4dma.rst | 22 +++ > drivers/dma/ae4dma/ae4dma_dmadev.c | 287 +++++++++++++++++++++++++++++ > 2 files changed, 309 insertions(+) > > diff --git a/doc/guides/dmadevs/ae4dma.rst b/doc/guides/dmadevs/ae4dma.rst > index a85c1d92ca..37a2096ccf 100644 > --- a/doc/guides/dmadevs/ae4dma.rst > +++ b/doc/guides/dmadevs/ae4dma.rst > @@ -51,3 +51,25 @@ On probe the PMD performs the following steps for each PCI > function: > IOVA-contiguous memory, programs the queue base address and ring > depth into the per-queue registers, and enables the queue. > * Interrupts are masked; completion is polled by the application. > + > +Usage > +----- > + > +Once a dmadev has been started, copies are submitted with > +``rte_dma_copy()`` and completions are reaped with ``rte_dma_completed()`` > +or ``rte_dma_completed_status()``. See the > +:ref:`Enqueue / Dequeue API <dmadev_enqueue_dequeue>` section of the > +dmadev library documentation for details. > + > +Limitations > +----------- > + > +* Only memory-to-memory copies are supported. Fill, scatter-gather and > + any other operation types are not advertised in > + ``rte_dma_info::dev_capa``. > +* The maximum number of descriptors per virtual channel is fixed by > + hardware at 32. The PMD rounds the requested ring size up to a > + power of two and clamps it to 32. > +* Only a single virtual channel per dmadev is supported; use the 16 > + per-PCI-function dmadevs to obtain channel-level parallelism. > +* Interrupt-driven completion is not supported. 
> diff --git a/drivers/dma/ae4dma/ae4dma_dmadev.c > b/drivers/dma/ae4dma/ae4dma_dmadev.c > index 607f288623..da3ec42233 100644 > --- a/drivers/dma/ae4dma/ae4dma_dmadev.c > +++ b/drivers/dma/ae4dma/ae4dma_dmadev.c > @@ -158,6 +158,72 @@ ae4dma_dev_close(struct rte_dma_dev *dev) > return 0; > } > > +/* trigger h/w to process enqued desc:doorbell - by next_write */ > +static inline void > +__submit(struct ae4dma_dmadev *ae4dma) > +{ > + struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; > + uint16_t write_idx = cmd_q->next_write; > + uint16_t nb = cmd_q->qcfg.nb_desc; > + > + AE4DMA_WRITE_REG(&cmd_q->hwq_regs->write_idx, write_idx); > + if (nb != 0) > + cmd_q->stats.submitted += (uint16_t)((cmd_q->next_write - > cmd_q->last_write + > + nb) % nb); > + cmd_q->last_write = cmd_q->next_write; > +} > + > +static int > +ae4dma_submit(void *dev_private, uint16_t vchan __rte_unused) > +{ > + struct ae4dma_dmadev *ae4dma = dev_private; > + > + __submit(ae4dma); > + return 0; > +} > + > +/* Write descriptor for enqueue (copy only). */ > +static inline int > +__write_desc_copy(void *dev_private, rte_iova_t src, rte_iova_t dst, > + uint32_t len, uint64_t flags) > +{ > + struct ae4dma_dmadev *ae4dma = dev_private; > + struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; > + struct ae4dma_desc *dma_desc; > + uint16_t ret; > + uint16_t nb = cmd_q->qcfg.nb_desc; > + uint16_t write = cmd_q->next_write; > + > + if (nb == 0) > + return -EINVAL; > + > + /* Reserve one slot to distinguish full from empty (power-of-two ring). 
> */ > + if ((uint32_t)cmd_q->ring_buff_count >= (uint32_t)(nb - 1)) > + return -ENOSPC; > + > + dma_desc = &cmd_q->qbase_desc[write]; > + memset(dma_desc, 0, sizeof(*dma_desc)); > + dma_desc->length = len; > + dma_desc->src_hi = upper_32_bits(src); > + dma_desc->src_lo = lower_32_bits(src); > + dma_desc->dst_hi = upper_32_bits(dst); > + dma_desc->dst_lo = lower_32_bits(dst); > + cmd_q->ring_buff_count++; > + cmd_q->next_write = (uint16_t)((write + 1) % nb); next_write is in the range [0, nb_desc-1] and is used as the return value of copy, but the dmadev framework expects the returned index to be in the range [0, 0xFFFF]. I suspect your driver has not passed any DMA test (e.g. dpdk-test, dpdk-dma-perf or examples/dma). > + ret = write; > + if (flags & RTE_DMA_OP_FLAG_SUBMIT) > + __submit(ae4dma); > + return ret; > +} > + > +/* Enqueue a copy operation onto the ae4dma device. */ > +static int > +ae4dma_enqueue_copy(void *dev_private, uint16_t vchan __rte_unused, > + rte_iova_t src, rte_iova_t dst, uint32_t length, uint64_t flags) > +{ > + return __write_desc_copy(dev_private, src, dst, length, flags); > +} > + > static int > ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f) > { > @@ -187,6 +253,220 @@ ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f) > cmd_q->stats.errors); > return 0; > } > + > +/* Translates AE4DMA ChanERRs to DMA error codes. */ > +static inline enum rte_dma_status_code > +__translate_status_ae4dma_to_dma(enum ae4dma_dma_err status) > +{ > + AE4DMA_PMD_DEBUG("ae4dma desc status = %d", status); > + > + switch (status) { > + case AE4DMA_DMA_ERR_NO_ERR: > + return RTE_DMA_STATUS_SUCCESSFUL; > + case AE4DMA_DMA_ERR_INV_LEN: > + return RTE_DMA_STATUS_INVALID_LENGTH; > + case AE4DMA_DMA_ERR_INV_SRC: > + return RTE_DMA_STATUS_INVALID_SRC_ADDR; > + case AE4DMA_DMA_ERR_INV_DST: > + return RTE_DMA_STATUS_INVALID_DST_ADDR; > + case AE4DMA_DMA_ERR_INV_ALIGN: > + /* Name matches DPDK public enum spelling. 
*/ > + return RTE_DMA_STATUS_DATA_POISION; I suggest adding an RTE_DMA_STATUS_INVALID_ALIGN enum value to rte_dmadev.h. > + case AE4DMA_DMA_ERR_INV_HEADER: > + case AE4DMA_DMA_ERR_INV_STATUS: > + return RTE_DMA_STATUS_ERROR_UNKNOWN; > + default: > + return RTE_DMA_STATUS_ERROR_UNKNOWN; > + } > +} > + > +/* > + * Scan HW queue for completed descriptors (non-blocking). > + * > + * The AE4DMA engine signals completion by advancing the per-queue > + * `read_idx` register; it does not (reliably) write a status value > + * back into the descriptor. We therefore use the HW `read_idx` > + * register as the source of truth and only inspect the descriptor's > + * `dw1.err_code` byte to classify each completion as success or > + * failure. > + * > + * @param cmd_q > + * The AE4DMA command queue. > + * @param max_ops > + * Maximum descriptors to process this call. > + * @param[out] failed_count > + * Number of completed descriptors that did not report success. > + * @return > + * Number of descriptors completed (success + failure), <= max_ops. > + */ > +static inline uint16_t > +ae4dma_scan_hwq(struct ae4dma_cmd_queue *cmd_q, uint16_t max_ops, > + uint16_t *failed_count) > +{ > + volatile struct ae4dma_desc *hw_desc; > + uint16_t events_count = 0, fails = 0; > + uint16_t tail; > + uint16_t nb = cmd_q->qcfg.nb_desc; > + uint16_t mask; > + uint16_t hw_read_idx; > + uint16_t in_flight; > + uint16_t scan_cap; > + > + if (nb == 0 || cmd_q->ring_buff_count == 0) { > + *failed_count = 0; > + return 0; > + } > + mask = nb - 1; > + > + hw_read_idx = (uint16_t)(AE4DMA_READ_REG(&cmd_q->hwq_regs->read_idx) & > mask); > + tail = cmd_q->next_read; > + > + /* > + * Descriptors completed since our last visit live in the > + * half-open ring range [tail, hw_read_idx). If HW hasn't > + * moved we have nothing to do. 
> + */ > + in_flight = (uint16_t)((hw_read_idx - tail) & mask); > + if (in_flight == 0) { > + *failed_count = 0; > + return 0; > + } > + > + scan_cap = max_ops; > + if (scan_cap > AE4DMA_DESCRIPTORS_PER_CMDQ) > + scan_cap = AE4DMA_DESCRIPTORS_PER_CMDQ; > + if (scan_cap > in_flight) > + scan_cap = in_flight; > + if (scan_cap > cmd_q->ring_buff_count) > + scan_cap = (uint16_t)cmd_q->ring_buff_count; > + > + while (events_count < scan_cap) { > + uint8_t hw_status; > + uint8_t hw_err; > + > + hw_desc = &cmd_q->qbase_desc[tail]; > + hw_status = hw_desc->dw1.status; > + hw_err = hw_desc->dw1.err_code; > + > + /* > + * read_idx advancing is the definitive completion > + * signal. The per-descriptor status byte is informational > + * and may not yet be written when we observe it: > + * > + * AE4DMA_DMA_DESC_ERROR (4) > + * Hard failure - err_code names the precise cause. > + * AE4DMA_DMA_DESC_COMPLETED (3) or 0 > + * Success. > + * AE4DMA_DMA_DESC_VALIDATED (1) / _PROCESSED (2) > + * Benign race: HW had not finished updating the > + * status byte at the instant we read it. Since > + * read_idx has moved past this slot, treat it as > + * success unless err_code says otherwise. > + * > + * A non-zero err_code is treated as a failure regardless > + * of the observed status value. > + */ > + if (hw_status == AE4DMA_DMA_DESC_ERROR || > + hw_err != AE4DMA_DMA_ERR_NO_ERR) { > + fails++; > + AE4DMA_PMD_WARN("Desc failed: status=%u err=%u", > + hw_status, hw_err); > + } > + cmd_q->status[events_count] = (enum ae4dma_dma_err)hw_err; > + cmd_q->ring_buff_count--; > + events_count++; > + tail = (tail + 1) & mask; > + } > + > + cmd_q->stats.completed += events_count; > + cmd_q->stats.errors += fails; > + cmd_q->next_read = tail; > + *failed_count = fails; > + return events_count; > +} > + > +/* Returns successful operations count and sets error flag if any errors. 
*/ > +static uint16_t > +ae4dma_completed(void *dev_private, uint16_t vchan __rte_unused, > + const uint16_t max_ops, uint16_t *last_idx, bool *has_error) > +{ > + struct ae4dma_dmadev *ae4dma = dev_private; > + struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; > + uint16_t cpl_count, sl_count; > + uint16_t err_count = 0; > + uint16_t nb = cmd_q->qcfg.nb_desc; > + > + *has_error = false; > + > + cpl_count = ae4dma_scan_hwq(cmd_q, max_ops, &err_count); > + > + if (cpl_count > max_ops) > + cpl_count = max_ops; > + > + if (cpl_count > 0 && last_idx != NULL) > + *last_idx = (uint16_t)((cmd_q->next_read - 1 + nb) % nb); last_idx should be in the range [0, 0xFFFF], not wrapped to [0, nb_desc-1]. > + > + sl_count = cpl_count - err_count; > + if (err_count) > + *has_error = true; > + > + return sl_count; > +} > + > +static uint16_t > +ae4dma_completed_status(void *dev_private, uint16_t vchan __rte_unused, > + uint16_t max_ops, uint16_t *last_idx, > + enum rte_dma_status_code *status) > +{ > + struct ae4dma_dmadev *ae4dma = dev_private; > + struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; > + uint16_t cpl_count; > + uint16_t i; > + uint16_t err_count = 0; > + uint16_t nb = cmd_q->qcfg.nb_desc; > + > + cpl_count = ae4dma_scan_hwq(cmd_q, max_ops, &err_count); > + > + if (cpl_count > max_ops) > + cpl_count = max_ops; > + > + if (cpl_count > 0 && last_idx != NULL) > + *last_idx = (uint16_t)((cmd_q->next_read - 1 + nb) % nb); > + > + if (likely(err_count == 0)) { > + for (i = 0; i < cpl_count; i++) > + status[i] = RTE_DMA_STATUS_SUCCESSFUL; > + } else { > + for (i = 0; i < cpl_count; i++) > + status[i] = > __translate_status_ae4dma_to_dma(cmd_q->status[i]); > + } > + > + return cpl_count; > +} > + > +/* Get the remaining capacity of the ring. 
*/ > +static uint16_t > +ae4dma_burst_capacity(const void *dev_private, uint16_t vchan __rte_unused) > +{ > + const struct ae4dma_dmadev *ae4dma = dev_private; > + const struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; > + uint16_t nb = cmd_q->qcfg.nb_desc; > + uint16_t mask; > + uint16_t read_idx = cmd_q->next_read; > + uint16_t write_idx = cmd_q->next_write; > + uint16_t used; > + > + if (nb < 2 || !rte_is_power_of_2(nb)) > + return 0; No need to check this > + > + mask = nb - 1; > + used = (uint16_t)((write_idx - read_idx) & mask); > + /* One slot reserved (same rule as enqueue). */ > + if (used >= nb - 1) > + return 0; > + return (uint16_t)(nb - 1 - used); > +} > + > static int > ae4dma_stats_get(const struct rte_dma_dev *dev, uint16_t vchan __rte_unused, > struct rte_dma_stats *rte_stats, uint32_t size) > @@ -342,6 +622,13 @@ ae4dma_dmadev_create(const char *name, struct > rte_pci_device *dev, uint8_t qn) > dmadev->fp_obj->dev_private = dmadev->data->dev_private; > dmadev->dev_ops = &ae4dma_dmadev_ops; > > + dmadev->fp_obj->burst_capacity = ae4dma_burst_capacity; > + dmadev->fp_obj->completed = ae4dma_completed; > + dmadev->fp_obj->completed_status = ae4dma_completed_status; > + dmadev->fp_obj->copy = ae4dma_enqueue_copy; > + dmadev->fp_obj->submit = ae4dma_submit; > + /* fill capability not advertised: leave fp_obj->fill as > zero-initialised. */ > + > ae4dma = dmadev->data->dev_private; > > if (ae4dma_add_queue(ae4dma, dev, qn, name) != 0)

