> From: Bruce Richardson [mailto:[email protected]]
> Sent: Friday, 19 December 2025 18.26
>
> Use a non-volatile uint64_t pointer to store to the descriptor ring.
> This will allow the compiler to optionally merge the stores as it sees
> best.
I suppose there was a reason for the volatile. Is removing it really safe? E.g., it will also allow the compiler to reorder stores: not just the pair of 64-bit stores within one descriptor, but also stores to multiple descriptors. (A sketch of what I mean follows after the patch.)

One more comment inline below.

>
> Signed-off-by: Bruce Richardson <[email protected]>
> ---
>  drivers/net/intel/common/tx_scalar_fns.h | 24 ++++++++++++++++--------
>  1 file changed, 16 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/intel/common/tx_scalar_fns.h b/drivers/net/intel/common/tx_scalar_fns.h
> index 7b643fcf44..95e9acbe60 100644
> --- a/drivers/net/intel/common/tx_scalar_fns.h
> +++ b/drivers/net/intel/common/tx_scalar_fns.h
> @@ -184,6 +184,15 @@ struct ci_timesstamp_queue_fns {
>  	write_ts_tail_t write_ts_tail;
>  };
>
> +static inline void
> +write_txd(volatile void *txd, uint64_t qw0, uint64_t qw1)
> +{
> +	uint64_t *txd_qw = RTE_CAST_PTR(void *, txd);

If the descriptors are 16-byte aligned, you could mark them as such, so the compiler can use 128-bit stores on architectures where alignment matters. (See the second sketch after the patch.)

> +
> +	txd_qw[0] = rte_cpu_to_le_64(qw0);
> +	txd_qw[1] = rte_cpu_to_le_64(qw1);
> +}
> +
>  static inline uint16_t
>  ci_xmit_pkts(struct ci_tx_queue *txq,
>  		struct rte_mbuf **tx_pkts,
> @@ -313,8 +322,7 @@ ci_xmit_pkts(struct ci_tx_queue *txq,
>  			txe->mbuf = NULL;
>  		}
>
> -		ctx_txd[0] = cd_qw0;
> -		ctx_txd[1] = cd_qw1;
> +		write_txd(ctx_txd, cd_qw0, cd_qw1);
>
>  		txe->last_id = tx_last;
>  		tx_id = txe->next_id;
> @@ -361,12 +369,12 @@ ci_xmit_pkts(struct ci_tx_queue *txq,
>
>  			while ((ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) &&
>  					unlikely(slen > CI_MAX_DATA_PER_TXD)) {
> -				txd->buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
> -				txd->cmd_type_offset_bsz = rte_cpu_to_le_64(CI_TX_DESC_DTYPE_DATA |
> +				const uint64_t cmd_type_offset_bsz = CI_TX_DESC_DTYPE_DATA |
>  					((uint64_t)td_cmd << CI_TXD_QW1_CMD_S) |
>  					((uint64_t)td_offset << CI_TXD_QW1_OFFSET_S) |
>  					((uint64_t)CI_MAX_DATA_PER_TXD << CI_TXD_QW1_TX_BUF_SZ_S) |
> -					((uint64_t)td_tag << CI_TXD_QW1_L2TAG1_S));
> +					((uint64_t)td_tag << CI_TXD_QW1_L2TAG1_S);
> +				write_txd(txd, buf_dma_addr, cmd_type_offset_bsz);
>
>  				buf_dma_addr += CI_MAX_DATA_PER_TXD;
>  				slen -= CI_MAX_DATA_PER_TXD;
> @@ -382,12 +390,12 @@ ci_xmit_pkts(struct ci_tx_queue *txq,
>  			if (m_seg->next == NULL)
>  				td_cmd |= CI_TX_DESC_CMD_EOP;
>
> -			txd->buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
> -			txd->cmd_type_offset_bsz = rte_cpu_to_le_64(CI_TX_DESC_DTYPE_DATA |
> +			const uint64_t cmd_type_offset_bsz = CI_TX_DESC_DTYPE_DATA |
>  				((uint64_t)td_cmd << CI_TXD_QW1_CMD_S) |
>  				((uint64_t)td_offset << CI_TXD_QW1_OFFSET_S) |
>  				((uint64_t)slen << CI_TXD_QW1_TX_BUF_SZ_S) |
>  				((uint64_t)td_tag << CI_TXD_QW1_L2TAG1_S);
> +			write_txd(txd, buf_dma_addr, cmd_type_offset_bsz);
>
>  			txe->last_id = tx_last;
>  			tx_id = txe->next_id;
> --
> 2.51.0
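
To make the reordering concern above concrete, here is a minimal sketch of an alternative that pins the store order without volatile. To be clear: write_txd_release is a hypothetical name of mine, the __atomic builtin assumes GCC/Clang, and whether any ordering is needed at all depends on the write barrier that already precedes the tail doorbell update.

#include <stdint.h>

#include <rte_byteorder.h>	/* rte_cpu_to_le_64() */
#include <rte_common.h>		/* RTE_CAST_PTR() */

static inline void
write_txd_release(volatile void *txd, uint64_t qw0, uint64_t qw1)
{
	uint64_t *txd_qw = RTE_CAST_PTR(uint64_t *, txd);

	/* Plain store for the address word; the compiler may still
	 * schedule it freely relative to other non-volatile stores. */
	txd_qw[0] = rte_cpu_to_le_64(qw0);

	/* Release store for the command word: all stores above it,
	 * including qw0 and writes to earlier descriptors, become
	 * visible no later than qw1. */
	__atomic_store_n(&txd_qw[1], rte_cpu_to_le_64(qw1), __ATOMIC_RELEASE);
}

If the hardware only ever reads a descriptor after the tail update, then the barrier on the doorbell path is doing the real work, and the release store above is belt and suspenders. My question is whether that assumption holds for all the drivers sharing this code.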

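And for the alignment comment inline above, something along these lines is what I had in mind. Again a sketch only: write_txd_aligned is a hypothetical name, __builtin_assume_aligned assumes GCC/Clang, and it presumes the descriptor ring really is 16-byte aligned.

#include <stdint.h>

#include <rte_byteorder.h>	/* rte_cpu_to_le_64() */
#include <rte_common.h>		/* RTE_CAST_PTR() */

static inline void
write_txd_aligned(volatile void *txd, uint64_t qw0, uint64_t qw1)
{
	/* Promise the compiler that the descriptor is 16-byte aligned,
	 * so it may merge the two 64-bit stores into a single 128-bit
	 * store where unknown alignment would otherwise prevent it. */
	uint64_t *txd_qw = __builtin_assume_aligned(RTE_CAST_PTR(void *, txd), 16);

	txd_qw[0] = rte_cpu_to_le_64(qw0);
	txd_qw[1] = rte_cpu_to_le_64(qw1);
}

On x86 this mainly affects which store instructions the compiler is allowed to pick; the bigger difference, if any, would be on strict-alignment targets.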
