stm32_spi: chunk DMA exchanges to honor 16-bit NDTR

xiaoxiang Thu, 14 May 2026 19:11:01 -0700

This is an automated email from the ASF dual-hosted git repository.

xiaoxiang781216 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx.git



The following commit(s) were added to refs/heads/master by this push:
     new 8302295dd01 arch/arm/src/stm32/stm32_spi: chunk DMA exchanges to honor 16-bit NDTR
8302295dd01 is described below

commit 8302295dd01850bfdb19648466e507ca4784eede
Author: Jinji Cui <[email protected]>
AuthorDate: Thu May 14 09:57:56 2026 +0800

    arch/arm/src/stm32/stm32_spi: chunk DMA exchanges to honor 16-bit NDTR
    
    The STM32 DMA NDTR/CNDTR transfer-count register is 16 bits wide on
    every STM32 series the in-tree driver supports (IPv1 CNDTR, IPv2
    SxNDTR).  spi_exchange()'s DMA path forwarded the caller's full
    nwords to stm32_dmasetup(), so a single SPI_EXCHANGE() of >= 65536
    words silently programmed NDTR to (nwords & 0xffff).  When the
    truncated count was zero - the typical case for an exact 64 KiB
    transfer (flash erase block, FAT cluster, common DMA staging
    buffer) - the stream completed instantly with no transfer-complete
    interrupt and the caller deadlocked in spi_dmarxwait().
    
    Walk the request in chunks of at most 65535 words inside the
    existing DMA branch, reusing the same spi_dma{rx,tx}{setup,start,
    wait}() sequence per chunk.  Single-descriptor transfers (every
    in-tree caller today) are byte-for-byte identical.  CONFIG_SPI_TRIGGER
    is honored for the first chunk only; subsequent chunks must run
    unconditionally because re-arming between chunks of one logical
    exchange was never part of the SPI_TRIGGER contract.
    
    Drive-by: rescale priv->buflen-clamped nwords so the DMA
    descriptor matches the actually-copied byte count, promote the
    spiinfo() format specifier from %d to %zu, and fix two adjacent
    comment typos.
    
    See the PR description for reproduction, NSH log and benchmark
    numbers (5.12 MB/s, 97.5% of SCK/8 @ 42 MHz on STM32F407 + W25Q128).
    
    Signed-off-by: Jinji Cui <[email protected]>
---
 arch/arm/src/stm32/stm32_spi.c | 110 ++++++++++++++++++++++++++++++++---------
 1 file changed, 86 insertions(+), 24 deletions(-)

diff --git a/arch/arm/src/stm32/stm32_spi.c b/arch/arm/src/stm32/stm32_spi.c
index abb18e3db78..62b019ce07e 100644
--- a/arch/arm/src/stm32/stm32_spi.c
+++ b/arch/arm/src/stm32/stm32_spi.c
@@ -148,6 +148,19 @@
 #  error "Unknown STM32 DMA"
 #endif
 
+/* Maximum number of data items per single DMA descriptor.
+ *
+ * Both the STM32 DMA IPv1 (CNDTR on F0/F1/F3/G4/L0/L1/L4) and IPv2 (SxNDTR
+ * on F2/F4/F7/H7) transfer-count registers are 16 bits wide, so each call
+ * to stm32_dmasetup() can program at most 65535 transfers.  spi_exchange()
+ * below chunks larger requests to stay within this limit; without it a
+ * single SPI_EXCHANGE() of >= 65536 words silently programs NDTR to the low
+ * 16 bits of nwords; when that is 0 the driver blocks forever waiting on a
+ * DMA-complete IRQ that never fires.
+ */
+
+#  define STM32_SPI_DMA_MAX_XFER  65535u
+
 #  define SPIDMA_BUFFER_MASK   (4 - 1)
 #  define SPIDMA_SIZE(b) (((b) + SPIDMA_BUFFER_MASK) & ~SPIDMA_BUFFER_MASK)
 #  define SPIDMA_BUF_ALIGN   aligned_data(4)
@@ -1871,14 +1884,17 @@ static void spi_exchange(struct spi_dev_s *dev, const void *txbuffer,
     {
       static uint16_t rxdummy = 0xffff;
       static const uint16_t txdummy = 0xffff;
+      const size_t word_size = (priv->nbits > 8) ? 2u : 1u;
+      const uint8_t *txp;
+      uint8_t       *rxp;
 
-      spiinfo("txbuffer=%p rxbuffer=%p nwords=%d\n",
+      spiinfo("txbuffer=%p rxbuffer=%p nwords=%zu\n",
                txbuffer, rxbuffer, nwords);
       DEBUGASSERT(priv && priv->spibase);
 
       /* Setup DMAs */
 
-      /* If this bus uses a in driver buffers we will incur 2 copies,
+      /* If this bus uses an in-driver buffer we will incur 2 copies,
        * The copy cost is << less the non DMA transfer time and having
        * the buffer in the driver ensures DMA can be used. This is because
        * the API does not support passing the buffer extent so the only
@@ -1897,40 +1913,86 @@ static void spi_exchange(struct spi_dev_s *dev, const void *txbuffer,
           memcpy(priv->txbuf, txbuffer, nbytes);
           txbuffer  = priv->txbuf;
           rxbuffer  = rxbuffer ? priv->rxbuf : rxbuffer;
+
+          /* Rescale nwords to match the (possibly clamped) nbytes. */
+
+          nwords = (priv->nbits > 8) ? nbytes >> 1 : nbytes;
         }
 
-      spi_dmarxsetup(priv, rxbuffer, &rxdummy, nwords);
-      spi_dmatxsetup(priv, txbuffer, &txdummy, nwords);
+      txp = (const uint8_t *)txbuffer;
+      rxp = (uint8_t *)rxbuffer;
+      ret = OK;
+
+      /* Walk the request in chunks of at most STM32_SPI_DMA_MAX_XFER words.
+       * The STM32 DMA NDTR/CNDTR transfer-count register is only 16 bits
+       * wide; submitting more than 65535 transfers in a single descriptor
+       * silently programs NDTR to (nwords & 0xffff) and on most paths
+       * results in a stream that never raises transfer-complete, causing
+       * the SPI driver to block forever in spi_dmarxwait().  Splitting the
+       * request keeps each descriptor within the hardware limit and lets
+       * the W25/SD/etc. driver remain agnostic of this constraint.
+       */
+
+      while (nwords > 0)
+        {
+          size_t chunk = (nwords > STM32_SPI_DMA_MAX_XFER)
+                         ? STM32_SPI_DMA_MAX_XFER
+                         : nwords;
+
+          spi_dmarxsetup(priv, rxp, &rxdummy, chunk);
+          spi_dmatxsetup(priv, txp, &txdummy, chunk);
 
 #ifdef CONFIG_SPI_TRIGGER
-      /* Is deferred triggering in effect? */
+          /* Is deferred triggering in effect? */
 
-      if (!priv->defertrig)
-        {
-          /* No.. Start the DMAs */
+          if (!priv->defertrig)
+            {
+              /* No.. Start the DMAs */
 
-          spi_dmarxstart(priv);
-          spi_dmatxstart(priv);
-        }
-      else
-        {
-          /* Yes.. indicated that we are ready to be started */
+              spi_dmarxstart(priv);
+              spi_dmatxstart(priv);
+            }
+          else
+            {
+              /* Yes.. indicate that we are ready to be started.  Deferred
+               * triggering is only meaningful for the first (often only)
+               * chunk; subsequent chunks must run unconditionally or the
+               * caller would have to re-arm between chunks.
+               */
 
-          priv->trigarmed = true;
-        }
+              priv->trigarmed = true;
+            }
 #else
-      /* Start the DMAs */
+          /* Start the DMAs */
 
-      spi_dmarxstart(priv);
-      spi_dmatxstart(priv);
+          spi_dmarxstart(priv);
+          spi_dmatxstart(priv);
 #endif
 
-      /* Then wait for each to complete */
+          /* Then wait for each to complete */
 
-      ret = spi_dmarxwait(priv);
-      if (ret < 0)
-        {
-          ret = spi_dmatxwait(priv);
+          ret = spi_dmarxwait(priv);
+          if (ret < 0)
+            {
+              ret = spi_dmatxwait(priv);
+            }
+
+          if (ret < 0)
+            {
+              break;
+            }
+
+          if (txp != NULL)
+            {
+              txp += chunk * word_size;
+            }
+
+          if (rxp != NULL)
+            {
+              rxp += chunk * word_size;
+            }
+
+          nwords -= chunk;
         }
 
       if (rxbuffer != NULL && priv->rxbuf != NULL && ret >= 0)

(nuttx) branch master updated: arch/arm/src/stm32/stm32_spi: chunk DMA exchanges to honor 16-bit NDTR

Reply via email to