Author: imp
Date: Tue Aug 28 01:28:52 2012
New Revision: 239762
URL: http://svn.freebsd.org/changeset/base/239762

Log:
  Bring in the multi-block patches for mci.  These required extensive
  restructuring of the driver.  I've tried to preserve the other silicon
  workarounds that we've added over the years, but haven't had a chance
  to extensively test on other hardware.  On my AT91RM9200 with 30MHz/1
  wire/64 block transfers, I've been able to go from ~.66MB/s to
  2.25MB/s in the simple tests I performed, almost a 3.5x improvement.
  This cuts the boot time almost in half when everything else goes
  right (timed from rtc message to login: prompt).
  
  PR:           155214
  Submitted by: Ian Lepore

Modified:
  head/sys/arm/at91/at91_mci.c

Modified: head/sys/arm/at91/at91_mci.c
==============================================================================
--- head/sys/arm/at91/at91_mci.c        Mon Aug 27 23:27:41 2012        (r239761)
+++ head/sys/arm/at91/at91_mci.c        Tue Aug 28 01:28:52 2012        (r239762)
@@ -114,7 +114,24 @@ __FBSDID("$FreeBSD$");
 #define AT91_MCI_USE_30MHZ 1
 #endif
 
-#define BBSZ   512
+/*
+ * Allocate 2 bounce buffers we'll use to endian-swap the data due to the rm9200
+ * erratum.  We use a pair of buffers because when reading that lets us begin
+ * endian-swapping the data in the first buffer while the DMA is reading into
+ * the second buffer.  (We can't use the same trick for writing because we might
+ * not get all the data in the 2nd buffer swapped before the hardware needs it;
+ * dealing with that would add complexity to the driver.)
+ *
+ * The buffers are sized at 16K each due to the way the busdma cache sync
+ * operations work on arm.  A dcache_inv_range() operation on a range larger
+ * than 16K gets turned into a dcache_wbinv_all().  That needlessly flushes the
+ * entire data cache, impacting overall system performance.
+ */
+#define BBCOUNT     2
+#define BBSIZE      (16*1024)
+#define MAX_BLOCKS  ((BBSIZE*BBCOUNT)/512)
+
+static int mci_debug;
 
 struct at91_mci_softc {
        void *intrhand;                 /* Interrupt handle */
@@ -123,21 +140,25 @@ struct at91_mci_softc {
 #define        CAP_HAS_4WIRE           1       /* Has 4 wire bus */
 #define        CAP_NEEDS_BYTESWAP      2       /* broken hardware needing bounce */
        int flags;
-#define CMD_STARTED    1
-#define STOP_STARTED   2
+#define PENDING_CMD    0x01
+#define PENDING_STOP   0x02
+#define CMD_MULTIREAD  0x10
+#define CMD_MULTIWRITE 0x20
        int has_4wire;
        int use_30mhz;
        struct resource *irq_res;       /* IRQ resource */
        struct resource *mem_res;       /* Memory resource */
        struct mtx sc_mtx;
        bus_dma_tag_t dmatag;
-       bus_dmamap_t map;
-       int mapped;
        struct mmc_host host;
        int bus_busy;
        struct mmc_request *req;
        struct mmc_command *curcmd;
-       char bounce_buffer[BBSZ];
+       bus_dmamap_t bbuf_map[BBCOUNT];
+       char      *  bbuf_vaddr[BBCOUNT]; /* bounce bufs in KVA space */
+       uint32_t     bbuf_len[BBCOUNT];   /* len currently queued for bounce buf */
+       uint32_t     bbuf_curidx;         /* which bbuf is the active DMA buffer */
+       uint32_t     xfer_offset;         /* offset so far into caller's buf */
 };
 
 static inline uint32_t
@@ -172,6 +193,51 @@ static int at91_mci_is_mci1rev2xx(void);
 #define AT91_MCI_ASSERT_LOCKED(_sc)    mtx_assert(&_sc->sc_mtx, MA_OWNED);
 #define AT91_MCI_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
 
+/* Copy sptr to dptr (memsize bytes), byte-swapping 32-bit words if needed. */
+static void
+at91_bswap_buf(struct at91_mci_softc *sc, void *dptr, void *sptr, uint32_t memsize)
+{
+       uint32_t *dst = (uint32_t *)dptr;
+       uint32_t *src = (uint32_t *)sptr;
+       uint32_t  i;
+
+       /*
+        * No byte-swapping needed: let bcopy(src, dst, len) do the work.  We
+        * bounce even then, since the caller's buffer may straddle a page
+        * boundary and we don't handle multi-segment transfers in hardware.
+        * Seen from 'bsdlabel -w' which uses raw geom access to the volume.
+        * Greg Ansley (gja (at) ansley.com)
+        */
+       if (!(sc->sc_cap & CAP_NEEDS_BYTESWAP)) {
+               bcopy(sptr, dptr, memsize);
+               return;
+       }
+
+       /*
+        * Slightly unrolled for a nice performance boost.  Only whole 16-byte
+        * groups are swapped here so a short buffer is never overrun.
+        */
+       for (i = 0; i < (memsize & ~0x0FU); i += 16) {
+               *dst++ = bswap32(*src++);
+               *dst++ = bswap32(*src++);
+               *dst++ = bswap32(*src++);
+               *dst++ = bswap32(*src++);
+       }
+
+       /* Mop up the last 1-3 words, if any (memsize assumed a multiple of 4). */
+       for (i = 0; i < (memsize & 0x0F); i += 4) {
+               *dst++ = bswap32(*src++);
+       }
+}
+
+static void
+at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+{
+       /* busdma load callback: on success report segment 0's bus address. */
+       if (error == 0)
+               *(bus_addr_t *)arg = segs[0].ds_addr;
+}
+
 static void
 at91_mci_pdc_disable(struct at91_mci_softc *sc)
 {
@@ -186,13 +252,57 @@ at91_mci_pdc_disable(struct at91_mci_sof
        WR4(sc, PDC_TNCR, 0);
 }
 
+/*
+ * Reset the controller, then restore most of the current state: the interrupt
+ * mask, the low 15 bits of MR, SDCR and DTOR.  The PDC is disabled first.
+ * This is called after detecting an error.  It's also called after stopping a
+ * multi-block write, to un-wedge the device so that it will handle the NOTBUSY
+ * signal correctly.  See comments in at91_mci_stop_done() for more details.
+ */
+static void at91_mci_reset(struct at91_mci_softc *sc)
+{
+       uint32_t mr;
+       uint32_t sdcr;
+       uint32_t dtor;
+       uint32_t imr;
+
+       at91_mci_pdc_disable(sc);
+
+       /* save current state; only the low 15 bits of MR are kept */
+
+       imr  = RD4(sc, MCI_IMR);
+       mr   = RD4(sc, MCI_MR) & 0x7fff;
+       sdcr = RD4(sc, MCI_SDCR);
+       dtor = RD4(sc, MCI_DTOR);
+
+       /* mask all interrupts, then disable and reset the controller */
+
+       WR4(sc, MCI_IDR, 0xffffffff);
+       WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST);
+
+       /* re-enable (with power-save), then restore the saved registers */
+
+       WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
+       WR4(sc, MCI_MR, mr);
+       WR4(sc, MCI_SDCR, sdcr);
+       WR4(sc, MCI_DTOR, dtor);
+       WR4(sc, MCI_IER, imr);
+
+       /*
+        * Make sure sdio interrupts will fire.  Not sure why reading
+        * SR ensures that, but this is in the linux driver.
+        */
+
+       RD4(sc, MCI_SR);
+}
+
 static void
 at91_mci_init(device_t dev)
 {
        struct at91_mci_softc *sc = device_get_softc(dev);
        uint32_t val;
 
-       WR4(sc, MCI_CR, MCI_CR_MCIEN);          /* Enable controller */
+       WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* device into reset */
        WR4(sc, MCI_IDR, 0xffffffff);           /* Turn off interrupts */
        WR4(sc, MCI_DTOR, MCI_DTOR_DTOMUL_1M | 1);
        val = MCI_MR_PDCMODE;
@@ -203,10 +313,19 @@ at91_mci_init(device_t dev)
 #ifndef  AT91_MCI_SLOT_B
        WR4(sc, MCI_SDCR, 0);                   /* SLOT A, 1 bit bus */
 #else
-       /* XXX Really should add second "unit" but nobody using using
-        * a two slot card that we know of. -- except they are... XXX */
+       /*
+        * XXX Really should add second "unit" but nobody is using
+        * a two slot card that we know of. XXX
+        */
        WR4(sc, MCI_SDCR, 1);                   /* SLOT B, 1 bit bus */
 #endif
+       /*
+        * Enable controller, including power-save.  The slower clock
+        * of the power-save mode is only in effect when there is no
+        * transfer in progress, so it can be left in this mode all
+        * the time.
+        */
+       WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
 }
 
 static void
@@ -216,7 +335,7 @@ at91_mci_fini(device_t dev)
 
        WR4(sc, MCI_IDR, 0xffffffff);           /* Turn off interrupts */
        at91_mci_pdc_disable(sc);
-       WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* Put the device into reset */
+       WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* device into reset */
 }
 
 static int
@@ -234,7 +353,7 @@ at91_mci_attach(device_t dev)
        struct sysctl_ctx_list *sctx;
        struct sysctl_oid *soid;
        device_t child;
-       int err;
+       int err, i;
 
        sctx = device_get_sysctl_ctx(dev);
        soid = device_get_sysctl_tree(dev);
@@ -249,21 +368,33 @@ at91_mci_attach(device_t dev)
 
        AT91_MCI_LOCK_INIT(sc);
 
+       at91_mci_fini(dev);
+       at91_mci_init(dev);
+
        /*
-        * Allocate DMA tags and maps
+        * Allocate DMA tags and maps and bounce buffers.
+        *
+        * The parms in the tag_create call cause the dmamem_alloc call to
+        * create each bounce buffer as a single contiguous buffer of BBSIZE
+        * bytes aligned to a 4096 byte boundary.
+        *
+        * Do not use DMA_COHERENT for these buffers because that maps the
+        * memory as non-cachable, which prevents cache line burst fills/writes,
+        * which is something we need since we're trying to overlap the
+        * byte-swapping with the DMA operations.
         */
-       err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0,
-           BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, MAXPHYS, 1,
-           MAXPHYS, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmatag);
+       err = bus_dma_tag_create(bus_get_dma_tag(dev), 4096, 0,
+           BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, 
+           BBSIZE, 1, BBSIZE, 0, NULL, NULL, &sc->dmatag);
        if (err != 0)
                goto out;
 
-       err = bus_dmamap_create(sc->dmatag, 0,  &sc->map);
-       if (err != 0)
-               goto out;
-
-       at91_mci_fini(dev);
-       at91_mci_init(dev);
+       for (i = 0; i < BBCOUNT; ++i) {
+               err = bus_dmamem_alloc(sc->dmatag, (void **)&sc->bbuf_vaddr[i],
+                   BUS_DMA_NOWAIT, &sc->bbuf_map[i]);
+               if (err != 0)
+                       goto out;
+       }
 
        /*
         * Activate the interrupt
@@ -330,8 +461,15 @@ out:
 static int
 at91_mci_detach(device_t dev)
 {
+       struct at91_mci_softc *sc = device_get_softc(dev);
+
        at91_mci_fini(dev);
        at91_mci_deactivate(dev);
+
+       bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[0], sc->bbuf_map[0]);
+       bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[1], sc->bbuf_map[1]);
+       bus_dma_tag_destroy(sc->dmatag);
+
        return (EBUSY); /* XXX */
 }
 
@@ -398,14 +536,6 @@ at91_mci_is_mci1rev2xx(void)
        }
 }
 
-static void
-at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
-{
-       if (error != 0)
-               return;
-       *(bus_addr_t *)arg = segs[0].ds_addr;
-}
-
 static int
 at91_mci_update_ios(device_t brdev, device_t reqdev)
 {
@@ -437,7 +567,7 @@ at91_mci_update_ios(device_t brdev, devi
                if (sc->use_30mhz && ios->clock == 25000000 &&
                    at91_master_clock > 50000000)
                        clkdiv = 0;
-                else if ((at91_master_clock % (ios->clock * 2)) == 0)
+               else if ((at91_master_clock % (ios->clock * 2)) == 0)
                        clkdiv = ((at91_master_clock / ios->clock) / 2) - 1;
                else
                        clkdiv = (at91_master_clock / ios->clock) / 2;
@@ -456,73 +586,182 @@ at91_mci_update_ios(device_t brdev, devi
 static void
 at91_mci_start_cmd(struct at91_mci_softc *sc, struct mmc_command *cmd)
 {
-       size_t len;
-       uint32_t cmdr, ier = 0, mr;
-       uint32_t *src, *dst;
-       int i;
+       uint32_t cmdr, mr;
        struct mmc_data *data;
-       void *vaddr;
-       bus_addr_t paddr;
 
        sc->curcmd = cmd;
        data = cmd->data;
-       cmdr = cmd->opcode;
 
        /* XXX Upper layers don't always set this */
        cmd->mrq = sc->req;
 
+       /* Begin setting up command register. */
+
+       cmdr = cmd->opcode;
+
+       if (sc->host.ios.bus_mode == opendrain)
+               cmdr |= MCI_CMDR_OPDCMD;
+
+       /* Set up response handling.  Allow max timeout for responses. */
+
        if (MMC_RSP(cmd->flags) == MMC_RSP_NONE)
                cmdr |= MCI_CMDR_RSPTYP_NO;
        else {
-               /* Allow big timeout for responses */
                cmdr |= MCI_CMDR_MAXLAT;
                if (cmd->flags & MMC_RSP_136)
                        cmdr |= MCI_CMDR_RSPTYP_136;
                else
                        cmdr |= MCI_CMDR_RSPTYP_48;
        }
-       if (cmd->opcode == MMC_STOP_TRANSMISSION)
-               cmdr |= MCI_CMDR_TRCMD_STOP;
-       if (sc->host.ios.bus_mode == opendrain)
-               cmdr |= MCI_CMDR_OPDCMD;
-       if (!data) {
-               // The no data case is fairly simple
+
+       /*
+        * If there is no data transfer, just set up the right interrupt mask
+        * and start the command.
+        *
+        * The interrupt mask needs to be CMDRDY plus all non-data-transfer
+        * errors. It's important to leave the transfer-related errors out, to
+        * avoid spurious timeout or crc errors on a STOP command following a
+        * multiblock read.  When a multiblock read is in progress, sending a
+        * STOP in the middle of a block occasionally triggers such errors, but
+        * we're totally disinterested in them because we've already gotten all
+        * the data we wanted without error before sending the STOP command.
+        */
+
+       if (data == NULL) {
+               uint32_t ier = MCI_SR_CMDRDY | 
+                   MCI_SR_RTOE | MCI_SR_RENDE | 
+                   MCI_SR_RCRCE | MCI_SR_RDIRE | MCI_SR_RINDE;
+
                at91_mci_pdc_disable(sc);
-//             printf("CMDR %x ARGR %x\n", cmdr, cmd->arg);
+
+               if (cmd->opcode == MMC_STOP_TRANSMISSION)
+                       cmdr |= MCI_CMDR_TRCMD_STOP;
+
+               /* Ignore response CRC on CMD2 and ACMD41, per standard. */
+
+               if (cmd->opcode == MMC_SEND_OP_COND ||
+                   cmd->opcode == ACMD_SD_SEND_OP_COND)
+                       ier &= ~MCI_SR_RCRCE;
+
+               if (mci_debug)
+                       printf("CMDR %x (opcode %d) ARGR %x no data\n", 
+                           cmdr, cmd->opcode, cmd->arg);
+
                WR4(sc, MCI_ARGR, cmd->arg);
                WR4(sc, MCI_CMDR, cmdr);
-               WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
+               WR4(sc, MCI_IDR, 0xffffffff);
+               WR4(sc, MCI_IER, ier);
                return;
        }
+
+       /* There is data, set up the transfer-related parts of the command. */
+
        if (data->flags & MMC_DATA_READ)
                cmdr |= MCI_CMDR_TRDIR;
+
        if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE))
                cmdr |= MCI_CMDR_TRCMD_START;
+
        if (data->flags & MMC_DATA_STREAM)
                cmdr |= MCI_CMDR_TRTYP_STREAM;
-       if (data->flags & MMC_DATA_MULTI)
+       else if (data->flags & MMC_DATA_MULTI) {
                cmdr |= MCI_CMDR_TRTYP_MULTIPLE;
-       // Set block size and turn on PDC mode for dma xfer and disable
-       // PDC until we're ready.
-       mr = RD4(sc, MCI_MR) & ~MCI_MR_BLKLEN;
-       WR4(sc, MCI_MR, mr | (data->len << 16) | MCI_MR_PDCMODE);
-       WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
-       if (cmdr & MCI_CMDR_TRCMD_START) {
-               len = data->len;
-               if (cmdr & MCI_CMDR_TRDIR)
-                       vaddr = cmd->data->data;
-               else {
-                       /* Use bounce buffer even if we don't need
-                        * byteswap, since buffer may straddle a page
-                        * boundry, and we don't handle multi-segment
-                        * transfers in hardware.
-                        * (page issues seen from 'bsdlabel -w' which
-                        * uses raw geom access to the volume).
-                        * Greg Ansley (gja (at) ansley.com)
-                        */
-                       vaddr = sc->bounce_buffer;
-                       src = (uint32_t *)cmd->data->data;
-                       dst = (uint32_t *)vaddr;
+               sc->flags |= (data->flags & MMC_DATA_READ) ? 
+                               CMD_MULTIREAD : CMD_MULTIWRITE;
+       }
+
+       /*
+        * Disable PDC until we're ready.
+        *
+        * Set block size and turn on PDC mode for dma xfer.
+        * Note that the block size is the smaller of the amount of data to be
+        * transferred, or 512 bytes.  The 512 size is fixed by the standard;
+        * smaller blocks are possible, but never larger.
+        */
+
+       WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); 
+
+       mr = RD4(sc,MCI_MR) & ~MCI_MR_BLKLEN; 
+       mr |=  min(data->len, 512) << 16; 
+       WR4(sc, MCI_MR, mr | MCI_MR_PDCMODE|MCI_MR_PDCPADV);
+
+       /*
+        * Set up DMA.
+        *
+        * Use bounce buffers even if we don't need to byteswap, because doing
+        * multi-block IO with large DMA buffers is way fast (compared to
+        * single-block IO), even after incurring the overhead of also copying
+        * from/to the caller's buffers (which may be in non-contiguous physical
+        * pages).
+        *
+        * In an ideal non-byteswap world we could create a dma tag that allows
+        * for discontiguous segments and do the IO directly from/to the
+        * caller's buffer(s), using ENDRX/ENDTX interrupts to chain the
+        * discontiguous buffers through the PDC. Someday.
+        *
+        * If a read is bigger than 2k, split it in half so that we can start
+        * byte-swapping the first half while the second half is on the wire.
+        * It would be best if we could split it into 8k chunks, but we can't
+        * always keep up with the byte-swapping due to other system activity,
+        * and if an RXBUFF interrupt happens while we're still handling the
+        * byte-swap from the prior buffer (IE, we haven't returned from
+        * handling the prior interrupt yet), then data will get dropped on the
+        * floor and we can't easily recover from that.  The right fix for that
+        * would be to have the interrupt handling only keep the DMA flowing and
+        * enqueue filled buffers to be byte-swapped in a non-interrupt context.
+        * Even that won't work on the write side of things though; in that
+        * context we have to have all the data ready to go before starting the
+        * dma.
+        *
+        * XXX what about stream transfers?
+        */
+       sc->xfer_offset = 0;
+       sc->bbuf_curidx = 0;
+
+       if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) {
+               uint32_t len;
+               uint32_t remaining = data->len;
+               bus_addr_t paddr;
+               int err;
+
+               if (remaining > (BBCOUNT*BBSIZE))
+                       panic("IO read size exceeds MAXDATA\n");
+
+               if (data->flags & MMC_DATA_READ) {
+                       if (remaining > 2048) // XXX
+                               len = remaining / 2;
+                       else
+                               len = remaining;
+                       err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], 
+                           sc->bbuf_vaddr[0], len, at91_mci_getaddr, 
+                           &paddr, BUS_DMA_NOWAIT);
+                       if (err != 0)
+                               panic("IO read dmamap_load failed\n");
+                       bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], 
+                           BUS_DMASYNC_PREREAD);
+                       WR4(sc, PDC_RPR, paddr);
+                       WR4(sc, PDC_RCR, len / 4);
+                       sc->bbuf_len[0] = len;
+                       remaining -= len;
+                       if (remaining == 0) {
+                               sc->bbuf_len[1] = 0;
+                       } else {
+                               len = remaining;
+                               err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1],
+                                   sc->bbuf_vaddr[1], len, at91_mci_getaddr, 
+                                   &paddr, BUS_DMA_NOWAIT);
+                               if (err != 0)
+                                       panic("IO read dmamap_load failed\n");
+                               bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], 
+                                   BUS_DMASYNC_PREREAD);
+                               WR4(sc, PDC_RNPR, paddr);
+                               WR4(sc, PDC_RNCR, len / 4);
+                               sc->bbuf_len[1] = len;
+                               remaining -= len;
+                       }
+                       WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
+               } else {
+                       len = min(BBSIZE, remaining);
                        /*
                         * If this is MCI1 revision 2xx controller, apply
                         * a work-around for the "Data Write Operation and
@@ -530,74 +769,75 @@ at91_mci_start_cmd(struct at91_mci_softc
                         */
                        if (at91_mci_is_mci1rev2xx() && data->len < 12) {
                                len = 12;
-                               memset(dst, 0, 12);
+                               memset(data->data, 0, 12);
                        }
-                       if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
-                               for (i = 0; i < data->len / 4; i++)
-                                       dst[i] = bswap32(src[i]);
-                       } else
-                               memcpy(dst, src, data->len);
-               }
-               data->xfer_len = 0;
-               if (bus_dmamap_load(sc->dmatag, sc->map, vaddr, len,
-                   at91_mci_getaddr, &paddr, 0) != 0) {
-                       cmd->error = MMC_ERR_NO_MEMORY;
-                       sc->req = NULL;
-                       sc->curcmd = NULL;
-                       cmd->mrq->done(cmd->mrq);
-                       return;
-               }
-               sc->mapped++;
-               if (cmdr & MCI_CMDR_TRDIR) {
-                       bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREREAD);
-                       WR4(sc, PDC_RPR, paddr);
-                       WR4(sc, PDC_RCR, len / 4);
-                       ier = MCI_SR_ENDRX;
-               } else {
-                       bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREWRITE);
-                       WR4(sc, PDC_TPR, paddr);
+                       at91_bswap_buf(sc, sc->bbuf_vaddr[0], data->data, len);
+                       err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], 
+                           sc->bbuf_vaddr[0], len, at91_mci_getaddr, 
+                           &paddr, BUS_DMA_NOWAIT);
+                       if (err != 0)
+                               panic("IO write dmamap_load failed\n");
+                       bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], 
+                           BUS_DMASYNC_PREWRITE);
+                       WR4(sc, PDC_TPR,paddr);
                        WR4(sc, PDC_TCR, len / 4);
-                       ier = MCI_SR_TXBUFE;
+                       sc->bbuf_len[0] = len;
+                       remaining -= len;
+                       if (remaining == 0) {
+                               sc->bbuf_len[1] = 0;
+                       } else {
+                               len = remaining;
+                               at91_bswap_buf(sc, sc->bbuf_vaddr[1],
+                                   ((char *)data->data)+BBSIZE, len);
+                               err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1],
+                                   sc->bbuf_vaddr[1], len, at91_mci_getaddr, 
+                                   &paddr, BUS_DMA_NOWAIT);
+                               if (err != 0)
+                                       panic("IO write dmamap_load failed\n");
+                               bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], 
+                                   BUS_DMASYNC_PREWRITE);
+                               WR4(sc, PDC_TNPR, paddr);
+                               WR4(sc, PDC_TNCR, len / 4);
+                               sc->bbuf_len[1] = len;
+                               remaining -= len;
+                       }
+                       /* do not enable PDC xfer until CMDRDY asserted */
                }
+               data->xfer_len = 0; /* XXX what's this? appears to be unused. */
        }
-//     printf("CMDR %x ARGR %x with data\n", cmdr, cmd->arg);
+
+       if (mci_debug)
+               printf("CMDR %x (opcode %d) ARGR %x with data len %d\n", 
+                      cmdr, cmd->opcode, cmd->arg, cmd->data->len);
+
        WR4(sc, MCI_ARGR, cmd->arg);
-       if (cmdr & MCI_CMDR_TRCMD_START) {
-               if (cmdr & MCI_CMDR_TRDIR) {
-                       WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
-                       WR4(sc, MCI_CMDR, cmdr);
-               } else {
-                       WR4(sc, MCI_CMDR, cmdr);
-                       WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
-               }
-       }
-       WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
+       WR4(sc, MCI_CMDR, cmdr);
+       WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
 }
 
 static void
-at91_mci_start(struct at91_mci_softc *sc)
+at91_mci_next_operation(struct at91_mci_softc *sc)
 {
        struct mmc_request *req;
 
        req = sc->req;
        if (req == NULL)
                return;
-       // assert locked
-       if (!(sc->flags & CMD_STARTED)) {
-               sc->flags |= CMD_STARTED;
-//             printf("Starting CMD\n");
+
+       if (sc->flags & PENDING_CMD) {
+               sc->flags &= ~PENDING_CMD;
                at91_mci_start_cmd(sc, req->cmd);
                return;
-       }
-       if (!(sc->flags & STOP_STARTED) && req->stop) {
-//             printf("Starting Stop\n");
-               sc->flags |= STOP_STARTED;
+       } else if (sc->flags & PENDING_STOP) {
+               sc->flags &= ~PENDING_STOP;
                at91_mci_start_cmd(sc, req->stop);
                return;
        }
-       /* We must be done -- bad idea to do this while locked? */
+
+       WR4(sc, MCI_IDR, 0xffffffff);
        sc->req = NULL;
        sc->curcmd = NULL;
+       //printf("req done\n");
        req->done(req);
 }
 
@@ -607,16 +847,16 @@ at91_mci_request(device_t brdev, device_
        struct at91_mci_softc *sc = device_get_softc(brdev);
 
        AT91_MCI_LOCK(sc);
-       // XXX do we want to be able to queue up multiple commands?
-       // XXX sounds like a good idea, but all protocols are sync, so
-       // XXX maybe the idea is naive...
        if (sc->req != NULL) {
                AT91_MCI_UNLOCK(sc);
                return (EBUSY);
        }
+       //printf("new req\n");
        sc->req = req;
-       sc->flags = 0;
-       at91_mci_start(sc);
+       sc->flags = PENDING_CMD;
+       if (sc->req->stop)
+               sc->flags |= PENDING_STOP;
+       at91_mci_next_operation(sc);
        AT91_MCI_UNLOCK(sc);
        return (0);
 }
@@ -654,120 +894,351 @@ at91_mci_release_host(device_t brdev, de
 }
 
 static void
-at91_mci_read_done(struct at91_mci_softc *sc)
+at91_mci_read_done(struct at91_mci_softc *sc, uint32_t sr)
 {
-       uint32_t *walker;
-       struct mmc_command *cmd;
-       int i, len;
-
-       cmd = sc->curcmd;
-       bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTREAD);
-       bus_dmamap_unload(sc->dmatag, sc->map);
-       sc->mapped--;
-       if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
-               walker = (uint32_t *)cmd->data->data;
-               len = cmd->data->len / 4;
-               for (i = 0; i < len; i++)
-                       walker[i] = bswap32(walker[i]);
-       }
-       // Finish up the sequence...
-       WR4(sc, MCI_IDR, MCI_SR_ENDRX);
-       WR4(sc, MCI_IER, MCI_SR_RXBUFF);
-       WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
+       struct mmc_command *cmd = sc->curcmd;
+       char * dataptr = (char *)cmd->data->data;
+       uint32_t curidx = sc->bbuf_curidx;
+       uint32_t len = sc->bbuf_len[curidx];
+
+       /*
+        * We arrive here when a DMA transfer for a read is done, whether it's
+        * a single or multi-block read.
+        *
+        * We byte-swap the buffer that just completed, and if that is the
+        * last buffer that's part of this read then we move on to the next
+        * operation, otherwise we wait for another ENDRX for the next buffer.
+        */
+
+       bus_dmamap_sync(sc->dmatag, sc->bbuf_map[curidx], BUS_DMASYNC_POSTREAD);
+       bus_dmamap_unload(sc->dmatag, sc->bbuf_map[curidx]);
+
+       at91_bswap_buf(sc, dataptr + sc->xfer_offset, sc->bbuf_vaddr[curidx], len);
+
+       if (mci_debug) {
+               printf("read done sr %x curidx %d len %d xfer_offset %d\n",
+                      sr, curidx, len, sc->xfer_offset);
+       }
+
+       sc->xfer_offset += len;
+       sc->bbuf_curidx = !curidx; /* swap buffers */
+
+       /*
+        * If we've transferred all the data, move on to the next operation.
+        *
+        * If we're still transferring the last buffer, RNCR is already zero but
+        * we have to write a zero anyway to clear the ENDRX status so we don't
+        * re-interrupt until the last buffer is done.
+        */
+       if (sc->xfer_offset == cmd->data->len) {
+               WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
+               cmd->error = MMC_ERR_NONE;
+               at91_mci_next_operation(sc);
+       } else {
+               WR4(sc, PDC_RNCR, 0);
+               WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_ENDRX);
+       }
 }
 
 static void
-at91_mci_xmit_done(struct at91_mci_softc *sc)
+at91_mci_write_done(struct at91_mci_softc *sc, uint32_t sr)
 {
-       // Finish up the sequence...
+       struct mmc_command *cmd = sc->curcmd;
+
+       /*
+        * We arrive here when the entire DMA transfer for a write is done,
+        * whether it's a single or multi-block write.  If it's multi-block we
+        * have to immediately move on to the next operation which is to send
+        * the stop command.  If it's a single-block transfer we need to wait
+        * for NOTBUSY, but if that's already asserted we can avoid another
+        * interrupt and just move on to completing the request right away.
+        */
+
        WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
-       WR4(sc, MCI_IDR, MCI_SR_TXBUFE);
-       WR4(sc, MCI_IER, MCI_SR_NOTBUSY);
-       bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTWRITE);
-       bus_dmamap_unload(sc->dmatag, sc->map);
-       sc->mapped--;
+
+       bus_dmamap_sync(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx],
+           BUS_DMASYNC_POSTWRITE);
+       bus_dmamap_unload(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx]);
+
+       if ((cmd->data->flags & MMC_DATA_MULTI) || (sr & MCI_SR_NOTBUSY)) {
+               cmd->error = MMC_ERR_NONE;
+               at91_mci_next_operation(sc);
+       } else {
+               WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+       }
+}
+
+static void
+at91_mci_notbusy(struct at91_mci_softc *sc)
+{
+       struct mmc_command *cmd = sc->curcmd;
+
+       /*
+        * We arrive here by either completion of a single-block write, or
+        * completion of the stop command that ended a multi-block write (and,
+        * I suppose, after a card-select or erase, but I haven't tested
+        * those).  Anyway, we're done and it's time to move on to the next
+        * command.
+        */
+
+       cmd->error = MMC_ERR_NONE;
+       at91_mci_next_operation(sc);
+}
+
+static void
+at91_mci_stop_done(struct at91_mci_softc *sc, uint32_t sr)
+{
+       struct mmc_command *cmd = sc->curcmd;
+
+       /*
+        * We arrive here after receiving CMDRDY for a MMC_STOP_TRANSMISSION
+        * command.  Depending on the operation being stopped, we may have to
+        * do some unusual things to work around hardware bugs.
+        */
+
+       /*
+        * This is known to be true of at91rm9200 hardware; it may or may not
+        * apply to more recent chips: 
+        *
+        * After stopping a multi-block write, the NOTBUSY bit in MCI_SR does
+        * not properly reflect the actual busy state of the card as signaled
+        * on the DAT0 line; it always claims the card is not-busy.  If we
+        * believe that and let operations continue, following commands will
+        * fail with response timeouts (except of course MMC_SEND_STATUS -- it
+        * indicates the card is busy in the PRG state, which was the smoking
+        * gun that showed MCI_SR NOTBUSY was not tracking DAT0 correctly).
+        *
+        * The atmel docs are emphatic: "This flag [NOTBUSY] must be used only
+        * for Write Operations."  I guess technically since we sent a stop
+        * it's not a write operation anymore.  But then just what did they
+        * think it meant for the stop command to have "...an optional busy
+        * signal transmitted on the data line" according to the SD spec?
+        *
+        * I tried a variety of things to un-wedge the MCI and get the status
+        * register to reflect NOTBUSY correctly again, but the only thing
+        * that worked was a full device reset.  It feels like an awfully big
+        * hammer, but doing a full reset after every multiblock write is
+        * still faster than doing single-block IO (by almost two orders of
+        * magnitude: 20KB/sec improves to about 1.8MB/sec best case).
+        *
+        * After doing the reset, wait for a NOTBUSY interrupt before
+        * continuing with the next operation.
+        */
+       if (sc->flags & CMD_MULTIWRITE) {
+               at91_mci_reset(sc);
+               WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+               return;
+       }
+
+       /*
+        * This is known to be true of at91rm9200 hardware; it may or may not
+        * apply to more recent chips:
+        *
+        * After stopping a multi-block read, loop to read and discard any
+        * data that coasts in after we sent the stop command.  The docs don't
+        * say anything about it, but empirical testing shows that 1-3
+        * additional words of data get buffered up in some unmentioned
+        * internal fifo and if we don't read and discard them here they end
+        * up on the front of the next read DMA transfer we do.
+        */
+       if (sc->flags & CMD_MULTIREAD) {
+               uint32_t sr;
+               int count = 0;
+
+               do {
+                       sr = RD4(sc, MCI_SR);
+                       if (sr & MCI_SR_RXRDY) {
+                               RD4(sc,  MCI_RDR);
+                               ++count;
+                       }
+               } while (sr & MCI_SR_RXRDY);
+               at91_mci_reset(sc);
+//              if (count != 0)
+//                      printf("Had to soak up %d words after read\n", count);
+       }
+
+       cmd->error = MMC_ERR_NONE;
+       at91_mci_next_operation(sc);
+
+}
+
+static void
+at91_mci_cmdrdy(struct at91_mci_softc *sc, uint32_t sr)
+{
+       struct mmc_command *cmd = sc->curcmd;
+       int i;
+
+       if (cmd == NULL)
+               return;
+
+       /*
+        * We get here at the end of EVERY command.  We retrieve the command
+        * response (if any) then decide what to do next based on the command.
+        */
+
+       if (cmd->flags & MMC_RSP_PRESENT) {
+               for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1); i++) {
+                       cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4);
+                       if (mci_debug)
+                               printf("RSPR[%d] = %x sr=%x\n", i, cmd->resp[i],  sr);
+               }
+       }
+
+       /*
+        * If this was a stop command, go handle the various special
+        * conditions (read: bugs) that have to be dealt with following a stop.
+        */
+       if (cmd->opcode == MMC_STOP_TRANSMISSION) {
+               at91_mci_stop_done(sc, sr);
+               return;
+       }
+
+       /*
+        * If this command can continue to assert BUSY beyond the response then
+        * we need to wait for NOTBUSY before the command is really done.
+        *
+        * Note that this may not work properly on the at91rm9200.  It certainly
+        * doesn't work for the STOP command that follows a multi-block write,
+        * so post-stop CMDRDY is handled separately; see the special handling
+        * in at91_mci_stop_done().
+        *
+        * Besides STOP, there are other R1B-type commands that use the busy
+        * signal after CMDRDY: CMD7 (card select), CMD28-29 (write protect),
+        * CMD38 (erase). I haven't tested any of them, but I rather expect
+        * them all to have the same sort of problem with MCI_SR not actually
+        * reflecting the state of the DAT0-line busy indicator.  So this code
+        * may need to grow some sort of special handling for them too. (This
+        * just in: CMD7 isn't a problem right now because dev/mmc.c incorrectly
+        * sets the response flags to R1 rather than R1B.) XXX
+        */
+       if ((cmd->flags & MMC_RSP_BUSY)) {
+               WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+               return;
+       }
+
+       /*
+        * If there is a data transfer with this command, then...
+        * - If it's a read, we need to wait for ENDRX.
+        * - If it's a write, now is the time to enable the PDC, and we need
+        *   to wait for a BLKE that follows a TXBUFE, because if we're doing
+        *   a split transfer we get a BLKE after the first half (when TPR/TCR
+        *   get loaded from TNPR/TNCR).  So first we wait for the TXBUFE, and
+        *   the handling for that interrupt will then invoke the wait for the
+        *   subsequent BLKE which indicates actual completion.
+        */
+       if (cmd->data) {
+               uint32_t ier;
+               if (cmd->data->flags & MMC_DATA_READ) {
+                       ier = MCI_SR_ENDRX;
+               } else {
+                       ier = MCI_SR_TXBUFE;
+                       WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
+               }
+               WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
+               return;
+       }
+
+       /*
+        * If we made it to here, we don't need to wait for anything more for
+        * the current command, move on to the next command (will complete the
+        * request if there is no next command).
+        */
+       cmd->error = MMC_ERR_NONE;
+       at91_mci_next_operation(sc);
 }
 
 static void
 at91_mci_intr(void *arg)
 {
        struct at91_mci_softc *sc = (struct at91_mci_softc*)arg;
-       uint32_t sr;
-       int i, done = 0;
-       struct mmc_command *cmd;
+       struct mmc_command *cmd = sc->curcmd;
+       uint32_t sr, isr;
 
        AT91_MCI_LOCK(sc);
-       sr = RD4(sc, MCI_SR) & RD4(sc, MCI_IMR);
-//     printf("i 0x%x\n", sr);
-       cmd = sc->curcmd;
-       if (sr & MCI_SR_ERROR) {
-               // Ignore CRC errors on CMD2 and ACMD47, per relevant standards
-               if ((sr & MCI_SR_RCRCE) && (cmd->opcode == MMC_SEND_OP_COND ||
-                   cmd->opcode == ACMD_SD_SEND_OP_COND))
-                       cmd->error = MMC_ERR_NONE;
-               else if (sr & (MCI_SR_RTOE | MCI_SR_DTOE))
+
+       sr = RD4(sc, MCI_SR);
+       isr = sr & RD4(sc, MCI_IMR);
+
+       if (mci_debug)
+               printf("i 0x%x sr 0x%x\n", isr, sr);
+
+       /*
+        * All interrupts are one-shot; disable it now.
+        * The next operation will re-enable whatever interrupts it wants.
+        */
+       WR4(sc, MCI_IDR, isr);
+       if (isr & MCI_SR_ERROR) {
+               if (isr & (MCI_SR_RTOE | MCI_SR_DTOE))
                        cmd->error = MMC_ERR_TIMEOUT;
-               else if (sr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
+               else if (isr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
                        cmd->error = MMC_ERR_BADCRC;
-               else if (sr & (MCI_SR_OVRE | MCI_SR_UNRE))
+               else if (isr & (MCI_SR_OVRE | MCI_SR_UNRE))
                        cmd->error = MMC_ERR_FIFO;
                else
                        cmd->error = MMC_ERR_FAILED;
-               done = 1;
-               if (sc->mapped && cmd->error) {
-                       bus_dmamap_unload(sc->dmatag, sc->map);
-                       sc->mapped--;
+               /*
+                * CMD8 is used to probe for SDHC cards, a standard SD card
+                * will get a response timeout; don't report it because it's a
+                * normal and expected condition.  One might argue that all
+                * error reporting should be left to higher levels, but when
+                * they report at all it's always EIO, which isn't very
+                * helpful. XXX bootverbose?
+                */
+               if (cmd->opcode != 8) {
+                       device_printf(sc->dev, 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to