Re: [patch] tsec(4): enable TX interrupt coalescing

2015-12-03 Thread Richard Procter
Refreshed patch against HEAD appears below, 

best, 
Richard. 

PS. Apologies for the quoted-printable encoding...looking now for an email
client capable of 7-bit... 

On 10/11/2015, at 5:18 PM, Richard Procter wrote:

> This reduces tsec(4) TX interrupts by over a factor of four per interface,
> boosting throughput by a couple of percent for
> 
>   $ dd if=/dev/zero bs=4096 | nc ${host} ${port}
> 
> It does this by reducing TX interrupt notifications to one per frame, from
> one per mbuf fragment, and by enabling TX interrupt coalescing.
> 
> I've chosen conservative coalescing parameters. The card now interrupts every
> four tx frames, leaving the tx ring fuller on average. But ample room remains
> on the card's tx ring of 256 descriptors, which can hold 16 frames in the 
> worst case of 16 mbuf fragments per frame. Testing showed descriptor use 
> peaking at 13 descriptors under load.
> 
> The hold-off timer, ensuring stale frames are not left on the tx ring
> indefinitely, is not crucial for tx: as the frame has already been
> transmitted, latency isn't a concern. It need only last longer than the
> time to transmit the coalesced frames, and I've set it much longer,
> roughly 2ms for 1000baseT, to give the stack some slack when feeding
> the card.
> 
> While here, also makes tsec_encap() error handling a tad more robust.
> 
> Tested on RB600A.
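
A side note on the magic value written to TSEC_TXIC in the hunk below, since
the driver itself only defines the enable bit: the decomposition sketched here
assumes the eTSEC register layout, i.e. ICEN in the top bit, an 8-bit
frame-count threshold (ICFT) starting at bit 21, and a 16-bit idle-timer
threshold (ICTT) in the low bits; the ICFT/ICTT names come from the eTSEC
manual, not from if_tsec.c.

/*
 * Sketch only: how the TSEC_TXIC value in the diff decomposes, assuming
 * the eTSEC field layout (ICFT/ICTT names per the manual, not the driver).
 */
#include <stdint.h>

#define TXIC_ICEN	0x80000000U		/* interrupt coalescing enable */
#define TXIC_ICFT(n)	((uint32_t)(n) << 21)	/* frame-count threshold */
#define TXIC_ICTT(n)	((uint32_t)(n) & 0xffff)	/* idle-timer threshold */

/* Interrupt after 4 tx frames, or once the timer expires on a partial batch. */
static const uint32_t txic_val =
    TXIC_ICEN | TXIC_ICFT(4) | TXIC_ICTT(0x1000);	/* == 0x80801000 */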

Index: arch/socppc/dev/if_tsec.c
===
RCS file: /cvs/src/sys/arch/socppc/dev/if_tsec.c,v
retrieving revision 1.42
diff -u -p -U6 -r1.42 if_tsec.c
--- arch/socppc/dev/if_tsec.c   25 Nov 2015 03:09:58 -  1.42
+++ arch/socppc/dev/if_tsec.c   3 Dec 2015 20:54:39 -
@@ -117,12 +117,14 @@ extern void myetheraddr(u_char *);
#define  TSEC_DMACTRL_WOP   0x0001
#define TSEC_TBIPA  0x030

#define TSEC_TCTRL  0x100
#define TSEC_TSTAT  0x104
#define  TSEC_TSTAT_THLT   0x80000000
+#define TSEC_TXIC  0x110
+#define  TSEC_TXIC_ICEN   0x80000000
#define TSEC_TBPTR  0x184
#define TSEC_TBASE  0x204

#define TSEC_RCTRL  0x300
#define  TSEC_RCTRL_PROM0x0008
#define TSEC_RSTAT  0x304
@@ -533,13 +535,13 @@ tsec_start(struct ifnet *ifp)
error = tsec_encap(sc, m, &idx);
if (error == ENOBUFS) {
ifq_deq_rollback(&ifp->if_snd, m);
ifq_set_oactive(&ifp->if_snd);
break;
} 
-   if (error == EFBIG) {
+   if (error) {
ifq_deq_commit(&ifp->if_snd, m);
m_freem(m); /* give up: drop it */
ifp->if_oerrors++;
continue;
}

@@ -1017,12 +1019,15 @@ tsec_up(struct tsec_softc *sc)

attr = tsec_read(sc, TSEC_ATTR);
attr |= TSEC_ATTR_RDSEN;
attr |= TSEC_ATTR_RBDSEN;
tsec_write(sc, TSEC_ATTR, attr);

+   /* TX interrupts every 4 TSEC_TX_I with ~2ms hold-off @ 1000baseT */
+   tsec_write(sc, TSEC_TXIC, (TSEC_TXIC_ICEN | (0x4 << 21) | 0x1000));
+
tsec_write(sc, TSEC_TSTAT, TSEC_TSTAT_THLT);
tsec_write(sc, TSEC_RSTAT, TSEC_RSTAT_QHLT);

/* Configure media. */
if (LIST_FIRST(&sc->sc_mii.mii_phys))
mii_mediachg(&sc->sc_mii);
@@ -1156,18 +1161,20 @@ tsec_encap(struct tsec_softc *sc, struct
BUS_DMASYNC_PREWRITE);

txd = &sc->sc_txdesc[frag];
for (i = 0; i < map->dm_nsegs; i++) {
status = txd->td_status & TSEC_TX_W;
status |= TSEC_TX_TO1;
+   status |= TSEC_TX_TC;
if (i == (map->dm_nsegs - 1))
-   status |= TSEC_TX_L;
+   status |= TSEC_TX_L | TSEC_TX_I;
+
txd->td_len = map->dm_segs[i].ds_len;
txd->td_addr = map->dm_segs[i].ds_addr;
__asm volatile("eieio" ::: "memory");
-   txd->td_status = status | TSEC_TX_R | TSEC_TX_I | TSEC_TX_TC;
+   txd->td_status = status | TSEC_TX_R;

bus_dmamap_sync(sc->sc_dmat, TSEC_DMA_MAP(sc->sc_txring),
frag * sizeof(*txd), sizeof(*txd), BUS_DMASYNC_PREWRITE);

cur = frag;
if (status & TSEC_TX_W) {



[patch] tsec(4): enable TX interrupt coalescing

2015-11-09 Thread Richard Procter
Hi, 

This reduces tsec(4) TX interrupts by over a factor of four per interface,
boosting throughput by a couple of percent for

$ dd if=/dev/zero bs=4096 | nc ${host} ${port}

It does this by reducing TX interrupt notifications to one per frame, from
one per mbuf fragment, and by enabling TX interrupt coalescing.

I've chosen conservative coalescing parameters. The card now interrupts every
four tx frames, leaving the tx ring fuller on average. But ample room remains
on the card's tx ring of 256 descriptors, which can hold 16 frames in the 
worst case of 16 mbuf fragments per frame. Testing showed descriptor use 
peaking at 13 descriptors under load.
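
The worst-case figure is just the ring size divided by the per-frame segment
limit; the sanity check below uses the 256 and 16 quoted above (assumed to
mirror the driver's tx ring and segment constants, which this diff doesn't
show).

/* Worst-case tx ring occupancy: with 256 descriptors and at most 16
 * mbuf fragments per frame, 16 maximally fragmented frames fit, well
 * above the 13-descriptor peak observed under load. */
#include <stdio.h>

int
main(void)
{
	const int ntxdesc = 256;	/* tx descriptor ring size */
	const int ntxsegs = 16;		/* worst-case fragments per frame */

	printf("worst-case frames resident: %d\n", ntxdesc / ntxsegs);
	return 0;
}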

The hold-off timer, ensuring stale frames are not left on the tx ring 
indefinitely, is not crucial for tx: as the frame has already been transmitted, 
latency isn't a concern. It need only last longer than the time to transmit the 
coalesced frames, and I've set it much longer, roughly 2ms for 1000baseT, 
to give the stack some slack when feeding the card.
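
In case the ~2ms figure looks arbitrary, it follows from the timer units: the
arithmetic below assumes (per the eTSEC documentation, not anything in this
driver) that the TXIC timer threshold counts units of 64 tx interface clocks
and that the interface clock runs at 125 MHz for 1000baseT.

/* Rough check of the ~2ms hold-off; the 64-clock timer units and the
 * 125 MHz clock are assumptions taken from the eTSEC documentation. */
#include <stdio.h>

int
main(void)
{
	const double clk_hz = 125e6;	/* tx interface clock at 1000baseT */
	const unsigned ictt = 0x1000;	/* timer threshold written to TSEC_TXIC */

	printf("hold-off ~= %.2f ms\n", ictt * 64.0 / clk_hz * 1e3);
	return 0;	/* prints roughly 2.10 ms */
}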

While here, also makes tsec_encap() error handling a tad more robust.

Tested on RB600A.

best, 
Richard. 

Index: if_tsec.c
===
RCS file: /cvs/src/sys/arch/socppc/dev/if_tsec.c,v
retrieving revision 1.39
diff -u -p -u -r1.39 if_tsec.c
--- if_tsec.c   6 Nov 2015 11:35:48 -   1.39
+++ if_tsec.c   10 Nov 2015 01:32:31 -
@@ -121,6 +121,8 @@ extern void myetheraddr(u_char *);
 #define TSEC_TCTRL 0x100
 #define TSEC_TSTAT 0x104
 #define  TSEC_TSTAT_THLT   0x80000000
+#define TSEC_TXIC  0x110
+#define  TSEC_TXIC_ICEN   0x80000000
 #define TSEC_TBPTR 0x184
 #define TSEC_TBASE 0x204
 
@@ -536,7 +538,7 @@ tsec_start(struct ifnet *ifp)
ifp->if_flags |= IFF_OACTIVE;
break;
} 
-   if (error == EFBIG) {
+   if (error) {
IFQ_DEQUEUE(&ifp->if_snd, m);
m_freem(m); /* give up: drop it */
ifp->if_oerrors++;
@@ -1020,6 +1022,9 @@ tsec_up(struct tsec_softc *sc)
attr |= TSEC_ATTR_RBDSEN;
tsec_write(sc, TSEC_ATTR, attr);
 
+   /* TX interrupts every 4 TSEC_TX_I with ~2ms hold-off @ 1000baseT */
+   tsec_write(sc, TSEC_TXIC, (TSEC_TXIC_ICEN | (0x4 << 21) | 0x1000));
+
tsec_write(sc, TSEC_TSTAT, TSEC_TSTAT_THLT);
tsec_write(sc, TSEC_RSTAT, TSEC_RSTAT_QHLT);
 
@@ -1158,12 +1163,14 @@ tsec_encap(struct tsec_softc *sc, struct
for (i = 0; i < map->dm_nsegs; i++) {
status = txd->td_status & TSEC_TX_W;
status |= TSEC_TX_TO1;
+   status |= TSEC_TX_TC;
if (i == (map->dm_nsegs - 1))
-   status |= TSEC_TX_L;
+   status |= TSEC_TX_L | TSEC_TX_I;
+
txd->td_len = map->dm_segs[i].ds_len;
txd->td_addr = map->dm_segs[i].ds_addr;
__asm volatile("eieio" ::: "memory");
-   txd->td_status = status | TSEC_TX_R | TSEC_TX_I | TSEC_TX_TC;
+   txd->td_status = status | TSEC_TX_R;
 
bus_dmamap_sync(sc->sc_dmat, TSEC_DMA_MAP(sc->sc_txring),
frag * sizeof(*txd), sizeof(*txd), BUS_DMASYNC_PREWRITE);