> From: Hrvoje Popovski <[email protected]>
> Date: Tue, 22 Dec 2015 23:45:49 +0100
>
> On 22.12.2015. 22:08, Mark Kettenis wrote:
> > Anybody willing to give this a spin? I don't have access to hardware
> > currently...
> >
> > Thanks,
> >
> > Mark
>
> Hi,
>
> i'm sending 1.1Mpps and this patch almost immediately trigger OACTIVE
> flag. Patch is applied on clean source few minutes ago. If there is
> anything i can do more, please tell....
ok, that diff wasn't quite right. Here is a better one. This one
gets rid of the ridiculous number of scatter segments on the 82598.
There really is no point in allowing that many segments, and with the
new code it would reduce the usable part of the tx ring significantly.
I did some testing of forwarding performance on a machine with two
sockets filled with:
cpu0: Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz, 2400.37 MHz
for a total of 16 cores forwarding between ix1 and ix0:
ix0 at pci3 dev 0 function 0 "Intel X540T" rev 0x01: msi, address
0c:c4:7a:4d:a3:e4
ix1 at pci3 dev 0 function 1 "Intel X540T" rev 0x01: msi, address
0c:c4:7a:4d:a3:e5
I basically tested how many pps I could push through the box without
loss, and many pps got through if sent 1Mpps into the box. All
testing was done with pf disabled.
With -current I got the following numbers:
- 730kpps without loss
- 82kpps when receiving 1Mpps
and if I set net.inet.ip.ifq.maxlen to 8000 I got:
- 740kpps without loss
- 640-740kpps when receiving 1Mpps (fluctuating)
With this diff I got:
- 670kpps without loss
- 250kpps when receiving 1Mpps
and if I set net.inet.ip.ifq.maxlen to 8000 I got:
- 690kpps without loss
- 680kpps when receiving 1Mpps (fluctuating)
So the maximal throughput goes slightly down, but it seems it with the
diff it behaves betterunder load.
Further tests are welcome!
Index: if_ix.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.130
diff -u -p -r1.130 if_ix.c
--- if_ix.c 18 Dec 2015 22:47:18 -0000 1.130
+++ if_ix.c 29 Dec 2015 16:32:22 -0000
@@ -376,28 +376,21 @@ ixgbe_start(struct ifnet * ifp)
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
for (;;) {
- m_head = ifq_deq_begin(&ifp->if_snd);
+ /* Check that we have the minimal number of TX descriptors. */
+ if (txr->tx_avail <= IXGBE_TX_OP_THRESHOLD) {
+ ifq_set_oactive(&ifp->if_snd);
+ break;
+ }
+
+ m_head = ifq_dequeue(&ifp->if_snd);
if (m_head == NULL)
break;
if (ixgbe_encap(txr, m_head)) {
- ifq_deq_rollback(&ifp->if_snd, m_head);
- ifq_set_oactive(&ifp->if_snd);
- /*
- * Make sure there are still packets on the
- * ring. The interrupt handler may have
- * cleaned up the ring before we were able to
- * set the IF_OACTIVE flag.
- */
- if (txr->tx_avail == sc->num_tx_desc) {
- ifq_clr_oactive(&ifp->if_snd);
- continue;
- }
- break;
+ m_freem(m_head);
+ continue;
}
- ifq_deq_commit(&ifp->if_snd, m_head);
-
#if NBPFILTER > 0
if (ifp->if_bpf)
bpf_mtap_ether(ifp->if_bpf, m_head, BPF_DIRECTION_OUT);
@@ -894,9 +887,8 @@ ixgbe_intr(void *arg)
if (reg_eicr & IXGBE_EICR_LSC) {
KERNEL_LOCK();
ixgbe_update_link_status(sc);
- if (!IFQ_IS_EMPTY(&ifp->if_snd))
- ixgbe_start(ifp);
KERNEL_UNLOCK();
+ ifq_start(&ifp->if_snd);
}
/* ... more link status change */
@@ -1059,10 +1051,6 @@ ixgbe_encap(struct tx_ring *txr, struct
cmd_type_len |= IXGBE_ADVTXD_DCMD_VLE;
#endif
- /* Check that we have least the minimal number of TX descriptors. */
- if (txr->tx_avail <= IXGBE_TX_OP_THRESHOLD)
- return (ENOBUFS);
-
/*
* Important to capture the first descriptor
* used because it will contain the index of
@@ -1092,10 +1080,7 @@ ixgbe_encap(struct tx_ring *txr, struct
}
/* Make certain there are enough descriptors */
- if (map->dm_nsegs > txr->tx_avail - 2) {
- error = ENOBUFS;
- goto xmit_fail;
- }
+ KASSERT(map->dm_nsegs <= txr->tx_avail - 2);
/*
* Set the appropriate offload context
@@ -1153,7 +1138,6 @@ ixgbe_encap(struct tx_ring *txr, struct
xmit_fail:
bus_dmamap_unload(txr->txdma.dma_tag, txbuf->map);
return (error);
-
}
void
@@ -1296,7 +1280,6 @@ ixgbe_stop(void *arg)
/* Tell the stack that the interface is no longer active */
ifp->if_flags &= ~IFF_RUNNING;
- ifq_clr_oactive(&ifp->if_snd);
INIT_DEBUGOUT("ixgbe_stop: begin\n");
ixgbe_disable_intr(sc);
@@ -1315,10 +1298,13 @@ ixgbe_stop(void *arg)
/* reprogram the RAR[0] in case user changed it. */
ixgbe_set_rar(&sc->hw, 0, sc->hw.mac.addr, 0, IXGBE_RAH_AV);
+ ifq_barrier(&ifp->if_snd);
intr_barrier(sc->tag);
KASSERT((ifp->if_flags & IFF_RUNNING) == 0);
+ ifq_clr_oactive(&ifp->if_snd);
+
/* Should we really clear all structures on stop? */
ixgbe_free_transmit_structures(sc);
ixgbe_free_receive_structures(sc);
@@ -1389,11 +1375,9 @@ ixgbe_identify_hardware(struct ix_softc
}
/* Pick up the 82599 and VF settings */
- if (sc->hw.mac.type != ixgbe_mac_82598EB) {
+ if (sc->hw.mac.type != ixgbe_mac_82598EB)
sc->hw.phy.smart_speed = ixgbe_smart_speed;
- sc->num_segs = IXGBE_82599_SCATTER;
- } else
- sc->num_segs = IXGBE_82598_SCATTER;
+ sc->num_segs = IXGBE_82599_SCATTER;
}
/*********************************************************************
@@ -1547,6 +1531,7 @@ ixgbe_setup_interface(struct ix_softc *s
strlcpy(ifp->if_xname, sc->dev.dv_xname, IFNAMSIZ);
ifp->if_softc = sc;
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_xflags = IFXF_MPSAFE;
ifp->if_ioctl = ixgbe_ioctl;
ifp->if_start = ixgbe_start;
ifp->if_timer = 0;
@@ -2439,17 +2424,8 @@ ixgbe_txeof(struct tx_ring *txr)
if (num_avail == sc->num_tx_desc)
ifp->if_timer = 0;
- /*
- * If we have enough room, clear IFF_OACTIVE to tell the stack that
- * it is OK to send packets.
- */
- if (ifq_is_oactive(&ifp->if_snd) &&
- num_avail > IXGBE_TX_OP_THRESHOLD) {
- ifq_clr_oactive(&ifp->if_snd);
- KERNEL_LOCK();
- ixgbe_start(ifp);
- KERNEL_UNLOCK();
- }
+ if (ifq_is_oactive(&ifp->if_snd))
+ ifq_restart(&ifp->if_snd);
return TRUE;
}
Index: if_ix.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.h,v
retrieving revision 1.30
diff -u -p -r1.30 if_ix.h
--- if_ix.h 18 Dec 2015 19:08:36 -0000 1.30
+++ if_ix.h 29 Dec 2015 16:32:22 -0000
@@ -80,7 +80,7 @@
* Thise parameter controls the minimum number of available transmit
* descriptors needed before we attempt transmission of a packet.
*/
-#define IXGBE_TX_OP_THRESHOLD (sc->num_tx_desc / 32)
+#define IXGBE_TX_OP_THRESHOLD (sc->num_segs + 2)
#define IXGBE_MAX_FRAME_SIZE 9216