On 28/09/15(Mon) 23:53, Mark Kettenis wrote:
> This diff makes the tx completion path run without the kernel lock
> held. With this change, the interrupt handler will not grab the
> kernel lock under normal circumstances. The diff follows the same
> approach as dlg@ took with vmx(4).
>
> The diff removes the code that tries to reclaim tx descriptors in
> if_start() if we have only EM_TX_CLEANUP_THRESHOLD available
> descriptors left. Running tx_eof() in both the normal transmit path
> and from the interrupt handler becomes a locking nightmare.
Makes sense.
> Seems to survive some serious stress-testing with tcpbench and ping -f
> on my x220:
>
> em0 at pci0 dev 25 function 0 "Intel 82579LM" rev 0x04: msi, address
> f0:de:f1:70:87:8a
>
> Could use some further testing of course.
Tested on:
em0 at pci1 dev 0 function 0 "Intel I210" rev 0x03: msi
em0 at pci0 dev 25 function 0 "Intel 82579LM" rev 0x04: msi
em2 at pci0 dev 20 function 0 "Intel I354 SGMII" rev 0x03: msi
ok mpi@
> Index: if_em.c
> ===================================================================
> RCS file: /home/cvs/src/sys/dev/pci/if_em.c,v
> retrieving revision 1.305
> diff -u -p -r1.305 if_em.c
> --- if_em.c 19 Sep 2015 12:48:26 -0000 1.305
> +++ if_em.c 28 Sep 2015 20:58:52 -0000
> @@ -920,20 +920,15 @@ em_intr(void *arg)
> if (reg_icr & E1000_ICR_RXO)
> sc->rx_overruns++;
>
> - KERNEL_LOCK();
> -
> /* Link status change */
> if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
> + KERNEL_LOCK();
> sc->hw.get_link_status = 1;
> em_check_for_link(&sc->hw);
> em_update_link_status(sc);
> + KERNEL_UNLOCK();
> }
>
> - if (ifp->if_flags & IFF_RUNNING && !IFQ_IS_EMPTY(&ifp->if_snd))
> - em_start(ifp);
> -
> - KERNEL_UNLOCK();
> -
> if (refill && em_rxfill(sc)) {
> /* Advance the Rx Queue #0 "Tail Pointer". */
> E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled);
> @@ -1110,17 +1105,10 @@ em_encap(struct em_softc *sc, struct mbu
> struct em_buffer *tx_buffer, *tx_buffer_mapped;
> struct em_tx_desc *current_tx_desc = NULL;
>
> - /*
> - * Force a cleanup if number of TX descriptors
> - * available hits the threshold
> - */
> - if (sc->num_tx_desc_avail <= EM_TX_CLEANUP_THRESHOLD) {
> - em_txeof(sc);
> - /* Now do we at least have a minimal? */
> - if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) {
> - sc->no_tx_desc_avail1++;
> - return (ENOBUFS);
> - }
> + /* Check that we have least the minimal number of TX descriptors. */
> + if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) {
> + sc->no_tx_desc_avail1++;
> + return (ENOBUFS);
> }
>
> if (sc->hw.mac_type == em_82547) {
> @@ -1224,9 +1212,9 @@ em_encap(struct em_softc *sc, struct mbu
>
> sc->next_avail_tx_desc = i;
> if (sc->pcix_82544)
> - sc->num_tx_desc_avail -= txd_used;
> + atomic_sub_int(&sc->num_tx_desc_avail, txd_used);
> else
> - sc->num_tx_desc_avail -= map->dm_nsegs;
> + atomic_sub_int(&sc->num_tx_desc_avail, map->dm_nsegs);
>
> #if NVLAN > 0
> /* Find out if we are in VLAN mode */
> @@ -2393,7 +2381,7 @@ em_transmit_checksum_setup(struct em_sof
> if (++curr_txd == sc->num_tx_desc)
> curr_txd = 0;
>
> - sc->num_tx_desc_avail--;
> + atomic_dec_int(&sc->num_tx_desc_avail);
> sc->next_avail_tx_desc = curr_txd;
> }
>
> @@ -2407,7 +2395,7 @@ em_transmit_checksum_setup(struct em_sof
> void
> em_txeof(struct em_softc *sc)
> {
> - int first, last, done, num_avail;
> + int first, last, done, num_avail, free = 0;
> struct em_buffer *tx_buffer;
> struct em_tx_desc *tx_desc, *eop_desc;
> struct ifnet *ifp = &sc->interface_data.ac_if;
> @@ -2415,9 +2403,6 @@ em_txeof(struct em_softc *sc)
> if (sc->num_tx_desc_avail == sc->num_tx_desc)
> return;
>
> - KERNEL_LOCK();
> -
> - num_avail = sc->num_tx_desc_avail;
> first = sc->next_tx_to_clean;
> tx_desc = &sc->tx_desc_base[first];
> tx_buffer = &sc->tx_buffer_area[first];
> @@ -2441,7 +2426,7 @@ em_txeof(struct em_softc *sc)
> while (first != done) {
> tx_desc->upper.data = 0;
> tx_desc->lower.data = 0;
> - num_avail++;
> + free++;
>
> if (tx_buffer->m_head != NULL) {
> ifp->if_opackets++;
> @@ -2481,25 +2466,23 @@ em_txeof(struct em_softc *sc)
>
> sc->next_tx_to_clean = first;
>
> - /*
> - * If we have enough room, clear IFF_OACTIVE to tell the stack
> - * that it is OK to send packets.
> - * If there are no pending descriptors, clear the timeout. Otherwise,
> - * if some descriptors have been freed, restart the timeout.
> - */
> - if (num_avail > EM_TX_CLEANUP_THRESHOLD)
> - ifp->if_flags &= ~IFF_OACTIVE;
> + num_avail = atomic_add_int_nv(&sc->num_tx_desc_avail, free);
>
> /* All clean, turn off the timer */
> if (num_avail == sc->num_tx_desc)
> ifp->if_timer = 0;
> - /* Some cleaned, reset the timer */
> - else if (num_avail != sc->num_tx_desc_avail)
> - ifp->if_timer = EM_TX_TIMEOUT;
> -
> - sc->num_tx_desc_avail = num_avail;
>
> - KERNEL_UNLOCK();
> + /*
> + * If we have enough room, clear IFF_OACTIVE to tell the stack
> + * that it is OK to send packets.
> + */
> + if (ISSET(ifp->if_flags, IFF_OACTIVE) &&
> + num_avail > EM_TX_OP_THRESHOLD) {
> + KERNEL_LOCK();
> + CLR(ifp->if_flags, IFF_OACTIVE);
> + em_start(ifp);
> + KERNEL_UNLOCK();
> + }
> }
>
> /*********************************************************************
> Index: if_em.h
> ===================================================================
> RCS file: /home/cvs/src/sys/dev/pci/if_em.h,v
> retrieving revision 1.57
> diff -u -p -r1.57 if_em.h
> --- if_em.h 19 Sep 2015 12:48:26 -0000 1.57
> +++ if_em.h 28 Sep 2015 21:17:45 -0000
> @@ -187,10 +187,9 @@ typedef int boolean_t;
> #define EM_TX_TIMEOUT 5 /* set to 5 seconds */
>
> /*
> - * These parameters control when the driver calls the routine to reclaim
> - * transmit descriptors.
> + * Thise parameter controls the minimum number of available transmit
> + * descriptors needed before we attempt transmission of a packet.
> */
> -#define EM_TX_CLEANUP_THRESHOLD (sc->num_tx_desc / 8)
> #define EM_TX_OP_THRESHOLD (sc->num_tx_desc / 32)
>
> /*
> @@ -356,8 +355,8 @@ struct em_softc {
> struct em_tx_desc *tx_desc_base;
> u_int32_t next_avail_tx_desc;
> u_int32_t next_tx_to_clean;
> - volatile u_int16_t num_tx_desc_avail;
> - u_int16_t num_tx_desc;
> + volatile u_int32_t num_tx_desc_avail;
> + u_int32_t num_tx_desc;
> u_int32_t txd_cmd;
> struct em_buffer *tx_buffer_area;
> bus_dma_tag_t txtag; /* dma tag for tx */
>