On 28/09/15(Mon) 23:53, Mark Kettenis wrote:
> This diff makes the tx completion path run without the kernel lock
> held.  With this change, the interrupt handler will not grab the
> kernel lock under normal circumstances.  The diff follows the same
> approach as dlg@ took with vmx(4).
> 
> The diff removes the code that tries to reclaim tx descriptors in
> if_start() if we have only EM_TX_CLEANUP_THRESHOLD available
> descriptors left.  Running tx_eof() in both the normal transmit path
> and from the interrupt handler becomes a locking nightmare.

Makes sense.

> Seems to survive some serious stress-testing with tcpbench and ping -f
> on my x220:
> 
> em0 at pci0 dev 25 function 0 "Intel 82579LM" rev 0x04: msi, address 
> f0:de:f1:70:87:8a
> 
> Could use some further testing of course.

Tested on:

em0 at pci1 dev 0 function 0 "Intel I210" rev 0x03: msi

em0 at pci0 dev 25 function 0 "Intel 82579LM" rev 0x04: msi

em2 at pci0 dev 20 function 0 "Intel I354 SGMII" rev 0x03: msi

ok mpi@


> Index: if_em.c
> ===================================================================
> RCS file: /home/cvs/src/sys/dev/pci/if_em.c,v
> retrieving revision 1.305
> diff -u -p -r1.305 if_em.c
> --- if_em.c   19 Sep 2015 12:48:26 -0000      1.305
> +++ if_em.c   28 Sep 2015 20:58:52 -0000
> @@ -920,20 +920,15 @@ em_intr(void *arg)
>       if (reg_icr & E1000_ICR_RXO)
>               sc->rx_overruns++;
>  
> -     KERNEL_LOCK();
> -
>       /* Link status change */
>       if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
> +             KERNEL_LOCK();
>               sc->hw.get_link_status = 1;
>               em_check_for_link(&sc->hw);
>               em_update_link_status(sc);
> +             KERNEL_UNLOCK();
>       }
>  
> -     if (ifp->if_flags & IFF_RUNNING && !IFQ_IS_EMPTY(&ifp->if_snd))
> -             em_start(ifp);
> -
> -     KERNEL_UNLOCK();
> -
>       if (refill && em_rxfill(sc)) {
>               /* Advance the Rx Queue #0 "Tail Pointer". */
>               E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled);
> @@ -1110,17 +1105,10 @@ em_encap(struct em_softc *sc, struct mbu
>       struct em_buffer   *tx_buffer, *tx_buffer_mapped;
>       struct em_tx_desc *current_tx_desc = NULL;
>  
> -     /*
> -      * Force a cleanup if number of TX descriptors
> -      * available hits the threshold
> -      */
> -     if (sc->num_tx_desc_avail <= EM_TX_CLEANUP_THRESHOLD) {
> -             em_txeof(sc);
> -             /* Now do we at least have a minimal? */
> -             if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) {
> -                     sc->no_tx_desc_avail1++;
> -                     return (ENOBUFS);
> -             }
> +     /* Check that we have least the minimal number of TX descriptors. */
> +     if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) {
> +             sc->no_tx_desc_avail1++;
> +             return (ENOBUFS);
>       }
>  
>       if (sc->hw.mac_type == em_82547) {
> @@ -1224,9 +1212,9 @@ em_encap(struct em_softc *sc, struct mbu
>  
>       sc->next_avail_tx_desc = i;
>       if (sc->pcix_82544)
> -             sc->num_tx_desc_avail -= txd_used;
> +             atomic_sub_int(&sc->num_tx_desc_avail, txd_used);
>       else
> -             sc->num_tx_desc_avail -= map->dm_nsegs;
> +             atomic_sub_int(&sc->num_tx_desc_avail, map->dm_nsegs);
>  
>  #if NVLAN > 0
>       /* Find out if we are in VLAN mode */
> @@ -2393,7 +2381,7 @@ em_transmit_checksum_setup(struct em_sof
>       if (++curr_txd == sc->num_tx_desc)
>               curr_txd = 0;
>  
> -     sc->num_tx_desc_avail--;
> +     atomic_dec_int(&sc->num_tx_desc_avail);
>       sc->next_avail_tx_desc = curr_txd;
>  }
>  
> @@ -2407,7 +2395,7 @@ em_transmit_checksum_setup(struct em_sof
>  void
>  em_txeof(struct em_softc *sc)
>  {
> -     int first, last, done, num_avail;
> +     int first, last, done, num_avail, free = 0;
>       struct em_buffer *tx_buffer;
>       struct em_tx_desc   *tx_desc, *eop_desc;
>       struct ifnet   *ifp = &sc->interface_data.ac_if;
> @@ -2415,9 +2403,6 @@ em_txeof(struct em_softc *sc)
>       if (sc->num_tx_desc_avail == sc->num_tx_desc)
>               return;
>  
> -     KERNEL_LOCK();
> -
> -     num_avail = sc->num_tx_desc_avail;
>       first = sc->next_tx_to_clean;
>       tx_desc = &sc->tx_desc_base[first];
>       tx_buffer = &sc->tx_buffer_area[first];
> @@ -2441,7 +2426,7 @@ em_txeof(struct em_softc *sc)
>               while (first != done) {
>                       tx_desc->upper.data = 0;
>                       tx_desc->lower.data = 0;
> -                     num_avail++;
> +                     free++;
>  
>                       if (tx_buffer->m_head != NULL) {
>                               ifp->if_opackets++;
> @@ -2481,25 +2466,23 @@ em_txeof(struct em_softc *sc)
>  
>       sc->next_tx_to_clean = first;
>  
> -     /*
> -      * If we have enough room, clear IFF_OACTIVE to tell the stack
> -      * that it is OK to send packets.
> -      * If there are no pending descriptors, clear the timeout. Otherwise,
> -      * if some descriptors have been freed, restart the timeout.
> -      */
> -     if (num_avail > EM_TX_CLEANUP_THRESHOLD)
> -             ifp->if_flags &= ~IFF_OACTIVE;
> +     num_avail = atomic_add_int_nv(&sc->num_tx_desc_avail, free);
>  
>       /* All clean, turn off the timer */
>       if (num_avail == sc->num_tx_desc)
>               ifp->if_timer = 0;
> -     /* Some cleaned, reset the timer */
> -     else if (num_avail != sc->num_tx_desc_avail)
> -             ifp->if_timer = EM_TX_TIMEOUT;
> -
> -     sc->num_tx_desc_avail = num_avail;
>  
> -     KERNEL_UNLOCK();
> +     /*
> +      * If we have enough room, clear IFF_OACTIVE to tell the stack
> +      * that it is OK to send packets.
> +      */
> +     if (ISSET(ifp->if_flags, IFF_OACTIVE) &&
> +         num_avail > EM_TX_OP_THRESHOLD) {
> +             KERNEL_LOCK();
> +             CLR(ifp->if_flags, IFF_OACTIVE);
> +             em_start(ifp);
> +             KERNEL_UNLOCK();
> +     }
>  }
>  
>  /*********************************************************************
> Index: if_em.h
> ===================================================================
> RCS file: /home/cvs/src/sys/dev/pci/if_em.h,v
> retrieving revision 1.57
> diff -u -p -r1.57 if_em.h
> --- if_em.h   19 Sep 2015 12:48:26 -0000      1.57
> +++ if_em.h   28 Sep 2015 21:17:45 -0000
> @@ -187,10 +187,9 @@ typedef int      boolean_t;
>  #define EM_TX_TIMEOUT                        5       /* set to 5 seconds */
>  
>  /*
> - * These parameters control when the driver calls the routine to reclaim
> - * transmit descriptors.
> + * Thise parameter controls the minimum number of available transmit
> + * descriptors needed before we attempt transmission of a packet.
>   */
> -#define EM_TX_CLEANUP_THRESHOLD              (sc->num_tx_desc / 8)
>  #define EM_TX_OP_THRESHOLD           (sc->num_tx_desc / 32)
>  
>  /*
> @@ -356,8 +355,8 @@ struct em_softc {
>       struct em_tx_desc       *tx_desc_base;
>       u_int32_t               next_avail_tx_desc;
>       u_int32_t               next_tx_to_clean;
> -     volatile u_int16_t      num_tx_desc_avail;
> -     u_int16_t               num_tx_desc;
> +     volatile u_int32_t      num_tx_desc_avail;
> +     u_int32_t               num_tx_desc;
>       u_int32_t               txd_cmd;
>       struct em_buffer        *tx_buffer_area;
>       bus_dma_tag_t           txtag;          /* dma tag for tx */
> 

Reply via email to