Re: Fix deadlock in cad_down()

2022-03-02 Thread Visa Hankala
On Wed, Mar 02, 2022 at 12:11:30PM +0100, Mark Kettenis wrote:
> > Date: Wed, 2 Mar 2022 10:53:54 +
> > From: Visa Hankala 
> > 
> > Holding NET_LOCK() while invoking a taskq barrier can result in
> > a deadlock because the lock can prevent the taskq from making progress.
> > Avoid this problem in cad_down() by serializing the driver's ioctl
> > handler with an rwlock so that NET_LOCK() can be released temporarily.
> > 
> > The patch additionally makes cad_up() release NET_LOCK() when allocating
> > the Tx and Rx rings. There is a risk that the allocation blocks.
> > The code probably should allow allocation failure, but that I leave for
> > another patch.
> > 
> > OK?
> 
> I'm not 100% up to date with the network stack locking requirements,
> but:
> 
> > Index: dev/fdt/if_cad.c
> > ===
> > RCS file: src/sys/dev/fdt/if_cad.c,v
> > retrieving revision 1.9
> > diff -u -p -r1.9 if_cad.c
> > --- dev/fdt/if_cad.c27 Jan 2022 17:34:51 -  1.9
> > +++ dev/fdt/if_cad.c2 Mar 2022 10:28:04 -
> > @@ -30,6 +30,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  
> > @@ -297,6 +298,7 @@ struct cad_softc {
> > unsigned intsc_rx_cons;
> > uint32_tsc_netctl;
> >  
> > +   struct rwlock   sc_cfg_lock;
> > struct task sc_statchg_task;
> > uint32_tsc_tx_freq;
> >  
> > @@ -467,10 +469,13 @@ cad_attach(struct device *parent, struct
> > if (OF_is_compatible(faa->fa_node, "cdns,zynq-gem"))
> > sc->sc_rxhang_erratum = 1;
> >  
> > +   rw_init(>sc_cfg_lock, "cadcfg");
> > timeout_set(>sc_tick, cad_tick, sc);
> > task_set(>sc_statchg_task, cad_statchg_task, sc);
> >  
> > +   rw_enter_write(>sc_cfg_lock);
> > cad_reset(sc);
> > +   rw_exit_write(>sc_cfg_lock);
> >  
> > sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE,
> > cad_intr, sc, sc->sc_dev.dv_xname);
> > @@ -548,6 +553,9 @@ cad_ioctl(struct ifnet *ifp, u_long cmd,
> > int error = 0;
> > int s;
> >  
> > +   NET_UNLOCK();
> > +   rw_enter_write(>sc_cfg_lock);
> > +   NET_LOCK();
> > s = splnet();
> 
> This looks a bit odd.  But I suppose that splnet() is needed because
> NET_LOCK() doesn't prevent the interrupt handler from running?  But
> that doesn't work since the driver uses IPL_MPSAFE.  So is this just
> to satisfysome splassert() calls?

Most NIC drivers appear to use splnet() in their ioctl code, even when
the interrupt handlers are MP-safe.

However, the splnet() blocks simultaneous media status changes through
cad_tick(). Both the ioctl handler and the timer run under the kernel
lock.

> 
> > switch (cmd) {
> > @@ -585,6 +593,7 @@ cad_ioctl(struct ifnet *ifp, u_long cmd,
> > }
> >  
> > splx(s);
> > +   rw_exit_write(>sc_cfg_lock);
> >  
> > return error;
> >  }
> > @@ -598,6 +607,8 @@ cad_reset(struct cad_softc *sc)
> > unsigned int freq, i;
> > uint32_t div, netcfg;
> >  
> > +   rw_assert_wrlock(>sc_cfg_lock);
> > +
> > HWRITE4(sc, GEM_NETCTL, 0);
> > HWRITE4(sc, GEM_IDR, ~0U);
> > HWRITE4(sc, GEM_RXSR, 0);
> > @@ -655,6 +666,11 @@ cad_up(struct cad_softc *sc)
> > unsigned int i, nrxd, ntxd;
> > uint32_t val;
> >  
> > +   rw_assert_wrlock(>sc_cfg_lock);
> > +
> > +   /* Release lock for memory allocation. */
> > +   NET_UNLOCK();
> > +
> > if (sc->sc_dma64)
> > flags |= BUS_DMA_64BIT;
> >  
> > @@ -810,6 +826,8 @@ cad_up(struct cad_softc *sc)
> > }
> > }
> >  
> > +   NET_LOCK();
> > +
> > /*
> >  * Set MAC address filters.
> >  */
> > @@ -905,11 +923,16 @@ cad_down(struct cad_softc *sc)
> > struct cad_buf *rxb, *txb;
> > unsigned int i, timeout;
> >  
> > +   rw_assert_wrlock(>sc_cfg_lock);
> > +
> > ifp->if_flags &= ~IFF_RUNNING;
> >  
> > ifq_clr_oactive(>if_snd);
> > ifp->if_timer = 0;
> >  
> > +   /* Avoid lock order issues with barriers. */
> > +   NET_UNLOCK();
> > +
> > timeout_del_barrier(>sc_tick);
> >  
> > /* Disable data transfer. */
> > @@ -981,6 +1004,8 @@ cad_down(struct cad_softc *sc)
> > cad_dmamem_free(sc, sc->sc_rxring);
> > sc->sc_rxring = NULL;
> > sc->sc_rxdesc = NULL;
> > +
> > +   NET_LOCK();
> >  }
> >  
> >  uint8_t
> > @@ -1010,6 +1035,8 @@ cad_iff(struct cad_softc *sc)
> > struct ether_multistep step;
> > uint64_t hash;
> > uint32_t netcfg;
> > +
> > +   rw_assert_wrlock(>sc_cfg_lock);
> >  
> > netcfg = HREAD4(sc, GEM_NETCFG);
> > netcfg &= ~GEM_NETCFG_UCASTHASHEN;
> > 
> > 
> 



Re: Fix deadlock in cad_down()

2022-03-02 Thread Mark Kettenis
> Date: Wed, 2 Mar 2022 10:53:54 +
> From: Visa Hankala 
> 
> Holding NET_LOCK() while invoking a taskq barrier can result in
> a deadlock because the lock can prevent the taskq from making progress.
> Avoid this problem in cad_down() by serializing the driver's ioctl
> handler with an rwlock so that NET_LOCK() can be released temporarily.
> 
> The patch additionally makes cad_up() release NET_LOCK() when allocating
> the Tx and Rx rings. There is a risk that the allocation blocks.
> The code probably should allow allocation failure, but that I leave for
> another patch.
> 
> OK?

I'm not 100% up to date with the network stack locking requirements,
but:

> Index: dev/fdt/if_cad.c
> ===
> RCS file: src/sys/dev/fdt/if_cad.c,v
> retrieving revision 1.9
> diff -u -p -r1.9 if_cad.c
> --- dev/fdt/if_cad.c  27 Jan 2022 17:34:51 -  1.9
> +++ dev/fdt/if_cad.c  2 Mar 2022 10:28:04 -
> @@ -30,6 +30,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  
> @@ -297,6 +298,7 @@ struct cad_softc {
>   unsigned intsc_rx_cons;
>   uint32_tsc_netctl;
>  
> + struct rwlock   sc_cfg_lock;
>   struct task sc_statchg_task;
>   uint32_tsc_tx_freq;
>  
> @@ -467,10 +469,13 @@ cad_attach(struct device *parent, struct
>   if (OF_is_compatible(faa->fa_node, "cdns,zynq-gem"))
>   sc->sc_rxhang_erratum = 1;
>  
> + rw_init(>sc_cfg_lock, "cadcfg");
>   timeout_set(>sc_tick, cad_tick, sc);
>   task_set(>sc_statchg_task, cad_statchg_task, sc);
>  
> + rw_enter_write(>sc_cfg_lock);
>   cad_reset(sc);
> + rw_exit_write(>sc_cfg_lock);
>  
>   sc->sc_ih = fdt_intr_establish(faa->fa_node, IPL_NET | IPL_MPSAFE,
>   cad_intr, sc, sc->sc_dev.dv_xname);
> @@ -548,6 +553,9 @@ cad_ioctl(struct ifnet *ifp, u_long cmd,
>   int error = 0;
>   int s;
>  
> + NET_UNLOCK();
> + rw_enter_write(>sc_cfg_lock);
> + NET_LOCK();
>   s = splnet();

This looks a bit odd.  But I suppose that splnet() is needed because
NET_LOCK() doesn't prevent the interrupt handler from running?  But
that doesn't work since the driver uses IPL_MPSAFE.  So is this just
to satisfysome splassert() calls?

>   switch (cmd) {
> @@ -585,6 +593,7 @@ cad_ioctl(struct ifnet *ifp, u_long cmd,
>   }
>  
>   splx(s);
> + rw_exit_write(>sc_cfg_lock);
>  
>   return error;
>  }
> @@ -598,6 +607,8 @@ cad_reset(struct cad_softc *sc)
>   unsigned int freq, i;
>   uint32_t div, netcfg;
>  
> + rw_assert_wrlock(>sc_cfg_lock);
> +
>   HWRITE4(sc, GEM_NETCTL, 0);
>   HWRITE4(sc, GEM_IDR, ~0U);
>   HWRITE4(sc, GEM_RXSR, 0);
> @@ -655,6 +666,11 @@ cad_up(struct cad_softc *sc)
>   unsigned int i, nrxd, ntxd;
>   uint32_t val;
>  
> + rw_assert_wrlock(>sc_cfg_lock);
> +
> + /* Release lock for memory allocation. */
> + NET_UNLOCK();
> +
>   if (sc->sc_dma64)
>   flags |= BUS_DMA_64BIT;
>  
> @@ -810,6 +826,8 @@ cad_up(struct cad_softc *sc)
>   }
>   }
>  
> + NET_LOCK();
> +
>   /*
>* Set MAC address filters.
>*/
> @@ -905,11 +923,16 @@ cad_down(struct cad_softc *sc)
>   struct cad_buf *rxb, *txb;
>   unsigned int i, timeout;
>  
> + rw_assert_wrlock(>sc_cfg_lock);
> +
>   ifp->if_flags &= ~IFF_RUNNING;
>  
>   ifq_clr_oactive(>if_snd);
>   ifp->if_timer = 0;
>  
> + /* Avoid lock order issues with barriers. */
> + NET_UNLOCK();
> +
>   timeout_del_barrier(>sc_tick);
>  
>   /* Disable data transfer. */
> @@ -981,6 +1004,8 @@ cad_down(struct cad_softc *sc)
>   cad_dmamem_free(sc, sc->sc_rxring);
>   sc->sc_rxring = NULL;
>   sc->sc_rxdesc = NULL;
> +
> + NET_LOCK();
>  }
>  
>  uint8_t
> @@ -1010,6 +1035,8 @@ cad_iff(struct cad_softc *sc)
>   struct ether_multistep step;
>   uint64_t hash;
>   uint32_t netcfg;
> +
> + rw_assert_wrlock(>sc_cfg_lock);
>  
>   netcfg = HREAD4(sc, GEM_NETCFG);
>   netcfg &= ~GEM_NETCFG_UCASTHASHEN;
> 
>