Hello David,

</snip>
> 
> my best argument for mutexes in pf is that currently we use smr critical
> sections around things like the bridge and aggr port selection, which
> are very obviously read-only workloads. pf may be a largely read-only
> workload, but where it is at the moment means it's not going to get run
> concurrently anyway.
> 

    I'm glad you mention bridges and smr. with bluhm's parallel forwarding
    diff [1] Hrvoje noticed panics with various bridges:

        db_enter() at db_enter+0x10
        panic() at panic+0x12a
        __assert() at __assert+0x2b
        assertwaitok() at assertwaitok+0xdc
        mi_switch() at mi_switch+0x40
        sleep_finish() at sleep_finish+0x11c
        rw_enter() at rw_enter+0x229
        pf_test() at pf_test+0x1055
        tpmr_pf() at tpmr_pf+0x7e
        tpmr_input() at tpmr_input+0x124
        ether_input() at ether_input+0xf5
        if_input_process() at if_input_process+0x6f
        ifiq_process() at ifiq_process+0x69

    the things start to go downhill right here in ether_input():

     426 
     427         smr_read_enter();
     428         eb = SMR_PTR_GET(&ac->ac_brport);
     429         if (eb != NULL) {
     430                 m = (*eb->eb_input)(ifp, m, dst, eb->eb_port);
     431                 if (m == NULL) {
     432                         smr_read_leave();
     433                         return;
     434                 }
     435         }
     436         smr_read_leave();
     437 

    we panic, because PF is going to sleep on one of its locks. 
    this is the case of 1 of 40 packets. The one, which needs
    to create state. ether_input() can not expect that ->eb_input(),
    won't block. May be someday one day in future, but we are not
    there yet.

    we don't see panic in current tree, because of exclusive
    net_lock. The ether_input() runs exclusively as a writer
    on netlock, thus all further locks down the stack are
    grabbed without blocking.

    My personal though on this: non-blocking ->eb_input() is
    pretty hard requirement. I don't mind if packet processing
    gives up a CPU to wait for another packet to get its job
    done. This gives other process chance to use cpu, be it
    another packet or any process in system. lack of sleeping
    points may lead to less responsive system, when network
    will get more busy.

    diff below combines smr_read... with reference counting using 
    traditional atomic ops. The idea is to adjust code found in ether_input():

     426 
     427         smr_read_enter();
     428         eb = SMR_PTR_GET(&ac->ac_brport);
     429         if (eb != NULL)
     430                 eb->eb_port_take(eb->eb_port);
     431         smr_read_leave();
     432         if (eb != NULL) {
     433                 m = (*eb->eb_input)(ifp, m, dst, eb->eb_port);
     434                 eb->eb_port_rele(eb->eb_port);
     435                 if (m == NULL) {
     436                         return;
     437                 }
     438         }
     439 

    According to early tests it works well. Currently there is a one
    mysterious panic, which Hrvoje may be able to comment on
    how to trigger it. The stack looks as follows:

        kernel diagnostic assertion "smr->smr_func == NULL" failed: file
            "/sys/kern/kern_smr.c", line 247
        panic() at panic+0x12a
        __assert() at
        __assert+0x2b
        smr_call_impl() at smr_call_impl+0xd4
        veb_port_input() at veb_port_input+0x2fa
        ether_input() at ether_input+0x10b
        if_input_process() at if_input_process+0x6f
        ifiq_process() at ifiq_process+0x69
        taskq_thread() at taskq_thread+0x81

    It's not obvious from stack above, but we die in etherbridge_map(),
    which is being called from here in veb_port_input():

        #if 0 && defined(IPSEC)
                if (ISSET(ifp->if_flags, IFF_LINK2) &&
                    (m = veb_ipsec_in(ifp0, m)) == NULL)
                        return (NULL);
        #endif

                eh = mtod(m, struct ether_header *);

                if (ISSET(p->p_bif_flags, IFBIF_LEARNING))
                        etherbridge_map(&sc->sc_eb, p, src);

                CLR(m->m_flags, M_BCAST|M_MCAST);
                SET(m->m_flags, M_PROTO1);

    the panic itself must be triggered by 'ebe_rele(oebe);' found
    at the end of etherbridge_map(). I've spent few days with it,
    and have no idea how it could happen. looks like result of
    memory corruption/use-after-free bug.

    anyway I would like to know if it would make sense to go with diff
    below. This will allow us to keep hunting for other bugs sitting
    in parallel forwarding path.

thanks and
sashan

[1] paralle://marc.info/?l=openbsd-tech&m=161903387904923&w=2 

--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index 703be01648f..c41f3c93668 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -138,6 +138,8 @@ int bridge_ipsec(struct ifnet *, struct ether_header *, 
int, struct llc *,
 #endif
 int     bridge_clone_create(struct if_clone *, int);
 int    bridge_clone_destroy(struct ifnet *);
+void   bridge_take(void *);
+void   bridge_rele(void *);
 
 #define        ETHERADDR_IS_IP_MCAST(a) \
        /* struct etheraddr *a; */                              \
@@ -152,6 +154,8 @@ struct if_clone bridge_cloner =
 
 const struct ether_brport bridge_brport = {
        bridge_input,
+       bridge_take,
+       bridge_rele,
        NULL,
 };
 
@@ -1986,3 +1990,15 @@ bridge_send_icmp_err(struct ifnet *ifp,
  dropit:
        m_freem(n);
 }
+
+void
+bridge_take(void *unused)
+{
+       return;
+}
+
+void
+bridge_rele(void *unused)
+{
+       return;
+}
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index c9493b7f857..4b975e2d4a1 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -426,14 +426,16 @@ ether_input(struct ifnet *ifp, struct mbuf *m)
 
        smr_read_enter();
        eb = SMR_PTR_GET(&ac->ac_brport);
+       if (eb != NULL)
+               eb->eb_port_take(eb->eb_port);
+       smr_read_leave();
        if (eb != NULL) {
                m = (*eb->eb_input)(ifp, m, dst, eb->eb_port);
+               eb->eb_port_rele(eb->eb_port);
                if (m == NULL) {
-                       smr_read_leave();
                        return;
                }
        }
-       smr_read_leave();
 
        /*
         * Fourth phase: drop service delimited packets.
diff --git a/sys/net/if_switch.c b/sys/net/if_switch.c
index 22aecdc6b75..ca24597d1f7 100644
--- a/sys/net/if_switch.c
+++ b/sys/net/if_switch.c
@@ -110,6 +110,8 @@ struct mbuf
 void    switch_flow_classifier_dump(struct switch_softc *,
            struct switch_flow_classify *);
 void    switchattach(int);
+void   switch_take(void *);
+void   switch_rele(void *);
 
 struct if_clone switch_cloner =
     IF_CLONE_INITIALIZER("switch", switch_clone_create, switch_clone_destroy);
@@ -122,6 +124,8 @@ struct pool swfcl_pool;
 
 const struct ether_brport switch_brport = {
        switch_input,
+       switch_take,
+       switch_rele,
        NULL,
 };
 
@@ -1571,3 +1575,15 @@ ofp_split_mbuf(struct mbuf *m, struct mbuf **mtail)
 
        return (0);
 }
+
+void
+switch_take(void *unused)
+{
+       return;
+}
+
+void
+switch_rele(void *unused)
+{
+       return;
+}
diff --git a/sys/net/if_tpmr.c b/sys/net/if_tpmr.c
index f6eb99f347c..759ad844c81 100644
--- a/sys/net/if_tpmr.c
+++ b/sys/net/if_tpmr.c
@@ -83,6 +83,8 @@ struct tpmr_port {
        struct tpmr_softc       *p_tpmr;
        unsigned int             p_slot;
 
+       unsigned int             p_refcnt;
+
        struct ether_brport      p_brport;
 };
 
@@ -125,6 +127,8 @@ static int  tpmr_add_port(struct tpmr_softc *,
 static int     tpmr_del_port(struct tpmr_softc *,
                    const struct ifbreq *);
 static int     tpmr_port_list(struct tpmr_softc *, struct ifbifconf *);
+static void    tpmr_p_take(void *);
+static void    tpmr_p_rele(void *);
 
 static struct if_clone tpmr_cloner =
     IF_CLONE_INITIALIZER("tpmr", tpmr_clone_create, tpmr_clone_destroy);
@@ -377,6 +381,9 @@ tpmr_input(struct ifnet *ifp0, struct mbuf *m, uint64_t 
dst, void *brport)
 
        SMR_ASSERT_CRITICAL(); /* ether_input calls us in a crit section */
        pn = SMR_PTR_GET(&sc->sc_ports[!p->p_slot]);
+       if (pn != NULL)
+               tpmr_p_take(pn);
+       smr_read_leave();
        if (pn == NULL)
                goto drop;
 
@@ -532,6 +539,8 @@ tpmr_add_port(struct tpmr_softc *sc, const struct ifbreq 
*req)
        if_detachhook_add(ifp0, &p->p_dtask);
 
        p->p_brport.eb_input = tpmr_input;
+       p->p_brport.eb_port_take = tpmr_p_take;
+       p->p_brport.eb_port_rele = tpmr_p_rele;
        p->p_brport.eb_port = p;
 
        /* commit */
@@ -547,6 +556,7 @@ tpmr_add_port(struct tpmr_softc *sc, const struct ifbreq 
*req)
 
        p->p_slot = i;
 
+       tpmr_p_take(p);
        ether_brport_set(ifp0, &p->p_brport);
        ifp0->if_ioctl = tpmr_p_ioctl;
        ifp0->if_output = tpmr_p_output;
@@ -700,6 +710,26 @@ tpmr_p_output(struct ifnet *ifp0, struct mbuf *m, struct 
sockaddr *dst,
        return ((*p_output)(ifp0, m, dst, rt));
 }
 
+static void
+tpmr_p_take(void *p)
+{
+       struct tpmr_port *port = p;
+
+       atomic_inc_int((int *)(&port->p_refcnt));
+}
+
+static void
+tpmr_p_rele(void *p)
+{
+       struct tpmr_port *port = p;
+       struct ifnet *ifp0 = port->p_ifp0;
+
+       if (atomic_dec_int_nv((int *)(&port->p_refcnt)) == 0) {
+               if_put(ifp0);
+               free(port, M_DEVBUF, sizeof(*port));
+       }
+}
+
 static void
 tpmr_p_dtor(struct tpmr_softc *sc, struct tpmr_port *p, const char *op)
 {
@@ -725,10 +755,9 @@ tpmr_p_dtor(struct tpmr_softc *sc, struct tpmr_port *p, 
const char *op)
        if_detachhook_del(ifp0, &p->p_dtask);
        if_linkstatehook_del(ifp0, &p->p_ltask);
 
-       smr_barrier();
+       tpmr_p_rele(p);
 
-       if_put(ifp0);
-       free(p, M_DEVBUF, sizeof(*p));
+       smr_barrier();
 
        if (ifp->if_link_state != LINK_STATE_DOWN) {
                ifp->if_link_state = LINK_STATE_DOWN;
diff --git a/sys/net/if_veb.c b/sys/net/if_veb.c
index 71cc814fa54..72c9ea2ba5f 100644
--- a/sys/net/if_veb.c
+++ b/sys/net/if_veb.c
@@ -209,6 +209,9 @@ static void  veb_eb_port_rele(void *, void *);
 static size_t   veb_eb_port_ifname(void *, char *, size_t, void *);
 static void     veb_eb_port_sa(void *, struct sockaddr_storage *, void *);
 
+static void     veb_eb_brport_take(void *);
+static void     veb_eb_brport_rele(void *);
+
 static const struct etherbridge_ops veb_etherbridge_ops = {
        veb_eb_port_cmp,
        veb_eb_port_take,
@@ -1060,8 +1063,12 @@ veb_port_input(struct ifnet *ifp0, struct mbuf *m, 
uint64_t dst, void *brport)
 
                smr_read_enter();
                tp = etherbridge_resolve(&sc->sc_eb, dst);
-               m = veb_transmit(sc, p, tp, m, src, dst);
+               if (tp != NULL)
+                       veb_eb_port_take(NULL, tp);
                smr_read_leave();
+               m = veb_transmit(sc, p, tp, m, src, dst);
+               if (tp != NULL)
+                       veb_eb_port_rele(NULL, tp);
 
                if (m == NULL)
                        return (NULL);
@@ -1291,6 +1298,9 @@ veb_add_port(struct veb_softc *sc, const struct ifbreq 
*req, unsigned int span)
                p->p_brport.eb_input = veb_port_input;
        }
 
+       p->p_brport.eb_port_take = veb_eb_brport_take;
+       p->p_brport.eb_port_rele = veb_eb_brport_rele;
+
        /* this might have changed if we slept for malloc or ifpromisc */
        error = ether_brport_isset(ifp0);
        if (error != 0)
@@ -1910,8 +1920,8 @@ veb_p_dtor(struct veb_softc *sc, struct veb_port *p, 
const char *op)
        SMR_TAILQ_REMOVE_LOCKED(&port_list->l_list, p, p_entry);
        port_list->l_count--;
 
-       smr_barrier();
        refcnt_finalize(&p->p_refs, "vebpdtor");
+       smr_barrier();
 
        veb_rule_list_free(TAILQ_FIRST(&p->p_vrl));
 
@@ -2016,6 +2026,18 @@ veb_eb_port_rele(void *arg, void *port)
        refcnt_rele_wake(&p->p_refs);
 }
 
+static void
+veb_eb_brport_take(void *port)
+{
+       veb_eb_port_take(NULL, port);
+}
+
+static void
+veb_eb_brport_rele(void *port)
+{
+       veb_eb_port_rele(NULL, port);
+}
+
 static size_t
 veb_eb_port_ifname(void *arg, char *dst, size_t len, void *port)
 {
@@ -2181,6 +2203,9 @@ vport_enqueue(struct ifnet *ifp, struct mbuf *m)
 
        smr_read_enter();
        eb = SMR_PTR_GET(&ac->ac_brport);
+       if (eb != NULL)
+               eb->eb_port_take(eb->eb_port);
+       smr_read_leave();
        if (eb != NULL) {
                struct ether_header *eh;
                uint64_t dst;
@@ -2199,8 +2224,9 @@ vport_enqueue(struct ifnet *ifp, struct mbuf *m)
                m = (*eb->eb_input)(ifp, m, dst, eb->eb_port);
 
                error = 0;
+
+               eb->eb_port_rele(eb->eb_port);
        }
-       smr_read_leave();
 
        m_freem(m);
 
diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h
index 56e6384f39b..611e0fa704b 100644
--- a/sys/netinet/if_ether.h
+++ b/sys/netinet/if_ether.h
@@ -220,6 +220,8 @@ do {                                                        
                \
 struct ether_brport {
        struct mbuf     *(*eb_input)(struct ifnet *, struct mbuf *,
                           uint64_t, void *);
+       void            (*eb_port_take)(void *);
+       void            (*eb_port_rele)(void *);
        void              *eb_port;
 };
 

Reply via email to