Hello David, </snip> > > my best argument for mutexes in pf is that currently we use smr critical > sections around things like the bridge and aggr port selection, which > are very obviously read-only workloads. pf may be a largely read-only > workload, but where it is at the moment means it's not going to get run > concurrently anyway. >
I'm glad you mention bridges and smr. with bluhm's parallel forwarding diff [1] Hrvoje noticed panics with various bridges: db_enter() at db_enter+0x10 panic() at panic+0x12a __assert() at __assert+0x2b assertwaitok() at assertwaitok+0xdc mi_switch() at mi_switch+0x40 sleep_finish() at sleep_finish+0x11c rw_enter() at rw_enter+0x229 pf_test() at pf_test+0x1055 tpmr_pf() at tpmr_pf+0x7e tpmr_input() at tpmr_input+0x124 ether_input() at ether_input+0xf5 if_input_process() at if_input_process+0x6f ifiq_process() at ifiq_process+0x69 the things start to go downhill right here in ether_input(): 426 427 smr_read_enter(); 428 eb = SMR_PTR_GET(&ac->ac_brport); 429 if (eb != NULL) { 430 m = (*eb->eb_input)(ifp, m, dst, eb->eb_port); 431 if (m == NULL) { 432 smr_read_leave(); 433 return; 434 } 435 } 436 smr_read_leave(); 437 we panic, because PF is going to sleep on one of its locks. this is the case of 1 of 40 packets. The one, which needs to create state. ether_input() can not expect that ->eb_input(), won't block. May be someday one day in future, but we are not there yet. we don't see panic in current tree, because of exclusive net_lock. The ether_input() runs exclusively as a writer on netlock, thus all further locks down the stack are grabbed without blocking. My personal though on this: non-blocking ->eb_input() is pretty hard requirement. I don't mind if packet processing gives up a CPU to wait for another packet to get its job done. This gives other process chance to use cpu, be it another packet or any process in system. lack of sleeping points may lead to less responsive system, when network will get more busy. diff below combines smr_read... with reference counting using traditional atomic ops. The idea is to adjust code found in ether_input(): 426 427 smr_read_enter(); 428 eb = SMR_PTR_GET(&ac->ac_brport); 429 if (eb != NULL) 430 eb->eb_port_take(eb->eb_port); 431 smr_read_leave(); 432 if (eb != NULL) { 433 m = (*eb->eb_input)(ifp, m, dst, eb->eb_port); 434 eb->eb_port_rele(eb->eb_port); 435 if (m == NULL) { 436 return; 437 } 438 } 439 According to early tests it works well. Currently there is a one mysterious panic, which Hrvoje may be able to comment on how to trigger it. The stack looks as follows: kernel diagnostic assertion "smr->smr_func == NULL" failed: file "/sys/kern/kern_smr.c", line 247 panic() at panic+0x12a __assert() at __assert+0x2b smr_call_impl() at smr_call_impl+0xd4 veb_port_input() at veb_port_input+0x2fa ether_input() at ether_input+0x10b if_input_process() at if_input_process+0x6f ifiq_process() at ifiq_process+0x69 taskq_thread() at taskq_thread+0x81 It's not obvious from stack above, but we die in etherbridge_map(), which is being called from here in veb_port_input(): #if 0 && defined(IPSEC) if (ISSET(ifp->if_flags, IFF_LINK2) && (m = veb_ipsec_in(ifp0, m)) == NULL) return (NULL); #endif eh = mtod(m, struct ether_header *); if (ISSET(p->p_bif_flags, IFBIF_LEARNING)) etherbridge_map(&sc->sc_eb, p, src); CLR(m->m_flags, M_BCAST|M_MCAST); SET(m->m_flags, M_PROTO1); the panic itself must be triggered by 'ebe_rele(oebe);' found at the end of etherbridge_map(). I've spent few days with it, and have no idea how it could happen. looks like result of memory corruption/use-after-free bug. anyway I would like to know if it would make sense to go with diff below. This will allow us to keep hunting for other bugs sitting in parallel forwarding path. thanks and sashan [1] paralle://marc.info/?l=openbsd-tech&m=161903387904923&w=2 --------8<---------------8<---------------8<------------------8<-------- diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index 703be01648f..c41f3c93668 100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -138,6 +138,8 @@ int bridge_ipsec(struct ifnet *, struct ether_header *, int, struct llc *, #endif int bridge_clone_create(struct if_clone *, int); int bridge_clone_destroy(struct ifnet *); +void bridge_take(void *); +void bridge_rele(void *); #define ETHERADDR_IS_IP_MCAST(a) \ /* struct etheraddr *a; */ \ @@ -152,6 +154,8 @@ struct if_clone bridge_cloner = const struct ether_brport bridge_brport = { bridge_input, + bridge_take, + bridge_rele, NULL, }; @@ -1986,3 +1990,15 @@ bridge_send_icmp_err(struct ifnet *ifp, dropit: m_freem(n); } + +void +bridge_take(void *unused) +{ + return; +} + +void +bridge_rele(void *unused) +{ + return; +} diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index c9493b7f857..4b975e2d4a1 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -426,14 +426,16 @@ ether_input(struct ifnet *ifp, struct mbuf *m) smr_read_enter(); eb = SMR_PTR_GET(&ac->ac_brport); + if (eb != NULL) + eb->eb_port_take(eb->eb_port); + smr_read_leave(); if (eb != NULL) { m = (*eb->eb_input)(ifp, m, dst, eb->eb_port); + eb->eb_port_rele(eb->eb_port); if (m == NULL) { - smr_read_leave(); return; } } - smr_read_leave(); /* * Fourth phase: drop service delimited packets. diff --git a/sys/net/if_switch.c b/sys/net/if_switch.c index 22aecdc6b75..ca24597d1f7 100644 --- a/sys/net/if_switch.c +++ b/sys/net/if_switch.c @@ -110,6 +110,8 @@ struct mbuf void switch_flow_classifier_dump(struct switch_softc *, struct switch_flow_classify *); void switchattach(int); +void switch_take(void *); +void switch_rele(void *); struct if_clone switch_cloner = IF_CLONE_INITIALIZER("switch", switch_clone_create, switch_clone_destroy); @@ -122,6 +124,8 @@ struct pool swfcl_pool; const struct ether_brport switch_brport = { switch_input, + switch_take, + switch_rele, NULL, }; @@ -1571,3 +1575,15 @@ ofp_split_mbuf(struct mbuf *m, struct mbuf **mtail) return (0); } + +void +switch_take(void *unused) +{ + return; +} + +void +switch_rele(void *unused) +{ + return; +} diff --git a/sys/net/if_tpmr.c b/sys/net/if_tpmr.c index f6eb99f347c..759ad844c81 100644 --- a/sys/net/if_tpmr.c +++ b/sys/net/if_tpmr.c @@ -83,6 +83,8 @@ struct tpmr_port { struct tpmr_softc *p_tpmr; unsigned int p_slot; + unsigned int p_refcnt; + struct ether_brport p_brport; }; @@ -125,6 +127,8 @@ static int tpmr_add_port(struct tpmr_softc *, static int tpmr_del_port(struct tpmr_softc *, const struct ifbreq *); static int tpmr_port_list(struct tpmr_softc *, struct ifbifconf *); +static void tpmr_p_take(void *); +static void tpmr_p_rele(void *); static struct if_clone tpmr_cloner = IF_CLONE_INITIALIZER("tpmr", tpmr_clone_create, tpmr_clone_destroy); @@ -377,6 +381,9 @@ tpmr_input(struct ifnet *ifp0, struct mbuf *m, uint64_t dst, void *brport) SMR_ASSERT_CRITICAL(); /* ether_input calls us in a crit section */ pn = SMR_PTR_GET(&sc->sc_ports[!p->p_slot]); + if (pn != NULL) + tpmr_p_take(pn); + smr_read_leave(); if (pn == NULL) goto drop; @@ -532,6 +539,8 @@ tpmr_add_port(struct tpmr_softc *sc, const struct ifbreq *req) if_detachhook_add(ifp0, &p->p_dtask); p->p_brport.eb_input = tpmr_input; + p->p_brport.eb_port_take = tpmr_p_take; + p->p_brport.eb_port_rele = tpmr_p_rele; p->p_brport.eb_port = p; /* commit */ @@ -547,6 +556,7 @@ tpmr_add_port(struct tpmr_softc *sc, const struct ifbreq *req) p->p_slot = i; + tpmr_p_take(p); ether_brport_set(ifp0, &p->p_brport); ifp0->if_ioctl = tpmr_p_ioctl; ifp0->if_output = tpmr_p_output; @@ -700,6 +710,26 @@ tpmr_p_output(struct ifnet *ifp0, struct mbuf *m, struct sockaddr *dst, return ((*p_output)(ifp0, m, dst, rt)); } +static void +tpmr_p_take(void *p) +{ + struct tpmr_port *port = p; + + atomic_inc_int((int *)(&port->p_refcnt)); +} + +static void +tpmr_p_rele(void *p) +{ + struct tpmr_port *port = p; + struct ifnet *ifp0 = port->p_ifp0; + + if (atomic_dec_int_nv((int *)(&port->p_refcnt)) == 0) { + if_put(ifp0); + free(port, M_DEVBUF, sizeof(*port)); + } +} + static void tpmr_p_dtor(struct tpmr_softc *sc, struct tpmr_port *p, const char *op) { @@ -725,10 +755,9 @@ tpmr_p_dtor(struct tpmr_softc *sc, struct tpmr_port *p, const char *op) if_detachhook_del(ifp0, &p->p_dtask); if_linkstatehook_del(ifp0, &p->p_ltask); - smr_barrier(); + tpmr_p_rele(p); - if_put(ifp0); - free(p, M_DEVBUF, sizeof(*p)); + smr_barrier(); if (ifp->if_link_state != LINK_STATE_DOWN) { ifp->if_link_state = LINK_STATE_DOWN; diff --git a/sys/net/if_veb.c b/sys/net/if_veb.c index 71cc814fa54..72c9ea2ba5f 100644 --- a/sys/net/if_veb.c +++ b/sys/net/if_veb.c @@ -209,6 +209,9 @@ static void veb_eb_port_rele(void *, void *); static size_t veb_eb_port_ifname(void *, char *, size_t, void *); static void veb_eb_port_sa(void *, struct sockaddr_storage *, void *); +static void veb_eb_brport_take(void *); +static void veb_eb_brport_rele(void *); + static const struct etherbridge_ops veb_etherbridge_ops = { veb_eb_port_cmp, veb_eb_port_take, @@ -1060,8 +1063,12 @@ veb_port_input(struct ifnet *ifp0, struct mbuf *m, uint64_t dst, void *brport) smr_read_enter(); tp = etherbridge_resolve(&sc->sc_eb, dst); - m = veb_transmit(sc, p, tp, m, src, dst); + if (tp != NULL) + veb_eb_port_take(NULL, tp); smr_read_leave(); + m = veb_transmit(sc, p, tp, m, src, dst); + if (tp != NULL) + veb_eb_port_rele(NULL, tp); if (m == NULL) return (NULL); @@ -1291,6 +1298,9 @@ veb_add_port(struct veb_softc *sc, const struct ifbreq *req, unsigned int span) p->p_brport.eb_input = veb_port_input; } + p->p_brport.eb_port_take = veb_eb_brport_take; + p->p_brport.eb_port_rele = veb_eb_brport_rele; + /* this might have changed if we slept for malloc or ifpromisc */ error = ether_brport_isset(ifp0); if (error != 0) @@ -1910,8 +1920,8 @@ veb_p_dtor(struct veb_softc *sc, struct veb_port *p, const char *op) SMR_TAILQ_REMOVE_LOCKED(&port_list->l_list, p, p_entry); port_list->l_count--; - smr_barrier(); refcnt_finalize(&p->p_refs, "vebpdtor"); + smr_barrier(); veb_rule_list_free(TAILQ_FIRST(&p->p_vrl)); @@ -2016,6 +2026,18 @@ veb_eb_port_rele(void *arg, void *port) refcnt_rele_wake(&p->p_refs); } +static void +veb_eb_brport_take(void *port) +{ + veb_eb_port_take(NULL, port); +} + +static void +veb_eb_brport_rele(void *port) +{ + veb_eb_port_rele(NULL, port); +} + static size_t veb_eb_port_ifname(void *arg, char *dst, size_t len, void *port) { @@ -2181,6 +2203,9 @@ vport_enqueue(struct ifnet *ifp, struct mbuf *m) smr_read_enter(); eb = SMR_PTR_GET(&ac->ac_brport); + if (eb != NULL) + eb->eb_port_take(eb->eb_port); + smr_read_leave(); if (eb != NULL) { struct ether_header *eh; uint64_t dst; @@ -2199,8 +2224,9 @@ vport_enqueue(struct ifnet *ifp, struct mbuf *m) m = (*eb->eb_input)(ifp, m, dst, eb->eb_port); error = 0; + + eb->eb_port_rele(eb->eb_port); } - smr_read_leave(); m_freem(m); diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h index 56e6384f39b..611e0fa704b 100644 --- a/sys/netinet/if_ether.h +++ b/sys/netinet/if_ether.h @@ -220,6 +220,8 @@ do { \ struct ether_brport { struct mbuf *(*eb_input)(struct ifnet *, struct mbuf *, uint64_t, void *); + void (*eb_port_take)(void *); + void (*eb_port_rele)(void *); void *eb_port; };