One of the interesting things we've done during s2k15 was to redesign
ether_input(). The name of this function is clearly misleading. Back
in the old days [0] it was simply used to put a packet on a protocol
queue. Today ether_input() still does that, but before it does a lot
of different hacks to determine if a packet should be feed to a pseudo-
interface like vlan(4), carp(4), brige(4), etc.
ether_input() is now *the* entry point of our network stack and that's
what we need to change.
The main issues that we analysed about the current plumbing are:
1) None of the pseudo-driver is (almost) self-contained which increase
the risk of regressions due to code complexity. See the recent vlan
example.
2) They are all plugged differently and most of the time feeding packets
or copies of them recursively.
3) Every packet has to go through all the pseudo-driver checks even if
you're not using any of them.
So we came up with an architecture that should allow us to solve these
issues while preserving all the existing features that we all enjoy.
In the end, having self-contained, transparent and non-recursive pseudo-
driver code path to feed packets to the network stack will allow us to
work on one driver at a time to make it MP-safe. Right now it's just to
many spaghetti in the same plate.
So the idea of if_input() is that an interface (ifp) can have multiple
input packet handlers. These handlers are all on a list and act as
filters. You give a mbuf to the first one, if it does not return an
error, you give it to the second one, etc.
Now by default every Ethernet driver has a single handler: ether_input().
So what's next? Well the diff below introduces a new "filter" for trunk.
Instead of calling ether_input() which will then call trunk_input() for
every interface part of a trunk(4), a new handler is added in the list.
Since this handler is now executed *before* ether_input() we do not call
m_adj() on the mbuf we're currently filtering. That's a crucial point!
Apart from that, everything should be straightforward. Please note that
the diff below depends on the recent "if_input() and `rcvif`" tweak I
just sent.
I'm happily running with it, please let me know how it goes on your
setup.
[0] https://github.com/jonathangray/csrg/blob/master/sys/net/if_ethersubr.c#L299
diff --git sys/net/if_ethersubr.c sys/net/if_ethersubr.c
index bbc9246..1d964dd 100644
--- sys/net/if_ethersubr.c
+++ sys/net/if_ethersubr.c
@@ -463,9 +463,6 @@ ether_input(struct mbuf *m, void *hdr)
int s, llcfound = 0;
struct llc *l;
struct arpcom *ac;
-#if NTRUNK > 0
- int i = 0;
-#endif
#if NPPPOE > 0
struct ether_header *eh_tmp;
#endif
@@ -480,21 +477,6 @@ ether_input(struct mbuf *m, void *hdr)
m_adj(m, ETHER_HDR_LEN);
}
-#if NTRUNK > 0
- /* Handle input from a trunk port */
- while (ifp->if_type == IFT_IEEE8023ADLAG) {
- if (++i > TRUNK_MAX_STACKING) {
- m_freem(m);
- return (1);
- }
- if (trunk_input(ifp, eh, m) != 0)
- return (1);
-
- /* Has been set to the trunk interface */
- ifp = m->m_pkthdr.rcvif;
- }
-#endif
-
if ((ifp->if_flags & IFF_UP) == 0) {
m_freem(m);
return (1);
@@ -518,17 +500,9 @@ ether_input(struct mbuf *m, void *hdr)
else
m->m_flags |= M_MCAST;
ifp->if_imcasts++;
-#if NTRUNK > 0
- if (ifp != ifp0)
- ifp0->if_imcasts++;
-#endif
}
ifp->if_ibytes += m->m_pkthdr.len + sizeof(*eh);
-#if NTRUNK > 0
- if (ifp != ifp0)
- ifp0->if_ibytes += m->m_pkthdr.len + sizeof(*eh);
-#endif
etype = ntohs(eh->ether_type);
diff --git sys/net/if_trunk.c sys/net/if_trunk.c
index e364648..514f77d 100644
--- sys/net/if_trunk.c
+++ sys/net/if_trunk.c
@@ -94,14 +94,14 @@ int trunk_rr_detach(struct trunk_softc *);
void trunk_rr_port_destroy(struct trunk_port *);
int trunk_rr_start(struct trunk_softc *, struct mbuf *);
int trunk_rr_input(struct trunk_softc *, struct trunk_port *,
- struct ether_header *, struct mbuf *);
+ struct mbuf *);
/* Active failover */
int trunk_fail_attach(struct trunk_softc *);
int trunk_fail_detach(struct trunk_softc *);
int trunk_fail_start(struct trunk_softc *, struct mbuf *);
int trunk_fail_input(struct trunk_softc *, struct trunk_port *,
- struct ether_header *, struct mbuf *);
+ struct mbuf *);
/* Loadbalancing */
int trunk_lb_attach(struct trunk_softc *);
@@ -110,7 +110,7 @@ int trunk_lb_port_create(struct trunk_port *);
void trunk_lb_port_destroy(struct trunk_port *);
int trunk_lb_start(struct trunk_softc *, struct mbuf *);
int trunk_lb_input(struct trunk_softc *, struct trunk_port *,
- struct ether_header *, struct mbuf *);
+ struct mbuf *);
int trunk_lb_porttable(struct trunk_softc *, struct trunk_port *);
/* Broadcast mode */
@@ -118,14 +118,14 @@ int trunk_bcast_attach(struct trunk_softc *);
int trunk_bcast_detach(struct trunk_softc *);
int trunk_bcast_start(struct trunk_softc *, struct mbuf *);
int trunk_bcast_input(struct trunk_softc *, struct trunk_port *,
- struct ether_header *, struct mbuf *);
+ struct mbuf *);
/* 802.3ad LACP */
int trunk_lacp_attach(struct trunk_softc *);
int trunk_lacp_detach(struct trunk_softc *);
int trunk_lacp_start(struct trunk_softc *, struct mbuf *);
int trunk_lacp_input(struct trunk_softc *, struct trunk_port *,
- struct ether_header *, struct mbuf *);
+ struct mbuf *);
/* Trunk protocol table */
static const struct {
@@ -288,6 +288,7 @@ trunk_port_create(struct trunk_softc *tr, struct ifnet *ifp)
{
struct trunk_softc *tr_ptr;
struct trunk_port *tp;
+ struct ifih *trunk_ifih;
int error = 0;
/* Limit the maximal number of trunk ports */
@@ -326,12 +327,19 @@ trunk_port_create(struct trunk_softc *tr, struct ifnet
*ifp)
M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
return (ENOMEM);
+ trunk_ifih = malloc(sizeof(*trunk_ifih), M_DEVBUF, M_NOWAIT);
+ if (trunk_ifih == NULL) {
+ free(tp, M_DEVBUF, 0);
+ return (ENOMEM);
+ }
+
/* Check if port is a stacked trunk */
SLIST_FOREACH(tr_ptr, &trunk_list, tr_entries) {
if (ifp == &tr_ptr->tr_ac.ac_if) {
tp->tp_flags |= TRUNK_PORT_STACK;
if (trunk_port_checkstacking(tr_ptr) >=
TRUNK_MAX_STACKING) {
+ free(trunk_ifih, M_DEVBUF, sizeof(*trunk_ifih));
free(tp, M_DEVBUF, 0);
return (E2BIG);
}
@@ -341,6 +349,11 @@ trunk_port_create(struct trunk_softc *tr, struct ifnet
*ifp)
/* Change the interface type */
tp->tp_iftype = ifp->if_type;
ifp->if_type = IFT_IEEE8023ADLAG;
+
+ /* Change input handler of the physical interface. */
+ trunk_ifih->ifih_input = trunk_input;
+ SLIST_INSERT_HEAD(&ifp->if_inputs, trunk_ifih, ifih_next);
+
ifp->if_tp = (caddr_t)tp;
tp->tp_ioctl = ifp->if_ioctl;
ifp->if_ioctl = trunk_port_ioctl;
@@ -411,6 +424,7 @@ trunk_port_destroy(struct trunk_port *tp)
{
struct trunk_softc *tr = (struct trunk_softc *)tp->tp_trunk;
struct trunk_port *tp_ptr;
+ struct ifih *trunk_ifih;
struct ifnet *ifp = tp->tp_if;
if (tr->tr_port_destroy != NULL)
@@ -425,8 +439,15 @@ trunk_port_destroy(struct trunk_port *tp)
ifpromisc(ifp, 0);
- /* Restore interface */
+ /* Restore interface type. */
ifp->if_type = tp->tp_iftype;
+
+ /* Restore previous input handler. */
+ trunk_ifih = SLIST_FIRST(&ifp->if_inputs);
+ SLIST_REMOVE_HEAD(&ifp->if_inputs, ifih_next);
+ KASSERT(trunk_ifih->ifih_input == trunk_input);
+ free(trunk_ifih, M_DEVBUF, sizeof(*trunk_ifih));
+
ifp->if_watchdog = tp->tp_watchdog;
ifp->if_ioctl = tp->tp_ioctl;
ifp->if_tp = NULL;
@@ -1073,12 +1094,24 @@ trunk_watchdog(struct ifnet *ifp)
}
int
-trunk_input(struct ifnet *ifp, struct ether_header *eh, struct mbuf *m)
+trunk_input(struct mbuf *m, void *hdr)
{
+ struct ifnet *ifp;
struct trunk_softc *tr;
struct trunk_port *tp;
struct ifnet *trifp = NULL;
- int error = 0;
+ struct ether_header *eh = hdr;
+ int error;
+
+ ifp = m->m_pkthdr.rcvif;
+
+ if (eh == NULL)
+ eh = mtod(m, struct ether_header *);
+
+ if (ETHER_IS_MULTICAST(eh->ether_dhost))
+ ifp->if_imcasts++;
+
+ ifp->if_ibytes += m->m_pkthdr.len;
/* Should be checked by the caller */
if (ifp->if_type != IFT_IEEE8023ADLAG) {
@@ -1098,19 +1131,19 @@ trunk_input(struct ifnet *ifp, struct ether_header *eh,
struct mbuf *m)
#if NBPFILTER > 0
if (trifp->if_bpf && tr->tr_proto != TRUNK_PROTO_FAILOVER)
- bpf_mtap_hdr(trifp->if_bpf, (char *)eh, ETHER_HDR_LEN, m,
- BPF_DIRECTION_IN, NULL);
+ bpf_mtap_ether(trifp->if_bpf, m, BPF_DIRECTION_IN);
#endif
- error = (*tr->tr_input)(tr, tp, eh, m);
- if (error != 0)
- return (error);
+ error = (*tr->tr_input)(tr, tp, m);
+ if (error)
+ goto bad;
trifp->if_ipackets++;
+ m->m_pkthdr.rcvif = trifp;
return (0);
bad:
- if (error > 0 && trifp != NULL)
+ if (trifp != NULL)
trifp->if_ierrors++;
m_freem(m);
return (error);
@@ -1290,14 +1323,9 @@ trunk_rr_start(struct trunk_softc *tr, struct mbuf *m)
}
int
-trunk_rr_input(struct trunk_softc *tr, struct trunk_port *tp,
- struct ether_header *eh, struct mbuf *m)
+trunk_rr_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf *m)
{
- struct ifnet *ifp = &tr->tr_ac.ac_if;
-
/* Just pass in the packet to our trunk device */
- m->m_pkthdr.rcvif = ifp;
-
return (0);
}
@@ -1344,8 +1372,7 @@ trunk_fail_start(struct trunk_softc *tr, struct mbuf *m)
}
int
-trunk_fail_input(struct trunk_softc *tr, struct trunk_port *tp,
- struct ether_header *eh, struct mbuf *m)
+trunk_fail_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf *m)
{
struct ifnet *ifp = &tr->tr_ac.ac_if;
struct trunk_port *tmp_tp;
@@ -1362,17 +1389,13 @@ trunk_fail_input(struct trunk_softc *tr, struct
trunk_port *tp,
if ((tmp_tp == NULL || tmp_tp == tp))
accept = 1;
}
- if (!accept) {
- m_freem(m);
+ if (!accept)
return (-1);
- }
#if NBPFILTER > 0
if (ifp->if_bpf)
- bpf_mtap_hdr(ifp->if_bpf, (char *)eh, ETHER_HDR_LEN, m,
- BPF_DIRECTION_IN, NULL);
+ bpf_mtap_ether(ifp->if_bpf, m, BPF_DIRECTION_IN);
#endif
- m->m_pkthdr.rcvif = ifp;
return (0);
}
@@ -1476,14 +1499,9 @@ trunk_lb_start(struct trunk_softc *tr, struct mbuf *m)
}
int
-trunk_lb_input(struct trunk_softc *tr, struct trunk_port *tp,
- struct ether_header *eh, struct mbuf *m)
+trunk_lb_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf *m)
{
- struct ifnet *ifp = &tr->tr_ac.ac_if;
-
/* Just pass in the packet to our trunk device */
- m->m_pkthdr.rcvif = ifp;
-
return (0);
}
@@ -1559,12 +1577,8 @@ trunk_bcast_start(struct trunk_softc *tr, struct mbuf
*m0)
}
int
-trunk_bcast_input(struct trunk_softc *tr, struct trunk_port *tp,
- struct ether_header *eh, struct mbuf *m)
+trunk_bcast_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf
*m)
{
- struct ifnet *ifp = &tr->tr_ac.ac_if;
-
- m->m_pkthdr.rcvif = ifp;
return (0);
}
@@ -1630,15 +1644,11 @@ trunk_lacp_start(struct trunk_softc *tr, struct mbuf *m)
}
int
-trunk_lacp_input(struct trunk_softc *tr, struct trunk_port *tp,
- struct ether_header *eh, struct mbuf *m)
+trunk_lacp_input(struct trunk_softc *tr, struct trunk_port *tp, struct mbuf *m)
{
- struct ifnet *ifp = &tr->tr_ac.ac_if;
-
- m = lacp_input(tp, eh, m);
+ m = lacp_input(tp, m);
if (m == NULL)
return (-1);
- m->m_pkthdr.rcvif = ifp;
return (0);
}
diff --git sys/net/if_trunk.h sys/net/if_trunk.h
index 81f7121..a7fb7bd 100644
--- sys/net/if_trunk.h
+++ sys/net/if_trunk.h
@@ -194,7 +194,7 @@ struct trunk_softc {
int (*tr_start)(struct trunk_softc *, struct mbuf *);
int (*tr_watchdog)(struct trunk_softc *);
int (*tr_input)(struct trunk_softc *, struct trunk_port *,
- struct ether_header *, struct mbuf *);
+ struct mbuf *);
int (*tr_port_create)(struct trunk_port *);
void (*tr_port_destroy)(struct trunk_port *);
void (*tr_linkstate)(struct trunk_port *);
@@ -219,8 +219,7 @@ struct trunk_lb {
struct trunk_port *lb_ports[TRUNK_MAX_PORTS];
};
-int trunk_input(struct ifnet *, struct ether_header *,
- struct mbuf *);
+int trunk_input(struct mbuf *, void *);
int trunk_enqueue(struct ifnet *, struct mbuf *);
u_int32_t trunk_hashmbuf(struct mbuf *, SIPHASH_KEY *);
#endif /* _KERNEL */
diff --git sys/net/trunklacp.c sys/net/trunklacp.c
index 7b2da3b..eade6fe 100644
--- sys/net/trunklacp.c
+++ sys/net/trunklacp.c
@@ -122,10 +122,8 @@ void lacp_aggregator_delref(struct
lacp_softc *,
/* receive machine */
-int lacp_pdu_input(struct lacp_port *,
- struct ether_header *, struct mbuf *);
-int lacp_marker_input(struct lacp_port *,
- struct ether_header *, struct mbuf *);
+int lacp_pdu_input(struct lacp_port *, struct mbuf *);
+int lacp_marker_input(struct lacp_port *, struct mbuf *);
void lacp_sm_rx(struct lacp_port *, const struct lacpdu *);
void lacp_sm_rx_timer(struct lacp_port *);
void lacp_sm_rx_set_expired(struct lacp_port *);
@@ -220,27 +218,28 @@ const lacp_timer_func_t lacp_timer_funcs[LACP_NTIMER] = {
};
struct mbuf *
-lacp_input(struct trunk_port *tp, struct ether_header *eh, struct mbuf *m)
+lacp_input(struct trunk_port *tp, struct mbuf *m)
{
struct lacp_port *lp = LACP_PORT(tp);
struct lacp_softc *lsc = lp->lp_lsc;
struct lacp_aggregator *la = lp->lp_aggregator;
+ struct ether_header *eh;
u_int8_t subtype;
+ eh = mtod(m, struct ether_header *);
+
if (ntohs(eh->ether_type) == ETHERTYPE_SLOW) {
- if (m->m_pkthdr.len < sizeof(subtype)) {
- m_freem(m);
+ if (m->m_pkthdr.len < (sizeof(*eh) + sizeof(subtype)))
return (NULL);
- }
- subtype = *mtod(m, u_int8_t *);
+ m_copydata(m, sizeof(*eh), sizeof(subtype), &subtype);
switch (subtype) {
case SLOWPROTOCOLS_SUBTYPE_LACP:
- lacp_pdu_input(lp, eh, m);
+ lacp_pdu_input(lp, m);
return (NULL);
case SLOWPROTOCOLS_SUBTYPE_MARKER:
- lacp_marker_input(lp, eh, m);
+ lacp_marker_input(lp, m);
return (NULL);
}
}
@@ -251,10 +250,8 @@ lacp_input(struct trunk_port *tp, struct ether_header *eh,
struct mbuf *m)
*/
/* This port is joined to the active aggregator */
if ((lp->lp_state & LACP_STATE_COLLECTING) == 0 ||
- la == NULL || la != lsc->lsc_active_aggregator) {
- m_freem(m);
+ la == NULL || la != lsc->lsc_active_aggregator)
return (NULL);
- }
/* Not a subtype we are interested in */
return (m);
@@ -264,7 +261,7 @@ lacp_input(struct trunk_port *tp, struct ether_header *eh,
struct mbuf *m)
* lacp_pdu_input: process lacpdu
*/
int
-lacp_pdu_input(struct lacp_port *lp, struct ether_header *eh, struct mbuf *m)
+lacp_pdu_input(struct lacp_port *lp, struct mbuf *m)
{
struct lacpdu *du;
int error = 0;
@@ -280,7 +277,7 @@ lacp_pdu_input(struct lacp_port *lp, struct ether_header
*eh, struct mbuf *m)
}
du = mtod(m, struct lacpdu *);
- if (memcmp(&eh->ether_dhost,
+ if (memcmp(&du->ldu_eh.ether_dhost,
ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN))
goto bad;
@@ -308,11 +305,9 @@ lacp_pdu_input(struct lacp_port *lp, struct ether_header
*eh, struct mbuf *m)
lacp_sm_rx(lp, du);
- m_freem(m);
return (error);
bad:
- m_freem(m);
return (EINVAL);
}
@@ -1626,7 +1621,7 @@ lacp_run_timers(struct lacp_port *lp)
}
int
-lacp_marker_input(struct lacp_port *lp, struct ether_header *eh, struct mbuf
*m)
+lacp_marker_input(struct lacp_port *lp, struct mbuf *m)
{
struct lacp_softc *lsc = lp->lp_lsc;
struct trunk_port *tp = lp->lp_trunk;
@@ -1649,7 +1644,7 @@ lacp_marker_input(struct lacp_port *lp, struct
ether_header *eh, struct mbuf *m)
mdu = mtod(m, struct markerdu *);
- if (memcmp(&eh->ether_dhost,
+ if (memcmp(&mdu->mdu_eh.ether_dhost,
ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN))
goto bad;
@@ -1663,9 +1658,9 @@ lacp_marker_input(struct lacp_port *lp, struct
ether_header *eh, struct mbuf *m)
goto bad;
mdu->mdu_tlv.tlv_type = MARKER_TYPE_RESPONSE;
- memcpy(&eh->ether_dhost,
+ memcpy(&mdu->mdu_eh.ether_dhost,
ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN);
- memcpy(&eh->ether_shost,
+ memcpy(&mdu->mdu_eh.ether_shost,
tp->tp_lladdr, ETHER_ADDR_LEN);
error = trunk_enqueue(lp->lp_ifp, m);
break;
@@ -1701,7 +1696,6 @@ lacp_marker_input(struct lacp_port *lp, struct
ether_header *eh, struct mbuf *m)
lsc->lsc_suppress_distributing = 0;
}
}
- m_freem(m);
break;
default:
@@ -1712,7 +1706,6 @@ lacp_marker_input(struct lacp_port *lp, struct
ether_header *eh, struct mbuf *m)
bad:
LACP_DPRINTF((lp, "bad marker frame\n"));
- m_freem(m);
return (EINVAL);
}
diff --git sys/net/trunklacp.h sys/net/trunklacp.h
index 12d94fe..689aaea 100644
--- sys/net/trunklacp.h
+++ sys/net/trunklacp.h
@@ -87,6 +87,7 @@ struct lacp_collectorinfo {
} __packed;
struct lacpdu {
+ struct ether_header ldu_eh;
struct slowprothdr ldu_sph;
struct tlvhdr ldu_tlv_actor;
@@ -153,6 +154,7 @@ struct lacp_markerinfo {
"\010EXPIRED"
struct markerdu {
+ struct ether_header mdu_eh;
struct slowprothdr mdu_sph;
struct tlvhdr mdu_tlv;
@@ -260,8 +262,7 @@ struct lacp_softc {
#define LACP_UNLOCK(_lsc) mtx_unlock(&(_lsc)->lsc_mtx)
#define LACP_LOCK_ASSERT(_lsc) mtx_assert(&(_lsc)->lsc_mtx, MA_OWNED)
-struct mbuf *lacp_input(struct trunk_port *,
- struct ether_header *, struct mbuf *);
+struct mbuf *lacp_input(struct trunk_port *, struct mbuf *);
struct trunk_port *lacp_select_tx_port(struct trunk_softc *, struct mbuf *);
int lacp_attach(struct trunk_softc *);
int lacp_detach(struct trunk_softc *);