this changes how ifiq_input measures whether there is too much work for the network stack to do, which in turn is used to decide whether it should drop packets or not.
currently we count the number of packets still on the queue, and we drop when that backlog of packets gets too high. currently that threshold ends up being 10240 packets for every individual nic or ring on a nic, which is "quite high". it seems to mean that when the network stack is under load, we keep feeding it 10k packets at a time, and dropping the rest in between. this moves us to counting the number of times a nic or ring tried to enqueue packets for the network stack. so, ifiq_input operations add 1 to an ifiq_pressure variable, and the task that dequeues and processes the packets resets it to 0. if ifiq_pressure grows too much due to a lot of enqueue operations without matching dequeue ops, it assumes there's too much pressure and begins to drop packets. this is a much nicer mechanism since it effectively scales to the system it's running on, and feeds smaller and therefore smoother bundles of packets into the stack. this is instead of having a queue limit we hope works well enough on all systems ranging from a raspberry pi all the way up to a high speed xeon box with 40Gb nics. it seems to work well in practice. i had hoped that there'd be no difference in performance, but hrvoje popovski has tested it and noted an increase on one system from 1.1mpps to sitting between 1.1 and 1.3mpps, and a slow box going from 730 to 745kpps. there's some follow on work moving nics that implement rx ring moderation to use ifiq_input directly, and reducing their rings in hardware before we have to resort to dropping packets in software. my initial tests with a change like that have ix forwarding over 90% of the same pps as before, but moderating its rings to about 20 or 30 packets. currently it grows them to the full 255 slots and we drop 2/3rds of them in ifiq_input. i'm hoping i can tune it so there's no drop in pps but we run with much smaller rings and without spending so much time in m_freem. 
im sending this out so people can object, otherwise im committing it tomorrow, or about 24 hours from now. Index: if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.572 diff -u -p -r1.572 if.c --- if.c 26 Feb 2019 03:20:08 -0000 1.572 +++ if.c 27 Feb 2019 12:05:06 -0000 @@ -738,7 +738,7 @@ if_enqueue_ifq(struct ifnet *ifp, struct void if_input(struct ifnet *ifp, struct mbuf_list *ml) { - ifiq_input(&ifp->if_rcv, ml, 2048); + ifiq_input(&ifp->if_rcv, ml); } int Index: ifq.c =================================================================== RCS file: /cvs/src/sys/net/ifq.c,v retrieving revision 1.25 diff -u -p -r1.25 ifq.c --- ifq.c 16 Dec 2018 03:36:02 -0000 1.25 +++ ifq.c 27 Feb 2019 12:05:06 -0000 @@ -445,6 +445,7 @@ ifiq_init(struct ifiqueue *ifiq, struct mtx_init(&ifiq->ifiq_mtx, IPL_NET); ml_init(&ifiq->ifiq_ml); task_set(&ifiq->ifiq_task, ifiq_process, ifiq); + ifiq->ifiq_pressure = 0; ifiq->ifiq_qdrops = 0; ifiq->ifiq_packets = 0; @@ -467,17 +468,20 @@ ifiq_destroy(struct ifiqueue *ifiq) ml_purge(&ifiq->ifiq_ml); } +unsigned int ifiq_pressure_drop = 16; +unsigned int ifiq_pressure_return = 2; + int -ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml, unsigned int cwm) +ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml) { struct ifnet *ifp = ifiq->ifiq_if; struct mbuf *m; uint64_t packets; uint64_t bytes = 0; + unsigned int pressure; #if NBPFILTER > 0 caddr_t if_bpf; #endif - int rv = 1; if (ml_empty(ml)) return (0); @@ -518,12 +522,11 @@ ifiq_input(struct ifiqueue *ifiq, struct ifiq->ifiq_packets += packets; ifiq->ifiq_bytes += bytes; - if (ifiq_len(ifiq) >= cwm * 5) + pressure = ++ifiq->ifiq_pressure; + if (pressure > ifiq_pressure_drop) ifiq->ifiq_qdrops += ml_len(ml); - else { - rv = (ifiq_len(ifiq) >= cwm * 3); + else ml_enlist(&ifiq->ifiq_ml, ml); - } mtx_leave(&ifiq->ifiq_mtx); if (ml_empty(ml)) @@ -531,7 +534,7 @@ ifiq_input(struct ifiqueue *ifiq, struct else 
ml_purge(ml); - return (rv); + return (pressure > ifiq_pressure_return); } void @@ -573,6 +576,7 @@ ifiq_process(void *arg) return; mtx_enter(&ifiq->ifiq_mtx); + ifiq->ifiq_pressure = 0; ml = ifiq->ifiq_ml; ml_init(&ifiq->ifiq_ml); mtx_leave(&ifiq->ifiq_mtx); Index: ifq.h =================================================================== RCS file: /cvs/src/sys/net/ifq.h,v retrieving revision 1.22 diff -u -p -r1.22 ifq.h --- ifq.h 11 Dec 2018 01:36:42 -0000 1.22 +++ ifq.h 27 Feb 2019 12:05:06 -0000 @@ -80,6 +80,7 @@ struct ifiqueue { struct mutex ifiq_mtx; struct mbuf_list ifiq_ml; struct task ifiq_task; + unsigned int ifiq_pressure; /* counters */ uint64_t ifiq_packets; @@ -473,8 +474,7 @@ ifq_idx(struct ifqueue *ifq, unsigned in void ifiq_init(struct ifiqueue *, struct ifnet *, unsigned int); void ifiq_destroy(struct ifiqueue *); -int ifiq_input(struct ifiqueue *, struct mbuf_list *, - unsigned int); +int ifiq_input(struct ifiqueue *, struct mbuf_list *); int ifiq_enqueue(struct ifiqueue *, struct mbuf *); void ifiq_add_data(struct ifiqueue *, struct if_data *); void ifiq_barrier(struct ifiqueue *);