this changes how ifiq_input measure whether there is too much work for
the network stack to do, which in turn is used to decide whether it
should drop packets or not.

currently we count the number of packets still on the queue, and we drop
when that backlog of packets gets too high. currently that threshold
ends up being 10240 packets for every individual nic or ring on a
nic, which is "quite high". it seems to mean that when the network
stack is under load, we keep feeding it 10k packets at a time, and
dropping the rest in between.

this moves us to counting the number of times a nic or ring tried to
enqueue packets for the network stack. so, ifiq_input operations
add 1 to an ifiq_pressure variable, and the task that dequeues and
processes the packets resets it to 0. if ifiq_pressure grows too
much due to a lot of enqueue operations without matching dequeue
ops, it assumes there's too much pressure and begins to drop packets.

this is a much nicer mechanism since it effectively scales to the
system it's running on, and feeds smaller and therefore smoother bundles
of packets into the stack. this is instead of having a queue limit
we hope works well enough on all systems ranging from a raspberry
pi all the way up to a high speed xeon box with 40Gb nics.

it seems to work well in practice. i had hoped that there'd be no
difference in performance, but hrvoje popovski has tested it and noted
an increase on one system from 1.1mpps to sitting between 1.1 and
1.3mpps, and a slow box going from 730 to 745kpps.

there's some follow on work moving nics that implement rx ring
moderation to use ifiq_input directly, and reducing their rings in
hardware before we have to resort to dropping packets in software.
my initial tests with a change like that have ix forwarding over
90% of the same pps as before, but moderating its rings to about
20 or 30 packets.  currently it grows them to the full 255 slots
and we drop 2/3rds of them in ifiq_input. i'm hoping i can tune it
so there's no drop in pps but we run with much smaller rings and
without spending so much time in m_freem.

im sending this out so people can object, otherwise im committing it
tomorrow, or about 24 hours from now.

Index: if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.572
diff -u -p -r1.572 if.c
--- if.c        26 Feb 2019 03:20:08 -0000      1.572
+++ if.c        27 Feb 2019 12:05:06 -0000
@@ -738,7 +738,7 @@ if_enqueue_ifq(struct ifnet *ifp, struct
 void
 if_input(struct ifnet *ifp, struct mbuf_list *ml)
 {
-       ifiq_input(&ifp->if_rcv, ml, 2048);
+       ifiq_input(&ifp->if_rcv, ml);
 }
 
 int
Index: ifq.c
===================================================================
RCS file: /cvs/src/sys/net/ifq.c,v
retrieving revision 1.25
diff -u -p -r1.25 ifq.c
--- ifq.c       16 Dec 2018 03:36:02 -0000      1.25
+++ ifq.c       27 Feb 2019 12:05:06 -0000
@@ -445,6 +445,7 @@ ifiq_init(struct ifiqueue *ifiq, struct 
        mtx_init(&ifiq->ifiq_mtx, IPL_NET);
        ml_init(&ifiq->ifiq_ml);
        task_set(&ifiq->ifiq_task, ifiq_process, ifiq);
+       ifiq->ifiq_pressure = 0;
 
        ifiq->ifiq_qdrops = 0;
        ifiq->ifiq_packets = 0;
@@ -467,17 +468,20 @@ ifiq_destroy(struct ifiqueue *ifiq)
        ml_purge(&ifiq->ifiq_ml);
 }
 
+unsigned int ifiq_pressure_drop = 16;
+unsigned int ifiq_pressure_return = 2;
+
 int
-ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml, unsigned int cwm)
+ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml)
 {
        struct ifnet *ifp = ifiq->ifiq_if;
        struct mbuf *m;
        uint64_t packets;
        uint64_t bytes = 0;
+       unsigned int pressure;
 #if NBPFILTER > 0
        caddr_t if_bpf;
 #endif
-       int rv = 1;
 
        if (ml_empty(ml))
                return (0);
@@ -518,12 +522,11 @@ ifiq_input(struct ifiqueue *ifiq, struct
        ifiq->ifiq_packets += packets;
        ifiq->ifiq_bytes += bytes;
 
-       if (ifiq_len(ifiq) >= cwm * 5)
+       pressure = ++ifiq->ifiq_pressure;
+       if (pressure > ifiq_pressure_drop)
                ifiq->ifiq_qdrops += ml_len(ml);
-       else {
-               rv = (ifiq_len(ifiq) >= cwm * 3);
+       else
                ml_enlist(&ifiq->ifiq_ml, ml);
-       }
        mtx_leave(&ifiq->ifiq_mtx);
 
        if (ml_empty(ml))
@@ -531,7 +534,7 @@ ifiq_input(struct ifiqueue *ifiq, struct
        else
                ml_purge(ml);
 
-       return (rv);
+       return (pressure > ifiq_pressure_return);
 }
 
 void
@@ -573,6 +576,7 @@ ifiq_process(void *arg)
                return;
 
        mtx_enter(&ifiq->ifiq_mtx);
+       ifiq->ifiq_pressure = 0;
        ml = ifiq->ifiq_ml;
        ml_init(&ifiq->ifiq_ml);
        mtx_leave(&ifiq->ifiq_mtx);
Index: ifq.h
===================================================================
RCS file: /cvs/src/sys/net/ifq.h,v
retrieving revision 1.22
diff -u -p -r1.22 ifq.h
--- ifq.h       11 Dec 2018 01:36:42 -0000      1.22
+++ ifq.h       27 Feb 2019 12:05:06 -0000
@@ -80,6 +80,7 @@ struct ifiqueue {
        struct mutex             ifiq_mtx;
        struct mbuf_list         ifiq_ml;
        struct task              ifiq_task;
+       unsigned int             ifiq_pressure;
 
        /* counters */
        uint64_t                 ifiq_packets;
@@ -473,8 +474,7 @@ ifq_idx(struct ifqueue *ifq, unsigned in
 
 void            ifiq_init(struct ifiqueue *, struct ifnet *, unsigned int);
 void            ifiq_destroy(struct ifiqueue *);
-int             ifiq_input(struct ifiqueue *, struct mbuf_list *,
-                    unsigned int);
+int             ifiq_input(struct ifiqueue *, struct mbuf_list *);
 int             ifiq_enqueue(struct ifiqueue *, struct mbuf *);
 void            ifiq_add_data(struct ifiqueue *, struct if_data *);
 void            ifiq_barrier(struct ifiqueue *);

Reply via email to