This is a pretty easy one to diagnose.  In FreeBSD 5.x+, there are
network interface locks that the ifnet::if_start() routines grab and
the IPFW dynamic rule lock that IPFW grabs.  When IPFW periodically
runs its dynamic rule keepalive event, it tries to grab the locks in
the order: IPFW dynamic rule lock, ifnet lock.  This is the wrong
order, and in my 5.4-STABLE IPFW+if_bridge(4)+ALTQ configuration,
leads to a full system deadlock.

The solution I have is pretty simple, but I have not actually
tested what was there before against WITNESS to see exactly what
it had to say since I was more interested in fixing this in my
production environment.  If interested, I can easily help set up
a test environment for this.  Here are the changes, which are, I
believe, a complete fix.  I don't particularly love the style
and feel that could probably stand improvement.

Index: ip_fw2.c
===================================================================
RCS file: /export/ncvs/src/sys/netinet/ip_fw2.c,v
retrieving revision 1.70.2.11
diff -u -r1.70.2.11 ip_fw2.c
--- ip_fw2.c    12 May 2005 15:11:30 -0000      1.70.2.11
+++ ip_fw2.c    1 Jun 2005 20:54:56 -0000
@@ -1237,12 +1237,12 @@
 }
 
 /*
- * Transmit a TCP packet, containing either a RST or a keepalive.
+ * Generate a TCP packet, containing either a RST or a keepalive.
  * When flags & TH_RST, we are sending a RST packet, because of a
  * "reset" action matched the packet.
  * Otherwise we are sending a keepalive, and flags & TH_
  */
-static void
+static struct mbuf *
 send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags)
 {
        struct mbuf *m;
@@ -1251,7 +1251,7 @@
 
        MGETHDR(m, M_DONTWAIT, MT_HEADER);
        if (m == 0)
-               return;
+               return (NULL);
        m->m_pkthdr.rcvif = (struct ifnet *)0;
        m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
        m->m_data += max_linkhdr;
@@ -1314,7 +1314,7 @@
        ip->ip_ttl = ip_defttl;
        ip->ip_len = m->m_pkthdr.len;
        m->m_flags |= M_SKIP_FIREWALL;
-       ip_output(m, NULL, NULL, 0, NULL, NULL);
+       return (m);
 }
 
 /*
@@ -1335,10 +1335,14 @@
        } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
                struct tcphdr *const tcp =
                    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
-               if ( (tcp->th_flags & TH_RST) == 0)
-                       send_pkt(&(args->f_id), ntohl(tcp->th_seq),
+               if ( (tcp->th_flags & TH_RST) == 0) {
+                       struct mbuf *m;
+                       m = send_pkt(&(args->f_id), ntohl(tcp->th_seq),
                                ntohl(tcp->th_ack),
                                tcp->th_flags | TH_RST);
+                       if (m != NULL)
+                               ip_output(m, NULL, NULL, 0, NULL, NULL);
+               }
                m_freem(args->m);
        } else
                m_freem(args->m);
@@ -3476,12 +3480,20 @@
 static void
 ipfw_tick(void * __unused unused)
 {
+       struct mbuf *m0, *m, *mn;
        int i;
        ipfw_dyn_rule *q;
 
        if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0)
                goto done;
 
+       /*
+        * We make a chain of packets to go out here -- not deferring
+        * until after we drop the IPFW dynamic rule lock would result
+        * in a lock order reversal with the normal packet input -> ipfw
+        * call stack.
+        */
+       m0 = m = NULL;
        IPFW_DYN_LOCK();
        for (i = 0 ; i < curr_dyn_buckets ; i++) {
                for (q = ipfw_dyn_v[i] ; q ; q = q->next ) {
@@ -3497,11 +3509,33 @@
                        if (TIME_LEQ(q->expire, time_second))
                                continue;       /* too late, rule expired */
 
-                       send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN);
-                       send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0);
+                       mn = send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd,
+                               TH_SYN);
+                       if (mn != NULL) {
+                               if (m0 == NULL) {
+                                       m0 = m = mn;
+                               } else {
+                                       m->m_nextpkt = mn;
+                                       m = mn;
+                               }
+                       }
+                       mn = send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0);
+                       if (mn != NULL) {
+                               if (m0 == NULL) {
+                                       m0 = m = mn;
+                               } else {
+                                       m->m_nextpkt = mn;
+                                       m = mn;
+                               }
+                       }
                }
        }
        IPFW_DYN_UNLOCK();
+       for (m = mn = m0; m != NULL; m = mn) {
+               mn = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               ip_output(m, NULL, NULL, 0, NULL, NULL);
+       }
 done:
        callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL);
 }

-- 
Brian Fundakowski Feldman                           \'[ FreeBSD ]''''''''''\
  <> [EMAIL PROTECTED]                               \  The Power to Serve! \
 Opinions expressed are my own.                       \,,,,,,,,,,,,,,,,,,,,,,\
_______________________________________________
[email protected] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-ipfw
To unsubscribe, send any mail to "[EMAIL PROTECTED]"

Reply via email to