On Fri, Feb 05, 2021 at 02:01:31AM +0100, Alexander Bluhm wrote:
> I have created an automated test from his setup.

I have committed a regress test that automatically builds the affected
setup.  In case someone wants to play with it, run:

make -C /usr/src/regress/sys/net/pair clean run-tcpbench-3-1

You will see that tcpbench does not transfer much, because PMTU
discovery does not work.  It leaves a TCP socket in FIN_WAIT_1.  After
a few seconds the rexmit timeout triggers, and then the kernel crashes.

When built with -O0 you see the stack trace jumping between
tcp_output() and tcp_mtudisc().

login: kernel: double fault trap, code=0
Stopped at      m_copydata+0x11:        pushq   %rdx
ddb> trace
m_copydata(fffffd807c7c7400,14,0,0) at m_copydata+0x11
pf_pull_hdr(fffffd807c7c7400,14,ffff8000213421a4,14,0,ffff8000213421fe) at pf_pull_hdr+0xa9
pf_setup_pdesc(ffff800021342100,2,2,ffff800000578900,fffffd807c7c7400,ffff8000213421fe) at pf_setup_pdesc+0x20d
pf_test(2,2,ffff8000005b0000,ffff800021342330) at pf_test+0x16e
ip_output(fffffd807c7c7400,0,fffffd807ed0b078,800,0,fffffd807ed0b008) at ip_output+0xb96
tcp_output(ffff800000593420) at tcp_output+0x221b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_timer_rexmt(ffff800000593420) at tcp_timer_rexmt+0x74f
softclock_thread(ffff800021336fc0) at softclock_thread+0x13f
end trace frame: 0x0, count: -52

Attached again is the diff that fixes it.  I guess IPv6 has the same
problem.  I will look into that once we know that my IPv4 fix is
correct.

bluhm

Index: netinet/ip_output.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
retrieving revision 1.363
diff -u -p -r1.363 ip_output.c
--- netinet/ip_output.c 2 Feb 2021 17:47:42 -0000       1.363
+++ netinet/ip_output.c 5 Feb 2021 14:29:46 -0000
@@ -107,7 +107,10 @@ ip_output(struct mbuf *m0, struct mbuf *
        struct sockaddr_in *dst;
        struct tdb *tdb = NULL;
        u_long mtu;
-#if defined(MROUTING)
+#if NPF > 0
+       u_int orig_rtableid;
+#endif
+#ifdef MROUTING
        int rv;
 #endif
 
@@ -150,6 +153,7 @@ ip_output(struct mbuf *m0, struct mbuf *
        }
 
 #if NPF > 0
+       orig_rtableid = m->m_pkthdr.ph_rtableid;
 reroute:
 #endif
 
@@ -480,6 +484,15 @@ sendit:
                        ipsec_adjust_mtu(m, ifp->if_mtu);
 #endif
                error = EMSGSIZE;
+#if NPF > 0
+               /* pf changed routing table, use orig rtable for path MTU */
+               if (ro->ro_tableid != orig_rtableid) {
+                       rtfree(ro->ro_rt);
+                       ro->ro_tableid = orig_rtableid;
+                       ro->ro_rt = icmp_mtudisc_clone(
+                           satosin(&ro->ro_dst)->sin_addr, ro->ro_tableid, 0);
+               }
+#endif
                /*
                 * This case can happen if the user changed the MTU
                 * of an interface after enabling IP on it.  Because

Reply via email to