On Fri, Feb 05, 2021 at 02:01:31AM +0100, Alexander Bluhm wrote:
> I have created an automated test from his setup.
I have committed a regress test that automatically builds the affected
setup. In case someone wants to play with it, run
make -C /usr/src/regress/sys/net/pair clean run-tcpbench-3-1
You will see that tcpbench does not transfer much because PMTU discovery
does not work. It leaves a TCP socket in FIN_WAIT_1. After a few seconds
the rexmit timeout triggers. Then the kernel crashes.
When built with -O0 you see the stack trace jumping between
tcp_output() and tcp_mtudisc().
login: kernel: double fault trap, code=0
Stopped at m_copydata+0x11: pushq %rdx
ddb> trace
m_copydata(fffffd807c7c7400,14,0,0) at m_copydata+0x11
pf_pull_hdr(fffffd807c7c7400,14,ffff8000213421a4,14,0,ffff8000213421fe) at pf_p
ull_hdr+0xa9
pf_setup_pdesc(ffff800021342100,2,2,ffff800000578900,fffffd807c7c7400,ffff80002
13421fe) at pf_setup_pdesc+0x20d
pf_test(2,2,ffff8000005b0000,ffff800021342330) at pf_test+0x16e
ip_output(fffffd807c7c7400,0,fffffd807ed0b078,800,0,fffffd807ed0b008) at ip_out
put+0xb96
tcp_output(ffff800000593420) at tcp_output+0x221b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_mtudisc(fffffd807ed0b008,ffffffff) at tcp_mtudisc+0x11b
tcp_output(ffff800000593420) at tcp_output+0x234b
tcp_timer_rexmt(ffff800000593420) at tcp_timer_rexmt+0x74f
softclock_thread(ffff800021336fc0) at softclock_thread+0x13f
end trace frame: 0x0, count: -52
Attached again is the diff that fixes it. I guess IPv6 has the same
problem. I will look into that after we know that my IPv4 fix is
correct.
bluhm
Index: netinet/ip_output.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
retrieving revision 1.363
diff -u -p -r1.363 ip_output.c
--- netinet/ip_output.c 2 Feb 2021 17:47:42 -0000 1.363
+++ netinet/ip_output.c 5 Feb 2021 14:29:46 -0000
@@ -107,7 +107,10 @@ ip_output(struct mbuf *m0, struct mbuf *
struct sockaddr_in *dst;
struct tdb *tdb = NULL;
u_long mtu;
-#if defined(MROUTING)
+#if NPF > 0
+ u_int orig_rtableid;
+#endif
+#ifdef MROUTING
int rv;
#endif
@@ -150,6 +153,7 @@ ip_output(struct mbuf *m0, struct mbuf *
}
#if NPF > 0
+ orig_rtableid = m->m_pkthdr.ph_rtableid;
reroute:
#endif
@@ -480,6 +484,15 @@ sendit:
ipsec_adjust_mtu(m, ifp->if_mtu);
#endif
error = EMSGSIZE;
+#if NPF > 0
+ /* pf changed routing table, use orig rtable for path MTU */
+ if (ro->ro_tableid != orig_rtableid) {
+ rtfree(ro->ro_rt);
+ ro->ro_tableid = orig_rtableid;
+ ro->ro_rt = icmp_mtudisc_clone(
+ satosin(&ro->ro_dst)->sin_addr, ro->ro_tableid, 0);
+ }
+#endif
/*
* This case can happen if the user changed the MTU
* of an interface after enabling IP on it. Because