From: Wesley Atwell <[email protected]>

Add a test-only TUN ioctl that inflates RX skb->truesize, plus the packetdrill-side helper needed to drive that ioctl through packetdrill's own TUN queue file descriptor.
Use that plumbing to cover the receive-window regressions where scaling_ratio drifts after advertisement, alongside the baseline too-big packetdrill cases that exercise the same sender-visible rwnd accounting from the non-injected path. Signed-off-by: Wesley Atwell <[email protected]> --- drivers/net/tun.c | 65 ++++++++ include/uapi/linux/if_tun.h | 4 + .../tcp_rcv_neg_window_truesize.pkt | 143 ++++++++++++++++++ .../net/packetdrill/tcp_rcv_toobig.pkt | 35 +++++ .../packetdrill/tcp_rcv_toobig_default.pkt | 97 ++++++++++++ .../tcp_rcv_toobig_default_truesize.pkt | 118 +++++++++++++++ .../tcp_rcv_wnd_shrink_allowed_truesize.pkt | 49 ++++++ tools/testing/selftests/net/tun.c | 140 ++++++++++++++++- 8 files changed, 650 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c492fda6fc15..2cef62cebe88 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -53,6 +53,7 @@ #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> +#include <linux/overflow.h> #include <linux/crc32.h> #include <linux/math.h> #include <linux/nsproxy.h> @@ -85,8 +86,13 @@ #include "tun_vnet.h" +struct tun_file; + +#define TUNSETTRUESIZE_OLD _IOW('T', 228, unsigned int) + static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); +static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) @@ -138,6 +144,7 @@ struct tun_file { u16 queue_index; unsigned int ifindex; }; + u32 
rx_extra_truesize; struct napi_struct napi; bool napi_enabled; bool napi_frags_enabled; @@ -1817,6 +1824,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, goto free_skb; } + tun_rx_update_truesize(tfile, skb); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { @@ -2373,6 +2381,25 @@ static void tun_put_page(struct tun_page *tpage) __page_frag_cache_drain(tpage->page, tpage->count); } +/* Tests can inflate skb->truesize on ingress to exercise receive-memory + * accounting against a scaling_ratio that drifts after a window was + * advertised. The knob is per queue file, defaults to zero, and only changes + * behavior when explicitly enabled through the TUN fd. + */ +static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb) +{ + u32 extra = READ_ONCE(tfile->rx_extra_truesize); + unsigned int truesize; + + if (!extra) + return; + + if (check_add_overflow(skb->truesize, extra, &truesize)) + truesize = UINT_MAX; + + skb->truesize = truesize; +} + static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush, @@ -2459,6 +2486,7 @@ static int tun_xdp_one(struct tun_struct *tun, goto out; } + tun_rx_update_truesize(tfile, skb); skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); skb_probe_transport_header(skb); @@ -3045,6 +3073,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, struct tun_struct *tun; void __user* argp = (void __user*)arg; unsigned int carrier; + unsigned int extra_truesize; struct ifreq ifr; kuid_t owner; kgid_t group; @@ -3309,6 +3338,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; + /* Support both the legacy pointer-payload form and the scalar form + * used by the selftest helper when injecting truesize from + * packetdrill shell commands. 
+ */ + case TUNSETTRUESIZE: + case TUNSETTRUESIZE_OLD: + ret = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto unlock; + + if (cmd == TUNSETTRUESIZE_OLD) { + ret = -EFAULT; + if (copy_from_user(&extra_truesize, argp, + sizeof(extra_truesize))) { + ret = -EINVAL; + if (arg > U32_MAX) + goto unlock; + + extra_truesize = arg; + } + } else { + ret = -EINVAL; + if (arg > U32_MAX) + goto unlock; + + extra_truesize = arg; + } + + WRITE_ONCE(tfile->rx_extra_truesize, extra_truesize); + netif_info(tun, drv, tun->dev, + "rx extra truesize set to %u\n", extra_truesize); + ret = 0; + break; + case TUNGETDEVNETNS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -3348,6 +3411,7 @@ static long tun_chr_compat_ioctl(struct file *file, case TUNGETSNDBUF: case TUNSETSNDBUF: case SIOCGIFHWADDR: + case TUNSETTRUESIZE_OLD: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); break; @@ -3408,6 +3472,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; + tfile->rx_extra_truesize = 0; init_waitqueue_head(&tfile->socket.wq.wait); diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 79d53c7a1ebd..4be63efe6540 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -61,6 +61,10 @@ #define TUNSETFILTEREBPF _IOR('T', 225, int) #define TUNSETCARRIER _IOW('T', 226, int) #define TUNGETDEVNETNS _IO('T', 227) +/* Test-only: add scalar bytes to skb->truesize on RX after TUN allocates + * an skb. 
+ */ +#define TUNSETTRUESIZE _IO('T', 228) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt new file mode 100644 index 000000000000..1c5550fff509 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +// Run the negative-window / max-advertised-window regression with inflated +// TUN skb->truesize so scaling_ratio drifts throughout the flow. The sequence +// checks and drop counters should remain identical to the uninflated case. + +--mss=1000 + +`./defaults.sh` + + 0 `nstat -n` + +// Establish a connection. + +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1000000], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,nop,nop,sackOK,nop,wscale 7> + +0 > S. 0:0(0) ack 1 win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 4> + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + +// Put 1040000 bytes into the receive buffer. + +0 < P. 1:65001(65000) ack 1 win 257 + * > . 1:1(0) ack 65001 + +0 < P. 65001:130001(65000) ack 1 win 257 + * > . 1:1(0) ack 130001 + +0 < P. 130001:195001(65000) ack 1 win 257 + * > . 1:1(0) ack 195001 + +0 < P. 195001:260001(65000) ack 1 win 257 + * > . 1:1(0) ack 260001 + +0 < P. 260001:325001(65000) ack 1 win 257 + * > . 1:1(0) ack 325001 + +0 < P. 325001:390001(65000) ack 1 win 257 + * > . 1:1(0) ack 390001 + +0 < P. 390001:455001(65000) ack 1 win 257 + * > . 1:1(0) ack 455001 + +0 < P. 455001:520001(65000) ack 1 win 257 + * > . 1:1(0) ack 520001 + +0 < P. 520001:585001(65000) ack 1 win 257 + * > . 1:1(0) ack 585001 + +0 < P. 585001:650001(65000) ack 1 win 257 + * > . 1:1(0) ack 650001 + +0 < P. 650001:715001(65000) ack 1 win 257 + * > . 
1:1(0) ack 715001 + +0 < P. 715001:780001(65000) ack 1 win 257 + * > . 1:1(0) ack 780001 + +0 < P. 780001:845001(65000) ack 1 win 257 + * > . 1:1(0) ack 845001 + +0 < P. 845001:910001(65000) ack 1 win 257 + * > . 1:1(0) ack 910001 + +0 < P. 910001:975001(65000) ack 1 win 257 + * > . 1:1(0) ack 975001 + +0 < P. 975001:1040001(65000) ack 1 win 257 + * > . 1:1(0) ack 1040001 + +// Start inflating future TUN skbs only after the baseline sender-visible +// window has been established, so the negative-window checks below exercise +// ratio drift without changing the initial max advertised window. + +0 `../tun --set-rx-truesize tun0 65536` + +// Trigger an extreme memory squeeze by shrinking SO_RCVBUF. + +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [16000], 4) = 0 + + +0 < P. 1040001:1105001(65000) ack 1 win 257 + * > . 1:1(0) ack 1040001 win 0 +// Check LINUX_MIB_TCPRCVQDROP has been incremented. + +0 `nstat -s | grep TcpExtTCPRcvQDrop | grep -q " 1 "` + +// RWIN == 0: rcv_wup = 1040001, rcv_wnd = 0, rcv_mwnd_seq > 1105001. + +// Accept pure ack with seq in max adv. window. + +0 write(4, ..., 1000) = 1000 + +0 > P. 1:1001(1000) ack 1040001 win 0 + +0 < . 1105001:1105001(0) ack 1001 win 257 + +// In order segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW). + +0 < P. 1040001:1041001(1000) ack 1001 win 257 + +0 > . 1001:1001(0) ack 1040001 win 0 +// Ooo partial segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW). + +0 < P. 1039001:1041001(2000) ack 1001 win 257 + +0 > . 1001:1001(0) ack 1040001 win 0 <nop,nop,sack 1039001:1040001> +// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented twice. + +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 2 "` + +// Ooo segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_OVERWINDOW). + +0 < P. 1105001:1106001(1000) ack 1001 win 257 + +0 > . 1001:1001(0) ack 1040001 win 0 +// Ooo segment, beyond max adv. window -> drop (SKB_DROP_REASON_TCP_INVALID_SEQUENCE). + +0 < P. 
2000001:2001001(1000) ack 1001 win 257 + +0 > . 1001:1001(0) ack 1040001 win 0 +// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice. + +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 2 "` + +// Read all data. + +0 read(4, ..., 2000000) = 1040000 + * > . 1001:1001(0) ack 1040001 + +// RWIN > 0: rcv_wup = 1040001, 0 < rcv_wnd < 32000, rcv_mwnd_seq > 1105001. + +// Accept pure ack with seq in max adv. window, beyond adv. window. + +0 write(4, ..., 1000) = 1000 + +0 > P. 1001:2001(1000) ack 1040001 + +0 < . 1105001:1105001(0) ack 2001 win 257 + +// In order segment, in max adv. window, in adv. window -> accept. + +0 < P. 1040001:1041001(1000) ack 2001 win 257 + * > . 2001:2001(0) ack 1041001 + +// Ooo partial segment, in adv. window -> accept. + +0 < P. 1040001:1042001(2000) ack 2001 win 257 + * > . 2001:2001(0) ack 1042001 <nop,nop,sack 1040001:1041001> + +// Ooo segment, in max adv. window, beyond adv. window -> drop. + +0 < P. 1105001:1106001(1000) ack 2001 win 257 + +0 > . 2001:2001(0) ack 1042001 +// Ooo segment, beyond max adv. window, beyond adv. window -> drop. + +0 < P. 2000001:2001001(1000) ack 2001 win 257 + +0 > . 2001:2001(0) ack 1042001 +// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice more. + +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 4 "` + +// We are allowed to go beyond the window and buffer with one packet. + +0 < P. 1042001:1062001(20000) ack 2001 win 257 + * > . 2001:2001(0) ack 1062001 + +0 < P. 1062001:1082001(20000) ack 2001 win 257 + * > . 2001:2001(0) ack 1082001 win 0 + +// But not more: in-order segment, in max adv. window -> drop. + +0 < P. 1082001:1083001(1000) ack 2001 win 257 + * > . 2001:2001(0) ack 1082001 +// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented again. + +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 3 "` + +// Another ratio drop must not change the final zero-window decision. + +0 `../tun --set-rx-truesize tun0 131072` + + +0 < P. 1082001:1083001(1000) ack 2001 win 257 + * > . 
2001:2001(0) ack 1082001 +// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented once more. + +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 4 "` diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt new file mode 100644 index 000000000000..837ba3633752 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +--mss=1000 + +`./defaults.sh` + + 0 `nstat -n` + +// Establish a connection. + +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [20000], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7> + +0 > S. 0:0(0) ack 1 win 18980 <mss 1460,nop,wscale 0> + +.1 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + +0 < P. 1:20001(20000) ack 1 win 257 + +.04 > . 1:1(0) ack 20001 win 18000 + + +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [12000], 4) = 0 + +0 < P. 20001:80001(60000) ack 1 win 257 + +0 > . 1:1(0) ack 20001 win 18000 + + +0 read(4, ..., 20000) = 20000 + +// A too big packet is accepted if the receive queue is empty, but the +// stronger admission path must not zero the receive buffer while doing so. + +0 < P. 20001:80001(60000) ack 1 win 257 + * > . 1:1(0) ack 80001 win 0 + +0 %{ assert SK_MEMINFO_RCVBUF > 0, SK_MEMINFO_RCVBUF }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt new file mode 100644 index 000000000000..b2e4950e0b83 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 + +--mss=1000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_moderate_rcvbuf=0` + +// Establish a connection on the default receive buffer. 
Leave a large skb in +// the queue, then deliver another one which still fits the remaining rwnd. +// We should grow sk_rcvbuf to honor the already-advertised window instead of +// dropping the packet. + +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <...> + +.1 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + +// Exchange enough data to get past the completely fresh-socket case while +// still keeping the receive buffer at its 128kB default. + +0 < P. 1:65001(65000) ack 1 win 257 + * > . 1:1(0) ack 65001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 65001:130001(65000) ack 1 win 257 + * > . 1:1(0) ack 130001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 130001:195001(65000) ack 1 win 257 + * > . 1:1(0) ack 195001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 195001:260001(65000) ack 1 win 257 + * > . 1:1(0) ack 260001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 260001:325001(65000) ack 1 win 257 + * > . 1:1(0) ack 325001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 325001:390001(65000) ack 1 win 257 + * > . 1:1(0) ack 390001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 390001:455001(65000) ack 1 win 257 + * > . 1:1(0) ack 455001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 455001:520001(65000) ack 1 win 257 + * > . 1:1(0) ack 520001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 520001:585001(65000) ack 1 win 257 + * > . 1:1(0) ack 585001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 585001:650001(65000) ack 1 win 257 + * > . 1:1(0) ack 650001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 650001:715001(65000) ack 1 win 257 + * > . 1:1(0) ack 715001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 715001:780001(65000) ack 1 win 257 + * > . 1:1(0) ack 780001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 780001:845001(65000) ack 1 win 257 + * > . 
1:1(0) ack 845001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 845001:910001(65000) ack 1 win 257 + * > . 1:1(0) ack 910001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 910001:975001(65000) ack 1 win 257 + * > . 1:1(0) ack 975001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 975001:1040001(65000) ack 1 win 257 + * > . 1:1(0) ack 1040001 + +0 read(4, ..., 65000) = 65000 + +// Leave about 60kB queued, then accept another large skb which still fits +// the rwnd we already exposed to the peer. The regression is the drop; the +// exact sk_rcvbuf growth path is an implementation detail. + +0 < P. 1040001:1102001(62000) ack 1 win 257 + * > . 1:1(0) ack 1102001 + + +0 < P. 1102001:1167001(65000) ack 1 win 257 + * > . 1:1(0) ack 1167001 + +0 read(4, ..., 127000) = 127000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt new file mode 100644 index 000000000000..c2ebe11d75f7 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 + +--mss=1000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_moderate_rcvbuf=0` + +// Establish a connection on the default receive buffer. The warmup traffic +// keeps the socket in the normal data path without changing its default +// sk_rcvbuf. Then inflate skb->truesize on future TUN RX packets so the live +// scaling_ratio drops after we already exposed a larger rwnd to the peer. +// The follow-up packet should still be admitted, and tcp_clamp_window() should +// grow sk_rcvbuf to honor the sender-visible window instead of dropping data. + +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <...> + +.1 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) 
= 4 + +// Exchange enough data to get past the completely fresh-socket case while +// still keeping the receive buffer at its initial default. + +0 < P. 1:65001(65000) ack 1 win 257 + * > . 1:1(0) ack 65001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 65001:130001(65000) ack 1 win 257 + * > . 1:1(0) ack 130001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 130001:195001(65000) ack 1 win 257 + * > . 1:1(0) ack 195001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 195001:260001(65000) ack 1 win 257 + * > . 1:1(0) ack 260001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 260001:325001(65000) ack 1 win 257 + * > . 1:1(0) ack 325001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 325001:390001(65000) ack 1 win 257 + * > . 1:1(0) ack 390001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 390001:455001(65000) ack 1 win 257 + * > . 1:1(0) ack 455001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 455001:520001(65000) ack 1 win 257 + * > . 1:1(0) ack 520001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 520001:585001(65000) ack 1 win 257 + * > . 1:1(0) ack 585001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 585001:650001(65000) ack 1 win 257 + * > . 1:1(0) ack 650001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 650001:715001(65000) ack 1 win 257 + * > . 1:1(0) ack 715001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 715001:780001(65000) ack 1 win 257 + * > . 1:1(0) ack 780001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 780001:845001(65000) ack 1 win 257 + * > . 1:1(0) ack 845001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 845001:910001(65000) ack 1 win 257 + * > . 1:1(0) ack 910001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 910001:975001(65000) ack 1 win 257 + * > . 1:1(0) ack 975001 + +0 read(4, ..., 65000) = 65000 + + +0 < P. 975001:1040001(65000) ack 1 win 257 + * > . 1:1(0) ack 1040001 + +0 read(4, ..., 65000) = 65000 + + +0 %{ base_rcvbuf = SK_MEMINFO_RCVBUF }% + +// Leave about 60kB queued, then make future TUN skbs look more expensive in +// two steps. 
Both inflated skbs still fit the already-advertised window and +// must be admitted, and sk_rcvbuf should keep growing as the live +// scaling_ratio drops further. + +0 < P. 1040001:1102001(62000) ack 1 win 257 + * > . 1:1(0) ack 1102001 + + +0 `../tun --set-rx-truesize tun0 4096` + + +0 < P. 1102001:1167001(65000) ack 1 win 257 + * > . 1:1(0) ack 1167001 + +0 %{ assert SK_MEMINFO_RCVBUF > base_rcvbuf, (base_rcvbuf, SK_MEMINFO_RCVBUF) }% + +0 %{ small_rcvbuf = SK_MEMINFO_RCVBUF }% + + +0 < P. 1167001:1229001(62000) ack 1 win 257 + * > . 1:1(0) ack 1229001 + + +0 `../tun --set-rx-truesize tun0 65536` + + +0 < P. 1229001:1294001(65000) ack 1 win 257 + * > . 1:1(0) ack 1294001 + +0 %{ assert SK_MEMINFO_RCVBUF > small_rcvbuf, (base_rcvbuf, small_rcvbuf, SK_MEMINFO_RCVBUF) }% + + +0 < P. 1294001:1356001(62000) ack 1 win 257 + * > . 1:1(0) ack 1356001 + +0 read(4, ..., 254000) = 254000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt new file mode 100644 index 000000000000..08da5fddaa12 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 + +--mss=1000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_shrink_window=1 +sysctl -q net.ipv4.tcp_rmem="4096 32768 $((32*1024*1024))"` + + 0 `nstat -n` + +// Establish a connection. After the first payload we know the peer has seen a +// scaled receive window reaching sequence 25361. Inflate later TUN skbs in two +// steps so the live scaling_ratio drops more than once, then verify that: +// 1) a segment one byte beyond the max advertised window is still dropped, +// 2) a segment exactly using the previously advertised max window is still +// accepted even though the current live ratio no longer matches that +// original advertisement basis. 
+ +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 10> + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + +0 < P. 1:10001(10000) ack 1 win 257 + * > . 1:1(0) ack 10001 win 15 + +// Max window seq advertised here is 10001 + 15*1024 = 25361. + +0 `../tun --set-rx-truesize tun0 4096` + + +0 < P. 10001:11024(1023) ack 1 win 257 + * > . 1:1(0) ack 11024 + + +0 `../tun --set-rx-truesize tun0 65536` + +// Segment beyond the max window stays invalid even after ratio drift. + +0 < P. 11024:25362(14338) ack 1 win 257 + * > . 1:1(0) ack 11024 + +// Segment exactly using the max window must still be accepted. + +0 < P. 11024:25361(14337) ack 1 win 257 + * > . 1:1(0) ack 25361 + +// Check LINUX_MIB_BEYOND_WINDOW has been incremented once. + +0 `nstat | grep TcpExtBeyondWindow | grep -q " 1 "` diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index cf106a49b55e..473992b3784d 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -2,14 +2,17 @@ #define _GNU_SOURCE +#include <dirent.h> #include <errno.h> #include <fcntl.h> +#include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <linux/if_tun.h> #include <sys/ioctl.h> +#include <sys/syscall.h> #include <sys/socket.h> #include "kselftest_harness.h" @@ -174,6 +177,135 @@ static int tun_delete(char *dev) return ip_link_del(dev); } +static bool is_numeric_name(const char *name) +{ + for (; *name; name++) { + if (*name < '0' || *name > '9') + return false; + } + + return true; +} + +static int packetdrill_dup_fd(int pidfd, const char *fd_name) +{ + char *end; + unsigned long tmp; + + errno = 0; + tmp = strtoul(fd_name, &end, 10); + if (errno || *end || tmp > INT_MAX) { + errno = EINVAL; + return -1; + } + + 
return syscall(SYS_pidfd_getfd, pidfd, (int)tmp, 0); +} + +static int open_packetdrill_tunfd(pid_t pid, const char *ifname) +{ + char fd_dir[PATH_MAX]; + struct dirent *dent; + struct ifreq ifr = {}; + int pidfd; + int saved_errno = ENOENT; + DIR *dir; + + snprintf(fd_dir, sizeof(fd_dir), "/proc/%ld/fd", (long)pid); + + pidfd = syscall(SYS_pidfd_open, pid, 0); + if (pidfd < 0) + return -1; + + dir = opendir(fd_dir); + if (!dir) { + close(pidfd); + return -1; + } + + while ((dent = readdir(dir))) { + int fd; + + if (!is_numeric_name(dent->d_name)) + continue; + + /* Reopen via pidfd_getfd() so we duplicate packetdrill's attached + * queue file, instead of opening a fresh /dev/net/tun instance. + */ + fd = packetdrill_dup_fd(pidfd, dent->d_name); + if (fd < 0) { + saved_errno = errno; + continue; + } + + memset(&ifr, 0, sizeof(ifr)); + if (!ioctl(fd, TUNGETIFF, &ifr) && + !strncmp(ifr.ifr_name, ifname, IFNAMSIZ)) { + close(pidfd); + closedir(dir); + return fd; + } + + if (errno) + saved_errno = errno; + close(fd); + } + + close(pidfd); + closedir(dir); + errno = saved_errno; + return -1; +} + +/* Packetdrill owns the TUN queue fd, so drive the test ioctl through that + * exact file descriptor found under /proc/$PACKETDRILL_PID/fd. 
+ */ +static int packetdrill_set_rx_truesize(const char *ifname, const char *value) +{ + char *packetdrill_pid, *end; + unsigned long long tmp; + unsigned int extra; + pid_t pid; + int fd; + + packetdrill_pid = getenv("PACKETDRILL_PID"); + if (!packetdrill_pid || !*packetdrill_pid) { + fprintf(stderr, "PACKETDRILL_PID is not set\n"); + return 1; + } + + errno = 0; + tmp = strtoull(packetdrill_pid, &end, 10); + if (errno || *end || !tmp || tmp > INT_MAX) { + fprintf(stderr, "invalid PACKETDRILL_PID: %s\n", packetdrill_pid); + return 1; + } + pid = (pid_t)tmp; + + errno = 0; + tmp = strtoull(value, &end, 0); + if (errno || *end || tmp > UINT_MAX) { + fprintf(stderr, "invalid truesize value: %s\n", value); + return 1; + } + extra = (unsigned int)tmp; + + fd = open_packetdrill_tunfd(pid, ifname); + if (fd < 0) { + perror("open_packetdrill_tunfd"); + return 1; + } + + if (ioctl(fd, TUNSETTRUESIZE, (unsigned long)extra)) { + perror("ioctl(TUNSETTRUESIZE)"); + close(fd); + return 1; + } + + close(fd); + return 0; +} + static int tun_open(char *dev, const int flags, const int hdrlen, const int features, const unsigned char *mac_addr) { @@ -985,4 +1117,10 @@ XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet); XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet); XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet); -TEST_HARNESS_MAIN +int main(int argc, char **argv) +{ + if (argc == 4 && !strcmp(argv[1], "--set-rx-truesize")) + return packetdrill_set_rx_truesize(argv[2], argv[3]); + + return test_harness_run(argc, argv); +} -- 2.43.0
