svn commit: r367905 - head/cddl/contrib/opensolaris/common/ctf
Author: jtl Date: Fri Nov 20 17:26:02 2020 New Revision: 367905 URL: https://svnweb.freebsd.org/changeset/base/367905 Log: When copying types from one CTF container to another, ensure that we encode 0-length (i.e. "") structure and union member names as offset 0. This ensures that we don't confuse other parts of the CTF code which expect this encoding. This resolves a Dtrace error resolving members of anonymous structs/unions within the (struct mbuf) type which some users were seeing after r366908. While here, update the code in ctf_add_generic() to encode 0-length type names as offset 0. Reviewed by: markj MFC after:2 weeks Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D27246 Modified: head/cddl/contrib/opensolaris/common/ctf/ctf_create.c Modified: head/cddl/contrib/opensolaris/common/ctf/ctf_create.c == --- head/cddl/contrib/opensolaris/common/ctf/ctf_create.c Fri Nov 20 17:13:13 2020(r367904) +++ head/cddl/contrib/opensolaris/common/ctf/ctf_create.c Fri Nov 20 17:26:02 2020(r367905) @@ -615,7 +615,7 @@ ctf_add_generic(ctf_file_t *fp, uint_t flag, const cha if ((dtd = ctf_alloc(sizeof (ctf_dtdef_t))) == NULL) return (ctf_set_errno(fp, EAGAIN)); - if (name != NULL && (s = ctf_strdup(name)) == NULL) { + if (name != NULL && *name != '\0' && (s = ctf_strdup(name)) == NULL) { ctf_free(dtd, sizeof (ctf_dtdef_t)); return (ctf_set_errno(fp, EAGAIN)); } @@ -1217,7 +1217,7 @@ membadd(const char *name, ctf_id_t type, ulong_t offse if ((dmd = ctf_alloc(sizeof (ctf_dmdef_t))) == NULL) return (ctf_set_errno(ctb->ctb_file, EAGAIN)); - if (name != NULL && (s = ctf_strdup(name)) == NULL) { + if (name != NULL && *name != '\0' && (s = ctf_strdup(name)) == NULL) { ctf_free(dmd, sizeof (ctf_dmdef_t)); return (ctf_set_errno(ctb->ctb_file, EAGAIN)); } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r367763 - head/cddl/contrib/opensolaris/common/ctf
On Tue, Nov 17, 2020 at 9:07 AM Jonathan T. Looney wrote: > Author: jtl > Date: Tue Nov 17 14:07:27 2020 > New Revision: 367763 > URL: https://svnweb.freebsd.org/changeset/base/367763 > > Log: > When copying types from one CTF container to another, ensure that we > always copy intrinsic data types before copying bitfields which are > based on those types. This ensures the type ordering in the destination > CTF container matches the assumption made elsewhere in the CTF code > that intrinsic data types will always appear before bitfields based on > those types. > > This resolves the following error message some users have seen after > r366908: > "/usr/lib/dtrace/ipfw.d", line 121: failed to copy type of 'ip6p': > Conflicting type is already defined > > Reviewed by: markj > Sponsored by: Netflix > Differential Revision:https://reviews.freebsd.org/D27213 FWIW, this should also have said: MFC after: 2 weeks ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r367763 - head/cddl/contrib/opensolaris/common/ctf
Author: jtl Date: Tue Nov 17 14:07:27 2020 New Revision: 367763 URL: https://svnweb.freebsd.org/changeset/base/367763 Log: When copying types from one CTF container to another, ensure that we always copy intrinsic data types before copying bitfields which are based on those types. This ensures the type ordering in the destination CTF container matches the assumption made elsewhere in the CTF code that instrinsic data types will always appear before bitfields based on those types. This resolves the following error message some users have seen after r366908: "/usr/lib/dtrace/ipfw.d", line 121: failed to copy type of 'ip6p': Conflicting type is already defined Reviewed by: markj Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D27213 Modified: head/cddl/contrib/opensolaris/common/ctf/ctf_create.c Modified: head/cddl/contrib/opensolaris/common/ctf/ctf_create.c == --- head/cddl/contrib/opensolaris/common/ctf/ctf_create.c Tue Nov 17 13:14:04 2020(r367762) +++ head/cddl/contrib/opensolaris/common/ctf/ctf_create.c Tue Nov 17 14:07:27 2020(r367763) @@ -1258,7 +1258,7 @@ ctf_add_type(ctf_file_t *dst_fp, ctf_file_t *src_fp, c uint_t kind, flag, vlen; ctf_bundle_t src, dst; - ctf_encoding_t src_en, dst_en; + ctf_encoding_t src_en, main_en, dst_en; ctf_arinfo_t src_ar, dst_ar; ctf_dtdef_t *dtd; @@ -1372,6 +1372,27 @@ ctf_add_type(ctf_file_t *dst_fp, ctf_file_t *src_fp, c case CTF_K_FLOAT: if (ctf_type_encoding(src_fp, src_type, _en) != 0) return (ctf_set_errno(dst_fp, ctf_errno(src_fp))); + + /* +* This could be a bitfield, and the CTF library assumes +* intrinsics will appear before bitfields. Therefore, +* try to copy over the intrinsic prior to copying the +* bitfield. 
+*/ + if (dst_type == CTF_ERR && name[0] != '\0' && + (hep = ctf_hash_lookup(_fp->ctf_names, src_fp, name, + strlen(name))) != NULL && + src_type != (ctf_id_t)hep->h_type) { + if (ctf_type_encoding(src_fp, (ctf_id_t)hep->h_type, + _en) != 0) { + return (ctf_set_errno(dst_fp, + ctf_errno(src_fp))); + } + if (bcmp(_en, _en, sizeof (ctf_encoding_t)) && + ctf_add_type(dst_fp, src_fp, + (ctf_id_t)hep->h_type) == CTF_ERR) + return (CTF_ERR); /* errno is set for us */ + } if (dst_type != CTF_ERR) { if (ctf_type_encoding(dst_fp, dst_type, _en) != 0) ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r367685 - head/tests/sys/netinet
Author: jtl Date: Sat Nov 14 15:44:28 2020 New Revision: 367685 URL: https://svnweb.freebsd.org/changeset/base/367685 Log: Add a regression test for the port-selection behavior fixed in r367680. Reviewed by: markj, olivier, tuexen Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D27173 Added: head/tests/sys/netinet/tcp_connect_port_test.c (contents, props changed) Modified: head/tests/sys/netinet/Makefile Modified: head/tests/sys/netinet/Makefile == --- head/tests/sys/netinet/Makefile Sat Nov 14 15:33:39 2020 (r367684) +++ head/tests/sys/netinet/Makefile Sat Nov 14 15:44:28 2020 (r367685) @@ -7,7 +7,8 @@ BINDIR= ${TESTSDIR} ATF_TESTS_C= ip_reass_test \ so_reuseport_lb_test \ - socket_afinet + socket_afinet \ + tcp_connect_port_test ATF_TESTS_SH= carp fibs fibs_test redirect divert forward output lpm TEST_METADATA.output+= required_programs="python" Added: head/tests/sys/netinet/tcp_connect_port_test.c == --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/tests/sys/netinet/tcp_connect_port_test.c Sat Nov 14 15:44:28 2020(r367685) @@ -0,0 +1,334 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2020 Netflix, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *notice, this list of conditions and the following disclaimer in + *the documentation and/or other materials provided with the + *distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#defineSYSCTLBAKFILE "tmp.net.inet.ip.portrange.randomized" + +/* + * Check if port allocation is randomized. If so, update it. Save the old + * value of the sysctl so it can be updated later. + */ +static void +disable_random_ports(void) +{ + int error, fd, random_new, random_save; + size_t sysctlsz; + + /* +* Pre-emptively unlink our restoration file, so we will do no +* restoration on error. +*/ + unlink(SYSCTLBAKFILE); + + /* +* Disable the net.inet.ip.portrange.randomized sysctl. Save the +* old value so we can restore it, if necessary. +*/ + random_new = 0; + sysctlsz = sizeof(random_save); + error = sysctlbyname("net.inet.ip.portrange.randomized", _save, + , _new, sizeof(random_new)); + if (error) { + warn("sysctlbyname(\"net.inet.ip.portrange.randomized\") " + "failed"); + atf_tc_skip("Unable to set sysctl"); + } + if (sysctlsz != sizeof(random_save)) { + fprintf(stderr, "Error: unexpected sysctl value size " + "(expected %zu, actual %zu)\n", sizeof(random_save), + sysctlsz); + goto restore_sysctl; + } + + /* Open the backup file, write the contents, and close it. 
*/ + fd = open(SYSCTLBAKFILE, O_WRONLY|O_CREAT|O_TRUNC|O_EXCL, + S_IRUSR|S_IWUSR); + if (fd < 0) { + warn("error opening sysctl backup file"); + goto restore_sysctl; + } + error = write(fd, _save, sizeof(random_save)); + if (error < 0) { + warn("error writing saved value to sysctl backup file"); + goto cleanup_and_restore; + } + if (error != (int)sizeof(random_save)) { + fprintf(stderr, + "Error writing saved value to sysctl backup file: " + "(expected %zu, actual %d)\n", sizeof(random_save),
svn commit: r367680 - head/sys/netinet6
Author: jtl Date: Sat Nov 14 14:50:34 2020 New Revision: 367680 URL: https://svnweb.freebsd.org/changeset/base/367680 Log: Fix implicit automatic local port selection for IPv6 during connect calls. When a user creates a TCP socket and tries to connect to the socket without explicitly binding the socket to a local address, the connect call implicitly chooses an appropriate local port. When evaluating candidate local ports, the algorithm checks for conflicts with existing ports by doing a lookup in the connection hash table. In this circumstance, both the IPv4 and IPv6 code look for exact matches in the hash table. However, the IPv4 code goes a step further and checks whether the proposed 4-tuple will match wildcard (e.g. TCP "listen") entries. The IPv6 code has no such check. The missing wildcard check can cause problems when connecting to a local server. It is possible that the algorithm will choose the same value for the local port as the foreign port uses. This results in a connection with identical source and destination addresses and ports. Changing the IPv6 code to align with the IPv4 code's behavior fixes this problem. Reviewed by: tuexen Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D27164 Modified: head/sys/netinet6/in6_pcb.c Modified: head/sys/netinet6/in6_pcb.c == --- head/sys/netinet6/in6_pcb.c Sat Nov 14 14:15:49 2020(r367679) +++ head/sys/netinet6/in6_pcb.c Sat Nov 14 14:50:34 2020(r367680) @@ -464,7 +464,8 @@ in6_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr rehash = false; error = in_pcb_lport_dest(inp, (struct sockaddr *) , >inp_lport, - (struct sockaddr *) sin6, sin6->sin6_port, cred, 0); + (struct sockaddr *) sin6, sin6->sin6_port, cred, + INPLOOKUP_WILDCARD); if (error) return (error); } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r367573 - head/sys/vm
Author: jtl Date: Tue Nov 10 18:12:09 2020 New Revision: 367573 URL: https://svnweb.freebsd.org/changeset/base/367573 Log: When destroying a UMA zone which has a reserve (set with uma_zone_reserve()), messages like the following appear on the console: "Freed UMA keg (Test zone) was not empty (0 items). Lost 528 pages of memory." When keg_drain_domain() is draining the zone, it tries to keep the number of items specified in the reservation. However, when we are destroying the UMA zone, we do not need to keep those items. Therefore, when destroying a non-secondary and non-cache zone, we should reset the keg reservation to 0 prior to draining the zone. Reviewed by: markj Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D27129 Modified: head/sys/vm/uma_core.c Modified: head/sys/vm/uma_core.c == --- head/sys/vm/uma_core.c Tue Nov 10 18:10:50 2020(r367572) +++ head/sys/vm/uma_core.c Tue Nov 10 18:12:09 2020(r367573) @@ -2791,6 +2791,10 @@ zone_dtor(void *arg, int size, void *udata) rw_wlock(_rwlock); LIST_REMOVE(zone, uz_link); rw_wunlock(_rwlock); + if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { + keg = zone->uz_keg; + keg->uk_reserve = 0; + } zone_reclaim(zone, M_WAITOK, true); /* ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r360019 - head/tests/sys/kern
On Sun, Apr 19, 2020 at 5:03 PM Enji Cooper wrote: > > > On Apr 16, 2020, at 1:07 PM, Jonathan T. Looney wrote: > > > > Author: jtl > > Date: Thu Apr 16 20:07:34 2020 > > New Revision: 360019 > > URL: https://svnweb.freebsd.org/changeset/base/360019 > > > > Log: > > Add a regression test for the changes in r359922 and r359923. > > > > Note that the Python code has been tested on both Python 2.7 and 3.7. > > > > Reviewed by: olivier > > MFC after: 2 weeks > > Sponsored by:Netflix, Inc. > > Ugh… I can tell by this commit that I just need to add pytest > support to kyua >_>… > I assume you're suggesting that we support running Python code natively, instead of from a shell script. If so, I think that's a great idea. It looks like we're growing a number of Python scripts in the test suite, and supporting them more natively is probably good for more than one reason. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r360020 - in head/sys: netinet netinet6
Author: jtl Date: Thu Apr 16 20:17:24 2020 New Revision: 360020 URL: https://svnweb.freebsd.org/changeset/base/360020 Log: Avoid calling protocol drain routines more than once per reclamation event. mb_reclaim() calls the protocol drain routines for each protocol in each domain. Some protocols exist in more than one domain and share drain routines. In the case of SCTP, it also uses the same drain routine for its SOCK_SEQPACKET and SOCK_STREAM entries in the same domain. On systems with INET, INET6, and SCTP all defined, mb_reclaim() calls sctp_drain() four times. On systems with INET and INET6 defined, mb_reclaim() calls tcp_drain() twice. mb_reclaim() is the only in-tree caller of the pr_drain protocol entry. Eliminate this duplication by ensuring that each pr_drain routine is only specified for one protocol entry in one domain. Reviewed by: tuexen MFC after:2 weeks Sponsored by: Netflix, Inc. Differential Revision:https://reviews.freebsd.org/D24418 Modified: head/sys/netinet/in_proto.c head/sys/netinet6/in6_proto.c Modified: head/sys/netinet/in_proto.c == --- head/sys/netinet/in_proto.c Thu Apr 16 20:07:34 2020(r360019) +++ head/sys/netinet/in_proto.c Thu Apr 16 20:17:24 2020(r360020) @@ -163,7 +163,7 @@ struct protosw inetsw[] = { .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, - .pr_drain = sctp_drain, + .pr_drain = NULL, /* Covered by the SOCK_SEQPACKET entry. 
*/ .pr_usrreqs = _usrreqs }, #endif /* SCTP */ Modified: head/sys/netinet6/in6_proto.c == --- head/sys/netinet6/in6_proto.c Thu Apr 16 20:07:34 2020 (r360019) +++ head/sys/netinet6/in6_proto.c Thu Apr 16 20:17:24 2020 (r360020) @@ -172,11 +172,11 @@ struct protosw inet6sw[] = { .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, -#ifndef INET /* don't call initialization and timeout routines twice */ +#ifndef INET /* don't call initialization, timeout, and drain routines twice */ .pr_init = tcp_init, .pr_slowtimo = tcp_slowtimo, -#endif .pr_drain = tcp_drain, +#endif .pr_usrreqs = _usrreqs, }, #ifdef SCTP @@ -188,8 +188,8 @@ struct protosw inet6sw[] = { .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, +#ifndef INET /* Do not call initialization and drain routines twice. */ .pr_drain = sctp_drain, -#ifndef INET /* Do not call initialization twice. */ .pr_init = sctp_init, #endif .pr_usrreqs = _usrreqs @@ -202,7 +202,7 @@ struct protosw inet6sw[] = { .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, - .pr_drain = sctp_drain, + .pr_drain = NULL, /* Covered by the SOCK_SEQPACKET entry. */ .pr_usrreqs = _usrreqs }, #endif /* SCTP */ ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r360019 - head/tests/sys/kern
Author: jtl Date: Thu Apr 16 20:07:34 2020 New Revision: 360019 URL: https://svnweb.freebsd.org/changeset/base/360019 Log: Add a regression test for the changes in r359922 and r359923. Note that the Python code has been tested on both Python 2.7 and 3.7. Reviewed by: olivier MFC after:2 weeks Sponsored by: Netflix, Inc. Added: head/tests/sys/kern/sonewconn_overflow.py (contents, props changed) head/tests/sys/kern/sonewconn_overflow.sh (contents, props changed) Modified: head/tests/sys/kern/Makefile Modified: head/tests/sys/kern/Makefile == --- head/tests/sys/kern/MakefileThu Apr 16 18:37:11 2020 (r360018) +++ head/tests/sys/kern/MakefileThu Apr 16 20:07:34 2020 (r360019) @@ -1,5 +1,7 @@ # $FreeBSD$ +PACKAGE= tests + TESTSRC= ${SRCTOP}/contrib/netbsd-tests/kernel .PATH: ${SRCTOP}/sys/kern @@ -24,6 +26,12 @@ ATF_TESTS_C+=waitpid_nohang ATF_TESTS_C+= pdeathsig ATF_TESTS_SH+= coredump_phnum_test +ATF_TESTS_SH+= sonewconn_overflow +TEST_METADATA.sonewconn_overflow+= required_programs="python" +TEST_METADATA.sonewconn_overflow+= required_user="root" + +${PACKAGE}FILES+= sonewconn_overflow.py +${PACKAGE}FILESMODE_sonewconn_overflow.py=0555 BINDIR=${TESTSDIR} PROGS+=coredump_phnum_helper Added: head/tests/sys/kern/sonewconn_overflow.py == --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/tests/sys/kern/sonewconn_overflow.py Thu Apr 16 20:07:34 2020 (r360019) @@ -0,0 +1,156 @@ +#!/usr/bin/env python +#- +# SPDX-License-Identifier: BSD-2-Clause +# +# Copyright (c) 2020 Netflix, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +#notice, this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright +#notice, this list of conditions and the following disclaimer in the +#documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ +# + +import socket +import os +import sys +from subprocess import check_output +from time import sleep + +V4HOST = '127.0.0.1' +V6HOST = '::1' +TCPPORT = 65432 +UNIXSOCK = '/tmp/testsock' +TYPE = socket.SOCK_STREAM + +class GenericTest(object): +def __init__(self): +raise NotImplementedError("Subclass must override the __init__ method") +def setup(self, af, addr): +self.sockets = [] +self.ls = None +self.ls = socket.socket(af, TYPE) +self.ls.bind(addr) +self.ls.listen(2) +self.af = af +self.addr = addr +def doTest(self, cnt): +rv = 0 +for i in range(0, cnt): +try: +s = socket.socket(self.af, TYPE) +s.connect(self.addr) +except: +continue +self.sockets.append(s) +rv += 1 +return rv +def __del__(self): +for s in self.sockets: +s.close() +if self.ls is not None: +self.ls.close() + +class IPv4Test(GenericTest): +def __init__(self): +super(IPv4Test, self).setup(socket.AF_INET, (V4HOST, TCPPORT)) + +class IPv6Test(GenericTest): +def __init__(self): +super(IPv6Test, self).setup(socket.AF_INET6, 
(V6HOST, TCPPORT)) + +class UnixTest(GenericTest): +def __init__(self): +super(UnixTest, self).setup(socket.AF_UNIX, UNIXSOCK) +def __del__(self): +super(UnixTest, self).__del__() +os.remove(UNIXSOCK) + +class LogChecker(): +def __init__(self): +# Figure out how big the dmesg buffer is. +self.dmesgOff = len(check_output("/sbin/dmesg")) + +def checkForMsg(self, expected): +newOff = self.dmesgOff +for i in range(0, 3): +dmesg = check_output("/sbin/dmesg") +newOff = len(dmesg) +
svn commit: r359923 - in head: lib/libc/sys sys/kern sys/sys
Author: jtl Date: Tue Apr 14 15:38:18 2020 New Revision: 359923 URL: https://svnweb.freebsd.org/changeset/base/359923 Log: Make sonewconn() overflow messages have per-socket rate-limits and values. sonewconn() emits debug-level messages when a listen socket's queue overflows. Currently, sonewconn() tracks overflows on a global basis. It will only log one message every 60 seconds, regardless of how many sockets experience overflows. And, when it next logs at the end of the 60 seconds, it records a single message referencing a single PCB with the total number of overflows across all sockets. This commit changes to per-socket overflow tracking. The code will now log one message every 60 seconds per socket. And, the code will provide per-socket queue length and overflow counts. It also provides a way to change the period between log messages using a sysctl. Reviewed by: jhb (previous version), bcr (manpages) MFC after:2 weeks Sponsored by: Netflix, Inc. Differential Revision:https://reviews.freebsd.org/D24316 Modified: head/lib/libc/sys/listen.2 head/sys/kern/uipc_socket.c head/sys/sys/socketvar.h Modified: head/lib/libc/sys/listen.2 == --- head/lib/libc/sys/listen.2 Tue Apr 14 15:30:34 2020(r359922) +++ head/lib/libc/sys/listen.2 Tue Apr 14 15:38:18 2020(r359923) @@ -28,7 +28,7 @@ .\"From: @(#)listen.2 8.2 (Berkeley) 12/11/93 .\" $FreeBSD$ .\" -.Dd August 18, 2016 +.Dd April 14, 2020 .Dt LISTEN 2 .Os .Sh NAME @@ -110,6 +110,13 @@ or less than zero is specified, .Fa backlog is silently forced to .Va kern.ipc.soacceptqueue . +.Pp +If the listen queue overflows, the kernel will emit a LOG_DEBUG syslog message. +The +.Xr sysctl 3 +MIB variable +.Va kern.ipc.sooverinterval +specifies a per-socket limit on how often the kernel will emit these messages. 
.Sh INTERACTION WITH ACCEPT FILTERS When accept filtering is used on a socket, a second queue will be used to hold sockets that have connected, but have not yet Modified: head/sys/kern/uipc_socket.c == --- head/sys/kern/uipc_socket.c Tue Apr 14 15:30:34 2020(r359922) +++ head/sys/kern/uipc_socket.c Tue Apr 14 15:38:18 2020(r359923) @@ -575,6 +575,11 @@ SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, _sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif +static struct timeval overinterval = { 60, 0 }; +SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, +, +"Delay in seconds between warnings for listen socket overflows"); + /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject @@ -587,14 +592,10 @@ SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, struct socket * sonewconn(struct socket *head, int connstatus) { - static struct timeval lastover; - static struct timeval overinterval = { 60, 0 }; - static int overcount; - struct sbuf descrsb; struct socket *so; - u_int over; - int len; + int len, overcount; + u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) @@ -602,18 +603,31 @@ sonewconn(struct socket *head, int connstatus) #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif + bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); - SOLISTEN_UNLOCK(head); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif - overcount++; + head->sol_overcount++; + dolog = !!ratecheck(>sol_lastover, ); - if (ratecheck(, )) { + /* +* If we're going to log, copy the overflow count and queue +* length from the listen socket before dropping the lock. +* Also, reset the overflow count. 
+*/ + if (dolog) { + overcount = head->sol_overcount; + head->sol_overcount = 0; + qlen = head->sol_qlen; + } + SOLISTEN_UNLOCK(head); + + if (dolog) { /* * Try to print something descriptive about the * socket for the error message. @@ -686,7 +700,7 @@ sonewconn(struct socket *head, int connstatus) "%i already in queue awaiting acceptance " "(%d occurrences)\n", __func__, head->so_pcb, sbuf_data(), - head->sol_qlen, overcount); + qlen, overcount);
svn commit: r359922 - head/sys/kern
Author: jtl Date: Tue Apr 14 15:30:34 2020 New Revision: 359922 URL: https://svnweb.freebsd.org/changeset/base/359922 Log: Print more detail as part of the sonewconn() overflow message. When a socket's listen queue overflows, sonewconn() emits a debug-level log message. These messages are sometimes useful to systems administrators in highlighting a process which is not keeping up with its listen queue. This commit attempts to enhance the usefulness of this message by printing more details about the socket's address. If all else fails, it will at least print the domain name of the socket. Reviewed by: bz, jhb, kbowling MFC after:2 weeks Sponsored by: Netflix, Inc. Differential Revision:https://reviews.freebsd.org/D24272 Modified: head/sys/kern/uipc_socket.c Modified: head/sys/kern/uipc_socket.c == --- head/sys/kern/uipc_socket.c Tue Apr 14 15:27:24 2020(r359921) +++ head/sys/kern/uipc_socket.c Tue Apr 14 15:30:34 2020(r359922) @@ -130,6 +130,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -140,9 +141,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include #include +#include #include #include @@ -587,8 +591,17 @@ sonewconn(struct socket *head, int connstatus) static struct timeval overinterval = { 60, 0 }; static int overcount; + struct sbuf descrsb; struct socket *so; u_int over; + int len; + const char localprefix[] = "local:"; + char descrbuf[SUNPATHLEN + sizeof(localprefix)]; +#if defined(INET6) + char addrbuf[INET6_ADDRSTRLEN]; +#elif defined(INET) + char addrbuf[INET_ADDRSTRLEN]; +#endif SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); @@ -601,10 +614,80 @@ sonewconn(struct socket *head, int connstatus) overcount++; if (ratecheck(, )) { - log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " + /* +* Try to print something descriptive about the +* socket for the error message. 
+*/ + sbuf_new(, descrbuf, sizeof(descrbuf), + SBUF_FIXEDLEN); + switch (head->so_proto->pr_domain->dom_family) { +#if defined(INET) || defined(INET6) +#ifdef INET + case AF_INET: +#endif +#ifdef INET6 + case AF_INET6: + if (head->so_proto->pr_domain->dom_family == + AF_INET6 || + (sotoinpcb(head)->inp_inc.inc_flags & + INC_ISIPV6)) { + ip6_sprintf(addrbuf, + (head)->inp_inc.inc6_laddr); + sbuf_printf(, "[%s]", addrbuf); + } else +#endif + { +#ifdef INET + inet_ntoa_r( + sotoinpcb(head)->inp_inc.inc_laddr, + addrbuf); + sbuf_cat(, addrbuf); +#endif + } + sbuf_printf(, ":%hu (proto %u)", + ntohs(sotoinpcb(head)->inp_inc.inc_lport), + head->so_proto->pr_protocol); + break; +#endif /* INET || INET6 */ + case AF_UNIX: + sbuf_cat(, localprefix); + if (sotounpcb(head)->unp_addr != NULL) + len = + sotounpcb(head)->unp_addr->sun_len - + offsetof(struct sockaddr_un, + sun_path); + else + len = 0; + if (len > 0) + sbuf_bcat(, + sotounpcb(head)->unp_addr->sun_path, + len); + else + sbuf_cat(, "(unknown)"); + break; + } + + /* +* If we can't print something more specific, at least +* print the domain name. +*/ + if (sbuf_finish() != 0 || + sbuf_len() <= 0) { +
svn commit: r359921 - head/sys/sys
Author: jtl Date: Tue Apr 14 15:27:24 2020 New Revision: 359921 URL: https://svnweb.freebsd.org/changeset/base/359921 Log: Make the path length of UNIX domain sockets specified by a #define. Also, add a comment describing the historical context for this length. Reviewed by: bz, jhb, kbowling (previous version) MFC after:2 weeks Sponsored by: Netflix, Inc. Differential Revision:https://reviews.freebsd.org/D24272 Modified: head/sys/sys/un.h Modified: head/sys/sys/un.h == --- head/sys/sys/un.h Tue Apr 14 14:48:00 2020(r359920) +++ head/sys/sys/un.h Tue Apr 14 15:27:24 2020(r359921) @@ -44,12 +44,20 @@ typedef __sa_family_t sa_family_t; #endif /* + * Historically, (struct sockaddr) needed to fit inside an mbuf. + * For this reason, UNIX domain sockets were therefore limited to + * 104 bytes. While this limit is no longer necessary, it is kept for + * binary compatibility reasons. + */ +#defineSUNPATHLEN 104 + +/* * Definitions for UNIX IPC domain. */ struct sockaddr_un { unsigned char sun_len;/* sockaddr len including null */ sa_family_t sun_family; /* AF_UNIX */ - charsun_path[104]; /* path name (gag) */ + charsun_path[SUNPATHLEN]; /* path name (gag) */ }; #if __BSD_VISIBLE ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r357742 - head/sys/vm
Author: jtl Date: Mon Feb 10 18:06:38 2020 New Revision: 357742 URL: https://svnweb.freebsd.org/changeset/base/357742 Log: Modify the vm.panic_on_oom sysctl to take a count of events. Currently, the vm.panic_on_oom sysctl is a boolean which controls the behavior of the VM system when it encounters an out-of-memory situation. If set to 0, the VM system kills the largest process. If set to any other value, the VM system will initiate a panic. This change makes the sysctl a count of events. If set to 0, the VM system kills the largest process. If set to any other value, the VM system will kill the largest process until it has seen the specified number of out-of-memory events. Once it reaches the specified number of events, it will initiate a panic. This change is helpful in capturing cores when the system is in a perpetual cycle of out-of-memory events (as opposed to just hitting one or two sporadic out-of-memory events). Reviewed by: kib MFC after:2 weeks Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D23601 Modified: head/sys/vm/vm_pageout.c Modified: head/sys/vm/vm_pageout.c == --- head/sys/vm/vm_pageout.cMon Feb 10 17:17:08 2020(r357741) +++ head/sys/vm/vm_pageout.cMon Feb 10 18:06:38 2020(r357742) @@ -158,7 +158,7 @@ static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, _panic_on_oom, 0, - "panic on out of memory instead of killing the largest process"); + "Panic on the given number of out-of-memory errors instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RWTUN, _pageout_update_period, 0, @@ -1933,7 +1933,7 @@ vm_pageout_oom(int shortage) } sx_sunlock(_lock); if (bigproc != NULL) { - if (vm_panic_on_oom != 0) + if (vm_panic_on_oom != 0 && --vm_panic_on_oom == 0) panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, 
send any mail to "svn-src-all-unsubscribe@freebsd.org"
svn commit: r352746 - head/sys/netinet
Author: jtl Date: Thu Sep 26 15:18:57 2019 New Revision: 352746 URL: https://svnweb.freebsd.org/changeset/base/352746 Log: Add new functionality to switch to using cookies exclusively when we the syn cache overflows. Whether this is due to an attack or due to the system having more legitimate connections than the syn cache can hold, this situation can quickly impact performance. To make the system perform better during these periods, the code will now switch to exclusively using cookies until the syn cache stops overflowing. In order for this to occur, the system must be configured to use the syn cache with syn cookie fallback. If syn cookies are completely disabled, this change should have no functional impact. When the system is exclusively using syn cookies (either due to configuration or the overflow detection enabled by this change), the code will now skip acquiring a lock on the syn cache bucket. Additionally, the code will now skip lookups in several places (such as when the system receives a RST in response to a SYN|ACK frame). Reviewed by: rrs, gallatin (previous version) Discussed with: tuexen Sponsored by: Netflix, Inc. 
Differential Revision:https://reviews.freebsd.org/D21644 Modified: head/sys/netinet/tcp_syncache.c head/sys/netinet/tcp_syncache.h Modified: head/sys/netinet/tcp_syncache.c == --- head/sys/netinet/tcp_syncache.c Thu Sep 26 15:06:46 2019 (r352745) +++ head/sys/netinet/tcp_syncache.c Thu Sep 26 15:18:57 2019 (r352746) @@ -144,6 +144,8 @@ static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *); +static voidsyncache_pause(struct in_conninfo *); +static voidsyncache_unpause(void *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, @@ -300,6 +302,14 @@ syncache_init(void) arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); callout_reset(_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, syncookie_reseed, _tcp_syncache); + + /* Initialize the pause machinery. */ + mtx_init(_tcp_syncache.pause_mtx, "tcp_sc_pause", NULL, MTX_DEF); + callout_init_mtx(_tcp_syncache.pause_co, _tcp_syncache.pause_mtx, + 0); + V_tcp_syncache.pause_until = time_uptime - TCP_SYNCACHE_PAUSE_TIME; + V_tcp_syncache.pause_backoff = 0; + V_tcp_syncache.paused = false; } #ifdef VIMAGE @@ -316,6 +326,14 @@ syncache_destroy(void) */ callout_drain(_tcp_syncache.secret.reseed); + /* Stop the SYN cache pause callout. */ + mtx_lock(_tcp_syncache.pause_mtx); + if (callout_stop(_tcp_syncache.pause_co) == 0) { + mtx_unlock(_tcp_syncache.pause_mtx); + callout_drain(_tcp_syncache.pause_co); + } else + mtx_unlock(_tcp_syncache.pause_mtx); + /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { @@ -339,6 +357,7 @@ syncache_destroy(void) /* Free the allocated global resources. 
*/ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); + mtx_destroy(_tcp_syncache.pause_mtx); } #endif @@ -360,10 +379,10 @@ syncache_insert(struct syncache *sc, struct syncache_h if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(>sch_bucket), ("sch->sch_length incorrect")); + syncache_pause(>sc_inc); sc2 = TAILQ_LAST(>sch_bucket, sch_head); sch->sch_last_overflow = time_uptime; syncache_drop(sc2, sch); - TCPSTAT_INC(tcps_sc_bucketoverflow); } /* Put it into the bucket. */ @@ -450,6 +469,7 @@ syncache_timer(void *xsch) struct syncache *sc, *nsc; int tick = ticks; char *s; + bool paused; CURVNET_SET(sch->sch_sc->vnet); @@ -462,7 +482,19 @@ syncache_timer(void *xsch) */ sch->sch_nextc = tick + INT_MAX; + /* +* If we have paused processing, unconditionally remove +* all syncache entries. +*/ + mtx_lock(_tcp_syncache.pause_mtx); + paused = V_tcp_syncache.paused; + mtx_unlock(_tcp_syncache.pause_mtx); + TAILQ_FOREACH_SAFE(sc, >sch_bucket, sc_hash, nsc) { + if (paused) { + syncache_drop(sc, sch); + continue; + } /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be @@ -505,14
svn commit: r352745 - head/sys/netinet
Author: jtl Date: Thu Sep 26 15:06:46 2019 New Revision: 352745 URL: https://svnweb.freebsd.org/changeset/base/352745 Log: Access the syncache secret directly from the V_tcp_syncache variable, rather than indirectly through the backpointer to the tcp_syncache structure stored in the hashtable bucket. This also allows us to remove the requirement in syncookie_generate() and syncookie_lookup() that the syncache hashtable bucket must be locked. Reviewed by: gallatin, rrs Sponsored by: Netflix, Inc. Differential Revision:https://reviews.freebsd.org/D21644 Modified: head/sys/netinet/tcp_syncache.c Modified: head/sys/netinet/tcp_syncache.c == --- head/sys/netinet/tcp_syncache.c Thu Sep 26 15:02:34 2019 (r352744) +++ head/sys/netinet/tcp_syncache.c Thu Sep 26 15:06:46 2019 (r352745) @@ -2061,8 +2061,6 @@ syncookie_generate(struct syncache_head *sch, struct s uint8_t *secbits; union syncookie cookie; - SCH_LOCK_ASSERT(sch); - cookie.cookie = 0; /* Map our computed MSS into the 3-bit index. */ @@ -2090,10 +2088,10 @@ syncookie_generate(struct syncache_head *sch, struct s cookie.flags.sack_ok = 1; /* Which of the two secrets to use. */ - secbit = sch->sch_sc->secret.oddeven & 0x1; + secbit = V_tcp_syncache.secret.oddeven & 0x1; cookie.flags.odd_even = secbit; - secbits = sch->sch_sc->secret.key[secbit]; + secbits = V_tcp_syncache.secret.key[secbit]; hash = syncookie_mac(>sc_inc, sc->sc_irs, cookie.cookie, secbits, (uintptr_t)sch); @@ -2121,8 +2119,6 @@ syncookie_lookup(struct in_conninfo *inc, struct synca int wnd, wscale = 0; union syncookie cookie; - SCH_LOCK_ASSERT(sch); - /* * Pull information out of SYN-ACK/ACK and revert sequence number * advances. @@ -2137,7 +2133,7 @@ syncookie_lookup(struct in_conninfo *inc, struct synca cookie.cookie = (ack & 0xff) ^ (ack >> 24); /* Which of the two secrets to use. 
*/ - secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; + secbits = V_tcp_syncache.secret.key[cookie.flags.odd_even]; hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r352744 - head/sys/netinet
Author: jtl Date: Thu Sep 26 15:02:34 2019 New Revision: 352744 URL: https://svnweb.freebsd.org/changeset/base/352744 Log: Remove the unused sch parameter to the syncache_respond() function. The use of this parameter was removed in r313330. This commit now removes passing this now-unused parameter. Reviewed by: gallatin, rrs Sponsored by: Netflix, Inc. Differential Revision:https://reviews.freebsd.org/D21644 Modified: head/sys/netinet/tcp_syncache.c Modified: head/sys/netinet/tcp_syncache.c == --- head/sys/netinet/tcp_syncache.c Thu Sep 26 14:48:39 2019 (r352743) +++ head/sys/netinet/tcp_syncache.c Thu Sep 26 15:02:34 2019 (r352744) @@ -130,8 +130,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_ static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); -static int syncache_respond(struct syncache *, struct syncache_head *, - const struct mbuf *, int); +static int syncache_respond(struct syncache *, const struct mbuf *, int); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, @@ -495,7 +494,7 @@ syncache_timer(void *xsch) free(s, M_TCPLOG); } - syncache_respond(sc, sch, NULL, TH_SYN|TH_ACK); + syncache_respond(sc, NULL, TH_SYN|TH_ACK); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } @@ -632,7 +631,7 @@ syncache_chkrst(struct in_conninfo *inc, struct tcphdr "sending challenge ACK\n", s, __func__, th->th_seq, sc->sc_irs + 1, sc->sc_wnd); - syncache_respond(sc, sch, m, TH_ACK); + syncache_respond(sc, m, TH_ACK); } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) @@ -1475,7 +1474,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *t s, __func__); free(s, M_TCPLOG); } - if (syncache_respond(sc, sch, m, TH_SYN|TH_ACK) == 0) { + if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { 
sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); @@ -1640,7 +1639,7 @@ skip_alloc: /* * Do a standard 3-way handshake. */ - if (syncache_respond(sc, sch, m, TH_SYN|TH_ACK) == 0) { + if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != ) syncache_free(sc); else if (sc != ) @@ -1685,8 +1684,7 @@ tfo_expanded: * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. */ static int -syncache_respond(struct syncache *sc, struct syncache_head *sch, -const struct mbuf *m0, int flags) +syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) { struct ip *ip = NULL; struct mbuf *m; ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r350829 - stable/11/sys/kern
Author: jtl Date: Sat Aug 10 00:02:45 2019 New Revision: 350829 URL: https://svnweb.freebsd.org/changeset/base/350829 Log: MFC r350815: In m_pulldown(), before trying to prepend bytes to the subsequent mbuf, ensure that the subsequent mbuf contains the remainder of the bytes the caller sought. If this is not the case, fall through to the code which gathers the bytes in a new mbuf. This fixes a bug where m_pulldown() could fail to gather all the desired bytes into consecutive memory. PR: 238787 Approved by: so (emaste) Modified: stable/11/sys/kern/uipc_mbuf2.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/kern/uipc_mbuf2.c == --- stable/11/sys/kern/uipc_mbuf2.c Sat Aug 10 00:01:25 2019 (r350828) +++ stable/11/sys/kern/uipc_mbuf2.c Sat Aug 10 00:02:45 2019 (r350829) @@ -214,7 +214,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp goto ok; } if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen -&& writable) { +&& writable && n->m_next->m_len >= tlen) { n->m_next->m_data -= hlen; n->m_next->m_len += hlen; bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r350828 - stable/12/sys/kern
Author: jtl Date: Sat Aug 10 00:01:25 2019 New Revision: 350828 URL: https://svnweb.freebsd.org/changeset/base/350828 Log: MFC r350815: In m_pulldown(), before trying to prepend bytes to the subsequent mbuf, ensure that the subsequent mbuf contains the remainder of the bytes the caller sought. If this is not the case, fall through to the code which gathers the bytes in a new mbuf. This fixes a bug where m_pulldown() could fail to gather all the desired bytes into consecutive memory. PR: 238787 Approved by: so (emaste) Modified: stable/12/sys/kern/uipc_mbuf2.c Directory Properties: stable/12/ (props changed) Modified: stable/12/sys/kern/uipc_mbuf2.c == --- stable/12/sys/kern/uipc_mbuf2.c Fri Aug 9 23:50:57 2019 (r350827) +++ stable/12/sys/kern/uipc_mbuf2.c Sat Aug 10 00:01:25 2019 (r350828) @@ -216,7 +216,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp goto ok; } if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen -&& writable) { +&& writable && n->m_next->m_len >= tlen) { n->m_next->m_data -= hlen; n->m_next->m_len += hlen; bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r350815 - head/sys/kern
Author: jtl Date: Fri Aug 9 05:18:59 2019 New Revision: 350815 URL: https://svnweb.freebsd.org/changeset/base/350815 Log: In m_pulldown(), before trying to prepend bytes to the subsequent mbuf, ensure that the subsequent mbuf contains the remainder of the bytes the caller sought. If this is not the case, fall through to the code which gathers the bytes in a new mbuf. This fixes a bug where m_pulldown() could fail to gather all the desired bytes into consecutive memory. PR: 238787 Reported by: A reddit user Discussed with: emaste Obtained from:NetBSD MFC after:3 days Modified: head/sys/kern/uipc_mbuf2.c Modified: head/sys/kern/uipc_mbuf2.c == --- head/sys/kern/uipc_mbuf2.c Fri Aug 9 02:20:26 2019(r350814) +++ head/sys/kern/uipc_mbuf2.c Fri Aug 9 05:18:59 2019(r350815) @@ -216,7 +216,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp goto ok; } if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen -&& writable) { +&& writable && n->m_next->m_len >= tlen) { n->m_next->m_data -= hlen; n->m_next->m_len += hlen; bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r349197 - stable/12/sys/netinet/tcp_stacks
Author: jtl Date: Wed Jun 19 16:25:39 2019 New Revision: 349197 URL: https://svnweb.freebsd.org/changeset/base/349197 Log: MFC r349192: Add the ability to limit how much the code will fragment the RACK send map in response to SACKs. The default behavior is unchanged; however, the limit can be activated by changing the new net.inet.tcp.rack.split_limit sysctl. Approved by: so (gordon) Security: CVE-2019-5599 Modified: stable/12/sys/netinet/tcp_stacks/rack.c stable/12/sys/netinet/tcp_stacks/tcp_rack.h Directory Properties: stable/12/ (props changed) Modified: stable/12/sys/netinet/tcp_stacks/rack.c == --- stable/12/sys/netinet/tcp_stacks/rack.c Wed Jun 19 16:09:20 2019 (r349196) +++ stable/12/sys/netinet/tcp_stacks/rack.c Wed Jun 19 16:25:39 2019 (r349197) @@ -1,6 +1,5 @@ /*- - * Copyright (c) 2016-2018 - * Netflix Inc. All rights reserved. + * Copyright (c) 2016-2019 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -203,6 +202,7 @@ static int32_t rack_always_send_oldest = 0; static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; +static uint32_t rack_map_split_limit = 0; /* unlimited by default */ /* Rack specific counters */ counter_u64_t rack_badfr; @@ -228,6 +228,8 @@ counter_u64_t rack_to_arm_tlp; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; +counter_u64_t rack_alloc_limited_conns; +counter_u64_t rack_split_limited; counter_u64_t rack_sack_proc_all; counter_u64_t rack_sack_proc_short; @@ -261,6 +263,8 @@ static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); +static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, +uint8_t limit_type); static struct rack_sendmap 
* rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused); @@ -445,6 +449,8 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); + counter_u64_zero(rack_alloc_limited_conns); + counter_u64_zero(rack_split_limited); counter_u64_zero(rack_find_high); counter_u64_zero(rack_runt_sacks); counter_u64_zero(rack_used_tlpmethod); @@ -622,6 +628,11 @@ rack_init_sysctls() OID_AUTO, "pktdelay", CTLFLAG_RW, _pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); + SYSCTL_ADD_U32(_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "split_limit", CTLFLAG_RW, + _map_split_limit, 0, + "Is there a limit on the number of map split entries (0=unlimited)"); SYSCTL_ADD_S32(_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "inc_var", CTLFLAG_RW, @@ -757,7 +768,19 @@ rack_init_sysctls() SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allocemerg", CTLFLAG_RD, _to_alloc_emerg, - "Total alocations done from emergency cache"); + "Total allocations done from emergency cache"); + rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, + _alloc_limited_conns, + "Connections with allocations dropped due to limit"); + rack_split_limited = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "split_limited", CTLFLAG_RD, + _split_limited, + "Split allocations dropped due to limit"); rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1121,10 +1144,11 @@ rack_alloc(struct tcp_rack *rack) { struct rack_sendmap *rsm; - counter_u64_add(rack_to_alloc, 1); - rack->r_ctl.rc_num_maps_alloced++; rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { +alloc_done: + counter_u64_add(rack_to_alloc, 1); + 
rack->r_ctl.rc_num_maps_alloced++; return (rsm); } if (rack->rc_free_cnt) { @@ -1132,14 +1156,46 @@ rack_alloc(struct tcp_rack *rack) rsm = TAILQ_FIRST(>r_ctl.rc_free); TAILQ_REMOVE(>r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt--; - return
svn commit: r349192 - head/sys/netinet/tcp_stacks
Author: jtl Date: Wed Jun 19 13:55:00 2019 New Revision: 349192 URL: https://svnweb.freebsd.org/changeset/base/349192 Log: Add the ability to limit how much the code will fragment the RACK send map in response to SACKs. The default behavior is unchanged; however, the limit can be activated by changing the new net.inet.tcp.rack.split_limit sysctl. Submitted by: Peter Lei Reported by: jtl Reviewed by: lstewart (earlier version) Security: CVE-2019-5599 Modified: head/sys/netinet/tcp_stacks/rack.c head/sys/netinet/tcp_stacks/tcp_rack.h Modified: head/sys/netinet/tcp_stacks/rack.c == --- head/sys/netinet/tcp_stacks/rack.c Wed Jun 19 13:33:34 2019 (r349191) +++ head/sys/netinet/tcp_stacks/rack.c Wed Jun 19 13:55:00 2019 (r349192) @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2016-2018 Netflix, Inc. + * Copyright (c) 2016-2019 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -202,6 +202,7 @@ static int32_t rack_always_send_oldest = 0; static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; +static uint32_t rack_map_split_limit = 0; /* unlimited by default */ /* Rack specific counters */ counter_u64_t rack_badfr; @@ -227,6 +228,8 @@ counter_u64_t rack_to_arm_tlp; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; +counter_u64_t rack_alloc_limited_conns; +counter_u64_t rack_split_limited; counter_u64_t rack_sack_proc_all; counter_u64_t rack_sack_proc_short; @@ -260,6 +263,8 @@ static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); +static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, +uint8_t limit_type); static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t 
tsused); @@ -444,6 +449,8 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); + counter_u64_zero(rack_alloc_limited_conns); + counter_u64_zero(rack_split_limited); counter_u64_zero(rack_find_high); counter_u64_zero(rack_runt_sacks); counter_u64_zero(rack_used_tlpmethod); @@ -621,6 +628,11 @@ rack_init_sysctls() OID_AUTO, "pktdelay", CTLFLAG_RW, _pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); + SYSCTL_ADD_U32(_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "split_limit", CTLFLAG_RW, + _map_split_limit, 0, + "Is there a limit on the number of map split entries (0=unlimited)"); SYSCTL_ADD_S32(_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "inc_var", CTLFLAG_RW, @@ -756,7 +768,19 @@ rack_init_sysctls() SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allocemerg", CTLFLAG_RD, _to_alloc_emerg, - "Total alocations done from emergency cache"); + "Total allocations done from emergency cache"); + rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, + _alloc_limited_conns, + "Connections with allocations dropped due to limit"); + rack_split_limited = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "split_limited", CTLFLAG_RD, + _split_limited, + "Split allocations dropped due to limit"); rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1120,10 +1144,11 @@ rack_alloc(struct tcp_rack *rack) { struct rack_sendmap *rsm; - counter_u64_add(rack_to_alloc, 1); - rack->r_ctl.rc_num_maps_alloced++; rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { +alloc_done: + counter_u64_add(rack_to_alloc, 1); + rack->r_ctl.rc_num_maps_alloced++; return (rsm); } if (rack->rc_free_cnt) { @@ 
-1131,14 +1156,46 @@ rack_alloc(struct tcp_rack *rack) rsm = TAILQ_FIRST(>r_ctl.rc_free); TAILQ_REMOVE(>r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt--; - return (rsm); + goto alloc_done; } return (NULL); }
svn commit: r348996 - head/sys/dev/ipmi
Author: jtl Date: Wed Jun 12 16:06:31 2019 New Revision: 348996 URL: https://svnweb.freebsd.org/changeset/base/348996 Log: The current IPMI KCS code is waiting 100us for all transitions (roughly between each byte either sent or received). However, most transitions actually complete in 2-3 microseconds. By polling the status register with a delay of 4us with exponential backoff, the performance of most IPMI operations is significantly improved: - A BMC update on a Supermicro x9 or x11 motherboard goes from ~1 hour to ~6-8 minutes. - An ipmitool sensor list time improves by a factor of 4. Testing showed no significant improvements on a modern server by using a lower delay. The changes should also generally reduce the total amount of CPU or I/O bandwidth used for a given IPMI operation. Submitted by: Loic Prylli Reviewed by: jhb MFC after:2 weeks Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D20527 Modified: head/sys/dev/ipmi/ipmi_kcs.c Modified: head/sys/dev/ipmi/ipmi_kcs.c == --- head/sys/dev/ipmi/ipmi_kcs.cWed Jun 12 16:05:20 2019 (r348995) +++ head/sys/dev/ipmi/ipmi_kcs.cWed Jun 12 16:06:31 2019 (r348996) @@ -48,55 +48,46 @@ __FBSDID("$FreeBSD$"); #include #endif +#definePOLLING_DELAY_MIN 4 /* Waits are 2-3 usecs on typical systems */ +#definePOLLING_DELAY_MAX 256 + static voidkcs_clear_obf(struct ipmi_softc *, int); static voidkcs_error(struct ipmi_softc *); -static int kcs_wait_for_ibf(struct ipmi_softc *, int); -static int kcs_wait_for_obf(struct ipmi_softc *, int); +static int kcs_wait_for_ibf(struct ipmi_softc *, bool); +static int kcs_wait_for_obf(struct ipmi_softc *, bool); static int -kcs_wait_for_ibf(struct ipmi_softc *sc, int state) +kcs_wait(struct ipmi_softc *sc, int value, int mask) { int status, start = ticks; + int delay_usec = POLLING_DELAY_MIN; status = INB(sc, KCS_CTL_STS); - if (state == 0) { - /* WAIT FOR IBF = 0 */ - while (ticks - start < MAX_TIMEOUT && status & KCS_STATUS_IBF) { - DELAY(100); - status = INB(sc, 
KCS_CTL_STS); - } - } else { - /* WAIT FOR IBF = 1 */ - while (ticks - start < MAX_TIMEOUT && - !(status & KCS_STATUS_IBF)) { - DELAY(100); - status = INB(sc, KCS_CTL_STS); - } + while (ticks - start < MAX_TIMEOUT && (status & mask) != value) { + /* +* The wait delay is increased exponentially to avoid putting +* significant load on I/O bus. +*/ + DELAY(delay_usec); + status = INB(sc, KCS_CTL_STS); + if (delay_usec < POLLING_DELAY_MAX) + delay_usec *= 2; } return (status); } static int -kcs_wait_for_obf(struct ipmi_softc *sc, int state) +kcs_wait_for_ibf(struct ipmi_softc *sc, bool level) { - int status, start = ticks; - status = INB(sc, KCS_CTL_STS); - if (state == 0) { - /* WAIT FOR OBF = 0 */ - while (ticks - start < MAX_TIMEOUT && status & KCS_STATUS_OBF) { - DELAY(100); - status = INB(sc, KCS_CTL_STS); - } - } else { - /* WAIT FOR OBF = 1 */ - while (ticks - start < MAX_TIMEOUT && - !(status & KCS_STATUS_OBF)) { - DELAY(100); - status = INB(sc, KCS_CTL_STS); - } - } - return (status); + return (kcs_wait(sc, level ? KCS_STATUS_IBF : 0, KCS_STATUS_IBF)); +} + +static int +kcs_wait_for_obf(struct ipmi_softc *sc, bool level) +{ + + return (kcs_wait(sc, level ? KCS_STATUS_OBF : 0, KCS_STATUS_OBF)); } static void ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r348810 - head/sys/x86/x86
Hi Enji, On Sat, Jun 8, 2019 at 3:58 PM Enji Cooper wrote: > > Modified: > > head/sys/x86/x86/mca.c > > > > Modified: head/sys/x86/x86/mca.c > > > == > > --- head/sys/x86/x86/mca.cSat Jun 8 17:49:17 2019(r348809) > > +++ head/sys/x86/x86/mca.cSat Jun 8 18:26:48 2019(r348810) > > @@ -86,7 +86,6 @@ struct amd_et_state { > > > > struct mca_internal { > >struct mca_record rec; > > -intlogged; > >STAILQ_ENTRY(mca_internal) link; > > }; > > > > @@ -101,6 +100,7 @@ static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check > Arch > > > > static volatile int mca_count;/* Number of records stored. */ > > static int mca_banks;/* Number of per-CPU register banks. */ > > +static int mca_maxcount = -1;/* Limit on records stored. (-1 = > unlimited) */ > > > > static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, > > "Machine Check Architecture"); > > @@ -125,10 +125,11 @@ SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, > CTLFLAG_RDTU > > static STAILQ_HEAD(, mca_internal) mca_freelist; > > static int mca_freecount; > > static STAILQ_HEAD(, mca_internal) mca_records; > > +static STAILQ_HEAD(, mca_internal) mca_pending; > > static struct callout mca_timer; > > static int mca_ticks = 3600;/* Check hourly by default. */ > > static struct taskqueue *mca_tq; > > -static struct task mca_refill_task, mca_scan_task; > > +static struct task mca_resize_task, mca_scan_task; > > static struct mtx mca_lock; > > > > static unsigned int > > @@ -557,32 +558,49 @@ mca_check_status(int bank, struct mca_record *rec) > > } > > > > static void > > -mca_fill_freelist(void) > > +mca_resize_freelist(void) > > { > > -struct mca_internal *rec; > > -int desired; > > +struct mca_internal *next, *rec; > > +STAILQ_HEAD(, mca_internal) tmplist; > > +int count, i, desired_max, desired_min; > > > >/* > > * Ensure we have at least one record for each bank and one > > - * record per CPU. > > + * record per CPU, but no more than twice that amount. 
> > */ > > -desired = imax(mp_ncpus, mca_banks); > > +desired_min = imax(mp_ncpus, mca_banks); > > +desired_max = imax(mp_ncpus, mca_banks) * 2; > > +STAILQ_INIT(); > >mtx_lock_spin(_lock); > > -while (mca_freecount < desired) { > > +while (mca_freecount > desired_max) { > > +rec = STAILQ_FIRST(_freelist); > > +KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty", > > +mca_freecount)); > > +STAILQ_REMOVE_HEAD(_freelist, link); > > +mca_freecount--; > > +STAILQ_INSERT_TAIL(, rec, link); > > +} > > +while (mca_freecount < desired_min) { > > +count = desired_min - mca_freecount; > >mtx_unlock_spin(_lock); > > Should this also be outside the loop, like it was before, to ensure the > locking is correct, the lock is always unlocked, and the current thread > yields to the other threads, or should the lock be held over both loops? > I think it is correct as is, but welcome feedback if you still think it is incorrect after my explanation. Here is what the new code does: 1. While holding the lock, we figure out how many entries we want to add to the free list. We store it in a local variable (count). 2. We drop the lock and allocate count entries. 3. We reacquire the lock, add those entries to the list, and increment mca_freecount by count. 4. We then check to see if we need to allocate more entries (which might occur if, for example, the free list had been depleted by other threads between steps 1 and 3, while we didn't hold the lock). The key thing is that we must hold the lock while reading or writing mca_freecount and mca_freelist. This code meets that criteria. Even if mca_freecount changes between steps 1 and 3 above, we still add count entries to the list. And, we still increment mca_freecount by count. So, in the end, mca_freecount should match the number of entries on the list. (It might be more or less than our target, but it will be accurate.) 
The only reason we need to drop the lock is to allocate more entries, which we can't do while holding a spin lock. I don't think there is particularly an impetus to want to yield more often than that. Also, note that you'll only ever enter one of the two loops. (If you are freeing excess entries, you won't need to allocate more to meet a shortage.) Finally, note that you'll never leave this function with less than the desired minimum, but you might leave with more than the desired maximum (if entries are freed by other threads between steps 1 and 3). I considered that to be an acceptable tradeoff between complexity and functionality. And, in any case, if you are using and freeing so many entries so quickly that this occurs, it seems like two things are true: you could probably use the extra headroom on the free list and also it is very likely that something will call the resize task
svn commit: r348810 - head/sys/x86/x86
Author: jtl Date: Sat Jun 8 18:26:48 2019 New Revision: 348810 URL: https://svnweb.freebsd.org/changeset/base/348810 Log: Currently, MCA entries remain on an every-growing linked list. This means that it becomes increasingly expensive to process a steady stream of correctable errors. Additionally, the memory used by the MCA entries can grow without bound. Change the code to maintain two separate lists: a list of entries which still need to be logged, and a list of entries which have already been logged. Additionally, allow a user-configurable limit on the number of entries which will be saved after they are logged. (The limit defaults to -1 [unlimited], which is the current behavior.) Reviewed by: imp, jhb MFC after:2 weeks Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D20482 Modified: head/sys/x86/x86/mca.c Modified: head/sys/x86/x86/mca.c == --- head/sys/x86/x86/mca.c Sat Jun 8 17:49:17 2019(r348809) +++ head/sys/x86/x86/mca.c Sat Jun 8 18:26:48 2019(r348810) @@ -86,7 +86,6 @@ struct amd_et_state { struct mca_internal { struct mca_record rec; - int logged; STAILQ_ENTRY(mca_internal) link; }; @@ -101,6 +100,7 @@ static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Arch static volatile int mca_count; /* Number of records stored. */ static int mca_banks; /* Number of per-CPU register banks. */ +static int mca_maxcount = -1; /* Limit on records stored. (-1 = unlimited) */ static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture"); @@ -125,10 +125,11 @@ SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTU static STAILQ_HEAD(, mca_internal) mca_freelist; static int mca_freecount; static STAILQ_HEAD(, mca_internal) mca_records; +static STAILQ_HEAD(, mca_internal) mca_pending; static struct callout mca_timer; static int mca_ticks = 3600; /* Check hourly by default. 
*/ static struct taskqueue *mca_tq; -static struct task mca_refill_task, mca_scan_task; +static struct task mca_resize_task, mca_scan_task; static struct mtx mca_lock; static unsigned int @@ -557,32 +558,49 @@ mca_check_status(int bank, struct mca_record *rec) } static void -mca_fill_freelist(void) +mca_resize_freelist(void) { - struct mca_internal *rec; - int desired; + struct mca_internal *next, *rec; + STAILQ_HEAD(, mca_internal) tmplist; + int count, i, desired_max, desired_min; /* * Ensure we have at least one record for each bank and one -* record per CPU. +* record per CPU, but no more than twice that amount. */ - desired = imax(mp_ncpus, mca_banks); + desired_min = imax(mp_ncpus, mca_banks); + desired_max = imax(mp_ncpus, mca_banks) * 2; + STAILQ_INIT(); mtx_lock_spin(_lock); - while (mca_freecount < desired) { + while (mca_freecount > desired_max) { + rec = STAILQ_FIRST(_freelist); + KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty", + mca_freecount)); + STAILQ_REMOVE_HEAD(_freelist, link); + mca_freecount--; + STAILQ_INSERT_TAIL(, rec, link); + } + while (mca_freecount < desired_min) { + count = desired_min - mca_freecount; mtx_unlock_spin(_lock); - rec = malloc(sizeof(*rec), M_MCA, M_WAITOK); + for (i = 0; i < count; i++) { + rec = malloc(sizeof(*rec), M_MCA, M_WAITOK); + STAILQ_INSERT_TAIL(, rec, link); + } mtx_lock_spin(_lock); - STAILQ_INSERT_TAIL(_freelist, rec, link); - mca_freecount++; + STAILQ_CONCAT(_freelist, ); + mca_freecount += count; } mtx_unlock_spin(_lock); + STAILQ_FOREACH_SAFE(rec, , link, next) + free(rec, M_MCA); } static void -mca_refill(void *context, int pending) +mca_resize(void *context, int pending) { - mca_fill_freelist(); + mca_resize_freelist(); } static void @@ -607,12 +625,8 @@ mca_record_entry(enum scan_mode mode, const struct mca } rec->rec = *record; - rec->logged = 0; - STAILQ_INSERT_TAIL(_records, rec, link); - mca_count++; + STAILQ_INSERT_TAIL(_pending, rec, link); mtx_unlock_spin(_lock); - if (mode == 
CMCI && !cold) - taskqueue_enqueue(mca_tq, _refill_task); } #ifdef DEV_APIC @@ -788,14 +802,69 @@ mca_scan(enum scan_mode mode, int *recoverablep) } #endif } - if (mode == POLLED) - mca_fill_freelist(); if (recoverablep != NULL) *recoverablep = recoverable; return (count); } /* + * Store a new record on
svn commit: r343662 - svnadmin/conf
Author: jtl Date: Fri Feb 1 15:38:20 2019 New Revision: 343662 URL: https://svnweb.freebsd.org/changeset/base/343662 Log: Add bz@ as a co-mentor for thj@. Approved by: core (implicit) Modified: svnadmin/conf/mentors Modified: svnadmin/conf/mentors == --- svnadmin/conf/mentors Fri Feb 1 12:33:00 2019(r343661) +++ svnadmin/conf/mentors Fri Feb 1 15:38:20 2019(r343662) @@ -33,6 +33,6 @@ ram ken Co-mentor: mav rgrimesphk Co-mentor: bde slavashkib Co-mentor: hselasky slmken Co-mentor: scottl, ambrisko -thjjtl +thjjtl Co-mentor: bz tmunro mjg Co-mentor: allanjude wosch cem ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r340483 - head/sys/netinet
Author: jtl Date: Fri Nov 16 18:32:48 2018 New Revision: 340483 URL: https://svnweb.freebsd.org/changeset/base/340483 Log: Add some additional length checks to the IPv4 fragmentation code. Specifically, block 0-length fragments, even when the MF bit is clear. Also, ensure that every fragment with the MF bit clear ends at the same offset and that no subsequently-received fragments exceed that offset. Reviewed by: glebius, markj MFC after:3 days Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D17922 Modified: head/sys/netinet/ip_reass.c head/sys/netinet/ip_var.h Modified: head/sys/netinet/ip_reass.c == --- head/sys/netinet/ip_reass.c Fri Nov 16 17:07:54 2018(r340482) +++ head/sys/netinet/ip_reass.c Fri Nov 16 18:32:48 2018(r340483) @@ -211,19 +211,21 @@ ip_reass(struct mbuf *m) * convert offset of this to bytes. */ ip->ip_len = htons(ntohs(ip->ip_len) - hlen); - if (ip->ip_off & htons(IP_MF)) { - /* -* Make sure that fragments have a data length -* that's a non-zero multiple of 8 bytes. -*/ - if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) { - IPSTAT_INC(ips_toosmall); /* XXX */ - IPSTAT_INC(ips_fragdropped); - m_freem(m); - return (NULL); - } + /* +* Make sure that fragments have a data length +* that's a non-zero multiple of 8 bytes, unless +* this is the last fragment. 
+*/ + if (ip->ip_len == htons(0) || + ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) { + IPSTAT_INC(ips_toosmall); /* XXX */ + IPSTAT_INC(ips_fragdropped); + m_freem(m); + return (NULL); + } + if (ip->ip_off & htons(IP_MF)) m->m_flags |= M_IP_FRAG; - } else + else m->m_flags &= ~M_IP_FRAG; ip->ip_off = htons(ntohs(ip->ip_off) << 3); @@ -301,9 +303,28 @@ ip_reass(struct mbuf *m) fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; + if (m->m_flags & M_IP_FRAG) + fp->ipq_maxoff = -1; + else + fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len); m->m_nextpkt = NULL; goto done; } else { + /* +* If we already saw the last fragment, make sure +* this fragment's offset looks sane. Otherwise, if +* this is the last fragment, record its endpoint. +*/ + if (fp->ipq_maxoff > 0) { + i = ntohs(ip->ip_off) + ntohs(ip->ip_len); + if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) || + ((m->m_flags & M_IP_FRAG) == 0 && + i != fp->ipq_maxoff)) { + fp = NULL; + goto dropfrag; + } + } else if ((m->m_flags & M_IP_FRAG) == 0) + fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len); fp->ipq_nfrags++; atomic_add_int(, 1); #ifdef MAC Modified: head/sys/netinet/ip_var.h == --- head/sys/netinet/ip_var.h Fri Nov 16 17:07:54 2018(r340482) +++ head/sys/netinet/ip_var.h Fri Nov 16 18:32:48 2018(r340483) @@ -61,6 +61,7 @@ struct ipq { u_char ipq_ttl;/* time for reass q to live */ u_char ipq_p; /* protocol of this fragment */ u_short ipq_id; /* sequence id for reassembly */ + int ipq_maxoff; /* total length of packet */ struct mbuf *ipq_frags; /* to ip headers of fragments */ struct in_addr ipq_src,ipq_dst; u_char ipq_nfrags; /* # frags in this packet */ ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r340241 - head/sys/vm
Nice find! Jonathan On Wed, Nov 7, 2018 at 6:28 PM Mark Johnston wrote: > Author: markj > Date: Wed Nov 7 23:28:11 2018 > New Revision: 340241 > URL: https://svnweb.freebsd.org/changeset/base/340241 > > Log: > Fix a use-after-free in swp_pager_meta_free(). > > This was introduced in r326329 and explains the crashes mentioned in > the commit log message for r339934. In particular, on INVARIANTS > kernels, UMA trashing causes the loop to exit early, leaving swap > blocks behind when they should have been freed. After r336984 this > became more problematic since new anonymous mappings were more > likely to reuse swapped-out subranges of existing VM objects, so faults > would trigger pageins of freed memory rather than returning zeroed > pages. > > Reviewed by: kib > MFC after:3 days > Sponsored by: The FreeBSD Foundation > Differential Revision:https://reviews.freebsd.org/D17897 > > Modified: > head/sys/vm/swap_pager.c > > Modified: head/sys/vm/swap_pager.c > > == > --- head/sys/vm/swap_pager.cWed Nov 7 21:36:52 2018(r340240) > +++ head/sys/vm/swap_pager.cWed Nov 7 23:28:11 2018(r340241) > @@ -1972,13 +1972,13 @@ swp_pager_meta_free(vm_object_t object, > vm_pindex_t pi > swp_pager_update_freerange(_free, _free, > sb->d[i]); > sb->d[i] = SWAPBLK_NONE; > } > + pindex = sb->p + SWAP_META_PAGES; > if (swp_pager_swblk_empty(sb, 0, start) && > swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) { > SWAP_PCTRIE_REMOVE(>un_pager.swp.swp_blks, > sb->p); > uma_zfree(swblk_zone, sb); > } > - pindex = sb->p + SWAP_META_PAGES; > } > swp_pager_freeswapspace(s_free, n_free); > } > > ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r340077 - head/sys/netinet
Author: jtl Date: Fri Nov 2 19:14:15 2018 New Revision: 340077 URL: https://svnweb.freebsd.org/changeset/base/340077 Log: m_pulldown() may reallocate n. Update the oip pointer after the m_pulldown() call. MFC after:2 weeks Sponsored by: Netflix Modified: head/sys/netinet/ip_icmp.c Modified: head/sys/netinet/ip_icmp.c == --- head/sys/netinet/ip_icmp.c Fri Nov 2 19:02:03 2018(r340076) +++ head/sys/netinet/ip_icmp.c Fri Nov 2 19:14:15 2018(r340077) @@ -264,6 +264,7 @@ icmp_error(struct mbuf *n, int type, int code, uint32_ if (n->m_len < oiphlen + tcphlen && (n = m_pullup(n, oiphlen + tcphlen)) == NULL) goto freeit; + oip = mtod(n, struct ip *); icmpelen = max(tcphlen, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } else if (oip->ip_p == IPPROTO_SCTP) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r339419 - in head: share/man/man9 sys/kern
Author: jtl Date: Thu Oct 18 14:20:15 2018 New Revision: 339419 URL: https://svnweb.freebsd.org/changeset/base/339419 Log: r334853 added a "socket destructor" callback. However, as implemented, it was really a "socket close" callback. Update the socket destructor functionality to run when a socket is destroyed (rather than when it is closed). The original submitter has confirmed that this change satisfies the intended use case. Suggested by: rwatson Submitted by: Michio Honda Tested by:Michio Honda Approved by: re (kib) Differential Revision:https://reviews.freebsd.org/D17590 Modified: head/share/man/man9/socket.9 head/sys/kern/uipc_socket.c Modified: head/share/man/man9/socket.9 == --- head/share/man/man9/socket.9Thu Oct 18 04:36:25 2018 (r339418) +++ head/share/man/man9/socket.9Thu Oct 18 14:20:15 2018 (r339419) @@ -26,7 +26,7 @@ .\" .\" $FreeBSD$ .\" -.Dd June 8, 2018 +.Dd October 18, 2018 .Dt SOCKET 9 .Os .Sh NAME @@ -378,8 +378,8 @@ or A kernel system can use the .Fn sodtor_set function to set a destructor for a socket. -The destructor is called when the socket is closed. -The destructor is called after the protocol close routine has completed. +The destructor is called when the socket is is about to be freed. +The destructor is called before the protocol detach routine. The destructor can serve as a callback to initiate additional cleanup actions. 
.Ss Socket I/O The Modified: head/sys/kern/uipc_socket.c == --- head/sys/kern/uipc_socket.c Thu Oct 18 04:36:25 2018(r339418) +++ head/sys/kern/uipc_socket.c Thu Oct 18 14:20:15 2018(r339419) @@ -1026,6 +1026,9 @@ sofree(struct socket *so) so->so_error = ECONNABORTED; SOCK_UNLOCK(so); + if (so->so_dtor != NULL) + so->so_dtor(so); + VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); @@ -1102,8 +1105,6 @@ soclose(struct socket *so) drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); - if (so->so_dtor != NULL) - so->so_dtor(so); SOCK_LOCK(so); if ((listening = (so->so_options & SO_ACCEPTCONN))) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r339378 - head/sys/netinet
Author: jtl Date: Tue Oct 16 14:41:09 2018 New Revision: 339378 URL: https://svnweb.freebsd.org/changeset/base/339378 Log: In r338102, the TCP reassembly code was substantially restructured. Prior to this change, the code sometimes used a temporary stack variable to hold details of a TCP segment. r338102 stopped using the variable to hold segments, but did not actually remove the variable. Because the variable is no longer used, we can safely remove it. Approved by: re (gjb) Modified: head/sys/netinet/tcp_reass.c Modified: head/sys/netinet/tcp_reass.c == --- head/sys/netinet/tcp_reass.cTue Oct 16 14:16:39 2018 (r339377) +++ head/sys/netinet/tcp_reass.cTue Oct 16 14:41:09 2018 (r339378) @@ -528,7 +528,6 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq struct tseg_qent *p = NULL; struct tseg_qent *nq = NULL; struct tseg_qent *te = NULL; - struct tseg_qent tqs; struct mbuf *mlast = NULL; struct sockbuf *sb; struct socket *so = tp->t_inpcb->inp_socket; @@ -1053,8 +1052,7 @@ present: KASSERT(tp->t_segqmbuflen >= q->tqe_mbuf_cnt, ("tp:%p seg queue goes negative", tp)); tp->t_segqmbuflen -= q->tqe_mbuf_cnt; - if (q != ) - uma_zfree(tcp_reass_zone, q); + uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; q = nq; } while (q && q->tqe_start == tp->rcv_nxt); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r339375 - head/sys/contrib/ck/src
Author: jtl Date: Tue Oct 16 02:30:13 2018 New Revision: 339375 URL: https://svnweb.freebsd.org/changeset/base/339375 Log: Import CK as of commit 5221ae2f3722a78c7fc41e47069ad94983d3bccb. This fixes two problems, one where epoch calls could occur before all the readers had exited the epoch section, and one where the epoch calls could be unnecessarily delayed. Approved by: re (glebius) Modified: head/sys/contrib/ck/src/ck_epoch.c Directory Properties: head/sys/contrib/ck/ (props changed) Modified: head/sys/contrib/ck/src/ck_epoch.c == --- head/sys/contrib/ck/src/ck_epoch.c Tue Oct 16 00:50:00 2018 (r339374) +++ head/sys/contrib/ck/src/ck_epoch.c Tue Oct 16 02:30:13 2018 (r339375) @@ -127,6 +127,14 @@ */ #define CK_EPOCH_GRACE 3U +/* + * CK_EPOCH_LENGTH must be a power-of-2 (because (CK_EPOCH_LENGTH - 1) is used + * as a mask, and it must be at least 3 (see comments above). + */ +#if (CK_EPOCH_LENGTH < 3 || (CK_EPOCH_LENGTH & (CK_EPOCH_LENGTH - 1)) != 0) +#error "CK_EPOCH_LENGTH must be a power of 2 and >= 3" +#endif + enum { CK_EPOCH_STATE_USED = 0, CK_EPOCH_STATE_FREE = 1 @@ -348,7 +356,7 @@ ck_epoch_scan(struct ck_epoch *global, return NULL; } -static void +static unsigned int ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e, ck_stack_t *deferred) { unsigned int epoch = e & (CK_EPOCH_LENGTH - 1); @@ -366,6 +374,7 @@ ck_epoch_dispatch(struct ck_epoch_record *record, unsi ck_stack_push_spnc(deferred, >stack_entry); else entry->function(entry); + i++; } @@ -381,7 +390,7 @@ ck_epoch_dispatch(struct ck_epoch_record *record, unsi ck_pr_sub_uint(>n_pending, i); } - return; + return i; } /* @@ -560,16 +569,28 @@ ck_epoch_poll_deferred(struct ck_epoch_record *record, unsigned int epoch; struct ck_epoch_record *cr = NULL; struct ck_epoch *global = record->global; + unsigned int n_dispatch; epoch = ck_pr_load_uint(>epoch); /* Serialize epoch snapshots with respect to global epoch. 
*/ ck_pr_fence_memory(); + + /* +* At this point, epoch is the current global epoch value. +* There may or may not be active threads which observed epoch - 1. +* (ck_epoch_scan() will tell us that). However, there should be +* no active threads which observed epoch - 2. +* +* Note that checking epoch - 2 is necessary, as race conditions can +* allow another thread to increment the global epoch before this +* thread runs. +*/ + n_dispatch = ck_epoch_dispatch(record, epoch - 2, deferred); + cr = ck_epoch_scan(global, cr, epoch, ); - if (cr != NULL) { - record->epoch = epoch; - return false; - } + if (cr != NULL) + return (n_dispatch > 0); /* We are at a grace period if all threads are inactive. */ if (active == false) { @@ -580,10 +601,17 @@ ck_epoch_poll_deferred(struct ck_epoch_record *record, return true; } - /* If an active thread exists, rely on epoch observation. */ + /* +* If an active thread exists, rely on epoch observation. +* +* All the active threads entered the epoch section during +* the current epoch. Therefore, we can now run the handlers +* for the immediately preceding epoch and attempt to +* advance the epoch if it hasn't been already. +*/ (void)ck_pr_cas_uint(>epoch, epoch, epoch + 1); - ck_epoch_dispatch(record, epoch + 1, deferred); + ck_epoch_dispatch(record, epoch - 1, deferred); return true; } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r339374 - vendor-sys/ck/20181014
Author: jtl Date: Tue Oct 16 00:50:00 2018 New Revision: 339374 URL: https://svnweb.freebsd.org/changeset/base/339374 Log: Tag CK after import of commit 5221ae2f3722a78c7fc41e47069ad94983d3bccb. Added: vendor-sys/ck/20181014/ - copied from r339373, vendor-sys/ck/dist/ ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r339373 - vendor-sys/ck/dist/src
Author: jtl Date: Tue Oct 16 00:48:18 2018 New Revision: 339373 URL: https://svnweb.freebsd.org/changeset/base/339373 Log: Import CK as of commit 5221ae2f3722a78c7fc41e47069ad94983d3bccb. This fixes two problems, one where epoch calls could occur before all the readers had exited the epoch section, and one where the epoch calls could be unnecessarily delayed. Modified: vendor-sys/ck/dist/src/ck_epoch.c Modified: vendor-sys/ck/dist/src/ck_epoch.c == --- vendor-sys/ck/dist/src/ck_epoch.c Mon Oct 15 21:59:24 2018 (r339372) +++ vendor-sys/ck/dist/src/ck_epoch.c Tue Oct 16 00:48:18 2018 (r339373) @@ -127,6 +127,14 @@ */ #define CK_EPOCH_GRACE 3U +/* + * CK_EPOCH_LENGTH must be a power-of-2 (because (CK_EPOCH_LENGTH - 1) is used + * as a mask, and it must be at least 3 (see comments above). + */ +#if (CK_EPOCH_LENGTH < 3 || (CK_EPOCH_LENGTH & (CK_EPOCH_LENGTH - 1)) != 0) +#error "CK_EPOCH_LENGTH must be a power of 2 and >= 3" +#endif + enum { CK_EPOCH_STATE_USED = 0, CK_EPOCH_STATE_FREE = 1 @@ -348,7 +356,7 @@ ck_epoch_scan(struct ck_epoch *global, return NULL; } -static void +static unsigned int ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e, ck_stack_t *deferred) { unsigned int epoch = e & (CK_EPOCH_LENGTH - 1); @@ -366,6 +374,7 @@ ck_epoch_dispatch(struct ck_epoch_record *record, unsi ck_stack_push_spnc(deferred, >stack_entry); else entry->function(entry); + i++; } @@ -381,7 +390,7 @@ ck_epoch_dispatch(struct ck_epoch_record *record, unsi ck_pr_sub_uint(>n_pending, i); } - return; + return i; } /* @@ -560,16 +569,28 @@ ck_epoch_poll_deferred(struct ck_epoch_record *record, unsigned int epoch; struct ck_epoch_record *cr = NULL; struct ck_epoch *global = record->global; + unsigned int n_dispatch; epoch = ck_pr_load_uint(>epoch); /* Serialize epoch snapshots with respect to global epoch. */ ck_pr_fence_memory(); + + /* +* At this point, epoch is the current global epoch value. +* There may or may not be active threads which observed epoch - 1. 
+* (ck_epoch_scan() will tell us that). However, there should be +* no active threads which observed epoch - 2. +* +* Note that checking epoch - 2 is necessary, as race conditions can +* allow another thread to increment the global epoch before this +* thread runs. +*/ + n_dispatch = ck_epoch_dispatch(record, epoch - 2, deferred); + cr = ck_epoch_scan(global, cr, epoch, ); - if (cr != NULL) { - record->epoch = epoch; - return false; - } + if (cr != NULL) + return (n_dispatch > 0); /* We are at a grace period if all threads are inactive. */ if (active == false) { @@ -580,10 +601,17 @@ ck_epoch_poll_deferred(struct ck_epoch_record *record, return true; } - /* If an active thread exists, rely on epoch observation. */ + /* +* If an active thread exists, rely on epoch observation. +* +* All the active threads entered the epoch section during +* the current epoch. Therefore, we can now run the handlers +* for the immediately preceding epoch and attempt to +* advance the epoch if it hasn't been already. +*/ (void)ck_pr_cas_uint(>epoch, epoch, epoch + 1); - ck_epoch_dispatch(record, epoch + 1, deferred); + ck_epoch_dispatch(record, epoch - 1, deferred); return true; } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r339251 - in head/sys: net netinet netinet6
On Tue, Oct 9, 2018 at 11:02 AM John Baldwin wrote: > Do we have some sort of simple per-thread epoch counter similar to > td->td_locks that we could assert on in userret() and in the ithread loop > when ithreads go to sleep to help catch leaked locks? > Yes: td->td_epochnest. There are already INVARIANTS checks in malloc (for M_WAITOK), userret(), and _sleep(). I think that covers the cases you mentioned. However, I am very much in favor of adding any additional checks which make sense. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r339251 - in head/sys: net netinet netinet6
Author: jtl Date: Tue Oct 9 13:26:06 2018 New Revision: 339251 URL: https://svnweb.freebsd.org/changeset/base/339251 Log: There are three places where we return from a function which entered an epoch section without exiting that epoch section. This is bad for two reasons: the epoch section won't exit, and we will leave the epoch tracker from the stack on the epoch list. Fix the epoch leak by making sure we exit epoch sections before returning. Reviewed by: ae, gallatin, mmacy Approved by: re (gjb, kib) Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D17450 Modified: head/sys/net/if_lagg.c head/sys/netinet/ip_options.c head/sys/netinet6/udp6_usrreq.c Modified: head/sys/net/if_lagg.c == --- head/sys/net/if_lagg.c Tue Oct 9 10:49:19 2018(r339250) +++ head/sys/net/if_lagg.c Tue Oct 9 13:26:06 2018(r339251) @@ -2033,15 +2033,18 @@ lagg_lb_porttable(struct lagg_softc *sc, struct lagg_p { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; - int i = 0; + int i = 0, rv; + rv = 0; bzero(>lb_ports, sizeof(lb->lb_ports)); LAGG_RLOCK(); CK_SLIST_FOREACH(lp_next, >sc_ports, lp_entries) { if (lp_next == lp) continue; - if (i >= LAGG_MAX_PORTS) - return (EINVAL); + if (i >= LAGG_MAX_PORTS) { + rv = EINVAL; + break; + } if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifp->if_xname, i); @@ -2049,7 +2052,7 @@ lagg_lb_porttable(struct lagg_softc *sc, struct lagg_p } LAGG_RUNLOCK(); - return (0); + return (rv); } static int Modified: head/sys/netinet/ip_options.c == --- head/sys/netinet/ip_options.c Tue Oct 9 10:49:19 2018 (r339250) +++ head/sys/netinet/ip_options.c Tue Oct 9 13:26:06 2018 (r339251) @@ -110,16 +110,16 @@ ip_dooptions(struct mbuf *m, int pass) struct nhop4_extended nh_ext; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; - NET_EPOCH_ENTER(); /* Ignore or reject packets with IP options. 
*/ if (V_ip_doopts == 0) return 0; else if (V_ip_doopts == 2) { type = ICMP_UNREACH; code = ICMP_UNREACH_FILTER_PROHIB; - goto bad; + goto bad_unlocked; } + NET_EPOCH_ENTER(); dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); @@ -388,6 +388,7 @@ dropit: return (0); bad: NET_EPOCH_EXIT(); +bad_unlocked: icmp_error(m, type, code, 0, 0); IPSTAT_INC(ips_badoptions); return (1); Modified: head/sys/netinet6/udp6_usrreq.c == --- head/sys/netinet6/udp6_usrreq.c Tue Oct 9 10:49:19 2018 (r339250) +++ head/sys/netinet6/udp6_usrreq.c Tue Oct 9 13:26:06 2018 (r339251) @@ -434,8 +434,8 @@ udp6_input(struct mbuf **mp, int *offp, int proto) INP_RUNLOCK(last); } else INP_RUNLOCK(last); - INP_INFO_RUNLOCK_ET(pcbinfo, et); inp_lost: + INP_INFO_RUNLOCK_ET(pcbinfo, et); return (IPPROTO_DONE); } /* ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r337776 - head/sys/netinet6
On Thu, Aug 30, 2018 at 6:00 PM Kristof Provost wrote: > On 14 Aug 2018, at 19:17, Jonathan T. Looney wrote: > > + uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp; > > I’m pretty sure you didn’t mean for the hashkey to be 1028 bytes long. > Yikes. Yep, that's a bug. I originally spelled this as 9; however, decided to rewrite it to avoid "magic numbers". It made it through all my manual testing; but, that makes sense since all my positive functional tests were on an unloaded machine, so the extra stack garbage would be more likely to be consistent. > I’ve done this, which fixes the problem: > > diff --git a/sys/netinet6/frag6.c b/sys/netinet6/frag6.c > index 0f30801540a..e1f2b3f5842 100644 > --- a/sys/netinet6/frag6.c > +++ b/sys/netinet6/frag6.c > @@ -218,7 +218,9 @@ frag6_input(struct mbuf **mp, int *offp, int proto) > int offset = *offp, nxt, i, next; > int first_frag = 0; > int fragoff, frgpartlen;/* must be larger than u_int16_t */ > - uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp; > + uint32_t hashkey[(sizeof(struct in6_addr) * 2 + sizeof(u_int32_t)) / > + sizeof(uint32_t)]; > + uint32_t hash, *hashkeyp; > struct ifnet *dstifp; > u_int8_t ecn, ecn0; > #ifdef RSS > > That looks fine to me, either with or without the followup suggestion of using sizeof(ip6f->ip6f_ident). Feel free to commit this change (after appropriate re@ approval). Or, let me know if you prefer I do it. Either way, I'll wear the pointy hat (sadly, I suspect it is neither the first nor last I will earn). Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337804 - stable/11/share/man/man4
Author: jtl Date: Tue Aug 14 18:17:05 2018 New Revision: 337804 URL: https://svnweb.freebsd.org/changeset/base/337804 Log: MFC r337788: Update the inet(4) and inet6(4) man pages to reflect the changes made to the reassembly code in r337778, r337780, r337781, r337782, and r337783. Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/share/man/man4/inet.4 stable/11/share/man/man4/inet6.4 Directory Properties: stable/11/ (props changed) Modified: stable/11/share/man/man4/inet.4 == --- stable/11/share/man/man4/inet.4 Tue Aug 14 18:15:10 2018 (r337803) +++ stable/11/share/man/man4/inet.4 Tue Aug 14 18:17:05 2018 (r337804) @@ -28,7 +28,7 @@ .\" From: @(#)inet.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd Feb 4, 2016 +.Dd August 14, 2018 .Dt INET 4 .Os .Sh NAME @@ -229,15 +229,38 @@ At the same time, on high-speed links, it can decrease cycle greatly. Default is 0 (sequential IP IDs). IPv6 flow IDs and fragment IDs are always random. +.It Va ip.maxfrags +Integer: maximum number of fragments the host will accept and simultaneously +hold across all reassembly queues in all VNETs. +If set to 0, reassembly is disabled. +If set to -1, this limit is not applied. +This limit is recalculated when the number of mbuf clusters is changed. +This is a global limit. .It Va ip.maxfragpackets -Integer: maximum number of fragmented packets the host will accept and hold -in the reassembling queue simultaneously. -0 means that the host will not accept any fragmented packets. -\-1 means that the host will accept as many fragmented packets as it receives. +Integer: maximum number of fragmented packets the host will accept and +simultaneously hold in the reassembly queue for a particular VNET. +0 means that the host will not accept any fragmented packets for that VNET. +\-1 means that the host will not apply this limit for that VNET. +This limit is recalculated when the number of mbuf clusters is changed. +This is a per-VNET limit. 
+.It Va ip.maxfragbucketsize +Integer: maximum number of reassembly queues per bucket. +Fragmented packets are hashed to buckets. +Each bucket has a list of reassembly queues. +The system must compare the incoming packets to the existing reassembly queues +in the bucket to find a matching reassembly queue. +To preserve system resources, the system limits the number of reassembly +queues allowed in each bucket. +This limit is recalculated when the number of mbuf clusters is changed or +when the value of +.Va ip.maxfragpackets +changes. +This is a per-VNET limit. .It Va ip.maxfragsperpacket Integer: maximum number of fragments the host will accept and hold -in the reassembling queue for a packet. -0 means that the host will not accept any fragmented packets. +in the reassembly queue for a packet. +0 means that the host will not accept any fragmented packets for the VNET. +This is a per-VNET limit. .El .Sh SEE ALSO .Xr ioctl 2 , Modified: stable/11/share/man/man4/inet6.4 == --- stable/11/share/man/man4/inet6.4Tue Aug 14 18:15:10 2018 (r337803) +++ stable/11/share/man/man4/inet6.4Tue Aug 14 18:17:05 2018 (r337804) @@ -29,7 +29,7 @@ .\" .\" $FreeBSD$ .\" -.Dd September 2, 2009 +.Dd August 14, 2018 .Dt INET6 4 .Os .Sh NAME @@ -219,12 +219,41 @@ packets. This value applies to all the transport protocols on top of .Tn IPv6 . There are APIs to override the value. +.It Dv IPV6CTL_MAXFRAGS +.Pq ip6.maxfrags +Integer: maximum number of fragments the host will accept and simultaneously +hold across all reassembly queues in all VNETs. +If set to 0, fragment reassembly is disabled. +If set to -1, this limit is not applied. +This limit is recalculated when the number of mbuf clusters is changed. +This is a global limit. .It Dv IPV6CTL_MAXFRAGPACKETS .Pq ip6.maxfragpackets -Integer: default maximum number of fragmented packets the node will accept. -0 means that the node will not accept any fragmented packets. 
--1 means that the node will accept as many fragmented packets as it receives. -The flag is provided basically for avoiding possible DoS attacks. +Integer: maximum number of fragmented packets the node will accept and +simultaneously hold in the reassembly queue for a particular VNET. +0 means that the node will not accept any fragmented packets for that VNET. +-1 means that the node will not apply this limit for that VNET. +This limit is recalculated when the number of mbuf clusters is changed. +This is a per-VNET limit. +.It Dv IPV6CTL_MAXFRAGBUCKETSIZE +.Pq ip6.maxfragbucketsize +Integer: maximum number of reassembly queues per bucket. +Fragmented packets are hashed to buckets. +Each bucket has a list of reassembly queues. +The system must compare the incoming packets to the existing reassembly queues
svn commit: r337803 - stable/11/sys/netinet6
Author: jtl Date: Tue Aug 14 18:15:10 2018 New Revision: 337803 URL: https://svnweb.freebsd.org/changeset/base/337803 Log: MFC r337787: Lower the default limits on the IPv6 reassembly queue. Currently, the limits are quite high. On machines with millions of mbuf clusters, the reassembly queue limits can also run into the millions. Lower these values. Also, try to ensure that no bucket will have a reassembly queue larger than approximately 100 items. This limits the cost to find the correct reassembly queue when processing an incoming fragment. Due to the low limits on each bucket's length, increase the size of the hash table from 64 to 1024. Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet6/frag6.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/frag6.c == --- stable/11/sys/netinet6/frag6.c Tue Aug 14 18:13:36 2018 (r337802) +++ stable/11/sys/netinet6/frag6.c Tue Aug 14 18:15:10 2018 (r337803) @@ -69,7 +69,7 @@ __FBSDID("$FreeBSD$"); /* * Reassembly headers are stored in hash buckets. */ -#defineIP6REASS_NHASH_LOG2 6 +#defineIP6REASS_NHASH_LOG2 10 #defineIP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) #defineIP6REASS_HMASK (IP6REASS_NHASH - 1) @@ -105,6 +105,22 @@ static VNET_DEFINE(uint32_t, ip6q_hashseed); static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); /* + * By default, limit the number of IP6 fragments across all reassembly + * queues to 1/32 of the total number of mbuf clusters. + * + * Limit the total number of reassembly queues per VNET to the + * IP6 fragment limit, but ensure the limit will not allow any bucket + * to grow above 100 items. (The bucket limit is + * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct + * multiplier to reach a 100-item limit.) + * The 100-item limit was chosen as brief testing seems to show that + * this produces "reasonable" performance on some subset of systems + * under DoS attack. 
+ */ +#defineIP6_MAXFRAGS(nmbclusters / 32) +#defineIP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50)) + +/* * Initialise reassembly queue and fragment identifier. */ void @@ -121,11 +137,11 @@ frag6_change(void *tag) { VNET_ITERATOR_DECL(vnet_iter); - ip6_maxfrags = nmbclusters / 4; + ip6_maxfrags = IP6_MAXFRAGS; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - V_ip6_maxfragpackets = nmbclusters / 4; + V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; frag6_set_bucketsize(); CURVNET_RESTORE(); } @@ -138,7 +154,7 @@ frag6_init(void) struct ip6q *q6; int i; - V_ip6_maxfragpackets = nmbclusters / 4; + V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; frag6_set_bucketsize(); for (i = 0; i < IP6REASS_NHASH; i++) { q6 = IP6Q_HEAD(i); @@ -151,7 +167,7 @@ frag6_init(void) if (!IS_DEFAULT_VNET(curvnet)) return; - ip6_maxfrags = nmbclusters / 4; + ip6_maxfrags = IP6_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337802 - stable/11/sys/netinet
Author: jtl Date: Tue Aug 14 18:13:36 2018 New Revision: 337802 URL: https://svnweb.freebsd.org/changeset/base/337802 Log: MFC r337786: Lower the default limits on the IPv4 reassembly queue. In particular, try to ensure that no bucket will have a reassembly queue larger than approximately 100 items. This limits the cost to find the correct reassembly queue when processing an incoming fragment. Due to the low limits on each bucket's length, increase the size of the hash table from 64 to 1024. Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet/ip_reass.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet/ip_reass.c == --- stable/11/sys/netinet/ip_reass.cTue Aug 14 18:12:02 2018 (r337801) +++ stable/11/sys/netinet/ip_reass.cTue Aug 14 18:13:36 2018 (r337802) @@ -64,7 +64,7 @@ SYSCTL_DECL(_net_inet_ip); /* * Reassembly headers are stored in hash buckets. */ -#defineIPREASS_NHASH_LOG2 6 +#defineIPREASS_NHASH_LOG2 10 #defineIPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #defineIPREASS_HMASK (IPREASS_NHASH - 1) @@ -116,6 +116,22 @@ ipq_drop(struct ipqbucket *bucket, struct ipq *fp) ipq_free(bucket, fp); } +/* + * By default, limit the number of IP fragments across all reassembly + * queues to 1/32 of the total number of mbuf clusters. + * + * Limit the total number of reassembly queues per VNET to the + * IP fragment limit, but ensure the limit will not allow any bucket + * to grow above 100 items. (The bucket limit is + * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct + * multiplier to reach a 100-item limit.) + * The 100-item limit was chosen as brief testing seems to show that + * this produces "reasonable" performance on some subset of systems + * under DoS attack. 
+ */ +#defineIP_MAXFRAGS (nmbclusters / 32) +#defineIP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, IPREASS_NHASH * 50)) + static int maxfrags; static volatile u_int nfrags; SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, @@ -513,12 +529,12 @@ ipreass_init(void) V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - max = nmbclusters / 32; + max = IP_MAXFRAGPACKETS; max = uma_zone_set_max(V_ipq_zone, max); V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); if (IS_DEFAULT_VNET(curvnet)) { - maxfrags = nmbclusters / 32; + maxfrags = IP_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -622,8 +638,8 @@ ipreass_zone_change(void *tag) VNET_ITERATOR_DECL(vnet_iter); int max; - maxfrags = nmbclusters / 32; - max = nmbclusters / 32; + maxfrags = IP_MAXFRAGS; + max = IP_MAXFRAGPACKETS; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337801 - stable/11/sys/netinet6
Author: jtl Date: Tue Aug 14 18:12:02 2018 New Revision: 337801 URL: https://svnweb.freebsd.org/changeset/base/337801 Log: MFC r337784: Drop 0-byte IPv6 fragments. Currently, we process IPv6 fragments with 0 bytes of payload, add them to the reassembly queue, and do not recognize them as duplicating or overlapping with adjacent 0-byte fragments. An attacker can exploit this to create long fragment queues. There is no legitimate reason for a fragment with no payload. However, because IPv6 packets with an empty payload are acceptable, allow an "atomic" fragment with no payload. Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet6/frag6.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/frag6.c == --- stable/11/sys/netinet6/frag6.c Tue Aug 14 18:11:06 2018 (r337800) +++ stable/11/sys/netinet6/frag6.c Tue Aug 14 18:12:02 2018 (r337801) @@ -269,6 +269,16 @@ frag6_input(struct mbuf **mp, int *offp, int proto) return (ip6f->ip6f_nxt); } + /* Get fragment length and discard 0-byte fragments. */ + frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset; + if (frgpartlen == 0) { + icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, + offsetof(struct ip6_hdr, ip6_plen)); + in6_ifstat_inc(dstifp, ifs6_reass_fail); + IP6STAT_INC(ip6s_fragdropped); + return IPPROTO_DONE; + } + hashkeyp = hashkey; memcpy(hashkeyp, >ip6_src, sizeof(struct in6_addr)); hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); @@ -365,7 +375,6 @@ frag6_input(struct mbuf **mp, int *offp, int proto) * in size. * If it would exceed, discard the fragment and return an ICMP error. */ - frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset; if (q6->ip6q_unfrglen >= 0) { /* The 1st fragment has already arrived. 
*/ if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337799 - stable/11/sys/netinet6
Author: jtl Date: Tue Aug 14 18:10:25 2018 New Revision: 337799 URL: https://svnweb.freebsd.org/changeset/base/337799 Log: MFC r337783: Implement a limit on on the number of IPv6 reassembly queues per bucket. There is a hashing algorithm which should distribute IPv6 reassembly queues across the available buckets in a relatively even way. However, if there is a flaw in the hashing algorithm which allows a large number of IPv6 fragment reassembly queues to end up in a single bucket, a per- bucket limit could help mitigate the performance impact of this flaw. Implement such a limit, with a default of twice the maximum number of reassembly queues divided by the number of buckets. Recalculate the limit any time the maximum number of reassembly queues changes. However, allow the user to override the value using a sysctl (net.inet6.ip6.maxfragbucketsize). Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet6/frag6.c stable/11/sys/netinet6/in6.h stable/11/sys/netinet6/in6_proto.c stable/11/sys/netinet6/ip6_var.h Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/frag6.c == --- stable/11/sys/netinet6/frag6.c Tue Aug 14 18:06:59 2018 (r337798) +++ stable/11/sys/netinet6/frag6.c Tue Aug 14 18:10:25 2018 (r337799) @@ -77,13 +77,14 @@ static void frag6_enq(struct ip6asfrag *, struct ip6as uint32_t bucket __unused); static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused); static void frag6_insque_head(struct ip6q *, struct ip6q *, -uint32_t bucket __unused); -static void frag6_remque(struct ip6q *, uint32_t bucket __unused); +uint32_t bucket); +static void frag6_remque(struct ip6q *, uint32_t bucket); static void frag6_freef(struct ip6q *, uint32_t bucket); struct ip6qbucket { struct ip6q ip6q; struct mtx lock; + int count; }; static VNET_DEFINE(volatile u_int, frag6_nfragpackets); @@ -106,6 +107,15 @@ static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment r /* * Initialise reassembly queue 
and fragment identifier. */ +void +frag6_set_bucketsize() +{ + int i; + + if ((i = V_ip6_maxfragpackets) > 0) + V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1); +} + static void frag6_change(void *tag) { @@ -116,6 +126,7 @@ frag6_change(void *tag) VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_ip6_maxfragpackets = nmbclusters / 4; + frag6_set_bucketsize(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -128,10 +139,12 @@ frag6_init(void) int i; V_ip6_maxfragpackets = nmbclusters / 4; + frag6_set_bucketsize(); for (i = 0; i < IP6REASS_NHASH; i++) { q6 = IP6Q_HEAD(i); q6->ip6q_next = q6->ip6q_prev = q6; mtx_init(_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF); + V_ip6q[i].count = 0; } V_ip6q_hashseed = arc4random(); V_ip6_maxfragsperpacket = 64; @@ -302,7 +315,8 @@ frag6_input(struct mbuf **mp, int *offp, int proto) */ if (V_ip6_maxfragpackets < 0) ; - else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets) + else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize || + V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets) goto dropfrag; atomic_add_int(_frag6_nfragpackets, 1); q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE, @@ -760,7 +774,7 @@ frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unu } static void -frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket __unused) +frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket) { IP6Q_LOCK_ASSERT(bucket); @@ -772,16 +786,18 @@ frag6_insque_head(struct ip6q *new, struct ip6q *old, new->ip6q_next = old->ip6q_next; old->ip6q_next->ip6q_prev= new; old->ip6q_next = new; + V_ip6q[bucket].count++; } static void -frag6_remque(struct ip6q *p6, uint32_t bucket __unused) +frag6_remque(struct ip6q *p6, uint32_t bucket) { IP6Q_LOCK_ASSERT(bucket); p6->ip6q_prev->ip6q_next = p6->ip6q_next; p6->ip6q_next->ip6q_prev = p6->ip6q_prev; + V_ip6q[bucket].count--; } /* @@ -803,29 +819,58 @@ frag6_slowtimo(void) IP6Q_LOCK(i); head = IP6Q_HEAD(i); q6 = head->ip6q_next; - if 
(q6) - while (q6 != head) { - --q6->ip6q_ttl; - q6 =
svn commit: r337798 - stable/11/sys/netinet6
Author: jtl Date: Tue Aug 14 18:06:59 2018 New Revision: 337798 URL: https://svnweb.freebsd.org/changeset/base/337798 Log: MFC r337782: Add a limit of the number of fragments per IPv6 packet. The IPv4 fragment reassembly code supports a limit on the number of fragments per packet. The default limit is currently 17 fragments. Among other things, this limit serves to limit the number of fragments the code must parse when trying to reassembly a packet. Add a limit to the IPv6 reassembly code. By default, limit a packet to 65 fragments (64 on the queue, plus one final fragment to complete the packet). This allows an average fragment size of 1,008 bytes, which should be sufficient to hold a fragment. (Recall that the IPv6 minimum MTU is 1280 bytes. Therefore, this configuration allows a full-size IPv6 packet to be fragmented on a link with the minimum MTU and still carry approximately 272 bytes of headers before the fragmented portion of the packet.) Users can adjust this limit using the net.inet6.ip6.maxfragsperpacket sysctl. Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet6/frag6.c stable/11/sys/netinet6/in6.h stable/11/sys/netinet6/in6_proto.c stable/11/sys/netinet6/ip6_var.h Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/frag6.c == --- stable/11/sys/netinet6/frag6.c Tue Aug 14 17:59:42 2018 (r337797) +++ stable/11/sys/netinet6/frag6.c Tue Aug 14 18:06:59 2018 (r337798) @@ -134,6 +134,7 @@ frag6_init(void) mtx_init(_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF); } V_ip6q_hashseed = arc4random(); + V_ip6_maxfragsperpacket = 64; if (!IS_DEFAULT_VNET(curvnet)) return; @@ -530,6 +531,7 @@ insert: /* * Stick new segment in its place; * check for complete reassembly. +* If not complete, check fragment limit. * Move to front of packet queue, as we are * the most recently active fragmented packet. 
*/ @@ -546,12 +548,20 @@ insert: for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = af6->ip6af_down) { if (af6->ip6af_off != next) { + if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { + IP6STAT_INC(ip6s_fragdropped); + frag6_freef(q6, hash); + } IP6Q_UNLOCK(hash); return IPPROTO_DONE; } next += af6->ip6af_frglen; } if (af6->ip6af_up->ip6af_mff) { + if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { + IP6STAT_INC(ip6s_fragdropped); + frag6_freef(q6, hash); + } IP6Q_UNLOCK(hash); return IPPROTO_DONE; } Modified: stable/11/sys/netinet6/in6.h == --- stable/11/sys/netinet6/in6.hTue Aug 14 17:59:42 2018 (r337797) +++ stable/11/sys/netinet6/in6.hTue Aug 14 18:06:59 2018 (r337798) @@ -637,7 +637,8 @@ struct ip6_mtuinfo { #defineIPV6CTL_INTRQMAXLEN 51 /* max length of IPv6 netisr queue */ #defineIPV6CTL_INTRDQMAXLEN52 /* max length of direct IPv6 netisr * queue */ -#defineIPV6CTL_MAXID 53 +#defineIPV6CTL_MAXFRAGSPERPACKET 53 /* Max fragments per packet */ +#defineIPV6CTL_MAXID 54 #endif /* __BSD_VISIBLE */ /* Modified: stable/11/sys/netinet6/in6_proto.c == --- stable/11/sys/netinet6/in6_proto.c Tue Aug 14 17:59:42 2018 (r337797) +++ stable/11/sys/netinet6/in6_proto.c Tue Aug 14 18:06:59 2018 (r337798) @@ -384,6 +384,7 @@ VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */ int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ +VNET_DEFINE(int, ip6_maxfragsperpacket); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? */ @@ -561,6 +562,9 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, "Maximum allowed number of outstanding IPv6 packet fragments. " "A value of 0 means no fragmented packets will be accepted, while a " "a value of -1 means no limit"); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket, +
svn commit: r337797 - stable/11/sys/netinet6
Author: jtl Date: Tue Aug 14 17:59:42 2018 New Revision: 337797 URL: https://svnweb.freebsd.org/changeset/base/337797 Log: MFC r337781: Make the IPv6 fragment limits be global, rather than per-VNET, limits. The IPv6 reassembly fragment limit is based on the number of mbuf clusters, which are a global resource. However, the limit is currently applied on a per-VNET basis. Given enough VNETs (or given sufficient customization on enough VNETs), it is possible that the sum of all the VNET fragment limits will exceed the number of mbuf clusters available in the system. Given the fact that the fragment limits are intended (at least in part) to regulate access to a global resource, the IPv6 fragment limit should be applied on a global basis. Note that it is still possible to disable fragmentation for a particular VNET by setting the net.inet6.ip6.maxfragpackets sysctl to 0 for that VNET. In addition, it is now possible to disable fragmentation globally by setting the net.inet6.ip6.maxfrags sysctl to 0. 
Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet6/frag6.c stable/11/sys/netinet6/in6_proto.c stable/11/sys/netinet6/ip6_var.h Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/frag6.c == --- stable/11/sys/netinet6/frag6.c Tue Aug 14 17:54:39 2018 (r337796) +++ stable/11/sys/netinet6/frag6.c Tue Aug 14 17:59:42 2018 (r337797) @@ -87,12 +87,11 @@ struct ip6qbucket { }; static VNET_DEFINE(volatile u_int, frag6_nfragpackets); -static VNET_DEFINE(volatile u_int, frag6_nfrags); +volatile u_int frag6_nfrags = 0; static VNET_DEFINE(struct ip6qbucket, ip6q[IP6REASS_NHASH]); static VNET_DEFINE(uint32_t, ip6q_hashseed); #defineV_frag6_nfragpacketsVNET(frag6_nfragpackets) -#defineV_frag6_nfrags VNET(frag6_nfrags) #defineV_ip6q VNET(ip6q) #defineV_ip6q_hashseed VNET(ip6q_hashseed) @@ -110,9 +109,16 @@ static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment r static void frag6_change(void *tag) { + VNET_ITERATOR_DECL(vnet_iter); - V_ip6_maxfragpackets = nmbclusters / 4; - V_ip6_maxfrags = nmbclusters / 4; + ip6_maxfrags = nmbclusters / 4; + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_ip6_maxfragpackets = nmbclusters / 4; + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); } void @@ -122,7 +128,6 @@ frag6_init(void) int i; V_ip6_maxfragpackets = nmbclusters / 4; - V_ip6_maxfrags = nmbclusters / 4; for (i = 0; i < IP6REASS_NHASH; i++) { q6 = IP6Q_HEAD(i); q6->ip6q_next = q6->ip6q_prev = q6; @@ -132,6 +137,7 @@ frag6_init(void) if (!IS_DEFAULT_VNET(curvnet)) return; + ip6_maxfrags = nmbclusters / 4; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -265,9 +271,9 @@ frag6_input(struct mbuf **mp, int *offp, int proto) * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. 
*/ - if (V_ip6_maxfrags < 0) + if (ip6_maxfrags < 0) ; - else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags) + else if (frag6_nfrags >= (u_int)ip6_maxfrags) goto dropfrag; for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next) @@ -528,7 +534,7 @@ insert: * the most recently active fragmented packet. */ frag6_enq(ip6af, af6->ip6af_up, hash); - atomic_add_int(_frag6_nfrags, 1); + atomic_add_int(_nfrags, 1); q6->ip6q_nfrag++; #if 0 /* xxx */ if (q6 != head->ip6q_next) { @@ -592,7 +598,7 @@ insert: if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) { frag6_remque(q6, hash); - atomic_subtract_int(_frag6_nfrags, q6->ip6q_nfrag); + atomic_subtract_int(_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_destroy(q6); #endif @@ -609,7 +615,7 @@ insert: (caddr_t)); frag6_remque(q6, hash); - atomic_subtract_int(_frag6_nfrags, q6->ip6q_nfrag); + atomic_subtract_int(_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_reassemble(q6, m); mac_ip6q_destroy(q6); @@ -705,7 +711,7 @@ frag6_freef(struct ip6q *q6, uint32_t bucket) free(af6, M_FTABLE); } frag6_remque(q6, bucket); - atomic_subtract_int(_frag6_nfrags, q6->ip6q_nfrag); + atomic_subtract_int(_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_destroy(q6); #endif Modified: stable/11/sys/netinet6/in6_proto.c
svn commit: r337796 - stable/11/sys/netinet
Author: jtl Date: Tue Aug 14 17:54:39 2018 New Revision: 337796 URL: https://svnweb.freebsd.org/changeset/base/337796 Log: MFC r337780: Implement a limit on on the number of IPv4 reassembly queues per bucket. There is a hashing algorithm which should distribute IPv4 reassembly queues across the available buckets in a relatively even way. However, if there is a flaw in the hashing algorithm which allows a large number of IPv4 fragment reassembly queues to end up in a single bucket, a per- bucket limit could help mitigate the performance impact of this flaw. Implement such a limit, with a default of twice the maximum number of reassembly queues divided by the number of buckets. Recalculate the limit any time the maximum number of reassembly queues changes. However, allow the user to override the value using a sysctl (net.inet.ip.maxfragbucketsize). Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet/ip_reass.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet/ip_reass.c == --- stable/11/sys/netinet/ip_reass.cTue Aug 14 17:52:06 2018 (r337795) +++ stable/11/sys/netinet/ip_reass.cTue Aug 14 17:54:39 2018 (r337796) @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ SYSCTL_DECL(_net_inet_ip); struct ipqbucket { TAILQ_HEAD(ipqhead, ipq) head; struct mtx lock; + int count; }; static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]); @@ -82,6 +84,9 @@ static VNET_DEFINE(uint32_t, ipq_hashseed); #defineIPQ_UNLOCK(i) mtx_unlock(_ipq[i].lock) #defineIPQ_LOCK_ASSERT(i) mtx_assert(_ipq[i].lock, MA_OWNED) +static VNET_DEFINE(int, ipreass_maxbucketsize); +#defineV_ipreass_maxbucketsize VNET(ipreass_maxbucketsize) + void ipreass_init(void); void ipreass_drain(void); void ipreass_slowtimo(void); @@ -89,25 +94,26 @@ voidipreass_slowtimo(void); void ipreass_destroy(void); #endif static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); 
+static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS); static voidipreass_zone_change(void *); static voidipreass_drain_tomax(void); -static voidipq_free(struct ipqhead *, struct ipq *); +static voidipq_free(struct ipqbucket *, struct ipq *); static struct ipq * ipq_reuse(int); static inline void -ipq_timeout(struct ipqhead *head, struct ipq *fp) +ipq_timeout(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } static inline void -ipq_drop(struct ipqhead *head, struct ipq *fp) +ipq_drop(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } static int maxfrags; @@ -136,6 +142,10 @@ static VNET_DEFINE(int, maxfragsperpacket); SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, _NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); +SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize, +CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, +sysctl_maxfragbucketsize, "I", +"Maximum number of IPv4 fragment reassembly queue entries per bucket"); /* * Take incoming datagram fragment and try to reassemble it into @@ -241,9 +251,12 @@ ip_reass(struct mbuf *m) * If first fragment to arrive, create a reassembly queue. 
*/ if (fp == NULL) { - fp = uma_zalloc(V_ipq_zone, M_NOWAIT); + if (V_ipq[hash].count < V_ipreass_maxbucketsize) + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) fp = ipq_reuse(hash); + if (fp == NULL) + goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); @@ -253,6 +266,7 @@ ip_reass(struct mbuf *m) mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); + V_ipq[hash].count++; fp->ipq_nfrags = 1; atomic_add_int(, 1); fp->ipq_ttl = IPFRAGTTL; @@ -360,7 +374,7 @@ ip_reass(struct mbuf *m) for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) - ipq_drop(head, fp); + ipq_drop(_ipq[hash], fp);
svn commit: r337795 - stable/11/sys/netinet
Author: jtl Date: Tue Aug 14 17:52:06 2018 New Revision: 337795 URL: https://svnweb.freebsd.org/changeset/base/337795 Log: MFC r337778: Add a global limit on the number of IPv4 fragments. The IP reassembly fragment limit is based on the number of mbuf clusters, which are a global resource. However, the limit is currently applied on a per-VNET basis. Given enough VNETs (or given sufficient customization of enough VNETs), it is possible that the sum of all the VNET limits will exceed the number of mbuf clusters available in the system. Given the fact that the fragment limit is intended (at least in part) to regulate access to a global resource, the fragment limit should be applied on a global basis. VNET-specific limits can be adjusted by modifying the net.inet.ip.maxfragpackets and net.inet.ip.maxfragsperpacket sysctls. To disable fragment reassembly globally, set net.inet.ip.maxfrags to 0. To disable fragment reassembly for a particular VNET, set net.inet.ip.maxfragpackets to 0. Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet/ip_reass.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet/ip_reass.c == --- stable/11/sys/netinet/ip_reass.cTue Aug 14 17:51:12 2018 (r337794) +++ stable/11/sys/netinet/ip_reass.cTue Aug 14 17:52:06 2018 (r337795) @@ -110,6 +110,15 @@ ipq_drop(struct ipqhead *head, struct ipq *fp) ipq_free(head, fp); } +static int maxfrags; +static volatile u_int nfrags; +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, +, 0, +"Maximum number of IPv4 fragments allowed across all reassembly queues"); +SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD, +__DEVOLATILE(u_int *, ), 0, +"Current number of IPv4 fragments across all reassembly queues"); + static VNET_DEFINE(uma_zone_t, ipq_zone); #defineV_ipq_zone VNET(ipq_zone) SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET | @@ -146,7 +155,7 @@ ip_reass(struct mbuf *m) struct mbuf *p, *q, *nq, *t; 
struct ipq *fp; struct ipqhead *head; - int i, hlen, next; + int i, hlen, next, tmpmax; u_int8_t ecn, ecn0; uint32_t hash, hashkey[3]; #ifdef RSS @@ -156,8 +165,12 @@ ip_reass(struct mbuf *m) /* * If no reassembling or maxfragsperpacket are 0, * never accept fragments. +* Also, drop packet if it would exceed the maximum +* number of fragments. */ - if (V_noreass == 1 || V_maxfragsperpacket == 0) { + tmpmax = maxfrags; + if (V_noreass == 1 || V_maxfragsperpacket == 0 || + (tmpmax >= 0 && nfrags >= (u_int)tmpmax)) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); @@ -241,6 +254,7 @@ ip_reass(struct mbuf *m) #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); fp->ipq_nfrags = 1; + atomic_add_int(, 1); fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; @@ -251,6 +265,7 @@ ip_reass(struct mbuf *m) goto done; } else { fp->ipq_nfrags++; + atomic_add_int(, 1); #ifdef MAC mac_ipq_update(m, fp); #endif @@ -327,6 +342,7 @@ ip_reass(struct mbuf *m) m->m_nextpkt = nq; IPSTAT_INC(ips_fragdropped); fp->ipq_nfrags--; + atomic_subtract_int(, 1); m_freem(q); } @@ -392,6 +408,7 @@ ip_reass(struct mbuf *m) while (m->m_pkthdr.csum_data & 0x) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0x) + (m->m_pkthdr.csum_data >> 16); + atomic_subtract_int(, fp->ipq_nfrags); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); @@ -451,8 +468,10 @@ ip_reass(struct mbuf *m) dropfrag: IPSTAT_INC(ips_fragdropped); - if (fp != NULL) + if (fp != NULL) { fp->ipq_nfrags--; + atomic_subtract_int(, 1); + } m_freem(m); done: IPQ_UNLOCK(hash); @@ -479,9 +498,11 @@ ipreass_init(void) NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_ipq_zone, nmbclusters / 32); - if (IS_DEFAULT_VNET(curvnet)) + if (IS_DEFAULT_VNET(curvnet)) { + maxfrags = nmbclusters / 32; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); + } } /* @@ -564,9 +585,19 @@ ipreass_drain_tomax(void) static void ipreass_zone_change(void *tag) { + 
VNET_ITERATOR_DECL(vnet_iter); + int max; -
svn commit: r337790 - stable/11/sys/netinet6
Author: jtl Date: Tue Aug 14 17:46:54 2018 New Revision: 337790 URL: https://svnweb.freebsd.org/changeset/base/337790 Log: MFC r337776: Improve IPv6 reassembly performance by hashing fragments into buckets. Currently, all IPv6 fragment reassembly queues are kept in a flat linked list. This has a number of implications. Two significant implications are: all reassembly operations share a common lock, and it is possible for the linked list to grow quite large. Improve IPv6 reassembly performance by hashing fragments into buckets, each of which has its own lock. Calculate the hash key using a Jenkins hash with a random seed. Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet6/frag6.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/frag6.c == --- stable/11/sys/netinet6/frag6.c Tue Aug 14 17:43:11 2018 (r337789) +++ stable/11/sys/netinet6/frag6.c Tue Aug 14 17:46:54 2018 (r337790) @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -47,6 +48,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #include #include @@ -63,29 +66,41 @@ __FBSDID("$FreeBSD$"); #include -static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *); -static void frag6_deq(struct ip6asfrag *); -static void frag6_insque(struct ip6q *, struct ip6q *); -static void frag6_remque(struct ip6q *); -static void frag6_freef(struct ip6q *); - -static struct mtx ip6qlock; /* - * These fields all protected by ip6qlock. + * Reassembly headers are stored in hash buckets. 
*/ -static VNET_DEFINE(u_int, frag6_nfragpackets); -static VNET_DEFINE(u_int, frag6_nfrags); -static VNET_DEFINE(struct ip6q, ip6q); /* ip6 reassemble queue */ +#defineIP6REASS_NHASH_LOG2 6 +#defineIP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) +#defineIP6REASS_HMASK (IP6REASS_NHASH - 1) +static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *, +uint32_t bucket __unused); +static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused); +static void frag6_insque_head(struct ip6q *, struct ip6q *, +uint32_t bucket __unused); +static void frag6_remque(struct ip6q *, uint32_t bucket __unused); +static void frag6_freef(struct ip6q *, uint32_t bucket); + +struct ip6qbucket { + struct ip6q ip6q; + struct mtx lock; +}; + +static VNET_DEFINE(volatile u_int, frag6_nfragpackets); +static VNET_DEFINE(volatile u_int, frag6_nfrags); +static VNET_DEFINE(struct ip6qbucket, ip6q[IP6REASS_NHASH]); +static VNET_DEFINE(uint32_t, ip6q_hashseed); + #defineV_frag6_nfragpacketsVNET(frag6_nfragpackets) #defineV_frag6_nfrags VNET(frag6_nfrags) #defineV_ip6q VNET(ip6q) +#defineV_ip6q_hashseed VNET(ip6q_hashseed) -#defineIP6Q_LOCK_INIT()mtx_init(, "ip6qlock", NULL, MTX_DEF); -#defineIP6Q_LOCK() mtx_lock() -#defineIP6Q_TRYLOCK() mtx_trylock() -#defineIP6Q_LOCK_ASSERT() mtx_assert(, MA_OWNED) -#defineIP6Q_UNLOCK() mtx_unlock() +#defineIP6Q_LOCK(i)mtx_lock(_ip6q[(i)].lock) +#defineIP6Q_TRYLOCK(i) mtx_trylock(_ip6q[(i)].lock) +#defineIP6Q_LOCK_ASSERT(i) mtx_assert(_ip6q[(i)].lock, MA_OWNED) +#defineIP6Q_UNLOCK(i) mtx_unlock(_ip6q[(i)].lock) +#defineIP6Q_HEAD(i)(_ip6q[(i)].ip6q) static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); @@ -103,18 +118,22 @@ frag6_change(void *tag) void frag6_init(void) { + struct ip6q *q6; + int i; V_ip6_maxfragpackets = nmbclusters / 4; V_ip6_maxfrags = nmbclusters / 4; - V_ip6q.ip6q_next = V_ip6q.ip6q_prev = _ip6q; - + for (i = 0; i < IP6REASS_NHASH; i++) { + q6 = IP6Q_HEAD(i); + q6->ip6q_next = q6->ip6q_prev = q6; + mtx_init(_ip6q[i].lock, 
"ip6qlock", NULL, MTX_DEF); + } + V_ip6q_hashseed = arc4random(); if (!IS_DEFAULT_VNET(curvnet)) return; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); - - IP6Q_LOCK_INIT(); } /* @@ -155,12 +174,13 @@ frag6_input(struct mbuf **mp, int *offp, int proto) struct mbuf *m = *mp, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; - struct ip6q *q6; + struct ip6q *head, *q6; struct ip6asfrag *af6, *ip6af, *af6dwn; struct in6_ifaddr *ia; int offset = *offp, nxt, i, next; int first_frag = 0; int fragoff, frgpartlen;/* must be larger than u_int16_t */ + uint32_t
svn commit: r337789 - stable/11/sys/netinet
Author: jtl Date: Tue Aug 14 17:43:11 2018 New Revision: 337789 URL: https://svnweb.freebsd.org/changeset/base/337789 Log: MFC r337775: Improve hashing of IPv4 fragments. Currently, IPv4 fragments are hashed into buckets based on a 32-bit key which is calculated by (src_ip ^ ip_id) and combined with a random seed. However, because an attacker can control the values of src_ip and ip_id, it is possible to construct an attack which causes very deep chains to form in a given bucket. To ensure more uniform distribution (and lower predictability for an attacker), calculate the hash based on a key which includes all the fields we use to identify a reassembly queue (dst_ip, src_ip, ip_id, and the ip protocol) as well as a random seed. Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet/ip_reass.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet/ip_reass.c == --- stable/11/sys/netinet/ip_reass.cTue Aug 14 17:36:21 2018 (r337788) +++ stable/11/sys/netinet/ip_reass.cTue Aug 14 17:43:11 2018 (r337789) @@ -148,7 +148,7 @@ ip_reass(struct mbuf *m) struct ipqhead *head; int i, hlen, next; u_int8_t ecn, ecn0; - uint32_t hash; + uint32_t hash, hashkey[3]; #ifdef RSS uint32_t rss_hash, rss_type; #endif @@ -202,8 +202,12 @@ ip_reass(struct mbuf *m) m->m_data += hlen; m->m_len -= hlen; - hash = ip->ip_src.s_addr ^ ip->ip_id; - hash = jenkins_hash32(, 1, V_ipq_hashseed) & IPREASS_HMASK; + hashkey[0] = ip->ip_src.s_addr; + hashkey[1] = ip->ip_dst.s_addr; + hashkey[2] = (uint32_t)ip->ip_p << 16; + hashkey[2] += ip->ip_id; + hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed); + hash &= IPREASS_HMASK; head = _ipq[hash].head; IPQ_LOCK(hash); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337788 - head/share/man/man4
Author: jtl Date: Tue Aug 14 17:36:21 2018 New Revision: 337788 URL: https://svnweb.freebsd.org/changeset/base/337788 Log: Update the inet(4) and inet6(4) man pages to reflect the changes made to the reassembly code in r337778, r337780, r337781, r337782, and r337783. Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/share/man/man4/inet.4 head/share/man/man4/inet6.4 Modified: head/share/man/man4/inet.4 == --- head/share/man/man4/inet.4 Tue Aug 14 17:32:07 2018(r337787) +++ head/share/man/man4/inet.4 Tue Aug 14 17:36:21 2018(r337788) @@ -28,7 +28,7 @@ .\" From: @(#)inet.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd Feb 4, 2016 +.Dd August 14, 2018 .Dt INET 4 .Os .Sh NAME @@ -229,15 +229,38 @@ At the same time, on high-speed links, it can decrease cycle greatly. Default is 0 (sequential IP IDs). IPv6 flow IDs and fragment IDs are always random. +.It Va ip.maxfrags +Integer: maximum number of fragments the host will accept and simultaneously +hold across all reassembly queues in all VNETs. +If set to 0, reassembly is disabled. +If set to -1, this limit is not applied. +This limit is recalculated when the number of mbuf clusters is changed. +This is a global limit. .It Va ip.maxfragpackets -Integer: maximum number of fragmented packets the host will accept and hold -in the reassembling queue simultaneously. -0 means that the host will not accept any fragmented packets. -\-1 means that the host will accept as many fragmented packets as it receives. +Integer: maximum number of fragmented packets the host will accept and +simultaneously hold in the reassembly queue for a particular VNET. +0 means that the host will not accept any fragmented packets for that VNET. +\-1 means that the host will not apply this limit for that VNET. +This limit is recalculated when the number of mbuf clusters is changed. +This is a per-VNET limit. +.It Va ip.maxfragbucketsize +Integer: maximum number of reassembly queues per bucket. +Fragmented packets are hashed to buckets. 
+Each bucket has a list of reassembly queues. +The system must compare the incoming packets to the existing reassembly queues +in the bucket to find a matching reassembly queue. +To preserve system resources, the system limits the number of reassembly +queues allowed in each bucket. +This limit is recalculated when the number of mbuf clusters is changed or +when the value of +.Va ip.maxfragpackets +changes. +This is a per-VNET limit. .It Va ip.maxfragsperpacket Integer: maximum number of fragments the host will accept and hold -in the reassembling queue for a packet. -0 means that the host will not accept any fragmented packets. +in the reassembly queue for a packet. +0 means that the host will not accept any fragmented packets for the VNET. +This is a per-VNET limit. .El .Sh SEE ALSO .Xr ioctl 2 , Modified: head/share/man/man4/inet6.4 == --- head/share/man/man4/inet6.4 Tue Aug 14 17:32:07 2018(r337787) +++ head/share/man/man4/inet6.4 Tue Aug 14 17:36:21 2018(r337788) @@ -29,7 +29,7 @@ .\" .\" $FreeBSD$ .\" -.Dd September 2, 2009 +.Dd August 14, 2018 .Dt INET6 4 .Os .Sh NAME @@ -219,12 +219,41 @@ packets. This value applies to all the transport protocols on top of .Tn IPv6 . There are APIs to override the value. +.It Dv IPV6CTL_MAXFRAGS +.Pq ip6.maxfrags +Integer: maximum number of fragments the host will accept and simultaneously +hold across all reassembly queues in all VNETs. +If set to 0, fragment reassembly is disabled. +If set to -1, this limit is not applied. +This limit is recalculated when the number of mbuf clusters is changed. +This is a global limit. .It Dv IPV6CTL_MAXFRAGPACKETS .Pq ip6.maxfragpackets -Integer: default maximum number of fragmented packets the node will accept. -0 means that the node will not accept any fragmented packets. --1 means that the node will accept as many fragmented packets as it receives. -The flag is provided basically for avoiding possible DoS attacks. 
+Integer: maximum number of fragmented packets the node will accept and +simultaneously hold in the reassembly queue for a particular VNET. +0 means that the node will not accept any fragmented packets for that VNET. +-1 means that the node will not apply this limit for that VNET. +This limit is recalculated when the number of mbuf clusters is changed. +This is a per-VNET limit. +.It Dv IPV6CTL_MAXFRAGBUCKETSIZE +.Pq ip6.maxfragbucketsize +Integer: maximum number of reassembly queues per bucket. +Fragmented packets are hashed to buckets. +Each bucket has a list of reassembly queues. +The system must compare the incoming packets to the existing reassembly queues +in the bucket to find a matching reassembly queue. +To preserve system resources, the system limits the number of reassembly +queues allowed in each
svn commit: r337787 - head/sys/netinet6
Author: jtl Date: Tue Aug 14 17:32:07 2018 New Revision: 337787 URL: https://svnweb.freebsd.org/changeset/base/337787 Log: Lower the default limits on the IPv6 reassembly queue. Currently, the limits are quite high. On machines with millions of mbuf clusters, the reassembly queue limits can also run into the millions. Lower these values. Also, try to ensure that no bucket will have a reassembly queue larger than approximately 100 items. This limits the cost to find the correct reassembly queue when processing an incoming fragment. Due to the low limits on each bucket's length, increase the size of the hash table from 64 to 1024. Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet6/frag6.c Modified: head/sys/netinet6/frag6.c == --- head/sys/netinet6/frag6.c Tue Aug 14 17:30:46 2018(r337786) +++ head/sys/netinet6/frag6.c Tue Aug 14 17:32:07 2018(r337787) @@ -71,7 +71,7 @@ __FBSDID("$FreeBSD$"); /* * Reassembly headers are stored in hash buckets. */ -#defineIP6REASS_NHASH_LOG2 6 +#defineIP6REASS_NHASH_LOG2 10 #defineIP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) #defineIP6REASS_HMASK (IP6REASS_NHASH - 1) @@ -107,6 +107,22 @@ VNET_DEFINE_STATIC(uint32_t, ip6q_hashseed); static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); /* + * By default, limit the number of IP6 fragments across all reassembly + * queues to 1/32 of the total number of mbuf clusters. + * + * Limit the total number of reassembly queues per VNET to the + * IP6 fragment limit, but ensure the limit will not allow any bucket + * to grow above 100 items. (The bucket limit is + * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct + * multiplier to reach a 100-item limit.) + * The 100-item limit was chosen as brief testing seems to show that + * this produces "reasonable" performance on some subset of systems + * under DoS attack. 
+ */ +#defineIP6_MAXFRAGS(nmbclusters / 32) +#defineIP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50)) + +/* * Initialise reassembly queue and fragment identifier. */ void @@ -123,11 +139,11 @@ frag6_change(void *tag) { VNET_ITERATOR_DECL(vnet_iter); - ip6_maxfrags = nmbclusters / 4; + ip6_maxfrags = IP6_MAXFRAGS; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - V_ip6_maxfragpackets = nmbclusters / 4; + V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; frag6_set_bucketsize(); CURVNET_RESTORE(); } @@ -140,7 +156,7 @@ frag6_init(void) struct ip6q *q6; int i; - V_ip6_maxfragpackets = nmbclusters / 4; + V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; frag6_set_bucketsize(); for (i = 0; i < IP6REASS_NHASH; i++) { q6 = IP6Q_HEAD(i); @@ -153,7 +169,7 @@ frag6_init(void) if (!IS_DEFAULT_VNET(curvnet)) return; - ip6_maxfrags = nmbclusters / 4; + ip6_maxfrags = IP6_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337786 - head/sys/netinet
Author: jtl Date: Tue Aug 14 17:30:46 2018 New Revision: 337786 URL: https://svnweb.freebsd.org/changeset/base/337786 Log: Lower the default limits on the IPv4 reassembly queue. In particular, try to ensure that no bucket will have a reassembly queue larger than approximately 100 items. This limits the cost to find the correct reassembly queue when processing an incoming fragment. Due to the low limits on each bucket's length, increase the size of the hash table from 64 to 1024. Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet/ip_reass.c Modified: head/sys/netinet/ip_reass.c == --- head/sys/netinet/ip_reass.c Tue Aug 14 17:29:41 2018(r337785) +++ head/sys/netinet/ip_reass.c Tue Aug 14 17:30:46 2018(r337786) @@ -64,7 +64,7 @@ SYSCTL_DECL(_net_inet_ip); /* * Reassembly headers are stored in hash buckets. */ -#defineIPREASS_NHASH_LOG2 6 +#defineIPREASS_NHASH_LOG2 10 #defineIPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #defineIPREASS_HMASK (IPREASS_NHASH - 1) @@ -116,6 +116,22 @@ ipq_drop(struct ipqbucket *bucket, struct ipq *fp) ipq_free(bucket, fp); } +/* + * By default, limit the number of IP fragments across all reassembly + * queues to 1/32 of the total number of mbuf clusters. + * + * Limit the total number of reassembly queues per VNET to the + * IP fragment limit, but ensure the limit will not allow any bucket + * to grow above 100 items. (The bucket limit is + * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct + * multiplier to reach a 100-item limit.) + * The 100-item limit was chosen as brief testing seems to show that + * this produces "reasonable" performance on some subset of systems + * under DoS attack. 
+ */ +#defineIP_MAXFRAGS (nmbclusters / 32) +#defineIP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, IPREASS_NHASH * 50)) + static int maxfrags; static volatile u_int nfrags; SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, @@ -513,12 +529,12 @@ ipreass_init(void) V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - max = nmbclusters / 32; + max = IP_MAXFRAGPACKETS; max = uma_zone_set_max(V_ipq_zone, max); V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); if (IS_DEFAULT_VNET(curvnet)) { - maxfrags = nmbclusters / 32; + maxfrags = IP_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -622,8 +638,8 @@ ipreass_zone_change(void *tag) VNET_ITERATOR_DECL(vnet_iter); int max; - maxfrags = nmbclusters / 32; - max = nmbclusters / 32; + maxfrags = IP_MAXFRAGS; + max = IP_MAXFRAGPACKETS; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337784 - head/sys/netinet6
Author: jtl Date: Tue Aug 14 17:29:22 2018 New Revision: 337784 URL: https://svnweb.freebsd.org/changeset/base/337784 Log: Drop 0-byte IPv6 fragments. Currently, we process IPv6 fragments with 0 bytes of payload, add them to the reassembly queue, and do not recognize them as duplicating or overlapping with adjacent 0-byte fragments. An attacker can exploit this to create long fragment queues. There is no legitimate reason for a fragment with no payload. However, because IPv6 packets with an empty payload are acceptable, allow an "atomic" fragment with no payload. Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet6/frag6.c Modified: head/sys/netinet6/frag6.c == --- head/sys/netinet6/frag6.c Tue Aug 14 17:27:41 2018(r337783) +++ head/sys/netinet6/frag6.c Tue Aug 14 17:29:22 2018(r337784) @@ -271,6 +271,16 @@ frag6_input(struct mbuf **mp, int *offp, int proto) return (ip6f->ip6f_nxt); } + /* Get fragment length and discard 0-byte fragments. */ + frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset; + if (frgpartlen == 0) { + icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, + offsetof(struct ip6_hdr, ip6_plen)); + in6_ifstat_inc(dstifp, ifs6_reass_fail); + IP6STAT_INC(ip6s_fragdropped); + return IPPROTO_DONE; + } + hashkeyp = hashkey; memcpy(hashkeyp, >ip6_src, sizeof(struct in6_addr)); hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); @@ -368,7 +378,6 @@ frag6_input(struct mbuf **mp, int *offp, int proto) * in size. * If it would exceed, discard the fragment and return an ICMP error. */ - frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset; if (q6->ip6q_unfrglen >= 0) { /* The 1st fragment has already arrived. */ if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337783 - head/sys/netinet6
Author: jtl Date: Tue Aug 14 17:27:41 2018 New Revision: 337783 URL: https://svnweb.freebsd.org/changeset/base/337783 Log: Implement a limit on the number of IPv6 reassembly queues per bucket. There is a hashing algorithm which should distribute IPv6 reassembly queues across the available buckets in a relatively even way. However, if there is a flaw in the hashing algorithm which allows a large number of IPv6 fragment reassembly queues to end up in a single bucket, a per- bucket limit could help mitigate the performance impact of this flaw. Implement such a limit, with a default of twice the maximum number of reassembly queues divided by the number of buckets. Recalculate the limit any time the maximum number of reassembly queues changes. However, allow the user to override the value using a sysctl (net.inet6.ip6.maxfragbucketsize). Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet6/frag6.c head/sys/netinet6/in6.h head/sys/netinet6/in6_proto.c head/sys/netinet6/ip6_var.h Modified: head/sys/netinet6/frag6.c == --- head/sys/netinet6/frag6.c Tue Aug 14 17:26:07 2018(r337782) +++ head/sys/netinet6/frag6.c Tue Aug 14 17:27:41 2018(r337783) @@ -79,13 +79,14 @@ static void frag6_enq(struct ip6asfrag *, struct ip6as uint32_t bucket __unused); static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused); static void frag6_insque_head(struct ip6q *, struct ip6q *, -uint32_t bucket __unused); -static void frag6_remque(struct ip6q *, uint32_t bucket __unused); +uint32_t bucket); +static void frag6_remque(struct ip6q *, uint32_t bucket); static void frag6_freef(struct ip6q *, uint32_t bucket); struct ip6qbucket { struct ip6q ip6q; struct mtx lock; + int count; }; VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); @@ -108,6 +109,15 @@ static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment r /* * Initialise reassembly queue and fragment identifier. 
*/ +void +frag6_set_bucketsize() +{ + int i; + + if ((i = V_ip6_maxfragpackets) > 0) + V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1); +} + static void frag6_change(void *tag) { @@ -118,6 +128,7 @@ frag6_change(void *tag) VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_ip6_maxfragpackets = nmbclusters / 4; + frag6_set_bucketsize(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -130,10 +141,12 @@ frag6_init(void) int i; V_ip6_maxfragpackets = nmbclusters / 4; + frag6_set_bucketsize(); for (i = 0; i < IP6REASS_NHASH; i++) { q6 = IP6Q_HEAD(i); q6->ip6q_next = q6->ip6q_prev = q6; mtx_init(_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF); + V_ip6q[i].count = 0; } V_ip6q_hashseed = arc4random(); V_ip6_maxfragsperpacket = 64; @@ -304,7 +317,8 @@ frag6_input(struct mbuf **mp, int *offp, int proto) */ if (V_ip6_maxfragpackets < 0) ; - else if (atomic_load_int(_frag6_nfragpackets) >= + else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize || + atomic_load_int(_frag6_nfragpackets) >= (u_int)V_ip6_maxfragpackets) goto dropfrag; atomic_add_int(_frag6_nfragpackets, 1); @@ -763,7 +777,7 @@ frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unu } static void -frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket __unused) +frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket) { IP6Q_LOCK_ASSERT(bucket); @@ -775,16 +789,18 @@ frag6_insque_head(struct ip6q *new, struct ip6q *old, new->ip6q_next = old->ip6q_next; old->ip6q_next->ip6q_prev= new; old->ip6q_next = new; + V_ip6q[bucket].count++; } static void -frag6_remque(struct ip6q *p6, uint32_t bucket __unused) +frag6_remque(struct ip6q *p6, uint32_t bucket) { IP6Q_LOCK_ASSERT(bucket); p6->ip6q_prev->ip6q_next = p6->ip6q_next; p6->ip6q_next->ip6q_prev = p6->ip6q_prev; + V_ip6q[bucket].count--; } /* @@ -806,29 +822,59 @@ frag6_slowtimo(void) IP6Q_LOCK(i); head = IP6Q_HEAD(i); q6 = head->ip6q_next; - if (q6) - while (q6 != head) { - --q6->ip6q_ttl; - q6 = q6->ip6q_next; - if 
(q6->ip6q_prev->ip6q_ttl == 0) { - IP6STAT_INC(ip6s_fragtimeout); -
svn commit: r337782 - head/sys/netinet6
Author: jtl Date: Tue Aug 14 17:26:07 2018 New Revision: 337782 URL: https://svnweb.freebsd.org/changeset/base/337782 Log: Add a limit of the number of fragments per IPv6 packet. The IPv4 fragment reassembly code supports a limit on the number of fragments per packet. The default limit is currently 17 fragments. Among other things, this limit serves to limit the number of fragments the code must parse when trying to reassembly a packet. Add a limit to the IPv6 reassembly code. By default, limit a packet to 65 fragments (64 on the queue, plus one final fragment to complete the packet). This allows an average fragment size of 1,008 bytes, which should be sufficient to hold a fragment. (Recall that the IPv6 minimum MTU is 1280 bytes. Therefore, this configuration allows a full-size IPv6 packet to be fragmented on a link with the minimum MTU and still carry approximately 272 bytes of headers before the fragmented portion of the packet.) Users can adjust this limit using the net.inet6.ip6.maxfragsperpacket sysctl. Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet6/frag6.c head/sys/netinet6/in6.h head/sys/netinet6/in6_proto.c head/sys/netinet6/ip6_var.h Modified: head/sys/netinet6/frag6.c == --- head/sys/netinet6/frag6.c Tue Aug 14 17:24:26 2018(r337781) +++ head/sys/netinet6/frag6.c Tue Aug 14 17:26:07 2018(r337782) @@ -136,6 +136,7 @@ frag6_init(void) mtx_init(_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF); } V_ip6q_hashseed = arc4random(); + V_ip6_maxfragsperpacket = 64; if (!IS_DEFAULT_VNET(curvnet)) return; @@ -533,6 +534,7 @@ insert: /* * Stick new segment in its place; * check for complete reassembly. +* If not complete, check fragment limit. * Move to front of packet queue, as we are * the most recently active fragmented packet. 
*/ @@ -549,12 +551,20 @@ insert: for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = af6->ip6af_down) { if (af6->ip6af_off != next) { + if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { + IP6STAT_INC(ip6s_fragdropped); + frag6_freef(q6, hash); + } IP6Q_UNLOCK(hash); return IPPROTO_DONE; } next += af6->ip6af_frglen; } if (af6->ip6af_up->ip6af_mff) { + if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { + IP6STAT_INC(ip6s_fragdropped); + frag6_freef(q6, hash); + } IP6Q_UNLOCK(hash); return IPPROTO_DONE; } Modified: head/sys/netinet6/in6.h == --- head/sys/netinet6/in6.h Tue Aug 14 17:24:26 2018(r337781) +++ head/sys/netinet6/in6.h Tue Aug 14 17:26:07 2018(r337782) @@ -642,7 +642,8 @@ struct ip6_mtuinfo { #defineIPV6CTL_INTRQMAXLEN 51 /* max length of IPv6 netisr queue */ #defineIPV6CTL_INTRDQMAXLEN52 /* max length of direct IPv6 netisr * queue */ -#defineIPV6CTL_MAXID 53 +#defineIPV6CTL_MAXFRAGSPERPACKET 53 /* Max fragments per packet */ +#defineIPV6CTL_MAXID 54 #endif /* __BSD_VISIBLE */ /* Modified: head/sys/netinet6/in6_proto.c == --- head/sys/netinet6/in6_proto.c Tue Aug 14 17:24:26 2018 (r337781) +++ head/sys/netinet6/in6_proto.c Tue Aug 14 17:26:07 2018 (r337782) @@ -386,6 +386,7 @@ VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */ int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ +VNET_DEFINE(int, ip6_maxfragsperpacket); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? */ @@ -563,6 +564,9 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, "Maximum allowed number of outstanding IPv6 packet fragments. 
" "A value of 0 means no fragmented packets will be accepted, while a " "a value of -1 means no limit"); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket, + CTLFLAG_VNET | CTLFLAG_RW, _NAME(ip6_maxfragsperpacket), 0, + "Maximum allowed number of fragments per packet"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU,
svn commit: r337781 - head/sys/netinet6
Author: jtl Date: Tue Aug 14 17:24:26 2018 New Revision: 337781 URL: https://svnweb.freebsd.org/changeset/base/337781 Log: Make the IPv6 fragment limits be global, rather than per-VNET, limits. The IPv6 reassembly fragment limit is based on the number of mbuf clusters, which are a global resource. However, the limit is currently applied on a per-VNET basis. Given enough VNETs (or given sufficient customization on enough VNETs), it is possible that the sum of all the VNET fragment limits will exceed the number of mbuf clusters available in the system. Given the fact that the fragment limits are intended (at least in part) to regulate access to a global resource, the IPv6 fragment limit should be applied on a global basis. Note that it is still possible to disable fragmentation for a particular VNET by setting the net.inet6.ip6.maxfragpackets sysctl to 0 for that VNET. In addition, it is now possible to disable fragmentation globally by setting the net.inet6.ip6.maxfrags sysctl to 0. Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet6/frag6.c head/sys/netinet6/in6_proto.c head/sys/netinet6/ip6_var.h Modified: head/sys/netinet6/frag6.c == --- head/sys/netinet6/frag6.c Tue Aug 14 17:23:05 2018(r337780) +++ head/sys/netinet6/frag6.c Tue Aug 14 17:24:26 2018(r337781) @@ -89,12 +89,11 @@ struct ip6qbucket { }; VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); -VNET_DEFINE_STATIC(volatile u_int, frag6_nfrags); +volatile u_int frag6_nfrags = 0; VNET_DEFINE_STATIC(struct ip6qbucket, ip6q[IP6REASS_NHASH]); VNET_DEFINE_STATIC(uint32_t, ip6q_hashseed); #defineV_frag6_nfragpacketsVNET(frag6_nfragpackets) -#defineV_frag6_nfrags VNET(frag6_nfrags) #defineV_ip6q VNET(ip6q) #defineV_ip6q_hashseed VNET(ip6q_hashseed) @@ -112,9 +111,16 @@ static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment r static void frag6_change(void *tag) { + VNET_ITERATOR_DECL(vnet_iter); - V_ip6_maxfragpackets = nmbclusters / 4; - V_ip6_maxfrags = 
nmbclusters / 4; + ip6_maxfrags = nmbclusters / 4; + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_ip6_maxfragpackets = nmbclusters / 4; + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); } void @@ -124,7 +130,6 @@ frag6_init(void) int i; V_ip6_maxfragpackets = nmbclusters / 4; - V_ip6_maxfrags = nmbclusters / 4; for (i = 0; i < IP6REASS_NHASH; i++) { q6 = IP6Q_HEAD(i); q6->ip6q_next = q6->ip6q_prev = q6; @@ -134,6 +139,7 @@ frag6_init(void) if (!IS_DEFAULT_VNET(curvnet)) return; + ip6_maxfrags = nmbclusters / 4; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -267,9 +273,9 @@ frag6_input(struct mbuf **mp, int *offp, int proto) * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. */ - if (V_ip6_maxfrags < 0) + if (ip6_maxfrags < 0) ; - else if (atomic_load_int(_frag6_nfrags) >= (u_int)V_ip6_maxfrags) + else if (atomic_load_int(_nfrags) >= (u_int)ip6_maxfrags) goto dropfrag; for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next) @@ -531,7 +537,7 @@ insert: * the most recently active fragmented packet. 
*/ frag6_enq(ip6af, af6->ip6af_up, hash); - atomic_add_int(_frag6_nfrags, 1); + atomic_add_int(_nfrags, 1); q6->ip6q_nfrag++; #if 0 /* xxx */ if (q6 != head->ip6q_next) { @@ -595,7 +601,7 @@ insert: if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) { frag6_remque(q6, hash); - atomic_subtract_int(_frag6_nfrags, q6->ip6q_nfrag); + atomic_subtract_int(_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_destroy(q6); #endif @@ -612,7 +618,7 @@ insert: (caddr_t)); frag6_remque(q6, hash); - atomic_subtract_int(_frag6_nfrags, q6->ip6q_nfrag); + atomic_subtract_int(_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_reassemble(q6, m); mac_ip6q_destroy(q6); @@ -708,7 +714,7 @@ frag6_freef(struct ip6q *q6, uint32_t bucket) free(af6, M_FTABLE); } frag6_remque(q6, bucket); - atomic_subtract_int(_frag6_nfrags, q6->ip6q_nfrag); + atomic_subtract_int(_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_destroy(q6); #endif Modified: head/sys/netinet6/in6_proto.c == ---
svn commit: r337780 - head/sys/netinet
Author: jtl Date: Tue Aug 14 17:23:05 2018 New Revision: 337780 URL: https://svnweb.freebsd.org/changeset/base/337780 Log: Implement a limit on the number of IPv4 reassembly queues per bucket. There is a hashing algorithm which should distribute IPv4 reassembly queues across the available buckets in a relatively even way. However, if there is a flaw in the hashing algorithm which allows a large number of IPv4 fragment reassembly queues to end up in a single bucket, a per- bucket limit could help mitigate the performance impact of this flaw. Implement such a limit, with a default of twice the maximum number of reassembly queues divided by the number of buckets. Recalculate the limit any time the maximum number of reassembly queues changes. However, allow the user to override the value using a sysctl (net.inet.ip.maxfragbucketsize). Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet/ip_reass.c Modified: head/sys/netinet/ip_reass.c == --- head/sys/netinet/ip_reass.c Tue Aug 14 17:20:31 2018(r337779) +++ head/sys/netinet/ip_reass.c Tue Aug 14 17:23:05 2018(r337780) @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ SYSCTL_DECL(_net_inet_ip); struct ipqbucket { TAILQ_HEAD(ipqhead, ipq) head; struct mtx lock; + int count; }; VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]); @@ -82,6 +84,9 @@ VNET_DEFINE_STATIC(uint32_t, ipq_hashseed); #defineIPQ_UNLOCK(i) mtx_unlock(_ipq[i].lock) #defineIPQ_LOCK_ASSERT(i) mtx_assert(_ipq[i].lock, MA_OWNED) +VNET_DEFINE_STATIC(int, ipreass_maxbucketsize); +#defineV_ipreass_maxbucketsize VNET(ipreass_maxbucketsize) + void ipreass_init(void); void ipreass_drain(void); void ipreass_slowtimo(void); @@ -89,25 +94,26 @@ voidipreass_slowtimo(void); void ipreass_destroy(void); #endif static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); +static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS); static 
voidipreass_zone_change(void *); static voidipreass_drain_tomax(void); -static voidipq_free(struct ipqhead *, struct ipq *); +static voidipq_free(struct ipqbucket *, struct ipq *); static struct ipq * ipq_reuse(int); static inline void -ipq_timeout(struct ipqhead *head, struct ipq *fp) +ipq_timeout(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } static inline void -ipq_drop(struct ipqhead *head, struct ipq *fp) +ipq_drop(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } static int maxfrags; @@ -136,6 +142,10 @@ VNET_DEFINE_STATIC(int, maxfragsperpacket); SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, _NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); +SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize, +CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, +sysctl_maxfragbucketsize, "I", +"Maximum number of IPv4 fragment reassembly queue entries per bucket"); /* * Take incoming datagram fragment and try to reassemble it into @@ -241,9 +251,12 @@ ip_reass(struct mbuf *m) * If first fragment to arrive, create a reassembly queue. 
*/ if (fp == NULL) { - fp = uma_zalloc(V_ipq_zone, M_NOWAIT); + if (V_ipq[hash].count < V_ipreass_maxbucketsize) + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) fp = ipq_reuse(hash); + if (fp == NULL) + goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); @@ -253,6 +266,7 @@ ip_reass(struct mbuf *m) mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); + V_ipq[hash].count++; fp->ipq_nfrags = 1; atomic_add_int(, 1); fp->ipq_ttl = IPFRAGTTL; @@ -360,7 +374,7 @@ ip_reass(struct mbuf *m) for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) - ipq_drop(head, fp); + ipq_drop(_ipq[hash], fp); goto done; } next += ntohs(GETIP(q)->ip_len); @@ -368,7 +382,7 @@
svn commit: r337778 - head/sys/netinet
Author: jtl Date: Tue Aug 14 17:19:49 2018 New Revision: 337778 URL: https://svnweb.freebsd.org/changeset/base/337778 Log: Add a global limit on the number of IPv4 fragments. The IP reassembly fragment limit is based on the number of mbuf clusters, which are a global resource. However, the limit is currently applied on a per-VNET basis. Given enough VNETs (or given sufficient customization of enough VNETs), it is possible that the sum of all the VNET limits will exceed the number of mbuf clusters available in the system. Given the fact that the fragment limit is intended (at least in part) to regulate access to a global resource, the fragment limit should be applied on a global basis. VNET-specific limits can be adjusted by modifying the net.inet.ip.maxfragpackets and net.inet.ip.maxfragsperpacket sysctls. To disable fragment reassembly globally, set net.inet.ip.maxfrags to 0. To disable fragment reassembly for a particular VNET, set net.inet.ip.maxfragpackets to 0. Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet/ip_reass.c Modified: head/sys/netinet/ip_reass.c == --- head/sys/netinet/ip_reass.c Tue Aug 14 17:19:11 2018(r33) +++ head/sys/netinet/ip_reass.c Tue Aug 14 17:19:49 2018(r337778) @@ -110,6 +110,15 @@ ipq_drop(struct ipqhead *head, struct ipq *fp) ipq_free(head, fp); } +static int maxfrags; +static volatile u_int nfrags; +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, +, 0, +"Maximum number of IPv4 fragments allowed across all reassembly queues"); +SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD, +__DEVOLATILE(u_int *, ), 0, +"Current number of IPv4 fragments across all reassembly queues"); + VNET_DEFINE_STATIC(uma_zone_t, ipq_zone); #defineV_ipq_zone VNET(ipq_zone) SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET | @@ -146,7 +155,7 @@ ip_reass(struct mbuf *m) struct mbuf *p, *q, *nq, *t; struct ipq *fp; struct ipqhead *head; - int i, hlen, next; + int i, hlen, next, 
tmpmax; u_int8_t ecn, ecn0; uint32_t hash, hashkey[3]; #ifdef RSS @@ -156,8 +165,12 @@ ip_reass(struct mbuf *m) /* * If no reassembling or maxfragsperpacket are 0, * never accept fragments. +* Also, drop packet if it would exceed the maximum +* number of fragments. */ - if (V_noreass == 1 || V_maxfragsperpacket == 0) { + tmpmax = maxfrags; + if (V_noreass == 1 || V_maxfragsperpacket == 0 || + (tmpmax >= 0 && atomic_load_int() >= (u_int)tmpmax)) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); @@ -241,6 +254,7 @@ ip_reass(struct mbuf *m) #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); fp->ipq_nfrags = 1; + atomic_add_int(, 1); fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; @@ -251,6 +265,7 @@ ip_reass(struct mbuf *m) goto done; } else { fp->ipq_nfrags++; + atomic_add_int(, 1); #ifdef MAC mac_ipq_update(m, fp); #endif @@ -327,6 +342,7 @@ ip_reass(struct mbuf *m) m->m_nextpkt = nq; IPSTAT_INC(ips_fragdropped); fp->ipq_nfrags--; + atomic_subtract_int(, 1); m_freem(q); } @@ -392,6 +408,7 @@ ip_reass(struct mbuf *m) while (m->m_pkthdr.csum_data & 0x) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0x) + (m->m_pkthdr.csum_data >> 16); + atomic_subtract_int(, fp->ipq_nfrags); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); @@ -451,8 +468,10 @@ ip_reass(struct mbuf *m) dropfrag: IPSTAT_INC(ips_fragdropped); - if (fp != NULL) + if (fp != NULL) { fp->ipq_nfrags--; + atomic_subtract_int(, 1); + } m_freem(m); done: IPQ_UNLOCK(hash); @@ -479,9 +498,11 @@ ipreass_init(void) NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_ipq_zone, nmbclusters / 32); - if (IS_DEFAULT_VNET(curvnet)) + if (IS_DEFAULT_VNET(curvnet)) { + maxfrags = nmbclusters / 32; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); + } } /* @@ -564,9 +585,19 @@ ipreass_drain_tomax(void) static void ipreass_zone_change(void *tag) { + VNET_ITERATOR_DECL(vnet_iter); + int max; - uma_zone_set_max(V_ipq_zone, nmbclusters / 32); - 
ipreass_drain_tomax(); + maxfrags = nmbclusters / 32; + max =
svn commit: r337776 - head/sys/netinet6
Author: jtl Date: Tue Aug 14 17:17:37 2018 New Revision: 337776 URL: https://svnweb.freebsd.org/changeset/base/337776 Log: Improve IPv6 reassembly performance by hashing fragments into buckets. Currently, all IPv6 fragment reassembly queues are kept in a flat linked list. This has a number of implications. Two significant implications are: all reassembly operations share a common lock, and it is possible for the linked list to grow quite large. Improve IPv6 reassembly performance by hashing fragments into buckets, each of which has its own lock. Calculate the hash key using a Jenkins hash with a random seed. Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet6/frag6.c Modified: head/sys/netinet6/frag6.c == --- head/sys/netinet6/frag6.c Tue Aug 14 17:15:47 2018(r337775) +++ head/sys/netinet6/frag6.c Tue Aug 14 17:17:37 2018(r337776) @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -49,6 +50,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #include #include @@ -65,29 +68,41 @@ __FBSDID("$FreeBSD$"); #include -static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *); -static void frag6_deq(struct ip6asfrag *); -static void frag6_insque(struct ip6q *, struct ip6q *); -static void frag6_remque(struct ip6q *); -static void frag6_freef(struct ip6q *); - -static struct mtx ip6qlock; /* - * These fields all protected by ip6qlock. + * Reassembly headers are stored in hash buckets. 
*/ -VNET_DEFINE_STATIC(u_int, frag6_nfragpackets); -VNET_DEFINE_STATIC(u_int, frag6_nfrags); -VNET_DEFINE_STATIC(struct ip6q, ip6q); /* ip6 reassemble queue */ +#defineIP6REASS_NHASH_LOG2 6 +#defineIP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) +#defineIP6REASS_HMASK (IP6REASS_NHASH - 1) +static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *, +uint32_t bucket __unused); +static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused); +static void frag6_insque_head(struct ip6q *, struct ip6q *, +uint32_t bucket __unused); +static void frag6_remque(struct ip6q *, uint32_t bucket __unused); +static void frag6_freef(struct ip6q *, uint32_t bucket); + +struct ip6qbucket { + struct ip6q ip6q; + struct mtx lock; +}; + +VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); +VNET_DEFINE_STATIC(volatile u_int, frag6_nfrags); +VNET_DEFINE_STATIC(struct ip6qbucket, ip6q[IP6REASS_NHASH]); +VNET_DEFINE_STATIC(uint32_t, ip6q_hashseed); + #defineV_frag6_nfragpacketsVNET(frag6_nfragpackets) #defineV_frag6_nfrags VNET(frag6_nfrags) #defineV_ip6q VNET(ip6q) +#defineV_ip6q_hashseed VNET(ip6q_hashseed) -#defineIP6Q_LOCK_INIT()mtx_init(, "ip6qlock", NULL, MTX_DEF); -#defineIP6Q_LOCK() mtx_lock() -#defineIP6Q_TRYLOCK() mtx_trylock() -#defineIP6Q_LOCK_ASSERT() mtx_assert(, MA_OWNED) -#defineIP6Q_UNLOCK() mtx_unlock() +#defineIP6Q_LOCK(i)mtx_lock(_ip6q[(i)].lock) +#defineIP6Q_TRYLOCK(i) mtx_trylock(_ip6q[(i)].lock) +#defineIP6Q_LOCK_ASSERT(i) mtx_assert(_ip6q[(i)].lock, MA_OWNED) +#defineIP6Q_UNLOCK(i) mtx_unlock(_ip6q[(i)].lock) +#defineIP6Q_HEAD(i)(_ip6q[(i)].ip6q) static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); @@ -105,18 +120,22 @@ frag6_change(void *tag) void frag6_init(void) { + struct ip6q *q6; + int i; V_ip6_maxfragpackets = nmbclusters / 4; V_ip6_maxfrags = nmbclusters / 4; - V_ip6q.ip6q_next = V_ip6q.ip6q_prev = _ip6q; - + for (i = 0; i < IP6REASS_NHASH; i++) { + q6 = IP6Q_HEAD(i); + q6->ip6q_next = q6->ip6q_prev = q6; + mtx_init(_ip6q[i].lock, 
"ip6qlock", NULL, MTX_DEF); + } + V_ip6q_hashseed = arc4random(); if (!IS_DEFAULT_VNET(curvnet)) return; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); - - IP6Q_LOCK_INIT(); } /* @@ -157,12 +176,13 @@ frag6_input(struct mbuf **mp, int *offp, int proto) struct mbuf *m = *mp, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; - struct ip6q *q6; + struct ip6q *head, *q6; struct ip6asfrag *af6, *ip6af, *af6dwn; struct in6_ifaddr *ia; int offset = *offp, nxt, i, next; int first_frag = 0; int fragoff, frgpartlen;/* must be larger than u_int16_t */ + uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp; struct ifnet *dstifp; u_int8_t ecn,
svn commit: r337775 - head/sys/netinet
Author: jtl Date: Tue Aug 14 17:15:47 2018 New Revision: 337775 URL: https://svnweb.freebsd.org/changeset/base/337775 Log: Improve hashing of IPv4 fragments. Currently, IPv4 fragments are hashed into buckets based on a 32-bit key which is calculated by (src_ip ^ ip_id) and combined with a random seed. However, because an attacker can control the values of src_ip and ip_id, it is possible to construct an attack which causes very deep chains to form in a given bucket. To ensure more uniform distribution (and lower predictability for an attacker), calculate the hash based on a key which includes all the fields we use to identify a reassembly queue (dst_ip, src_ip, ip_id, and the ip protocol) as well as a random seed. Reviewed by: jhb Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: head/sys/netinet/ip_reass.c Modified: head/sys/netinet/ip_reass.c == --- head/sys/netinet/ip_reass.c Tue Aug 14 17:14:33 2018(r337774) +++ head/sys/netinet/ip_reass.c Tue Aug 14 17:15:47 2018(r337775) @@ -148,7 +148,7 @@ ip_reass(struct mbuf *m) struct ipqhead *head; int i, hlen, next; u_int8_t ecn, ecn0; - uint32_t hash; + uint32_t hash, hashkey[3]; #ifdef RSS uint32_t rss_hash, rss_type; #endif @@ -202,8 +202,12 @@ ip_reass(struct mbuf *m) m->m_data += hlen; m->m_len -= hlen; - hash = ip->ip_src.s_addr ^ ip->ip_id; - hash = jenkins_hash32(, 1, V_ipq_hashseed) & IPREASS_HMASK; + hashkey[0] = ip->ip_src.s_addr; + hashkey[1] = ip->ip_dst.s_addr; + hashkey[2] = (uint32_t)ip->ip_p << 16; + hashkey[2] += ip->ip_id; + hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed); + hash &= IPREASS_HMASK; head = _ipq[hash].head; IPQ_LOCK(hash); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r337384 - in head: share/man/man4 sys/netinet
On Mon, Aug 6, 2018 at 2:16 PM Bjoern A. Zeeb < bzeeb-li...@lists.zabbadoz.net> wrote: > > On 6 Aug 2018, at 17:36, Jonathan T. Looney wrote: > > > Author: jtl > > Date: Mon Aug 6 17:36:57 2018 > > New Revision: 337384 > > URL: https://svnweb.freebsd.org/changeset/base/337384 > > > > Log: > > Address concerns about CPU usage while doing TCP reassembly. > … > > > > Reviewed by:jhb > > Approved by:so > > Security: FreeBSD-SA-18:08.tcp > > Security: CVE-2018-6922 > > > > Modified: > > head/share/man/man4/tcp.4 > > head/sys/netinet/tcp_reass.c > > > Would you mind bumping .Dd as well please? Done for head and the stable branches. I'll ask so@ about the releng branches. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337392 - stable/10/share/man/man4
Author: jtl Date: Mon Aug 6 18:47:03 2018 New Revision: 337392 URL: https://svnweb.freebsd.org/changeset/base/337392 Log: MFC r337390: Bump date after r337384. Modified: stable/10/share/man/man4/tcp.4 Directory Properties: stable/10/ (props changed) Modified: stable/10/share/man/man4/tcp.4 == --- stable/10/share/man/man4/tcp.4 Mon Aug 6 18:46:09 2018 (r337391) +++ stable/10/share/man/man4/tcp.4 Mon Aug 6 18:47:03 2018 (r337392) @@ -38,7 +38,7 @@ .\" From: @(#)tcp.48.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd October 13, 2014 +.Dd August 6, 2018 .Dt TCP 4 .Os .Sh NAME ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337391 - stable/11/share/man/man4
Author: jtl Date: Mon Aug 6 18:46:09 2018 New Revision: 337391 URL: https://svnweb.freebsd.org/changeset/base/337391 Log: MFC r337390: Bump date after r337384. Modified: stable/11/share/man/man4/tcp.4 Directory Properties: stable/11/ (props changed) Modified: stable/11/share/man/man4/tcp.4 == --- stable/11/share/man/man4/tcp.4 Mon Aug 6 18:42:37 2018 (r337390) +++ stable/11/share/man/man4/tcp.4 Mon Aug 6 18:46:09 2018 (r337391) @@ -34,7 +34,7 @@ .\" From: @(#)tcp.48.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd February 6, 2017 +.Dd August 6, 2018 .Dt TCP 4 .Os .Sh NAME ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337390 - head/share/man/man4
Author: jtl Date: Mon Aug 6 18:42:37 2018 New Revision: 337390 URL: https://svnweb.freebsd.org/changeset/base/337390 Log: Bump date after r337384. Reported by: bz Modified: head/share/man/man4/tcp.4 Modified: head/share/man/man4/tcp.4 == --- head/share/man/man4/tcp.4 Mon Aug 6 17:50:40 2018(r337389) +++ head/share/man/man4/tcp.4 Mon Aug 6 18:42:37 2018(r337390) @@ -34,7 +34,7 @@ .\" From: @(#)tcp.48.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd May 9, 2018 +.Dd August 6, 2018 .Dt TCP 4 .Os .Sh NAME ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337389 - in releng/10.4: share/man/man4 sys/netinet
Author: jtl Date: Mon Aug 6 17:50:40 2018 New Revision: 337389 URL: https://svnweb.freebsd.org/changeset/base/337389 Log: Address concerns about CPU usage while doing TCP reassembly. Currently, the per-queue limit is a function of the receive buffer size and the MSS. In certain cases (such as connections with large receive buffers), the per-queue segment limit can be quite large. Because we process segments as a linked list, large queues may not perform acceptably. The better long-term solution is to make the queue more efficient. But, in the short-term, we can provide a way for a system administrator to set the maximum queue size. We set the default queue limit to 100. This is an effort to balance performance with a sane resource limit. Depending on their environment, goals, etc., an administrator may choose to modify this limit in either direction. Approved by: so Security: FreeBSD-SA-18:08.tcp Security: CVE-2018-6922 Modified: releng/10.4/share/man/man4/tcp.4 releng/10.4/sys/netinet/tcp_reass.c Modified: releng/10.4/share/man/man4/tcp.4 == --- releng/10.4/share/man/man4/tcp.4Mon Aug 6 17:48:46 2018 (r337388) +++ releng/10.4/share/man/man4/tcp.4Mon Aug 6 17:50:40 2018 (r337389) @@ -436,6 +436,20 @@ no reseeding will occur. Reseeding should not be necessary, and will break .Dv TIME_WAIT recycling for a few minutes. +.It Va reass.cursegments +The current total number of segments present in all reassembly queues. +.It Va reass.maxsegments +The maximum limit on the total number of segments across all reassembly +queues. +The limit can be adjusted as a tunable. +.It Va reass.maxqueuelen +The maximum number of segments allowed in each reassembly queue. +By default, the system chooses a limit based on each TCP connection's +receive buffer size and maximum segment size (MSS). +The actual limit applied to a session's reassembly queue will be the lower of +the system-calculated automatic limit and the user-specified +.Va reass.maxqueuelen +limit. 
.It Va rexmit_min , rexmit_slop Adjust the retransmit timer calculation for .Tn TCP . Modified: releng/10.4/sys/netinet/tcp_reass.c == --- releng/10.4/sys/netinet/tcp_reass.c Mon Aug 6 17:48:46 2018 (r337388) +++ releng/10.4/sys/netinet/tcp_reass.c Mon Aug 6 17:50:40 2018 (r337389) @@ -96,6 +96,11 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, static uma_zone_t tcp_reass_zone; +static u_int tcp_reass_maxqueuelen = 100; +SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, maxqueuelen, CTLFLAG_RWTUN, +_reass_maxqueuelen, 0, +"Maximum number of TCP Segments per Reassembly Queue"); + /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) @@ -184,6 +189,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * socket receive buffer determines our advertised window and grows * automatically when socket buffer autotuning is enabled. Use it as the * basis for our queue limit. +* +* However, allow the user to specify a ceiling for the number of +* segments in each queue. +* * Always let the missing segment through which caused this queue. * NB: Access to the socket buffer is left intentionally unlocked as we * can tolerate stale information here. @@ -194,7 +203,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * is understood. */ if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, + tcp_reass_maxqueuelen)) { tcp_reass_overflows++; TCPSTAT_INC(tcps_rcvmemdrop); m_freem(m); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337388 - in releng/11.1: share/man/man4 sys/netinet
Author: jtl Date: Mon Aug 6 17:48:46 2018 New Revision: 337388 URL: https://svnweb.freebsd.org/changeset/base/337388 Log: Address concerns about CPU usage while doing TCP reassembly. Currently, the per-queue limit is a function of the receive buffer size and the MSS. In certain cases (such as connections with large receive buffers), the per-queue segment limit can be quite large. Because we process segments as a linked list, large queues may not perform acceptably. The better long-term solution is to make the queue more efficient. But, in the short-term, we can provide a way for a system administrator to set the maximum queue size. We set the default queue limit to 100. This is an effort to balance performance with a sane resource limit. Depending on their environment, goals, etc., an administrator may choose to modify this limit in either direction. Approved by: so Security: FreeBSD-SA-18:08.tcp Security: CVE-2018-6922 Modified: releng/11.1/share/man/man4/tcp.4 releng/11.1/sys/netinet/tcp_reass.c Modified: releng/11.1/share/man/man4/tcp.4 == --- releng/11.1/share/man/man4/tcp.4Mon Aug 6 17:47:47 2018 (r337387) +++ releng/11.1/share/man/man4/tcp.4Mon Aug 6 17:48:46 2018 (r337388) @@ -445,6 +445,20 @@ no reseeding will occur. Reseeding should not be necessary, and will break .Dv TIME_WAIT recycling for a few minutes. +.It Va reass.cursegments +The current total number of segments present in all reassembly queues. +.It Va reass.maxsegments +The maximum limit on the total number of segments across all reassembly +queues. +The limit can be adjusted as a tunable. +.It Va reass.maxqueuelen +The maximum number of segments allowed in each reassembly queue. +By default, the system chooses a limit based on each TCP connection's +receive buffer size and maximum segment size (MSS). +The actual limit applied to a session's reassembly queue will be the lower of +the system-calculated automatic limit and the user-specified +.Va reass.maxqueuelen +limit. 
.It Va rexmit_min , rexmit_slop Adjust the retransmit timer calculation for .Tn TCP . Modified: releng/11.1/sys/netinet/tcp_reass.c == --- releng/11.1/sys/netinet/tcp_reass.c Mon Aug 6 17:47:47 2018 (r337387) +++ releng/11.1/sys/netinet/tcp_reass.c Mon Aug 6 17:48:46 2018 (r337388) @@ -89,6 +89,11 @@ SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegme _reass_zone, "Global number of TCP Segments currently in Reassembly Queue"); +static u_int tcp_reass_maxqueuelen = 100; +SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, maxqueuelen, CTLFLAG_RWTUN, +_reass_maxqueuelen, 0, +"Maximum number of TCP Segments per Reassembly Queue"); + /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) @@ -168,6 +173,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * socket receive buffer determines our advertised window and grows * automatically when socket buffer autotuning is enabled. Use it as the * basis for our queue limit. +* +* However, allow the user to specify a ceiling for the number of +* segments in each queue. +* * Always let the missing segment through which caused this queue. * NB: Access to the socket buffer is left intentionally unlocked as we * can tolerate stale information here. @@ -178,7 +187,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * is understood. */ if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, + tcp_reass_maxqueuelen)) { TCPSTAT_INC(tcps_rcvreassfull); *tlenp = 0; if ((s = tcp_log_addrs(>t_inpcb->inp_inc, th, NULL, NULL))) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337387 - in releng/11.2: share/man/man4 sys/netinet
Author: jtl Date: Mon Aug 6 17:47:47 2018 New Revision: 337387 URL: https://svnweb.freebsd.org/changeset/base/337387 Log: Address concerns about CPU usage while doing TCP reassembly. Currently, the per-queue limit is a function of the receive buffer size and the MSS. In certain cases (such as connections with large receive buffers), the per-queue segment limit can be quite large. Because we process segments as a linked list, large queues may not perform acceptably. The better long-term solution is to make the queue more efficient. But, in the short-term, we can provide a way for a system administrator to set the maximum queue size. We set the default queue limit to 100. This is an effort to balance performance with a sane resource limit. Depending on their environment, goals, etc., an administrator may choose to modify this limit in either direction. Approved by: so Security: FreeBSD-SA-18:08.tcp Security: CVE-2018-6922 Modified: releng/11.2/share/man/man4/tcp.4 releng/11.2/sys/netinet/tcp_reass.c Modified: releng/11.2/share/man/man4/tcp.4 == --- releng/11.2/share/man/man4/tcp.4Mon Aug 6 17:46:28 2018 (r337386) +++ releng/11.2/share/man/man4/tcp.4Mon Aug 6 17:47:47 2018 (r337387) @@ -445,6 +445,20 @@ no reseeding will occur. Reseeding should not be necessary, and will break .Dv TIME_WAIT recycling for a few minutes. +.It Va reass.cursegments +The current total number of segments present in all reassembly queues. +.It Va reass.maxsegments +The maximum limit on the total number of segments across all reassembly +queues. +The limit can be adjusted as a tunable. +.It Va reass.maxqueuelen +The maximum number of segments allowed in each reassembly queue. +By default, the system chooses a limit based on each TCP connection's +receive buffer size and maximum segment size (MSS). +The actual limit applied to a session's reassembly queue will be the lower of +the system-calculated automatic limit and the user-specified +.Va reass.maxqueuelen +limit. 
.It Va rexmit_min , rexmit_slop Adjust the retransmit timer calculation for .Tn TCP . Modified: releng/11.2/sys/netinet/tcp_reass.c == --- releng/11.2/sys/netinet/tcp_reass.c Mon Aug 6 17:46:28 2018 (r337386) +++ releng/11.2/sys/netinet/tcp_reass.c Mon Aug 6 17:47:47 2018 (r337387) @@ -89,6 +89,11 @@ SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegme _reass_zone, "Global number of TCP Segments currently in Reassembly Queue"); +static u_int tcp_reass_maxqueuelen = 100; +SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, maxqueuelen, CTLFLAG_RWTUN, +_reass_maxqueuelen, 0, +"Maximum number of TCP Segments per Reassembly Queue"); + /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) @@ -168,6 +173,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * socket receive buffer determines our advertised window and grows * automatically when socket buffer autotuning is enabled. Use it as the * basis for our queue limit. +* +* However, allow the user to specify a ceiling for the number of +* segments in each queue. +* * Always let the missing segment through which caused this queue. * NB: Access to the socket buffer is left intentionally unlocked as we * can tolerate stale information here. @@ -178,7 +187,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * is understood. */ if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, + tcp_reass_maxqueuelen)) { TCPSTAT_INC(tcps_rcvreassfull); *tlenp = 0; if ((s = tcp_log_addrs(>t_inpcb->inp_inc, th, NULL, NULL))) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337386 - in stable/10: share/man/man4 sys/netinet
Author: jtl Date: Mon Aug 6 17:46:28 2018 New Revision: 337386 URL: https://svnweb.freebsd.org/changeset/base/337386 Log: MFC r337384: Address concerns about CPU usage while doing TCP reassembly. Currently, the per-queue limit is a function of the receive buffer size and the MSS. In certain cases (such as connections with large receive buffers), the per-queue segment limit can be quite large. Because we process segments as a linked list, large queues may not perform acceptably. The better long-term solution is to make the queue more efficient. But, in the short-term, we can provide a way for a system administrator to set the maximum queue size. We set the default queue limit to 100. This is an effort to balance performance with a sane resource limit. Depending on their environment, goals, etc., an administrator may choose to modify this limit in either direction. Approved by: so Security: FreeBSD-SA-18:08.tcp Sponsored by: CVE-2018-6922 Modified: stable/10/share/man/man4/tcp.4 stable/10/sys/netinet/tcp_reass.c Directory Properties: stable/10/ (props changed) Modified: stable/10/share/man/man4/tcp.4 == --- stable/10/share/man/man4/tcp.4 Mon Aug 6 17:41:53 2018 (r337385) +++ stable/10/share/man/man4/tcp.4 Mon Aug 6 17:46:28 2018 (r337386) @@ -436,6 +436,20 @@ no reseeding will occur. Reseeding should not be necessary, and will break .Dv TIME_WAIT recycling for a few minutes. +.It Va reass.cursegments +The current total number of segments present in all reassembly queues. +.It Va reass.maxsegments +The maximum limit on the total number of segments across all reassembly +queues. +The limit can be adjusted as a tunable. +.It Va reass.maxqueuelen +The maximum number of segments allowed in each reassembly queue. +By default, the system chooses a limit based on each TCP connection's +receive buffer size and maximum segment size (MSS). 
+The actual limit applied to a session's reassembly queue will be the lower of +the system-calculated automatic limit and the user-specified +.Va reass.maxqueuelen +limit. .It Va rexmit_min , rexmit_slop Adjust the retransmit timer calculation for .Tn TCP . Modified: stable/10/sys/netinet/tcp_reass.c == --- stable/10/sys/netinet/tcp_reass.c Mon Aug 6 17:41:53 2018 (r337385) +++ stable/10/sys/netinet/tcp_reass.c Mon Aug 6 17:46:28 2018 (r337386) @@ -96,6 +96,11 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, static uma_zone_t tcp_reass_zone; +static u_int tcp_reass_maxqueuelen = 100; +SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, maxqueuelen, CTLFLAG_RWTUN, +_reass_maxqueuelen, 0, +"Maximum number of TCP Segments per Reassembly Queue"); + /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) @@ -184,6 +189,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * socket receive buffer determines our advertised window and grows * automatically when socket buffer autotuning is enabled. Use it as the * basis for our queue limit. +* +* However, allow the user to specify a ceiling for the number of +* segments in each queue. +* * Always let the missing segment through which caused this queue. * NB: Access to the socket buffer is left intentionally unlocked as we * can tolerate stale information here. @@ -194,7 +203,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * is understood. */ if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, + tcp_reass_maxqueuelen)) { tcp_reass_overflows++; TCPSTAT_INC(tcps_rcvmemdrop); m_freem(m); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337385 - in stable/11: share/man/man4 sys/netinet
Author: jtl Date: Mon Aug 6 17:41:53 2018 New Revision: 337385 URL: https://svnweb.freebsd.org/changeset/base/337385 Log: MFC r337384: Address concerns about CPU usage while doing TCP reassembly. Currently, the per-queue limit is a function of the receive buffer size and the MSS. In certain cases (such as connections with large receive buffers), the per-queue segment limit can be quite large. Because we process segments as a linked list, large queues may not perform acceptably. The better long-term solution is to make the queue more efficient. But, in the short-term, we can provide a way for a system administrator to set the maximum queue size. We set the default queue limit to 100. This is an effort to balance performance with a sane resource limit. Depending on their environment, goals, etc., an administrator may choose to modify this limit in either direction. Reviewed by: jhb Approved by: so Security: FreeBSD-SA-18:08.tcp Security: CVE-2018-6922 Modified: stable/11/share/man/man4/tcp.4 stable/11/sys/netinet/tcp_reass.c Directory Properties: stable/11/ (props changed) Modified: stable/11/share/man/man4/tcp.4 == --- stable/11/share/man/man4/tcp.4 Mon Aug 6 17:36:57 2018 (r337384) +++ stable/11/share/man/man4/tcp.4 Mon Aug 6 17:41:53 2018 (r337385) @@ -445,6 +445,20 @@ no reseeding will occur. Reseeding should not be necessary, and will break .Dv TIME_WAIT recycling for a few minutes. +.It Va reass.cursegments +The current total number of segments present in all reassembly queues. +.It Va reass.maxsegments +The maximum limit on the total number of segments across all reassembly +queues. +The limit can be adjusted as a tunable. +.It Va reass.maxqueuelen +The maximum number of segments allowed in each reassembly queue. +By default, the system chooses a limit based on each TCP connection's +receive buffer size and maximum segment size (MSS). 
+The actual limit applied to a session's reassembly queue will be the lower of +the system-calculated automatic limit and the user-specified +.Va reass.maxqueuelen +limit. .It Va rexmit_min , rexmit_slop Adjust the retransmit timer calculation for .Tn TCP . Modified: stable/11/sys/netinet/tcp_reass.c == --- stable/11/sys/netinet/tcp_reass.c Mon Aug 6 17:36:57 2018 (r337384) +++ stable/11/sys/netinet/tcp_reass.c Mon Aug 6 17:41:53 2018 (r337385) @@ -89,6 +89,11 @@ SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegme _reass_zone, "Global number of TCP Segments currently in Reassembly Queue"); +static u_int tcp_reass_maxqueuelen = 100; +SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, maxqueuelen, CTLFLAG_RWTUN, +_reass_maxqueuelen, 0, +"Maximum number of TCP Segments per Reassembly Queue"); + /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) @@ -168,6 +173,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * socket receive buffer determines our advertised window and grows * automatically when socket buffer autotuning is enabled. Use it as the * basis for our queue limit. +* +* However, allow the user to specify a ceiling for the number of +* segments in each queue. +* * Always let the missing segment through which caused this queue. * NB: Access to the socket buffer is left intentionally unlocked as we * can tolerate stale information here. @@ -178,7 +187,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * is understood. */ if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, + tcp_reass_maxqueuelen)) { TCPSTAT_INC(tcps_rcvreassfull); *tlenp = 0; if ((s = tcp_log_addrs(>t_inpcb->inp_inc, th, NULL, NULL))) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r337384 - in head: share/man/man4 sys/netinet
Author: jtl Date: Mon Aug 6 17:36:57 2018 New Revision: 337384 URL: https://svnweb.freebsd.org/changeset/base/337384 Log: Address concerns about CPU usage while doing TCP reassembly. Currently, the per-queue limit is a function of the receive buffer size and the MSS. In certain cases (such as connections with large receive buffers), the per-queue segment limit can be quite large. Because we process segments as a linked list, large queues may not perform acceptably. The better long-term solution is to make the queue more efficient. But, in the short-term, we can provide a way for a system administrator to set the maximum queue size. We set the default queue limit to 100. This is an effort to balance performance with a sane resource limit. Depending on their environment, goals, etc., an administrator may choose to modify this limit in either direction. Reviewed by: jhb Approved by: so Security: FreeBSD-SA-18:08.tcp Security: CVE-2018-6922 Modified: head/share/man/man4/tcp.4 head/sys/netinet/tcp_reass.c Modified: head/share/man/man4/tcp.4 == --- head/share/man/man4/tcp.4 Mon Aug 6 17:21:20 2018(r337383) +++ head/share/man/man4/tcp.4 Mon Aug 6 17:36:57 2018(r337384) @@ -445,6 +445,20 @@ no reseeding will occur. Reseeding should not be necessary, and will break .Dv TIME_WAIT recycling for a few minutes. +.It Va reass.cursegments +The current total number of segments present in all reassembly queues. +.It Va reass.maxsegments +The maximum limit on the total number of segments across all reassembly +queues. +The limit can be adjusted as a tunable. +.It Va reass.maxqueuelen +The maximum number of segments allowed in each reassembly queue. +By default, the system chooses a limit based on each TCP connection's +receive buffer size and maximum segment size (MSS). +The actual limit applied to a session's reassembly queue will be the lower of +the system-calculated automatic limit and the user-specified +.Va reass.maxqueuelen +limit. 
.It Va rexmit_min , rexmit_slop Adjust the retransmit timer calculation for .Tn TCP . Modified: head/sys/netinet/tcp_reass.c == --- head/sys/netinet/tcp_reass.cMon Aug 6 17:21:20 2018 (r337383) +++ head/sys/netinet/tcp_reass.cMon Aug 6 17:36:57 2018 (r337384) @@ -91,6 +91,11 @@ SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegme _reass_zone, "Global number of TCP Segments currently in Reassembly Queue"); +static u_int tcp_reass_maxqueuelen = 100; +SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, maxqueuelen, CTLFLAG_RWTUN, +_reass_maxqueuelen, 0, +"Maximum number of TCP Segments per Reassembly Queue"); + /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) @@ -170,6 +175,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * socket receive buffer determines our advertised window and grows * automatically when socket buffer autotuning is enabled. Use it as the * basis for our queue limit. +* +* However, allow the user to specify a ceiling for the number of +* segments in each queue. +* * Always let the missing segment through which caused this queue. * NB: Access to the socket buffer is left intentionally unlocked as we * can tolerate stale information here. @@ -180,7 +189,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tl * is understood. */ if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, + tcp_reass_maxqueuelen)) { TCPSTAT_INC(tcps_rcvreassfull); *tlenp = 0; if ((s = tcp_log_addrs(>t_inpcb->inp_inc, th, NULL, NULL))) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r335856 - in head/sys: netinet sys
On Mon, Jul 2, 2018 at 10:44 AM Steven Hartland < steven.hartl...@multiplay.co.uk> wrote: > > You have M_WAITOK and a null check in this change And, that's the same as the way it was before his commits. So, he did exactly what he said he was doing and reverted his commits. I don't think it is good practice to mix reverts with other changes. Since you've noticed this, I think you should feel free to make the change. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r335402 - head/sbin/veriexecctl
On Wed, Jun 20, 2018 at 9:49 AM Stephen Kiernan wrote: > And I was working on those sets of changes, when work and family didn't > steal away time. I was told that some discussion happened at BSDCan this > year in such that veriexec should go in as-is so it would be there, which is why > the commit happened (given the review was approved to land back in January.) I will readily admit that I was probably the source of this. My reasoning was fairly simple: when a review has been open for over a year with no action, it seems like the submitter should be able to commit it without waiting for more review, if they are confident in their change. I stand by that (and, in fact, would substitute something shorter for "over a year"). (Of course, if the submitter has other reasons for delaying the commit, that's their prerogative, and I wasn't intending to push Steve to commit this before he was ready.) Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r335402 - head/sbin/veriexecctl
On Tue, Jun 19, 2018 at 8:34 PM Conrad Meyer wrote: > Please revert this patchset. It's not ready. > I'm not sure I understand the need to revert the patches. They may need some refinement, but they also do provide some functionality upon which you can build the tooling that Simon discussed. Unless I missed something, this feature only impacts the system when it is specifically compiled in. In cases like that, I think it's reasonable to give the committer some time to refine them in place prior to the code slush/freeze, at which point we can decide what to do. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r335068 - in head: share/man/man9 sys/amd64/amd64 sys/i386/i386 sys/kern sys/net sys/sys sys/vm
On Thu, Jun 14, 2018 at 7:12 PM Olivier Houchard wrote: > On Fri, Jun 15, 2018 at 12:23:36AM +0200, Emmanuel Vadot wrote: > > This breaks module loading on armv7 and arm64 > > I think I fixed it with r335182. > Jonathan probably missed it because modules are different on amd64 (and > mips), and the code that handles those are different, and doesn't use > malloc(). > Hi Olivier, Thanks for fixing that! FWIW, I do have plans to make some further enhancements to fine-tune the permissions used for kernel modules. Once I have a patch, I would appreciate it if a few of the people impacted by this change would be willing to test it. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r335068 - in head: share/man/man9 sys/amd64/amd64 sys/i386/i386 sys/kern sys/net sys/sys sys/vm
Author: jtl Date: Wed Jun 13 17:04:41 2018 New Revision: 335068 URL: https://svnweb.freebsd.org/changeset/base/335068 Log: Make UMA and malloc(9) return non-executable memory in most cases. Most kernel memory that is allocated after boot does not need to be executable. There are a few exceptions. For example, kernel modules do need executable memory, but they don't use UMA or malloc(9). The BPF JIT compiler also needs executable memory and did use malloc(9) until r317072. (Note that a side effect of r316767 was that the "small allocation" path in UMA on amd64 already returned non-executable memory. This meant that some calls to malloc(9) or the UMA zone(9) allocator could return executable memory, while others could return non-executable memory. This change makes the behavior consistent.) This change makes malloc(9) return non-executable memory unless the new M_EXEC flag is specified. After this change, the UMA zone(9) allocator will always return non-executable memory, and a KASSERT will catch attempts to use the M_EXEC flag to allocate executable memory using uma_zalloc() or its variants. Allocations that do need executable memory have various choices. They may use the M_EXEC flag to malloc(9), or they may use a different VM interface to obtain executable pages. Now that malloc(9) again allows executable allocations, this change also reverts most of r317072. 
PR: 228927 Reviewed by: alc, kib, markj, jhb (previous version) Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D15691 Modified: head/share/man/man9/malloc.9 head/share/man/man9/zone.9 head/sys/amd64/amd64/bpf_jit_machdep.c head/sys/i386/i386/bpf_jit_machdep.c head/sys/kern/kern_malloc.c head/sys/kern/subr_vmem.c head/sys/net/bpf_jitter.c head/sys/net/bpf_jitter.h head/sys/sys/malloc.h head/sys/vm/uma.h head/sys/vm/uma_core.c head/sys/vm/vm_extern.h head/sys/vm/vm_init.c head/sys/vm/vm_kern.c head/sys/vm/vm_kern.h head/sys/vm/vm_pagequeue.h Modified: head/share/man/man9/malloc.9 == --- head/share/man/man9/malloc.9Wed Jun 13 17:01:57 2018 (r335067) +++ head/share/man/man9/malloc.9Wed Jun 13 17:04:41 2018 (r335068) @@ -29,7 +29,7 @@ .\" $NetBSD: malloc.9,v 1.3 1996/11/11 00:05:11 lukem Exp $ .\" $FreeBSD$ .\" -.Dd January 24, 2018 +.Dd June 13, 2018 .Dt MALLOC 9 .Os .Sh NAME @@ -189,6 +189,11 @@ This option should only be used in combination with .Dv M_NOWAIT when an allocation failure cannot be tolerated by the caller without catastrophic effects on the system. +.It Dv M_EXEC +Indicates that the system should allocate executable memory. +If this flag is not set, the system will not allocate executable memory. +Not all platforms enforce a distinction between executable and +non-executable memory. .El .Pp Exactly one of either Modified: head/share/man/man9/zone.9 == --- head/share/man/man9/zone.9 Wed Jun 13 17:01:57 2018(r335067) +++ head/share/man/man9/zone.9 Wed Jun 13 17:04:41 2018(r335068) @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd April 26, 2017 +.Dd June 13, 2018 .Dt ZONE 9 .Os .Sh NAME @@ -375,6 +375,15 @@ if the zone ran out of unused items and .Dv M_NOWAIT was specified. +.Sh IMPLEMENTATION NOTES +The memory that these allocation calls return is not executable. +The +.Fn uma_zalloc +function does not support the +.Dv M_EXEC +flag to allocate executable memory. 
+Not all platforms enforce a distinction between executable and +non-executable memory. .Sh SEE ALSO .Xr malloc 9 .Sh HISTORY Modified: head/sys/amd64/amd64/bpf_jit_machdep.c == --- head/sys/amd64/amd64/bpf_jit_machdep.c Wed Jun 13 17:01:57 2018 (r335067) +++ head/sys/amd64/amd64/bpf_jit_machdep.c Wed Jun 13 17:04:41 2018 (r335068) @@ -44,9 +44,6 @@ __FBSDID("$FreeBSD$"); #include #include -#include -#include -#include #else #include #include @@ -605,11 +602,7 @@ bpf_jit_compile(struct bpf_insn *prog, u_int nins, siz *size = stream.cur_ip; #ifdef _KERNEL - /* -* We cannot use malloc(9) because DMAP is mapped as NX. -*/ - stream.ibuf = (void *)kmem_malloc(kernel_arena, *size, - M_NOWAIT); + stream.ibuf = malloc(*size, M_BPFJIT, M_EXEC | M_NOWAIT); if (stream.ibuf == NULL) break; #else @@ -657,15 +650,4 @@ bpf_jit_compile(struct bpf_insn *prog, u_int nins, siz #endif return ((bpf_filter_func)(void *)stream.ibuf); -} - -void -bpf_jit_free(void *func, size_t size) -{ - -#ifdef _KERNEL - kmem_free(kernel_arena, (vm_offset_t)func, size);
svn commit: r334983 - head/sys/net
Author: jtl Date: Mon Jun 11 23:32:06 2018 New Revision: 334983 URL: https://svnweb.freebsd.org/changeset/base/334983 Log: Fix a memory leak for the BIOCSETWF ioctl on kernels with the BPF_JITTER option. The BPF code was creating a compiled filter in the common filter-creation path. However, BPF only uses compiled filters in the read direction. When creating a write filter, the common filter-creation code was creating an unneeded write filter and leaking the memory used for that. MFC after:2 weeks Sponsored by: Netflix Modified: head/sys/net/bpf.c Modified: head/sys/net/bpf.c == --- head/sys/net/bpf.c Mon Jun 11 22:48:34 2018(r334982) +++ head/sys/net/bpf.c Mon Jun 11 23:32:06 2018(r334983) @@ -1895,8 +1895,13 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo return (EINVAL); } #ifdef BPF_JITTER - /* Filter is copied inside fcode and is perfectly valid. */ - jfunc = bpf_jitter(fcode, flen); + if (cmd != BIOCSETWF) { + /* +* Filter is copied inside fcode and is +* perfectly valid. +*/ + jfunc = bpf_jitter(fcode, flen); + } #endif } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r334949 - in head/sys/netinet: . tcp_stacks
Author: jtl Date: Mon Jun 11 14:27:19 2018 New Revision: 334949 URL: https://svnweb.freebsd.org/changeset/base/334949 Log: Change RACK dependency on TCPHPTS from a build-time dependency to a load- time dependency. At present, RACK requires the TCPHPTS option to run. However, because modules can be moved from machine to machine, this dependency is really best assessed at load time rather than at build time. Reviewed by: rrs Sponsored by: Netflix Differential Revision:https://reviews.freebsd.org/D15756 Modified: head/sys/netinet/tcp_hpts.c head/sys/netinet/tcp_stacks/rack.c Modified: head/sys/netinet/tcp_hpts.c == --- head/sys/netinet/tcp_hpts.c Mon Jun 11 10:08:22 2018(r334948) +++ head/sys/netinet/tcp_hpts.c Mon Jun 11 14:27:19 2018(r334949) @@ -1960,3 +1960,4 @@ tcp_init_hptsi(void *st) } SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL); +MODULE_VERSION(tcphpts, 1); Modified: head/sys/netinet/tcp_stacks/rack.c == --- head/sys/netinet/tcp_stacks/rack.c Mon Jun 11 10:08:22 2018 (r334948) +++ head/sys/netinet/tcp_stacks/rack.c Mon Jun 11 14:27:19 2018 (r334949) @@ -127,10 +127,6 @@ uma_zone_t rack_pcb_zone; struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; -#ifndef TCPHPTS -fatal error missing option TCPHSTS in the build; -#endif - #define CUM_ACKED 1 #define SACKED 2 @@ -9162,3 +9158,4 @@ static moduledata_t tcp_rack = { MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); +MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r334854 - head/share/man/man9
Author: jtl Date: Fri Jun 8 19:47:04 2018 New Revision: 334854 URL: https://svnweb.freebsd.org/changeset/base/334854 Log: Create a symlink for sodtor_set(9) to the socket(9) man page. Modified: head/share/man/man9/Makefile Modified: head/share/man/man9/Makefile == --- head/share/man/man9/MakefileFri Jun 8 19:35:24 2018 (r334853) +++ head/share/man/man9/MakefileFri Jun 8 19:47:04 2018 (r334854) @@ -1895,6 +1895,7 @@ MLINKS+=socket.9 soabort.9 \ socket.9 soconnect.9 \ socket.9 socreate.9 \ socket.9 sodisconnect.9 \ + socket.9 sodtor_set.9 \ socket.9 sodupsockaddr.9 \ socket.9 sofree.9 \ socket.9 sogetopt.9 \ ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r334853 - in head: share/man/man9 sys/kern sys/sys
Author: jtl Date: Fri Jun 8 19:35:24 2018 New Revision: 334853 URL: https://svnweb.freebsd.org/changeset/base/334853 Log: Add a socket destructor callback. This allows kernel providers to set callbacks to perform additional cleanup actions at the time a socket is closed. Michio Honda presented a use for this at BSDCan 2018. (See https://www.bsdcan.org/2018/schedule/events/965.en.html .) Submitted by: Michio Honda (previous version) Reviewed by: lstewart (previous version) Differential Revision:https://reviews.freebsd.org/D15706 Modified: head/share/man/man9/socket.9 head/sys/kern/uipc_socket.c head/sys/sys/socketvar.h Modified: head/share/man/man9/socket.9 == --- head/share/man/man9/socket.9Fri Jun 8 18:54:47 2018 (r334852) +++ head/share/man/man9/socket.9Fri Jun 8 19:35:24 2018 (r334853) @@ -26,7 +26,7 @@ .\" .\" $FreeBSD$ .\" -.Dd May 26, 2014 +.Dd June 8, 2018 .Dt SOCKET 9 .Os .Sh NAME @@ -54,6 +54,11 @@ .Fc .Ft int .Fn sodisconnect "struct socket *so" +.Ft void +.Fo sodtor_set +.Fa "struct socket *so" +.Fa "void (*func)(struct socket *)" +.Fc .Ft struct sockaddr * .Fn sodupsockaddr "const struct sockaddr *sa" "int mflags" .Ft void @@ -369,6 +374,13 @@ be cleared, with .Dv SO_RCV or .Dv SO_SND . +.Ss Socket Destructor Callback +A kernel system can use the +.Fn sodtor_set +function to set a destructor for a socket. +The destructor is called when the socket is closed. +The destructor is called after the protocol close routine has completed. +The destructor can serve as a callback to initiate additional cleanup actions. 
.Ss Socket I/O The .Fn soreceive Modified: head/sys/kern/uipc_socket.c == --- head/sys/kern/uipc_socket.c Fri Jun 8 18:54:47 2018(r334852) +++ head/sys/kern/uipc_socket.c Fri Jun 8 19:35:24 2018(r334853) @@ -1101,6 +1101,8 @@ soclose(struct socket *so) drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); + if (so->so_dtor != NULL) + so->so_dtor(so); SOCK_LOCK(so); if ((listening = (so->so_options & SO_ACCEPTCONN))) { @@ -3810,6 +3812,17 @@ sodupsockaddr(const struct sockaddr *sa, int mflags) if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; +} + +/* + * Register per-socket destructor. + */ +void +sodtor_set(struct socket *so, so_dtor_t *func) +{ + + SOCK_LOCK_ASSERT(so); + so->so_dtor = func; } /* Modified: head/sys/sys/socketvar.h == --- head/sys/sys/socketvar.hFri Jun 8 18:54:47 2018(r334852) +++ head/sys/sys/socketvar.hFri Jun 8 19:35:24 2018(r334853) @@ -63,6 +63,7 @@ struct vnet; * private data and error information. */ typedefint so_upcall_t(struct socket *, void *, int); +typedefvoid so_dtor_t(struct socket *); struct socket; @@ -99,6 +100,7 @@ struct socket { /* NB: generation count must not be first. 
*/ so_gen_t so_gencnt; /* (h) generation count */ void*so_emuldata; /* (b) private data for emulators */ + so_dtor_t *so_dtor; /* (b) optional destructor */ struct osd osd;/* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach @@ -397,6 +399,7 @@ int soconnect2(struct socket *so1, struct socket *so2) intsocreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td); intsodisconnect(struct socket *so); +void sodtor_set(struct socket *, so_dtor_t *); struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sofree(struct socket *so); void sohasoutofband(struct socket *so); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r334804 - in head/sys: kern modules/tcp modules/tcp/rack netinet netinet/tcp_stacks sys
On Fri, Jun 8, 2018 at 12:37 PM, hiren panchasara < hi...@strugglingcoder.info> wrote: > ps: I know we are not killing the default stack as of yet and just > stopping active maintenance of it but just wanted to raise these > (probably obvious) points. We absolutely are not stopping active maintenance of the "default" (soon to be renamed "base") TCP stack. RACK is an alternative which might be useful for some people. But, there is no suggestion that we should in any way change the priority we give to maintaining the base TCP stack anytime in the near future. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r334783 - head/sys/vm
Author: jtl Date: Thu Jun 7 13:29:54 2018 New Revision: 334783 URL: https://svnweb.freebsd.org/changeset/base/334783 Log: Fix a typo in vm_domain_set(). When a domain crosses into the severe range, we need to set the domain bit from the vm_severe_domains bitset (instead of clearing it). Reviewed by: jeff, markj Sponsored by: Netflix, Inc. Modified: head/sys/vm/vm_page.c Modified: head/sys/vm/vm_page.c == --- head/sys/vm/vm_page.c Thu Jun 7 13:16:53 2018(r334782) +++ head/sys/vm/vm_page.c Thu Jun 7 13:29:54 2018(r334783) @@ -2845,7 +2845,7 @@ vm_domain_set(struct vm_domain *vmd) } if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { vmd->vmd_severeset = 1; - DOMAINSET_CLR(vmd->vmd_domain, _severe_domains); + DOMAINSET_SET(vmd->vmd_domain, _severe_domains); } mtx_unlock(_domainset_lock); } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r334702 - head/sys/sys
On Wed, Jun 6, 2018 at 10:14 PM, Ravi Pokala wrote: > > -Original Message- > From: on behalf of Mateusz Guzik < mjgu...@gmail.com> > Date: 2018-06-06, Wednesday at 09:01 > To: Ravi Pokala > Cc: Mateusz Guzik , src-committers < src-committ...@freebsd.org>, , < svn-src-h...@freebsd.org> > Subject: Re: svn commit: r334702 - head/sys/sys > > > On Wed, Jun 6, 2018 at 1:35 PM, Ravi Pokala wrote: > > > >>> + * Passing the flag down requires malloc to blindly zero the entire object. > >>> + * In practice a lot of the zeroing can be avoided if most of the object > >>> + * gets explicitly initialized after the allocation. Letting the compiler > >>> + * zero in place gives it the opportunity to take advantage of this state. > >> > >> This part, I still don't understand. :-( > > > > The call to bzero() is still for the full length passed in, so how does this help? > > > > bzero is: > > #define bzero(buf, len) __builtin_memset((buf), 0, (len)) > > I'm afraid that doesn't answer my question; you're passing the full length to __builtin_memset() too. I believe the theory is that the compiler (remember, this is __builtin_memset) can optimize away portions of the zeroing, or can optimize zeroing for small sizes. For example, imagine you do this: struct foo { uint32_t a; uint32_t b; }; struct foo * alloc_foo(void) { struct foo *rv; rv = malloc(sizeof(*rv), M_TMP, M_WAITOK|M_ZERO); rv->a = 1; rv->b = 2; return (rv); } In theory, the compiler can be smart enough to know that the entire structure is initialized, so it is not necessary to zero it. (I personally have not tested how well this works in practice. However, this change theoretically lets the compiler be smarter and optimize away unneeded work.) At minimum, it should let the compiler replace calls to memset() (and the loops there) with optimal instructions to zero the exact amount of memory that needs to be initialized. 
(Again, I haven't personally tested how smart the compilers we use are about producing optimal code in this situation.) Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r334490 - head/share/man/man9
Author: jtl Date: Fri Jun 1 16:47:39 2018 New Revision: 334490 URL: https://svnweb.freebsd.org/changeset/base/334490 Log: Update the sysctl(9) manpage to indicate that is required instead of . ( includes NULL, which is defined with and not .) Sponsored by: Netflix Modified: head/share/man/man9/sysctl.9 Modified: head/share/man/man9/sysctl.9 == --- head/share/man/man9/sysctl.9Fri Jun 1 16:46:29 2018 (r334489) +++ head/share/man/man9/sysctl.9Fri Jun 1 16:47:39 2018 (r334490) @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd March 3, 2018 +.Dd June 1, 2018 .Dt SYSCTL 9 .Os .Sh NAME @@ -80,7 +80,7 @@ .Nm SYSCTL_UQUAD .Nd Dynamic and static sysctl MIB creation functions .Sh SYNOPSIS -.In sys/types.h +.In sys/param.h .In sys/sysctl.h .Fn SYSCTL_DECL name .Ft struct sysctl_oid * ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r334104 - in head/sys: netinet sys
On Wed, May 23, 2018 at 7:13 PM, Matthew Macy wrote: > > On Wed, May 23, 2018 at 11:52 AM, John Baldwin wrote: > > On Wednesday, May 23, 2018 05:00:05 PM Matt Macy wrote: > >> Author: mmacy > >> Date: Wed May 23 17:00:05 2018 > >> New Revision: 334104 > >> URL: https://svnweb.freebsd.org/changeset/base/334104 > >> > >> Log: > >> epoch: allow for conditionally asserting that the epoch context fields > >> are unused by zeroing on INVARIANTS builds > > > > Is M_ZERO really so bad that you need to make it conditional? > > In this case not at all. It's only exercised by sysctl handlers. I'm > mostly responding to an inquiry by jtl. However, gratuitous M_ZERO > usage does have a cumulative adverse performance impact. I appreciate you making this change. And, I do think it is worth avoiding M_ZERO where it is unnecessary, for the reason you state. > > I would probably have preferred something like 'M_ZERO_INVARIANTS' > > instead perhaps (or M_ZERO_EPOCH) that only controls M_ZERO and is > > still or'd with M_WAITOK or M_NOWAIT. > > Yes. I like that better too. Thanks. Yes, that does seem better. Thanks! Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r333915 - head/sys/netinet
Summary: I'm not sure this addresses all the subtleties of INPCB destruction. I think it would benefit from wider review. I suggest it be reverted in the meantime. On Sun, May 20, 2018 at 12:38 AM, Matt Macywrote: > + > +/* > + * Unconditionally schedule an inpcb to be freed by decrementing its > + * reference count, which should occur only after the inpcb has been > detached > + * from its socket. If another thread holds a temporary reference > (acquired > + * using in_pcbref()) then the free is deferred until that reference is > + * released using in_pcbrele(), but the inpcb is still unlocked. Almost > all > + * work, including removal from global lists, is done in this context, > where > + * the pcbinfo lock is held. > + */ > +void > +in_pcbfree(struct inpcb *inp) > +{ > + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; > + > + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", > __func__)); > + KASSERT((inp->inp_flags2 & INP_FREED) == 0, > + ("%s: called twice for pcb %p", __func__, inp)); > + if (inp->inp_flags2 & INP_FREED) { > INP_WUNLOCK(inp); > + return; > + } > This check no longer serves its purpose; however, I believe it points to a deeper problem: INP_FREED is not set until the deferred epoch context. So, other threads may not know that another thread has already called in_pcbfree. This changes things, such as the behavior of in_pcbrele_[rw]locked(). In the worst case, if someone calls in_pcbfree() twice, the two calls to in_pcbremlists() will result in running `LIST_REMOVE(inp, inp_list)` twice, resulting in memory corruption. I haven't undertaken to audit all the code. However, the fact that in_pcbrele_[rw]locked() will no longer immediately return 1 may change logic such that this is now possible. 
> + > +#ifdef INVARIANTS > + if (pcbinfo == _tcbinfo) { > + INP_INFO_LOCK_ASSERT(pcbinfo); > + } else { > + INP_INFO_WLOCK_ASSERT(pcbinfo); > + } > +#endif > + INP_WLOCK_ASSERT(inp); > + /* Remove first from list */ > + INP_LIST_WLOCK(pcbinfo); > + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; > + in_pcbremlists(inp); > in_pcbremlists() also increments inp->inp_gencnt(); therefore, it is not necessary to increment it above. > + INP_LIST_WUNLOCK(pcbinfo); > + RO_INVALIDATE_CACHE(>inp_route); > + INP_WUNLOCK(inp); > + epoch_call(net_epoch_preempt, >inp_epoch_ctx, > in_pcbfree_deferred); > } > > /* > > Modified: head/sys/netinet/in_pcb.h > > == > --- head/sys/netinet/in_pcb.h Sun May 20 04:32:48 2018(r333914) > +++ head/sys/netinet/in_pcb.h Sun May 20 04:38:04 2018(r333915) > @@ -328,6 +328,7 @@ struct inpcb { > LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for > proto */ > /* (p[w]) for list iteration */ > /* (p[r]/l) for addition/removal */ > + struct epoch_context inp_epoch_ctx; > }; > #endif /* _KERNEL */ > > > ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r333690 - in head/sys: dev/hwpmc kern sys
On Wed, May 16, 2018 at 6:29 PM, Matt Macy wrote: > > Author: mmacy > Date: Wed May 16 22:29:20 2018 > New Revision: 333690 > URL: https://svnweb.freebsd.org/changeset/base/333690 > > Log: > hwpmc: Implement per-thread counters for PMC sampling > > This implements per-thread counters for PMC sampling. The thread > descriptors are stored in a list attached to the process descriptor. > These thread descriptors can store any per-thread information necessary > for current or future features. For the moment, they just store the counters > for sampling. > > The thread descriptors are created when the process descriptor is created. > Additionally, thread descriptors are created or freed when threads > are started or stopped. Because the thread exit function is called in a > critical section, we can't directly free the thread descriptors. Hence, > they are freed to a cache, which is also used as a source of allocations > when needed for new threads. > > Approved by: sbruno > Obtained from:jtl > Sponsored by: Juniper Networks, Limelight Networks > Differential Revision:https://reviews.freebsd.org/D15335 Thanks! Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r333511 - stable/11/sys/kern
Author: jtl Date: Sat May 12 01:55:24 2018 New Revision: 333511 URL: https://svnweb.freebsd.org/changeset/base/333511 Log: r285910 attempted to make shutdown() be POSIX compliant by returning ENOTCONN when shutdown() is called on unconnected sockets. This change was slightly modified by r316874, which returns ENOTCONN in the case of an unconnected datagram socket, but still runs the shutdown code for the socket. This specifically supports the case where the user-space code is using the shutdown() call to wakeup another thread blocked on the socket. In PR 227259, a user is reporting that they have code which is using shutdown() to wakup another thread blocked on a stream listen socket. This code is failing, while it used to work on FreeBSD 10 and still works on Linux. It seems reasonable to add another exception to support something users are actually doing, which used to work on FreeBSD 10, and still works on Linux. And, it seems like it should be acceptable to POSIX, as we still return ENOTCONN. This is a direct commit to stable/11. The listen socket code changed substantially in head, and the code change there will be substantially more complex. In the meantime, it seems to make sense to commit this trivial fix to stable/11 given the fact that users appear to depend on this behavior, this appears to have been an unintended change in stable/11, and we did not announce the change. PR: 227259 Reviewed by: ed Approved by: re (gjb) Sponsored by: Netflix, Inc. Differential Revision:https://reviews.freebsd.org/D15021 Tested by:Eric Masson (emss at free.fr) Modified: stable/11/sys/kern/uipc_socket.c Modified: stable/11/sys/kern/uipc_socket.c == --- stable/11/sys/kern/uipc_socket.cSat May 12 01:43:32 2018 (r333510) +++ stable/11/sys/kern/uipc_socket.cSat May 12 01:55:24 2018 (r333511) @@ -2359,7 +2359,8 @@ soshutdown(struct socket *so, int how) * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). 
*/ - if (so->so_type != SOCK_DGRAM) + if (so->so_type != SOCK_DGRAM && + !(so->so_options & SO_ACCEPTCONN)) return (ENOTCONN); soerror_enotconn = 1; } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r333503 - stable/11/sys/net
On Fri, May 11, 2018 at 4:40 PM, Stephen Hurd wrote: > > Author: shurd > Date: Fri May 11 20:40:26 2018 > New Revision: 333503 > URL: https://svnweb.freebsd.org/changeset/base/333503 > > Log: > MFC r29, r66, r73 > > r29: Fix off-by-one error requesting tx interrupt > r66: Cleanup queues when iflib_device_register fails > r73: Log iflib_tx_structures_setup failure in function > Is this an acceptable style for MFC logs? I'm asking because I actually prefer this to reading (or compiling) the concatenated log messages from several changes. However, I never knew it was acceptable to summarize like this. If it is, I'd like to know so I can adopt it for run-of-the-mill MFCs. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r333211 - head/sys/netinet/cc
On Thu, May 3, 2018 at 11:01 AM, Sean Bruno wrote: > > @@ -242,8 +241,8 @@ tf_cwnd(int ticks_since_cong, int rtt_ticks, unsigned > { > > /* Equation 4 of I-D. */ > - return (((wmax * CUBIC_BETA) + (((THREE_X_PT2 * ticks_since_cong * > - smss) << CUBIC_SHIFT) / TWO_SUB_PT2 / rtt_ticks)) >> CUBIC_SHIFT); > + return (((wmax * CUBIC_BETA) + (((THREE_X_PT3 * ticks_since_cong * > + smss) << CUBIC_SHIFT) / TWO_SUB_PT3 / rtt_ticks)) >> CUBIC_SHIFT); > } > > #endif /* _NETINET_CC_CUBIC_H_ */ > Did you analyze this to ensure that the intermediate steps in this calculation would never overflow? Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332958 - head/sys/kern
Author: jtl Date: Tue Apr 24 18:47:35 2018 New Revision: 332958 URL: https://svnweb.freebsd.org/changeset/base/332958 Log: Update r332860 by changing the default from suppressing post-panic assertions to not suppressing post-panic assertions. There are some post-panic assertions that are valuable and we shouldn't default to disabling them. However, when a user trips over them, the user can still adjust the tunable/sysctl to suppress them temporarily to get conduct troubleshooting (e.g. get a core dump). Reported by: cem, markj Modified: head/sys/kern/kern_shutdown.c Modified: head/sys/kern/kern_shutdown.c == --- head/sys/kern/kern_shutdown.c Tue Apr 24 18:41:14 2018 (r332957) +++ head/sys/kern/kern_shutdown.c Tue Apr 24 18:47:35 2018 (r332958) @@ -642,7 +642,7 @@ static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; -static int kassert_suppress_in_panic = 1; +static int kassert_suppress_in_panic = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options"); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r332860 - head/sys/kern
On Mon, Apr 23, 2018 at 6:04 PM, John Baldwinwrote: > > I think this is actually a key question. In my experience to date I have not > encountered a large number of post-panic assertion failures. Given that > we already break all locks and disable assertions for locks I'd be curious > which assertions are actually failing. My inclination given my experiences > to date would be to explicitly ignore those as we do for locking if it is > constrained set rather than blacklisting all of them. However, I would be > most interested in seeing some examples of assertions that are failing. The latest example (the one that prompted me to finally commit this) is in lockmgr_sunlock_try(): 'panic: Assertion (*xp & ~LK_EXCLUSIVE_SPINNERS) == LK_SHARERS_LOCK(1) failed at /usr/src/sys/kern/kern_lock.c:541' I don't see any obvious recent changes that would have caused this, so this is probably a case where a change to another file suddenly made us trip over this assert. And, that really illustrates my overall point: most assertions in general-use code have limited value after a panic. We expect developers to write high-quality assertions so we can catch bugs. This requires that they understand how their code will be used. However, once we've panic'd, many assumptions behind code change and the assertions are no longer valid. (And, sometimes, it is difficult for a developer to predict how these things will change in a panic situation.) We can either play whack-a-mole to modify assertions as we trip over them in our post-panic work, or we can switch to an opt-in model where we only check assertions which the developer actually intends to run post-panic. Playing whack-a-mole seems like a Sisyphean task which will burn out developers and/or frustrate people who run INVARIANTS kernels. Switching to an opt-in model seems like the better long-term strategy. 
Having said all of that, I am cognizant of at least two things: 1) Mark Johnston has done a lot of work in coredumps and thinks there are post-panic assertions that have value. 2) Until we have both agreement to switch our post-panic assertion paradigm and also infrastructure to allow developers to opt in, it probably is not wise to disable all assertions by default. So, I will follow Mark's suggestions: I will change the default. I will also change the code so we print a limited number of failed assertions. However, I think that changing the post-panic assertion paradigm is an important conversation to have. We want people to run our INVARIANTS kernels. And, we want to get high-quality reports from those. I think we could better serve those goals by changing the post-panic assertion paradigm. Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r332860 - head/sys/kern
Hi Mark, Let me start by saying that I appreciate your well-reasoned response. (I think) I understand your reasoning, I appreciate your well-explained argument, and I respect your opinion. I just wanted to make that clear up front. On Sun, Apr 22, 2018 at 1:11 PM, Mark Johnstonwrote: > > > All too often, my ability to debug assertion violations is hindered because > > the system trips over yet another assertion while dumping the core. If we > > skip the assertion, nothing bad happens. (The post-panic debugging code > > already needs to deal with systems that are inconsistent, and it does a > > pretty good job at it.) > > I think we make a decent effort to fix such problems as they arise, but > you are declaring defeat on behalf of everyone. Did you make some effort > to fix or report these issues before resorting to the more drastic > measure taken here? We try to report or fix them as they arise. However, you don't know there is a problem until you actually run into it. And, you don't run into the problem until you can't get a core dump due to the assertion. (And, with elusive problems, it isn't always easy to duplicate them. So, fixing the assertion is sometimes "too late".) > > On the other hand, I really am not sure what you are worried might happen > > if we skip checking assertions after we've already panic'd. As far as I can > > tell, the likely worst case is that we hit a true panic of some kind. In > > that case, we're no worse off than before. > > > > I think the one obvious exception is when we're purposely trying to > > validate the post-panic debugging code. In that case, you can change the > > sysctl/tunable to enable troubleshooting. > > What about a user whose test system panics and fails to dump? With > assertions enabled, a developer has a better chance of spotting the > problem. Now we need at least one extra round trip to the user to > diagnose the problem, which may not be readily reproducible in the first > place. That's true. 
However, this is equally true in the other direction: Prior to this change, when a user tripped over an assertion and was unable to get a coredump due to a second post-panic assertion, it took (at least) another round-trip to get a coredump. First, without the capability to ignore assertions after a panic (introduced by this commit), you would need to fix the actual assertion to enable the user to get a coredump. At minimum, I think this change has value in that case. This change gives you a mechanism to get a coredump without requiring that you fix the assertion and get the user to recompile with the patch. But, moreover, if we change the default back to panic'ing on a second assertion, we will hamper our ability to get usable reports about elusive bugs. If we leave the default "as is", it won't take an extra round-trip to tell the user how to get a coredump. If we change the default (or, perhaps more correctly, "restore the prior default"), we will still need a second round-trip to get coredumps. That makes it tough to chase elusive bugs. > > I would honestly appreciate someone explaining the dangers in disabling a > > response to assertion violations after we've already panic'd and are simply > > trying to troubleshoot, because they are not obvious to me. But, I could > > simply be missing them. > > The assertions help identify code that is being executed during a dump > when it shouldn't be. In general we rely on users to opt in to running > INVARIANTS kernels because developers don't catch every single bug. With > this change it's harder to be confident in the kernel dump code. (Or in > any post-panic debugging code for that matter.) I can appreciate that. I am generally skeptical of the value of assertions in general-use code after a panic, since we already know the system is in an inconsistent/unexpected state. And, it is hard to predict all the various ways it could possibly be broken. 
However, I do recognize that there is code which specifically is written to run post-panic, and which has assertions which SHOULD be true, even after a panic. > I dislike the change and would prefer the default to be inverted. At the > very least I think we should print the assertion message rather than > returning silently from kassert_panic(). I still think this change has value (as described above). I can understand the argument for changing the default. In fact, after thinking about your email, I'm leaning towards doing that. But, I want to ponder it more today. If we leave the default alone, I agree we should print the assertion message (albeit with some rate limit). Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332889 - stable/11/sys/netinet
Author: jtl Date: Mon Apr 23 14:22:16 2018 New Revision: 332889 URL: https://svnweb.freebsd.org/changeset/base/332889 Log: MFC r331745 (by np): Fix RSS build (broken in r331309). Sponsored by: Chelsio Communications PR: 227691 Pointy-hat to: jtl Modified: stable/11/sys/netinet/in_pcb.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet/in_pcb.c == --- stable/11/sys/netinet/in_pcb.c Mon Apr 23 13:47:29 2018 (r332888) +++ stable/11/sys/netinet/in_pcb.c Mon Apr 23 14:22:16 2018 (r332889) @@ -1807,9 +1807,9 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct i found: if (lookupflags & INPLOOKUP_WLOCKPCB) - locked = TRY_INP_WLOCK(inp); + locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) - locked = TRY_INP_RLOCK(inp); + locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (!locked) ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332860 - head/sys/kern
Author: jtl Date: Sat Apr 21 17:05:00 2018 New Revision: 332860 URL: https://svnweb.freebsd.org/changeset/base/332860 Log: When running with INVARIANTS, the kernel contains extra checks. However, these assumptions may not hold true once we've panic'd. Therefore, the checks hold less value after a panic. Additionally, if one of the checks fails while we are already panic'd, this creates a double-panic which can interfere with debugging the original panic. Therefore, this commit allows an administrator to suppress a response to KASSERT checks after a panic by setting a tunable/sysctl. The tunable/sysctl (debug.kassert.suppress_in_panic) defaults to being enabled. Reviewed by: kib Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D12920 Modified: head/sys/kern/kern_shutdown.c Modified: head/sys/kern/kern_shutdown.c == --- head/sys/kern/kern_shutdown.c Sat Apr 21 15:15:47 2018 (r332859) +++ head/sys/kern/kern_shutdown.c Sat Apr 21 17:05:00 2018 (r332860) @@ -642,6 +642,7 @@ static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; +static int kassert_suppress_in_panic = 1; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options"); @@ -676,6 +677,10 @@ SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CT SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); +SYSCTL_INT(_debug_kassert, OID_AUTO, suppress_in_panic, CTLFLAG_RWTUN, +&kassert_suppress_in_panic, 0, +"KASSERTs will be suppressed while handling a panic"); + static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, @@ -707,6 +712,10 @@ kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; + + /* If we already panic'd, don't create a double-fault. 
*/ + if (panicstr != NULL && kassert_suppress_in_panic) + return; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r332833 - in head: contrib/llvm/include/llvm/CodeGen contrib/llvm/lib/CodeGen contrib/llvm/lib/Target/X86 contrib/llvm/lib/Target/X86/Disassembler contrib/llvm/tools/clang/include/clan
On Fri, Apr 20, 2018 at 2:20 PM, Dimitry Andric wrote: > > Author: dim > Date: Fri Apr 20 18:20:55 2018 > New Revision: 332833 > URL: https://svnweb.freebsd.org/changeset/base/332833 > > Log: > Recommit r332501, with an additional upstream fix for "Cannot lower > EFLAGS copy that lives out of a basic block!" errors on i386. > > [...] > > Together, these should ensure clang does not use pushf/popf sequences to > save and restore flags, avoiding problems with unrelated flags (such as > the interrupt flag) being restored unexpectedly. > > Requested by: jtl > PR: 225330 > MFC after: 1 week Thanks (again)! Jonathan ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332842 - stable/11/sys/netinet6
Author: jtl Date: Fri Apr 20 20:18:10 2018 New Revision: 332842 URL: https://svnweb.freebsd.org/changeset/base/332842 Log: MFC r319216: Fix an unnecessary/incorrect check in the PKTOPT_EXTHDRCPY macro. This macro allocates memory and, if malloc does not return NULL, copies data into the new memory. However, it doesn't just check whether malloc returns NULL. It also checks whether we called malloc with M_NOWAIT. That is not necessary. While it may be that malloc() will only return NULL when the M_NOWAIT flag is set, we don't need to check for this when checking malloc's return value. Further, in this case, the check was not completely accurate, because it checked for flags == M_NOWAIT, rather than treating it as a bit field and checking for (flags & M_NOWAIT). Sponsored by: Netflix, Inc. Modified: stable/11/sys/netinet6/ip6_output.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/ip6_output.c == --- stable/11/sys/netinet6/ip6_output.c Fri Apr 20 20:16:42 2018 (r332841) +++ stable/11/sys/netinet6/ip6_output.c Fri Apr 20 20:18:10 2018 (r332842) @@ -2421,7 +2421,7 @@ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ - if (dst->type == NULL && canwait == M_NOWAIT)\ + if (dst->type == NULL)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332841 - stable/11/sys/netinet6
Author: jtl Date: Fri Apr 20 20:16:42 2018 New Revision: 332841 URL: https://svnweb.freebsd.org/changeset/base/332841 Log: MFC r319215: Fix two places in the ICMP6 code where we could dereference a NULL pointer in the icmp6_input() function. When processing an ICMP6_ECHO_REQUEST, if IP6_EXTHDR_GET fails, it will set nicmp6 and n to NULL. Therefore, we should condition our modification to nicmp6 on n being not NULL. And, when processing an ICMP6_WRUREQUEST in the (mode != FQDN) case, if m_dup_pkthdr() fails, the code will set n to NULL. However, the very next line dereferences n. Therefore, when m_dup_pkthdr() fails, we should discontinue further processing and follow the same path as when m_gethdr() fails. Reported by: clang static analyzer Sponsored by: Netflix, Inc. Modified: stable/11/sys/netinet6/icmp6.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/icmp6.c == --- stable/11/sys/netinet6/icmp6.c Fri Apr 20 20:09:42 2018 (r332840) +++ stable/11/sys/netinet6/icmp6.c Fri Apr 20 20:16:42 2018 (r332841) @@ -596,9 +596,9 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) sizeof(*nicmp6)); noff = off; } - nicmp6->icmp6_type = ICMP6_ECHO_REPLY; - nicmp6->icmp6_code = 0; if (n) { + nicmp6->icmp6_type = ICMP6_ECHO_REPLY; + nicmp6->icmp6_code = 0; ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]); icmp6_reflect(n, noff); @@ -688,6 +688,7 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) */ m_free(n); n = NULL; + break; } maxhlen = M_TRAILINGSPACE(n) - (sizeof(*nip6) + sizeof(*nicmp6) + 4); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332840 - in stable/11/sys: netinet netinet6
Author: jtl Date: Fri Apr 20 20:09:42 2018 New Revision: 332840 URL: https://svnweb.freebsd.org/changeset/base/332840 Log: MFC r319214: Enforce the limit on ICMP messages before doing work to formulate the response. Delete an unneeded rate limit for UDP under IPv6. Because ICMP6 messages have their own rate limit, it is unnecessary to apply a second rate limit to UDP messages. Sponsored by: Netflix, Inc. Modified: stable/11/sys/netinet/ip_icmp.c stable/11/sys/netinet6/udp6_usrreq.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet/ip_icmp.c == --- stable/11/sys/netinet/ip_icmp.c Fri Apr 20 19:37:08 2018 (r332839) +++ stable/11/sys/netinet/ip_icmp.c Fri Apr 20 20:09:42 2018 (r332840) @@ -555,11 +555,10 @@ icmp_input(struct mbuf **mp, int *offp, int proto) ICMPSTAT_INC(icps_bmcastecho); break; } - icp->icmp_type = ICMP_ECHOREPLY; if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; - else - goto reflect; + icp->icmp_type = ICMP_ECHOREPLY; + goto reflect; case ICMP_TSTAMP: if (V_icmptstamprepl == 0) @@ -573,13 +572,12 @@ icmp_input(struct mbuf **mp, int *offp, int proto) ICMPSTAT_INC(icps_badlen); break; } + if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) + goto freeit; icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! 
*/ - if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) - goto freeit; - else - goto reflect; + goto reflect; case ICMP_MASKREQ: if (V_icmpmaskrepl == 0) Modified: stable/11/sys/netinet6/udp6_usrreq.c == --- stable/11/sys/netinet6/udp6_usrreq.cFri Apr 20 19:37:08 2018 (r332839) +++ stable/11/sys/netinet6/udp6_usrreq.cFri Apr 20 20:09:42 2018 (r332840) @@ -104,9 +104,7 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include -#include #include #include #include @@ -465,8 +463,6 @@ udp6_input(struct mbuf **mp, int *offp, int proto) goto badunlocked; } if (V_udp_blackhole) - goto badunlocked; - if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); return (IPPROTO_DONE); ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332834 - stable/11/sys/net
Author: jtl Date: Fri Apr 20 18:37:19 2018 New Revision: 332834 URL: https://svnweb.freebsd.org/changeset/base/332834 Log: MFC r314286: Do some minimal work to better conform to the 802.3ad (LACP) standard. In particular, don't set the synchronized bit for the peer unless it truly appears to be synchronized to us. Also, don't set our own synchronized bit unless we have actually seen a remote system. Prior to this change, we were seeing some strange behavior, such as: 1. We send an advertisement with the Activity, Aggregation, and Default flags, followed by an advertisement with the Activity, Aggregation, Synchronization, and Default flags. However, we hadn't seen an advertisement from another peer and were still advertising the default (NULL) peer. A closer examination of the in-kernel data structures (using kgdb) showed that the system had added the default (NULL) peer as a valid aggregator for the segment. 2. We were receiving an advertisement from a peer that included the default (NULL) peer instead of including our system information. However, we responded with an advertisement that included the Synchronization flag for both our system and the peer. (Since the peer's advertisement did not include our system information, we shouldn't add the synchronization bit for the peer.) This patch corrects those two items. Sponsored by: Netflix, Inc. Modified: stable/11/sys/net/ieee8023ad_lacp.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/net/ieee8023ad_lacp.c == --- stable/11/sys/net/ieee8023ad_lacp.c Fri Apr 20 18:20:55 2018 (r332833) +++ stable/11/sys/net/ieee8023ad_lacp.c Fri Apr 20 18:37:19 2018 (r332834) @@ -1308,6 +1308,10 @@ lacp_select(struct lacp_port *lp) return; } + /* If we haven't heard from our peer, skip this step. 
*/ + if (lp->lp_state & LACP_STATE_DEFAULTED) + return; + KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), ("timer_wait_while still active")); @@ -1663,7 +1667,15 @@ lacp_sm_rx_record_pdu(struct lacp_port *lp, const stru LACP_STATE_AGGREGATION) && !lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner)) || (du->ldu_partner.lip_state & LACP_STATE_AGGREGATION) == 0)) { - /* XXX nothing? */ + /* +* XXX Maintain legacy behavior of leaving the +* LACP_STATE_SYNC bit unchanged from the partner's +* advertisement if lsc_strict_mode is false. +* TODO: We should re-examine the concept of the "strict mode" +* to ensure it makes sense to maintain a non-strict mode. +*/ + if (lp->lp_lsc->lsc_strict_mode) + lp->lp_partner.lip_state |= LACP_STATE_SYNC; } else { lp->lp_partner.lip_state &= ~LACP_STATE_SYNC; } @@ -1677,10 +1689,6 @@ lacp_sm_rx_record_pdu(struct lacp_port *lp, const stru lacp_format_state(lp->lp_partner.lip_state, buf, sizeof(buf))); } - - /* XXX Hack, still need to implement 5.4.9 para 2,3,4 */ - if (lp->lp_lsc->lsc_strict_mode) - lp->lp_partner.lip_state |= LACP_STATE_SYNC; lacp_sm_ptx_update_timeout(lp, oldpstate); } ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332831 - stable/11/sys/kern
Author: jtl Date: Fri Apr 20 15:55:09 2018 New Revision: 332831 URL: https://svnweb.freebsd.org/changeset/base/332831 Log: MFC r314116: Fix a panic during boot caused by inadequate locking of some vt(4) driver data structures. vt_change_font() calls vtbuf_grow() to change some vt driver data structures. It uses TF_MUTE to prevent the console from trying to use those data structures while it changes them. During the early stage of the boot process, the vt driver's tc_done routine uses those data structures; however, it is currently called outside the TF_MUTE check. Move the tc_done routine inside the locked TF_MUTE check. PR: 217282 Sponsored by: Netflix, Inc. Modified: stable/11/sys/kern/subr_terminal.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/kern/subr_terminal.c == --- stable/11/sys/kern/subr_terminal.c Fri Apr 20 15:48:50 2018 (r332830) +++ stable/11/sys/kern/subr_terminal.c Fri Apr 20 15:55:09 2018 (r332831) @@ -400,7 +400,10 @@ termtty_outwakeup(struct tty *tp) TERMINAL_UNLOCK_TTY(tm); } - tm->tm_class->tc_done(tm); + TERMINAL_LOCK_TTY(tm); + if (!(tm->tm_flags & TF_MUTE)) + tm->tm_class->tc_done(tm); + TERMINAL_UNLOCK_TTY(tm); if (flags & TF_BELL) tm->tm_class->tc_bell(tm); } @@ -570,10 +573,9 @@ termcn_cnputc(struct consdev *cp, int c) teken_set_curattr(>tm_emulator, _message); teken_input(>tm_emulator, , 1); teken_set_curattr(>tm_emulator, ); + tm->tm_class->tc_done(tm); } TERMINAL_UNLOCK_CONS(tm); - - tm->tm_class->tc_done(tm); } /* ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332830 - in stable/11/sys: dev/acpica x86/x86
Author: jtl Date: Fri Apr 20 15:48:50 2018 New Revision: 332830 URL: https://svnweb.freebsd.org/changeset/base/332830 Log: MFC r313447: Ensure the idle thread's loop services interrupts in a timely way when using the ACPI C1/mwait sleep method. Previously, the mwait instruction would return when an interrupt was pending; however, the idle loop did not actually enable interrupts when this occurred. This led to a situation where the idle loop could quickly spin through the C1/mwait sleep method a number of times when an interrupt was pending. (Eventually, the situation corrected itself when something other than an interrupt triggered the idle loop to either enable interrupts or schedule another thread.) Sponsored by: Netflix, Inc. Modified: stable/11/sys/dev/acpica/acpi_cpu.c stable/11/sys/x86/x86/cpu_machdep.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/dev/acpica/acpi_cpu.c == --- stable/11/sys/dev/acpica/acpi_cpu.c Fri Apr 20 15:44:29 2018 (r332829) +++ stable/11/sys/dev/acpica/acpi_cpu.c Fri Apr 20 15:48:50 2018 (r332830) @@ -1151,6 +1151,9 @@ acpi_cpu_idle(sbintime_t sbt) end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate(); if (curthread->td_critnest == 0) end_time = min(end_time, 50 / hz); + /* acpi_cpu_c1() returns with interrupts enabled. */ + if (cx_next->do_mwait) + ACPI_ENABLE_IRQS(); sc->cpu_prev_sleep = (sc->cpu_prev_sleep * 3 + end_time) / 4; return; } Modified: stable/11/sys/x86/x86/cpu_machdep.c == --- stable/11/sys/x86/x86/cpu_machdep.c Fri Apr 20 15:44:29 2018 (r332829) +++ stable/11/sys/x86/x86/cpu_machdep.c Fri Apr 20 15:48:50 2018 (r332830) @@ -146,6 +146,14 @@ acpi_cpu_c1(void) __asm __volatile("sti; hlt"); } +/* + * Use mwait to pause execution while waiting for an interrupt or + * another thread to signal that there is more work. + * + * NOTE: Interrupts will cause a wakeup; however, this function does + * not enable interrupt handling. The caller is responsible to enable + * interrupts. 
+ */ void acpi_cpu_idle_mwait(uint32_t mwait_hint) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
svn commit: r332829 - stable/11/sys/netinet
Author: jtl Date: Fri Apr 20 15:44:29 2018 New Revision: 332829 URL: https://svnweb.freebsd.org/changeset/base/332829 Log: MFC r307083: Currently, when tcp_input() receives a packet on a session that matches a TCPCB, it checks (so->so_options & SO_ACCEPTCONN) to determine whether or not the socket is a listening socket. However, this causes the code to access a different cacheline. If we first check if the socket is in the LISTEN state, we can avoid accessing so->so_options when processing packets received for ESTABLISHED sessions. If INVARIANTS is defined, the code still needs to access both variables to check that so->so_options is consistent with the state. Sponsored by: Netflix, Inc. Modified: stable/11/sys/netinet/tcp_input.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet/tcp_input.c == --- stable/11/sys/netinet/tcp_input.c Fri Apr 20 15:41:33 2018 (r332828) +++ stable/11/sys/netinet/tcp_input.c Fri Apr 20 15:44:29 2018 (r332829) @@ -1071,11 +1071,11 @@ relocked: * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ - if (so->so_options & SO_ACCEPTCONN) { + KASSERT(tp->t_state == TCPS_LISTEN || !(so->so_options & SO_ACCEPTCONN), + ("%s: so accepting but tp %p not listening", __func__, tp)); + if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN)) { struct in_conninfo inc; - KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " - "tp not listening", __func__)); bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { ___ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"