Module Name: src Committed By: ozaki-r Date: Tue Sep 20 07:19:15 UTC 2022
Modified Files: src/distrib/sets/lists/comp: mi src/sys/netinet: Makefile files.netinet tcp_input.c tcp_subr.c tcp_usrreq.c tcp_var.h src/sys/rump/net/lib/libnetinet: Makefile.inc Added Files: src/sys/netinet: tcp_syncache.c tcp_syncache.h Log Message: tcp: separate syn cache stuffs into tcp_syncache.[ch] files No functional change. To generate a diff of this commit: cvs rdiff -u -r1.2423 -r1.2424 src/distrib/sets/lists/comp/mi cvs rdiff -u -r1.30 -r1.31 src/sys/netinet/Makefile cvs rdiff -u -r1.29 -r1.30 src/sys/netinet/files.netinet cvs rdiff -u -r1.433 -r1.434 src/sys/netinet/tcp_input.c cvs rdiff -u -r1.290 -r1.291 src/sys/netinet/tcp_subr.c cvs rdiff -u -r0 -r1.1 src/sys/netinet/tcp_syncache.c \ src/sys/netinet/tcp_syncache.h cvs rdiff -u -r1.231 -r1.232 src/sys/netinet/tcp_usrreq.c cvs rdiff -u -r1.196 -r1.197 src/sys/netinet/tcp_var.h cvs rdiff -u -r1.15 -r1.16 src/sys/rump/net/lib/libnetinet/Makefile.inc Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/distrib/sets/lists/comp/mi diff -u src/distrib/sets/lists/comp/mi:1.2423 src/distrib/sets/lists/comp/mi:1.2424 --- src/distrib/sets/lists/comp/mi:1.2423 Sat Sep 10 15:50:57 2022 +++ src/distrib/sets/lists/comp/mi Tue Sep 20 07:19:14 2022 @@ -1,4 +1,4 @@ -# $NetBSD: mi,v 1.2423 2022/09/10 15:50:57 rillig Exp $ +# $NetBSD: mi,v 1.2424 2022/09/20 07:19:14 ozaki-r Exp $ # # Note: don't delete entries from here - mark them as "obsolete" instead. ./etc/mtree/set.comp comp-sys-root @@ -2745,6 +2745,7 @@ ./usr/include/netinet/tcp_debug.h comp-c-include ./usr/include/netinet/tcp_fsm.h comp-c-include ./usr/include/netinet/tcp_seq.h comp-c-include +./usr/include/netinet/tcp_syncache.h comp-c-include ./usr/include/netinet/tcp_timer.h comp-c-include ./usr/include/netinet/tcp_var.h comp-c-include ./usr/include/netinet/tcp_vtw.h comp-c-include Index: src/sys/netinet/Makefile diff -u src/sys/netinet/Makefile:1.30 src/sys/netinet/Makefile:1.31 --- src/sys/netinet/Makefile:1.30 Thu Sep 6 06:42:00 2018 +++ src/sys/netinet/Makefile Tue Sep 20 07:19:14 2022 @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.30 2018/09/06 06:42:00 maxv Exp $ +# $NetBSD: Makefile,v 1.31 2022/09/20 07:19:14 ozaki-r Exp $ INCSDIR= /usr/include/netinet @@ -8,8 +8,8 @@ INCS= dccp.h icmp6.h icmp_var.h if_ether in_var.h ip.h ip_carp.h ip6.h ip_ecn.h ip_encap.h \ ip_icmp.h ip_mroute.h ip_var.h pim.h pim_var.h portalgo.h \ sctp.h sctp_uio.h \ - tcp.h tcp_debug.h tcp_fsm.h tcp_seq.h tcp_timer.h tcp_var.h \ - tcpip.h udp.h udp_var.h \ + tcp.h tcp_debug.h tcp_fsm.h tcp_seq.h tcp_syncache.h tcp_timer.h \ + tcp_var.h tcpip.h udp.h udp_var.h \ tcp_vtw.h # ipfilter headers Index: src/sys/netinet/files.netinet diff -u src/sys/netinet/files.netinet:1.29 src/sys/netinet/files.netinet:1.30 --- src/sys/netinet/files.netinet:1.29 Mon Mar 8 18:03:25 2021 +++ src/sys/netinet/files.netinet Tue Sep 20 07:19:14 2022 @@ -1,4 +1,4 @@ -# $NetBSD: files.netinet,v 1.29 2021/03/08 18:03:25 christos Exp $ +# $NetBSD: 
files.netinet,v 1.30 2022/09/20 07:19:14 ozaki-r Exp $ defflag opt_tcp_debug.h TCP_DEBUG defparam opt_tcp_debug.h TCP_NDEBUG @@ -45,6 +45,7 @@ file netinet/tcp_input.c inet | inet6 file netinet/tcp_output.c inet | inet6 file netinet/tcp_sack.c inet | inet6 file netinet/tcp_subr.c inet | inet6 +file netinet/tcp_syncache.c inet | inet6 file netinet/tcp_timer.c inet | inet6 file netinet/tcp_usrreq.c inet | inet6 file netinet/tcp_congctl.c inet | inet6 Index: src/sys/netinet/tcp_input.c diff -u src/sys/netinet/tcp_input.c:1.433 src/sys/netinet/tcp_input.c:1.434 --- src/sys/netinet/tcp_input.c:1.433 Tue May 24 20:50:20 2022 +++ src/sys/netinet/tcp_input.c Tue Sep 20 07:19:14 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp_input.c,v 1.433 2022/05/24 20:50:20 andvar Exp $ */ +/* $NetBSD: tcp_input.c,v 1.434 2022/09/20 07:19:14 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. @@ -137,18 +137,8 @@ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ -/* - * TODO list for SYN cache stuff: - * - * Find room for a "state" field, which is needed to keep a - * compressed state for TIME_WAIT TCBs. It's been noted already - * that this is fairly important for very high-volume web and - * mail servers, which use a large number of short-lived - * connections. 
- */ - #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.433 2022/05/24 20:50:20 andvar Exp $"); +__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.434 2022/09/20 07:19:14 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" @@ -214,6 +204,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_input.c, #include <netinet/tcp_private.h> #include <netinet/tcp_congctl.h> #include <netinet/tcp_debug.h> +#include <netinet/tcp_syncache.h> #ifdef INET6 #include "faith.h" @@ -245,8 +236,6 @@ static struct timeval tcp_rst_ppslim_las static int tcp_ackdrop_ppslim_count = 0; static struct timeval tcp_ackdrop_ppslim_last; -static void syn_cache_timer(void *); - #define TCP_PAWS_IDLE (24U * 24 * 60 * 60 * PR_SLOWHZ) /* for modulo comparisons of timestamps */ @@ -426,8 +415,6 @@ extern struct evcnt tcp_reass_fragdup; static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *, int); -static int tcp_dooptions(struct tcpcb *, const u_char *, int, - struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *); static void tcp4_log_refused(const struct ip *, const struct tcphdr *); #ifdef INET6 @@ -3155,7 +3142,7 @@ tcp_signature(struct mbuf *m, struct tcp * Returns -1 if this segment should be dropped. (eg. wrong signature) * Otherwise returns 0. */ -static int +int tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt, struct tcphdr *th, struct mbuf *m, int toff, struct tcp_opt_info *oi) { @@ -3470,1172 +3457,3 @@ tcp_xmit_timer(struct tcpcb *tp, uint32_ */ tp->t_softerror = 0; } - - -/* - * TCP compressed state engine. Currently used to hold compressed - * state for SYN_RECEIVED. 
- */ - -u_long syn_cache_count; -u_int32_t syn_hash1, syn_hash2; - -#define SYN_HASH(sa, sp, dp) \ - ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ - ((u_int32_t)(sp)))^syn_hash2))) -#ifndef INET6 -#define SYN_HASHALL(hash, src, dst) \ -do { \ - hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ - ((const struct sockaddr_in *)(src))->sin_port, \ - ((const struct sockaddr_in *)(dst))->sin_port); \ -} while (/*CONSTCOND*/ 0) -#else -#define SYN_HASH6(sa, sp, dp) \ - ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ - (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ - & 0x7fffffff) - -#define SYN_HASHALL(hash, src, dst) \ -do { \ - switch ((src)->sa_family) { \ - case AF_INET: \ - hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ - ((const struct sockaddr_in *)(src))->sin_port, \ - ((const struct sockaddr_in *)(dst))->sin_port); \ - break; \ - case AF_INET6: \ - hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \ - ((const struct sockaddr_in6 *)(src))->sin6_port, \ - ((const struct sockaddr_in6 *)(dst))->sin6_port); \ - break; \ - default: \ - hash = 0; \ - } \ -} while (/*CONSTCOND*/0) -#endif /* INET6 */ - -static struct pool syn_cache_pool; - -/* - * We don't estimate RTT with SYNs, so each packet starts with the default - * RTT and each timer step has a fixed timeout value. 
- */ -static inline void -syn_cache_timer_arm(struct syn_cache *sc) -{ - - TCPT_RANGESET(sc->sc_rxtcur, - TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, - TCPTV_REXMTMAX); - callout_reset(&sc->sc_timer, - sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc); -} - -#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) - -static inline void -syn_cache_rm(struct syn_cache *sc) -{ - TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, - sc, sc_bucketq); - sc->sc_tp = NULL; - LIST_REMOVE(sc, sc_tpq); - tcp_syn_cache[sc->sc_bucketidx].sch_length--; - callout_stop(&sc->sc_timer); - syn_cache_count--; -} - -static inline void -syn_cache_put(struct syn_cache *sc) -{ - if (sc->sc_ipopts) - (void) m_free(sc->sc_ipopts); - rtcache_free(&sc->sc_route); - sc->sc_flags |= SCF_DEAD; - if (!callout_invoking(&sc->sc_timer)) - callout_schedule(&(sc)->sc_timer, 1); -} - -void -syn_cache_init(void) -{ - int i; - - pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, - "synpl", NULL, IPL_SOFTNET); - - /* Initialize the hash buckets. */ - for (i = 0; i < tcp_syn_cache_size; i++) - TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); -} - -void -syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) -{ - struct syn_cache_head *scp; - struct syn_cache *sc2; - int s; - - /* - * If there are no entries in the hash table, reinitialize - * the hash secrets. - */ - if (syn_cache_count == 0) { - syn_hash1 = cprng_fast32(); - syn_hash2 = cprng_fast32(); - } - - SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); - sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; - scp = &tcp_syn_cache[sc->sc_bucketidx]; - - /* - * Make sure that we don't overflow the per-bucket - * limit or the total cache size limit. - */ - s = splsoftnet(); - if (scp->sch_length >= tcp_syn_bucket_limit) { - TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW); - /* - * The bucket is full. Toss the oldest element in the - * bucket. This will be the first entry in the bucket. 
- */ - sc2 = TAILQ_FIRST(&scp->sch_bucket); -#ifdef DIAGNOSTIC - /* - * This should never happen; we should always find an - * entry in our bucket. - */ - if (sc2 == NULL) - panic("syn_cache_insert: bucketoverflow: impossible"); -#endif - syn_cache_rm(sc2); - syn_cache_put(sc2); /* calls pool_put but see spl above */ - } else if (syn_cache_count >= tcp_syn_cache_limit) { - struct syn_cache_head *scp2, *sce; - - TCP_STATINC(TCP_STAT_SC_OVERFLOWED); - /* - * The cache is full. Toss the oldest entry in the - * first non-empty bucket we can find. - * - * XXX We would really like to toss the oldest - * entry in the cache, but we hope that this - * condition doesn't happen very often. - */ - scp2 = scp; - if (TAILQ_EMPTY(&scp2->sch_bucket)) { - sce = &tcp_syn_cache[tcp_syn_cache_size]; - for (++scp2; scp2 != scp; scp2++) { - if (scp2 >= sce) - scp2 = &tcp_syn_cache[0]; - if (! TAILQ_EMPTY(&scp2->sch_bucket)) - break; - } -#ifdef DIAGNOSTIC - /* - * This should never happen; we should always find a - * non-empty bucket. - */ - if (scp2 == scp) - panic("syn_cache_insert: cacheoverflow: " - "impossible"); -#endif - } - sc2 = TAILQ_FIRST(&scp2->sch_bucket); - syn_cache_rm(sc2); - syn_cache_put(sc2); /* calls pool_put but see spl above */ - } - - /* - * Initialize the entry's timer. - */ - sc->sc_rxttot = 0; - sc->sc_rxtshift = 0; - syn_cache_timer_arm(sc); - - /* Link it from tcpcb entry */ - LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); - - /* Put it into the bucket. */ - TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); - scp->sch_length++; - syn_cache_count++; - - TCP_STATINC(TCP_STAT_SC_ADDED); - splx(s); -} - -/* - * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. - * If we have retransmitted an entry the maximum number of times, expire - * that entry. 
- */ -static void -syn_cache_timer(void *arg) -{ - struct syn_cache *sc = arg; - - mutex_enter(softnet_lock); - KERNEL_LOCK(1, NULL); - - callout_ack(&sc->sc_timer); - - if (__predict_false(sc->sc_flags & SCF_DEAD)) { - TCP_STATINC(TCP_STAT_SC_DELAYED_FREE); - goto free; - } - - if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { - /* Drop it -- too many retransmissions. */ - goto dropit; - } - - /* - * Compute the total amount of time this entry has - * been on a queue. If this entry has been on longer - * than the keep alive timer would allow, expire it. - */ - sc->sc_rxttot += sc->sc_rxtcur; - if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS)) - goto dropit; - - TCP_STATINC(TCP_STAT_SC_RETRANSMITTED); - (void)syn_cache_respond(sc); - - /* Advance the timer back-off. */ - sc->sc_rxtshift++; - syn_cache_timer_arm(sc); - - goto out; - - dropit: - TCP_STATINC(TCP_STAT_SC_TIMED_OUT); - syn_cache_rm(sc); - if (sc->sc_ipopts) - (void) m_free(sc->sc_ipopts); - rtcache_free(&sc->sc_route); - - free: - callout_destroy(&sc->sc_timer); - pool_put(&syn_cache_pool, sc); - - out: - KERNEL_UNLOCK_ONE(NULL); - mutex_exit(softnet_lock); -} - -/* - * Remove syn cache created by the specified tcb entry, - * because this does not make sense to keep them - * (if there's no tcb entry, syn cache entry will never be used) - */ -void -syn_cache_cleanup(struct tcpcb *tp) -{ - struct syn_cache *sc, *nsc; - int s; - - s = splsoftnet(); - - for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { - nsc = LIST_NEXT(sc, sc_tpq); - -#ifdef DIAGNOSTIC - if (sc->sc_tp != tp) - panic("invalid sc_tp in syn_cache_cleanup"); -#endif - syn_cache_rm(sc); - syn_cache_put(sc); /* calls pool_put but see spl above */ - } - /* just for safety */ - LIST_INIT(&tp->t_sc); - - splx(s); -} - -/* - * Find an entry in the syn cache. 
- */ -struct syn_cache * -syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst, - struct syn_cache_head **headp) -{ - struct syn_cache *sc; - struct syn_cache_head *scp; - u_int32_t hash; - int s; - - SYN_HASHALL(hash, src, dst); - - scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; - *headp = scp; - s = splsoftnet(); - for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; - sc = TAILQ_NEXT(sc, sc_bucketq)) { - if (sc->sc_hash != hash) - continue; - if (!memcmp(&sc->sc_src, src, src->sa_len) && - !memcmp(&sc->sc_dst, dst, dst->sa_len)) { - splx(s); - return (sc); - } - } - splx(s); - return (NULL); -} - -/* - * This function gets called when we receive an ACK for a socket in the - * LISTEN state. We look up the connection in the syn cache, and if it's - * there, we pull it out of the cache and turn it into a full-blown - * connection in the SYN-RECEIVED state. - * - * The return values may not be immediately obvious, and their effects - * can be subtle, so here they are: - * - * NULL SYN was not found in cache; caller should drop the - * packet and send an RST. - * - * -1 We were unable to create the new connection, and are - * aborting it. An ACK,RST is being sent to the peer - * (unless we got screwey sequence numbers; see below), - * because the 3-way handshake has been completed. Caller - * should not free the mbuf, since we may be using it. If - * we are not, we will free it. - * - * Otherwise, the return value is a pointer to the new socket - * associated with the connection. 
- */ -struct socket * -syn_cache_get(struct sockaddr *src, struct sockaddr *dst, - struct tcphdr *th, struct socket *so, struct mbuf *m) -{ - struct syn_cache *sc; - struct syn_cache_head *scp; - struct inpcb *inp = NULL; -#ifdef INET6 - struct in6pcb *in6p = NULL; -#endif - struct tcpcb *tp; - int s; - struct socket *oso; - - s = splsoftnet(); - if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { - splx(s); - return NULL; - } - - /* - * Verify the sequence and ack numbers. Try getting the correct - * response again. - */ - if ((th->th_ack != sc->sc_iss + 1) || - SEQ_LEQ(th->th_seq, sc->sc_irs) || - SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { - m_freem(m); - (void)syn_cache_respond(sc); - splx(s); - return ((struct socket *)(-1)); - } - - /* Remove this cache entry */ - syn_cache_rm(sc); - splx(s); - - /* - * Ok, create the full blown connection, and set things up - * as they would have been set up if we had created the - * connection when the SYN arrived. If we can't create - * the connection, abort it. - */ - /* - * inp still has the OLD in_pcb stuff, set the - * v6-related flags on the new guy, too. This is - * done particularly for the case where an AF_INET6 - * socket is bound only to a port, and a v4 connection - * comes in on that port. - * we also copy the flowinfo from the original pcb - * to the new one. 
- */ - oso = so; - so = sonewconn(so, true); - if (so == NULL) - goto resetandabort; - - switch (so->so_proto->pr_domain->dom_family) { - case AF_INET: - inp = sotoinpcb(so); - break; -#ifdef INET6 - case AF_INET6: - in6p = sotoin6pcb(so); - break; -#endif - } - - switch (src->sa_family) { - case AF_INET: - if (inp) { - inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; - inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; - inp->inp_options = ip_srcroute(m); - in_pcbstate(inp, INP_BOUND); - if (inp->inp_options == NULL) { - inp->inp_options = sc->sc_ipopts; - sc->sc_ipopts = NULL; - } - } -#ifdef INET6 - else if (in6p) { - /* IPv4 packet to AF_INET6 socket */ - memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr)); - in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); - bcopy(&((struct sockaddr_in *)dst)->sin_addr, - &in6p->in6p_laddr.s6_addr32[3], - sizeof(((struct sockaddr_in *)dst)->sin_addr)); - in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; - in6totcpcb(in6p)->t_family = AF_INET; - if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY) - in6p->in6p_flags |= IN6P_IPV6_V6ONLY; - else - in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY; - in6_pcbstate(in6p, IN6P_BOUND); - } -#endif - break; -#ifdef INET6 - case AF_INET6: - if (in6p) { - in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; - in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; - in6_pcbstate(in6p, IN6P_BOUND); - } - break; -#endif - } - -#ifdef INET6 - if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { - struct in6pcb *oin6p = sotoin6pcb(oso); - /* inherit socket options from the listening socket */ - in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); - if (in6p->in6p_flags & IN6P_CONTROLOPTS) { - m_freem(in6p->in6p_options); - in6p->in6p_options = NULL; - } - ip6_savecontrol(in6p, &in6p->in6p_options, - mtod(m, struct ip6_hdr *), m); - } -#endif - - /* - * Give the new socket our cached route reference. 
- */ - if (inp) { - rtcache_copy(&inp->inp_route, &sc->sc_route); - rtcache_free(&sc->sc_route); - } -#ifdef INET6 - else { - rtcache_copy(&in6p->in6p_route, &sc->sc_route); - rtcache_free(&sc->sc_route); - } -#endif - - if (inp) { - struct sockaddr_in sin; - memcpy(&sin, src, src->sa_len); - if (in_pcbconnect(inp, &sin, &lwp0)) { - goto resetandabort; - } - } -#ifdef INET6 - else if (in6p) { - struct sockaddr_in6 sin6; - memcpy(&sin6, src, src->sa_len); - if (src->sa_family == AF_INET) { - /* IPv4 packet to AF_INET6 socket */ - in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6); - } - if (in6_pcbconnect(in6p, &sin6, NULL)) { - goto resetandabort; - } - } -#endif - else { - goto resetandabort; - } - - if (inp) - tp = intotcpcb(inp); -#ifdef INET6 - else if (in6p) - tp = in6totcpcb(in6p); -#endif - else - tp = NULL; - - tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; - if (sc->sc_request_r_scale != 15) { - tp->requested_s_scale = sc->sc_requested_s_scale; - tp->request_r_scale = sc->sc_request_r_scale; - tp->snd_scale = sc->sc_requested_s_scale; - tp->rcv_scale = sc->sc_request_r_scale; - tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; - } - if (sc->sc_flags & SCF_TIMESTAMP) - tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; - tp->ts_timebase = sc->sc_timebase; - - tp->t_template = tcp_template(tp); - if (tp->t_template == 0) { - tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ - so = NULL; - m_freem(m); - goto abort; - } - - tp->iss = sc->sc_iss; - tp->irs = sc->sc_irs; - tcp_sendseqinit(tp); - tcp_rcvseqinit(tp); - tp->t_state = TCPS_SYN_RECEIVED; - TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit); - TCP_STATINC(TCP_STAT_ACCEPTS); - - if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack) - tp->t_flags |= TF_WILL_SACK; - - if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn) - tp->t_flags |= TF_ECN_PERMIT; - -#ifdef TCP_SIGNATURE - if (sc->sc_flags & SCF_SIGNATURE) - tp->t_flags |= TF_SIGNATURE; -#endif - - /* Initialize tp->t_ourmss before we deal with the peer's! 
*/ - tp->t_ourmss = sc->sc_ourmaxseg; - tcp_mss_from_peer(tp, sc->sc_peermaxseg); - - /* - * Initialize the initial congestion window. If we - * had to retransmit the SYN,ACK, we must initialize cwnd - * to 1 segment (i.e. the Loss Window). - */ - if (sc->sc_rxtshift) - tp->snd_cwnd = tp->t_peermss; - else { - int ss = tcp_init_win; - if (inp != NULL && in_localaddr(inp->inp_faddr)) - ss = tcp_init_win_local; -#ifdef INET6 - if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr)) - ss = tcp_init_win_local; -#endif - tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss); - } - - tcp_rmx_rtt(tp); - tp->snd_wl1 = sc->sc_irs; - tp->rcv_up = sc->sc_irs + 1; - - /* - * This is what would have happened in tcp_output() when - * the SYN,ACK was sent. - */ - tp->snd_up = tp->snd_una; - tp->snd_max = tp->snd_nxt = tp->iss+1; - TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); - if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) - tp->rcv_adv = tp->rcv_nxt + sc->sc_win; - tp->last_ack_sent = tp->rcv_nxt; - tp->t_partialacks = -1; - tp->t_dupacks = 0; - - TCP_STATINC(TCP_STAT_SC_COMPLETED); - s = splsoftnet(); - syn_cache_put(sc); - splx(s); - return so; - -resetandabort: - (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); -abort: - if (so != NULL) { - (void) soqremque(so, 1); - (void) soabort(so); - mutex_enter(softnet_lock); - } - s = splsoftnet(); - syn_cache_put(sc); - splx(s); - TCP_STATINC(TCP_STAT_SC_ABORTED); - return ((struct socket *)(-1)); -} - -/* - * This function is called when we get a RST for a - * non-existent connection, so that we can see if the - * connection is in the syn cache. If it is, zap it. 
- */ - -void -syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) -{ - struct syn_cache *sc; - struct syn_cache_head *scp; - int s = splsoftnet(); - - if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { - splx(s); - return; - } - if (SEQ_LT(th->th_seq, sc->sc_irs) || - SEQ_GT(th->th_seq, sc->sc_irs+1)) { - splx(s); - return; - } - syn_cache_rm(sc); - TCP_STATINC(TCP_STAT_SC_RESET); - syn_cache_put(sc); /* calls pool_put but see spl above */ - splx(s); -} - -void -syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst, - struct tcphdr *th) -{ - struct syn_cache *sc; - struct syn_cache_head *scp; - int s; - - s = splsoftnet(); - if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { - splx(s); - return; - } - /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ - if (ntohl(th->th_seq) != sc->sc_iss) { - splx(s); - return; - } - - /* - * If we've retransmitted 3 times and this is our second error, - * we remove the entry. Otherwise, we allow it to continue on. - * This prevents us from incorrectly nuking an entry during a - * spurious network outage. - * - * See tcp_notify(). - */ - if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { - sc->sc_flags |= SCF_UNREACH; - splx(s); - return; - } - - syn_cache_rm(sc); - TCP_STATINC(TCP_STAT_SC_UNREACH); - syn_cache_put(sc); /* calls pool_put but see spl above */ - splx(s); -} - -/* - * Given a LISTEN socket and an inbound SYN request, add this to the syn - * cache, and send back a segment: - * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> - * to the source. - * - * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. - * Doing so would require that we hold onto the data and deliver it - * to the application. However, if we are the target of a SYN-flood - * DoS attack, an attacker could send data which would eventually - * consume all available buffer space if it were ACKed. By not ACKing - * the data, we avoid this DoS scenario. 
- */ -int -syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, - unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp, - int optlen, struct tcp_opt_info *oi) -{ - struct tcpcb tb, *tp; - long win; - struct syn_cache *sc; - struct syn_cache_head *scp; - struct mbuf *ipopts; - int s; - - tp = sototcpcb(so); - - /* - * Initialize some local state. - */ - win = sbspace(&so->so_rcv); - if (win > TCP_MAXWIN) - win = TCP_MAXWIN; - -#ifdef TCP_SIGNATURE - if (optp || (tp->t_flags & TF_SIGNATURE)) -#else - if (optp) -#endif - { - tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; -#ifdef TCP_SIGNATURE - tb.t_flags |= (tp->t_flags & TF_SIGNATURE); -#endif - tb.t_state = TCPS_LISTEN; - if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0) - return 0; - } else - tb.t_flags = 0; - - switch (src->sa_family) { - case AF_INET: - /* Remember the IP options, if any. */ - ipopts = ip_srcroute(m); - break; - default: - ipopts = NULL; - } - - /* - * See if we already have an entry for this connection. - * If we do, resend the SYN,ACK. We do not count this - * as a retransmission (XXX though maybe we should). - */ - if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { - TCP_STATINC(TCP_STAT_SC_DUPESYN); - if (ipopts) { - /* - * If we were remembering a previous source route, - * forget it and use the new one we've been given. - */ - if (sc->sc_ipopts) - (void)m_free(sc->sc_ipopts); - sc->sc_ipopts = ipopts; - } - sc->sc_timestamp = tb.ts_recent; - m_freem(m); - if (syn_cache_respond(sc) == 0) { - uint64_t *tcps = TCP_STAT_GETREF(); - tcps[TCP_STAT_SNDACKS]++; - tcps[TCP_STAT_SNDTOTAL]++; - TCP_STAT_PUTREF(); - } - return 1; - } - - s = splsoftnet(); - sc = pool_get(&syn_cache_pool, PR_NOWAIT); - splx(s); - if (sc == NULL) { - if (ipopts) - (void)m_free(ipopts); - return 0; - } - - /* - * Fill in the cache, and put the necessary IP and TCP - * options into the reply. 
- */ - memset(sc, 0, sizeof(struct syn_cache)); - callout_init(&sc->sc_timer, CALLOUT_MPSAFE); - memcpy(&sc->sc_src, src, src->sa_len); - memcpy(&sc->sc_dst, dst, dst->sa_len); - sc->sc_flags = 0; - sc->sc_ipopts = ipopts; - sc->sc_irs = th->th_seq; - switch (src->sa_family) { - case AF_INET: - { - struct sockaddr_in *srcin = (void *)src; - struct sockaddr_in *dstin = (void *)dst; - - sc->sc_iss = tcp_new_iss1(&dstin->sin_addr, - &srcin->sin_addr, dstin->sin_port, - srcin->sin_port, sizeof(dstin->sin_addr)); - break; - } -#ifdef INET6 - case AF_INET6: - { - struct sockaddr_in6 *srcin6 = (void *)src; - struct sockaddr_in6 *dstin6 = (void *)dst; - - sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr, - &srcin6->sin6_addr, dstin6->sin6_port, - srcin6->sin6_port, sizeof(dstin6->sin6_addr)); - break; - } -#endif - } - sc->sc_peermaxseg = oi->maxseg; - sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? - m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family); - sc->sc_win = win; - sc->sc_timebase = tcp_now - 1; /* see tcp_newtcpcb() */ - sc->sc_timestamp = tb.ts_recent; - if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == - (TF_REQ_TSTMP|TF_RCVD_TSTMP)) - sc->sc_flags |= SCF_TIMESTAMP; - if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == - (TF_RCVD_SCALE|TF_REQ_SCALE)) { - sc->sc_requested_s_scale = tb.requested_s_scale; - sc->sc_request_r_scale = 0; - /* - * Pick the smallest possible scaling factor that - * will still allow us to scale up to sb_max. - * - * We do this because there are broken firewalls that - * will corrupt the window scale option, leading to - * the other endpoint believing that our advertised - * window is unscaled. At scale factors larger than - * 5 the unscaled window will drop below 1500 bytes, - * leading to serious problems when traversing these - * broken firewalls. - * - * With the default sbmax of 256K, a scale factor - * of 3 will be chosen by this algorithm. 
Those who - * choose a larger sbmax should watch out - * for the compatibility problems mentioned above. - * - * RFC1323: The Window field in a SYN (i.e., a <SYN> - * or <SYN,ACK>) segment itself is never scaled. - */ - while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) - sc->sc_request_r_scale++; - } else { - sc->sc_requested_s_scale = 15; - sc->sc_request_r_scale = 15; - } - if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack) - sc->sc_flags |= SCF_SACK_PERMIT; - - /* - * ECN setup packet received. - */ - if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn) - sc->sc_flags |= SCF_ECN_PERMIT; - -#ifdef TCP_SIGNATURE - if (tb.t_flags & TF_SIGNATURE) - sc->sc_flags |= SCF_SIGNATURE; -#endif - sc->sc_tp = tp; - m_freem(m); - if (syn_cache_respond(sc) == 0) { - uint64_t *tcps = TCP_STAT_GETREF(); - tcps[TCP_STAT_SNDACKS]++; - tcps[TCP_STAT_SNDTOTAL]++; - TCP_STAT_PUTREF(); - syn_cache_insert(sc, tp); - } else { - s = splsoftnet(); - /* - * syn_cache_put() will try to schedule the timer, so - * we need to initialize it - */ - syn_cache_timer_arm(sc); - syn_cache_put(sc); - splx(s); - TCP_STATINC(TCP_STAT_SC_DROPPED); - } - return 1; -} - -/* - * syn_cache_respond: (re)send SYN+ACK. - * - * Returns 0 on success. - */ - -int -syn_cache_respond(struct syn_cache *sc) -{ -#ifdef INET6 - struct rtentry *rt = NULL; -#endif - struct route *ro; - u_int8_t *optp; - int optlen, error; - u_int16_t tlen; - struct ip *ip = NULL; -#ifdef INET6 - struct ip6_hdr *ip6 = NULL; -#endif - struct tcpcb *tp; - struct tcphdr *th; - struct mbuf *m; - u_int hlen; -#ifdef TCP_SIGNATURE - struct secasvar *sav = NULL; - u_int8_t *sigp = NULL; -#endif - - ro = &sc->sc_route; - switch (sc->sc_src.sa.sa_family) { - case AF_INET: - hlen = sizeof(struct ip); - break; -#ifdef INET6 - case AF_INET6: - hlen = sizeof(struct ip6_hdr); - break; -#endif - default: - return EAFNOSUPPORT; - } - - /* Worst case scenario, since we don't know the option size yet. 
*/ - tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN; - KASSERT(max_linkhdr + tlen <= MCLBYTES); - - /* - * Create the IP+TCP header from scratch. - */ - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m && (max_linkhdr + tlen) > MHLEN) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_freem(m); - m = NULL; - } - } - if (m == NULL) - return ENOBUFS; - MCLAIM(m, &tcp_tx_mowner); - - tp = sc->sc_tp; - - /* Fixup the mbuf. */ - m->m_data += max_linkhdr; - m_reset_rcvif(m); - memset(mtod(m, void *), 0, tlen); - - switch (sc->sc_src.sa.sa_family) { - case AF_INET: - ip = mtod(m, struct ip *); - ip->ip_v = 4; - ip->ip_dst = sc->sc_src.sin.sin_addr; - ip->ip_src = sc->sc_dst.sin.sin_addr; - ip->ip_p = IPPROTO_TCP; - th = (struct tcphdr *)(ip + 1); - th->th_dport = sc->sc_src.sin.sin_port; - th->th_sport = sc->sc_dst.sin.sin_port; - break; -#ifdef INET6 - case AF_INET6: - ip6 = mtod(m, struct ip6_hdr *); - ip6->ip6_vfc = IPV6_VERSION; - ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; - ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; - ip6->ip6_nxt = IPPROTO_TCP; - /* ip6_plen will be updated in ip6_output() */ - th = (struct tcphdr *)(ip6 + 1); - th->th_dport = sc->sc_src.sin6.sin6_port; - th->th_sport = sc->sc_dst.sin6.sin6_port; - break; -#endif - default: - panic("%s: impossible (1)", __func__); - } - - th->th_seq = htonl(sc->sc_iss); - th->th_ack = htonl(sc->sc_irs + 1); - th->th_flags = TH_SYN|TH_ACK; - th->th_win = htons(sc->sc_win); - /* th_x2, th_sum, th_urp already 0 from memset */ - - /* Tack on the TCP options. 
*/ - optp = (u_int8_t *)(th + 1); - optlen = 0; - *optp++ = TCPOPT_MAXSEG; - *optp++ = TCPOLEN_MAXSEG; - *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; - *optp++ = sc->sc_ourmaxseg & 0xff; - optlen += TCPOLEN_MAXSEG; - - if (sc->sc_request_r_scale != 15) { - *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | - TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | - sc->sc_request_r_scale); - optp += TCPOLEN_WINDOW + TCPOLEN_NOP; - optlen += TCPOLEN_WINDOW + TCPOLEN_NOP; - } - - if (sc->sc_flags & SCF_SACK_PERMIT) { - /* Let the peer know that we will SACK. */ - *optp++ = TCPOPT_SACK_PERMITTED; - *optp++ = TCPOLEN_SACK_PERMITTED; - optlen += TCPOLEN_SACK_PERMITTED; - } - - if (sc->sc_flags & SCF_TIMESTAMP) { - while (optlen % 4 != 2) { - optlen += TCPOLEN_NOP; - *optp++ = TCPOPT_NOP; - } - *optp++ = TCPOPT_TIMESTAMP; - *optp++ = TCPOLEN_TIMESTAMP; - u_int32_t *lp = (u_int32_t *)(optp); - /* Form timestamp option as shown in appendix A of RFC 1323. */ - *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); - *lp = htonl(sc->sc_timestamp); - optp += TCPOLEN_TIMESTAMP - 2; - optlen += TCPOLEN_TIMESTAMP; - } - -#ifdef TCP_SIGNATURE - if (sc->sc_flags & SCF_SIGNATURE) { - sav = tcp_signature_getsav(m); - if (sav == NULL) { - m_freem(m); - return EPERM; - } - - *optp++ = TCPOPT_SIGNATURE; - *optp++ = TCPOLEN_SIGNATURE; - sigp = optp; - memset(optp, 0, TCP_SIGLEN); - optp += TCP_SIGLEN; - optlen += TCPOLEN_SIGNATURE; - } -#endif - - /* - * Terminate and pad TCP options to a 4 byte boundary. - * - * According to RFC793: "The content of the header beyond the - * End-of-Option option must be header padding (i.e., zero)." - * And later: "The padding is composed of zeros." - */ - if (optlen % 4) { - optlen += TCPOLEN_EOL; - *optp++ = TCPOPT_EOL; - } - while (optlen % 4) { - optlen += TCPOLEN_PAD; - *optp++ = TCPOPT_PAD; - } - - /* Compute the actual values now that we've added the options. 
*/ - tlen = hlen + sizeof(struct tcphdr) + optlen; - m->m_len = m->m_pkthdr.len = tlen; - th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; - -#ifdef TCP_SIGNATURE - if (sav) { - (void)tcp_signature(m, th, hlen, sav, sigp); - key_sa_recordxfer(sav, m); - KEY_SA_UNREF(&sav); - } -#endif - - /* - * Send ECN SYN-ACK setup packet. - * Routes can be asymmetric, so, even if we receive a packet - * with ECE and CWR set, we must not assume no one will block - * the ECE packet we are about to send. - */ - if ((sc->sc_flags & SCF_ECN_PERMIT) && tp && - SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { - th->th_flags |= TH_ECE; - TCP_STATINC(TCP_STAT_ECN_SHS); - - /* - * draft-ietf-tcpm-ecnsyn-00.txt - * - * "[...] a TCP node MAY respond to an ECN-setup - * SYN packet by setting ECT in the responding - * ECN-setup SYN/ACK packet, indicating to routers - * that the SYN/ACK packet is ECN-Capable. - * This allows a congested router along the path - * to mark the packet instead of dropping the - * packet as an indication of congestion." - * - * "[...] There can be a great benefit in setting - * an ECN-capable codepoint in SYN/ACK packets [...] - * Congestion is most likely to occur in - * the server-to-client direction. As a result, - * setting an ECN-capable codepoint in SYN/ACK - * packets can reduce the occurrence of three-second - * retransmit timeouts resulting from the drop - * of SYN/ACK packets." - * - * Page 4 and 6, January 2006. - */ - - switch (sc->sc_src.sa.sa_family) { - case AF_INET: - ip->ip_tos |= IPTOS_ECN_ECT0; - break; -#ifdef INET6 - case AF_INET6: - ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); - break; -#endif - } - TCP_STATINC(TCP_STAT_ECN_ECT); - } - - - /* - * Compute the packet's checksum. - * - * Fill in some straggling IP bits. Note the stack expects - * ip_len to be in host order, for convenience. 
- */ - switch (sc->sc_src.sa.sa_family) { - case AF_INET: - ip->ip_len = htons(tlen - hlen); - th->th_sum = 0; - th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); - ip->ip_len = htons(tlen); - ip->ip_ttl = ip_defttl; - /* XXX tos? */ - break; -#ifdef INET6 - case AF_INET6: - ip6->ip6_plen = htons(tlen - hlen); - th->th_sum = 0; - th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); - ip6->ip6_vfc &= ~IPV6_VERSION_MASK; - ip6->ip6_vfc |= IPV6_VERSION; - ip6->ip6_plen = htons(tlen - hlen); - /* ip6_hlim will be initialized afterwards */ - /* XXX flowlabel? */ - break; -#endif - } - - /* XXX use IPsec policy on listening socket, on SYN ACK */ - tp = sc->sc_tp; - - switch (sc->sc_src.sa.sa_family) { - case AF_INET: - error = ip_output(m, sc->sc_ipopts, ro, - (ip_mtudisc ? IP_MTUDISC : 0), - NULL, tp ? tp->t_inpcb : NULL); - break; -#ifdef INET6 - case AF_INET6: - ip6->ip6_hlim = in6_selecthlim(NULL, - (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL); - rtcache_unref(rt, ro); - - error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL, - tp ? tp->t_in6pcb : NULL, NULL); - break; -#endif - default: - panic("%s: impossible (2)", __func__); - } - - return error; -} Index: src/sys/netinet/tcp_subr.c diff -u src/sys/netinet/tcp_subr.c:1.290 src/sys/netinet/tcp_subr.c:1.291 --- src/sys/netinet/tcp_subr.c:1.290 Mon Jun 27 01:29:51 2022 +++ src/sys/netinet/tcp_subr.c Tue Sep 20 07:19:14 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp_subr.c,v 1.290 2022/06/27 01:29:51 knakahara Exp $ */ +/* $NetBSD: tcp_subr.c,v 1.291 2022/09/20 07:19:14 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
@@ -91,7 +91,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.290 2022/06/27 01:29:51 knakahara Exp $"); +__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.291 2022/09/20 07:19:14 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" @@ -143,6 +143,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v #include <netinet/tcp_vtw.h> #include <netinet/tcp_private.h> #include <netinet/tcp_congctl.h> +#include <netinet/tcp_syncache.h> #ifdef IPSEC #include <netipsec/ipsec.h> @@ -222,14 +223,6 @@ int tcp_vtw_entries = 1 << 4; /* 16 vest #endif int tcbhashsize = TCBHASHSIZE; -/* syn hash parameters */ -#define TCP_SYN_HASH_SIZE 293 -#define TCP_SYN_BUCKET_SIZE 35 -int tcp_syn_cache_size = TCP_SYN_HASH_SIZE; -int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; -int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; -struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE]; - int tcp_freeq(struct tcpcb *); static int tcp_iss_secret_init(void); Index: src/sys/netinet/tcp_usrreq.c diff -u src/sys/netinet/tcp_usrreq.c:1.231 src/sys/netinet/tcp_usrreq.c:1.232 --- src/sys/netinet/tcp_usrreq.c:1.231 Tue Jun 28 01:44:19 2022 +++ src/sys/netinet/tcp_usrreq.c Tue Sep 20 07:19:14 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp_usrreq.c,v 1.231 2022/06/28 01:44:19 riastradh Exp $ */ +/* $NetBSD: tcp_usrreq.c,v 1.232 2022/09/20 07:19:14 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
@@ -99,7 +99,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.231 2022/06/28 01:44:19 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.232 2022/09/20 07:19:14 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" @@ -151,6 +151,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c #include <netinet/tcp_congctl.h> #include <netinet/tcp_debug.h> #include <netinet/tcp_vtw.h> +#include <netinet/tcp_syncache.h> static int tcp_debug_capture(struct tcpcb *tp, int req) Index: src/sys/netinet/tcp_var.h diff -u src/sys/netinet/tcp_var.h:1.196 src/sys/netinet/tcp_var.h:1.197 --- src/sys/netinet/tcp_var.h:1.196 Sat Jul 31 20:29:37 2021 +++ src/sys/netinet/tcp_var.h Tue Sep 20 07:19:14 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp_var.h,v 1.196 2021/07/31 20:29:37 andvar Exp $ */ +/* $NetBSD: tcp_var.h,v 1.197 2022/09/20 07:19:14 ozaki-r Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. @@ -205,6 +205,8 @@ struct sackhole { TAILQ_ENTRY(sackhole) sackhole_q; }; +struct syn_cache; + /* * Tcp control block, one per tcp; fields: */ @@ -520,57 +522,6 @@ struct tcp_opt_info { #define TOF_SIGNATURE 0x0040 /* signature option present */ #define TOF_SIGLEN 0x0080 /* sigature length valid (RFC2385) */ -/* - * Data for the TCP compressed state engine. 
- */ -union syn_cache_sa { - struct sockaddr sa; - struct sockaddr_in sin; -#if 1 /*def INET6*/ - struct sockaddr_in6 sin6; -#endif -}; - -struct syn_cache { - TAILQ_ENTRY(syn_cache) sc_bucketq; /* link on bucket list */ - callout_t sc_timer; /* rexmt timer */ - struct route sc_route; - long sc_win; /* advertised window */ - int sc_bucketidx; /* our bucket index */ - u_int32_t sc_hash; - u_int32_t sc_timestamp; /* timestamp from SYN */ - u_int32_t sc_timebase; /* our local timebase */ - union syn_cache_sa sc_src; - union syn_cache_sa sc_dst; - tcp_seq sc_irs; - tcp_seq sc_iss; - u_int sc_rxtcur; /* current rxt timeout */ - u_int sc_rxttot; /* total time spend on queues */ - u_short sc_rxtshift; /* for computing backoff */ - u_short sc_flags; - -#define SCF_UNREACH 0x0001 /* we've had an unreach error */ -#define SCF_TIMESTAMP 0x0002 /* peer will do timestamps */ -#define SCF_DEAD 0x0004 /* this entry to be released */ -#define SCF_SACK_PERMIT 0x0008 /* peer will do SACK */ -#define SCF_ECN_PERMIT 0x0010 /* peer will do ECN */ -#define SCF_SIGNATURE 0x40 /* send MD5 digests */ - - struct mbuf *sc_ipopts; /* IP options */ - u_int16_t sc_peermaxseg; - u_int16_t sc_ourmaxseg; - u_int8_t sc_request_r_scale : 4, - sc_requested_s_scale : 4; - - struct tcpcb *sc_tp; /* tcb for listening socket */ - LIST_ENTRY(syn_cache) sc_tpq; /* list of entries by same tp */ -}; - -struct syn_cache_head { - TAILQ_HEAD(, syn_cache) sch_bucket; /* bucket entries */ - u_short sch_length; /* # entries in bucket */ -}; - #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #ifdef INET6 #define in6totcpcb(ip) ((struct tcpcb *)(ip)->in6p_ppcb) @@ -803,8 +754,6 @@ extern int tcp_mss_ifmtu; /* take MSS fr extern int tcp_cwm; /* enable Congestion Window Monitoring */ extern int tcp_cwm_burstsize; /* burst size allowed by CWM */ extern int tcp_ack_on_push; /* ACK immediately on PUSH */ -extern int tcp_syn_cache_limit; /* max entries for compressed state engine */ -extern int 
tcp_syn_bucket_limit;/* max entries per hash bucket */ extern int tcp_log_refused; /* log refused connections */ extern int tcp_do_ecn; /* TCP ECN enabled/disabled? */ extern int tcp_ecn_maxretries; /* Max ECN setup retries */ @@ -829,10 +778,6 @@ extern int tcp_vtw_entries; extern int tcp_rst_ppslim; extern int tcp_ackdrop_ppslim; -extern int tcp_syn_cache_size; -extern struct syn_cache_head tcp_syn_cache[]; -extern u_long syn_cache_count; - #ifdef MBUFTRACE extern struct mowner tcp_rx_mowner; extern struct mowner tcp_tx_mowner; @@ -940,24 +885,11 @@ int tcp_sack_numblks(const struct tcpcb void tcp_statinc(u_int); void tcp_statadd(u_int, uint64_t); -int syn_cache_add(struct sockaddr *, struct sockaddr *, - struct tcphdr *, unsigned int, struct socket *, - struct mbuf *, u_char *, int, struct tcp_opt_info *); -void syn_cache_unreach(const struct sockaddr *, const struct sockaddr *, - struct tcphdr *); -struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *, - struct tcphdr *, struct socket *so, struct mbuf *); -void syn_cache_init(void); -void syn_cache_insert(struct syn_cache *, struct tcpcb *); -struct syn_cache *syn_cache_lookup(const struct sockaddr *, const struct sockaddr *, - struct syn_cache_head **); -void syn_cache_reset(struct sockaddr *, struct sockaddr *, - struct tcphdr *); -int syn_cache_respond(struct syn_cache *); -void syn_cache_cleanup(struct tcpcb *); - int tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int, int); + +int tcp_dooptions(struct tcpcb *, const u_char *, int, + struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *); #endif #endif /* !_NETINET_TCP_VAR_H_ */ Index: src/sys/rump/net/lib/libnetinet/Makefile.inc diff -u src/sys/rump/net/lib/libnetinet/Makefile.inc:1.15 src/sys/rump/net/lib/libnetinet/Makefile.inc:1.16 --- src/sys/rump/net/lib/libnetinet/Makefile.inc:1.15 Mon Mar 8 20:43:22 2021 +++ src/sys/rump/net/lib/libnetinet/Makefile.inc Tue Sep 20 07:19:14 2022 @@ -1,4 +1,4 @@ -# $NetBSD: 
Makefile.inc,v 1.15 2021/03/08 20:43:22 christos Exp $ +# $NetBSD: Makefile.inc,v 1.16 2022/09/20 07:19:14 ozaki-r Exp $ # .PATH: ${.CURDIR}/../../../../netinet @@ -15,7 +15,7 @@ SRCS+= if_arp.c # TCP SRCS+= tcp_congctl.c tcp_input.c tcp_output.c tcp_sack.c tcp_subr.c \ - tcp_timer.c tcp_usrreq.c tcp_vtw.c + tcp_syncache.c tcp_timer.c tcp_usrreq.c tcp_vtw.c # UDP SRCS+= udp_usrreq.c Added files: Index: src/sys/netinet/tcp_syncache.c diff -u /dev/null src/sys/netinet/tcp_syncache.c:1.1 --- /dev/null Tue Sep 20 07:19:15 2022 +++ src/sys/netinet/tcp_syncache.c Tue Sep 20 07:19:14 2022 @@ -0,0 +1,1380 @@ +/* $NetBSD: tcp_syncache.c,v 1.1 2022/09/20 07:19:14 ozaki-r Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 + * + * NRL grants permission for redistribution and use in source and binary + * forms, with or without modification, of the software and documentation + * created at NRL provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgements: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * This product includes software developed at the Information + * Technology Division, US Naval Research Laboratory. + * 4. Neither the name of the NRL nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS + * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NRL OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation + * are those of the authors and should not be interpreted as representing + * official policies, either expressed or implied, of the US Naval + * Research Laboratory (NRL). + */ + +/*- + * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006, + * 2011 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Coyote Point Systems, Inc. + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation + * Facility, NASA Ames Research Center. + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum. + * This code is derived from software contributed to The NetBSD Foundation + * by Rui Paulo. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + */ + +/* + * TODO list for SYN cache stuff: + * + * Find room for a "state" field, which is needed to keep a + * compressed state for TIME_WAIT TCBs. It's been noted already + * that this is fairly important for very high-volume web and + * mail servers, which use a large number of short-lived + * connections. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.1 2022/09/20 07:19:14 ozaki-r Exp $"); + +#ifdef _KERNEL_OPT +#include "opt_inet.h" +#include "opt_ipsec.h" +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/errno.h> +#include <sys/syslog.h> +#include <sys/pool.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/lwp.h> /* for lwp0 */ +#include <sys/cprng.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> + +#include <netinet/ip6.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#include <netinet6/in6_pcb.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_var.h> +#endif + +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_private.h> +#include <netinet/tcp_syncache.h> + +#ifdef TCP_SIGNATURE +#ifdef IPSEC 
+#include <netipsec/ipsec.h> +#include <netipsec/key.h> +#ifdef INET6 +#include <netipsec/ipsec6.h> +#endif +#endif /* IPSEC*/ +#endif + +static void syn_cache_timer(void *); + +/* syn hash parameters */ +#define TCP_SYN_HASH_SIZE 293 +#define TCP_SYN_BUCKET_SIZE 35 +static int tcp_syn_cache_size = TCP_SYN_HASH_SIZE; +int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE; +int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE; +static struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE]; + +/* + * TCP compressed state engine. Currently used to hold compressed + * state for SYN_RECEIVED. + */ + +u_long syn_cache_count; +static u_int32_t syn_hash1, syn_hash2; + +#define SYN_HASH(sa, sp, dp) \ + ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \ + ((u_int32_t)(sp)))^syn_hash2))) +#ifndef INET6 +#define SYN_HASHALL(hash, src, dst) \ +do { \ + hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ + ((const struct sockaddr_in *)(src))->sin_port, \ + ((const struct sockaddr_in *)(dst))->sin_port); \ +} while (/*CONSTCOND*/ 0) +#else +#define SYN_HASH6(sa, sp, dp) \ + ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \ + (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \ + & 0x7fffffff) + +#define SYN_HASHALL(hash, src, dst) \ +do { \ + switch ((src)->sa_family) { \ + case AF_INET: \ + hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \ + ((const struct sockaddr_in *)(src))->sin_port, \ + ((const struct sockaddr_in *)(dst))->sin_port); \ + break; \ + case AF_INET6: \ + hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \ + ((const struct sockaddr_in6 *)(src))->sin6_port, \ + ((const struct sockaddr_in6 *)(dst))->sin6_port); \ + break; \ + default: \ + hash = 0; \ + } \ +} while (/*CONSTCOND*/0) +#endif /* INET6 */ + +static struct pool syn_cache_pool; + +/* + * We don't estimate RTT with SYNs, so each packet starts with the default + * RTT and each timer step has a fixed timeout value. 
+ */ +static inline void +syn_cache_timer_arm(struct syn_cache *sc) +{ + + TCPT_RANGESET(sc->sc_rxtcur, + TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, + TCPTV_REXMTMAX); + callout_reset(&sc->sc_timer, + sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc); +} + +#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase) + +static inline void +syn_cache_rm(struct syn_cache *sc) +{ + TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, + sc, sc_bucketq); + sc->sc_tp = NULL; + LIST_REMOVE(sc, sc_tpq); + tcp_syn_cache[sc->sc_bucketidx].sch_length--; + callout_stop(&sc->sc_timer); + syn_cache_count--; +} + +static inline void +syn_cache_put(struct syn_cache *sc) +{ + if (sc->sc_ipopts) + (void) m_free(sc->sc_ipopts); + rtcache_free(&sc->sc_route); + sc->sc_flags |= SCF_DEAD; + if (!callout_invoking(&sc->sc_timer)) + callout_schedule(&(sc)->sc_timer, 1); +} + +void +syn_cache_init(void) +{ + int i; + + pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, + "synpl", NULL, IPL_SOFTNET); + + /* Initialize the hash buckets. */ + for (i = 0; i < tcp_syn_cache_size; i++) + TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); +} + +void +syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) +{ + struct syn_cache_head *scp; + struct syn_cache *sc2; + int s; + + /* + * If there are no entries in the hash table, reinitialize + * the hash secrets. + */ + if (syn_cache_count == 0) { + syn_hash1 = cprng_fast32(); + syn_hash2 = cprng_fast32(); + } + + SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); + sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; + scp = &tcp_syn_cache[sc->sc_bucketidx]; + + /* + * Make sure that we don't overflow the per-bucket + * limit or the total cache size limit. + */ + s = splsoftnet(); + if (scp->sch_length >= tcp_syn_bucket_limit) { + TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW); + /* + * The bucket is full. Toss the oldest element in the + * bucket. This will be the first entry in the bucket. 
+ */ + sc2 = TAILQ_FIRST(&scp->sch_bucket); +#ifdef DIAGNOSTIC + /* + * This should never happen; we should always find an + * entry in our bucket. + */ + if (sc2 == NULL) + panic("syn_cache_insert: bucketoverflow: impossible"); +#endif + syn_cache_rm(sc2); + syn_cache_put(sc2); /* calls pool_put but see spl above */ + } else if (syn_cache_count >= tcp_syn_cache_limit) { + struct syn_cache_head *scp2, *sce; + + TCP_STATINC(TCP_STAT_SC_OVERFLOWED); + /* + * The cache is full. Toss the oldest entry in the + * first non-empty bucket we can find. + * + * XXX We would really like to toss the oldest + * entry in the cache, but we hope that this + * condition doesn't happen very often. + */ + scp2 = scp; + if (TAILQ_EMPTY(&scp2->sch_bucket)) { + sce = &tcp_syn_cache[tcp_syn_cache_size]; + for (++scp2; scp2 != scp; scp2++) { + if (scp2 >= sce) + scp2 = &tcp_syn_cache[0]; + if (! TAILQ_EMPTY(&scp2->sch_bucket)) + break; + } +#ifdef DIAGNOSTIC + /* + * This should never happen; we should always find a + * non-empty bucket. + */ + if (scp2 == scp) + panic("syn_cache_insert: cacheoverflow: " + "impossible"); +#endif + } + sc2 = TAILQ_FIRST(&scp2->sch_bucket); + syn_cache_rm(sc2); + syn_cache_put(sc2); /* calls pool_put but see spl above */ + } + + /* + * Initialize the entry's timer. + */ + sc->sc_rxttot = 0; + sc->sc_rxtshift = 0; + syn_cache_timer_arm(sc); + + /* Link it from tcpcb entry */ + LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); + + /* Put it into the bucket. */ + TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); + scp->sch_length++; + syn_cache_count++; + + TCP_STATINC(TCP_STAT_SC_ADDED); + splx(s); +} + +/* + * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. + * If we have retransmitted an entry the maximum number of times, expire + * that entry. 
+ */ +static void +syn_cache_timer(void *arg) +{ + struct syn_cache *sc = arg; + + mutex_enter(softnet_lock); + KERNEL_LOCK(1, NULL); + + callout_ack(&sc->sc_timer); + + if (__predict_false(sc->sc_flags & SCF_DEAD)) { + TCP_STATINC(TCP_STAT_SC_DELAYED_FREE); + goto free; + } + + if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { + /* Drop it -- too many retransmissions. */ + goto dropit; + } + + /* + * Compute the total amount of time this entry has + * been on a queue. If this entry has been on longer + * than the keep alive timer would allow, expire it. + */ + sc->sc_rxttot += sc->sc_rxtcur; + if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS)) + goto dropit; + + TCP_STATINC(TCP_STAT_SC_RETRANSMITTED); + (void)syn_cache_respond(sc); + + /* Advance the timer back-off. */ + sc->sc_rxtshift++; + syn_cache_timer_arm(sc); + + goto out; + + dropit: + TCP_STATINC(TCP_STAT_SC_TIMED_OUT); + syn_cache_rm(sc); + if (sc->sc_ipopts) + (void) m_free(sc->sc_ipopts); + rtcache_free(&sc->sc_route); + + free: + callout_destroy(&sc->sc_timer); + pool_put(&syn_cache_pool, sc); + + out: + KERNEL_UNLOCK_ONE(NULL); + mutex_exit(softnet_lock); +} + +/* + * Remove syn cache created by the specified tcb entry, + * because this does not make sense to keep them + * (if there's no tcb entry, syn cache entry will never be used) + */ +void +syn_cache_cleanup(struct tcpcb *tp) +{ + struct syn_cache *sc, *nsc; + int s; + + s = splsoftnet(); + + for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { + nsc = LIST_NEXT(sc, sc_tpq); + +#ifdef DIAGNOSTIC + if (sc->sc_tp != tp) + panic("invalid sc_tp in syn_cache_cleanup"); +#endif + syn_cache_rm(sc); + syn_cache_put(sc); /* calls pool_put but see spl above */ + } + /* just for safety */ + LIST_INIT(&tp->t_sc); + + splx(s); +} + +/* + * Find an entry in the syn cache. 
+ */ +struct syn_cache * +syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst, + struct syn_cache_head **headp) +{ + struct syn_cache *sc; + struct syn_cache_head *scp; + u_int32_t hash; + int s; + + SYN_HASHALL(hash, src, dst); + + scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; + *headp = scp; + s = splsoftnet(); + for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; + sc = TAILQ_NEXT(sc, sc_bucketq)) { + if (sc->sc_hash != hash) + continue; + if (!memcmp(&sc->sc_src, src, src->sa_len) && + !memcmp(&sc->sc_dst, dst, dst->sa_len)) { + splx(s); + return (sc); + } + } + splx(s); + return (NULL); +} + +/* + * This function gets called when we receive an ACK for a socket in the + * LISTEN state. We look up the connection in the syn cache, and if it's + * there, we pull it out of the cache and turn it into a full-blown + * connection in the SYN-RECEIVED state. + * + * The return values may not be immediately obvious, and their effects + * can be subtle, so here they are: + * + * NULL SYN was not found in cache; caller should drop the + * packet and send an RST. + * + * -1 We were unable to create the new connection, and are + * aborting it. An ACK,RST is being sent to the peer + * (unless we got screwey sequence numbers; see below), + * because the 3-way handshake has been completed. Caller + * should not free the mbuf, since we may be using it. If + * we are not, we will free it. + * + * Otherwise, the return value is a pointer to the new socket + * associated with the connection. 
+ */
+/*
+ * Arguments:
+ *	src, dst	peer and local address of the incoming segment
+ *	th		TCP header of the incoming segment
+ *	so		the listening socket
+ *	m		the incoming segment
+ *
+ * Return value:
+ *	NULL			no syn cache entry matches (src, dst);
+ *				m has not been touched
+ *	(struct socket *)-1	an entry matched, but either the seq/ack
+ *				numbers were wrong or the connection could
+ *				not be built; m has been freed or handed
+ *				to tcp_respond()
+ *	anything else		the new socket, with its tcpcb set up in
+ *				TCPS_SYN_RECEIVED; m still belongs to the
+ *				caller
+ */
+struct socket *
+syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
+    struct tcphdr *th, struct socket *so, struct mbuf *m)
+{
+	struct syn_cache *sc;
+	struct syn_cache_head *scp;
+	struct inpcb *inp = NULL;
+#ifdef INET6
+	struct in6pcb *in6p = NULL;
+#endif
+	struct tcpcb *tp;
+	int s;
+	struct socket *oso;
+
+	s = splsoftnet();
+	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
+		splx(s);
+		return NULL;
+	}
+
+	/*
+	 * Verify the sequence and ack numbers.  Try getting the correct
+	 * response again.
+	 */
+	if ((th->th_ack != sc->sc_iss + 1) ||
+	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
+	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
+		m_freem(m);
+		(void)syn_cache_respond(sc);
+		splx(s);
+		return ((struct socket *)(-1));
+	}
+
+	/* Remove this cache entry */
+	syn_cache_rm(sc);
+	splx(s);
+
+	/*
+	 * Ok, create the full blown connection, and set things up
+	 * as they would have been set up if we had created the
+	 * connection when the SYN arrived.  If we can't create
+	 * the connection, abort it.
+	 */
+	/*
+	 * inp still has the OLD in_pcb stuff, set the
+	 * v6-related flags on the new guy, too.  This is
+	 * done particularly for the case where an AF_INET6
+	 * socket is bound only to a port, and a v4 connection
+	 * comes in on that port.
+	 * we also copy the flowinfo from the original pcb
+	 * to the new one.
+	 */
+	oso = so;
+	so = sonewconn(so, true);
+	if (so == NULL)
+		goto resetandabort;
+
+	/* Exactly one of inp / in6p is set, depending on the socket family. */
+	switch (so->so_proto->pr_domain->dom_family) {
+	case AF_INET:
+		inp = sotoinpcb(so);
+		break;
+#ifdef INET6
+	case AF_INET6:
+		in6p = sotoin6pcb(so);
+		break;
+#endif
+	}
+
+	/* Bind the new pcb to the address the SYN was sent to. */
+	switch (src->sa_family) {
+	case AF_INET:
+		if (inp) {
+			inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
+			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
+			inp->inp_options = ip_srcroute(m);
+			in_pcbstate(inp, INP_BOUND);
+			if (inp->inp_options == NULL) {
+				/* Fall back to the options cached with the SYN. */
+				inp->inp_options = sc->sc_ipopts;
+				sc->sc_ipopts = NULL;
+			}
+		}
+#ifdef INET6
+		else if (in6p) {
+			/* IPv4 packet to AF_INET6 socket */
+			memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr));
+			in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
+			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
+			    &in6p->in6p_laddr.s6_addr32[3],
+			    sizeof(((struct sockaddr_in *)dst)->sin_addr));
+			in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
+			in6totcpcb(in6p)->t_family = AF_INET;
+			if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
+				in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
+			else
+				in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
+			in6_pcbstate(in6p, IN6P_BOUND);
+		}
+#endif
+		break;
+#ifdef INET6
+	case AF_INET6:
+		if (in6p) {
+			in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
+			in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
+			in6_pcbstate(in6p, IN6P_BOUND);
+		}
+		break;
+#endif
+	}
+
+#ifdef INET6
+	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
+		struct in6pcb *oin6p = sotoin6pcb(oso);
+		/* inherit socket options from the listening socket */
+		in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
+		if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
+			m_freem(in6p->in6p_options);
+			in6p->in6p_options = NULL;
+		}
+		ip6_savecontrol(in6p, &in6p->in6p_options,
+		    mtod(m, struct ip6_hdr *), m);
+	}
+#endif
+
+	/*
+	 * Give the new socket our cached route reference.
+	 *
+	 * The INET6 branch checks in6p explicitly: the connect chain
+	 * below copes with "neither pcb is set" by aborting, so this
+	 * code must not dereference in6p before that check is reached.
+	 */
+	if (inp) {
+		rtcache_copy(&inp->inp_route, &sc->sc_route);
+		rtcache_free(&sc->sc_route);
+	}
+#ifdef INET6
+	else if (in6p) {
+		rtcache_copy(&in6p->in6p_route, &sc->sc_route);
+		rtcache_free(&sc->sc_route);
+	}
+#endif
+
+	/*
+	 * Connect the pcb to the peer and pick up the matching tcpcb.
+	 * tp is assigned on every path that falls out of this chain,
+	 * so there is no "tp == NULL" case to handle afterwards.
+	 */
+	if (inp) {
+		struct sockaddr_in sin;
+		memcpy(&sin, src, src->sa_len);
+		if (in_pcbconnect(inp, &sin, &lwp0)) {
+			goto resetandabort;
+		}
+		tp = intotcpcb(inp);
+	}
+#ifdef INET6
+	else if (in6p) {
+		struct sockaddr_in6 sin6;
+		memcpy(&sin6, src, src->sa_len);
+		if (src->sa_family == AF_INET) {
+			/* IPv4 packet to AF_INET6 socket */
+			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
+		}
+		if (in6_pcbconnect(in6p, &sin6, NULL)) {
+			goto resetandabort;
+		}
+		tp = in6totcpcb(in6p);
+	}
+#endif
+	else {
+		goto resetandabort;
+	}
+
+	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
+	/* A scale value of 15 marks "window scaling not negotiated". */
+	if (sc->sc_request_r_scale != 15) {
+		tp->requested_s_scale = sc->sc_requested_s_scale;
+		tp->request_r_scale = sc->sc_request_r_scale;
+		tp->snd_scale = sc->sc_requested_s_scale;
+		tp->rcv_scale = sc->sc_request_r_scale;
+		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
+	}
+	if (sc->sc_flags & SCF_TIMESTAMP)
+		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
+	tp->ts_timebase = sc->sc_timebase;
+
+	tp->t_template = tcp_template(tp);
+	if (tp->t_template == 0) {
+		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
+		so = NULL;
+		m_freem(m);
+		goto abort;
+	}
+
+	tp->iss = sc->sc_iss;
+	tp->irs = sc->sc_irs;
+	tcp_sendseqinit(tp);
+	tcp_rcvseqinit(tp);
+	tp->t_state = TCPS_SYN_RECEIVED;
+	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
+	TCP_STATINC(TCP_STAT_ACCEPTS);
+
+	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
+		tp->t_flags |= TF_WILL_SACK;
+
+	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
+		tp->t_flags |= TF_ECN_PERMIT;
+
+#ifdef TCP_SIGNATURE
+	if (sc->sc_flags & SCF_SIGNATURE)
+		tp->t_flags |= TF_SIGNATURE;
+#endif
+
+	/* Initialize tp->t_ourmss before we deal with the peer's! */
+	tp->t_ourmss = sc->sc_ourmaxseg;
+	tcp_mss_from_peer(tp, sc->sc_peermaxseg);
+
+	/*
+	 * Initialize the initial congestion window.  If we
+	 * had to retransmit the SYN,ACK, we must initialize cwnd
+	 * to 1 segment (i.e. the Loss Window).
+	 */
+	if (sc->sc_rxtshift)
+		tp->snd_cwnd = tp->t_peermss;
+	else {
+		int ss = tcp_init_win;
+		if (inp != NULL && in_localaddr(inp->inp_faddr))
+			ss = tcp_init_win_local;
+#ifdef INET6
+		if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
+			ss = tcp_init_win_local;
+#endif
+		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
+	}
+
+	tcp_rmx_rtt(tp);
+	tp->snd_wl1 = sc->sc_irs;
+	tp->rcv_up = sc->sc_irs + 1;
+
+	/*
+	 * This is what would have happened in tcp_output() when
+	 * the SYN,ACK was sent.
+	 */
+	tp->snd_up = tp->snd_una;
+	tp->snd_max = tp->snd_nxt = tp->iss+1;
+	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
+	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
+		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
+	tp->last_ack_sent = tp->rcv_nxt;
+	tp->t_partialacks = -1;
+	tp->t_dupacks = 0;
+
+	TCP_STATINC(TCP_STAT_SC_COMPLETED);
+	s = splsoftnet();
+	syn_cache_put(sc);
+	splx(s);
+	return so;
+
+resetandabort:
+	/* m is passed to tcp_respond() here and not referenced again. */
+	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
+abort:
+	if (so != NULL) {
+		(void) soqremque(so, 1);
+		(void) soabort(so);
+		/* NOTE(review): presumably soabort() drops softnet_lock - confirm */
+		mutex_enter(softnet_lock);
+	}
+	s = splsoftnet();
+	syn_cache_put(sc);
+	splx(s);
+	TCP_STATINC(TCP_STAT_SC_ABORTED);
+	return ((struct socket *)(-1));
+}
+
+/*
+ * This function is called when we get a RST for a
+ * non-existent connection, so that we can see if the
+ * connection is in the syn cache.  If it is, zap it.
+ */
+
+void
+syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
+{
+	struct syn_cache_head *bucket;
+	struct syn_cache *entry;
+	int spl;
+
+	spl = splsoftnet();
+	entry = syn_cache_lookup(src, dst, &bucket);
+
+	/*
+	 * Only act on a RST whose sequence number is sc_irs or
+	 * sc_irs+1; anything else leaves the entry alone.
+	 */
+	if (entry != NULL &&
+	    !SEQ_LT(th->th_seq, entry->sc_irs) &&
+	    !SEQ_GT(th->th_seq, entry->sc_irs+1)) {
+		syn_cache_rm(entry);
+		TCP_STATINC(TCP_STAT_SC_RESET);
+		/* calls pool_put but we are still at splsoftnet here */
+		syn_cache_put(entry);
+	}
+
+	splx(spl);
+}
+
+/*
+ * Handle an unreachable error reported for a connection that may still
+ * be sitting in the syn cache.  The first error (or any error before
+ * the third retransmission) only marks the entry with SCF_UNREACH; a
+ * later one removes it.
+ */
+void
+syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
+    struct tcphdr *th)
+{
+	struct syn_cache_head *bucket;
+	struct syn_cache *entry;
+	int spl;
+
+	spl = splsoftnet();
+
+	entry = syn_cache_lookup(src, dst, &bucket);
+	if (entry == NULL)
+		goto out;
+
+	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
+	if (ntohl(th->th_seq) != entry->sc_iss)
+		goto out;
+
+	/*
+	 * The entry is only dropped once we have retransmitted 3 times
+	 * *and* already seen one error.  Until then just remember the
+	 * error, so that a spurious network outage does not nuke an
+	 * otherwise healthy entry.
+	 *
+	 * See tcp_notify().
+	 */
+	if ((entry->sc_flags & SCF_UNREACH) == 0 || entry->sc_rxtshift < 3) {
+		entry->sc_flags |= SCF_UNREACH;
+		goto out;
+	}
+
+	syn_cache_rm(entry);
+	TCP_STATINC(TCP_STAT_SC_UNREACH);
+	/* calls pool_put but we are still at splsoftnet here */
+	syn_cache_put(entry);
+out:
+	splx(spl);
+}
+
+/*
+ * Given a LISTEN socket and an inbound SYN request, add this to the syn
+ * cache, and send back a segment:
+ *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
+ * to the source.
+ *
+ * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
+ * Doing so would require that we hold onto the data and deliver it
+ * to the application.  However, if we are the target of a SYN-flood
+ * DoS attack, an attacker could send data which would eventually
+ * consume all available buffer space if it were ACKed.  By not ACKing
+ * the data, we avoid this DoS scenario.
+ */
+/*
+ * Returns 1 when the segment has been consumed (m has been freed),
+ * 0 when it has not and m still belongs to the caller (option parsing
+ * failed, or no syn cache entry could be allocated).
+ */
+int
+syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
+    unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
+    int optlen, struct tcp_opt_info *oi)
+{
+	struct tcpcb tb, *tp;
+	long win;
+	struct syn_cache *sc;
+	struct syn_cache_head *scp;
+	struct mbuf *ipopts;
+	int s;
+
+	tp = sototcpcb(so);
+
+	/*
+	 * Initialize some local state.
+	 */
+	win = sbspace(&so->so_rcv);
+	if (win > TCP_MAXWIN)
+		win = TCP_MAXWIN;
+
+	/*
+	 * tb is a scratch tcpcb on the stack and is not otherwise
+	 * initialized.  ts_recent is copied into the cache entry
+	 * unconditionally below, but tcp_dooptions() is skipped
+	 * entirely when the SYN carries no options, so give the field
+	 * a defined value instead of leaking stack contents into the
+	 * timestamp echoed by the SYN,ACK.
+	 */
+	tb.ts_recent = 0;
+
+#ifdef TCP_SIGNATURE
+	if (optp || (tp->t_flags & TF_SIGNATURE))
+#else
+	if (optp)
+#endif
+	{
+		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
+#ifdef TCP_SIGNATURE
+		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
+#endif
+		tb.t_state = TCPS_LISTEN;
+		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
+			return 0;
+	} else
+		tb.t_flags = 0;
+
+	switch (src->sa_family) {
+	case AF_INET:
+		/* Remember the IP options, if any. */
+		ipopts = ip_srcroute(m);
+		break;
+	default:
+		ipopts = NULL;
+	}
+
+	/*
+	 * See if we already have an entry for this connection.
+	 * If we do, resend the SYN,ACK.  We do not count this
+	 * as a retransmission (XXX though maybe we should).
+	 */
+	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
+		TCP_STATINC(TCP_STAT_SC_DUPESYN);
+		if (ipopts) {
+			/*
+			 * If we were remembering a previous source route,
+			 * forget it and use the new one we've been given.
+			 */
+			if (sc->sc_ipopts)
+				(void)m_free(sc->sc_ipopts);
+			sc->sc_ipopts = ipopts;
+		}
+		sc->sc_timestamp = tb.ts_recent;
+		m_freem(m);
+		if (syn_cache_respond(sc) == 0) {
+			uint64_t *tcps = TCP_STAT_GETREF();
+			tcps[TCP_STAT_SNDACKS]++;
+			tcps[TCP_STAT_SNDTOTAL]++;
+			TCP_STAT_PUTREF();
+		}
+		return 1;
+	}
+
+	s = splsoftnet();
+	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
+	splx(s);
+	if (sc == NULL) {
+		if (ipopts)
+			(void)m_free(ipopts);
+		return 0;
+	}
+
+	/*
+	 * Fill in the cache, and put the necessary IP and TCP
+	 * options into the reply.
+	 */
+	memset(sc, 0, sizeof(struct syn_cache));
+	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
+	memcpy(&sc->sc_src, src, src->sa_len);
+	memcpy(&sc->sc_dst, dst, dst->sa_len);
+	sc->sc_flags = 0;
+	sc->sc_ipopts = ipopts;
+	sc->sc_irs = th->th_seq;
+	switch (src->sa_family) {
+	case AF_INET:
+	    {
+		struct sockaddr_in *srcin = (void *)src;
+		struct sockaddr_in *dstin = (void *)dst;
+
+		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
+		    &srcin->sin_addr, dstin->sin_port,
+		    srcin->sin_port, sizeof(dstin->sin_addr));
+		break;
+	    }
+#ifdef INET6
+	case AF_INET6:
+	    {
+		struct sockaddr_in6 *srcin6 = (void *)src;
+		struct sockaddr_in6 *dstin6 = (void *)dst;
+
+		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
+		    &srcin6->sin6_addr, dstin6->sin6_port,
+		    srcin6->sin6_port, sizeof(dstin6->sin6_addr));
+		break;
+	    }
+#endif
+	}
+	sc->sc_peermaxseg = oi->maxseg;
+	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
+	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
+	sc->sc_win = win;
+	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
+	sc->sc_timestamp = tb.ts_recent;
+	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
+	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
+		sc->sc_flags |= SCF_TIMESTAMP;
+	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+		sc->sc_requested_s_scale = tb.requested_s_scale;
+		sc->sc_request_r_scale = 0;
+		/*
+		 * Pick the smallest possible scaling factor that
+		 * will still allow us to scale up to sb_max.
+		 *
+		 * We do this because there are broken firewalls that
+		 * will corrupt the window scale option, leading to
+		 * the other endpoint believing that our advertised
+		 * window is unscaled.  At scale factors larger than
+		 * 5 the unscaled window will drop below 1500 bytes,
+		 * leading to serious problems when traversing these
+		 * broken firewalls.
+		 *
+		 * With the default sbmax of 256K, a scale factor
+		 * of 3 will be chosen by this algorithm.  Those who
+		 * choose a larger sbmax should watch out
+		 * for the compatibility problems mentioned above.
+		 *
+		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
+		 * or <SYN,ACK>) segment itself is never scaled.
+		 */
+		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
+		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
+			sc->sc_request_r_scale++;
+	} else {
+		/* 15 is the "window scaling not in use" marker. */
+		sc->sc_requested_s_scale = 15;
+		sc->sc_request_r_scale = 15;
+	}
+	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
+		sc->sc_flags |= SCF_SACK_PERMIT;
+
+	/*
+	 * ECN setup packet received.
+	 *
+	 * NOTE(review): RFC 3168 describes an ECN-setup SYN as having
+	 * both ECE and CWR set; this test accepts either flag - confirm
+	 * that this is intended.
+	 */
+	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
+		sc->sc_flags |= SCF_ECN_PERMIT;
+
+#ifdef TCP_SIGNATURE
+	if (tb.t_flags & TF_SIGNATURE)
+		sc->sc_flags |= SCF_SIGNATURE;
+#endif
+	sc->sc_tp = tp;
+	/* th points into m and must not be used past this point. */
+	m_freem(m);
+	if (syn_cache_respond(sc) == 0) {
+		uint64_t *tcps = TCP_STAT_GETREF();
+		tcps[TCP_STAT_SNDACKS]++;
+		tcps[TCP_STAT_SNDTOTAL]++;
+		TCP_STAT_PUTREF();
+		syn_cache_insert(sc, tp);
+	} else {
+		s = splsoftnet();
+		/*
+		 * syn_cache_put() will try to schedule the timer, so
+		 * we need to initialize it
+		 */
+		syn_cache_timer_arm(sc);
+		syn_cache_put(sc);
+		splx(s);
+		TCP_STATINC(TCP_STAT_SC_DROPPED);
+	}
+	return 1;
+}
+
+/*
+ * syn_cache_respond: (re)send SYN+ACK.
+ *
+ * Builds a fresh IP/IPv6 + TCP header with the options recorded in the
+ * cache entry and hands it to ip_output() / ip6_output().
+ *
+ * Returns 0 on success.  Otherwise EAFNOSUPPORT (address family is
+ * neither AF_INET nor AF_INET6), ENOBUFS (no mbuf), EPERM (TCP_SIGNATURE
+ * only: no security association), or the error from the output routine.
+ */
+
+int
+syn_cache_respond(struct syn_cache *sc)
+{
+#ifdef INET6
+	struct rtentry *rt = NULL;
+#endif
+	struct route *ro;
+	u_int8_t *optp;
+	int optlen, error;
+	u_int16_t tlen;
+	struct ip *ip = NULL;
+#ifdef INET6
+	struct ip6_hdr *ip6 = NULL;
+#endif
+	struct tcpcb *tp;
+	struct tcphdr *th;
+	struct mbuf *m;
+	u_int hlen;
+#ifdef TCP_SIGNATURE
+	struct secasvar *sav = NULL;
+	u_int8_t *sigp = NULL;
+#endif
+
+	ro = &sc->sc_route;
+	switch (sc->sc_src.sa.sa_family) {
+	case AF_INET:
+		hlen = sizeof(struct ip);
+		break;
+#ifdef INET6
+	case AF_INET6:
+		hlen = sizeof(struct ip6_hdr);
+		break;
+#endif
+	default:
+		return EAFNOSUPPORT;
+	}
+
+	/* Worst case scenario, since we don't know the option size yet. */
+	tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
+	KASSERT(max_linkhdr + tlen <= MCLBYTES);
+
+	/*
+	 * Create the IP+TCP header from scratch.
+	 */
+	MGETHDR(m, M_DONTWAIT, MT_DATA);
+	if (m && (max_linkhdr + tlen) > MHLEN) {
+		MCLGET(m, M_DONTWAIT);
+		if ((m->m_flags & M_EXT) == 0) {
+			m_freem(m);
+			m = NULL;
+		}
+	}
+	if (m == NULL)
+		return ENOBUFS;
+	MCLAIM(m, &tcp_tx_mowner);
+
+	tp = sc->sc_tp;
+
+	/* Fixup the mbuf. */
+	m->m_data += max_linkhdr;
+	m_reset_rcvif(m);
+	memset(mtod(m, void *), 0, tlen);
+
+	/* The reply goes back to the SYN's source, so src/dst are swapped. */
+	switch (sc->sc_src.sa.sa_family) {
+	case AF_INET:
+		ip = mtod(m, struct ip *);
+		ip->ip_v = 4;
+		ip->ip_dst = sc->sc_src.sin.sin_addr;
+		ip->ip_src = sc->sc_dst.sin.sin_addr;
+		ip->ip_p = IPPROTO_TCP;
+		th = (struct tcphdr *)(ip + 1);
+		th->th_dport = sc->sc_src.sin.sin_port;
+		th->th_sport = sc->sc_dst.sin.sin_port;
+		break;
+#ifdef INET6
+	case AF_INET6:
+		ip6 = mtod(m, struct ip6_hdr *);
+		ip6->ip6_vfc = IPV6_VERSION;
+		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
+		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
+		ip6->ip6_nxt = IPPROTO_TCP;
+		/* ip6_plen will be updated in ip6_output() */
+		th = (struct tcphdr *)(ip6 + 1);
+		th->th_dport = sc->sc_src.sin6.sin6_port;
+		th->th_sport = sc->sc_dst.sin6.sin6_port;
+		break;
+#endif
+	default:
+		panic("%s: impossible (1)", __func__);
+	}
+
+	th->th_seq = htonl(sc->sc_iss);
+	th->th_ack = htonl(sc->sc_irs + 1);
+	th->th_flags = TH_SYN|TH_ACK;
+	th->th_win = htons(sc->sc_win);
+	/* th_x2, th_sum, th_urp already 0 from memset */
+
+	/* Tack on the TCP options. */
+	optp = (u_int8_t *)(th + 1);
+	optlen = 0;
+	*optp++ = TCPOPT_MAXSEG;
+	*optp++ = TCPOLEN_MAXSEG;
+	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
+	*optp++ = sc->sc_ourmaxseg & 0xff;
+	optlen += TCPOLEN_MAXSEG;
+
+	/* 15 is the "window scaling not in use" marker. */
+	if (sc->sc_request_r_scale != 15) {
+		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
+		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
+		    sc->sc_request_r_scale);
+		optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
+		optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
+	}
+
+	if (sc->sc_flags & SCF_SACK_PERMIT) {
+		/* Let the peer know that we will SACK. */
+		*optp++ = TCPOPT_SACK_PERMITTED;
+		*optp++ = TCPOLEN_SACK_PERMITTED;
+		optlen += TCPOLEN_SACK_PERMITTED;
+	}
+
+	if (sc->sc_flags & SCF_TIMESTAMP) {
+		/* Pad so that the two timestamp words are 4-byte aligned. */
+		while (optlen % 4 != 2) {
+			optlen += TCPOLEN_NOP;
+			*optp++ = TCPOPT_NOP;
+		}
+		*optp++ = TCPOPT_TIMESTAMP;
+		*optp++ = TCPOLEN_TIMESTAMP;
+		u_int32_t *lp = (u_int32_t *)(optp);
+		/* Form timestamp option as shown in appendix A of RFC 1323. */
+		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
+		*lp   = htonl(sc->sc_timestamp);
+		optp += TCPOLEN_TIMESTAMP - 2;
+		optlen += TCPOLEN_TIMESTAMP;
+	}
+
+#ifdef TCP_SIGNATURE
+	if (sc->sc_flags & SCF_SIGNATURE) {
+		sav = tcp_signature_getsav(m);
+		if (sav == NULL) {
+			m_freem(m);
+			return EPERM;
+		}
+
+		*optp++ = TCPOPT_SIGNATURE;
+		*optp++ = TCPOLEN_SIGNATURE;
+		/* The digest itself is filled in once the header is final. */
+		sigp = optp;
+		memset(optp, 0, TCP_SIGLEN);
+		optp += TCP_SIGLEN;
+		optlen += TCPOLEN_SIGNATURE;
+	}
+#endif
+
+	/*
+	 * Terminate and pad TCP options to a 4 byte boundary.
+	 *
+	 * According to RFC793: "The content of the header beyond the
+	 * End-of-Option option must be header padding (i.e., zero)."
+	 * And later: "The padding is composed of zeros."
+	 */
+	if (optlen % 4) {
+		optlen += TCPOLEN_EOL;
+		*optp++ = TCPOPT_EOL;
+	}
+	while (optlen % 4) {
+		optlen += TCPOLEN_PAD;
+		*optp++ = TCPOPT_PAD;
+	}
+
+	/* Compute the actual values now that we've added the options. */
+	tlen = hlen + sizeof(struct tcphdr) + optlen;
+	m->m_len = m->m_pkthdr.len = tlen;
+	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+
+#ifdef TCP_SIGNATURE
+	if (sav) {
+		(void)tcp_signature(m, th, hlen, sav, sigp);
+		key_sa_recordxfer(sav, m);
+		KEY_SA_UNREF(&sav);
+	}
+#endif
+
+	/*
+	 * Send ECN SYN-ACK setup packet.
+	 * Routes can be asymmetric, so, even if we receive a packet
+	 * with ECE and CWR set, we must not assume no one will block
+	 * the ECE packet we are about to send.
+	 */
+	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
+	    SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
+		th->th_flags |= TH_ECE;
+		TCP_STATINC(TCP_STAT_ECN_SHS);
+
+		/*
+		 * draft-ietf-tcpm-ecnsyn-00.txt
+		 *
+		 * "[...] a TCP node MAY respond to an ECN-setup
+		 * SYN packet by setting ECT in the responding
+		 * ECN-setup SYN/ACK packet, indicating to routers
+		 * that the SYN/ACK packet is ECN-Capable.
+		 * This allows a congested router along the path
+		 * to mark the packet instead of dropping the
+		 * packet as an indication of congestion."
+		 *
+		 * "[...] There can be a great benefit in setting
+		 * an ECN-capable codepoint in SYN/ACK packets [...]
+		 * Congestion is most likely to occur in
+		 * the server-to-client direction.  As a result,
+		 * setting an ECN-capable codepoint in SYN/ACK
+		 * packets can reduce the occurrence of three-second
+		 * retransmit timeouts resulting from the drop
+		 * of SYN/ACK packets."
+		 *
+		 * Page 4 and 6, January 2006.
+		 */
+
+		switch (sc->sc_src.sa.sa_family) {
+		case AF_INET:
+			ip->ip_tos |= IPTOS_ECN_ECT0;
+			break;
+#ifdef INET6
+		case AF_INET6:
+			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+			break;
+#endif
+		}
+		TCP_STATINC(TCP_STAT_ECN_ECT);
+	}
+
+
+	/*
+	 * Compute the packet's checksum.
+	 *
+	 * Fill in some straggling IP bits.  ip_len first holds the TCP
+	 * length while the checksum is computed and is then set to the
+	 * length of the whole datagram; both values are stored in
+	 * network byte order.
+	 */
+	switch (sc->sc_src.sa.sa_family) {
+	case AF_INET:
+		ip->ip_len = htons(tlen - hlen);
+		th->th_sum = 0;
+		th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
+		ip->ip_len = htons(tlen);
+		ip->ip_ttl = ip_defttl;
+		/* XXX tos? */
+		break;
+#ifdef INET6
+	case AF_INET6:
+		ip6->ip6_plen = htons(tlen - hlen);
+		th->th_sum = 0;
+		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
+		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
+		ip6->ip6_vfc |= IPV6_VERSION;
+		ip6->ip6_plen = htons(tlen - hlen);
+		/* ip6_hlim will be initialized afterwards */
+		/* XXX flowlabel? */
+		break;
+#endif
+	}
+
+	/* XXX use IPsec policy on listening socket, on SYN ACK */
+	tp = sc->sc_tp;
+
+	switch (sc->sc_src.sa.sa_family) {
+	case AF_INET:
+		error = ip_output(m, sc->sc_ipopts, ro,
+		    (ip_mtudisc ? IP_MTUDISC : 0),
+		    NULL, tp ? tp->t_inpcb : NULL);
+		break;
+#ifdef INET6
+	case AF_INET6:
+		ip6->ip6_hlim = in6_selecthlim(NULL,
+		    (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
+		rtcache_unref(rt, ro);
+
+		error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
+		    tp ? tp->t_in6pcb : NULL, NULL);
+		break;
+#endif
+	default:
+		panic("%s: impossible (2)", __func__);
+	}
+
+	return error;
+}
Index: src/sys/netinet/tcp_syncache.h
diff -u /dev/null src/sys/netinet/tcp_syncache.h:1.1
--- /dev/null Tue Sep 20 07:19:15 2022
+++ src/sys/netinet/tcp_syncache.h Tue Sep 20 07:19:14 2022
@@ -0,0 +1,222 @@
+/* $NetBSD: tcp_syncache.h,v 1.1 2022/09/20 07:19:14 ozaki-r Exp $ */
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 + * + * NRL grants permission for redistribution and use in source and binary + * forms, with or without modification, of the software and documentation + * created at NRL provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgements: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * This product includes software developed at the Information + * Technology Division, US Naval Research Laboratory. + * 4. Neither the name of the NRL nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS + * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation + * are those of the authors and should not be interpreted as representing + * official policies, either expressed or implied, of the US Naval + * Research Laboratory (NRL). + */ + +/*- + * Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, + * NASA Ames Research Center. + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 + */ + +#ifndef _NETINET_TCP_SYNCACHE_H_ +#define _NETINET_TCP_SYNCACHE_H_ + +#if defined(_KERNEL_OPT) +#include "opt_inet.h" +#include "opt_mbuftrace.h" +#endif + +#ifdef _KERNEL +#include <sys/callout.h> +#include <sys/mbuf.h> +#include <sys/queue.h> + +#include <net/route.h> + +/* + * Data for the TCP compressed state engine. 
+ */
+union syn_cache_sa {
+	struct sockaddr sa;		/* family-independent view */
+	struct sockaddr_in sin;
+#if 1 /*def INET6*/
+	struct sockaddr_in6 sin6;
+#endif
+};
+
+struct syn_cache {
+	TAILQ_ENTRY(syn_cache) sc_bucketq;	/* link on bucket list */
+	callout_t sc_timer;			/* rexmt timer */
+	struct route sc_route;			/* route used for the SYN,ACK */
+	long sc_win;				/* advertised window */
+	int sc_bucketidx;			/* our bucket index */
+	u_int32_t sc_hash;
+	u_int32_t sc_timestamp;			/* timestamp from SYN */
+	u_int32_t sc_timebase;			/* our local timebase */
+	union syn_cache_sa sc_src;		/* peer (source of the SYN) */
+	union syn_cache_sa sc_dst;		/* us (destination of the SYN) */
+	tcp_seq sc_irs;				/* sequence number of the SYN */
+	tcp_seq sc_iss;				/* sequence number of our SYN,ACK */
+	u_int sc_rxtcur;			/* current rxt timeout */
+	u_int sc_rxttot;			/* total time spent on queues */
+	u_short sc_rxtshift;			/* for computing backoff */
+	u_short sc_flags;			/* SCF_* below */
+
+#define	SCF_UNREACH		0x0001		/* we've had an unreach error */
+#define	SCF_TIMESTAMP		0x0002		/* peer will do timestamps */
+#define	SCF_DEAD		0x0004		/* this entry to be released */
+#define	SCF_SACK_PERMIT		0x0008		/* peer will do SACK */
+#define	SCF_ECN_PERMIT		0x0010		/* peer will do ECN */
+#define	SCF_SIGNATURE		0x0040		/* send MD5 digests */
+
+	struct mbuf *sc_ipopts;			/* IP options */
+	u_int16_t sc_peermaxseg;		/* MSS announced by the peer */
+	u_int16_t sc_ourmaxseg;			/* MSS we put in the SYN,ACK */
+	/* window scale shifts; 15 in both means scaling is not in use */
+	u_int8_t sc_request_r_scale	: 4,
+		 sc_requested_s_scale	: 4;
+
+	struct tcpcb *sc_tp;			/* tcb for listening socket */
+	LIST_ENTRY(syn_cache) sc_tpq;		/* list of entries by same tp */
+};
+
+struct syn_cache_head {
+	TAILQ_HEAD(, syn_cache) sch_bucket;	/* bucket entries */
+	u_short sch_length;			/* # entries in bucket */
+};
+
+extern int tcp_syn_bucket_limit;/* max entries per hash bucket */
+extern int tcp_syn_cache_limit;	/* max entries for compressed state engine */
+extern u_long syn_cache_count;
+
+int syn_cache_add(struct sockaddr *, struct sockaddr *,
+		struct tcphdr *, unsigned int, struct socket *,
+		struct mbuf *, u_char *, int, struct tcp_opt_info *);
+void syn_cache_unreach(const struct sockaddr *, const struct sockaddr *,
+		struct tcphdr *);
+struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
+		struct tcphdr *, struct socket *, struct mbuf *);
+void syn_cache_init(void);
+void syn_cache_insert(struct syn_cache *, struct tcpcb *);
+struct syn_cache *syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
+		struct syn_cache_head **);
+void syn_cache_reset(struct sockaddr *, struct sockaddr *,
+		struct tcphdr *);
+int syn_cache_respond(struct syn_cache *);
+void syn_cache_cleanup(struct tcpcb *);
+#endif
+
+#endif /* !_NETINET_TCP_SYNCACHE_H_ */