> -----Original Message----- > From: Ilpo Järvinen <i...@kernel.org> > Sent: Tuesday, May 6, 2025 12:54 AM > To: Paolo Abeni <pab...@redhat.com> > Cc: Chia-Yu Chang (Nokia) <chia-yu.ch...@nokia-bell-labs.com>; > ho...@kernel.org; dsah...@kernel.org; kun...@amazon.com; > b...@vger.kernel.org; net...@vger.kernel.org; dave.t...@gmail.com; > j...@mojatatu.com; k...@kernel.org; step...@networkplumber.org; > xiyou.wangc...@gmail.com; j...@resnulli.us; da...@davemloft.net; > eduma...@google.com; andrew+net...@lunn.ch; donald.hun...@gmail.com; > a...@fiberby.net; liuhang...@gmail.com; sh...@kernel.org; > linux-kselftest@vger.kernel.org; ncardw...@google.com; Koen De Schepper > (Nokia) <koen.de_schep...@nokia-bell-labs.com>; g.white > <g.wh...@cablelabs.com>; ingemar.s.johans...@ericsson.com; > mirja.kuehlew...@ericsson.com; chesh...@apple.com; rs.i...@gmx.at; > jason_living...@comcast.com; vidhi_goel <vidhi_g...@apple.com> > Subject: Re: [PATCH v5 net-next 09/15] tcp: accecn: AccECN option > > > CAUTION: This is an external email. Please be very careful when clicking > links or opening attachments. See the URL nok.it/ext for additional > information. > > > > On Tue, 29 Apr 2025, Paolo Abeni wrote: > > > On 4/22/25 5:35 PM, chia-yu.ch...@nokia-bell-labs.com wrote: > > > @@ -302,10 +303,13 @@ struct tcp_sock { > > > u32 snd_up; /* Urgent pointer */ > > > u32 delivered; /* Total data packets delivered incl. rexmits > > > */ > > > u32 delivered_ce; /* Like the above but only ECE marked packets > > > */ > > > + u32 delivered_ecn_bytes[3]; > > > > This new fields do not belong to this cacheline group. I'm unsure they > > belong to fast-path at all. Also u32 will wrap-around very soon. > > > > [...] > > > diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h > > > index dc8fdc80e16b..74ac8a5d2e00 100644 > > > --- a/include/uapi/linux/tcp.h > > > +++ b/include/uapi/linux/tcp.h > > > @@ -298,6 +298,13 @@ struct tcp_info { > > > __u32 tcpi_snd_wnd; /* peer's advertised receive window after > > > * scaling (bytes) > > > */ > > > + __u32 tcpi_received_ce; /* # of CE marks received */ > > > + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ > > > + __u32 tcpi_delivered_e0_bytes; > > > + __u32 tcpi_delivered_ce_bytes; > > > + __u32 tcpi_received_e1_bytes; > > > + __u32 tcpi_received_e0_bytes; > > > + __u32 tcpi_received_ce_bytes; > > > > This will break uAPI: new fields must be addded at the end, or must > > fill existing holes. Also u32 set in stone in uAPI for a byte counter > > looks way too small. > > > > > @@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) > > > /* 32bit arches with 8byte alignment on u64 fields might need padding > > > * before tcp_clock_cache. > > > */ > > > - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 > > > + 7); > > > + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, > > > + tcp_sock_write_txrx, 122 + 6); > > > > The above means an additional cacheline in fast-path WRT the current > > status. IMHO should be avoided. > > > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index > > > 5bd7fc9bcf66..41e45b9aff3f 100644 > > > --- a/net/ipv4/tcp_input.c > > > +++ b/net/ipv4/tcp_input.c > > > @@ -70,6 +70,7 @@ > > > #include <linux/sysctl.h> > > > #include <linux/kernel.h> > > > #include <linux/prefetch.h> > > > +#include <linux/bitops.h> > > > #include <net/dst.h> > > > #include <net/tcp.h> > > > #include <net/proto_memory.h> > > > @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct > > > tcp_sock *tp, const struct tcphdr > > > return false; > > > } > > > > > > +/* Maps IP ECN field ECT/CE code point to AccECN option field > > > +number, given > > > + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). > > > + */ > > > +static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) { > > > + switch (ecnfield) { > > > + case INET_ECN_NOT_ECT: > > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > > + case INET_ECN_ECT_1: > > > + return 1; > > > + case INET_ECN_CE: > > > + return 2; > > > + case INET_ECN_ECT_0: > > > + return 3; > > > + default: > > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > > > No WARN_ONCE() above please: either the 'ecnfield' data is masked vs > > INET_ECN_MASK and the WARN_ONCE should not be possible or a remote > > sender can deterministically trigger a WARN() which nowadays will in > > turn raise a CVE... > > > > [...] > > > +static u32 tcp_accecn_field_init_offset(u8 ecnfield) { > > > + switch (ecnfield) { > > > + case INET_ECN_NOT_ECT: > > > + return 0; /* AccECN does not send counts of NOT_ECT */ > > > + case INET_ECN_ECT_1: > > > + return TCP_ACCECN_E1B_INIT_OFFSET; > > > + case INET_ECN_CE: > > > + return TCP_ACCECN_CEB_INIT_OFFSET; > > > + case INET_ECN_ECT_0: > > > + return TCP_ACCECN_E0B_INIT_OFFSET; > > > + default: > > > + WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield); > > > > Same as above. > > > > > + } > > > + return 0; > > > +} > > > + > > > +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ > > > +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int > > > optfield, > > > + bool order) { > > > + u8 tmp; > > > + > > > + optfield = order ? 2 - optfield : optfield; > > > + tmp = optfield + 2; > > > + > > > + return (tmp + (tmp >> 2)) & INET_ECN_MASK; } > > > + > > > +/* Handles AccECN option ECT and CE 24-bit byte counters update > > > +into > > > + * the u32 value in tcp_sock. As we're processing TCP options, it > > > +is > > > + * safe to access from - 1. > > > + */ > > > +static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 > > > +init_offset) { > > > + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & > > > + 0xFFFFFFU; > > > + u32 delta = (truncated - *cnt) & 0xFFFFFFU; > > > + > > > + /* If delta has the highest bit set (24th bit) indicating > > > + * negative, sign extend to correct an estimation using > > > + * sign_extend32(delta, 24 - 1) > > > + */ > > > + delta = sign_extend32(delta, 23); > > > + *cnt += delta; > > > + return (s32)delta; > > > +} > > > + > > > +/* Returns true if the byte counters can be used */ static bool > > > +tcp_accecn_process_option(struct tcp_sock *tp, > > > + const struct sk_buff *skb, > > > + u32 delivered_bytes, int flag) { > > > + u8 estimate_ecnfield = tp->est_ecnfield; > > > + bool ambiguous_ecn_bytes_incr = false; > > > + bool first_changed = false; > > > + unsigned int optlen; > > > + unsigned char *ptr; > > u8 would we more appropriate type for binary data.
Hi Ilpo, Not sure I understand your point, could you elaborate which binary data you think shall use u8? > > > > + bool order1, res; > > > + unsigned int i; > > > + > > > + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { > > > + if (estimate_ecnfield) { > > > + u8 ecnfield = estimate_ecnfield - 1; > > > + > > > + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; > > > + return true; > > > + } > > > + return false; > > > + } > > > + > > > + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; > > > + optlen = ptr[1] - 2; > > > > This assumes optlen is greater then 2, but I don't see the relevant > > check. > > The options parser should check that, please see the "silly options" > check. > > > Are tcp options present at all? > > There is !tp->rx_opt.accecn check above which should ensure we're processing > only AccECN Option that is present. > > > > + WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != > > > + TCPOPT_ACCECN1); > > > > Please, don't warn for arbitrary wrong data sent from the peer. > > If there isn't AccECN option at ptr, there's bug elsewhere in the code (in > the option parse code). So this is an internal sanity check that > tp->rx_opt.accecn points to AccECN option for real like it should. > > If you still want that removed, no problem but it's should not be arbitrary > data at this point because the options parsing code should have validated > this condition already, thus WARN_ON_ONCE() seemed appropriate to me. Indeed, then I will keep this for next version, but can be adjust once receiving further feedback. > > > > + order1 = (ptr[0] == TCPOPT_ACCECN1); > > > + ptr += 2; > > > + > > > + res = !!estimate_ecnfield; > > > + for (i = 0; i < 3; i++) { > > > + if (optlen >= TCPOLEN_ACCECN_PERFIELD) { > > It's easy to reverse logic here and use continue, which buys one level of > indentation. Sure, thanks for explicit suggestion, will do. Chia-Yu > > > > + u32 init_offset; > > > + u8 ecnfield; > > > + s32 delta; > > > + u32 *cnt; > > > + > > > + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); > > > + init_offset = tcp_accecn_field_init_offset(ecnfield); > > > + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; > > > + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); > > > + if (delta) { > > > + if (delta < 0) { > > > + res = false; > > > + ambiguous_ecn_bytes_incr = true; > > > + } > > > + if (ecnfield != estimate_ecnfield) { > > > + if (!first_changed) { > > > + tp->est_ecnfield = ecnfield; > > > + first_changed = true; > > > + } else { > > > + res = false; > > > + ambiguous_ecn_bytes_incr = > > > true; > > > + } > > > > At least 2 indentation levels above the maximum readable. > > > > [...] > > > @@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net, > > > > > > ptr = (const unsigned char *)(th + 1); > > > opt_rx->saw_tstamp = 0; > > > + opt_rx->accecn = 0; > > > opt_rx->saw_unknown = 0; > > > > It would be good to be able to zero both 'accecn' and 'saw_unknown' > > with a single statement. > > > > [...] > > > @@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, > > > struct tcp_sock *tp, > > > *ptr++ = htonl(opts->tsecr); > > > } > > > > > > + if (OPTION_ACCECN & options) { > > > + const u8 ect0_idx = INET_ECN_ECT_0 - 1; > > > + const u8 ect1_idx = INET_ECN_ECT_1 - 1; > > > + const u8 ce_idx = INET_ECN_CE - 1; > > > + u32 e0b; > > > + u32 e1b; > > > + u32 ceb; > > > + u8 len; > > > + > > > + e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; > > > + e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; > > > + ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; > > > + len = TCPOLEN_ACCECN_BASE + > > > + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; > > > + > > > + if (opts->num_accecn_fields == 2) { > > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > > + ((e1b >> 8) & 0xffff)); > > > + *ptr++ = htonl(((e1b & 0xff) << 24) | > > > + (ceb & 0xffffff)); > > > + } else if (opts->num_accecn_fields == 1) { > > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > > + ((e1b >> 8) & 0xffff)); > > > + leftover_bytes = ((e1b & 0xff) << 8) | > > > + TCPOPT_NOP; > > > + leftover_size = 1; > > > + } else if (opts->num_accecn_fields == 0) { > > > + leftover_bytes = (TCPOPT_ACCECN1 << 8) | len; > > > + leftover_size = 2; > > > + } else if (opts->num_accecn_fields == 3) { > > > + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | > > > + ((e1b >> 8) & 0xffff)); > > > + *ptr++ = htonl(((e1b & 0xff) << 24) | > > > + (ceb & 0xffffff)); > > > + *ptr++ = htonl(((e0b & 0xffffff) << 8) | > > > + TCPOPT_NOP); > > > > The above chunck and the contents of patch 7 must be in the same patch. > > This split makes the review even harder. > > > > [...] > > > @@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct > > > sock *sk, struct sk_buff *skb > > > opts->num_sack_blocks = 0; > > > } > > > > > > + if (tcp_ecn_mode_accecn(tp) && > > > + sock_net(sk)->ipv4.sysctl_tcp_ecn_option) { > > > + int saving = opts->num_sack_blocks > 0 ? 2 : 0; > > > + int remaining = MAX_TCP_OPTION_SPACE - size; > > > > AFACS the above means tcp_options_fit_accecn() must clear any already > > set options, but apparently it does not do so. Have you tested with > > something adding largish options like mptcp? > > This "fitting" for AccEcn option is not to make room for the option but to > check if AccECN option fits and in what length, and how it can take advantage > of some nop bytes when available to save option space. > > -- > i.