On Sat, 26 Apr 2025 at 18:20, <[email protected]> wrote:
>
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 9ea874395717..b4caeccbea72 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -1210,4 +1210,28 @@ enum {
>
>  #define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
>
> +/* DUALPI2 */
> +enum {
> +        TCA_DUALPI2_UNSPEC,
> +        TCA_DUALPI2_LIMIT,            /* Packets */
> +        TCA_DUALPI2_MEMORY_LIMIT,     /* Bytes */
> +        TCA_DUALPI2_TARGET,           /* us */
> +        TCA_DUALPI2_TUPDATE,          /* us */
> +        TCA_DUALPI2_ALPHA,            /* Hz scaled up by 256 */
> +        TCA_DUALPI2_BETA,             /* Hz scaled up by 256 */
> +        TCA_DUALPI2_STEP_THRESH,      /* Packets or us */
> +        TCA_DUALPI2_STEP_PACKETS,     /* Whether STEP_THRESH is in packets */
> +        TCA_DUALPI2_MIN_QLEN_STEP,    /* Minimum qlen to apply STEP_THRESH */
> +        TCA_DUALPI2_COUPLING,         /* Coupling factor between queues */
> +        TCA_DUALPI2_DROP_OVERLOAD,    /* Whether to drop on overload */
> +        TCA_DUALPI2_DROP_EARLY,       /* Whether to drop on enqueue */
> +        TCA_DUALPI2_C_PROTECTION,     /* Percentage */
> +        TCA_DUALPI2_ECN_MASK,         /* L4S queue classification mask */
> +        TCA_DUALPI2_SPLIT_GSO,        /* Split GSO packets at enqueue */
> +        TCA_DUALPI2_PAD,
> +        __TCA_DUALPI2_MAX
> +};
> +
> +#define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1)

Need to define enums here as part of UAPI for each of the new enum
types you include in the tc.yaml patch.
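For instance, something along these lines (an untested sketch; the
names are only illustrative and must match whatever the tc.yaml patch
declares):

    /* Values match the existing bool semantics: 0 keeps the current
     * behaviour, 1 enables the alternative one.
     */
    enum tc_dualpi2_drop_overload {
            TC_DUALPI2_DROP_OVERLOAD_OVERFLOW,      /* tail-drop on overflow */
            TC_DUALPI2_DROP_OVERLOAD_DROP,          /* drop on overload */
    };

    enum tc_dualpi2_drop_early {
            TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE,     /* drop at dequeue */
            TC_DUALPI2_DROP_EARLY_DROP_ENQUEUE,     /* drop at enqueue */
    };

That way both the kernel and userspace parse these attributes against
named constants instead of interpreting a raw u8.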
>  #endif
> diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
> new file mode 100644
> index 000000000000..bb3d9982b572
> --- /dev/null
> +++ b/net/sched/sch_dualpi2.c
> @@ -0,0 +1,554 @@
> +// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
> +/* Copyright (C) 2024 Nokia
> + *
> + * Author: Koen De Schepper <[email protected]>
> + * Author: Olga Albisser <[email protected]>
> + * Author: Henrik Steen <[email protected]>
> + * Author: Olivier Tilmans <[email protected]>
> + * Author: Chia-Yu Chang <[email protected]>
> + *
> + * DualPI Improved with a Square (dualpi2):
> + * - Supports congestion controls that comply with the Prague requirements
> + *   in RFC9331 (e.g. TCP-Prague)
> + * - Supports coupled dual-queue with PI2 as defined in RFC9332
> + * - Supports ECN L4S-identifier (IP.ECN==0b*1)
> + *
> + * Note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
> + * they do not meet the 'Prague L4S Requirements' listed in RFC 9331
> + * Section 4, so they can only be used with DualPI2 in a datacenter
> + * context.
> + *
> + * References:
> + * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332
> + * - De Schepper, Koen, et al. "PI2: A linearized AQM for both classic and
> + *   scalable TCP." in proc. ACM CoNEXT'16, 2016.
> + */
> +
> +#include <linux/errno.h>
> +#include <linux/hrtimer.h>
> +#include <linux/if_vlan.h>
> +#include <linux/kernel.h>
> +#include <linux/limits.h>
> +#include <linux/module.h>
> +#include <linux/skbuff.h>
> +#include <linux/types.h>
> +
> +#include <net/gso.h>
> +#include <net/inet_ecn.h>
> +#include <net/pkt_cls.h>
> +#include <net/pkt_sched.h>
> +
> +/* 32b probabilities enable support for flows with windows up to
> + * ~8.6 * 1e9 packets, i.e., twice the maximal snd_cwnd.
> + * MAX_PROB must be consistent with the RNG in dualpi2_roll().
> + */
> +#define MAX_PROB U32_MAX
> +
> +/* alpha/beta values exchanged over netlink are in units of 256ns */
> +#define ALPHA_BETA_SHIFT 8
> +
> +/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
> + * computations. Consequently (see dualpi2_scale_alpha_beta()), their
> + * netlink-provided values can use at most 31b, i.e. be at most (2^31)-1
> + * (~8.4MHz) as those are given in 1/256th. This enables tuning alpha/beta
> + * to control flows whose maximal RTTs can range from usec up to a few secs.
> + */
> +#define ALPHA_BETA_MAX ((1U << 31) - 1)
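To make the netlink encoding concrete, here is a hypothetical userspace
helper (not part of the patch), assuming the UAPI comment "Hz scaled up
by 256" is the intended encoding of TCA_DUALPI2_ALPHA/TCA_DUALPI2_BETA:

    #include <stdint.h>

    /* Convert a PI gain given in Hz into the value carried over
     * netlink, i.e. Hz scaled up by 2^ALPHA_BETA_SHIFT = 256.
     */
    static uint32_t dualpi2_gain_to_nl(double hz)
    {
            return (uint32_t)(hz * (1 << 8));  /* 1 Hz -> 256, 0.25 Hz -> 64 */
    }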
> +
> +/* Internal alpha/beta are in units of 64ns.
> + * This enables using all alpha/beta values in the allowed range without
> + * loss of precision due to rounding when scaling them internally, e.g.,
> + * scale_alpha_beta(1) will not round down to 0.
> + */
> +#define ALPHA_BETA_GRANULARITY 6
> +
> +#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
> +
> +/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
> +#define MAX_WC 100
> +
> +struct dualpi2_sched_data {
> +        struct Qdisc *l_queue;  /* The L4S Low latency queue (L-queue) */
> +        struct Qdisc *sch;      /* The Classic queue (C-queue) */
> +
> +        /* Registered tc filters */
> +        struct tcf_proto __rcu *tcf_filters;
> +        struct tcf_block *tcf_block;
> +
> +        /* PI2 parameters */
> +        u64     pi2_target;     /* Target delay in nanoseconds */
> +        u32     pi2_tupdate;    /* Timer frequency in nanoseconds */
> +        u32     pi2_prob;       /* Base PI probability */
> +        u32     pi2_alpha;      /* Gain factor for the integral rate response */
> +        u32     pi2_beta;       /* Gain factor for the proportional response */
> +        struct hrtimer pi2_timer; /* prob update timer */
> +
> +        /* Step AQM (L-queue only) parameters */
> +        u32     step_thresh;    /* Step threshold */
> +        bool    step_in_packets; /* Whether the step is in packets or time */
> +
> +        /* C-queue starvation protection */
> +        s32     c_protection_credit; /* Credit (sign indicates which queue) */
> +        s32     c_protection_init; /* Reset value of the credit */
> +        u8      c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
> +        u8      c_protection_wl; /* L-queue weight (MAX_WC - wc) */
> +
> +        /* General dualQ parameters */
> +        u32     memory_limit;   /* Memory limit of both queues */
> +        u8      coupling_factor; /* Coupling factor (k) between both queues */
> +        u8      ecn_mask;       /* Mask to match packets into L-queue */
> +        u32     min_qlen_step;  /* Minimum queue length to apply step thresh */
> +        bool    drop_early;     /* Drop at enqueue instead of dequeue if true */
> +        bool    drop_overload;  /* Drop (1) on overload, or overflow (0) */
> +        bool    split_gso;      /* Split aggregated skb (1) or leave as is */
> +
> +        /* Statistics */
> +        u64     c_head_ts;      /* Enqueue timestamp of the C-queue head */
> +        u64     l_head_ts;      /* Enqueue timestamp of the L-queue head */
> +        u64     last_qdelay;    /* Q delay val at the last probability update */
> +        u32     packets_in_c;   /* Enqueue packet counter of the C-queue */
> +        u32     packets_in_l;   /* Enqueue packet counter of the L-queue */
> +        u32     maxq;           /* Maximum queue size of the C-queue */
> +        u32     ecn_mark;       /* ECN mark pkt counter due to PI probability */
> +        u32     step_marks;     /* ECN mark pkt counter due to step AQM */
> +        u32     memory_used;    /* Memory used by both queues */
> +        u32     max_memory_used; /* Maximum used memory */
> +};
> +
> +static u32 dualpi2_scale_alpha_beta(u32 param)
> +{
> +        u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
> +
> +        do_div(tmp, NSEC_PER_SEC);
> +        return tmp;
> +}
> +
> +static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
> +{
> +        return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
> +}
> +
> +static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
> +{
> +        q->c_protection_credit = q->c_protection_init;
> +}
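The "will not round down to 0" claim above is easy to verify with a
standalone userspace mock of the same arithmetic (not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* dualpi2_scale_alpha_beta(1) with ALPHA_BETA_SCALING = 2:
             * (1 * MAX_PROB) >> 2 = 2^30 - 1, then / 1e9 ~= 1.07,
             * which floors to 1 rather than 0.
             */
            uint64_t tmp = ((uint64_t)1 * UINT32_MAX) >> 2;

            printf("%llu\n", (unsigned long long)(tmp / 1000000000ULL));
            return 0;
    }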
> +
> +/* This computes the initial credit value and WRR weight for the L queue (wl)
> + * from the weight of the C queue (wc).
> + * If wl > wc, the scheduler will start with the L queue when reset.
> + */
> +static void dualpi2_calculate_c_protection(struct Qdisc *sch,
> +                                           struct dualpi2_sched_data *q, u32 wc)
> +{
> +        q->c_protection_wc = wc;
> +        q->c_protection_wl = MAX_WC - wc;
> +        q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
> +                ((int)q->c_protection_wc - (int)q->c_protection_wl);
> +        dualpi2_reset_c_protection(q);
> +}
> +
> +static s64 __scale_delta(u64 diff)
> +{
> +        do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
> +        return diff;
> +}
> +
> +static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
> +                             u64 *qdelay_l)
> +{
> +        u64 now, qc, ql;
> +
> +        now = ktime_get_ns();
> +        qc = READ_ONCE(q->c_head_ts);
> +        ql = READ_ONCE(q->l_head_ts);
> +
> +        *qdelay_c = qc ? now - qc : 0;
> +        *qdelay_l = ql ? now - ql : 0;
> +}
> +
> +static u32 calculate_probability(struct Qdisc *sch)
> +{
> +        struct dualpi2_sched_data *q = qdisc_priv(sch);
> +        u32 new_prob;
> +        u64 qdelay_c;
> +        u64 qdelay_l;
> +        u64 qdelay;
> +        s64 delta;
> +
> +        get_queue_delays(q, &qdelay_c, &qdelay_l);
> +        qdelay = max(qdelay_l, qdelay_c);
> +        /* Alpha and beta take at most 32b, i.e., the delay difference would
> +         * overflow for queuing delay differences > ~4.2sec.
> +         */
> +        delta = ((s64)qdelay - q->pi2_target) * q->pi2_alpha;
> +        delta += ((s64)qdelay - q->last_qdelay) * q->pi2_beta;
> +        if (delta > 0) {
> +                new_prob = __scale_delta(delta) + q->pi2_prob;
> +                if (new_prob < q->pi2_prob)
> +                        new_prob = MAX_PROB;
> +        } else {
> +                new_prob = q->pi2_prob - __scale_delta(~delta + 1);
> +                if (new_prob > q->pi2_prob)
> +                        new_prob = 0;
> +        }
> +        q->last_qdelay = qdelay;
> +        /* If we do not drop on overload, ensure we cap the L4S probability to
> +         * 100% to keep window fairness when overflowing.
> +         */
> +        if (!q->drop_overload)
> +                return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
> +        return new_prob;
> +}
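The saturating add/subtract here is the subtle part, so a standalone
userspace mock of just that logic (with the __scale_delta() step left
out) may help; both boundary cases clamp as intended:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_PROB UINT32_MAX

    static uint32_t pi2_update(uint32_t prob, int64_t delta)
    {
            uint32_t new_prob;

            if (delta > 0) {
                    new_prob = prob + (uint32_t)delta;
                    if (new_prob < prob)    /* wrapped -> clamp to 100% */
                            new_prob = MAX_PROB;
            } else {
                    new_prob = prob - (uint32_t)(-delta);
                    if (new_prob > prob)    /* wrapped -> clamp to 0% */
                            new_prob = 0;
            }
            return new_prob;
    }

    int main(void)
    {
            printf("%u\n", pi2_update(10, -20));            /* 0 */
            printf("%u\n", pi2_update(MAX_PROB - 10, 20));  /* 4294967295 */
            return 0;
    }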
> +
> +static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
> +{
> +        /* Apply a rule of thumb, i.e., doubling the packet length,
> +         * to further account for per-packet overhead in memory_limit.
> +         */
> +        u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
> +
> +        if (upper_32_bits(memlim))
> +                return 0xffffffff;
> +        else
> +                return lower_32_bits(memlim);
> +}
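A quick worked example of that rule of thumb (made-up numbers): with
limit = 1000 packets on a 1500-byte-MTU device, the memory limit comes
out at 1000 * 2 * 1500 = 3,000,000 bytes, and anything overflowing 32
bits is clamped to 0xffffffff:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t memlim = (uint64_t)1000 * (2 * 1500);

            if (memlim >> 32)       /* the upper_32_bits() check above */
                    memlim = UINT32_MAX;
            printf("%llu\n", (unsigned long long)memlim);   /* 3000000 */
            return 0;
    }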
> +
> +static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
> +{
> +        struct dualpi2_sched_data *q = from_timer(q, timer, pi2_timer);
> +
> +        WRITE_ONCE(q->pi2_prob, calculate_probability(q->sch));
> +
> +        hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
> +        return HRTIMER_RESTART;
> +}
> +
> +static struct netlink_range_validation dualpi2_alpha_beta_range = {
> +        .min = 1,
> +        .max = ALPHA_BETA_MAX,
> +};
> +
> +static struct netlink_range_validation dualpi2_wc_range = {
> +        .min = 0,
> +        .max = MAX_WC,
> +};
> +
> +static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
> +        [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
> +        [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
> +        [TCA_DUALPI2_TARGET] = {.type = NLA_U32},
> +        [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1),
> +        [TCA_DUALPI2_ALPHA] =
> +                NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
> +        [TCA_DUALPI2_BETA] =
> +                NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
> +        [TCA_DUALPI2_STEP_THRESH] = {.type = NLA_U32},
> +        [TCA_DUALPI2_STEP_PACKETS] = {.type = NLA_FLAG},
> +        [TCA_DUALPI2_MIN_QLEN_STEP] = {.type = NLA_U32},
> +        [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1),
> +        [TCA_DUALPI2_DROP_OVERLOAD] = {.type = NLA_U8},
> +        [TCA_DUALPI2_DROP_EARLY] = {.type = NLA_U8},
> +        [TCA_DUALPI2_C_PROTECTION] =
> +                NLA_POLICY_FULL_RANGE(NLA_U8, &dualpi2_wc_range),
> +        [TCA_DUALPI2_ECN_MASK] = {.type = NLA_U8},
> +        [TCA_DUALPI2_SPLIT_GSO] = {.type = NLA_U8},
> +};
> +
> +static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
> +                          struct netlink_ext_ack *extack)
> +{
> +        struct nlattr *tb[TCA_DUALPI2_MAX + 1];
> +        struct dualpi2_sched_data *q;
> +        int old_backlog;
> +        int old_qlen;
> +        int err;
> +
> +        if (!opt)
> +                return -EINVAL;
> +        err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
> +                               extack);
> +        if (err < 0)
> +                return err;
> +
> +        q = qdisc_priv(sch);
> +        sch_tree_lock(sch);
> +
> +        if (tb[TCA_DUALPI2_LIMIT]) {
> +                u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
> +
> +                WRITE_ONCE(sch->limit, limit);
> +                WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
> +        }
> +
> +        if (tb[TCA_DUALPI2_MEMORY_LIMIT])
> +                WRITE_ONCE(q->memory_limit,
> +                           nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
> +
> +        if (tb[TCA_DUALPI2_TARGET]) {
> +                u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
> +
> +                WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
> +        }
> +
> +        if (tb[TCA_DUALPI2_TUPDATE]) {
> +                u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
> +
> +                WRITE_ONCE(q->pi2_tupdate, tupdate * NSEC_PER_USEC);
> +        }
> +
> +        if (tb[TCA_DUALPI2_ALPHA]) {
> +                u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
> +
> +                WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
> +        }
> +
> +        if (tb[TCA_DUALPI2_BETA]) {
> +                u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
> +
> +                WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
> +        }
> +
> +        if (tb[TCA_DUALPI2_STEP_THRESH]) {
> +                u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH]);
> +                bool step_pkt = nla_get_flag(tb[TCA_DUALPI2_STEP_PACKETS]);
> +
> +                WRITE_ONCE(q->step_in_packets, step_pkt);
> +                WRITE_ONCE(q->step_thresh,
> +                           step_pkt ? step_th : (step_th * NSEC_PER_USEC));
> +        }
> +
> +        if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
> +                WRITE_ONCE(q->min_qlen_step,
> +                           nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
> +
> +        if (tb[TCA_DUALPI2_COUPLING]) {
> +                u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
> +
> +                WRITE_ONCE(q->coupling_factor, coupling);
> +        }
> +
> +        if (tb[TCA_DUALPI2_DROP_OVERLOAD])
> +                WRITE_ONCE(q->drop_overload,
> +                           !!nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]));

You declared an enum for this in tc.yaml, so this should be implemented
using an enum that you define as part of UAPI in pkt_sched.h.

> +
> +        if (tb[TCA_DUALPI2_DROP_EARLY])
> +                WRITE_ONCE(q->drop_early,
> +                           !!nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]));

Same here: you declared an enum for this in tc.yaml, so it should be
implemented using an enum that you define as part of UAPI in
pkt_sched.h.

> +
> +        if (tb[TCA_DUALPI2_C_PROTECTION]) {
> +                u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
> +
> +                dualpi2_calculate_c_protection(sch, q, wc);
> +        }
> +
> +        if (tb[TCA_DUALPI2_ECN_MASK])
> +                WRITE_ONCE(q->ecn_mask,
> +                           nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]));

You declared flags for this in tc.yaml, so this should be implemented
using flags that you define as part of UAPI in pkt_sched.h, and you
should check the provided value for validity before accepting it.

> +
> +        if (tb[TCA_DUALPI2_SPLIT_GSO])
> +                WRITE_ONCE(q->split_gso,
> +                           !!nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]));

This looks like another flag or enum, but it is declared as a u8 in
tc.yaml. Please fix the discrepancy.

> +
> +        old_qlen = qdisc_qlen(sch);
> +        old_backlog = sch->qstats.backlog;
> +        while (qdisc_qlen(sch) > sch->limit ||
> +               q->memory_used > q->memory_limit) {
> +                struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
> +
> +                q->memory_used -= skb->truesize;
> +                qdisc_qstats_backlog_dec(sch, skb);
> +                rtnl_qdisc_drop(skb, sch);
> +        }
> +        qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
> +                                  old_backlog - sch->qstats.backlog);
> +
> +        sch_tree_unlock(sch);
> +        return 0;
> +}
> +
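Concretely, the ecn_mask handling could look something like the sketch
below (untested; the names are only illustrative, and the check could
equally live in dualpi2_policy[] or run before sch_tree_lock() is
taken). The mask selects which ECT codepoints are classified into the
L-queue; CE always matches:

    /* pkt_sched.h */
    enum tc_dualpi2_ecn_mask {
            TC_DUALPI2_ECN_MASK_L4S_ECT = 1,        /* ECT(1) + CE */
            TC_DUALPI2_ECN_MASK_ANY_ECT = 3,        /* ECT(0) + ECT(1) + CE */
    };

    /* sch_dualpi2.c, in dualpi2_change() */
    if (tb[TCA_DUALPI2_ECN_MASK]) {
            u8 mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);

            if (mask != TC_DUALPI2_ECN_MASK_L4S_ECT &&
                mask != TC_DUALPI2_ECN_MASK_ANY_ECT) {
                    NL_SET_ERR_MSG_ATTR(extack, tb[TCA_DUALPI2_ECN_MASK],
                                        "invalid ecn_mask");
                    sch_tree_unlock(sch);
                    return -EINVAL;
            }
            WRITE_ONCE(q->ecn_mask, mask);
    }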
