* Rebased on e5d92c1a5485 ("cirrus: Update to use FreeBSD 12.4.").
* Style changes to follow current code base (reverse xmas trees,
alignments).
 Documentation/topics/dpdk/phy.rst |  77 ++++++++
 NEWS                              |   3 +
 lib/netdev-dpdk.c                 | 302 +++++++++++++++++++++++++++++-
 vswitchd/vswitch.xml              |  26 +++
 4 files changed, 405 insertions(+), 3 deletions(-)
diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst
index 4b0fe8dded3a..518b67134639 100644
--- a/Documentation/topics/dpdk/phy.rst
+++ b/Documentation/topics/dpdk/phy.rst
@@ -131,6 +131,83 @@ possible with DPDK acceleration. It is possible to configure multiple Rx queues
for ``dpdk`` ports, thus ensuring this is not a bottleneck for performance. For
information on configuring PMD threads, refer to :doc:`pmd`.

+Control Plane Protection
+------------------------
+
+.. warning:: This feature is experimental.
+
+Some control protocols are used to maintain link status between forwarding
+engines. In SDN environments, these packets share the same physical network
+as the user data traffic.
+
+When the system is not sized properly, the PMD threads may not be able to
+process all incoming traffic from the configured Rx queues. When a signaling
+packet of one of these protocols is dropped, it can cause link flapping,
+worsening the situation.
+
+Some physical NICs can be programmed to put these protocols in a dedicated
+hardware Rx queue using the rte_flow__ API.
+
+__ https://doc.dpdk.org/guides-22.11/prog_guide/rte_flow.html
+
+The currently supported control plane protocols are:
+
+``lacp``
+ `Link Aggregation Control Protocol`__. Ether type ``0x8809``.
+
+ __ https://www.ieee802.org/3/ad/public/mar99/seaman_1_0399.pdf
+
+.. warning::
+
+   This feature is not compatible with all NICs. Refer to the DPDK
+   `compatibility matrix`__ and vendor documentation for more details.
+
+ __ https://doc.dpdk.org/guides-22.11/nics/overview.html
+
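+As a rough illustration, the rule installed for ``lacp`` is similar to the
+following rte_flow sketch (simplified, error handling omitted; ``port_id``
+and ``n_rxq`` stand for the DPDK port id and the total number of Rx queues
+configured on it)::
+
+    /* Match on the LACP ether type only. */
+    const struct rte_flow_attr attr = { .ingress = 1 };
+    const struct rte_flow_item_eth eth_spec = { .type = htons(0x8809) };
+    const struct rte_flow_item_eth eth_mask = { .type = htons(0xffff) };
+    const struct rte_flow_item items[] = {
+        {
+            .type = RTE_FLOW_ITEM_TYPE_ETH,
+            .spec = &eth_spec,
+            .mask = &eth_mask,
+        },
+        { .type = RTE_FLOW_ITEM_TYPE_END },
+    };
+    /* Steer matched packets to the last (extra) Rx queue. */
+    const struct rte_flow_action_queue queue = { .index = n_rxq - 1 };
+    const struct rte_flow_action actions[] = {
+        { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
+        { .type = RTE_FLOW_ACTION_TYPE_END },
+    };
+    struct rte_flow_error error;
+    struct rte_flow *flow;
+
+    flow = rte_flow_create(port_id, &attr, items, actions, &error);
+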
+Control plane protection must be enabled for specific protocols on each port.
+The ``cp-protection`` option takes a comma-separated list of protocol names::
+
+ $ ovs-vsctl add-port br0 dpdk-p0 -- set Interface dpdk-p0 type=dpdk \
+ options:dpdk-devargs=0000:01:00.0 options:n_rxq=2 \
+ options:cp-protection=lacp
+
+.. note::
+
+   If multiple Rx queues are already configured, regular RSS (Receive Side
+   Scaling) queue balancing is done on all but the extra control plane
+   protection queue. For example, with ``n_rxq=2``, queues 0 and 1 are
+   balanced with RSS while the extra queue 2 receives the protected traffic.
+
+.. tip::
+
+ You can check if control plane protection is supported on a port with the
+ following command::
+
+ $ ovs-vsctl get interface dpdk-p0 status
+ {cp_protection_queue="2", driver_name=..., rss_queues="0-1"}
+
+ This will also show in ``ovs-vswitchd.log``::
+
+ INFO|dpdk-p0: cp-protection: redirecting lacp traffic to queue 2
+ INFO|dpdk-p0: cp-protection: applying rss on queues 0-1
+
+   If the hardware does not support redirecting control plane traffic to
+   a dedicated queue, this will be explicitly reported::
+
+ $ ovs-vsctl get interface dpdk-p0 status
+ {cp_protection=unsupported, driver_name=...}
+
+ More details can often be found in ``ovs-vswitchd.log``::
+
+ WARN|dpdk-p0: cp-protection: failed to add lacp flow: Unsupported pattern
+
+To disable control plane protection on a port, use the following command::
+
+ $ ovs-vsctl remove Interface dpdk-p0 options cp-protection
+
+You can see that it has been disabled in ``ovs-vswitchd.log``::
+
+ INFO|dpdk-p0: cp-protection: disabled
+
.. _dpdk-phy-flow-control:

Flow Control
diff --git a/NEWS b/NEWS
index 2f6ededfe47d..685a9aae5427 100644
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,9 @@ Post-v3.0.0
     * Conntrack IPv6 fragment support.
   - DPDK:
     * Add support for DPDK 22.11.1.
+    * New experimental "cp-protection=<protocol>" option to redirect certain
+      protocols (for now, only LACP) to a dedicated hardware queue using
+      RTE flow.
   - For the QoS max-rate and STP/RSTP path-cost configuration OVS now assumes
     10 Gbps link speed by default in case the actual link speed cannot be
     determined. Previously it was 10 Mbps. Values can still be overridden
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 5e2d64651db3..edfe825f397d 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -414,6 +414,10 @@ enum dpdk_hw_ol_features {
NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
};
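+
+/* Flags of the control plane protocols supported by cp-protection. */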
+enum dpdk_cp_prot_flags {
+ DPDK_CP_PROT_LACP = 1 << 0,
+};
+
/*
 * In order to avoid confusion in variables names, following naming convention
 * should be used, if possible:
@@ -508,6 +512,9 @@ struct netdev_dpdk {
int requested_rxq_size;
int requested_txq_size;
+ /* User input for n_rxq (see netdev_dpdk_reconfigure). */
+ int user_n_rxq;
+
/* Number of rx/tx descriptors for physical devices */
int rxq_size;
int txq_size;
@@ -533,6 +540,13 @@ struct netdev_dpdk {
/* VF configuration. */
struct eth_addr requested_hwaddr;
+
+ /* Requested control plane protection flags,
+ * from the enum set 'dpdk_cp_prot_flags'. */
+ uint64_t requested_cp_prot_flags;
+ uint64_t cp_prot_flags;
+ size_t cp_prot_flows_num;
+ struct rte_flow **cp_prot_flows;
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
@@ -1299,6 +1313,7 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no,
dev->attached = false;
dev->started = false;
dev->reset_needed = false;
+ dev->tx_q = NULL;
ovsrcu_init(&dev->qos_conf, NULL);
@@ -1309,9 +1324,14 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no,
netdev->n_rxq = 0;
netdev->n_txq = 0;
dev->requested_n_rxq = NR_QUEUE;
+ dev->user_n_rxq = NR_QUEUE;
dev->requested_n_txq = NR_QUEUE;
dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
+ dev->requested_cp_prot_flags = 0;
+ dev->cp_prot_flags = 0;
+ dev->cp_prot_flows_num = 0;
+ dev->cp_prot_flows = NULL;
/* Initialize the flow control to NULL */
memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
@@ -1486,6 +1506,8 @@ common_destruct(struct netdev_dpdk *dev)
ovs_mutex_destroy(&dev->mutex);
}
+static void dpdk_cp_prot_unconfigure(struct netdev_dpdk *dev);
+
static void
netdev_dpdk_destruct(struct netdev *netdev)
{
@@ -1493,6 +1515,9 @@ netdev_dpdk_destruct(struct netdev *netdev)
ovs_mutex_lock(&dpdk_mutex);
+ /* Destroy any rte flows to allow RXQs to be removed. */
+ dpdk_cp_prot_unconfigure(dev);
+
rte_eth_dev_stop(dev->port_id);
dev->started = false;
@@ -1907,8 +1932,8 @@ dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
int new_n_rxq;
new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
- if (new_n_rxq != dev->requested_n_rxq) {
- dev->requested_n_rxq = new_n_rxq;
+ if (new_n_rxq != dev->user_n_rxq) {
+ dev->user_n_rxq = new_n_rxq;
netdev_request_reconfigure(&dev->up);
}
}
@@ -1930,6 +1955,48 @@ dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
}
}
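
+/* Parse the "cp-protection" option: a comma separated list of protocol
+ * names. Unknown protocols are logged and ignored; the option is also
+ * refused on non-ethernet ports and when hw-offload is enabled. */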
+static void
+dpdk_cp_prot_set_config(struct netdev *netdev, struct netdev_dpdk *dev,
+ const struct smap *args, char **errp)
+{
+ const char *arg = smap_get_def(args, "cp-protection", "");
+ char *token, *saveptr, *buf;
+ uint64_t flags = 0;
+
+ buf = xstrdup(arg);
+ token = strtok_r(buf, ",", &saveptr);
+ while (token) {
+ if (strcmp(token, "lacp") == 0) {
+ flags |= DPDK_CP_PROT_LACP;
+ } else {
+ VLOG_WARN_BUF(errp, "%s options:cp-protection "
+ "unknown protocol '%s'",
+ netdev_get_name(netdev), token);
+ }
+ token = strtok_r(NULL, ",", &saveptr);
+ }
+ free(buf);
+
+ if (flags && dev->type != DPDK_DEV_ETH) {
+ VLOG_WARN_BUF(errp, "%s options:cp-protection "
+ "is only supported on ethernet ports",
+ netdev_get_name(netdev));
+ flags = 0;
+ }
+
+ if (flags && netdev_is_flow_api_enabled()) {
+ VLOG_WARN_BUF(errp, "%s options:cp-protection "
+ "is incompatible with hw-offload",
+ netdev_get_name(netdev));
+ flags = 0;
+ }
+
+ if (flags != dev->requested_cp_prot_flags) {
+ dev->requested_cp_prot_flags = flags;
+ netdev_request_reconfigure(netdev);
+ }
+}
+
static int
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
char **errp)
@@ -1949,6 +2016,8 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
+ dpdk_cp_prot_set_config(netdev, dev, args, errp);
+
dpdk_set_rxq_config(dev, args);
dpdk_process_queue_size(netdev, args, "n_rxq_desc",
@@ -3825,9 +3894,12 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_dev_info dev_info;
+ size_t cp_prot_flows_num;
+ uint64_t cp_prot_flags;
const char *bus_info;
uint32_t link_speed;
uint32_t dev_flags;
+ int n_rxq;
if (!rte_eth_dev_is_valid_port(dev->port_id)) {
return ENODEV;
@@ -3839,6 +3911,9 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
link_speed = dev->link.link_speed;
dev_flags = *dev_info.dev_flags;
bus_info = rte_dev_bus_info(dev_info.device);
+ cp_prot_flags = dev->cp_prot_flags;
+ cp_prot_flows_num = dev->cp_prot_flows_num;
+ n_rxq = netdev->n_rxq;
ovs_mutex_unlock(&dev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
@@ -3881,6 +3956,19 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
ETH_ADDR_ARGS(dev->hwaddr));
}
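+
+    /* Report the queue layout that results from cp protection: the dedicated
+     * queue and the remaining RSS queue range, or "unsupported" when no flow
+     * could be installed. */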
+ if (cp_prot_flags) {
+ if (!cp_prot_flows_num) {
+ smap_add(args, "cp_protection", "unsupported");
+ } else {
+ smap_add_format(args, "cp_protection_queue", "%d", n_rxq - 1);
+ if (n_rxq > 2) {
+ smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2);
+ } else {
+ smap_add(args, "rss_queues", "0");
+ }
+ }
+ }
+
return 0;
}
@@ -5105,16 +5193,199 @@ static const struct dpdk_qos_ops trtcm_policer_ops = {
.qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init
};
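+
+/* Validate and create an rte_flow rule that steers matched packets to the
+ * last rx queue of the port. On success, the flow handle is stored so that
+ * it can be destroyed when cp protection is unconfigured. */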
+static int
+dpdk_cp_prot_add_flow(struct netdev_dpdk *dev,
+ const struct rte_flow_item items[],
+ const char *desc)
+{
+ const struct rte_flow_attr attr = { .ingress = 1 };
+ const struct rte_flow_action actions[] = {
+ {
+ .type = RTE_FLOW_ACTION_TYPE_QUEUE,
+ .conf = &(const struct rte_flow_action_queue) {
+ .index = dev->up.n_rxq - 1,
+ },
+ },
+ { .type = RTE_FLOW_ACTION_TYPE_END },
+ };
+ struct rte_flow_error error;
+ struct rte_flow *flow;
+ size_t num;
+ int err;
+
+ err = rte_flow_validate(dev->port_id, &attr, items, actions, &error);
+ if (err) {
+ VLOG_WARN("%s: cp-protection: device does not support %s flow: %s",
+ netdev_get_name(&dev->up), desc, error.message);
+ goto out;
+ }
+
+ flow = rte_flow_create(dev->port_id, &attr, items, actions, &error);
+ if (flow == NULL) {
+ VLOG_WARN("%s: cp-protection: failed to add %s flow: %s",
+ netdev_get_name(&dev->up), desc, error.message);
+ err = rte_errno;
+ goto out;
+ }
+
+ num = dev->cp_prot_flows_num + 1;
+ dev->cp_prot_flows = xrealloc(dev->cp_prot_flows, sizeof(flow) * num);
+ dev->cp_prot_flows[dev->cp_prot_flows_num] = flow;
+ dev->cp_prot_flows_num = num;
+
+ VLOG_INFO("%s: cp-protection: redirected %s traffic to rx queue %d",
+ netdev_get_name(&dev->up), desc, dev->up.n_rxq - 1);
+out:
+ return err;
+}
+
+#define RETA_CONF_SIZE (RTE_ETH_RSS_RETA_SIZE_512 / RTE_ETH_RETA_GROUP_SIZE)
+
+static int
+dpdk_cp_prot_rss_configure(struct netdev_dpdk *dev, int rss_n_rxq)
+{
+ struct rte_eth_rss_reta_entry64 reta_conf[RETA_CONF_SIZE];
+ struct rte_eth_dev_info info;
+ int err;
+
+ rte_eth_dev_info_get(dev->port_id, &info);
+
+ if (info.reta_size % rss_n_rxq != 0 &&
+ info.reta_size < RTE_ETH_RSS_RETA_SIZE_128) {
+ /*
+ * Some drivers set reta_size equal to the total number of rxqs that
+ * are configured when it is a power of two. Since we are actually
+ * reconfiguring the redirection table to exclude the last rxq, we may
+ * end up with an imbalanced redirection table. For example, such
+ * configuration:
+ *
+ * options:n_rxq=3 options:cp-protection=lacp
+ *
+ * Will actually configure 4 rxqs on the NIC, and the default reta to:
+ *
+ * [0, 1, 2, 3]
+ *
+ * And dpdk_cp_prot_rss_configure() will reconfigure reta to:
+ *
+ * [0, 1, 2, 0]
+ *
+ * Causing queue 0 to receive twice as much traffic as queues 1 and 2.
+ *
+     * Work around that corner case by forcing the redirection table size
+     * to 128 entries when reta_size is not a multiple of rss_n_rxq and
+     * when reta_size is less than 128. This value seems to be supported by
+     * most of the drivers that also support rte flow.
+ */
+ info.reta_size = RTE_ETH_RSS_RETA_SIZE_128;
+ }
+
+ memset(reta_conf, 0, sizeof(reta_conf));
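+    /* Spread all redirection table entries evenly (round robin) over the
+     * first rss_n_rxq queues; the caller excludes the extra control plane
+     * protection queue from this count. */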
+ for (uint16_t i = 0; i < info.reta_size; i++) {
+ uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
+ uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;
+ reta_conf[idx].mask |= 1ULL << shift;
+ reta_conf[idx].reta[shift] = i % rss_n_rxq;
+ }
+ err = rte_eth_dev_rss_reta_update(dev->port_id, reta_conf, info.reta_size);
+ if (err < 0) {
+ VLOG_WARN("%s: failed to configure RSS redirection table: err=%d",
+ netdev_get_name(&dev->up), err);
+ }
+
+ return err;
+}
+
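+/* Install the rte_flow rules for all requested protocols and rebalance RSS
+ * on the remaining queues. At least two rx queues are required: one for
+ * control plane traffic and at least one for everything else. */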
+static int
+dpdk_cp_prot_configure(struct netdev_dpdk *dev)
+{
+ int err = 0;
+
+ if (dev->up.n_rxq < 2) {
+ err = ENOTSUP;
+ VLOG_WARN("%s: cp-protection: not enough available rx queues",
+ netdev_get_name(&dev->up));
+ goto out;
+ }
+
+ if (dev->requested_cp_prot_flags & DPDK_CP_PROT_LACP) {
+ const struct rte_flow_item items[] = {
+ {
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ .spec = &(const struct rte_flow_item_eth){
+ .type = htons(ETH_TYPE_LACP),
+ },
+ .mask = &(const struct rte_flow_item_eth){
+ .type = htons(0xffff),
+ },
+ },
+ { .type = RTE_FLOW_ITEM_TYPE_END },
+ };
+ err = dpdk_cp_prot_add_flow(dev, items, "lacp");
+ if (err) {
+ goto out;
+ }
+ }
+
+ if (dev->cp_prot_flows_num) {
+ /* Reconfigure RSS reta in all but the cp protection queue. */
+ err = dpdk_cp_prot_rss_configure(dev, dev->up.n_rxq - 1);
+ if (!err) {
+ if (dev->up.n_rxq == 2) {
+ VLOG_INFO("%s: cp-protection: redirected other traffic to "
+ "rx queue 0", netdev_get_name(&dev->up));
+ } else {
+ VLOG_INFO("%s: cp-protection: applied rss on rx queue 0-%u",
+ netdev_get_name(&dev->up), dev->up.n_rxq - 2);
+ }
+ }
+ }
+
+out:
+ return err;
+}
+
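+/* Destroy all installed cp protection rte_flow rules and release the stored
+ * flow handles. */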
+static void
+dpdk_cp_prot_unconfigure(struct netdev_dpdk *dev)
+{
+ struct rte_flow_error error;
+
+ if (!dev->cp_prot_flows_num) {
+ return;
+ }
+
+ VLOG_DBG("%s: cp-protection: reset flows", netdev_get_name(&dev->up));
+
+ for (int i = 0; i < dev->cp_prot_flows_num; i++) {
+ if (rte_flow_destroy(dev->port_id, dev->cp_prot_flows[i], &error)) {
+ VLOG_DBG("%s: cp-protection: failed to destroy flow: %s",
+ netdev_get_name(&dev->up), error.message);
+ }
+ }
+ free(dev->cp_prot_flows);
+ dev->cp_prot_flows_num = 0;
+ dev->cp_prot_flows = NULL;
+