Documentation/topics/dpdk/phy.rst | 87 +++++++++
NEWS | 3 +
lib/netdev-dpdk.c | 303 +++++++++++++++++++++++++++++-
vswitchd/vswitch.xml | 39 ++++
4 files changed, 430 insertions(+), 2 deletions(-)
diff --git a/Documentation/topics/dpdk/phy.rst
b/Documentation/topics/dpdk/phy.rst
index 4b0fe8dded3a..8f57bf4efe2d 100644
--- a/Documentation/topics/dpdk/phy.rst
+++ b/Documentation/topics/dpdk/phy.rst
@@ -131,6 +131,93 @@ possible with DPDK acceleration. It is possible to
configure multiple Rx queues
for ``dpdk`` ports, thus ensuring this is not a bottleneck for performance.
For
information on configuring PMD threads, refer to :doc:`pmd`.
+Traffic Rx Steering
+-------------------
+
+.. warning:: This feature is experimental.
+
+Some control protocols are used to maintain link status between forwarding
+engines. In SDN environments, these packets share the same physical network
+than the user data traffic.
+
+When the system is not sized properly, the PMD threads may not be able to
+process all incoming traffic from the configured Rx queues. When a signaling
+packet of such protocols is dropped, it can cause link flapping, worsening the
+situation.
+
+Some physical NICs can be programmed to put these protocols in a dedicated
+hardware Rx queue using the rte_flow__ API.
+
+__ https://doc.dpdk.org/guides-22.11/prog_guide/rte_flow.html
+
+.. warning::
+
+ This feature is not compatible with all NICs. Refer to the DPDK
+ `compatibilty matrix`__ and vendor documentation for more details.
+
+ __ https://doc.dpdk.org/guides-22.11/nics/overview.html
+
+Rx steering must be enabled for specific protocols per port. The
+``rx-steering`` option takes one of the following values:
+
+``rss``
+ Do regular RSS on all configured Rx queues. This is the default behaviour.
+
+``rss+lacp``
+ Do regular RSS on all configured Rx queues. An extra Rx queue is configured
+ for LACP__ packets (ether type ``0x8809``).
+
+ __ https://www.ieee802.org/3/ad/public/mar99/seaman_1_0399.pdf
+
+Example::
+
+ $ ovs-vsctl add-port br0 dpdk-p0 -- set Interface dpdk-p0 type=dpdk \
+ options:dpdk-devargs=0000:01:00.0 options:n_rxq=2 \
+ options:rx-steering=rss+lacp
+
+.. note::
+
+ If multiple Rx queues are already configured, regular hash-based RSS
+ (Receive Side Scaling) queue balancing is done on all but the extra Rx
+ queue.
+
+.. tip::
+
+ You can check if Rx steering is supported on a port with the following
+ command::
+
+ $ ovs-vsctl get interface dpdk-p0 status
+ {..., rss_queues="0-1", rx_steering_queue="2"}
+
+ This will also show in ``ovs-vswitchd.log``::
+
+ INFO|dpdk-p0: rx-steering: redirecting lacp traffic to queue 2
+ INFO|dpdk-p0: rx-steering: applying rss on queues 0-1
+
+ If the hardware does not support redirecting the specified protocols to
+ a dedicated queue, it will be explicit::
+
+ $ ovs-vsctl get interface dpdk-p0 status
+ {..., rx_steering=unsupported}
+
+ More details can often be found in ``ovs-vswitchd.log``::
+
+ WARN|dpdk-p0: rx-steering: failed to add lacp flow: Unsupported pattern
+
+To disable Rx steering on a port, use the following command::
+
+ $ ovs-vsctl remove Interface dpdk-p0 options rx-steering
+
+You can see that it has been disabled in ``ovs-vswitchd.log``::
+
+ INFO|dpdk-p0: rx-steering: disabled
+
+.. warning::
+
+ This feature is mutually exclusive with ``other_options:hw-offload`` as it
+ may conflict with the offloaded RTE flows. If both are enabled,
+ ``rx-steering`` will be forcibly disabled.
+
.. _dpdk-phy-flow-control:
Flow Control
diff --git a/NEWS b/NEWS
index 66d5a4ea3751..cba2c1388266 100644
--- a/NEWS
+++ b/NEWS
@@ -34,6 +34,9 @@ Post-v3.1.0
* ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started
with the --hw-rawio-access command line option. This allows the
process extra privileges when mapping physical interconnect memory.
+ * New experimental "rx-steering=rss+<protocol>" option to redirect
+ certain protocols (for now, only LACP) to a dedicated hardware queue
+ using RTE flow.
- SRv6 Tunnel Protocol
* Added support for userspace datapath (only).
- Userspace datapath:
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 63dac689e38b..e1f84b5c8aed 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -418,6 +418,10 @@ enum dpdk_hw_ol_features {
NETDEV_TX_TSO_OFFLOAD = 1 << 7,
};
+enum dpdk_rx_steer_flags {
+ DPDK_RX_STEER_LACP = 1 << 0,
+};
+
/*
* In order to avoid confusion in variables names, following naming convention
* should be used, if possible:
@@ -508,6 +512,12 @@ struct netdev_dpdk {
* netdev_dpdk*_reconfigure() is called */
int requested_mtu;
int requested_n_txq;
+ /* User input for n_rxq (see dpdk_set_rxq_config). */
+ int user_n_rxq;
+ /* user_n_rxq + an optional rx steering queue (see
+ * netdev_dpdk_reconfigure). This field is different from the other
+ * requested_* fields as it may contain a different value than the user
+ * input. */
int requested_n_rxq;
int requested_rxq_size;
int requested_txq_size;
@@ -537,6 +547,13 @@ struct netdev_dpdk {
/* VF configuration. */
struct eth_addr requested_hwaddr;
+
+ /* Requested rx queue steering flags,
+ * from the enum set 'dpdk_rx_steer_flags'. */
+ uint64_t requested_rx_steer_flags;
+ uint64_t rx_steer_flags;
+ size_t rx_steer_flows_num;
+ struct rte_flow **rx_steer_flows;
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
@@ -1371,10 +1388,15 @@ common_construct(struct netdev *netdev, dpdk_port_t
port_no,
netdev->n_rxq = 0;
netdev->n_txq = 0;
+ dev->user_n_rxq = NR_QUEUE;
dev->requested_n_rxq = NR_QUEUE;
dev->requested_n_txq = NR_QUEUE;
dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
+ dev->requested_rx_steer_flags = 0;
+ dev->rx_steer_flags = 0;
+ dev->rx_steer_flows_num = 0;
+ dev->rx_steer_flows = NULL;
/* Initialize the flow control to NULL */
memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
@@ -1549,6 +1571,8 @@ common_destruct(struct netdev_dpdk *dev)
ovs_mutex_destroy(&dev->mutex);
}
+static void dpdk_rx_steer_unconfigure(struct netdev_dpdk *dev);
+
static void
netdev_dpdk_destruct(struct netdev *netdev)
{
@@ -1556,6 +1580,9 @@ netdev_dpdk_destruct(struct netdev *netdev)
ovs_mutex_lock(&dpdk_mutex);
+ /* Destroy any rte flows to allow RXQs to be removed. */
+ dpdk_rx_steer_unconfigure(dev);
+
rte_eth_dev_stop(dev->port_id);
dev->started = false;
@@ -1959,8 +1986,8 @@ dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
int new_n_rxq;
new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
- if (new_n_rxq != dev->requested_n_rxq) {
- dev->requested_n_rxq = new_n_rxq;
+ if (new_n_rxq != dev->user_n_rxq) {
+ dev->user_n_rxq = new_n_rxq;
netdev_request_reconfigure(&dev->up);
}
}
@@ -2020,6 +2047,41 @@ dpdk_process_queue_size(struct netdev *netdev, const
struct smap *args,
}
}
+static void
+dpdk_set_rx_steer_config(struct netdev *netdev, struct netdev_dpdk *dev,
+ const struct smap *args, char **errp)
+{
+ const char *arg = smap_get_def(args, "rx-steering", "");
+ uint64_t flags = 0;
+
+ if (strcmp(arg, "rss+lacp") == 0) {
+ flags = DPDK_RX_STEER_LACP;
+ } else if (strcmp(arg, "") != 0 && strcmp(arg, "rss") != 0) {
+ VLOG_WARN_BUF(errp, "%s options:rx-steering "
+ "unsupported parameter value '%s'",
+ netdev_get_name(netdev), arg);
+ }
+
+ if (strcmp(arg, "") != 0 && dev->type != DPDK_DEV_ETH) {
+ VLOG_WARN_BUF(errp, "%s options:rx-steering "
+ "is only supported on ethernet ports",
+ netdev_get_name(netdev));
+ flags = 0;
+ }
+
+ if (flags && ovsrcu_get(void *, &netdev->hw_info.offload_data)) {
+ VLOG_WARN_BUF(errp, "%s options:rx-steering "
+ "is incompatible with hw-offload",
+ netdev_get_name(netdev));
+ flags = 0;
+ }
+
+ if (flags != dev->requested_rx_steer_flags) {
+ dev->requested_rx_steer_flags = flags;
+ netdev_request_reconfigure(netdev);
+ }
+}
+
static int
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
char **errp)
@@ -2041,6 +2103,8 @@ netdev_dpdk_set_config(struct netdev *netdev, const
struct smap *args,
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
+ dpdk_set_rx_steer_config(netdev, dev, args, errp);
+
dpdk_set_rxq_config(dev, args);
new_devargs = smap_get(args, "dpdk-devargs");
@@ -3916,9 +3980,12 @@ netdev_dpdk_get_status(const struct netdev *netdev,
struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_dev_info dev_info;
+ size_t rx_steer_flows_num;
+ uint64_t rx_steer_flags;
const char *bus_info;
uint32_t link_speed;
uint32_t dev_flags;
+ int n_rxq;
if (!rte_eth_dev_is_valid_port(dev->port_id)) {
return ENODEV;
@@ -3930,6 +3997,9 @@ netdev_dpdk_get_status(const struct netdev *netdev,
struct smap *args)
link_speed = dev->link.link_speed;
dev_flags = *dev_info.dev_flags;
bus_info = rte_dev_bus_info(dev_info.device);
+ rx_steer_flags = dev->rx_steer_flags;
+ rx_steer_flows_num = dev->rx_steer_flows_num;
+ n_rxq = netdev->n_rxq;
ovs_mutex_unlock(&dev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
@@ -3972,6 +4042,19 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
ETH_ADDR_ARGS(dev->hwaddr));
}
+ if (rx_steer_flags) {
+ if (!rx_steer_flows_num) {
+ smap_add(args, "rx_steering", "unsupported");
+ } else {
+ smap_add_format(args, "rx_steering_queue", "%d", n_rxq - 1);
+ if (n_rxq > 2) {
+ smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2);
+ } else {
+ smap_add(args, "rss_queues", "0");
+ }
+ }
+ }
+
return 0;
}
@@ -5310,16 +5393,204 @@ static const struct dpdk_qos_ops trtcm_policer_ops = {
.qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init
};
+static int
+dpdk_rx_steer_add_flow(struct netdev_dpdk *dev,
+ const struct rte_flow_item items[],
+ const char *desc)
+{
+ const struct rte_flow_attr attr = { .ingress = 1 };
+ const struct rte_flow_action actions[] = {
+ {
+ .type = RTE_FLOW_ACTION_TYPE_QUEUE,
+ .conf = &(const struct rte_flow_action_queue) {
+ .index = dev->up.n_rxq - 1,
+ },
+ },
+ { .type = RTE_FLOW_ACTION_TYPE_END },
+ };
+ struct rte_flow_error error;
+ struct rte_flow *flow;
+ size_t num;
+ int err;
+
+ set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
+ err = rte_flow_validate(dev->port_id, &attr, items, actions, &error);
+ if (err) {
+ VLOG_WARN("%s: rx-steering: device does not support %s flow: %s",
+ netdev_get_name(&dev->up), desc,
+ error.message ? error.message : "");
+ goto out;
+ }
+
+ set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
+ flow = rte_flow_create(dev->port_id, &attr, items, actions, &error);
+ if (flow == NULL) {
+ VLOG_WARN("%s: rx-steering: failed to add %s flow: %s",
+ netdev_get_name(&dev->up), desc,
+ error.message ? error.message : "");
+ err = rte_errno;
+ goto out;
+ }
+
+ num = dev->rx_steer_flows_num + 1;
+ dev->rx_steer_flows = xrealloc(dev->rx_steer_flows, sizeof(flow) * num);
+ dev->rx_steer_flows[dev->rx_steer_flows_num] = flow;
+ dev->rx_steer_flows_num = num;
+
+ VLOG_INFO("%s: rx-steering: redirected %s traffic to rx queue %d",
+ netdev_get_name(&dev->up), desc, dev->up.n_rxq - 1);
+out:
+ return err;
+}
+
+#define RETA_CONF_SIZE (RTE_ETH_RSS_RETA_SIZE_512 / RTE_ETH_RETA_GROUP_SIZE)
+
+static int
+dpdk_rx_steer_rss_configure(struct netdev_dpdk *dev, int rss_n_rxq)
+{
+ struct rte_eth_rss_reta_entry64 reta_conf[RETA_CONF_SIZE];
+ struct rte_eth_dev_info info;
+ int err;
+
+ rte_eth_dev_info_get(dev->port_id, &info);
+
+ if (info.reta_size % rss_n_rxq != 0 &&
+ info.reta_size < RTE_ETH_RSS_RETA_SIZE_128) {
+ /*
+ * Some drivers set reta_size equal to the total number of rxqs that
+ * are configured when it is a power of two. Since we are actually
+ * reconfiguring the redirection table to exclude the last rxq, we may
+ * end up with an imbalanced redirection table. For example, such
+ * configuration:
+ *
+ * options:n_rxq=3 options:rx-steering=rss+lacp
+ *
+ * Will actually configure 4 rxqs on the NIC, and the default reta to:
+ *
+ * [0, 1, 2, 3]
+ *
+ * And dpdk_rx_steer_rss_configure() will reconfigure reta to:
+ *
+ * [0, 1, 2, 0]
+ *
+ * Causing queue 0 to receive twice as much traffic as queues 1 and 2.
+ *
+ * Work around that corner case by forcing a bigger redirection table
+ * size to 128 entries when reta_size is not a multiple of rss_n_rxq
+ * and when reta_size is less than 128. This value seems to be
+ * supported by most of the drivers that also support rte flow.
+ */
+ info.reta_size = RTE_ETH_RSS_RETA_SIZE_128;
+ }
+
+ memset(reta_conf, 0, sizeof(reta_conf));
+ for (uint16_t i = 0; i < info.reta_size; i++) {
+ uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
+ uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;
+ reta_conf[idx].mask |= 1ULL << shift;
+ reta_conf[idx].reta[shift] = i % rss_n_rxq;
+ }
+ err = rte_eth_dev_rss_reta_update(dev->port_id, reta_conf, info.reta_size);
+ if (err < 0) {
+ VLOG_WARN("%s: failed to configure RSS redirection table: err=%d",
+ netdev_get_name(&dev->up), err);
+ }
+
+ return err;
+}
+
+static int
+dpdk_rx_steer_configure(struct netdev_dpdk *dev)
+{
+ int err = 0;
+
+ if (dev->up.n_rxq < 2) {
+ err = ENOTSUP;
+ VLOG_WARN("%s: rx-steering: not enough available rx queues",
+ netdev_get_name(&dev->up));
+ goto out;
+ }
+
+ if (dev->requested_rx_steer_flags & DPDK_RX_STEER_LACP) {
+ const struct rte_flow_item items[] = {
+ {
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ .spec = &(const struct rte_flow_item_eth){
+ .type = htons(ETH_TYPE_LACP),
+ },
+ .mask = &(const struct rte_flow_item_eth){
+ .type = htons(0xffff),
+ },
+ },
+ { .type = RTE_FLOW_ITEM_TYPE_END },
+ };
+ err = dpdk_rx_steer_add_flow(dev, items, "lacp");
+ if (err) {
+ goto out;
+ }
+ }
+
+ if (dev->rx_steer_flows_num) {
+ /* Reconfigure RSS reta in all but the rx steering queue. */
+ err = dpdk_rx_steer_rss_configure(dev, dev->up.n_rxq - 1);
+ if (err) {
+ goto out;
+ }
+ if (dev->up.n_rxq == 2) {
+ VLOG_INFO("%s: rx-steering: redirected other traffic to "
+ "rx queue 0", netdev_get_name(&dev->up));
+ } else {
+ VLOG_INFO("%s: rx-steering: applied rss on rx queue 0-%u",
+ netdev_get_name(&dev->up), dev->up.n_rxq - 2);
+ }
+ }
+
+out:
+ return err;
+}
+
+static void
+dpdk_rx_steer_unconfigure(struct netdev_dpdk *dev)
+{
+ struct rte_flow_error error;
+
+ if (!dev->rx_steer_flows_num) {
+ return;
+ }
+
+ VLOG_DBG("%s: rx-steering: reset flows", netdev_get_name(&dev->up));
+
+ for (int i = 0; i < dev->rx_steer_flows_num; i++) {
+ set_error(&error, RTE_FLOW_ERROR_TYPE_NONE);
+ if (rte_flow_destroy(dev->port_id, dev->rx_steer_flows[i], &error)) {
+ VLOG_DBG("%s: rx-steering: failed to destroy flow: %s",
+ netdev_get_name(&dev->up),
+ error.message ? error.message : "");
+ }
+ }
+ free(dev->rx_steer_flows);
+ dev->rx_steer_flows_num = 0;
+ dev->rx_steer_flows = NULL;
+}
+
static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+ bool try_rx_steer;
int err = 0;
ovs_mutex_lock(&dev->mutex);
+ try_rx_steer = dev->requested_rx_steer_flags != 0;
+ dev->requested_n_rxq = dev->user_n_rxq;
+ if (try_rx_steer) {
+ dev->requested_n_rxq += 1;
+ }
+
if (netdev->n_txq == dev->requested_n_txq
&& netdev->n_rxq == dev->requested_n_rxq
+ && dev->rx_steer_flags == dev->requested_rx_steer_flags
&& dev->mtu == dev->requested_mtu
&& dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
&& dev->rxq_size == dev->requested_rxq_size
@@ -5332,6 +5603,9 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
goto out;
}
+retry:
+ dpdk_rx_steer_unconfigure(dev);
+
if (dev->reset_needed) {
rte_eth_dev_reset(dev->port_id);
if_notifier_manual_report();
@@ -5356,6 +5630,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
dev->txq_size = dev->requested_txq_size;
rte_free(dev->tx_q);
+ dev->tx_q = NULL;
if (!eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)) {
err = netdev_dpdk_set_etheraddr__(dev, dev->requested_hwaddr);
@@ -5379,6 +5654,23 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
*/
dev->requested_hwaddr = dev->hwaddr;
+ if (try_rx_steer) {
+ err = dpdk_rx_steer_configure(dev);
+ if (err) {
+ /* No hw support, disable & recover gracefully. */
+ try_rx_steer = false;
+ /*
+ * The extra queue must be explicitly removed here to ensure that
+ * it is unconfigured immediately.
+ */
+ dev->requested_n_rxq = dev->user_n_rxq;
+ goto retry;
+ }
+ } else {
+ VLOG_INFO("%s: rx-steering: disabled", netdev_get_name(&dev->up));
+ }
+ dev->rx_steer_flags = dev->requested_rx_steer_flags;
+
dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
if (!dev->tx_q) {
err = ENOMEM;
@@ -5589,6 +5881,13 @@ netdev_dpdk_flow_api_supported(struct netdev *netdev)
dev = netdev_dpdk_cast(netdev);
ovs_mutex_lock(&dev->mutex);
if (dev->type == DPDK_DEV_ETH) {
+ if (dev->requested_rx_steer_flags) {
+ VLOG_WARN("%s: disabling rx-steering as it is "
+ "mutually exclusive with hw-offload.",
+ netdev_get_name(netdev));
+ dev->requested_rx_steer_flags = 0;
+ netdev_request_reconfigure(netdev);
+ }
/* TODO: Check if we able to offload some minimal flow. */
ret = true;
}
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 59c404bbbc7a..c18d62029d4d 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -3517,6 +3517,45 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch
options:peer=p1 \
<p>This option may only be used with dpdk VF representors.</p>
</column>
+ <column name="options" key="rx-steering"
+ type='{"type": "string", "enum": ["set", ["rss", "rss+lacp"]]}'>
+ <p>
+ Configure hardware Rx queue steering policy.
+ </p>
+ <p>
+ This option takes one of the following values:
+ </p>
+ <dl>
+ <dt><code>rss</code></dt>
+ <dd>
+ Distribution of ingress packets in all Rx queues according to the
+ RSS algorithm. This is the default behaviour.
+ </dd>
+ <dt><code>rss+lacp</code></dt>
+ <dd>
+ Distribution of ingress packets according to the RSS algorithm on
+ all but the last Rx queue. An extra Rx queue is allocated for LACP
+ packets.
+ </dd>
+ </dl>
+ <p>
+ If the user has already configured multiple
+ <code>options:n_rxq</code> on the port, an additional one will be
+ allocated for the specified protocols. Even if the hardware cannot
+ satisfy the requested number of requested Rx queues, the last Rx
+ queue will be used. If only one Rx queue is available or if the
+ hardware does not support the RTE flow matchers/actions required to
+ redirect the selected protocols, custom <code>rx-steering</code> will
+ be disabled.
+ </p>
+ <p>
+ This feature is mutually exclusive with
+ <code>other_options:hw-offload</code> as it may conflict with the
+ offloaded RTE flows. If both are enabled, <code>rx-steering</code>
+ will be forcibly disabled.
+ </p>
+ </column>
+
<column name="other_config" key="tx-steering"
type='{"type": "string",
"enum": ["set", ["thread", "hash"]]}'>