A pmd polls all of its rxqs with equal weight. When a pmd has one rxq
from a phy port and several from vhu ports, and both rx and tx carry
high load, the vhu rxqs get polled more often than the phy rxq. The
tx/rx loads then become unbalanced: with traffic in both directions,
rx is limited to a very low rate because the phy port is polled less.
For example, the original poll list for each pmd looks like this:
pmd 0 phy0_0 vhu0_0 vhu0_4 vhu0_8 vhu0_12
pmd 1 phy0_1 vhu0_1 vhu0_5 vhu0_9 vhu0_13
pmd 2 phy0_2 vhu0_2 vhu0_6 vhu0_10 vhu0_14
pmd 3 phy0_3 vhu0_3 vhu0_7 vhu0_11 vhu0_15
With traffic in both directions, rx is limited to 2 Mpps while tx
reaches 9 Mpps.
This patch provides an option to reinforce phy port polling. It adds
a configuration knob for rxq scheduling that tries to balance the
polling between phy and vhu ports: it increases the number of poll
slots for a phy rxq and interlaces the phy rxq with the vhu rxqs in
the poll list, as sketched after the example below.
With scaling enabled, the rxq poll list becomes:
pmd 0 phy0_0 vhu0_0 phy0_0 vhu0_4 phy0_0 vhu0_8 phy0_0 vhu0_12
pmd 1 phy0_1 vhu0_1 phy0_1 vhu0_5 phy0_1 vhu0_9 phy0_1 vhu0_13
pmd 2 phy0_2 vhu0_2 phy0_2 vhu0_6 phy0_2 vhu0_10 phy0_2 vhu0_14
pmd 3 phy0_3 vhu0_3 phy0_3 vhu0_7 phy0_3 vhu0_11 phy0_3 vhu0_15
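The interleaving happens when a pmd rebuilds its poll list. The
following is a minimal standalone sketch of the idea only; the names
(struct q, build_interleaved_list) are illustrative and are not the
ones used in dpif-netdev.c:

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative only: a queue is reduced to an id. */
    struct q { int id; };

    /* Interleave one scaled (phy) entry before each non-scaled (vhu)
     * entry, cycling through the scaled entries, so each phy rxq ends
     * up with several poll slots in the resulting list. */
    static struct q *
    build_interleaved_list(const struct q *scaled, int n_scaled,
                           const struct q *others, int n_others,
                           int *n_out)
    {
        int times = 0;

        if (n_scaled) {
            times = n_others / n_scaled;
            if (!times) {
                times = 1;
            }
        }

        struct q *list = malloc((n_others + times * n_scaled)
                                * sizeof *list);
        int i = 0, start = 0;

        for (int j = 0; j < n_others; j++) {
            if (n_scaled && start < times * n_scaled) {
                list[i++] = scaled[start % n_scaled];   /* phy slot */
                start++;
            }
            list[i++] = others[j];                      /* vhu slot */
        }
        if (!start && n_scaled) {
            /* Only scaled queues on this pmd: poll each of them once. */
            for (i = 0; i < n_scaled; i++) {
                list[i] = scaled[i];
            }
        }
        *n_out = i;
        return list;
    }

    int
    main(void)
    {
        struct q phy[] = { { 0 } };
        struct q vhu[] = { { 0 }, { 4 }, { 8 }, { 12 } };
        int n;
        struct q *list = build_interleaved_list(phy, 1, vhu, 4, &n);

        /* Prints the pmd 0 pattern from the example above:
         * phy:0 vhu:0 phy:0 vhu:4 phy:0 vhu:8 phy:0 vhu:12 */
        for (int i = 0; i < n; i++) {
            printf("%s:%d ", i % 2 ? "vhu" : "phy", list[i].id);
        }
        printf("\n");
        free(list);
        return 0;
    }

Each non-scaled (vhu) slot is preceded by one scaled (phy) slot,
cycling through the scaled rxqs, so a phy rxq gets roughly
n_others / n_scaled poll slots instead of one.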
To enable it, run:
'ovs-vsctl set open . other_config:pmd-rxq-schedule=scaling'
To disable it, remove the setting or set it to 'single'.
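For example, to go back to the default single scheduling you can set
the knob back or drop it (standard ovs-vsctl usage, nothing specific
to this patch):
'ovs-vsctl set open . other_config:pmd-rxq-schedule=single'
'ovs-vsctl remove open . other_config pmd-rxq-schedule'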
It works fairly well when n_rxq of the dpdk phy port equals the
number of dpdk pmds, i.e. one phy rxq per pmd.
Signed-off-by: Wan Junjie <[email protected]>
Reviewed-by: He Peng <[email protected]>
---
lib/dpif-netdev.c | 133 ++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 122 insertions(+), 11 deletions(-)
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 9f35713ef..3acf5512a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -297,6 +297,7 @@ struct dp_netdev {
struct ovs_mutex tx_qid_pool_mutex;
/* Rxq to pmd assignment type. */
enum sched_assignment_type pmd_rxq_assign_type;
+ bool pmd_rxq_scaling; /* Scale phy rxq polling ("scaling" mode). */
bool pmd_iso;
/* Protects the access of the 'struct dp_netdev_pmd_thread'
@@ -430,6 +431,7 @@ struct dp_netdev_rxq {
unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
bool is_vhost; /* Is rxq of a vhost port. */
+ bool scale; /* Rxq should be scheduled with scaling. */
bool hw_miss_api_supported; /* hw_miss_packet_recover() supported.*/
/* Counters of cycles spent successfully polling and processing pkts. */
@@ -918,7 +920,8 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
ds_put_format(reply, " port: %-16s queue-id: %2d", name,
netdev_rxq_get_queue_id(list[i].rxq->rx));
ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
- ? "(enabled) " : "(disabled)");
+ ? ((list[i].rxq->scale) ? "(rescaled)"
+ : "(enabled) ") : "(disabled)");
ds_put_format(reply, " pmd usage: ");
if (total_cycles) {
ds_put_format(reply, "%2"PRIu64"",
@@ -1845,6 +1848,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
cmap_init(&dp->poll_threads);
dp->pmd_rxq_assign_type = SCHED_CYCLES;
+ dp->pmd_rxq_scaling = false;
ovs_mutex_init(&dp->tx_qid_pool_mutex);
/* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
@@ -4772,6 +4776,8 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
const char *cmask = smap_get(other_config, "pmd-cpu-mask");
const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
"cycles");
+ const char *pmd_rxq_sche = smap_get_def(other_config, "pmd-rxq-schedule",
+ "single");
unsigned long long insert_prob =
smap_get_ullong(other_config, "emc-insert-inv-prob",
DEFAULT_EM_FLOW_INSERT_INV_PROB);
@@ -4860,6 +4866,20 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
dp_netdev_request_reconfigure(dp);
}
+ bool pmd_rxq_scale = !strcmp(pmd_rxq_sche, "scaling");
+ if (!pmd_rxq_scale && strcmp(pmd_rxq_sche, "single")) {
+ VLOG_WARN("Unsupported phy rxq schedule mode in pmd-rxq-schedule, "
+ "defaulting to 'single'.");
+ pmd_rxq_scale = false;
+ pmd_rxq_sche = "single";
+ }
+ if (dp->pmd_rxq_scaling != pmd_rxq_scale) {
+ dp->pmd_rxq_scaling = pmd_rxq_scale;
+ VLOG_INFO("Phy rxq schedule mode changed to: \'%s\'.",
+ pmd_rxq_sche);
+ dp_netdev_request_reconfigure(dp);
+ }
+
bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true);
if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) {
@@ -5425,6 +5445,7 @@ port_reconfigure(struct dp_netdev_port *port)
port->rxqs[i].port = port;
port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
+ port->rxqs[i].scale = false;
port->rxqs[i].hw_miss_api_supported = true;
err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
@@ -5891,6 +5912,7 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list,
unsigned n_rxqs = 0;
bool start_logged = false;
size_t n_numa;
+ bool scale = dp->pmd_rxq_scaling;
/* For each port. */
HMAP_FOR_EACH (port, node, &dp->ports) {
@@ -5957,6 +5979,23 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list,
netdev_rxq_get_name(rxq->rx),
netdev_rxq_get_queue_id(rxq->rx),
get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
+ if (!rxq->is_vhost && scale && !rxq->scale) {
+ VLOG(level, "Enable scaling on pmd %d for \'%s\' "
+ "rx queue %d\n",
+ sched_pmd->pmd->core_id,
+ netdev_rxq_get_name(rxq->rx),
+ netdev_rxq_get_queue_id(rxq->rx));
+ rxq->scale = true;
+ sched_pmd->pmd->need_reload = true;
+ } else if (!rxq->is_vhost && !scale && rxq->scale) {
+ VLOG(level, "Disable scaling on pmd %d for \'%s\' "
+ "rx queue %d\n",
+ sched_pmd->pmd->core_id,
+ netdev_rxq_get_name(rxq->rx),
+ netdev_rxq_get_queue_id(rxq->rx));
+ rxq->scale = false;
+ sched_pmd->pmd->need_reload = true;
+ }
sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
} else {
rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
@@ -6028,6 +6067,23 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list,
netdev_rxq_get_queue_id(rxq->rx),
get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
+ if (!rxq->is_vhost && scale && !rxq->scale) {
+ VLOG_INFO("Enable scaling on pmd %d for \'%s\' "
+ "rx queue %d\n",
+ sched_pmd->pmd->core_id,
+ netdev_rxq_get_name(rxq->rx),
+ netdev_rxq_get_queue_id(rxq->rx));
+ sched_pmd->pmd->need_reload = true;
+ rxq->scale = true;
+ } else if (!rxq->is_vhost && !scale && rxq->scale) {
+ VLOG_INFO("Disable scaling on pmd %d for \'%s\' "
+ "rx queue %d",
+ sched_pmd->pmd->core_id,
+ netdev_rxq_get_name(rxq->rx),
+ netdev_rxq_get_queue_id(rxq->rx));
+ sched_pmd->pmd->need_reload = true;
+ rxq->scale = false;
+ }
}
}
if (!sched_pmd) {
@@ -6040,6 +6096,7 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list,
get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
}
}
+
free(rxqs);
}
@@ -6820,23 +6877,77 @@ pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
{
struct polled_queue *poll_list = *ppoll_list;
struct rxq_poll *poll;
- int i;
+ int i = 0, n_scale = 0, start = 0;
ovs_mutex_lock(&pmd->port_mutex);
- poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
- * sizeof *poll_list);
+ HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
+ if (poll->rxq->scale) {
+ n_scale++;
+ }
+ }
+
+ int n_total = hmap_count(&pmd->poll_list);
+ struct polled_queue *scale_queue = NULL;
+ if (n_scale) {
+ scale_queue = xmalloc(sizeof(*scale_queue) * n_scale);
+ HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
+ if (poll->rxq->scale) {
+ scale_queue[i].rxq = poll->rxq;
+ scale_queue[i].port_no = poll->rxq->port->port_no;
+ scale_queue[i].emc_enabled = poll->rxq->port->emc_enabled;
+ scale_queue[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
+ scale_queue[i].change_seq =
+ netdev_get_change_seq(poll->rxq->port->netdev);
+ i++;
+ }
+ }
+ }
+
+ int non_scale = n_total - n_scale;
+ int times = 0;
+ if (n_scale) {
+ times = non_scale / n_scale;
+ times = (times == 0) ? 1 : times;
+ }
+ poll_list = xrealloc(poll_list, (non_scale + times * n_scale)
+ * (sizeof *poll_list));
i = 0;
+ start = 0;
HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
- poll_list[i].rxq = poll->rxq;
- poll_list[i].port_no = poll->rxq->port->port_no;
- poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
- poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
- poll_list[i].change_seq =
- netdev_get_change_seq(poll->rxq->port->netdev);
- i++;
+ if (!poll->rxq->scale) {
+ if (n_scale && start < times * n_scale) {
+ poll_list[i] = scale_queue[start % n_scale];
+ VLOG_DBG("PMD %d: %d rxq %s %d\n", pmd->core_id, i,
+ netdev_rxq_get_name(poll_list[i].rxq->rx),
+ netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
+ i ++;
+ start ++;
+ }
+ poll_list[i].rxq = poll->rxq;
+ poll_list[i].port_no = poll->rxq->port->port_no;
+ poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
+ poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
+ poll_list[i].change_seq =
+ netdev_get_change_seq(poll->rxq->port->netdev);
+ VLOG_DBG("PMD %d: %d rxq %s %d\n", pmd->core_id, i,
+ netdev_rxq_get_name(poll_list[i].rxq->rx),
+ netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
+ i ++;
+ }
+ }
+ if (start == 0 && n_scale && scale_queue) {
+ for (i = 0; i < n_scale; i++) {
+ poll_list[i] = scale_queue[i];
+ VLOG_DBG("PMD %d: %d rxq %s %d", pmd->core_id, i,
+ netdev_rxq_get_name(poll_list[i].rxq->rx),
+ netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
+ }
}
+ if (scale_queue) {
+ free(scale_queue);
+ }
pmd_load_cached_ports(pmd);
ovs_mutex_unlock(&pmd->port_mutex);
--
2.33.0