Hi, Jan:
I think the following code should also be modified
+ for (int hash = 0; hash < n_hash; hash++) {
+ double max_val = 0.0;
+ struct webster *winner;
+ for (i = 0; i < n_buckets; i++) {
+ if (webster[i].value > max_val) { =======================> if
bucket->weight=0, and there is only one bucket with weight equal to 0, then
winner will be null
+ max_val = webster[i].value;
+ winner = &webster[i];
+ }
+ }
Test like this command:
ovs-ofctl add-group br-int -O openflow15
"group_id=2,type=select,selection_method=dp_hash,bucket=bucket_id=1,weight=0,actions=output:10"
vswitchd crashed after command put.
At 2018-04-16 22:26:27, "Jan Scheurich" <[email protected]> wrote:
>The current implementation of the "dp_hash" selection method suffers
>from two deficiences: 1. The hash mask and hence the number of dp_hash
>values is just large enough to cover the number of group buckets, but
>does not consider the case that buckets have different weights. 2. The
>xlate-time selection of best bucket from the masked dp_hash value often
>results in bucket load distributions that are quite different from the
>bucket weights because the number of available masked dp_hash values
>is too small (2-6 bits compared to 32 bits of a full hash in the default
>hash selection method).
>
>This commit provides a more accurate implementation of the dp_hash
>select group by applying the well known Webster method for distributing
>a small number of "seats" fairly over the weighted "parties"
>(see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method).
>The dp_hash mask is autmatically chosen large enough to provide good
>enough accuracy even with widely differing weights.
>
>This distribution happens at group modification time and the resulting
>table is stored with the group-dpif struct. At xlation time, we use the
>masked dp_hash values as index to look up the assigned bucket.
>
>If the bucket should not be live, we do a circular search over the
>mapping table until we find the first live bucket. As the buckets in
>the table are by construction in pseudo-random order with a frequency
>according to their weight, this method maintains correct distribution
>even if one or more buckets are non-live.
>
>Xlation is further simplified by storing some derived select group state
>at group construction in struct group-dpif in a form better suited for
>xlation purposes.
>
>Adapted the unit test case for dp_hash select group accordingly.
>
>Signed-off-by: Jan Scheurich <[email protected]>
>Signed-off-by: Nitin Katiyar <[email protected]>
>Co-authored-by: Nitin Katiyar <[email protected]>
>---
> include/openvswitch/ofp-group.h | 1 +
> ofproto/ofproto-dpif-xlate.c | 74 +++++++++++++-------
> ofproto/ofproto-dpif.c | 146 ++++++++++++++++++++++++++++++++++++++++
> ofproto/ofproto-dpif.h | 13 ++++
> tests/ofproto-dpif.at | 18 +++--
> 5 files changed, 221 insertions(+), 31 deletions(-)
>
>diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h
>index 8d893a5..af4033d 100644
>--- a/include/openvswitch/ofp-group.h
>+++ b/include/openvswitch/ofp-group.h
>@@ -47,6 +47,7 @@ struct bucket_counter {
> /* Bucket for use in groups. */
> struct ofputil_bucket {
> struct ovs_list list_node;
>+ uint16_t aux; /* Padding. Also used for temporary data. */
> uint16_t weight; /* Relative weight, for "select" groups. */
> ofp_port_t watch_port; /* Port whose state affects whether this
> bucket
> * is live. Only required for fast failover
>diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
>index c8baba1..df245c5 100644
>--- a/ofproto/ofproto-dpif-xlate.c
>+++ b/ofproto/ofproto-dpif-xlate.c
>@@ -4235,35 +4235,55 @@ xlate_hash_fields_select_group(struct xlate_ctx *ctx,
>struct group_dpif *group,
> }
> }
>
>+static struct ofputil_bucket *
>+group_dp_hash_best_bucket(struct xlate_ctx *ctx,
>+ const struct group_dpif *group,
>+ uint32_t dp_hash)
>+{
>+ struct ofputil_bucket *bucket, *best_bucket = NULL;
>+ uint32_t n_hash = group->hash_mask + 1;
>+
>+ uint32_t hash = dp_hash &= group->hash_mask;
>+ ctx->wc->masks.dp_hash |= group->hash_mask;
>+
>+ /* Starting from the original masked dp_hash value iterate over the
>+ * hash mapping table to find the first live bucket. As the buckets
>+ * are quasi-randomly spread over the hash values, this maintains
>+ * a distribution according to bucket weights even when some buckets
>+ * are non-live. */
>+ for (int i = 0; i < n_hash; i++) {
>+ bucket = group->hash_map[(hash + i) % n_hash];
>+ if (bucket_is_alive(ctx, bucket, 0)) {
>+ best_bucket = bucket;
>+ break;
>+ }
>+ }
>+
>+ return best_bucket;
>+}
>+
> static void
> xlate_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group,
> bool is_last_action)
> {
>- struct ofputil_bucket *bucket;
>-
> /* dp_hash value 0 is special since it means that the dp_hash has not been
> * computed, as all computed dp_hash values are non-zero. Therefore
> * compare to zero can be used to decide if the dp_hash value is valid
> * without masking the dp_hash field. */
> if (!ctx->xin->flow.dp_hash) {
>- uint64_t param = group->up.props.selection_method_param;
>-
>- ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param);
>+ ctx_trigger_recirculate_with_hash(ctx, group->hash_alg,
>+ group->hash_basis);
>+ if (ctx->xin->xcache) {
>+ ofproto_group_unref(&group->up);
>+ }
> } else {
>- uint32_t n_buckets = group->up.n_buckets;
>- if (n_buckets) {
>- /* Minimal mask to cover the number of buckets. */
>- uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1;
>- /* Multiplier chosen to make the trivial 1 bit case to
>- * actually distribute amongst two equal weight buckets. */
>- uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask);
>-
>- ctx->wc->masks.dp_hash |= mask;
>- bucket = group_best_live_bucket(ctx, group, basis);
>- if (bucket) {
>- xlate_group_bucket(ctx, bucket, is_last_action);
>- xlate_group_stats(ctx, group, bucket);
>- }
>+ struct ofputil_bucket *bucket =
>+ group_dp_hash_best_bucket(ctx, group, ctx->xin->flow.dp_hash);
>+ if (bucket) {
>+ xlate_group_bucket(ctx, bucket, is_last_action);
>+ xlate_group_stats(ctx, group, bucket);
>+ } else if (ctx->xin->xcache) {
>+ ofproto_group_unref(&group->up);
> }
> }
> }
>@@ -4272,8 +4292,6 @@ static void
> xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group,
> bool is_last_action)
> {
>- const char *selection_method = group->up.props.selection_method;
>-
> /* Select groups may access flow keys beyond L2 in order to
> * select a bucket. Recirculate as appropriate to make this possible.
> */
>@@ -4281,15 +4299,19 @@ xlate_select_group(struct xlate_ctx *ctx, struct
>group_dpif *group,
> ctx_trigger_freeze(ctx);
> }
>
>- if (selection_method[0] == '\0') {
>+ switch (group->selection_method) {
>+ case SEL_METHOD_DEFAULT:
> xlate_default_select_group(ctx, group, is_last_action);
>- } else if (!strcasecmp("hash", selection_method)) {
>+ break;
>+ case SEL_METHOD_HASH:
> xlate_hash_fields_select_group(ctx, group, is_last_action);
>- } else if (!strcasecmp("dp_hash", selection_method)) {
>+ break;
>+ case SEL_METHOD_DP_HASH:
> xlate_dp_hash_select_group(ctx, group, is_last_action);
>- } else {
>- /* Parsing of groups should ensure this never happens */
>+ break;
>+ default:
> OVS_NOT_REACHED();
>+ break;
> }
> }
>
>diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
>index 1ed82d0..e1a5c09 100644
>--- a/ofproto/ofproto-dpif.c
>+++ b/ofproto/ofproto-dpif.c
>@@ -32,6 +32,7 @@
> #include "lacp.h"
> #include "learn.h"
> #include "mac-learning.h"
>+#include "math.h"
> #include "mcast-snooping.h"
> #include "multipath.h"
> #include "netdev-vport.h"
>@@ -4717,6 +4718,143 @@ group_dpif_credit_stats(struct group_dpif *group,
> ovs_mutex_unlock(&group->stats_mutex);
> }
>
>+/* Calculate the dp_hash mask needed to provide the least weighted bucket
>+ * with at least one hash value and construct a mapping table from masked
>+ * dp_hash value to group bucket using the Webster method.
>+ * If the caller specifies a non-zero max_hash value, abort and return false
>+ * if more hash values would be required. The absolute maximum number of
>+ * hash values supported is 256. */
>+
>+#define MAX_SELECT_GROUP_HASH_VALUES 256
>+
>+static bool
>+group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
>+{
>+ struct ofputil_bucket *bucket;
>+ uint32_t n_buckets = group->up.n_buckets;
>+ double total_weight = 0.0;
>+ uint16_t min_weight = UINT16_MAX;
>+ uint32_t n_hash;
>+ struct webster {
>+ struct ofputil_bucket *bucket;
>+ uint32_t divisor;
>+ double value;
>+ } webster[group->up.n_buckets];
>+
>+ if (n_buckets == 0) {
>+ VLOG_DBG(" Don't apply dp_hash method without buckets");
>+ return false;
>+ }
>+
>+ int i = 0;
>+ LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
>+ bucket->aux = 0;
>+ if (bucket->weight > 0 && bucket->weight < min_weight) {
>+ min_weight = bucket->weight;
>+ }
>+ total_weight += bucket->weight;
>+ webster[i].bucket = bucket;
>+ webster[i].divisor = 1;
>+ webster[i].value = bucket->weight;
>+ i++;
>+ }
>+
>+ VLOG_DBG(" Minimum weight: %d, total weight: %.0f",
>+ min_weight, total_weight);
>+
>+ uint32_t min_slots = ceil(total_weight / min_weight);
>+ n_hash = MAX(16, 1L << log_2_ceil(min_slots));
>+
>+ if (n_hash > MAX_SELECT_GROUP_HASH_VALUES ||
>+ (max_hash != 0 && n_hash > max_hash)) {
>+ VLOG_DBG(" Too many hash values required: %d", n_hash);
>+ return false;
>+ }
>+
>+ VLOG_DBG(" Using %d hash values:", n_hash);
>+ group->hash_mask = n_hash - 1;
>+ if (group->hash_map) {
>+ free(group->hash_map);
>+ }
>+ group->hash_map = xcalloc(n_hash, sizeof(struct ofputil_bucket *));
>+
>+ /* Use Webster method to distribute hash values over buckets. */
>+ for (int hash = 0; hash < n_hash; hash++) {
>+ double max_val = 0.0;
>+ struct webster *winner;
>+ for (i = 0; i < n_buckets; i++) {
>+ if (webster[i].value > max_val) {
>+ max_val = webster[i].value;
>+ winner = &webster[i];
>+ }
>+ }
>+#pragma GCC diagnostic push
>+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
>+ /* winner is a reference to a webster[] element initialized above. */
>+ winner->divisor += 2;
>+ winner->value = (double) winner->bucket->weight / winner->divisor;
>+ group->hash_map[hash] = winner->bucket;
>+ winner->bucket->aux++;
>+#pragma GCC diagnostic pop
>+ }
>+
>+ LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
>+ double target = (n_hash * bucket->weight) / total_weight;
>+ VLOG_DBG(" Bucket %d: weight=%d, target=%.2f hits=%d",
>+ bucket->bucket_id, bucket->weight,
>+ target, bucket->aux);
>+ }
>+
>+ return true;
>+}
>+
>+static void
>+group_set_selection_method(struct group_dpif *group)
>+{
>+ struct ofputil_group_props *props = &group->up.props;
>+ char *selection_method = props->selection_method;
>+
>+ if (selection_method[0] == '\0') {
>+ VLOG_INFO("No selection method specified.");
>+ group->selection_method = SEL_METHOD_DEFAULT;
>+
>+ } else if (!strcmp(selection_method, "dp_hash")) {
>+ VLOG_INFO("Selection method specified: dp_hash.");
>+ /* Try to use dp_hash if possible at all. */
>+ if (group_setup_dp_hash_table(group, 0)) {
>+ group->selection_method = SEL_METHOD_DP_HASH;
>+ group->hash_alg = props->selection_method_param >> 32;
>+ if (group->hash_alg >= __OVS_HASH_MAX) {
>+ VLOG_INFO(" Invalid dp_hash algorithm %d. "
>+ "Defaulting to OVS_HASH_ALG_L4", group->hash_alg);
>+ group->hash_alg = OVS_HASH_ALG_L4;
>+ }
>+ group->hash_basis = (uint32_t) props->selection_method_param;
>+ } else {
>+ /* Fall back to original default hashing in slow path. */
>+ VLOG_INFO(" Falling back to default hash method.");
>+ group->selection_method = SEL_METHOD_DEFAULT;
>+ }
>+ } else if (!strcmp(selection_method, "hash")) {
>+ VLOG_INFO("Selection method specified: hash.");
>+ if (props->fields.values_size > 0) {
>+ /* Controller has specified hash fields. */
>+ struct ds s = DS_EMPTY_INITIALIZER;
>+ oxm_format_field_array(&s, &props->fields);
>+ VLOG_INFO(" Hash fields: %s", ds_cstr(&s));
>+ ds_destroy(&s);
>+ group->selection_method = SEL_METHOD_HASH;
>+ } else {
>+ /* No hash fields. Fall back to original default hashing. */
>+ VLOG_INFO(" No hash fields. Falling back to default hash
>method.");
>+ group->selection_method = SEL_METHOD_DEFAULT;
>+ }
>+ } else {
>+ /* Parsing of groups should ensure this never happens */
>+ OVS_NOT_REACHED();
>+ }
>+}
>+
> static enum ofperr
> group_construct(struct ofgroup *group_)
> {
>@@ -4725,6 +4863,10 @@ group_construct(struct ofgroup *group_)
> ovs_mutex_init_adaptive(&group->stats_mutex);
> ovs_mutex_lock(&group->stats_mutex);
> group_construct_stats(group);
>+ group->hash_map = NULL;
>+ if (group->up.type == OFPGT11_SELECT) {
>+ group_set_selection_method(group);
>+ }
> ovs_mutex_unlock(&group->stats_mutex);
> return 0;
> }
>@@ -4734,6 +4876,10 @@ group_destruct(struct ofgroup *group_)
> {
> struct group_dpif *group = group_dpif_cast(group_);
> ovs_mutex_destroy(&group->stats_mutex);
>+ if (group->hash_map) {
>+ free(group->hash_map);
>+ group->hash_map = NULL;
>+ }
> }
>
> static enum ofperr
>diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
>index 47bf7f9..880dd3d 100644
>--- a/ofproto/ofproto-dpif.h
>+++ b/ofproto/ofproto-dpif.h
>@@ -119,6 +119,12 @@ rule_dpif_is_internal(const struct rule_dpif *rule)
>
> /* Groups. */
>
>+enum group_selection_method {
>+ SEL_METHOD_DEFAULT,
>+ SEL_METHOD_DP_HASH,
>+ SEL_METHOD_HASH,
>+};
>+
> struct group_dpif {
> struct ofgroup up;
>
>@@ -129,6 +135,12 @@ struct group_dpif {
> struct ovs_mutex stats_mutex;
> uint64_t packet_count OVS_GUARDED; /* Number of packets received. */
> uint64_t byte_count OVS_GUARDED; /* Number of bytes received. */
>+
>+ enum group_selection_method selection_method;
>+ enum ovs_hash_alg hash_alg; /* dp_hash algorithm to be applied. */
>+ uint32_t hash_basis; /* Basis for dp_hash. */
>+ uint32_t hash_mask; /* Used to mask dp_hash (2^N - 1).*/
>+ struct ofputil_bucket **hash_map; /* Map hash values to buckets. */
> };
>
> void group_dpif_credit_stats(struct group_dpif *,
>@@ -137,6 +149,7 @@ void group_dpif_credit_stats(struct group_dpif *,
> struct group_dpif *group_dpif_lookup(struct ofproto_dpif *,
> uint32_t group_id, ovs_version_t version,
> bool take_ref);
>+
>
> /* Backers.
> *
>diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
>index 60f28e2..6c20208 100644
>--- a/tests/ofproto-dpif.at
>+++ b/tests/ofproto-dpif.at
>@@ -500,11 +500,19 @@ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
> AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
> done
>
>-AT_CHECK([ovs-appctl dpctl/dump-flows | sed
>'s/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed
>'s/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl
>+AT_CHECK([ovs-appctl dpctl/dump-flows | sed
>'s/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed
>'s/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl
> flow-dump from non-dpdk interfaces:
> recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no),
> packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1)
>-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:10
>-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:11
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:11
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:11
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:11
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> actions:11
> ])
>
> AT_CHECK([ovs-appctl revalidator/purge], [0])
>@@ -516,10 +524,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
> AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
> done
>
>-AT_CHECK([ovs-appctl dpctl/dump-flows | sed
>'s/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/'
>| strip_ufid | strip_used | sort], [0], [dnl
>+AT_CHECK([ovs-appctl dpctl/dump-flows | sed
>'s/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/\(actions:1\)[[01]]/\1X/'
>| strip_ufid | strip_used | sort], [0], [dnl
> flow-dump from non-dpdk interfaces:
> recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no),
> packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2)
>-recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> packets:15, bytes:1590, used:0.0s, actions:1X
>+recirc_id(0x2),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> packets:15, bytes:1590, used:0.0s, actions:1X
> ])
>
> OVS_VSWITCHD_STOP
>--
>1.9.1
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev