Hi, Jan: I think the following code should also be modified + for (int hash = 0; hash < n_hash; hash++) { + double max_val = 0.0; + struct webster *winner; + for (i = 0; i < n_buckets; i++) { + if (webster[i].value > max_val) { =======================> if bucket->weight=0, and there is only one bucket with weight equal to 0, then winner will be null + max_val = webster[i].value; + winner = &webster[i]; + }
+ } Test like this command: ovs-ofctl add-group br-int -O openflow15 "group_id=2,type=select,selection_method=dp_hash,bucket=bucket_id=1,weight=0,actions=output:10" vswitchd crashed after command put. At 2018-04-16 22:26:27, "Jan Scheurich" <jan.scheur...@ericsson.com> wrote: >The current implementation of the "dp_hash" selection method suffers >from two deficiences: 1. The hash mask and hence the number of dp_hash >values is just large enough to cover the number of group buckets, but >does not consider the case that buckets have different weights. 2. The >xlate-time selection of best bucket from the masked dp_hash value often >results in bucket load distributions that are quite different from the >bucket weights because the number of available masked dp_hash values >is too small (2-6 bits compared to 32 bits of a full hash in the default >hash selection method). > >This commit provides a more accurate implementation of the dp_hash >select group by applying the well known Webster method for distributing >a small number of "seats" fairly over the weighted "parties" >(see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). >The dp_hash mask is autmatically chosen large enough to provide good >enough accuracy even with widely differing weights. > >This distribution happens at group modification time and the resulting >table is stored with the group-dpif struct. At xlation time, we use the >masked dp_hash values as index to look up the assigned bucket. > >If the bucket should not be live, we do a circular search over the >mapping table until we find the first live bucket. As the buckets in >the table are by construction in pseudo-random order with a frequency >according to their weight, this method maintains correct distribution >even if one or more buckets are non-live. > >Xlation is further simplified by storing some derived select group state >at group construction in struct group-dpif in a form better suited for >xlation purposes. > >Adapted the unit test case for dp_hash select group accordingly. > >Signed-off-by: Jan Scheurich <jan.scheur...@ericsson.com> >Signed-off-by: Nitin Katiyar <nitin.kati...@ericsson.com> >Co-authored-by: Nitin Katiyar <nitin.kati...@ericsson.com> >--- > include/openvswitch/ofp-group.h | 1 + > ofproto/ofproto-dpif-xlate.c | 74 +++++++++++++------- > ofproto/ofproto-dpif.c | 146 ++++++++++++++++++++++++++++++++++++++++ > ofproto/ofproto-dpif.h | 13 ++++ > tests/ofproto-dpif.at | 18 +++-- > 5 files changed, 221 insertions(+), 31 deletions(-) > >diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h >index 8d893a5..af4033d 100644 >--- a/include/openvswitch/ofp-group.h >+++ b/include/openvswitch/ofp-group.h >@@ -47,6 +47,7 @@ struct bucket_counter { > /* Bucket for use in groups. */ > struct ofputil_bucket { > struct ovs_list list_node; >+ uint16_t aux; /* Padding. Also used for temporary data. */ > uint16_t weight; /* Relative weight, for "select" groups. */ > ofp_port_t watch_port; /* Port whose state affects whether this > bucket > * is live. Only required for fast failover >diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c >index c8baba1..df245c5 100644 >--- a/ofproto/ofproto-dpif-xlate.c >+++ b/ofproto/ofproto-dpif-xlate.c >@@ -4235,35 +4235,55 @@ xlate_hash_fields_select_group(struct xlate_ctx *ctx, >struct group_dpif *group, > } > } > >+static struct ofputil_bucket * >+group_dp_hash_best_bucket(struct xlate_ctx *ctx, >+ const struct group_dpif *group, >+ uint32_t dp_hash) >+{ >+ struct ofputil_bucket *bucket, *best_bucket = NULL; >+ uint32_t n_hash = group->hash_mask + 1; >+ >+ uint32_t hash = dp_hash &= group->hash_mask; >+ ctx->wc->masks.dp_hash |= group->hash_mask; >+ >+ /* Starting from the original masked dp_hash value iterate over the >+ * hash mapping table to find the first live bucket. As the buckets >+ * are quasi-randomly spread over the hash values, this maintains >+ * a distribution according to bucket weights even when some buckets >+ * are non-live. */ >+ for (int i = 0; i < n_hash; i++) { >+ bucket = group->hash_map[(hash + i) % n_hash]; >+ if (bucket_is_alive(ctx, bucket, 0)) { >+ best_bucket = bucket; >+ break; >+ } >+ } >+ >+ return best_bucket; >+} >+ > static void > xlate_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group, > bool is_last_action) > { >- struct ofputil_bucket *bucket; >- > /* dp_hash value 0 is special since it means that the dp_hash has not been > * computed, as all computed dp_hash values are non-zero. Therefore > * compare to zero can be used to decide if the dp_hash value is valid > * without masking the dp_hash field. */ > if (!ctx->xin->flow.dp_hash) { >- uint64_t param = group->up.props.selection_method_param; >- >- ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param); >+ ctx_trigger_recirculate_with_hash(ctx, group->hash_alg, >+ group->hash_basis); >+ if (ctx->xin->xcache) { >+ ofproto_group_unref(&group->up); >+ } > } else { >- uint32_t n_buckets = group->up.n_buckets; >- if (n_buckets) { >- /* Minimal mask to cover the number of buckets. */ >- uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1; >- /* Multiplier chosen to make the trivial 1 bit case to >- * actually distribute amongst two equal weight buckets. */ >- uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask); >- >- ctx->wc->masks.dp_hash |= mask; >- bucket = group_best_live_bucket(ctx, group, basis); >- if (bucket) { >- xlate_group_bucket(ctx, bucket, is_last_action); >- xlate_group_stats(ctx, group, bucket); >- } >+ struct ofputil_bucket *bucket = >+ group_dp_hash_best_bucket(ctx, group, ctx->xin->flow.dp_hash); >+ if (bucket) { >+ xlate_group_bucket(ctx, bucket, is_last_action); >+ xlate_group_stats(ctx, group, bucket); >+ } else if (ctx->xin->xcache) { >+ ofproto_group_unref(&group->up); > } > } > } >@@ -4272,8 +4292,6 @@ static void > xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group, > bool is_last_action) > { >- const char *selection_method = group->up.props.selection_method; >- > /* Select groups may access flow keys beyond L2 in order to > * select a bucket. Recirculate as appropriate to make this possible. > */ >@@ -4281,15 +4299,19 @@ xlate_select_group(struct xlate_ctx *ctx, struct >group_dpif *group, > ctx_trigger_freeze(ctx); > } > >- if (selection_method[0] == '\0') { >+ switch (group->selection_method) { >+ case SEL_METHOD_DEFAULT: > xlate_default_select_group(ctx, group, is_last_action); >- } else if (!strcasecmp("hash", selection_method)) { >+ break; >+ case SEL_METHOD_HASH: > xlate_hash_fields_select_group(ctx, group, is_last_action); >- } else if (!strcasecmp("dp_hash", selection_method)) { >+ break; >+ case SEL_METHOD_DP_HASH: > xlate_dp_hash_select_group(ctx, group, is_last_action); >- } else { >- /* Parsing of groups should ensure this never happens */ >+ break; >+ default: > OVS_NOT_REACHED(); >+ break; > } > } > >diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c >index 1ed82d0..e1a5c09 100644 >--- a/ofproto/ofproto-dpif.c >+++ b/ofproto/ofproto-dpif.c >@@ -32,6 +32,7 @@ > #include "lacp.h" > #include "learn.h" > #include "mac-learning.h" >+#include "math.h" > #include "mcast-snooping.h" > #include "multipath.h" > #include "netdev-vport.h" >@@ -4717,6 +4718,143 @@ group_dpif_credit_stats(struct group_dpif *group, > ovs_mutex_unlock(&group->stats_mutex); > } > >+/* Calculate the dp_hash mask needed to provide the least weighted bucket >+ * with at least one hash value and construct a mapping table from masked >+ * dp_hash value to group bucket using the Webster method. >+ * If the caller specifies a non-zero max_hash value, abort and return false >+ * if more hash values would be required. The absolute maximum number of >+ * hash values supported is 256. */ >+ >+#define MAX_SELECT_GROUP_HASH_VALUES 256 >+ >+static bool >+group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash) >+{ >+ struct ofputil_bucket *bucket; >+ uint32_t n_buckets = group->up.n_buckets; >+ double total_weight = 0.0; >+ uint16_t min_weight = UINT16_MAX; >+ uint32_t n_hash; >+ struct webster { >+ struct ofputil_bucket *bucket; >+ uint32_t divisor; >+ double value; >+ } webster[group->up.n_buckets]; >+ >+ if (n_buckets == 0) { >+ VLOG_DBG(" Don't apply dp_hash method without buckets"); >+ return false; >+ } >+ >+ int i = 0; >+ LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { >+ bucket->aux = 0; >+ if (bucket->weight > 0 && bucket->weight < min_weight) { >+ min_weight = bucket->weight; >+ } >+ total_weight += bucket->weight; >+ webster[i].bucket = bucket; >+ webster[i].divisor = 1; >+ webster[i].value = bucket->weight; >+ i++; >+ } >+ >+ VLOG_DBG(" Minimum weight: %d, total weight: %.0f", >+ min_weight, total_weight); >+ >+ uint32_t min_slots = ceil(total_weight / min_weight); >+ n_hash = MAX(16, 1L << log_2_ceil(min_slots)); >+ >+ if (n_hash > MAX_SELECT_GROUP_HASH_VALUES || >+ (max_hash != 0 && n_hash > max_hash)) { >+ VLOG_DBG(" Too many hash values required: %d", n_hash); >+ return false; >+ } >+ >+ VLOG_DBG(" Using %d hash values:", n_hash); >+ group->hash_mask = n_hash - 1; >+ if (group->hash_map) { >+ free(group->hash_map); >+ } >+ group->hash_map = xcalloc(n_hash, sizeof(struct ofputil_bucket *)); >+ >+ /* Use Webster method to distribute hash values over buckets. */ >+ for (int hash = 0; hash < n_hash; hash++) { >+ double max_val = 0.0; >+ struct webster *winner; >+ for (i = 0; i < n_buckets; i++) { >+ if (webster[i].value > max_val) { >+ max_val = webster[i].value; >+ winner = &webster[i]; >+ } >+ } >+#pragma GCC diagnostic push >+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" >+ /* winner is a reference to a webster[] element initialized above. */ >+ winner->divisor += 2; >+ winner->value = (double) winner->bucket->weight / winner->divisor; >+ group->hash_map[hash] = winner->bucket; >+ winner->bucket->aux++; >+#pragma GCC diagnostic pop >+ } >+ >+ LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { >+ double target = (n_hash * bucket->weight) / total_weight; >+ VLOG_DBG(" Bucket %d: weight=%d, target=%.2f hits=%d", >+ bucket->bucket_id, bucket->weight, >+ target, bucket->aux); >+ } >+ >+ return true; >+} >+ >+static void >+group_set_selection_method(struct group_dpif *group) >+{ >+ struct ofputil_group_props *props = &group->up.props; >+ char *selection_method = props->selection_method; >+ >+ if (selection_method[0] == '\0') { >+ VLOG_INFO("No selection method specified."); >+ group->selection_method = SEL_METHOD_DEFAULT; >+ >+ } else if (!strcmp(selection_method, "dp_hash")) { >+ VLOG_INFO("Selection method specified: dp_hash."); >+ /* Try to use dp_hash if possible at all. */ >+ if (group_setup_dp_hash_table(group, 0)) { >+ group->selection_method = SEL_METHOD_DP_HASH; >+ group->hash_alg = props->selection_method_param >> 32; >+ if (group->hash_alg >= __OVS_HASH_MAX) { >+ VLOG_INFO(" Invalid dp_hash algorithm %d. " >+ "Defaulting to OVS_HASH_ALG_L4", group->hash_alg); >+ group->hash_alg = OVS_HASH_ALG_L4; >+ } >+ group->hash_basis = (uint32_t) props->selection_method_param; >+ } else { >+ /* Fall back to original default hashing in slow path. */ >+ VLOG_INFO(" Falling back to default hash method."); >+ group->selection_method = SEL_METHOD_DEFAULT; >+ } >+ } else if (!strcmp(selection_method, "hash")) { >+ VLOG_INFO("Selection method specified: hash."); >+ if (props->fields.values_size > 0) { >+ /* Controller has specified hash fields. */ >+ struct ds s = DS_EMPTY_INITIALIZER; >+ oxm_format_field_array(&s, &props->fields); >+ VLOG_INFO(" Hash fields: %s", ds_cstr(&s)); >+ ds_destroy(&s); >+ group->selection_method = SEL_METHOD_HASH; >+ } else { >+ /* No hash fields. Fall back to original default hashing. */ >+ VLOG_INFO(" No hash fields. Falling back to default hash >method."); >+ group->selection_method = SEL_METHOD_DEFAULT; >+ } >+ } else { >+ /* Parsing of groups should ensure this never happens */ >+ OVS_NOT_REACHED(); >+ } >+} >+ > static enum ofperr > group_construct(struct ofgroup *group_) > { >@@ -4725,6 +4863,10 @@ group_construct(struct ofgroup *group_) > ovs_mutex_init_adaptive(&group->stats_mutex); > ovs_mutex_lock(&group->stats_mutex); > group_construct_stats(group); >+ group->hash_map = NULL; >+ if (group->up.type == OFPGT11_SELECT) { >+ group_set_selection_method(group); >+ } > ovs_mutex_unlock(&group->stats_mutex); > return 0; > } >@@ -4734,6 +4876,10 @@ group_destruct(struct ofgroup *group_) > { > struct group_dpif *group = group_dpif_cast(group_); > ovs_mutex_destroy(&group->stats_mutex); >+ if (group->hash_map) { >+ free(group->hash_map); >+ group->hash_map = NULL; >+ } > } > > static enum ofperr >diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h >index 47bf7f9..880dd3d 100644 >--- a/ofproto/ofproto-dpif.h >+++ b/ofproto/ofproto-dpif.h >@@ -119,6 +119,12 @@ rule_dpif_is_internal(const struct rule_dpif *rule) > > /* Groups. */ > >+enum group_selection_method { >+ SEL_METHOD_DEFAULT, >+ SEL_METHOD_DP_HASH, >+ SEL_METHOD_HASH, >+}; >+ > struct group_dpif { > struct ofgroup up; > >@@ -129,6 +135,12 @@ struct group_dpif { > struct ovs_mutex stats_mutex; > uint64_t packet_count OVS_GUARDED; /* Number of packets received. */ > uint64_t byte_count OVS_GUARDED; /* Number of bytes received. */ >+ >+ enum group_selection_method selection_method; >+ enum ovs_hash_alg hash_alg; /* dp_hash algorithm to be applied. */ >+ uint32_t hash_basis; /* Basis for dp_hash. */ >+ uint32_t hash_mask; /* Used to mask dp_hash (2^N - 1).*/ >+ struct ofputil_bucket **hash_map; /* Map hash values to buckets. */ > }; > > void group_dpif_credit_stats(struct group_dpif *, >@@ -137,6 +149,7 @@ void group_dpif_credit_stats(struct group_dpif *, > struct group_dpif *group_dpif_lookup(struct ofproto_dpif *, > uint32_t group_id, ovs_version_t version, > bool take_ref); >+ > > /* Backers. > * >diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at >index 60f28e2..6c20208 100644 >--- a/tests/ofproto-dpif.at >+++ b/tests/ofproto-dpif.at >@@ -500,11 +500,19 @@ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do > AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt]) > done > >-AT_CHECK([ovs-appctl dpctl/dump-flows | sed >'s/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed >'s/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl >+AT_CHECK([ovs-appctl dpctl/dump-flows | sed >'s/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed >'s/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl > flow-dump from non-dpdk interfaces: > recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), > packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1) >-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:10 >-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:11 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:10 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:11 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:11 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:11 >+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > actions:11 > ]) > > AT_CHECK([ovs-appctl revalidator/purge], [0]) >@@ -516,10 +524,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do > AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt]) > done > >-AT_CHECK([ovs-appctl dpctl/dump-flows | sed >'s/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/' >| strip_ufid | strip_used | sort], [0], [dnl >+AT_CHECK([ovs-appctl dpctl/dump-flows | sed >'s/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/\(actions:1\)[[01]]/\1X/' >| strip_ufid | strip_used | sort], [0], [dnl > flow-dump from non-dpdk interfaces: > recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), > packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2) >-recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > packets:15, bytes:1590, used:0.0s, actions:1X >+recirc_id(0x2),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), > packets:15, bytes:1590, used:0.0s, actions:1X > ]) > > OVS_VSWITCHD_STOP >-- >1.9.1 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev