On Fri, Apr 27, 2018 at 02:20:33PM +0200, Jan Scheurich wrote:
> The current implementation of the "dp_hash" selection method suffers
> from two deficiencies: 1. The hash mask and hence the number of dp_hash
> values is just large enough to cover the number of group buckets, but
> does not consider the case that buckets have different weights. 2. The
> xlate-time selection of best bucket from the masked dp_hash value often
> results in bucket load distributions that are quite different from the
> bucket weights because the number of available masked dp_hash values
> is too small (2-6 bits compared to 32 bits of a full hash in the default
> hash selection method).
>
> This commit provides a more accurate implementation of the dp_hash
> select group by applying the well-known Webster method for distributing
> a small number of "seats" fairly over the weighted "parties"
> (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method).
> The dp_hash mask is automatically chosen large enough to provide good
> enough accuracy even with widely differing weights.
>
> This distribution happens at group modification time and the resulting
> table is stored with the group_dpif struct. At xlation time, we use the
> masked dp_hash values as index to look up the assigned bucket.
>
> If the assigned bucket is not live, we do a circular search over the
> mapping table until we find the first live bucket. As the buckets in
> the table are by construction in pseudo-random order with a frequency
> according to their weight, this method maintains correct distribution
> even if one or more buckets are non-live.
>
> Xlation is further simplified by storing some derived select group state
> at group construction in struct group_dpif in a form better suited for
> xlation purposes.
>
> Adapted the unit test case for dp_hash select group accordingly.
>
> Signed-off-by: Jan Scheurich <[email protected]>
> Signed-off-by: Nitin Katiyar <[email protected]>
> Co-authored-by: Nitin Katiyar <[email protected]>
Thanks a lot.
I don't think that the new 'aux' member in ofputil_bucket is too
useful. It looks to me like the only use of it could be kept just as
easily in struct webster.
group_setup_dp_hash_table() uses floating-point arithmetic for good
reasons, but it seems to me that some of it is unnecessary, especially
since we have DIV_ROUND_UP and ROUND_UP_POW2.
group_dp_hash_best_bucket() seems like it unnecessarily modifies its
dp_hash parameter (and then never uses it again) and unnecessarily uses
% when & would work. I also saw a few ways to make the style better
match what we most often do these days.
So here's an incremental that I suggest folding in for v4. What do you
think?
Thanks,
Ben.
--8<--------------------------cut here-------------------------->8--
diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h
index af4033dc68e4..8d893a53fcb2 100644
--- a/include/openvswitch/ofp-group.h
+++ b/include/openvswitch/ofp-group.h
@@ -47,7 +47,6 @@ struct bucket_counter {
/* Bucket for use in groups. */
struct ofputil_bucket {
struct ovs_list list_node;
- uint16_t aux; /* Padding. Also used for temporary data. */
uint16_t weight; /* Relative weight, for "select" groups. */
ofp_port_t watch_port; /* Port whose state affects whether this bucket
* is live. Only required for fast failover
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index e35582df0c37..1c78c2d7ca50 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -4386,26 +4386,22 @@ group_dp_hash_best_bucket(struct xlate_ctx *ctx,
const struct group_dpif *group,
uint32_t dp_hash)
{
- struct ofputil_bucket *bucket, *best_bucket = NULL;
- uint32_t n_hash = group->hash_mask + 1;
-
- uint32_t hash = dp_hash &= group->hash_mask;
- ctx->wc->masks.dp_hash |= group->hash_mask;
+ uint32_t hash_mask = group->hash_mask;
+ ctx->wc->masks.dp_hash |= hash_mask;
/* Starting from the original masked dp_hash value iterate over the
* hash mapping table to find the first live bucket. As the buckets
* are quasi-randomly spread over the hash values, this maintains
* a distribution according to bucket weights even when some buckets
* are non-live. */
- for (int i = 0; i < n_hash; i++) {
- bucket = group->hash_map[(hash + i) % n_hash];
- if (bucket_is_alive(ctx, bucket, 0)) {
- best_bucket = bucket;
- break;
+ for (int i = 0; i <= hash_mask; i++) {
+ struct ofputil_bucket *b = group->hash_map[(dp_hash + i) & hash_mask];
+ if (bucket_is_alive(ctx, b, 0)) {
+ return b;
}
}
- return best_bucket;
+ return NULL;
}
static void
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index f5ecd8be8d05..c9c2e5176e46 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -4777,13 +4777,13 @@ group_setup_dp_hash_table(struct group_dpif *group,
size_t max_hash)
{
struct ofputil_bucket *bucket;
uint32_t n_buckets = group->up.n_buckets;
- double total_weight = 0.0;
+ uint64_t total_weight = 0;
uint16_t min_weight = UINT16_MAX;
- uint32_t n_hash;
struct webster {
struct ofputil_bucket *bucket;
uint32_t divisor;
double value;
+ int hits;
} *webster;
if (n_buckets == 0) {
@@ -4794,7 +4794,6 @@ group_setup_dp_hash_table(struct group_dpif *group,
size_t max_hash)
webster = xcalloc(n_buckets, sizeof(struct webster));
int i = 0;
LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
- bucket->aux = 0;
if (bucket->weight > 0 && bucket->weight < min_weight) {
min_weight = bucket->weight;
}
@@ -4802,6 +4801,7 @@ group_setup_dp_hash_table(struct group_dpif *group,
size_t max_hash)
webster[i].bucket = bucket;
webster[i].divisor = 1;
webster[i].value = bucket->weight;
+ webster[i].hits = 0;
i++;
}
@@ -4810,19 +4810,19 @@ group_setup_dp_hash_table(struct group_dpif *group,
size_t max_hash)
free(webster);
return false;
}
- VLOG_DBG(" Minimum weight: %d, total weight: %.0f",
+ VLOG_DBG(" Minimum weight: %d, total weight: %"PRIu64,
min_weight, total_weight);
- uint32_t min_slots = ceil(total_weight / min_weight);
- n_hash = MAX(16, 1L << log_2_ceil(min_slots));
-
+ uint64_t min_slots = DIV_ROUND_UP(total_weight, min_weight);
+ uint64_t min_slots2 = ROUND_UP_POW2(min_slots);
+ uint64_t n_hash = MAX(16, min_slots2);
if (n_hash > MAX_SELECT_GROUP_HASH_VALUES ||
(max_hash != 0 && n_hash > max_hash)) {
- VLOG_DBG(" Too many hash values required: %d", n_hash);
+ VLOG_DBG(" Too many hash values required: %"PRIu64, n_hash);
return false;
}
- VLOG_DBG(" Using %d hash values:", n_hash);
+ VLOG_DBG(" Using %"PRIu64" hash values:", n_hash);
group->hash_mask = n_hash - 1;
if (group->hash_map) {
free(group->hash_map);
@@ -4837,17 +4837,19 @@ group_setup_dp_hash_table(struct group_dpif *group,
size_t max_hash)
winner = &webster[i];
}
}
+ winner->hits++;
winner->divisor += 2;
winner->value = (double) winner->bucket->weight / winner->divisor;
group->hash_map[hash] = winner->bucket;
- winner->bucket->aux++;
}
+ i = 0;
LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
- double target = (n_hash * bucket->weight) / total_weight;
+ double target = (n_hash * bucket->weight) / (double) total_weight;
VLOG_DBG(" Bucket %d: weight=%d, target=%.2f hits=%d",
bucket->bucket_id, bucket->weight,
- target, bucket->aux);
+ target, webster[i].hits);
+ i++;
}
free(webster);
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev