On Fri, Apr 27, 2018 at 02:20:33PM +0200, Jan Scheurich wrote:
> The current implementation of the "dp_hash" selection method suffers
> from two deficiencies: 1. The hash mask and hence the number of dp_hash
> values is just large enough to cover the number of group buckets, but
> does not consider the case that buckets have different weights. 2. The
> xlate-time selection of best bucket from the masked dp_hash value often
> results in bucket load distributions that are quite different from the
> bucket weights because the number of available masked dp_hash values
> is too small (2-6 bits compared to 32 bits of a full hash in the default
> hash selection method).
> 
> This commit provides a more accurate implementation of the dp_hash
> select group by applying the well known Webster method for distributing
> a small number of "seats" fairly over the weighted "parties"
> (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method).
> The dp_hash mask is automatically chosen large enough to provide good
> enough accuracy even with widely differing weights.
> 
> This distribution happens at group modification time and the resulting
> table is stored in struct group_dpif. At xlation time, we use the
> masked dp_hash value as an index to look up the assigned bucket.
> 
> If that bucket is not live, we do a circular search over the
> mapping table until we find the first live bucket. As the buckets in
> the table are by construction in pseudo-random order with a frequency
> according to their weight, this method maintains correct distribution
> even if one or more buckets are non-live.
> 
> Xlation is further simplified by storing some derived select group state
> in struct group_dpif at group construction time, in a form better suited
> for xlation.
> 
> Adapted the unit test case for dp_hash select group accordingly.
> 
> Signed-off-by: Jan Scheurich <[email protected]>
> Signed-off-by: Nitin Katiyar <[email protected]>
> Co-authored-by: Nitin Katiyar <[email protected]>

Thanks a lot.

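Just to make sure I follow the approach: the Webster/Sainte-Laguë
distribution from the commit message boils down to repeatedly giving the
next table slot to the bucket with the largest weight/divisor quotient and
then bumping that bucket's divisor by 2.  Here is a rough standalone sketch
of that idea (illustrative names and weights only, not the patch's code):

    /* Webster/Sainte-Laguë apportionment: distribute n_slots table entries
     * over buckets in proportion to their weights.  The real code is
     * group_setup_dp_hash_table(); this is just a toy version. */
    #include <stdio.h>

    #define N_BUCKETS 3

    int
    main(void)
    {
        int weights[N_BUCKETS] = { 10, 20, 30 };  /* example bucket weights */
        int divisors[N_BUCKETS] = { 1, 1, 1 };    /* Webster divisors: 1, 3, 5, ... */
        int hits[N_BUCKETS] = { 0, 0, 0 };
        int n_slots = 16;                         /* size of the hash mapping table */

        for (int slot = 0; slot < n_slots; slot++) {
            /* Give this slot to the bucket with the largest weight/divisor
             * quotient, then bump that bucket's divisor. */
            int winner = 0;
            double best = 0.0;
            for (int i = 0; i < N_BUCKETS; i++) {
                double value = (double) weights[i] / divisors[i];
                if (value > best) {
                    best = value;
                    winner = i;
                }
            }
            hits[winner]++;
            divisors[winner] += 2;
            /* The real code records the winning bucket in
             * group->hash_map[slot] at this point. */
        }

        for (int i = 0; i < N_BUCKETS; i++) {
            printf("bucket %d: weight %d -> %d of %d slots\n",
                   i, weights[i], hits[i], n_slots);
        }
        return 0;
    }

Since successive slots tend to go to different buckets, the table ends up
quasi-randomly interleaved, which is what keeps the distribution roughly
correct when the lookup later has to skip a dead bucket.
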
I don't think that the new 'aux' member in ofputil_bucket is too
useful.  It looks to me like the only use of it could be kept just as
easily in struct webster.

group_setup_dp_hash_table() uses floating-point arithmetic for good
reasons, but it seems to me that some of it is unnecessary, especially
since we have DIV_ROUND_UP and ROUND_UP_POW2.
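
For instance, since total_weight and min_weight are integers, something
along these lines keeps the sizing computation in integer math (the macro
and helper below are stand-ins just for illustration; the real
DIV_ROUND_UP and ROUND_UP_POW2 live in lib/util.h):

    #include <assert.h>
    #include <math.h>
    #include <stdint.h>

    /* Stand-in definitions for this example only. */
    #define DIV_ROUND_UP(X, Y) (((X) + (Y) - 1) / (Y))

    static uint64_t
    round_up_pow2(uint64_t x)
    {
        uint64_t p = 1;
        while (p < x) {
            p <<= 1;
        }
        return p;
    }

    int
    main(void)
    {
        uint64_t total_weight = 70;   /* e.g. buckets weighted 10 + 20 + 40 */
        uint64_t min_weight = 10;

        /* Old: ceil() on doubles plus 1 << log_2_ceil(); new: integer math. */
        uint64_t min_slots = DIV_ROUND_UP(total_weight, min_weight);
        uint64_t n_hash = round_up_pow2(min_slots);

        assert(min_slots == (uint64_t) ceil((double) total_weight / min_weight));
        assert(n_hash == 8);          /* 7 rounded up to the next power of 2;
                                       * the patch then clamps to at least 16. */
        return 0;
    }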

group_dp_hash_best_bucket() seems like it unnecessarily modifies its
dp_hash parameter (and then never uses it again) and unnecessarily uses
% when & would work.  I also saw a few ways to make the style better
match what we most often do these days.
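
The % vs. & part is just the usual power-of-two trick: n_hash is always a
power of two, so with hash_mask == n_hash - 1, (x % n_hash) equals
(x & hash_mask) and the wrap-around in the circular search can stay a plain
mask.  A rough standalone illustration of the lookup (invented table and a
trivial liveness stub, not the patch's code):

    #include <stdbool.h>
    #include <stdio.h>

    #define N_HASH 8                 /* must be a power of two */
    #define HASH_MASK (N_HASH - 1)

    /* Trivial stand-in for the real bucket_is_alive(). */
    static bool
    bucket_is_alive(int bucket)
    {
        return bucket != 1;          /* pretend bucket 1 is down */
    }

    int
    main(void)
    {
        /* One bucket id per masked dp_hash value, as filled in by the
         * Webster distribution. */
        int hash_map[N_HASH] = { 0, 1, 2, 0, 1, 2, 0, 1 };
        unsigned int dp_hash = 0x5d; /* example hash from the datapath */

        /* Start at the masked dp_hash value and probe forward, wrapping
         * with & instead of %, until a live bucket is found. */
        for (int i = 0; i <= HASH_MASK; i++) {
            unsigned int slot = (dp_hash + i) & HASH_MASK;
            if (bucket_is_alive(hash_map[slot])) {
                printf("dp_hash 0x%x -> slot %u -> bucket %d\n",
                       dp_hash, slot, hash_map[slot]);
                return 0;
            }
        }
        printf("no live bucket\n");
        return 1;
    }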

So here's an incremental that I suggest folding in for v4.  What do you
think?

Thanks,

Ben.

--8<--------------------------cut here-------------------------->8--

diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h
index af4033dc68e4..8d893a53fcb2 100644
--- a/include/openvswitch/ofp-group.h
+++ b/include/openvswitch/ofp-group.h
@@ -47,7 +47,6 @@ struct bucket_counter {
 /* Bucket for use in groups. */
 struct ofputil_bucket {
     struct ovs_list list_node;
-    uint16_t aux;               /* Padding. Also used for temporary data. */
     uint16_t weight;            /* Relative weight, for "select" groups. */
     ofp_port_t watch_port;      /* Port whose state affects whether this bucket
                                  * is live. Only required for fast failover
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index e35582df0c37..1c78c2d7ca50 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -4386,26 +4386,22 @@ group_dp_hash_best_bucket(struct xlate_ctx *ctx,
                           const struct group_dpif *group,
                           uint32_t dp_hash)
 {
-    struct ofputil_bucket *bucket, *best_bucket = NULL;
-    uint32_t n_hash = group->hash_mask + 1;
-
-    uint32_t hash = dp_hash &= group->hash_mask;
-    ctx->wc->masks.dp_hash |= group->hash_mask;
+    uint32_t hash_mask = group->hash_mask;
+    ctx->wc->masks.dp_hash |= hash_mask;
 
     /* Starting from the original masked dp_hash value iterate over the
      * hash mapping table to find the first live bucket. As the buckets
      * are quasi-randomly spread over the hash values, this maintains
      * a distribution according to bucket weights even when some buckets
      * are non-live. */
-    for (int i = 0; i < n_hash; i++) {
-        bucket = group->hash_map[(hash + i) % n_hash];
-        if (bucket_is_alive(ctx, bucket, 0)) {
-            best_bucket = bucket;
-            break;
+    for (int i = 0; i <= hash_mask; i++) {
+        struct ofputil_bucket *b = group->hash_map[(dp_hash + i) & hash_mask];
+        if (bucket_is_alive(ctx, b, 0)) {
+            return b;
         }
     }
 
-    return best_bucket;
+    return NULL;
 }
 
 static void
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index f5ecd8be8d05..c9c2e5176e46 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -4777,13 +4777,13 @@ group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
 {
     struct ofputil_bucket *bucket;
     uint32_t n_buckets = group->up.n_buckets;
-    double total_weight = 0.0;
+    uint64_t total_weight = 0;
     uint16_t min_weight = UINT16_MAX;
-    uint32_t n_hash;
     struct webster {
         struct ofputil_bucket *bucket;
         uint32_t divisor;
         double value;
+        int hits;
     } *webster;
 
     if (n_buckets == 0) {
@@ -4794,7 +4794,6 @@ group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
     webster = xcalloc(n_buckets, sizeof(struct webster));
     int i = 0;
     LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
-        bucket->aux = 0;
         if (bucket->weight > 0 && bucket->weight < min_weight) {
             min_weight = bucket->weight;
         }
@@ -4802,6 +4801,7 @@ group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
         webster[i].bucket = bucket;
         webster[i].divisor = 1;
         webster[i].value = bucket->weight;
+        webster[i].hits = 0;
         i++;
     }
 
@@ -4810,19 +4810,19 @@ group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
         free(webster);
         return false;
     }
-    VLOG_DBG("  Minimum weight: %d, total weight: %.0f",
+    VLOG_DBG("  Minimum weight: %d, total weight: %"PRIu64,
              min_weight, total_weight);
 
-    uint32_t min_slots = ceil(total_weight / min_weight);
-    n_hash = MAX(16, 1L << log_2_ceil(min_slots));
-
+    uint64_t min_slots = DIV_ROUND_UP(total_weight, min_weight);
+    uint64_t min_slots2 = ROUND_UP_POW2(min_slots);
+    uint64_t n_hash = MAX(16, min_slots2);
     if (n_hash > MAX_SELECT_GROUP_HASH_VALUES ||
         (max_hash != 0 && n_hash > max_hash)) {
-        VLOG_DBG("  Too many hash values required: %d", n_hash);
+        VLOG_DBG("  Too many hash values required: %"PRIu64, n_hash);
         return false;
     }
 
-    VLOG_DBG("  Using %d hash values:", n_hash);
+    VLOG_DBG("  Using %"PRIu64" hash values:", n_hash);
     group->hash_mask = n_hash - 1;
     if (group->hash_map) {
         free(group->hash_map);
@@ -4837,17 +4837,19 @@ group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
                 winner = &webster[i];
             }
         }
+        winner->hits++;
         winner->divisor += 2;
         winner->value = (double) winner->bucket->weight / winner->divisor;
         group->hash_map[hash] = winner->bucket;
-        winner->bucket->aux++;
     }
 
+    i = 0;
     LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
-        double target = (n_hash * bucket->weight) / total_weight;
+        double target = (n_hash * bucket->weight) / (double) total_weight;
         VLOG_DBG("  Bucket %d: weight=%d, target=%.2f hits=%d",
                  bucket->bucket_id, bucket->weight,
-                 target, bucket->aux);
+                 target, webster[i].hits);
+        i++;
     }
 
     free(webster);