The current implementation of the "dp_hash" selection method suffers
from two deficiences: 1. The hash mask and hence the number of dp_hash
values is just large enough to cover the number of group buckets, but
does not consider the case that buckets have different weights. 2. The
xlate-time selection of best bucket from the masked dp_hash value often
results in bucket load distributions that are quite different from the
bucket weights because the number of available masked dp_hash values
is too small (2-6 bits compared to 32 bits of a full hash in the default
hash selection method).

This commit provides a more accurate implementation of the dp_hash
select group by applying the well known Webster method for distributing
a small number of "seats" fairly over the weighted "parties"
(see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method).
The dp_hash mask is autmatically chosen large enough to provide good
enough accuracy even with widely differing weights.

This distribution happens at group modification time and the resulting
table is stored with the group-dpif struct. At xlation time, we use the
masked dp_hash values as index to look up the assigned bucket.

If the bucket should not be live, we do a circular search over the
mapping table until we find the first live bucket. As the buckets in
the table are by construction in pseudo-random order with a frequency
according to their weight, this method maintains correct distribution
even if one or more buckets are non-live.

Xlation is further simplified by storing some derived select group state
at group construction in struct group-dpif in a form better suited for
xlation purposes.

Adapted the unit test case for dp_hash select group accordingly.

Signed-off-by: Jan Scheurich <jan.scheur...@ericsson.com>
Signed-off-by: Nitin Katiyar <nitin.kati...@ericsson.com>
Co-authored-by: Nitin Katiyar <nitin.kati...@ericsson.com>
---
 include/openvswitch/ofp-group.h |   1 +
 ofproto/ofproto-dpif-xlate.c    |  74 +++++++++++++-------
 ofproto/ofproto-dpif.c          | 146 ++++++++++++++++++++++++++++++++++++++++
 ofproto/ofproto-dpif.h          |  13 ++++
 tests/ofproto-dpif.at           |  18 +++--
 5 files changed, 221 insertions(+), 31 deletions(-)

diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h
index 8d893a5..af4033d 100644
--- a/include/openvswitch/ofp-group.h
+++ b/include/openvswitch/ofp-group.h
@@ -47,6 +47,7 @@ struct bucket_counter {
 /* Bucket for use in groups. */
 struct ofputil_bucket {
     struct ovs_list list_node;
+    uint16_t aux;               /* Padding. Also used for temporary data. */
     uint16_t weight;            /* Relative weight, for "select" groups. */
     ofp_port_t watch_port;      /* Port whose state affects whether this bucket
                                  * is live. Only required for fast failover
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index c8baba1..df245c5 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -4235,35 +4235,55 @@ xlate_hash_fields_select_group(struct xlate_ctx *ctx, 
struct group_dpif *group,
     }
 }
 
+static struct ofputil_bucket *
+group_dp_hash_best_bucket(struct xlate_ctx *ctx,
+                          const struct group_dpif *group,
+                          uint32_t dp_hash)
+{
+    struct ofputil_bucket *bucket, *best_bucket = NULL;
+    uint32_t n_hash = group->hash_mask + 1;
+
+    uint32_t hash = dp_hash &= group->hash_mask;
+    ctx->wc->masks.dp_hash |= group->hash_mask;
+
+    /* Starting from the original masked dp_hash value iterate over the
+     * hash mapping table to find the first live bucket. As the buckets
+     * are quasi-randomly spread over the hash values, this maintains
+     * a distribution according to bucket weights even when some buckets
+     * are non-live. */
+    for (int i = 0; i < n_hash; i++) {
+        bucket = group->hash_map[(hash + i) % n_hash];
+        if (bucket_is_alive(ctx, bucket, 0)) {
+            best_bucket = bucket;
+            break;
+        }
+    }
+
+    return best_bucket;
+}
+
 static void
 xlate_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group,
                            bool is_last_action)
 {
-    struct ofputil_bucket *bucket;
-
     /* dp_hash value 0 is special since it means that the dp_hash has not been
      * computed, as all computed dp_hash values are non-zero.  Therefore
      * compare to zero can be used to decide if the dp_hash value is valid
      * without masking the dp_hash field. */
     if (!ctx->xin->flow.dp_hash) {
-        uint64_t param = group->up.props.selection_method_param;
-
-        ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param);
+        ctx_trigger_recirculate_with_hash(ctx, group->hash_alg,
+                                          group->hash_basis);
+        if (ctx->xin->xcache) {
+            ofproto_group_unref(&group->up);
+        }
     } else {
-        uint32_t n_buckets = group->up.n_buckets;
-        if (n_buckets) {
-            /* Minimal mask to cover the number of buckets. */
-            uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1;
-            /* Multiplier chosen to make the trivial 1 bit case to
-             * actually distribute amongst two equal weight buckets. */
-            uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask);
-
-            ctx->wc->masks.dp_hash |= mask;
-            bucket = group_best_live_bucket(ctx, group, basis);
-            if (bucket) {
-                xlate_group_bucket(ctx, bucket, is_last_action);
-                xlate_group_stats(ctx, group, bucket);
-            }
+        struct ofputil_bucket *bucket =
+                group_dp_hash_best_bucket(ctx, group, ctx->xin->flow.dp_hash);
+        if (bucket) {
+            xlate_group_bucket(ctx, bucket, is_last_action);
+            xlate_group_stats(ctx, group, bucket);
+        } else if (ctx->xin->xcache) {
+            ofproto_group_unref(&group->up);
         }
     }
 }
@@ -4272,8 +4292,6 @@ static void
 xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group,
                    bool is_last_action)
 {
-    const char *selection_method = group->up.props.selection_method;
-
     /* Select groups may access flow keys beyond L2 in order to
      * select a bucket. Recirculate as appropriate to make this possible.
      */
@@ -4281,15 +4299,19 @@ xlate_select_group(struct xlate_ctx *ctx, struct 
group_dpif *group,
         ctx_trigger_freeze(ctx);
     }
 
-    if (selection_method[0] == '\0') {
+    switch (group->selection_method) {
+    case SEL_METHOD_DEFAULT:
         xlate_default_select_group(ctx, group, is_last_action);
-    } else if (!strcasecmp("hash", selection_method)) {
+        break;
+    case SEL_METHOD_HASH:
         xlate_hash_fields_select_group(ctx, group, is_last_action);
-    } else if (!strcasecmp("dp_hash", selection_method)) {
+        break;
+    case SEL_METHOD_DP_HASH:
         xlate_dp_hash_select_group(ctx, group, is_last_action);
-    } else {
-        /* Parsing of groups should ensure this never happens */
+        break;
+    default:
         OVS_NOT_REACHED();
+        break;
     }
 }
 
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 1ed82d0..e1a5c09 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -32,6 +32,7 @@
 #include "lacp.h"
 #include "learn.h"
 #include "mac-learning.h"
+#include "math.h"
 #include "mcast-snooping.h"
 #include "multipath.h"
 #include "netdev-vport.h"
@@ -4717,6 +4718,143 @@ group_dpif_credit_stats(struct group_dpif *group,
     ovs_mutex_unlock(&group->stats_mutex);
 }
 
+/* Calculate the dp_hash mask needed to provide the least weighted bucket
+ * with at least one hash value and construct a mapping table from masked
+ * dp_hash value to group bucket using the Webster method.
+ * If the caller specifies a non-zero max_hash value, abort and return false
+ * if more hash values would be required. The absolute maximum number of
+ * hash values supported is 256. */
+
+#define MAX_SELECT_GROUP_HASH_VALUES 256
+
+static bool
+group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
+{
+    struct ofputil_bucket *bucket;
+    uint32_t n_buckets = group->up.n_buckets;
+    double total_weight = 0.0;
+    uint16_t min_weight = UINT16_MAX;
+    uint32_t n_hash;
+    struct webster {
+        struct ofputil_bucket *bucket;
+        uint32_t divisor;
+        double value;
+    } webster[group->up.n_buckets];
+
+    if (n_buckets == 0) {
+        VLOG_DBG("  Don't apply dp_hash method without buckets");
+        return false;
+    }
+
+    int i = 0;
+    LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
+        bucket->aux = 0;
+        if (bucket->weight > 0 && bucket->weight < min_weight) {
+            min_weight = bucket->weight;
+        }
+        total_weight += bucket->weight;
+        webster[i].bucket = bucket;
+        webster[i].divisor = 1;
+        webster[i].value = bucket->weight;
+        i++;
+    }
+
+    VLOG_DBG("  Minimum weight: %d, total weight: %.0f",
+             min_weight, total_weight);
+
+    uint32_t min_slots = ceil(total_weight / min_weight);
+    n_hash = MAX(16, 1L << log_2_ceil(min_slots));
+
+    if (n_hash > MAX_SELECT_GROUP_HASH_VALUES ||
+        (max_hash != 0 && n_hash > max_hash)) {
+        VLOG_DBG("  Too many hash values required: %d", n_hash);
+        return false;
+    }
+
+    VLOG_DBG("  Using %d hash values:", n_hash);
+    group->hash_mask = n_hash - 1;
+    if (group->hash_map) {
+        free(group->hash_map);
+    }
+    group->hash_map = xcalloc(n_hash, sizeof(struct ofputil_bucket *));
+
+    /* Use Webster method to distribute hash values over buckets. */
+    for (int hash = 0; hash < n_hash; hash++) {
+        double max_val = 0.0;
+        struct webster *winner;
+        for (i = 0; i < n_buckets; i++) {
+            if (webster[i].value > max_val) {
+                max_val = webster[i].value;
+                winner = &webster[i];
+            }
+        }
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+        /* winner is a reference to a webster[] element initialized above. */
+        winner->divisor += 2;
+        winner->value = (double) winner->bucket->weight / winner->divisor;
+        group->hash_map[hash] = winner->bucket;
+        winner->bucket->aux++;
+#pragma GCC diagnostic pop
+    }
+
+    LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
+        double target = (n_hash * bucket->weight) / total_weight;
+        VLOG_DBG("  Bucket %d: weight=%d, target=%.2f hits=%d",
+                 bucket->bucket_id, bucket->weight,
+                 target, bucket->aux);
+    }
+
+    return true;
+}
+
+static void
+group_set_selection_method(struct group_dpif *group)
+{
+    struct ofputil_group_props *props = &group->up.props;
+    char *selection_method = props->selection_method;
+
+    if (selection_method[0] == '\0') {
+        VLOG_INFO("No selection method specified.");
+        group->selection_method = SEL_METHOD_DEFAULT;
+
+    } else if (!strcmp(selection_method, "dp_hash")) {
+        VLOG_INFO("Selection method specified: dp_hash.");
+        /* Try to use dp_hash if possible at all. */
+        if (group_setup_dp_hash_table(group, 0)) {
+            group->selection_method = SEL_METHOD_DP_HASH;
+            group->hash_alg = props->selection_method_param >> 32;
+            if (group->hash_alg >= __OVS_HASH_MAX) {
+                VLOG_INFO("  Invalid dp_hash algorithm %d. "
+                          "Defaulting to OVS_HASH_ALG_L4", group->hash_alg);
+                group->hash_alg = OVS_HASH_ALG_L4;
+            }
+            group->hash_basis = (uint32_t) props->selection_method_param;
+        } else {
+            /* Fall back to original default hashing in slow path. */
+            VLOG_INFO("  Falling back to default hash method.");
+            group->selection_method = SEL_METHOD_DEFAULT;
+        }
+    } else if (!strcmp(selection_method, "hash")) {
+        VLOG_INFO("Selection method specified: hash.");
+        if (props->fields.values_size > 0) {
+            /* Controller has specified hash fields. */
+            struct ds s = DS_EMPTY_INITIALIZER;
+            oxm_format_field_array(&s, &props->fields);
+            VLOG_INFO("  Hash fields: %s", ds_cstr(&s));
+            ds_destroy(&s);
+            group->selection_method = SEL_METHOD_HASH;
+        } else {
+            /* No hash fields. Fall back to original default hashing. */
+            VLOG_INFO("  No hash fields. Falling back to default hash 
method.");
+            group->selection_method = SEL_METHOD_DEFAULT;
+        }
+    } else {
+        /* Parsing of groups should ensure this never happens */
+        OVS_NOT_REACHED();
+    }
+}
+
 static enum ofperr
 group_construct(struct ofgroup *group_)
 {
@@ -4725,6 +4863,10 @@ group_construct(struct ofgroup *group_)
     ovs_mutex_init_adaptive(&group->stats_mutex);
     ovs_mutex_lock(&group->stats_mutex);
     group_construct_stats(group);
+    group->hash_map = NULL;
+    if (group->up.type == OFPGT11_SELECT) {
+        group_set_selection_method(group);
+    }
     ovs_mutex_unlock(&group->stats_mutex);
     return 0;
 }
@@ -4734,6 +4876,10 @@ group_destruct(struct ofgroup *group_)
 {
     struct group_dpif *group = group_dpif_cast(group_);
     ovs_mutex_destroy(&group->stats_mutex);
+    if (group->hash_map) {
+        free(group->hash_map);
+        group->hash_map = NULL;
+    }
 }
 
 static enum ofperr
diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
index 47bf7f9..880dd3d 100644
--- a/ofproto/ofproto-dpif.h
+++ b/ofproto/ofproto-dpif.h
@@ -119,6 +119,12 @@ rule_dpif_is_internal(const struct rule_dpif *rule)
 
 /* Groups. */
 
+enum group_selection_method {
+    SEL_METHOD_DEFAULT,
+    SEL_METHOD_DP_HASH,
+    SEL_METHOD_HASH,
+};
+
 struct group_dpif {
     struct ofgroup up;
 
@@ -129,6 +135,12 @@ struct group_dpif {
     struct ovs_mutex stats_mutex;
     uint64_t packet_count OVS_GUARDED;  /* Number of packets received. */
     uint64_t byte_count OVS_GUARDED;    /* Number of bytes received. */
+
+    enum group_selection_method selection_method;
+    enum ovs_hash_alg hash_alg;         /* dp_hash algorithm to be applied. */
+    uint32_t hash_basis;                /* Basis for dp_hash. */
+    uint32_t hash_mask;                 /* Used to mask dp_hash (2^N - 1).*/
+    struct ofputil_bucket **hash_map;   /* Map hash values to buckets. */
 };
 
 void group_dpif_credit_stats(struct group_dpif *,
@@ -137,6 +149,7 @@ void group_dpif_credit_stats(struct group_dpif *,
 struct group_dpif *group_dpif_lookup(struct ofproto_dpif *,
                                      uint32_t group_id, ovs_version_t version,
                                      bool take_ref);
+
 
 /* Backers.
  *
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 60f28e2..6c20208 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -500,11 +500,19 @@ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
     AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
 done
 
-AT_CHECK([ovs-appctl dpctl/dump-flows | sed 
's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 
's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl
+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 
's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 
's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl
 flow-dump from non-dpdk interfaces:
 
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no),
 packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1)
-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:10
-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:11
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:10
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:10
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:10
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:10
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:10
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:10
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:11
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:11
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:11
+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 actions:11
 ])
 
 AT_CHECK([ovs-appctl revalidator/purge], [0])
@@ -516,10 +524,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
     AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
 done
 
-AT_CHECK([ovs-appctl dpctl/dump-flows | sed 
's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/' | 
strip_ufid | strip_used | sort], [0], [dnl
+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 
's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/\(actions:1\)[[01]]/\1X/' | 
strip_ufid | strip_used | sort], [0], [dnl
 flow-dump from non-dpdk interfaces:
 
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no),
 packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2)
-recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:15, bytes:1590, used:0.0s, actions:1X
+recirc_id(0x2),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
 packets:15, bytes:1590, used:0.0s, actions:1X
 ])
 
 OVS_VSWITCHD_STOP
-- 
1.9.1

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to