Add a new select group selection method "dp_hash", which uses minimal number of bits from the datapath calculated packet hash to inform the select group bucket selection. This makes the datapath flows more generic resulting in less upcalls to userspace, but adds recirculation prior to group selection.
Signed-off-by: Jarno Rajahalme <ja...@ovn.org> --- v3: Further improved documentation, and moved the NEWS piece to post-2.6. NEWS | 8 +++++ lib/ofp-util.c | 16 ++++------ ofproto/ofproto-dpif-xlate.c | 70 ++++++++++++++++++++++++++++++++++++++++++-- ofproto/ofproto-dpif.c | 11 ++++--- ofproto/ofproto-dpif.h | 4 +-- tests/ofproto-dpif.at | 38 ++++++++++++++++++++++++ tests/ofproto.at | 3 ++ utilities/ovs-ofctl.8.in | 38 ++++++++++++++++++++++-- 8 files changed, 166 insertions(+), 22 deletions(-) diff --git a/NEWS b/NEWS index 343f7f1..98d9702 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,14 @@ Post-v2.6.0 not accurate. - OpenFlow: * OFPT_PACKET_OUT messages are now supported in bundles. + * A new "selection_method=dp_hash" type for OpenFlow select group + bucket selection that uses the datapath computed 5-tuple hash + without making datapath flows match the 5-tuple fields, which + is useful for more efficient load balancing, for example. This + uses the Netronome extension to OpenFlow 1.5+ that allows + control over the OpenFlow select groups selection method. See + "selection_method" and related options in ovs-ofctl(8) for + details. - ovs-ofctl: * 'bundle' command now supports packet-out messages. * New syntax for 'ovs-ofctl packet-out' command, which uses the diff --git a/lib/ofp-util.c b/lib/ofp-util.c index c84ed17..f83e4a3 100644 --- a/lib/ofp-util.c +++ b/lib/ofp-util.c @@ -8904,28 +8904,24 @@ parse_group_prop_ntr_selection_method(struct ofpbuf *payload, return OFPERR_OFPBPC_BAD_VALUE; } - if (strcmp("hash", prop->selection_method)) { + if (strcmp("hash", prop->selection_method) + && strcmp("dp_hash", prop->selection_method)) { OFPPROP_LOG(&bad_ofmsg_rl, false, "ntr selection method '%s' is not supported", prop->selection_method); return OFPERR_OFPBPC_BAD_VALUE; } + /* 'method_len' is now non-zero. */ strcpy(gp->selection_method, prop->selection_method); gp->selection_method_param = ntohll(prop->selection_method_param); - if (!method_len && gp->selection_method_param) { - OFPPROP_LOG(&bad_ofmsg_rl, false, "ntr selection method parameter is " - "non-zero but selection method is empty"); - return OFPERR_OFPBPC_BAD_VALUE; - } - ofpbuf_pull(payload, sizeof *prop); fields_len = ntohs(prop->length) - sizeof *prop; - if (!method_len && fields_len) { - OFPPROP_LOG(&bad_ofmsg_rl, false, "ntr selection method parameter is " - "zero but fields are provided"); + if (fields_len && strcmp("hash", gp->selection_method)) { + OFPPROP_LOG(&bad_ofmsg_rl, false, "ntr selection method %s " + "does not support fields", gp->selection_method); return OFPERR_OFPBPC_BAD_VALUE; } diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 6854da3..9dd7206 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -346,6 +346,11 @@ struct xlate_ctx { * case at that point. */ bool freezing; + bool recirc_update_dp_hash; /* Generated recirculation will be preceded + * by datapath HASH action to get an updated + * dp_hash after recirculation. */ + uint32_t dp_hash_alg; + uint32_t dp_hash_basis; struct ofpbuf frozen_actions; const struct ofpact_controller *pause; @@ -408,6 +413,17 @@ ctx_trigger_freeze(struct xlate_ctx *ctx) ctx->freezing = true; } +static void +ctx_trigger_recirculate_with_hash(struct xlate_ctx *ctx, uint32_t type, + uint32_t basis) +{ + ctx->exit = true; + ctx->freezing = true; + ctx->recirc_update_dp_hash = true; + ctx->dp_hash_alg = type; + ctx->dp_hash_basis = basis; +} + static bool ctx_first_frozen_action(const struct xlate_ctx *ctx) { @@ -419,6 +435,7 @@ ctx_cancel_freeze(struct xlate_ctx *ctx) { if (ctx->freezing) { ctx->freezing = false; + ctx->recirc_update_dp_hash = false; ofpbuf_clear(&ctx->frozen_actions); ctx->frozen_actions.header = NULL; } @@ -1465,7 +1482,7 @@ group_first_live_bucket(const struct xlate_ctx *ctx, struct ofputil_bucket *bucket; const struct ovs_list *buckets; - buckets = group_dpif_get_buckets(group); + buckets = group_dpif_get_buckets(group, NULL); LIST_FOR_EACH (bucket, list_node, buckets) { if (bucket_is_alive(ctx, bucket, depth)) { return bucket; @@ -1486,7 +1503,7 @@ group_best_live_bucket(const struct xlate_ctx *ctx, struct ofputil_bucket *bucket; const struct ovs_list *buckets; - buckets = group_dpif_get_buckets(group); + buckets = group_dpif_get_buckets(group, NULL); LIST_FOR_EACH (bucket, list_node, buckets) { if (bucket_is_alive(ctx, bucket, 0)) { uint32_t score = @@ -3286,7 +3303,7 @@ xlate_all_group(struct xlate_ctx *ctx, struct group_dpif *group) struct ofputil_bucket *bucket; const struct ovs_list *buckets; - buckets = group_dpif_get_buckets(group); + buckets = group_dpif_get_buckets(group, NULL); LIST_FOR_EACH (bucket, list_node, buckets) { xlate_group_bucket(ctx, bucket); } @@ -3377,6 +3394,40 @@ xlate_hash_fields_select_group(struct xlate_ctx *ctx, struct group_dpif *group) } static void +xlate_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group) +{ + struct ofputil_bucket *bucket; + + /* dp_hash value 0 is special since it means that the dp_hash has not been + * computed, as all computed dp_hash values are non-zero. Therefore + * compare to zero can be used to decide if the dp_hash value is valid + * without masking the dp_hash field. */ + if (!ctx->xin->flow.dp_hash) { + uint64_t param = group_dpif_get_selection_method_param(group); + + ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param); + } else { + uint32_t n_buckets; + + group_dpif_get_buckets(group, &n_buckets); + if (n_buckets) { + /* Minimal mask to cover the number of buckets. */ + uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1; + /* Multiplier chosen to make the trivial 1 bit case to + * actually distribute amongst two equal weight buckets. */ + uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask); + + ctx->wc->masks.dp_hash |= mask; + bucket = group_best_live_bucket(ctx, group, basis); + if (bucket) { + xlate_group_bucket(ctx, bucket); + xlate_group_stats(ctx, group, bucket); + } + } + } +} + +static void xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group) { const char *selection_method = group_dpif_get_selection_method(group); @@ -3392,6 +3443,8 @@ xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group) xlate_default_select_group(ctx, group); } else if (!strcasecmp("hash", selection_method)) { xlate_hash_fields_select_group(ctx, group); + } else if (!strcasecmp("dp_hash", selection_method)) { + xlate_dp_hash_select_group(ctx, group); } else { /* Parsing of groups should ensure this never happens */ OVS_NOT_REACHED(); @@ -3650,6 +3703,16 @@ finish_freezing__(struct xlate_ctx *ctx, uint8_t table) } recirc_refs_add(&ctx->xout->recircs, id); + if (ctx->recirc_update_dp_hash) { + struct ovs_action_hash *act_hash; + + /* Hash action. */ + act_hash = nl_msg_put_unspec_uninit(ctx->odp_actions, + OVS_ACTION_ATTR_HASH, + sizeof *act_hash); + act_hash->hash_alg = OVS_HASH_ALG_L4; /* Make configurable. */ + act_hash->hash_basis = 0; /* Make configurable. */ + } nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, id); } @@ -5257,6 +5320,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) .mirrors = 0, .freezing = false, + .recirc_update_dp_hash = false, .frozen_actions = OFPBUF_STUB_INITIALIZER(frozen_actions_stub), .pause = NULL, diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index f6f5cd1..0cffca1 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -4500,7 +4500,7 @@ group_construct_stats(struct group_dpif *group) group->packet_count = 0; group->byte_count = 0; - buckets = group_dpif_get_buckets(group); + buckets = group_dpif_get_buckets(group, NULL); LIST_FOR_EACH (bucket, list_node, buckets) { bucket->stats.packet_count = 0; bucket->stats.byte_count = 0; @@ -4521,7 +4521,7 @@ group_dpif_credit_stats(struct group_dpif *group, } else { /* Credit to all buckets */ const struct ovs_list *buckets; - buckets = group_dpif_get_buckets(group); + buckets = group_dpif_get_buckets(group, NULL); LIST_FOR_EACH (bucket, list_node, buckets) { bucket->stats.packet_count += stats->n_packets; bucket->stats.byte_count += stats->n_bytes; @@ -4569,7 +4569,7 @@ group_get_stats(const struct ofgroup *group_, struct ofputil_group_stats *ogs) ogs->packet_count = group->packet_count; ogs->byte_count = group->byte_count; - buckets = group_dpif_get_buckets(group); + buckets = group_dpif_get_buckets(group, NULL); bucket_stats = ogs->bucket_stats; LIST_FOR_EACH (bucket, list_node, buckets) { bucket_stats->packet_count = bucket->stats.packet_count; @@ -4595,8 +4595,11 @@ group_dpif_lookup(struct ofproto_dpif *ofproto, uint32_t group_id, } const struct ovs_list * -group_dpif_get_buckets(const struct group_dpif *group) +group_dpif_get_buckets(const struct group_dpif *group, uint32_t *n_buckets) { + if (n_buckets) { + *n_buckets = group->up.n_buckets; + } return &group->up.buckets; } diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index ec8830d..67e1a99 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -145,8 +145,8 @@ void group_dpif_credit_stats(struct group_dpif *, struct group_dpif *group_dpif_lookup(struct ofproto_dpif *ofproto, uint32_t group_id, ovs_version_t version, bool take_ref); -const struct ovs_list *group_dpif_get_buckets(const struct group_dpif *group); - +const struct ovs_list *group_dpif_get_buckets(const struct group_dpif *group, + uint32_t *n_buckets); enum ofp11_group_type group_dpif_get_type(const struct group_dpif *group); const char *group_dpif_get_selection_method(const struct group_dpif *group); uint64_t group_dpif_get_selection_method_param(const struct group_dpif *group); diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index cc38858..de57efd 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -462,6 +462,44 @@ ovs-ofctl: selection_method_param is only allowed with "selection_method" OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - select group with dp_hash selection method]) +OVS_VSWITCHD_START +add_of_ports br0 1 10 11 +AT_CHECK([ovs-ofctl -O OpenFlow15 add-group br0 'group_id=1234,type=select,selection_method=dp_hash,bucket=output:10,bucket=output:11']) +AT_CHECK([ovs-ofctl -O OpenFlow15 add-flow br0 'ip,nw_src=192.168.0.1 actions=group:1234']) + +# Try a bunch of different flows and make sure that they get distributed +# at least somewhat. +for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + pkt="in_port(1),eth(src=50:54:00:00:00:07,dst=50:54:00:00:00:01),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.1.$d,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl +flow-dump from non-dpdk interfaces: +recirc_id(0),in_port(1),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:630, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1) +recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),eth_type(0x0800),ipv4(frag=no), actions:10 +recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),eth_type(0x0800),ipv4(frag=no), actions:11 +]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) + +# Try a bunch of different flows and make sure that they are not distributed +# as they only vary a field that is not hashed +for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do + pkt="in_port(1),eth(src=50:54:00:00:00:$d,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | strip_ufid | strip_used | sort], [0], [dnl +flow-dump from non-dpdk interfaces: +recirc_id(0),in_port(1),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:630, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2) +recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:630, used:0.0s, actions:11 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - fast failover group]) OVS_VSWITCHD_START add_of_ports br0 1 10 11 diff --git a/tests/ofproto.at b/tests/ofproto.at index 40e0e68..5a2ff19 100644 --- a/tests/ofproto.at +++ b/tests/ofproto.at @@ -429,6 +429,7 @@ AT_DATA([groups.txt], [dnl group_id=1233,type=select,selection_method=hash,bucket=output:10,bucket=output:11 group_id=1234,type=select,selection_method=hash,fields(eth_dst,ip_dst,tcp_dst),bucket=output:10,bucket=output:11 group_id=1235,type=all,bucket=actions=output:12,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11 +group_id=1236,type=select,selection_method=dp_hash,bucket=output:10,bucket=output:11 ]) AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn add-groups br0 groups.txt]) AT_CHECK([ovs-ofctl -F OXM-OpenFlow15 -O OpenFlow15 -vwarn dump-groups br0 1234], [0], [stdout]) @@ -441,6 +442,7 @@ AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn dump-groups br0], [0], [stdout]) AT_CHECK([strip_xids < stdout | sort], [0], [dnl group_id=1233,type=select,selection_method=hash,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11 group_id=1235,type=all,bucket=bucket_id:2,actions=output:12,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11 + group_id=1236,type=select,selection_method=dp_hash,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11 OFPST_GROUP_DESC reply (OF1.5): ]) AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn del-groups br0 group_id=1234]) @@ -448,6 +450,7 @@ AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn dump-groups br0], [0], [stdout]) AT_CHECK([strip_xids < stdout | sort], [0], [dnl group_id=1233,type=select,selection_method=hash,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11 group_id=1235,type=all,bucket=bucket_id:2,actions=output:12,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11 + group_id=1236,type=select,selection_method=dp_hash,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11 OFPST_GROUP_DESC reply (OF1.5): ]) AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn del-groups br0], [0]) diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in index 675c308..249b28a 100644 --- a/utilities/ovs-ofctl.8.in +++ b/utilities/ovs-ofctl.8.in @@ -2867,9 +2867,41 @@ This is a string of 1 to 15 bytes in length known to lower layers. This field is optional for \fBadd\-group\fR, \fBadd\-groups\fR and \fBmod\-group\fR commands on groups of type \fBselect\fR. Prohibited otherwise. The default value is the empty string. -.IP -Other than the empty string, \fBhash\fR is currently the only defined -selection method. +.RS +.IP \fBhash\fR +Use a hash computed over the fields specified with the \fBfields\fR +option, see below. \fBhash\fR uses the \fBselection_method_param\fR +as the hash basis. +.IP +Note that the hashed fields become exact matched by the datapath +flows. For example, if the TCP source port is hashed, the created +datapath flows will match the specific TCP source port value present +in the packet received. Since each TCP connection generally has a +different source port value, a separate datapath flow will be need to +be inserted for each TCP connection thus hashed to a select group +bucket. +.IP \fBdp_hash\fR +Use a datapath computed hash value. The hash algorithm varies accross +different datapath implementations. \fBdp_hash\fR uses the upper 32 +bits of the \fBselection_method_param\fR as the datapath hash +algorithm selector, which currently must always be 0, corresponding to +hash computation over the IP 5-tuple (selecting specific fields with +the \fBfields\fR option is not allowed with \fBdp_hash\fR). The lower +32 bits are used as the hash basis. +.IP +Using \fBdp_hash\fR has the advantage that it does not require the +generated datapath flows to exact match any additional packet header +fields. For example, even if multiple TCP connections thus hashed to +different select group buckets have different source port numbers, +generally all of them would be handled with a small set of already +established datapath flows, resulting in less latency for TCP SYN +packets. The downside is that the shared datapath flows must match +each packet twice, as the datapath hash value calculation happens only +when needed, and a second match is required to match some bits of its +value. This double-matching incurs a small additional latency cost +for each packet, but this latency is orders of magnitude less than the +latency of creating new datapath flows for new TCP connections. +.RE .IP This option will use a Netronome OpenFlow extension which is only supported when using Open vSwitch 2.4 and later with OpenFlow 1.5 and later. -- 2.1.4 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev