Before the patch, for switches with a localnet port,
- traffic to a multichassis port was funneled through tunnels; but
- traffic from a multichassis port was sent through localnet.
This is not optimal because:
- bidirectional sessions are sent through two separate paths;
- leaking multichassis traffic to physical makes upstream switch flip
port's mac address location.
One of the effects of this behavior is prolonged network downtime when
live migrating a VM attached to a physical network (compared to
downtime observed for pure geneve networks).
As per my unscientific local testing, this patch reduces network
downtime during OpenStack VM live migration from 1.0s+ to 0.05-0.10s,
on par with geneve networks.
Fixes: 10398c1f51d5 ("Clone packets to all port chassis")
Signed-off-by: Ihar Hrachyshka <[email protected]>
---
v2: fixed a memory leak caught by asan job.
v1: initial commit.
---
controller/binding.c | 38 ++++++++++++++
controller/local_data.c | 30 ++++++++++-
controller/local_data.h | 6 +++
controller/ovn-controller.c | 6 +++
controller/physical.c | 99 +++++++++++++++++++++++++++++++++++--
controller/physical.h | 1 +
tests/ovn.at | 30 +++++++++++
7 files changed, 206 insertions(+), 4 deletions(-)
diff --git a/controller/binding.c b/controller/binding.c
index 2279570f9..9025681db 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -386,6 +386,23 @@ update_ld_external_ports(const struct sbrec_port_binding
*binding_rec,
}
}
+static void
+update_ld_multichassis_ports(const struct sbrec_port_binding *binding_rec,
+ struct hmap *local_datapaths)
+{
+ struct local_datapath *ld = get_local_datapath(
+ local_datapaths, binding_rec->datapath->tunnel_key);
+ if (!ld) {
+ return;
+ }
+ if (binding_rec->additional_chassis) {
+ add_local_datapath_multichassis_port(ld, binding_rec->logical_port,
+ binding_rec);
+ } else {
+ remove_local_datapath_multichassis_port(ld, binding_rec->logical_port);
+ }
+}
+
static void
update_ld_localnet_port(const struct sbrec_port_binding *binding_rec,
struct shash *bridge_mappings,
@@ -1752,6 +1769,8 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct
binding_ctx_out *b_ctx_out)
struct ovs_list localnet_lports = OVS_LIST_INITIALIZER(&localnet_lports);
struct ovs_list external_lports = OVS_LIST_INITIALIZER(&external_lports);
+ struct ovs_list multichassis_ports = OVS_LIST_INITIALIZER(
+ &multichassis_ports);
struct lport {
struct ovs_list list_node;
@@ -1787,6 +1806,13 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct
binding_ctx_out *b_ctx_out)
case LP_VIF:
consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map_ptr);
+ if (pb->additional_chassis) {
+ struct lport *multichassis_lport = xmalloc(
+ sizeof *multichassis_lport);
+ multichassis_lport->pb = pb;
+ ovs_list_push_back(&multichassis_ports,
+ &multichassis_lport->list_node);
+ }
break;
case LP_CONTAINER:
@@ -1862,6 +1888,16 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct
binding_ctx_out *b_ctx_out)
free(ext_lport);
}
+ /* Run through multichassis lport list to see if these are ports
+ * on local datapaths discovered from above loop, and update the
+ * corresponding local datapath accordingly. */
+ struct lport *multichassis_lport;
+ LIST_FOR_EACH_POP (multichassis_lport, list_node, &multichassis_ports) {
+ update_ld_multichassis_ports(multichassis_lport->pb,
+ b_ctx_out->local_datapaths);
+ free(multichassis_lport);
+ }
+
shash_destroy(&bridge_mappings);
if (!sset_is_empty(b_ctx_out->egress_ifaces)
@@ -1934,6 +1970,7 @@ remove_pb_from_local_datapath(const struct
sbrec_port_binding *pb,
} else if (!strcmp(pb->type, "external")) {
remove_local_datapath_external_port(ld, pb->logical_port);
}
+ remove_local_datapath_multichassis_port(ld, pb->logical_port);
}
static void
@@ -2677,6 +2714,7 @@ delete_done:
case LP_VIF:
case LP_CONTAINER:
case LP_VIRTUAL:
+ update_ld_multichassis_ports(pb, b_ctx_out->local_datapaths);
handled = handle_updated_vif_lport(pb, lport_type, b_ctx_in,
b_ctx_out, qos_map_ptr);
break;
diff --git a/controller/local_data.c b/controller/local_data.c
index 98445902b..7f874fc19 100644
--- a/controller/local_data.c
+++ b/controller/local_data.c
@@ -72,6 +72,7 @@ local_datapath_alloc(const struct sbrec_datapath_binding *dp)
ld->is_switch = datapath_is_switch(dp);
ld->is_transit_switch = datapath_is_transit_switch(dp);
shash_init(&ld->external_ports);
+ shash_init(&ld->multichassis_ports);
/* memory accounting - common part. */
local_datapath_usage += sizeof *ld;
@@ -97,13 +98,20 @@ local_datapath_destroy(struct local_datapath *ld)
SHASH_FOR_EACH (node, &ld->external_ports) {
local_datapath_usage -= strlen(node->name);
}
- local_datapath_usage -= shash_count(&ld->external_ports) * sizeof *node;
+ SHASH_FOR_EACH (node, &ld->multichassis_ports) {
+ local_datapath_usage -= strlen(node->name);
+ }
+ local_datapath_usage -= (shash_count(&ld->external_ports)
+ * sizeof *node);
+ local_datapath_usage -= (shash_count(&ld->multichassis_ports)
+ * sizeof *node);
local_datapath_usage -= sizeof *ld;
local_datapath_usage -=
ld->n_allocated_peer_ports * sizeof *ld->peer_ports;
free(ld->peer_ports);
shash_destroy(&ld->external_ports);
+ shash_destroy(&ld->multichassis_ports);
free(ld);
}
@@ -274,6 +282,26 @@ remove_local_datapath_external_port(struct local_datapath *ld,
}
}
+void
+add_local_datapath_multichassis_port(struct local_datapath *ld,
+ char *logical_port, const void *data)
+{
+ if (!shash_replace(&ld->multichassis_ports, logical_port, data)) {
+ local_datapath_usage += sizeof(struct shash_node) +
+ strlen(logical_port);
+ }
+}
+
+void
+remove_local_datapath_multichassis_port(struct local_datapath *ld,
+ char *logical_port)
+{
+ if (shash_find_and_delete(&ld->multichassis_ports, logical_port)) {
+ local_datapath_usage -= sizeof(struct shash_node) +
+ strlen(logical_port);
+ }
+}
+
void
local_datapath_memory_usage(struct simap *usage)
{
diff --git a/controller/local_data.h b/controller/local_data.h
index 9306ddf15..d898c8aa5 100644
--- a/controller/local_data.h
+++ b/controller/local_data.h
@@ -58,6 +58,7 @@ struct local_datapath {
size_t n_allocated_peer_ports;
struct shash external_ports;
+ struct shash multichassis_ports;
};
struct local_datapath *local_datapath_alloc(
@@ -155,5 +156,10 @@ void add_local_datapath_external_port(struct
local_datapath *ld,
char *logical_port, const void *data);
void remove_local_datapath_external_port(struct local_datapath *ld,
char *logical_port);
+void add_local_datapath_multichassis_port(struct local_datapath *ld,
+ char *logical_port,
+ const void *data);
+void remove_local_datapath_multichassis_port(struct local_datapath *ld,
+ char *logical_port);
#endif /* controller/local_data.h */
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 69615308e..2e9138036 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -3000,6 +3000,11 @@ static void init_physical_ctx(struct engine_node *node,
engine_get_input("SB_port_binding", node),
"name");
+ struct ovsdb_idl_index *sbrec_port_binding_by_datapath =
+ engine_ovsdb_node_get_index(
+ engine_get_input("SB_port_binding", node),
+ "datapath");
+
struct sbrec_multicast_group_table *multicast_group_table =
(struct sbrec_multicast_group_table *)EN_OVSDB_GET(
engine_get_input("SB_multicast_group", node));
@@ -3039,6 +3044,7 @@ static void init_physical_ctx(struct engine_node *node,
struct simap *ct_zones = &ct_zones_data->current;
p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name;
+ p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath;
p_ctx->port_binding_table = port_binding_table;
p_ctx->mc_group_table = multicast_group_table;
p_ctx->br_int = br_int;
diff --git a/controller/physical.c b/controller/physical.c
index fc8280a99..816a557e7 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -1074,6 +1074,67 @@ setup_activation_strategy(const struct
sbrec_port_binding *binding,
}
}
+static void
+enforce_tunneling_for_multichassis_ports(
+ struct local_datapath *ld,
+ const struct sbrec_port_binding *binding,
+ const struct sbrec_chassis *chassis,
+ const struct hmap *chassis_tunnels,
+ enum mf_field_id mff_ovn_geneve,
+ struct ovn_desired_flow_table *flow_table)
+{
+ if (shash_is_empty(&ld->multichassis_ports)) {
+ return;
+ }
+
+ struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
+ chassis_tunnels);
+ if (ovs_list_is_empty(tuns)) {
+ free(tuns);
+ return;
+ }
+
+ uint32_t dp_key = binding->datapath->tunnel_key;
+ uint32_t port_key = binding->tunnel_key;
+
+ struct shash_node *node;
+ SHASH_FOR_EACH (node, &ld->multichassis_ports) {
+ const struct sbrec_port_binding *mcp = node->data;
+
+ struct ofpbuf ofpacts;
+ ofpbuf_init(&ofpacts, 0);
+
+ bool is_vtep_port = !strcmp(binding->type, "vtep");
+ /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND
+ * responder in L3 networks. */
+ if (is_vtep_port) {
+ put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, &ofpacts);
+ }
+
+ struct match match;
+ match_outport_dp_and_port_keys(&match, dp_key, port_key);
+ match_set_reg(&match, MFF_LOG_INPORT - MFF_REG0, mcp->tunnel_key);
+
+ struct tunnel *tun;
+ LIST_FOR_EACH (tun, list_node, tuns) {
+ put_encapsulation(mff_ovn_geneve, tun->tun,
+ binding->datapath, port_key, is_vtep_port,
+ &ofpacts);
+ ofpact_put_OUTPUT(&ofpacts)->port = tun->tun->ofport;
+ }
+ ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 110,
+ binding->header_.uuid.parts[0], &match, &ofpacts,
+ &binding->header_.uuid);
+ ofpbuf_uninit(&ofpacts);
+ }
+
+ struct tunnel *tun_elem;
+ LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
+ free(tun_elem);
+ }
+ free(tuns);
+}
+
static void
consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
enum mf_field_id mff_ovn_geneve,
@@ -1509,6 +1570,9 @@ consider_port_binding(struct ovsdb_idl_index
*sbrec_port_binding_by_name,
binding->header_.uuid.parts[0],
&match, ofpacts_p, &binding->header_.uuid);
+ enforce_tunneling_for_multichassis_ports(
+ ld, binding, chassis, chassis_tunnels, mff_ovn_geneve, flow_table);
+
/* No more tunneling to set up. */
goto out;
}
@@ -1827,20 +1891,49 @@ physical_handle_flows_for_lport(const struct
sbrec_port_binding *pb,
ofctrl_remove_flows(flow_table, &pb->header_.uuid);
+ struct local_datapath *ldp =
+ get_local_datapath(p_ctx->local_datapaths,
+ pb->datapath->tunnel_key);
if (!strcmp(pb->type, "external")) {
/* External lports have a dependency on the localnet port.
* We need to remove the flows of the localnet port as well
* and re-consider adding the flows for it.
*/
- struct local_datapath *ldp =
- get_local_datapath(p_ctx->local_datapaths,
- pb->datapath->tunnel_key);
if (ldp && ldp->localnet_port) {
ofctrl_remove_flows(flow_table,
&ldp->localnet_port->header_.uuid);
physical_eval_port_binding(p_ctx, ldp->localnet_port, flow_table);
}
}
+ if (ldp) {
+ bool multichassis_state_changed = (
+ !!pb->additional_chassis ==
+ !!shash_find(&ldp->multichassis_ports, pb->logical_port)
+ );
+ if (multichassis_state_changed) {
+ if (pb->additional_chassis) {
+ add_local_datapath_multichassis_port(
+ ldp, pb->logical_port, pb);
+ } else {
+ remove_local_datapath_multichassis_port(
+ ldp, pb->logical_port);
+ }
+
+ struct sbrec_port_binding *target =
+ sbrec_port_binding_index_init_row(
+ p_ctx->sbrec_port_binding_by_datapath);
+ sbrec_port_binding_index_set_datapath(target, ldp->datapath);
+
+ const struct sbrec_port_binding *port;
+ SBREC_PORT_BINDING_FOR_EACH_EQUAL (
+ port, target, p_ctx->sbrec_port_binding_by_datapath) {
+ ofctrl_remove_flows(flow_table, &port->header_.uuid);
+ physical_eval_port_binding(p_ctx, port, flow_table);
+ }
+ sbrec_port_binding_index_destroy_row(target);
+ }
+ }
+
if (!removed) {
physical_eval_port_binding(p_ctx, pb, flow_table);
if (!strcmp(pb->type, "patch")) {
diff --git a/controller/physical.h b/controller/physical.h
index ee4b1ae1f..1b8f1ea55 100644
--- a/controller/physical.h
+++ b/controller/physical.h
@@ -45,6 +45,7 @@ struct local_nonvif_data;
struct physical_ctx {
struct ovsdb_idl_index *sbrec_port_binding_by_name;
+ struct ovsdb_idl_index *sbrec_port_binding_by_datapath;
const struct sbrec_port_binding_table *port_binding_table;
const struct sbrec_multicast_group_table *mc_group_table;
const struct ovsrec_bridge *br_int;
diff --git a/tests/ovn.at b/tests/ovn.at
index bfaa41962..a82a33aa7 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -14657,6 +14657,14 @@ reset_env() {
for port in hv1/migrator hv2/migrator hv1/first hv2/second hv3/third; do
: > $port.expected
done
+
+ for hv in hv1 hv2 hv3; do
+ : > $hv/n1.expected
+ done
+
+ reset_pcap_file hv1 br-phys_n1 hv1/br-phys_n1
+ reset_pcap_file hv2 br-phys_n1 hv2/br-phys_n1
+ reset_pcap_file hv3 br-phys_n1 hv3/br-phys_n1
}
check_packets() {
@@ -14667,6 +14675,10 @@ check_packets() {
OVN_CHECK_PACKETS_CONTAIN([hv1/first-tx.pcap], [hv1/first.expected])
OVN_CHECK_PACKETS_CONTAIN([hv2/second-tx.pcap], [hv2/second.expected])
OVN_CHECK_PACKETS_CONTAIN([hv3/third-tx.pcap], [hv3/third.expected])
+
+ OVN_CHECK_PACKETS_CONTAIN([hv1/br-phys_n1-tx.pcap], [hv1/n1.expected])
+ OVN_CHECK_PACKETS_CONTAIN([hv2/br-phys_n1-tx.pcap], [hv2/n1.expected])
+ OVN_CHECK_PACKETS_CONTAIN([hv3/br-phys_n1-tx.pcap], [hv3/n1.expected])
}
migrator_tpa=$(ip_to_hex 10 0 0 100)
@@ -14714,6 +14726,7 @@ echo $request >> hv3/third.expected
# unicast from Second doesn't arrive to hv2:Migrator
request=$(send_arp hv2 second 000000000002 0000000000ff $second_spa
$migrator_tpa)
echo $request >> hv1/migrator.expected
+echo $request >> hv2/n1.expected
# mcast from Second arrives to hv1:Migrator
# mcast from Second doesn't arrive to hv2:Migrator
@@ -14721,11 +14734,13 @@ request=$(send_arp hv2 second 000000000002
ffffffffffff $second_spa $migrator_tp
echo $request >> hv1/migrator.expected
echo $request >> hv1/first.expected
echo $request >> hv3/third.expected
+echo $request >> hv2/n1.expected
# unicast from Third arrives to hv1:Migrator
# unicast from Third doesn't arrive to hv2:Migrator
request=$(send_arp hv3 third 000000000003 0000000000ff $third_spa
$migrator_tpa)
echo $request >> hv1/migrator.expected
+echo $request >> hv3/n1.expected
# mcast from Third arrives to hv1:Migrator
# mcast from Third doesn't arrive to hv2:Migrator
@@ -14733,14 +14748,17 @@ request=$(send_arp hv3 third 000000000003
ffffffffffff $third_spa $migrator_tpa)
echo $request >> hv1/migrator.expected
echo $request >> hv1/first.expected
echo $request >> hv2/second.expected
+echo $request >> hv3/n1.expected
# unicast from hv1:Migrator arrives to First, Second, and Third
request=$(send_arp hv1 migrator 0000000000ff 000000000001 $migrator_tpa
$first_spa)
echo $request >> hv1/first.expected
request=$(send_arp hv1 migrator 0000000000ff 000000000002 $migrator_tpa
$second_spa)
echo $request >> hv2/second.expected
+echo $request >> hv1/n1.expected
request=$(send_arp hv1 migrator 0000000000ff 000000000003 $migrator_tpa
$third_spa)
echo $request >> hv3/third.expected
+echo $request >> hv1/n1.expected
# unicast from hv2:Migrator doesn't arrive to First, Second, or Third
request=$(send_arp hv2 migrator 0000000000ff 000000000001 $migrator_tpa
$first_spa)
@@ -14752,6 +14770,7 @@ request=$(send_arp hv1 migrator 0000000000ff
ffffffffffff $migrator_tpa $first_s
echo $request >> hv1/first.expected
echo $request >> hv2/second.expected
echo $request >> hv3/third.expected
+echo $request >> hv1/n1.expected
# mcast from hv2:Migrator doesn't arrive to First, Second, or Third
request=$(send_arp hv2 migrator 0000000000ff ffffffffffff $migrator_tpa
$first_spa)
@@ -14838,6 +14857,10 @@ echo $request >> hv1/first.expected
echo $request >> hv2/second.expected
echo $request >> hv3/third.expected
+# unicast from Second arrives to Third through localnet port
+request=$(send_arp hv2 second 000000000002 000000000003 $second_spa $third_spa)
+echo $request >> hv2/n1.expected
+
check_packets
# Complete migration: destination is bound
@@ -14860,6 +14883,7 @@ reset_env
# unicast from Third arrives to hv2:Migrator
request=$(send_arp hv3 third 000000000003 0000000000ff $third_spa
$migrator_tpa)
echo $request >> hv2/migrator.expected
+echo $request >> hv3/n1.expected
# mcast from Third doesn't arrive to hv1:Migrator
# mcast from Third arrives to hv2:Migrator
@@ -14867,11 +14891,13 @@ request=$(send_arp hv3 third 000000000003
ffffffffffff $third_spa $migrator_tpa)
echo $request >> hv2/migrator.expected
echo $request >> hv1/first.expected
echo $request >> hv2/second.expected
+echo $request >> hv3/n1.expected
# unicast from First doesn't arrive to hv1:Migrator
# unicast from First arrives to hv2:Migrator
request=$(send_arp hv1 first 000000000001 0000000000ff $first_spa
$migrator_tpa)
echo $request >> hv2/migrator.expected
+echo $request >> hv1/n1.expected
# mcast from First doesn't arrive to hv1:Migrator
# mcast from First arrives to hv2:Migrator binding
@@ -14879,6 +14905,7 @@ request=$(send_arp hv1 first 000000000001 ffffffffffff
$first_spa $migrator_tpa)
echo $request >> hv2/migrator.expected
echo $request >> hv2/second.expected
echo $request >> hv3/third.expected
+echo $request >> hv1/n1.expected
# unicast from Second doesn't arrive to hv1:Migrator
# unicast from Second arrives to hv2:Migrator
@@ -14900,10 +14927,12 @@ request=$(send_arp hv1 migrator 0000000000ff
000000000003 $migrator_tpa $third_s
# unicast from hv2:Migrator arrives to First, Second, and Third
request=$(send_arp hv2 migrator 0000000000ff 000000000001 $migrator_tpa
$first_spa)
echo $request >> hv1/first.expected
+echo $request >> hv2/n1.expected
request=$(send_arp hv2 migrator 0000000000ff 000000000002 $migrator_tpa
$second_spa)
echo $request >> hv2/second.expected
request=$(send_arp hv2 migrator 0000000000ff 000000000003 $migrator_tpa
$third_spa)
echo $request >> hv3/third.expected
+echo $request >> hv2/n1.expected
# mcast from hv1:Migrator doesn't arrive to First, Second, or Third
request=$(send_arp hv1 migrator 0000000000ff ffffffffffff $migrator_tpa
$first_spa)
@@ -14913,6 +14942,7 @@ request=$(send_arp hv2 migrator 0000000000ff
ffffffffffff $migrator_tpa $first_s
echo $request >> hv1/first.expected
echo $request >> hv2/second.expected
echo $request >> hv3/third.expected
+echo $request >> hv2/n1.expected
check_packets