Looks good to me. Thanks, Ihar.

Acked-by: Mark Michelson <[email protected]>

On 6/24/22 18:35, Ihar Hrachyshka wrote:
Before the patch, for switches with a localnet port,

- traffic to a multichassis port was funneled through tunnels; but
- traffic from a multichassis port was sent through localnet.

This is not optimal because:
- bidirectional sessions are sent through two separate paths;
- leaking multichassis traffic to physical makes upstream switch flip
   port's mac address location.

One of the effects of this behavior is prolonged network downtime when
live migrating a VM attached to a physical network (compared to
downtime observed for pure geneve networks).

As per my unscientific local testing, this patch reduces network
downtime during OpenStack VM live migration from 1.0s+ to 0.05-0.10s,
on par with geneve networks.

Fixes: 10398c1f51d5 ("Clone packets to all port chassis")
Signed-off-by: Ihar Hrachyshka <[email protected]>
---
v2: fixed a memory leak caught by asan job.
v1: initial commit.
---
  controller/binding.c        | 38 ++++++++++++++
  controller/local_data.c     | 30 ++++++++++-
  controller/local_data.h     |  6 +++
  controller/ovn-controller.c |  6 +++
  controller/physical.c       | 99 +++++++++++++++++++++++++++++++++++--
  controller/physical.h       |  1 +
  tests/ovn.at                | 30 +++++++++++
  7 files changed, 206 insertions(+), 4 deletions(-)

diff --git a/controller/binding.c b/controller/binding.c
index 2279570f9..9025681db 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -386,6 +386,23 @@ update_ld_external_ports(const struct sbrec_port_binding 
*binding_rec,
      }
  }
+static void
+update_ld_multichassis_ports(const struct sbrec_port_binding *binding_rec,
+                             struct hmap *local_datapaths)
+{
+    struct local_datapath *ld = get_local_datapath(
+        local_datapaths, binding_rec->datapath->tunnel_key);
+    if (!ld) {
+        return;
+    }
+    if (binding_rec->additional_chassis) {
+        add_local_datapath_multichassis_port(ld, binding_rec->logical_port,
+                                             binding_rec);
+    } else {
+        remove_local_datapath_multichassis_port(ld, binding_rec->logical_port);
+    }
+}
+
  static void
  update_ld_localnet_port(const struct sbrec_port_binding *binding_rec,
                          struct shash *bridge_mappings,
@@ -1752,6 +1769,8 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct 
binding_ctx_out *b_ctx_out)
struct ovs_list localnet_lports = OVS_LIST_INITIALIZER(&localnet_lports);
      struct ovs_list external_lports = OVS_LIST_INITIALIZER(&external_lports);
+    struct ovs_list multichassis_ports = OVS_LIST_INITIALIZER(
+                                                        &multichassis_ports);
struct lport {
          struct ovs_list list_node;
@@ -1787,6 +1806,13 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct 
binding_ctx_out *b_ctx_out)
case LP_VIF:
              consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map_ptr);
+            if (pb->additional_chassis) {
+                struct lport *multichassis_lport = xmalloc(
+                    sizeof *multichassis_lport);
+                multichassis_lport->pb = pb;
+                ovs_list_push_back(&multichassis_ports,
+                                   &multichassis_lport->list_node);
+            }
              break;
case LP_CONTAINER:
@@ -1862,6 +1888,16 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct 
binding_ctx_out *b_ctx_out)
          free(ext_lport);
      }
+ /* Run through multichassis lport list to see if these are ports
+     * on local datapaths discovered from above loop, and update the
+     * corresponding local datapath accordingly. */
+    struct lport *multichassis_lport;
+    LIST_FOR_EACH_POP (multichassis_lport, list_node, &multichassis_ports) {
+        update_ld_multichassis_ports(multichassis_lport->pb,
+                                     b_ctx_out->local_datapaths);
+        free(multichassis_lport);
+    }
+
      shash_destroy(&bridge_mappings);
if (!sset_is_empty(b_ctx_out->egress_ifaces)
@@ -1934,6 +1970,7 @@ remove_pb_from_local_datapath(const struct 
sbrec_port_binding *pb,
      } else if (!strcmp(pb->type, "external")) {
          remove_local_datapath_external_port(ld, pb->logical_port);
      }
+    remove_local_datapath_multichassis_port(ld, pb->logical_port);
  }
static void
@@ -2677,6 +2714,7 @@ delete_done:
          case LP_VIF:
          case LP_CONTAINER:
          case LP_VIRTUAL:
+            update_ld_multichassis_ports(pb, b_ctx_out->local_datapaths);
              handled = handle_updated_vif_lport(pb, lport_type, b_ctx_in,
                                                 b_ctx_out, qos_map_ptr);
              break;
diff --git a/controller/local_data.c b/controller/local_data.c
index 98445902b..7f874fc19 100644
--- a/controller/local_data.c
+++ b/controller/local_data.c
@@ -72,6 +72,7 @@ local_datapath_alloc(const struct sbrec_datapath_binding *dp)
      ld->is_switch = datapath_is_switch(dp);
      ld->is_transit_switch = datapath_is_transit_switch(dp);
      shash_init(&ld->external_ports);
+    shash_init(&ld->multichassis_ports);
      /* memory accounting - common part. */
      local_datapath_usage += sizeof *ld;
@@ -97,13 +98,20 @@ local_datapath_destroy(struct local_datapath *ld)
      SHASH_FOR_EACH (node, &ld->external_ports) {
          local_datapath_usage -= strlen(node->name);
      }
-    local_datapath_usage -= shash_count(&ld->external_ports) * sizeof *node;
+    SHASH_FOR_EACH (node, &ld->multichassis_ports) {
+        local_datapath_usage -= strlen(node->name);
+    }
+    local_datapath_usage -= (shash_count(&ld->external_ports)
+                             * sizeof *node);
+    local_datapath_usage -= (shash_count(&ld->multichassis_ports)
+                             * sizeof *node);
      local_datapath_usage -= sizeof *ld;
      local_datapath_usage -=
          ld->n_allocated_peer_ports * sizeof *ld->peer_ports;
free(ld->peer_ports);
      shash_destroy(&ld->external_ports);
+    shash_destroy(&ld->multichassis_ports);
      free(ld);
  }
@@ -274,6 +282,26 @@ remove_local_datapath_external_port(struct local_datapath *ld,
      }
  }
+void
+add_local_datapath_multichassis_port(struct local_datapath *ld,
+                                     char *logical_port, const void *data)
+{
+    if (!shash_replace(&ld->multichassis_ports, logical_port, data)) {
+        local_datapath_usage += sizeof(struct shash_node) +
+                                strlen(logical_port);
+    }
+}
+
+void
+remove_local_datapath_multichassis_port(struct local_datapath *ld,
+                                        char *logical_port)
+{
+    if (shash_find_and_delete(&ld->multichassis_ports, logical_port)) {
+        local_datapath_usage -= sizeof(struct shash_node) +
+                                strlen(logical_port);
+    }
+}
+
  void
  local_datapath_memory_usage(struct simap *usage)
  {
diff --git a/controller/local_data.h b/controller/local_data.h
index 9306ddf15..d898c8aa5 100644
--- a/controller/local_data.h
+++ b/controller/local_data.h
@@ -58,6 +58,7 @@ struct local_datapath {
      size_t n_allocated_peer_ports;
struct shash external_ports;
+    struct shash multichassis_ports;
  };
struct local_datapath *local_datapath_alloc(
@@ -155,5 +156,10 @@ void add_local_datapath_external_port(struct 
local_datapath *ld,
                                        char *logical_port, const void *data);
  void remove_local_datapath_external_port(struct local_datapath *ld,
                                           char *logical_port);
+void add_local_datapath_multichassis_port(struct local_datapath *ld,
+                                          char *logical_port,
+                                          const void *data);
+void remove_local_datapath_multichassis_port(struct local_datapath *ld,
+                                             char *logical_port);
#endif /* controller/local_data.h */
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 69615308e..2e9138036 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -3000,6 +3000,11 @@ static void init_physical_ctx(struct engine_node *node,
                  engine_get_input("SB_port_binding", node),
                  "name");
+ struct ovsdb_idl_index *sbrec_port_binding_by_datapath =
+        engine_ovsdb_node_get_index(
+                engine_get_input("SB_port_binding", node),
+                "datapath");
+
      struct sbrec_multicast_group_table *multicast_group_table =
          (struct sbrec_multicast_group_table *)EN_OVSDB_GET(
              engine_get_input("SB_multicast_group", node));
@@ -3039,6 +3044,7 @@ static void init_physical_ctx(struct engine_node *node,
      struct simap *ct_zones = &ct_zones_data->current;
p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name;
+    p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath;
      p_ctx->port_binding_table = port_binding_table;
      p_ctx->mc_group_table = multicast_group_table;
      p_ctx->br_int = br_int;
diff --git a/controller/physical.c b/controller/physical.c
index fc8280a99..816a557e7 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -1074,6 +1074,67 @@ setup_activation_strategy(const struct 
sbrec_port_binding *binding,
      }
  }
+static void
+enforce_tunneling_for_multichassis_ports(
+    struct local_datapath *ld,
+    const struct sbrec_port_binding *binding,
+    const struct sbrec_chassis *chassis,
+    const struct hmap *chassis_tunnels,
+    enum mf_field_id mff_ovn_geneve,
+    struct ovn_desired_flow_table *flow_table)
+{
+    if (shash_is_empty(&ld->multichassis_ports)) {
+        return;
+    }
+
+    struct ovs_list *tuns = get_remote_tunnels(binding, chassis,
+                                               chassis_tunnels);
+    if (ovs_list_is_empty(tuns)) {
+        free(tuns);
+        return;
+    }
+
+    uint32_t dp_key = binding->datapath->tunnel_key;
+    uint32_t port_key = binding->tunnel_key;
+
+    struct shash_node *node;
+    SHASH_FOR_EACH (node, &ld->multichassis_ports) {
+        const struct sbrec_port_binding *mcp = node->data;
+
+        struct ofpbuf ofpacts;
+        ofpbuf_init(&ofpacts, 0);
+
+        bool is_vtep_port = !strcmp(binding->type, "vtep");
+        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND
+         * responder in L3 networks. */
+        if (is_vtep_port) {
+            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, &ofpacts);
+        }
+
+        struct match match;
+        match_outport_dp_and_port_keys(&match, dp_key, port_key);
+        match_set_reg(&match, MFF_LOG_INPORT - MFF_REG0, mcp->tunnel_key);
+
+        struct tunnel *tun;
+        LIST_FOR_EACH (tun, list_node, tuns) {
+            put_encapsulation(mff_ovn_geneve, tun->tun,
+                              binding->datapath, port_key, is_vtep_port,
+                              &ofpacts);
+            ofpact_put_OUTPUT(&ofpacts)->port = tun->tun->ofport;
+        }
+        ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 110,
+                        binding->header_.uuid.parts[0], &match, &ofpacts,
+                        &binding->header_.uuid);
+        ofpbuf_uninit(&ofpacts);
+    }
+
+    struct tunnel *tun_elem;
+    LIST_FOR_EACH_POP (tun_elem, list_node, tuns) {
+        free(tun_elem);
+    }
+    free(tuns);
+}
+
  static void
  consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                        enum mf_field_id mff_ovn_geneve,
@@ -1509,6 +1570,9 @@ consider_port_binding(struct ovsdb_idl_index 
*sbrec_port_binding_by_name,
                          binding->header_.uuid.parts[0],
                          &match, ofpacts_p, &binding->header_.uuid);
+ enforce_tunneling_for_multichassis_ports(
+            ld, binding, chassis, chassis_tunnels, mff_ovn_geneve, flow_table);
+
          /* No more tunneling to set up. */
          goto out;
      }
@@ -1827,20 +1891,49 @@ physical_handle_flows_for_lport(const struct 
sbrec_port_binding *pb,
ofctrl_remove_flows(flow_table, &pb->header_.uuid); + struct local_datapath *ldp =
+        get_local_datapath(p_ctx->local_datapaths,
+                           pb->datapath->tunnel_key);
      if (!strcmp(pb->type, "external")) {
          /* External lports have a dependency on the localnet port.
           * We need to remove the flows of the localnet port as well
           * and re-consider adding the flows for it.
           */
-        struct local_datapath *ldp =
-            get_local_datapath(p_ctx->local_datapaths,
-                               pb->datapath->tunnel_key);
          if (ldp && ldp->localnet_port) {
              ofctrl_remove_flows(flow_table, 
&ldp->localnet_port->header_.uuid);
              physical_eval_port_binding(p_ctx, ldp->localnet_port, flow_table);
          }
      }
+ if (ldp) {
+        bool multichassis_state_changed = (
+            !!pb->additional_chassis ==
+            !!shash_find(&ldp->multichassis_ports, pb->logical_port)
+        );
+        if (multichassis_state_changed) {
+            if (pb->additional_chassis) {
+                add_local_datapath_multichassis_port(
+                    ldp, pb->logical_port, pb);
+            } else {
+                remove_local_datapath_multichassis_port(
+                    ldp, pb->logical_port);
+            }
+
+            struct sbrec_port_binding *target =
+                sbrec_port_binding_index_init_row(
+                    p_ctx->sbrec_port_binding_by_datapath);
+            sbrec_port_binding_index_set_datapath(target, ldp->datapath);
+
+            const struct sbrec_port_binding *port;
+            SBREC_PORT_BINDING_FOR_EACH_EQUAL (
+                    port, target, p_ctx->sbrec_port_binding_by_datapath) {
+                ofctrl_remove_flows(flow_table, &port->header_.uuid);
+                physical_eval_port_binding(p_ctx, port, flow_table);
+            }
+            sbrec_port_binding_index_destroy_row(target);
+        }
+    }
+
      if (!removed) {
          physical_eval_port_binding(p_ctx, pb, flow_table);
          if (!strcmp(pb->type, "patch")) {
diff --git a/controller/physical.h b/controller/physical.h
index ee4b1ae1f..1b8f1ea55 100644
--- a/controller/physical.h
+++ b/controller/physical.h
@@ -45,6 +45,7 @@ struct local_nonvif_data;
struct physical_ctx {
      struct ovsdb_idl_index *sbrec_port_binding_by_name;
+    struct ovsdb_idl_index *sbrec_port_binding_by_datapath;
      const struct sbrec_port_binding_table *port_binding_table;
      const struct sbrec_multicast_group_table *mc_group_table;
      const struct ovsrec_bridge *br_int;
diff --git a/tests/ovn.at b/tests/ovn.at
index bfaa41962..a82a33aa7 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -14657,6 +14657,14 @@ reset_env() {
      for port in hv1/migrator hv2/migrator hv1/first hv2/second hv3/third; do
          : > $port.expected
      done
+
+    for hv in hv1 hv2 hv3; do
+        : > $hv/n1.expected
+    done
+
+    reset_pcap_file hv1 br-phys_n1 hv1/br-phys_n1
+    reset_pcap_file hv2 br-phys_n1 hv2/br-phys_n1
+    reset_pcap_file hv3 br-phys_n1 hv3/br-phys_n1
  }
check_packets() {
@@ -14667,6 +14675,10 @@ check_packets() {
      OVN_CHECK_PACKETS_CONTAIN([hv1/first-tx.pcap], [hv1/first.expected])
      OVN_CHECK_PACKETS_CONTAIN([hv2/second-tx.pcap], [hv2/second.expected])
      OVN_CHECK_PACKETS_CONTAIN([hv3/third-tx.pcap], [hv3/third.expected])
+
+    OVN_CHECK_PACKETS_CONTAIN([hv1/br-phys_n1-tx.pcap], [hv1/n1.expected])
+    OVN_CHECK_PACKETS_CONTAIN([hv2/br-phys_n1-tx.pcap], [hv2/n1.expected])
+    OVN_CHECK_PACKETS_CONTAIN([hv3/br-phys_n1-tx.pcap], [hv3/n1.expected])
  }
migrator_tpa=$(ip_to_hex 10 0 0 100)
@@ -14714,6 +14726,7 @@ echo $request >> hv3/third.expected
  # unicast from Second doesn't arrive to hv2:Migrator
  request=$(send_arp hv2 second 000000000002 0000000000ff $second_spa 
$migrator_tpa)
  echo $request >> hv1/migrator.expected
+echo $request >> hv2/n1.expected
# mcast from Second arrives to hv1:Migrator
  # mcast from Second doesn't arrive to hv2:Migrator
@@ -14721,11 +14734,13 @@ request=$(send_arp hv2 second 000000000002 
ffffffffffff $second_spa $migrator_tp
  echo $request >> hv1/migrator.expected
  echo $request >> hv1/first.expected
  echo $request >> hv3/third.expected
+echo $request >> hv2/n1.expected
# unicast from Third arrives to hv1:Migrator
  # unicast from Third doesn't arrive to hv2:Migrator
  request=$(send_arp hv3 third 000000000003 0000000000ff $third_spa 
$migrator_tpa)
  echo $request >> hv1/migrator.expected
+echo $request >> hv3/n1.expected
# mcast from Third arrives to hv1:Migrator
  # mcast from Third doesn't arrive to hv2:Migrator
@@ -14733,14 +14748,17 @@ request=$(send_arp hv3 third 000000000003 
ffffffffffff $third_spa $migrator_tpa)
  echo $request >> hv1/migrator.expected
  echo $request >> hv1/first.expected
  echo $request >> hv2/second.expected
+echo $request >> hv3/n1.expected
# unicast from hv1:Migrator arrives to First, Second, and Third
  request=$(send_arp hv1 migrator 0000000000ff 000000000001 $migrator_tpa 
$first_spa)
  echo $request >> hv1/first.expected
  request=$(send_arp hv1 migrator 0000000000ff 000000000002 $migrator_tpa 
$second_spa)
  echo $request >> hv2/second.expected
+echo $request >> hv1/n1.expected
  request=$(send_arp hv1 migrator 0000000000ff 000000000003 $migrator_tpa 
$third_spa)
  echo $request >> hv3/third.expected
+echo $request >> hv1/n1.expected
# unicast from hv2:Migrator doesn't arrive to First, Second, or Third
  request=$(send_arp hv2 migrator 0000000000ff 000000000001 $migrator_tpa 
$first_spa)
@@ -14752,6 +14770,7 @@ request=$(send_arp hv1 migrator 0000000000ff 
ffffffffffff $migrator_tpa $first_s
  echo $request >> hv1/first.expected
  echo $request >> hv2/second.expected
  echo $request >> hv3/third.expected
+echo $request >> hv1/n1.expected
# mcast from hv2:Migrator doesn't arrive to First, Second, or Third
  request=$(send_arp hv2 migrator 0000000000ff ffffffffffff $migrator_tpa 
$first_spa)
@@ -14838,6 +14857,10 @@ echo $request >> hv1/first.expected
  echo $request >> hv2/second.expected
  echo $request >> hv3/third.expected
+# unicast from Second arrives to Third through localnet port
+request=$(send_arp hv2 second 000000000002 000000000003 $second_spa $third_spa)
+echo $request >> hv2/n1.expected
+
  check_packets
# Complete migration: destination is bound
@@ -14860,6 +14883,7 @@ reset_env
  # unicast from Third arrives to hv2:Migrator
  request=$(send_arp hv3 third 000000000003 0000000000ff $third_spa 
$migrator_tpa)
  echo $request >> hv2/migrator.expected
+echo $request >> hv3/n1.expected
# mcast from Third doesn't arrive to hv1:Migrator
  # mcast from Third arrives to hv2:Migrator
@@ -14867,11 +14891,13 @@ request=$(send_arp hv3 third 000000000003 
ffffffffffff $third_spa $migrator_tpa)
  echo $request >> hv2/migrator.expected
  echo $request >> hv1/first.expected
  echo $request >> hv2/second.expected
+echo $request >> hv3/n1.expected
# unicast from First doesn't arrive to hv1:Migrator
  # unicast from First arrives to hv2:Migrator
  request=$(send_arp hv1 first 000000000001 0000000000ff $first_spa 
$migrator_tpa)
  echo $request >> hv2/migrator.expected
+echo $request >> hv1/n1.expected
# mcast from First doesn't arrive to hv1:Migrator
  # mcast from First arrives to hv2:Migrator binding
@@ -14879,6 +14905,7 @@ request=$(send_arp hv1 first 000000000001 ffffffffffff 
$first_spa $migrator_tpa)
  echo $request >> hv2/migrator.expected
  echo $request >> hv2/second.expected
  echo $request >> hv3/third.expected
+echo $request >> hv1/n1.expected
# unicast from Second doesn't arrive to hv1:Migrator
  # unicast from Second arrives to hv2:Migrator
@@ -14900,10 +14927,12 @@ request=$(send_arp hv1 migrator 0000000000ff 
000000000003 $migrator_tpa $third_s
  # unicast from hv2:Migrator arrives to First, Second, and Third
  request=$(send_arp hv2 migrator 0000000000ff 000000000001 $migrator_tpa 
$first_spa)
  echo $request >> hv1/first.expected
+echo $request >> hv2/n1.expected
  request=$(send_arp hv2 migrator 0000000000ff 000000000002 $migrator_tpa 
$second_spa)
  echo $request >> hv2/second.expected
  request=$(send_arp hv2 migrator 0000000000ff 000000000003 $migrator_tpa 
$third_spa)
  echo $request >> hv3/third.expected
+echo $request >> hv2/n1.expected
# mcast from hv1:Migrator doesn't arrive to First, Second, or Third
  request=$(send_arp hv1 migrator 0000000000ff ffffffffffff $migrator_tpa 
$first_spa)
@@ -14913,6 +14942,7 @@ request=$(send_arp hv2 migrator 0000000000ff 
ffffffffffff $migrator_tpa $first_s
  echo $request >> hv1/first.expected
  echo $request >> hv2/second.expected
  echo $request >> hv3/third.expected
+echo $request >> hv2/n1.expected
check_packets

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to