Hi Han, I was only able to have a look at about 2/3 of the patch, but I figured I would share my feedback anyway. Most of what I have to say is low-level nitpicky things, rather than high-level commentary about the feature itself.
On Wed, Oct 29, 2025 at 2:06 PM Han Zhou <[email protected]> wrote: > > This patch introduces flow-based tunnels as an alternative to > traditional port-based tunnels, significantly reducing tunnel port count > in large deployments. > > Flow-based tunnels use shared ports (one per tunnel type) with > options:local_ip=flow and options:remote_ip=flow. OpenFlow flows > dynamically set tunnel endpoints using set_field actions, reducing port > count to O(T) where T is the number of tunnel types. > > The feature is experimental, and controlled by > external_ids:ovn-enable-flow-based-tunnels (default: false). > > Some known limitations: > - IPsec is not supported > - BFD between tunnel endpoints is not supported, thus HA chassis not > supported. > > Assisted-by: Cursor, with model: Claude Sonnet 4.5 > Signed-off-by: Han Zhou <[email protected]> > --- > NEWS | 4 + > controller/encaps.c | 247 ++++++++++++++++---- > controller/encaps.h | 15 ++ > controller/local_data.c | 202 ++++++++++++++++- > controller/local_data.h | 34 ++- > controller/ovn-controller.8.xml | 30 +++ > controller/ovn-controller.c | 23 +- > controller/physical.c | 384 ++++++++++++++++++++++++++++---- > controller/physical.h | 3 + > lib/ovn-util.h | 1 + > tests/ovn-macros.at | 31 ++- > tests/ovn.at | 135 ++++++++--- > tests/ovs-macros.at | 4 +- > 13 files changed, 988 insertions(+), 125 deletions(-) > > diff --git a/NEWS b/NEWS > index 06cc18b41aff..405a04747666 100644 > --- a/NEWS > +++ b/NEWS > @@ -58,6 +58,10 @@ Post v25.09.0 > specified LS. > - Add ovn-nbctl lsp-add-localnet-port which will create localnet port on > specified LS. > + - Added experimental flow-based tunnel support. Enable via > + external_ids:ovn-enable-flow-based-tunnels=true to use shared tunnel > + ports instead of per-chassis ports, reducing port count for large scale > + environments. Default is disabled. > > OVN v25.09.0 - xxx xx xxxx > -------------------------- > diff --git a/controller/encaps.c b/controller/encaps.c > index 67f8c9c9c82f..288959180402 100644 > --- a/controller/encaps.c > +++ b/controller/encaps.c > @@ -29,14 +29,6 @@ > > VLOG_DEFINE_THIS_MODULE(encaps); > > -/* > - * Given there could be multiple tunnels with different IPs to the same > - * chassis we annotate the external_ids:ovn-chassis-id in tunnel port with > - * <chassis_name>@<remote IP>%<local IP>. The external_id key > - * "ovn-chassis-id" is kept for backward compatibility. > - */ > -#define OVN_TUNNEL_ID "ovn-chassis-id" > - > static char *current_br_int_name = NULL; > > void > @@ -547,6 +539,179 @@ create_evpn_tunnels(struct tunnel_ctx *tc) > } > > > +bool > +is_flow_based_tunnels_enabled( > + const struct ovsrec_open_vswitch_table *ovs_table, > + const struct sbrec_chassis *chassis) > +{ > + const struct ovsrec_open_vswitch *cfg = > + ovsrec_open_vswitch_table_first(ovs_table); > + > + if (cfg) { > + const char *enable_flow_tunnels = > + get_chassis_external_id_value( Nit: You can use get_chassis_external_id_bool() here to get a bool instead of a string. This saves you a strcmp() call later. > + &cfg->external_ids, chassis->name, > + "ovn-enable-flow-based-tunnels", "false"); > + return !strcmp(enable_flow_tunnels, "true"); > + } > + return false; > +} > + > +static char * > +flow_based_tunnel_name(const char *tunnel_type, const char *chassis_idx) > +{ > + return xasprintf("ovn%s-%s", chassis_idx, tunnel_type); > +} > + > +static void > +flow_based_tunnel_ensure(struct tunnel_ctx *tc, const char *tunnel_type, > + const char *port_name, > + const struct sbrec_sb_global *sbg, > + const struct ovsrec_open_vswitch_table *ovs_table) > +{ > + /* Check if flow-based tunnel already exists. */ > + const struct ovsrec_port *existing_port = NULL; > + for (size_t i = 0; i < tc->br_int->n_ports; i++) { > + const struct ovsrec_port *port = tc->br_int->ports[i]; > + if (!strcmp(port->name, port_name)) { > + existing_port = port; > + break; > + } > + } > + > + if (existing_port) { > + return; > + } > + > + /* Create flow-based tunnel port. */ > + struct smap options = SMAP_INITIALIZER(&options); > + smap_add(&options, "remote_ip", "flow"); > + smap_add(&options, "local_ip", "flow"); > + smap_add(&options, "key", "flow"); > + > + if (sbg->ipsec) { > + /* For flow-based tunnels, we can't specify remote_name since > + * remote chassis varies. IPsec will need to handle this differently. > + */ > + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); > + VLOG_WARN_RL(&rl, "IPsec is not supported for flow-based tunnels. " > + "Ignoring IPsec settings."); > + } > + > + /* Add other tunnel options from OVS config. */ > + const struct ovsrec_open_vswitch *cfg = > + ovsrec_open_vswitch_table_first(ovs_table); > + if (cfg) { > + const char *encap_tos = > + get_chassis_external_id_value(&cfg->external_ids, > + tc->this_chassis->name, > + "ovn-encap-tos", "none"); > + if (encap_tos && strcmp(encap_tos, "none")) { > + smap_add(&options, "tos", encap_tos); > + } > + > + const char *encap_df = > + get_chassis_external_id_value(&cfg->external_ids, > + tc->this_chassis->name, > + "ovn-encap-df_default", NULL); > + if (encap_df) { > + smap_add(&options, "df_default", encap_df); > + } > + } > + > + /* Create interface. */ > + struct ovsrec_interface *iface = ovsrec_interface_insert(tc->ovs_txn); > + ovsrec_interface_set_name(iface, port_name); > + ovsrec_interface_set_type(iface, tunnel_type); > + ovsrec_interface_set_options(iface, &options); > + > + /* Create port. */ > + struct ovsrec_port *port = ovsrec_port_insert(tc->ovs_txn); > + ovsrec_port_set_name(port, port_name); > + ovsrec_port_set_interfaces(port, &iface, 1); > + > + /* Set external IDs to mark as flow-based tunnel using unified > + * OVN_TUNNEL_ID. */ > + const struct smap external_ids = SMAP_CONST2(&external_ids, > + OVN_TUNNEL_ID, "flow", > + "ovn-tunnel-type", > + tunnel_type); > + ovsrec_port_set_external_ids(port, &external_ids); > + > + /* Add to bridge. */ > + ovsrec_bridge_update_ports_addvalue(tc->br_int, port); > + > + VLOG_INFO("Created flow-based %s tunnel port: %s", tunnel_type, > port_name); > + > + smap_destroy(&options); > +} > + > +static void > +create_flow_based_tunnels(struct tunnel_ctx *tc, > + const struct sbrec_sb_global *sbg) > +{ > + struct sset tunnel_types = SSET_INITIALIZER(&tunnel_types); > + > + for (size_t i = 0; i < tc->this_chassis->n_encaps; i++) { > + sset_add(&tunnel_types, tc->this_chassis->encaps[i]->type); > + } > + > + const char *tunnel_type; > + SSET_FOR_EACH (tunnel_type, &tunnel_types) { > + char *port_name = flow_based_tunnel_name(tunnel_type, > + get_chassis_idx(tc->ovs_table)); > + flow_based_tunnel_ensure(tc, tunnel_type, port_name, sbg, > + tc->ovs_table); > + /* remove any existing tunnel with this name from tracking so it > + * doesn't get deleted. */ > + struct tunnel_node *existing_tunnel = shash_find_data(&tc->tunnel, > + port_name); > + if (existing_tunnel) { > + shash_find_and_delete(&tc->tunnel, port_name); > + free(existing_tunnel); > + } > + free(port_name); > + } > + > + sset_destroy(&tunnel_types); > +} > + > +static void > +create_port_based_tunnels(struct tunnel_ctx *tc, > + const struct sbrec_chassis_table *chassis_table, > + const struct sbrec_sb_global *sbg, > + const struct sset *transport_zones) > +{ > + const struct sbrec_chassis *chassis_rec; > + SBREC_CHASSIS_TABLE_FOR_EACH (chassis_rec, chassis_table) { > + if (strcmp(chassis_rec->name, tc->this_chassis->name)) { > + /* Create tunnels to the other Chassis belonging to the > + * same transport zone */ > + if (!chassis_tzones_overlap(transport_zones, chassis_rec)) { > + VLOG_DBG("Skipping encap creation for Chassis '%s' because " > + "it belongs to different transport zones", > + chassis_rec->name); > + continue; > + } > + > + if (smap_get_bool(&chassis_rec->other_config, "is-remote", false) > + && !smap_get_bool(&tc->this_chassis->other_config, > + "is-interconn", false)) { > + VLOG_DBG("Skipping encap creation for Chassis '%s' because " > + "it is remote but this chassis is not interconn.", > + chassis_rec->name); > + continue; > + } > + > + if (chassis_tunnel_add(chassis_rec, sbg, tc->ovs_table, tc, > + tc->this_chassis) == 0) { > + VLOG_INFO("Creating encap for '%s' failed", > chassis_rec->name); > + continue; > + } > + } > + } > +} > + > void > encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, > const struct ovsrec_bridge *br_int, > @@ -561,6 +726,11 @@ encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, > return; > } > > + bool use_flow_based = is_flow_based_tunnels_enabled(ovs_table, > + this_chassis); > + VLOG_DBG("Using %s tunnels for this chassis.", > + use_flow_based ? "flow-based" : "port-based"); > + > if (!current_br_int_name) { > /* The controller has just started, we need to look through all > * bridges for old tunnel ports. */ > @@ -590,8 +760,6 @@ encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, > current_br_int_name = xstrdup(br_int->name); > } > > - const struct sbrec_chassis *chassis_rec; > - > struct tunnel_ctx tc = { > .tunnel = SHASH_INITIALIZER(&tc.tunnel), > .port_names = SSET_INITIALIZER(&tc.port_names), > @@ -606,24 +774,30 @@ encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, > "ovn-controller: modifying OVS tunnels '%s'", > this_chassis->name); > > - /* Collect all port names into tc.port_names. > - * > - * Collect all the OVN-created tunnels into tc.tunnel_hmap. */ > + /* Collect existing port names and tunnel ports for cleanup. */ > for (size_t i = 0; i < br_int->n_ports; i++) { > const struct ovsrec_port *port = br_int->ports[i]; > sset_add(&tc.port_names, port->name); > > const char *id = smap_get(&port->external_ids, OVN_TUNNEL_ID); > if (id) { > - if (!shash_find(&tc.tunnel, id)) { > - struct tunnel_node *tunnel = xzalloc(sizeof *tunnel); > - tunnel->bridge = br_int; > - tunnel->port = port; > - shash_add_assert(&tc.tunnel, id, tunnel); > + struct tunnel_node *tunnel = xzalloc(sizeof *tunnel); > + tunnel->bridge = br_int; > + tunnel->port = port; > + > + if (use_flow_based) { > + /* Flow-based: track by port name */ > + shash_add(&tc.tunnel, port->name, tunnel); > } else { > - /* Duplicate port for tunnel-id. Arbitrarily choose > - * to delete this one. */ > - ovsrec_bridge_update_ports_delvalue(br_int, port); > + /* Port-based: track by tunnel ID, handle duplicates */ > + if (!shash_find(&tc.tunnel, id)) { > + shash_add_assert(&tc.tunnel, id, tunnel); > + } else { > + /* Duplicate port for tunnel-id. Arbitrarily choose > + * to delete this one. */ > + ovsrec_bridge_update_ports_delvalue(br_int, port); > + free(tunnel); > + } > } > } > > @@ -632,32 +806,11 @@ encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, > } > } > > - SBREC_CHASSIS_TABLE_FOR_EACH (chassis_rec, chassis_table) { > - if (strcmp(chassis_rec->name, this_chassis->name)) { > - /* Create tunnels to the other Chassis belonging to the > - * same transport zone */ > - if (!chassis_tzones_overlap(transport_zones, chassis_rec)) { > - VLOG_DBG("Skipping encap creation for Chassis '%s' because " > - "it belongs to different transport zones", > - chassis_rec->name); > - continue; > - } > - > - if (smap_get_bool(&chassis_rec->other_config, "is-remote", false) > - && !smap_get_bool(&this_chassis->other_config, > "is-interconn", > - false)) { > - VLOG_DBG("Skipping encap creation for Chassis '%s' because " > - "it is remote but this chassis is not interconn.", > - chassis_rec->name); > - continue; > - } > - > - if (chassis_tunnel_add(chassis_rec, sbg, ovs_table, &tc, > - this_chassis) == 0) { > - VLOG_INFO("Creating encap for '%s' failed", > chassis_rec->name); > - continue; > - } > - } > + /* Create OVN tunnels (mode-specific). */ > + if (use_flow_based) { > + create_flow_based_tunnels(&tc, sbg); > + } else { > + create_port_based_tunnels(&tc, chassis_table, sbg, transport_zones); > } > > create_evpn_tunnels(&tc); > diff --git a/controller/encaps.h b/controller/encaps.h > index 6f9891ee5907..240787e8fbb7 100644 > --- a/controller/encaps.h > +++ b/controller/encaps.h > @@ -18,6 +18,17 @@ > > #include <stdbool.h> > > +/* > + * Given there could be multiple tunnels with different IPs to the same > + * chassis we annotate the external_ids:ovn-chassis-id in tunnel port with > + * <chassis_name>@<remote IP>%<local IP>. The external_id key > + * "ovn-chassis-id" is kept for backward compatibility. > + * > + * For flow-based tunnels, we use the special value "flow" to identify > + * shared tunnel ports that handle dynamic endpoint resolution. > + */ > +#define OVN_TUNNEL_ID "ovn-chassis-id" > + > struct ovsdb_idl; > struct ovsdb_idl_txn; > struct ovsrec_bridge; > @@ -38,6 +49,10 @@ void encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, > const struct sset *transport_zones, > const struct ovsrec_bridge_table *bridge_table); > > +bool is_flow_based_tunnels_enabled( > + const struct ovsrec_open_vswitch_table *ovs_table, > + const struct sbrec_chassis *chassis); > + > bool encaps_cleanup(struct ovsdb_idl_txn *ovs_idl_txn, > const struct ovsrec_bridge *br_int); > > diff --git a/controller/local_data.c b/controller/local_data.c > index a35d3fa5ae37..d5afc89da2cd 100644 > --- a/controller/local_data.c > +++ b/controller/local_data.c > @@ -51,6 +51,10 @@ static struct tracked_datapath *tracked_datapath_create( > enum en_tracked_resource_type tracked_type, > struct hmap *tracked_datapaths); > > +static void track_flow_based_tunnel( > + const struct ovsrec_port *, const struct sbrec_chassis *, > + struct ovs_list *flow_tunnels); > + > static bool datapath_is_switch(const struct sbrec_datapath_binding *); > static bool datapath_is_transit_switch(const struct sbrec_datapath_binding > *); > > @@ -454,7 +458,8 @@ void > local_nonvif_data_run(const struct ovsrec_bridge *br_int, > const struct sbrec_chassis *chassis_rec, > struct simap *patch_ofports, > - struct hmap *chassis_tunnels) > + struct hmap *chassis_tunnels, > + struct ovs_list *flow_tunnels) > { > for (int i = 0; i < br_int->n_ports; i++) { > const struct ovsrec_port *port_rec = br_int->ports[i]; > @@ -470,6 +475,10 @@ local_nonvif_data_run(const struct ovsrec_bridge *br_int, > continue; > } > > + if (flow_tunnels) { > + track_flow_based_tunnel(port_rec, chassis_rec, flow_tunnels); > + } > + > const char *localnet = smap_get(&port_rec->external_ids, > "ovn-localnet-port"); > const char *l2gateway = smap_get(&port_rec->external_ids, > @@ -757,3 +766,194 @@ lb_is_local(const struct sbrec_load_balancer *sbrec_lb, > > return false; > } > + > +/* Flow-based tunnel management functions. */ > + > +struct flow_based_tunnel * > +flow_based_tunnel_find(const struct ovs_list *flow_tunnels, > + enum chassis_tunnel_type type) > +{ > + struct flow_based_tunnel *tun; > + LIST_FOR_EACH (tun, list_node, flow_tunnels) { If I've read the code correctly, the flow_tunnels list contains at most one flow per encap type. If that's the case, then since the number of encap types is limited, you could probably just use a static array instead of a list. Then you can retrieve the tunnel based on an index value instead of having to traverse the list. > + if (tun->type == type) { > + return tun; > + } > + } > + return NULL; > +} > + > +void > +flow_based_tunnels_destroy(struct ovs_list *flow_tunnels) > +{ > + struct flow_based_tunnel *tun, *next; > + LIST_FOR_EACH_SAFE (tun, next, list_node, flow_tunnels) { > + ovs_list_remove(&tun->list_node); > + free(tun->port_name); > + free(tun); > + } > +} > + > +ofp_port_t > +get_flow_based_tunnel_port(enum chassis_tunnel_type type, > + const struct ovs_list *flow_tunnels) > +{ > + struct flow_based_tunnel *tun = flow_based_tunnel_find(flow_tunnels, > type); > + return tun ? tun->ofport : 0; > +} > + > +/* Direct tunnel endpoint selection utilities. */ > + > +enum chassis_tunnel_type > +select_preferred_tunnel_type(const struct sbrec_chassis *local_chassis, > + const struct sbrec_chassis *remote_chassis) > +{ > + /* Determine what tunnel types both chassis support */ > + bool local_supports_geneve = false; > + bool local_supports_vxlan = false; > + bool remote_supports_geneve = false; > + bool remote_supports_vxlan = false; > + > + for (size_t i = 0; i < local_chassis->n_encaps; i++) { > + const char *type = local_chassis->encaps[i]->type; > + if (!strcmp(type, "geneve")) { > + local_supports_geneve = true; > + } else if (!strcmp(type, "vxlan")) { > + local_supports_vxlan = true; > + } > + } > + > + for (size_t i = 0; i < remote_chassis->n_encaps; i++) { > + const char *type = remote_chassis->encaps[i]->type; > + if (!strcmp(type, "geneve")) { > + remote_supports_geneve = true; > + } else if (!strcmp(type, "vxlan")) { > + remote_supports_vxlan = true; > + } > + } > + > + /* Return preferred common tunnel type: geneve > vxlan */ > + if (local_supports_geneve && remote_supports_geneve) { > + return GENEVE; > + } else if (local_supports_vxlan && remote_supports_vxlan) { > + return VXLAN; > + } else { > + return TUNNEL_TYPE_INVALID; /* No common tunnel type */ > + } > +} > + > +const char * > +select_default_encap_ip(const struct sbrec_chassis *chassis, > + enum chassis_tunnel_type tunnel_type) > +{ > + const char *default_ip = NULL; > + const char *first_ip = NULL; > + const char *tunnel_type_str = (tunnel_type == GENEVE) ? "geneve" : > "vxlan"; > + > + for (size_t i = 0; i < chassis->n_encaps; i++) { > + const struct sbrec_encap *encap = chassis->encaps[i]; > + > + if (strcmp(encap->type, tunnel_type_str)) { > + continue; > + } > + > + if (!first_ip) { > + first_ip = encap->ip; > + } > + > + const char *is_default = smap_get(&encap->options, > "default-encap-ip"); Nit: You can use smap_get_bool(&encap->options, "default-encap-ip", false) to save a strcmp(). > + if (is_default && !strcmp(is_default, "true")) { > + default_ip = encap->ip; > + break; /* Found explicit default */ > + } > + } > + > + return default_ip ? default_ip : first_ip; > +} > + > +const char * > +select_port_encap_ip(const struct sbrec_port_binding *binding, > + enum chassis_tunnel_type tunnel_type) > +{ > + const char *tunnel_type_str = (tunnel_type == GENEVE) ? "geneve" : > "vxlan"; > + > + if (binding->encap && !strcmp(binding->encap->type, tunnel_type_str)) { > + VLOG_DBG("Using port-specific encap IP %s for binding %s", > + binding->encap->ip, binding->logical_port); > + return binding->encap->ip; > + } > + > + /* Fall back to chassis default encap IP */ > + return select_default_encap_ip(binding->chassis, tunnel_type); > +} > + > +static void > +track_flow_based_tunnel(const struct ovsrec_port *port_rec, > + const struct sbrec_chassis *chassis_rec, > + struct ovs_list *flow_tunnels) > +{ > + if (port_rec->n_interfaces != 1) { > + return; > + } > + > + const struct ovsrec_interface *iface_rec = port_rec->interfaces[0]; > + > + /* Check if this is a flow-based tunnel port using unified > + * OVN_TUNNEL_ID. */ > + const char *tunnel_id = smap_get(&port_rec->external_ids, OVN_TUNNEL_ID); > + const char *tunnel_type_str = smap_get(&port_rec->external_ids, > + "ovn-tunnel-type"); > + > + if (!tunnel_id || !tunnel_type_str || strcmp(tunnel_id, "flow")) { > + return; > + } > + > + /* Get tunnel type. */ > + enum chassis_tunnel_type tunnel_type; > + if (!strcmp(tunnel_type_str, "geneve")) { > + tunnel_type = GENEVE; > + } else if (!strcmp(tunnel_type_str, "vxlan")) { > + tunnel_type = VXLAN; > + } else { > + return; > + } > + > + /* Check if we already track this tunnel type. */ > + if (flow_based_tunnel_find(flow_tunnels, tunnel_type)) { > + return; > + } > + > + int64_t ofport = iface_rec->ofport ? *iface_rec->ofport : 0; > + if (ofport <= 0 || ofport > UINT16_MAX) { > + VLOG_INFO("Invalid ofport %"PRId64" for flow-based tunnel %s", > + ofport, port_rec->name); > + return; > + } > + > + /* Detect if this tunnel will use IPv6. > + * For flow-based tunnels with local_ip="flow", check if any of the > + * chassis encap IPs for this tunnel type are IPv6 addresses. > + */ > + bool is_ipv6 = false; > + for (size_t i = 0; i < chassis_rec->n_encaps; i++) { > + const struct sbrec_encap *encap = chassis_rec->encaps[i]; > + if (!strcmp(encap->type, tunnel_type_str)) { > + if (addr_is_ipv6(encap->ip)) { > + is_ipv6 = true; > + break; > + } > + } > + } > + > + struct flow_based_tunnel *tun = xzalloc(sizeof *tun); > + tun->type = tunnel_type; > + tun->ofport = u16_to_ofp(ofport); > + tun->is_ipv6 = is_ipv6; > + tun->port_name = xstrdup(port_rec->name); > + > + VLOG_INFO("Tracking flow-based tunnel: port=%s, type=%s, ofport=%"PRId64 > + ", is_ipv6=%s", port_rec->name, tunnel_type_str, ofport, > + is_ipv6 ? "true" : "false"); > + > + ovs_list_push_back(flow_tunnels, &tun->list_node); > +} > + > diff --git a/controller/local_data.h b/controller/local_data.h > index 841829f2e071..28e3fb70731d 100644 > --- a/controller/local_data.h > +++ b/controller/local_data.h > @@ -28,6 +28,7 @@ > struct sbrec_datapath_binding; > struct sbrec_port_binding; > struct sbrec_chassis; > +struct sbrec_chassis_table; > struct ovsdb_idl_index; > struct ovsrec_bridge; > struct ovsrec_interface_table; > @@ -147,10 +148,22 @@ struct chassis_tunnel { > bool is_ipv6; > }; > > +/* Flow-based tunnel that consolidates multiple endpoints into a single > + * port. */ > +struct flow_based_tunnel { > + struct ovs_list list_node; > + enum chassis_tunnel_type type; /* geneve, vxlan */ > + ofp_port_t ofport; /* Single port for all endpoints */ > + bool is_ipv6; > + char *port_name; /* e.g., "ovn-geneve" */ > +}; > + > + > void local_nonvif_data_run(const struct ovsrec_bridge *br_int, > - const struct sbrec_chassis *, > + const struct sbrec_chassis *chassis, > struct simap *patch_ofports, > - struct hmap *chassis_tunnels); > + struct hmap *chassis_tunnels, > + struct ovs_list *flow_tunnels); > > bool local_nonvif_data_handle_ovs_iface_changes( > const struct ovsrec_interface_table *); > @@ -165,6 +178,23 @@ bool get_chassis_tunnel_ofport(const struct hmap > *chassis_tunnels, > ofp_port_t *ofport); > > void chassis_tunnels_destroy(struct hmap *chassis_tunnels); > + > +/* Flow-based tunnel management functions. */ > +struct flow_based_tunnel *flow_based_tunnel_find( > + const struct ovs_list *flow_tunnels, enum chassis_tunnel_type); > +void flow_based_tunnels_destroy(struct ovs_list *flow_tunnels); > +ofp_port_t get_flow_based_tunnel_port( > + enum chassis_tunnel_type, const struct ovs_list *flow_tunnels); > + > +/* Direct tunnel endpoint selection utilities. */ > +enum chassis_tunnel_type select_preferred_tunnel_type( > + const struct sbrec_chassis *local_chassis, > + const struct sbrec_chassis *remote_chassis); > +const char *select_default_encap_ip(const struct sbrec_chassis *, > + enum chassis_tunnel_type tunnel_type); > +const char *select_port_encap_ip(const struct sbrec_port_binding *binding, > + enum chassis_tunnel_type tunnel_type); > + > void local_datapath_memory_usage(struct simap *usage); > void add_local_datapath_external_port(struct local_datapath *ld, > char *logical_port, const void *data); > diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml > index 32a1f7a6f658..dfc7cc2179ea 100644 > --- a/controller/ovn-controller.8.xml > +++ b/controller/ovn-controller.8.xml > @@ -219,6 +219,36 @@ > <code>false</code> to clear the DF flag. > </dd> > > + <dt><code>external_ids:ovn-enable-flow-based-tunnels</code></dt> > + <dd> > + <p> > + The boolean flag indicates if <code>ovn-controller</code> should > + use flow-based tunnels instead of port-based tunnels for overlay > + network connectivity. Default value is <code>false</code>. > + </p> > + <p> > + <em>Port-based tunnels</em> (default mode) create a dedicated > tunnel > + port for each combination of remote chassis and local encap IP, > + while <em>Flow-based tunnels</em> create a single shared tunnel > port > + for each tunnel type, and use OpenFlow <code>set_field</code> > actions > + to dynamically set tunnel source and destination IP addresses per > + packet. > + </p> > + <p> > + This feature is experimental and is disabled by default. There are > + some known limitations to the feature: > + <ul> > + <li> > + IPSec is not supported. > + </li> > + <li> > + BFD between tunnel endpoints is not supported, thus HA chassis > + (e.g. for Distributed Gateway Port redundancy) is not > supported. > + </li> > + </ul> > + </p> > + </dd> > + > <dt><code>external_ids:ovn-bridge-mappings</code></dt> > <dd> > A list of key-value pairs that map a physical network name to a local > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > index ea65d3a3e8b0..e96ab8b39879 100644 > --- a/controller/ovn-controller.c > +++ b/controller/ovn-controller.c > @@ -3516,6 +3516,9 @@ struct ed_type_non_vif_data { > struct simap patch_ofports; /* simap of patch ovs ports. */ > struct hmap chassis_tunnels; /* hmap of 'struct chassis_tunnel' from the > * tunnel OVS ports. */ > + struct ovs_list flow_tunnels; /* list of 'struct flow_based_tunnel' > from > + * flow-based tunnel ports. */ > + bool use_flow_based_tunnels; /* Enable flow-based tunnels. */ > }; > > static void * > @@ -3525,6 +3528,8 @@ en_non_vif_data_init(struct engine_node *node > OVS_UNUSED, > struct ed_type_non_vif_data *data = xzalloc(sizeof *data); > simap_init(&data->patch_ofports); > hmap_init(&data->chassis_tunnels); > + ovs_list_init(&data->flow_tunnels); > + data->use_flow_based_tunnels = false; > return data; > } > > @@ -3534,6 +3539,7 @@ en_non_vif_data_cleanup(void *data OVS_UNUSED) > struct ed_type_non_vif_data *ed_non_vif_data = data; > simap_destroy(&ed_non_vif_data->patch_ofports); > chassis_tunnels_destroy(&ed_non_vif_data->chassis_tunnels); > + flow_based_tunnels_destroy(&ed_non_vif_data->flow_tunnels); > } > > static enum engine_node_state > @@ -3542,8 +3548,11 @@ en_non_vif_data_run(struct engine_node *node, void > *data) > struct ed_type_non_vif_data *ed_non_vif_data = data; > simap_destroy(&ed_non_vif_data->patch_ofports); > chassis_tunnels_destroy(&ed_non_vif_data->chassis_tunnels); > + flow_based_tunnels_destroy(&ed_non_vif_data->flow_tunnels); > + > simap_init(&ed_non_vif_data->patch_ofports); > hmap_init(&ed_non_vif_data->chassis_tunnels); > + ovs_list_init(&ed_non_vif_data->flow_tunnels); > > const struct ovsrec_open_vswitch_table *ovs_table = > EN_OVSDB_GET(engine_get_input("OVS_open_vswitch", node)); > @@ -3563,8 +3572,15 @@ en_non_vif_data_run(struct engine_node *node, void > *data) > = chassis_lookup_by_name(sbrec_chassis_by_name, chassis_id); > ovs_assert(chassis); > > - local_nonvif_data_run(br_int, chassis, &ed_non_vif_data->patch_ofports, > - &ed_non_vif_data->chassis_tunnels); > + ed_non_vif_data->use_flow_based_tunnels = > + is_flow_based_tunnels_enabled(ovs_table, chassis); > + > + local_nonvif_data_run(br_int, chassis, > + &ed_non_vif_data->patch_ofports, > + &ed_non_vif_data->chassis_tunnels, > + ed_non_vif_data->use_flow_based_tunnels ? > + &ed_non_vif_data->flow_tunnels : NULL); > + > return EN_UPDATED; > } > > @@ -4673,6 +4689,7 @@ static void init_physical_ctx(struct engine_node *node, > parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips); > p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; > p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath; > + p_ctx->sbrec_chassis_by_name = sbrec_chassis_by_name; > p_ctx->port_binding_table = port_binding_table; > p_ctx->ovs_interface_table = ovs_interface_table; > p_ctx->mc_group_table = multicast_group_table; > @@ -4686,6 +4703,8 @@ static void init_physical_ctx(struct engine_node *node, > p_ctx->local_bindings = &rt_data->lbinding_data.bindings; > p_ctx->patch_ofports = &non_vif_data->patch_ofports; > p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; > + p_ctx->flow_tunnels = &non_vif_data->flow_tunnels; > + p_ctx->use_flow_based_tunnels = non_vif_data->use_flow_based_tunnels; > p_ctx->always_tunnel = n_opts->always_tunnel; > p_ctx->evpn_bindings = &eb_data->bindings; > p_ctx->evpn_multicast_groups = &eb_data->multicast_groups; > diff --git a/controller/physical.c b/controller/physical.c > index 6ac5dcd3fb28..e8800fcaa1c0 100644 > --- a/controller/physical.c > +++ b/controller/physical.c > @@ -225,6 +225,202 @@ match_set_chassis_flood_outport(struct match *match, > } > } > > +/* Flow-based tunnel helper functions. */ > + > +static void > +put_set_tunnel_src(const char *src_ip, struct ofpbuf *ofpacts) This and put_set_tunnel_dst() are nearly identical. The only difference is the MFF values used. These could be combined into a single function and use a boolean src/dst qualifier to determine which MFF value to use. > +{ > + if (strchr(src_ip, ':')) { > + /* IPv6 */ > + struct in6_addr src_ipv6; > + if (inet_pton(AF_INET6, src_ip, &src_ipv6) == 1) { There is rarely a need to use inet_pton() directly in OVS/OVN code. You can use ip_parse() and ipv6_parse() as provided by OVS. > + union mf_value value; > + memset(&value, 0, sizeof value); > + memcpy(&value.ipv6, &src_ipv6, sizeof src_ipv6); > + ofpact_put_set_field(ofpacts, mf_from_id(MFF_TUN_IPV6_SRC), > + &value, NULL); > + } > + } else { > + /* IPv4 */ > + struct in_addr src_ipv4; > + if (inet_pton(AF_INET, src_ip, &src_ipv4) == 1) { > + put_load(ntohl(src_ipv4.s_addr), MFF_TUN_SRC, 0, 32, ofpacts); > + } > + } > +} > + > +static void > +put_set_tunnel_dst(const char *dst_ip, struct ofpbuf *ofpacts) > +{ > + if (strchr(dst_ip, ':')) { > + /* IPv6 */ > + struct in6_addr dst_ipv6; > + if (inet_pton(AF_INET6, dst_ip, &dst_ipv6) == 1) { > + union mf_value value; > + memset(&value, 0, sizeof value); > + memcpy(&value.ipv6, &dst_ipv6, sizeof dst_ipv6); > + ofpact_put_set_field(ofpacts, mf_from_id(MFF_TUN_IPV6_DST), > + &value, NULL); > + } > + } else { > + /* IPv4 */ > + struct in_addr dst_ipv4; > + if (inet_pton(AF_INET, dst_ip, &dst_ipv4) == 1) { > + put_load(ntohl(dst_ipv4.s_addr), MFF_TUN_DST, 0, 32, ofpacts); > + } > + } > +} > + > +/* Flow-based encapsulation that sets tunnel metadata and endpoint IPs. */ > +static void > +put_flow_based_encapsulation(enum mf_field_id mff_ovn_geneve, > + enum chassis_tunnel_type tunnel_type, > + const char *local_ip, const char *remote_ip, > + const struct sbrec_datapath_binding *datapath, > + uint16_t outport, bool is_ramp_switch, > + struct ofpbuf *ofpacts) > +{ > + struct chassis_tunnel temp_tun = { > + .type = tunnel_type, > + }; > + put_encapsulation(mff_ovn_geneve, &temp_tun, datapath, > + outport, is_ramp_switch, ofpacts); > + > + /* Set tunnel source and destination IPs (flow-based specific) */ > + put_set_tunnel_src(local_ip, ofpacts); > + put_set_tunnel_dst(remote_ip, ofpacts); > +} > + > + > +/* Generate flows for flow-based tunnel to a specific chassis. */ > +static void > +put_flow_based_remote_port_redirect_overlay( > + const struct sbrec_port_binding *binding, > + const enum en_lport_type type, > + const struct physical_ctx *ctx, > + uint32_t port_key, > + struct match *match, > + struct ofpbuf *ofpacts_p, > + struct ovn_desired_flow_table *flow_table) > +{ > + if (!ctx->use_flow_based_tunnels || !binding->chassis) { You check ctx->use_flow_based_tunnels before calling this function and then here during the function. You could probably avoid the double-check. One way would be to remove the check before calling put_flow_based_remote_port_redirect_overlay(), and changing this function to return a value to indicate whether it was successful or not. This way, the caller of put_flow_based_remote_port_redirect_overlay() can call it, check the return value, and if it's false, move on to port-based tunnel flow installation. Note: this is as far as I made it in my review. I will have a closer look at the rest some time next week. > + return; > + } > + > + /* Skip if this is a local port (no tunneling needed). */ > + if (binding->chassis == ctx->chassis) { > + return; > + } > + > + enum chassis_tunnel_type tunnel_type = > + select_preferred_tunnel_type(ctx->chassis, binding->chassis); > + if (tunnel_type == TUNNEL_TYPE_INVALID) { > + VLOG_DBG("No common tunnel type with chassis %s", > + binding->chassis->name); > + return; > + } > + > + const char *tunnel_type_str = (tunnel_type == GENEVE) ? "geneve" : > "vxlan"; > + const char *remote_ip = select_port_encap_ip(binding, tunnel_type); > + if (!remote_ip) { > + VLOG_DBG("No compatible encap IP for port %s on chassis %s " > + "with type %s", binding->logical_port, > + binding->chassis->name, tunnel_type_str); > + return; > + } > + > + ofp_port_t flow_port = get_flow_based_tunnel_port(tunnel_type, > + ctx->flow_tunnels); > + if (flow_port == 0) { > + VLOG_DBG("No flow-based tunnel port found for type %s", > + tunnel_type_str); > + return; > + } > + > + VLOG_DBG("Using flow-based tunnel: chassis=%s, tunnel_type=%s, " > + "remote_ip=%s, flow_port=%d", binding->chassis->name, > + tunnel_type_str, remote_ip, flow_port); > + > + /* Generate flows for each local encap IP. */ > + for (size_t i = 0; i < ctx->n_encap_ips; i++) { > + const char *local_encap_ip = ctx->encap_ips[i]; > + > + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); > + > + /* Set encap ID for this local IP. */ > + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16, > + (uint32_t) 0xFFFF << 16); > + > + bool is_vtep_port = type == LP_VTEP; > + if (is_vtep_port) { > + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, > ofpacts_clone); > + } > + > + /* Set flow-based tunnel encapsulation. */ > + put_flow_based_encapsulation(ctx->mff_ovn_geneve, tunnel_type, > + local_encap_ip, remote_ip, > + binding->datapath, port_key, > + is_vtep_port, ofpacts_clone); > + > + ofpact_put_OUTPUT(ofpacts_clone)->port = flow_port; > + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); > + > + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, > + binding->header_.uuid.parts[0], match, > + ofpacts_clone, &binding->header_.uuid); > + > + ofpbuf_delete(ofpacts_clone); > + } > +} > + > +static void > +add_tunnel_ingress_flows(const struct chassis_tunnel *tun, > + enum mf_field_id mff_ovn_geneve, > + struct ovn_desired_flow_table *flow_table, > + struct ofpbuf *ofpacts) > +{ > + /* Main ingress flow (priority 100) */ > + struct match match = MATCH_CATCHALL_INITIALIZER; > + match_set_in_port(&match, tun->ofport); > + > + ofpbuf_clear(ofpacts); > + put_decapsulation(mff_ovn_geneve, tun, ofpacts); > + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts); > + > + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match, > + ofpacts, hc_uuid); > + > + /* Set allow rx from tunnel bit */ > + ofpbuf_clear(ofpacts); > + put_load(1, MFF_LOG_FLAGS, MLF_RX_FROM_TUNNEL_BIT, 1, ofpacts); > + put_resubmit(OFTABLE_CT_ZONE_LOOKUP, ofpacts); > + > + /* Add specific flows for E/W ICMPv{4,6} packets if tunnelled packets > + * do not fit path MTU. */ > + > + /* IPv4 ICMP flow (priority 120) */ > + match_init_catchall(&match); > + match_set_in_port(&match, tun->ofport); > + match_set_dl_type(&match, htons(ETH_TYPE_IP)); > + match_set_nw_proto(&match, IPPROTO_ICMP); > + match_set_icmp_type(&match, 3); > + match_set_icmp_code(&match, 4); > + > + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, > + ofpacts, hc_uuid); > + > + /* IPv6 ICMP flow (priority 120) */ > + match_init_catchall(&match); > + match_set_in_port(&match, tun->ofport); > + match_set_dl_type(&match, htons(ETH_TYPE_IPV6)); > + match_set_nw_proto(&match, IPPROTO_ICMPV6); > + match_set_icmp_type(&match, 2); > + match_set_icmp_code(&match, 0); > + > + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, > + ofpacts, hc_uuid); > +} > + > static void > put_stack(enum mf_field_id field, struct ofpact_stack *stack) > { > @@ -435,7 +631,14 @@ put_remote_port_redirect_overlay(const struct > sbrec_port_binding *binding, > struct ovn_desired_flow_table *flow_table, > bool allow_hairpin) > { > - /* Setup encapsulation */ > + if (ctx->use_flow_based_tunnels) { > + put_flow_based_remote_port_redirect_overlay(binding, type, ctx, > + port_key, match, > + ofpacts_p, flow_table); > + return; > + } > + > + /* Setup encapsulation using traditional port-based tunnels. */ > for (size_t i = 0; i < ctx->n_encap_ips; i++) { > const char *encap_ip = ctx->encap_ips[i]; > struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); > @@ -2516,6 +2719,95 @@ tunnel_to_chassis(enum mf_field_id mff_ovn_geneve, > ofpact_put_OUTPUT(remote_ofpacts)->port = tun->ofport; > } > > +/* Flow-based tunnel version of fanout_to_chassis for multicast/broadcast. */ > +static void > +fanout_to_chassis_flow_based(const struct physical_ctx *ctx, > + struct sset *remote_chassis, > + const struct sbrec_datapath_binding *datapath, > + uint16_t outport, bool is_ramp_switch, > + struct ofpbuf *remote_ofpacts) > +{ > + VLOG_DBG("fanout_to_chassis_flow_based called with %"PRIuSIZE > + " remote chassis", sset_count(remote_chassis)); > + > + if (!ctx->flow_tunnels || ovs_list_is_empty(ctx->flow_tunnels)) { > + VLOG_DBG("fanout_to_chassis_flow_based: Missing or empty " > + "flow_tunnels"); > + return; > + } > + > + if (!remote_chassis || sset_is_empty(remote_chassis)) { > + VLOG_DBG("fanout_to_chassis_flow_based: No remote chassis " > + "to send to"); > + return; > + } > + > + const char *local_encap_ip = NULL; > + if (ctx->n_encap_ips <= 0) { > + return; > + } > + local_encap_ip = ctx->encap_ips[0]; /* Use first/default local IP */ > + > + const char *chassis_name; > + enum chassis_tunnel_type prev_type = TUNNEL_TYPE_INVALID; > + > + SSET_FOR_EACH (chassis_name, remote_chassis) { > + const struct sbrec_chassis *remote_chassis_rec = > + chassis_lookup_by_name(ctx->sbrec_chassis_by_name, chassis_name); > + if (!remote_chassis_rec) { > + VLOG_DBG("Chassis %s not found in SB", chassis_name); > + continue; > + } > + > + /* Direct tunnel type selection using new utility */ > + enum chassis_tunnel_type tunnel_type = > + select_preferred_tunnel_type(ctx->chassis, remote_chassis_rec); > + if (tunnel_type == TUNNEL_TYPE_INVALID) { > + VLOG_DBG("No common tunnel type with chassis %s", chassis_name); > + continue; > + } > + const char *tunnel_type_str = (tunnel_type == GENEVE) ? "geneve" > + : "vxlan"; > + > + /* Find the flow-based tunnel port for this type */ > + struct flow_based_tunnel *compatible_tunnel = > + flow_based_tunnel_find(ctx->flow_tunnels, tunnel_type); > + if (!compatible_tunnel) { > + VLOG_DBG("No flow-based tunnel port found for type %s", > + tunnel_type_str); > + continue; > + } > + > + /* Get default remote IP using new utility */ > + const char *remote_ip = select_default_encap_ip(remote_chassis_rec, > + tunnel_type); > + if (!remote_ip) { > + VLOG_DBG("No compatible encap IP for chassis %s with type %s", > + chassis_name, tunnel_type_str); > + continue; > + } > + > + /* Add encapsulation if tunnel type changed or this is the first > + * chassis */ > + if (tunnel_type != prev_type) { > + /* Create a temporary chassis_tunnel structure for encapsulation > */ > + struct chassis_tunnel temp_tun = { > + .chassis_id = CONST_CAST(char *, chassis_name), > + .ofport = compatible_tunnel->ofport, > + .type = tunnel_type > + }; > + put_encapsulation(ctx->mff_ovn_geneve, &temp_tun, datapath, > + outport, is_ramp_switch, remote_ofpacts); > + prev_type = tunnel_type; > + } > + > + /* Set tunnel source and destination IPs for flow-based tunnels */ > + put_set_tunnel_src(local_encap_ip, remote_ofpacts); > + put_set_tunnel_dst(remote_ip, remote_ofpacts); > + ofpact_put_OUTPUT(remote_ofpacts)->port = compatible_tunnel->ofport; > + } > +} > + > /* Encapsulate and send to a set of remote chassis. */ > static void > fanout_to_chassis(enum mf_field_id mff_ovn_geneve, > @@ -2757,12 +3049,27 @@ consider_mc_group(const struct physical_ctx *ctx, > if (remote_ports) { > put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, > &remote_ctx->ofpacts); > } > - fanout_to_chassis(ctx->mff_ovn_geneve, &remote_chassis, > - ctx->chassis_tunnels, mc->datapath, mc->tunnel_key, > - false, &remote_ctx->ofpacts); > - fanout_to_chassis(ctx->mff_ovn_geneve, &vtep_chassis, > - ctx->chassis_tunnels, mc->datapath, mc->tunnel_key, > - true, &remote_ctx->ofpacts); > + if (ctx->use_flow_based_tunnels) { > + VLOG_DBG("Using flow-based tunnels for multicast group %s " > + "(tunnel_key=%"PRId64") with %"PRIuSIZE" remote chassis", > + mc->name, mc->tunnel_key, sset_count(&remote_chassis)); > + fanout_to_chassis_flow_based(ctx, &remote_chassis, > + mc->datapath, mc->tunnel_key, > + false, &remote_ctx->ofpacts); > + fanout_to_chassis_flow_based(ctx, &vtep_chassis, > + mc->datapath, mc->tunnel_key, > + true, &remote_ctx->ofpacts); > + } else { > + VLOG_DBG("Using port-based tunnels for multicast group %s " > + "(tunnel_key=%"PRId64") with %"PRIuSIZE" remote chassis", > + mc->name, mc->tunnel_key, sset_count(&remote_chassis)); > + fanout_to_chassis(ctx->mff_ovn_geneve, &remote_chassis, > + ctx->chassis_tunnels, mc->datapath, mc->tunnel_key, > + false, &remote_ctx->ofpacts); > + fanout_to_chassis(ctx->mff_ovn_geneve, &vtep_chassis, > + ctx->chassis_tunnels, mc->datapath, mc->tunnel_key, > + true, &remote_ctx->ofpacts); > + } > > remote_ports = remote_ctx->ofpacts.size > 0; > if (remote_ports) { > @@ -3394,6 +3701,7 @@ physical_run(struct physical_ctx *p_ctx, > consider_mc_group(p_ctx, mc, flow_table); > } > > + > /* Table 0, priority 100. > * ====================== > * > @@ -3408,44 +3716,30 @@ physical_run(struct physical_ctx *p_ctx, > * packets to the local hypervisor. */ > struct chassis_tunnel *tun; > HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) { > - struct match match = MATCH_CATCHALL_INITIALIZER; > - match_set_in_port(&match, tun->ofport); > - > - ofpbuf_clear(&ofpacts); > - put_decapsulation(p_ctx->mff_ovn_geneve, tun, &ofpacts); > - > - put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts); > - ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match, > - &ofpacts, hc_uuid); > - > - /* Set allow rx from tunnel bit. */ > - put_load(1, MFF_LOG_FLAGS, MLF_RX_FROM_TUNNEL_BIT, 1, &ofpacts); > - > - /* Add specif flows for E/W ICMPv{4,6} packets if tunnelled packets > - * do not fit path MTU. > - */ > - put_resubmit(OFTABLE_CT_ZONE_LOOKUP, &ofpacts); > - > - /* IPv4 */ > - match_init_catchall(&match); > - match_set_in_port(&match, tun->ofport); > - match_set_dl_type(&match, htons(ETH_TYPE_IP)); > - match_set_nw_proto(&match, IPPROTO_ICMP); > - match_set_icmp_type(&match, 3); > - match_set_icmp_code(&match, 4); > - > - ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, > - &ofpacts, hc_uuid); > - /* IPv6 */ > - match_init_catchall(&match); > - match_set_in_port(&match, tun->ofport); > - match_set_dl_type(&match, htons(ETH_TYPE_IPV6)); > - match_set_nw_proto(&match, IPPROTO_ICMPV6); > - match_set_icmp_type(&match, 2); > - match_set_icmp_code(&match, 0); > - > - ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, > - &ofpacts, hc_uuid); > + add_tunnel_ingress_flows(tun, p_ctx->mff_ovn_geneve, flow_table, > + &ofpacts); > + } > + > + /* Process packets that arrive from flow-based tunnels. */ > + if (p_ctx->use_flow_based_tunnels && p_ctx->flow_tunnels > + && !ovs_list_is_empty(p_ctx->flow_tunnels)) { > + struct flow_based_tunnel *tunnel; > + LIST_FOR_EACH (tunnel, list_node, p_ctx->flow_tunnels) { > + /* Flow-based tunnels use the same ingress flow logic as > + * port-based. Create a temporary chassis_tunnel structure > + * for compatibility. */ > + struct chassis_tunnel temp_tunnel = { > + .type = tunnel->type, > + .ofport = tunnel->ofport, > + .chassis_id = NULL /* Not needed for decapsulation */ > + }; > + > + VLOG_DBG("Adding flow-based tunnel ingress flow: in_port=%d, " > + "type=%d", tunnel->ofport, tunnel->type); > + > + add_tunnel_ingress_flows(&temp_tunnel, p_ctx->mff_ovn_geneve, > + flow_table, &ofpacts); > + } > } > > /* Add VXLAN specific rules to transform port keys > diff --git a/controller/physical.h b/controller/physical.h > index 0dc544823ad5..6cefe0ab36d7 100644 > --- a/controller/physical.h > +++ b/controller/physical.h > @@ -51,6 +51,7 @@ struct physical_debug { > struct physical_ctx { > struct ovsdb_idl_index *sbrec_port_binding_by_name; > struct ovsdb_idl_index *sbrec_port_binding_by_datapath; > + struct ovsdb_idl_index *sbrec_chassis_by_name; > const struct sbrec_port_binding_table *port_binding_table; > const struct ovsrec_interface_table *ovs_interface_table; > const struct sbrec_multicast_group_table *mc_group_table; > @@ -65,6 +66,8 @@ struct physical_ctx { > struct shash *local_bindings; > struct simap *patch_ofports; > struct hmap *chassis_tunnels; > + bool use_flow_based_tunnels; > + struct ovs_list *flow_tunnels; > size_t n_encap_ips; > const char **encap_ips; > struct physical_debug debug; > diff --git a/lib/ovn-util.h b/lib/ovn-util.h > index 7a5bb9559efd..69a0d41a6894 100644 > --- a/lib/ovn-util.h > +++ b/lib/ovn-util.h > @@ -347,6 +347,7 @@ hash_add_in6_addr(uint32_t hash, const struct in6_addr > *addr) > /* Must be a bit-field ordered from most-preferred (higher number) to > * least-preferred (lower number). */ > enum chassis_tunnel_type { > + TUNNEL_TYPE_INVALID = 0, /* No valid tunnel type or no common type */ > GENEVE = 1 << 1, > VXLAN = 1 << 0 > }; > diff --git a/tests/ovn-macros.at b/tests/ovn-macros.at > index cc16fe0298e6..4055697f4ae4 100644 > --- a/tests/ovn-macros.at > +++ b/tests/ovn-macros.at > @@ -109,7 +109,12 @@ m4_divert_text([PREPARE_TESTS], > echo "$3: waiting for flows for remote output on $hv1" > # Wait for a flow outputing to remote output > OVS_WAIT_UNTIL([ > - ofport=$(as $hv1 ovs-vsctl --bare --columns ofport find Interface > name=ovn-${hv2}-0) > + if test X$OVN_ENABLE_FLOW_BASED_TUNNELS = Xyes; then > + tun_name=ovn-geneve > + else > + tun_name=ovn-${hv2}-0 > + fi > + ofport=$(as $hv1 ovs-vsctl --bare --columns ofport find Interface > name=$tun_name) > echo "tunnel port=$ofport" > test -n "$ofport" && test 1 -le $(as $hv1 ovs-ofctl dump-flows > br-int | grep -c "output:$ofport") > ]) > @@ -121,7 +126,12 @@ m4_divert_text([PREPARE_TESTS], > echo "$3: waiting for flows for remote input on $hv1" > # Wait for a flow outputing to remote input > OVS_WAIT_UNTIL([ > - ofport=$(as $hv1 ovs-vsctl --bare --columns ofport find Interface > name=ovn-${hv2}-0) > + if test X$OVN_ENABLE_FLOW_BASED_TUNNELS = Xyes; then > + tun_name=ovn-geneve > + else > + tun_name=ovn-${hv2}-0 > + fi > + ofport=$(as $hv1 ovs-vsctl --bare --columns ofport find Interface > name=$tun_name) > echo "tunnel port=$ofport" > test -n "$ofport" && test 1 -le $(as $hv1 ovs-ofctl dump-flows > br-int | grep -c "in_port=$ofport") > ]) > @@ -762,6 +772,13 @@ ovn_az_attach() { > ovs-vsctl set open . external_ids:ovn-monitor-all=true > fi > > + # Configure flow-based tunnels if the test variable is set > + if test X$OVN_ENABLE_FLOW_BASED_TUNNELS = Xyes; then > + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=true > + elif test X$OVN_ENABLE_FLOW_BASED_TUNNELS = Xno; then > + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=false > + fi > + > start_daemon ovn-controller --enable-dummy-vif-plug ${cli_args} || > return 1 > if test X"$az" = XNONE; then > ovn_wait_for_encaps $expected_encap_id > @@ -1441,6 +1458,16 @@ m4_define([OVN_FOR_EACH_NORTHD_NO_HV_PARALLELIZATION], > [m4_foreach([NORTHD_USE_PARALLELIZATION], [yes, no], [$1 > ])]) > > +# Defines versions of the test with all combinations of northd, > +# parallelization enabled, conditional monitoring on/off, and flow-based > +# tunnels on/off. Use this for tests that need to verify behavior with both > +# port-based and flow-based tunnel implementations. > +m4_define([OVN_FOR_EACH_NORTHD_FLOW_TUNNEL], > + [m4_foreach([NORTHD_USE_PARALLELIZATION], [yes], > + [m4_foreach([OVN_MONITOR_ALL], [yes, no], > + [m4_foreach([OVN_ENABLE_FLOW_BASED_TUNNELS], [yes, no], [$1 > +])])])]) > + > # OVN_NBCTL(NBCTL_COMMAND) adds NBCTL_COMMAND to list of commands to be run > by RUN_OVN_NBCTL(). > m4_define([OVN_NBCTL], [ > command="${command} -- $1" > diff --git a/tests/ovn.at b/tests/ovn.at > index 6aff32dc05a4..8b74bdfdf618 100644 > --- a/tests/ovn.at > +++ b/tests/ovn.at > @@ -2508,7 +2508,7 @@ AT_CLEANUP > ]) > > # 3 hypervisors, one logical switch, 3 logical ports per hypervisor > -OVN_FOR_EACH_NORTHD([ > +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ > AT_SETUP([3 HVs, 1 LS, 3 lports/HV]) > AT_KEYWORDS([ovnarp]) > AT_KEYWORDS([slowtest]) > @@ -2785,7 +2785,7 @@ AT_CLEANUP > > # 2 hypervisors, one logical switch, 2 logical ports per hypervisor > # logical ports bound to chassis encap-ip. > -OVN_FOR_EACH_NORTHD([ > +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ > AT_SETUP([2 HVs, 1 LS, 2 lports/HV]) > AT_KEYWORDS([ovnarp]) > ovn_start > @@ -3188,7 +3188,7 @@ AT_CLEANUP > # 2 hypervisors, 4 logical ports per HV > # 2 locally attached networks (one flat, one vlan tagged over same device) > # 2 ports per HV on each network > -OVN_FOR_EACH_NORTHD([ > +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ > AT_SETUP([2 HVs, 4 lports/HV, localnet ports]) > ovn_start > > @@ -3522,7 +3522,7 @@ OVN_CLEANUP([hv-1 > AT_CLEANUP > ]) > > -OVN_FOR_EACH_NORTHD([ > +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ > AT_SETUP([2 HVs, 2 LS, broadcast traffic with multiple localnet ports per > switch]) > ovn_start > > @@ -4756,7 +4756,7 @@ AT_CLEANUP > ]) > > # Similar test to "hardware GW" > -OVN_FOR_EACH_NORTHD([ > +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ > AT_SETUP([3 HVs, 1 VIFs/HV, 1 software GW, 1 LS]) > ovn_start > > @@ -4915,7 +4915,7 @@ AT_CLEANUP > ]) > > # 3 hypervisors, 3 logical switches with 3 logical ports each, 1 logical > router > -OVN_FOR_EACH_NORTHD([ > +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ > AT_SETUP([3 HVs, 3 LS, 3 lports/LS, 1 LR]) > AT_KEYWORDS([slowtest]) > AT_SKIP_IF([test $HAVE_SCAPY = no]) > @@ -5352,7 +5352,7 @@ OVN_CLEANUP([hv1], [hv2], [hv3]) > AT_CLEANUP > ]) > > -OVN_FOR_EACH_NORTHD([ > +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ > AT_SETUP([IP relocation using GARP request]) > AT_SKIP_IF([test $HAVE_SCAPY = no]) > ovn_start > @@ -31371,6 +31371,7 @@ for i in 1 2; do > ovs-vsctl add-br br-phys-$j > ovn_attach n1 br-phys-$j 192.168.0.${i}1 > ovs-vsctl set open . > external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 > + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=false > > for j in 1 2; do > ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ > @@ -31424,6 +31425,7 @@ vif_to_hv() { > # tunnel that matches the VIFs' encap_ip configurations. > check_packet_tunnel() { > local src=$1 dst=$2 > + local flow_based_tunnel=$3 > local src_mac=f0:00:00:00:00:$src > local dst_mac=f0:00:00:00:00:$dst > local src_ip=10.0.0.$src > @@ -31436,8 +31438,17 @@ check_packet_tunnel() { > hv=`vif_to_hv vif$src` > as $hv > echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> > $remote_encap_ip" > - tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > + if test x$flow_based_tunnel == xtrue; then > + tunnel_ofport=$(ovs-vsctl --bare --column=ofport list interface > ovn-geneve) > + else > + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > + fi > AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet > | grep "output:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) > + if test x$flow_based_tunnel == xtrue; then > + trace_output=$(ovs-appctl ofproto/trace br-int in_port=vif$src > $packet) > + AT_CHECK([echo "$trace_output" | grep -q > "set_field:$local_encap_ip->tun_src"]) > + AT_CHECK([echo "$trace_output" | grep -q > "set_field:$remote_encap_ip->tun_dst"]) > + fi > } > > for i in 1 2; do > @@ -31446,6 +31457,20 @@ for i in 1 2; do > done > done > > +# Test flow-based tunnels > +for i in 1 2; do > + as hv$i > + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=true > +done > +check ovn-nbctl --wait=hv sync > + > +for i in 1 2; do > + for j in 1 2; do > + check_packet_tunnel 1$i 2$j true > + done > +done > + > + > OVN_CLEANUP([hv1],[hv2]) > AT_CLEANUP > ]) > @@ -31467,6 +31492,7 @@ for i in 1 2; do > ovs-vsctl add-br br-phys-$j > ovn_attach n1 br-phys-$j 192.168.0.${i}1 > ovs-vsctl set open . > external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 > + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=false > > check ovn-nbctl ls-add ls$i -- \ > lrp-add lr lrp-ls$i f0:00:00:00:88:0$i 10.0.$i.88/24 -- \ > @@ -31503,38 +31529,37 @@ vif_to_ip() { > # tunnel that matches the VIFs' encap_ip configurations. > check_packet_tunnel() { > local src=$1 dst=$2 > + local local_encap_ip=$3 > + local remote_encap_ip=$4 > + local flow_based_tunnel=$5 > local src_mac=f0:00:00:00:00:$src > local dst_mac=f0:00:00:00:88:01 # lrp-ls1's MAC > local src_ip=$(vif_to_ip vif$src) > local dst_ip=$(vif_to_ip vif$dst) > > - local local_encap_ip > - if test -n "$3"; then > - local_encap_ip=$3 > - else > - local_encap_ip=192.168.0.$src > - fi > - > - local remote_encap_ip > - if test -n "$4"; then > - remote_encap_ip=$4 > - else > - remote_encap_ip=192.168.0.$dst > - fi > - > local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ > IP(dst='${dst_ip}', src='${src_ip}')/ \ > ICMP(type=8)") > hv=`vif_to_hv vif$src` > as $hv > echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> > $remote_encap_ip" > - tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > + > + if test x$flow_based_tunnel == xtrue; then > + tunnel_ofport=$(ovs-vsctl --bare --column=ofport list interface > ovn-geneve) > + else > + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > + fi > AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet > | grep "output:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) > + if test x$flow_based_tunnel == xtrue; then > + trace_output=$(ovs-appctl ofproto/trace br-int in_port=vif$src > $packet) > + AT_CHECK([echo "$trace_output" | grep -q > "set_field:$local_encap_ip->tun_src"]) > + AT_CHECK([echo "$trace_output" | grep -q > "set_field:$remote_encap_ip->tun_dst"]) > + fi > } > > for i in 1 2; do > for j in 1 2; do > - check_packet_tunnel 1$i 2$j > + check_packet_tunnel 1$i 2$j 192.168.0.1$i 192.168.0.2$j > done > done > > @@ -31558,6 +31583,36 @@ for i in 1 2; do > done > done > > +# Test flow-based tunnels > +for i in 1 2; do > + as hv$i > + check ovs-vsctl set open . > external_ids:ovn-enable-flow-based-tunnels=true > + for j in 1 2; do > + check ovs-vsctl set Interface vif$i$j > external_ids:encap-ip=192.168.0.$i$j > + done > +done > +check ovn-nbctl --wait=hv sync > + > +for i in 1 2; do > + for j in 1 2; do > + check_packet_tunnel 1$i 2$j 192.168.0.1$i 192.168.0.2$j true > + done > +done > + > +for i in 1 2; do > + as hv$i > + for j in 1 2; do > + check ovs-vsctl remove Interface vif$i$j external_ids encap-ip > + done > +done > +check ovn-nbctl --wait=hv sync > + > +for i in 1 2; do > + for j in 1 2; do > + check_packet_tunnel 1$i 2$j 192.168.0.12 192.168.0.22 true > + done > +done > + > OVN_CLEANUP([hv1],[hv2]) > AT_CLEANUP > ]) > @@ -31585,6 +31640,7 @@ for i in 1 2; do > ovs-vsctl add-br br-phys-$j > ovn_attach n1 br-phys-$j 192.168.0.${i}1 > ovs-vsctl set open . > external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 > + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=false > > ovs-vsctl add-br br-ext > ovs-vsctl add-port br-ext vif$i -- set Interface vif$i \ > @@ -31630,14 +31686,26 @@ check_packet_tunnel() { > local local_encap_ip=$3 > local remote_encap_ip=$4 > > + local flow_based_tunnel=$5 > + > local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ > IP(dst='${dst_ip}', src='${src_ip}')/ \ > ICMP(type=8)") > hv=`vif_to_hv vif$src` > as $hv > echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> > $remote_encap_ip" > - tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > + if test x$flow_based_tunnel == xtrue; then > + tunnel_ofport=$(ovs-vsctl --bare --column=ofport list interface > ovn-geneve) > + else > + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface > options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) > + fi > + ovs-appctl ofproto/trace br-ext in_port=vif$src $packet > AT_CHECK([test $(ovs-appctl ofproto/trace br-ext in_port=vif$src $packet > | grep "output:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) > + if test x$flow_based_tunnel == xtrue; then > + trace_output=$(ovs-appctl ofproto/trace br-ext in_port=vif$src > $packet) > + AT_CHECK([echo "$trace_output" | grep -q > "set_field:$local_encap_ip->tun_src"]) > + AT_CHECK([echo "$trace_output" | grep -q > "set_field:$remote_encap_ip->tun_dst"]) > + fi > } > > # Create mac-binding for the destination so that there is no need to trigger > @@ -31659,6 +31727,23 @@ for e in 1 2; do > check_packet_tunnel 1 2 192.168.0.1${e} 192.168.0.2${e} > done > > +# Test flow-based tunnels > +for i in 1 2; do > + as hv$i > + check ovs-vsctl set open . > external_ids:ovn-enable-flow-based-tunnels=true > +done > +check ovn-nbctl --wait=hv sync > + > +for e in 1 2; do > + for i in 1 2; do > + as hv$i > + check ovs-vsctl set open . > external_ids:ovn-encap-ip-default=192.168.0.${i}${e} > + done > + check ovn-nbctl --wait=hv sync > + > + check_packet_tunnel 1 2 192.168.0.1${e} 192.168.0.2${e} true > +done > + > OVN_CLEANUP([hv1],[hv2]) > AT_CLEANUP > ]) > diff --git a/tests/ovs-macros.at b/tests/ovs-macros.at > index 6a3dc51dc392..6e6e5208028e 100644 > --- a/tests/ovs-macros.at > +++ b/tests/ovs-macros.at > @@ -9,13 +9,15 @@ dnl - If NORTHD_USE_DP_GROUPS is defined, then append it to > the test name and > dnl set it as a shell variable as well. > m4_rename([AT_SETUP], [OVS_AT_SETUP]) > m4_define([AT_SETUP], > - [OVS_AT_SETUP($@[]m4_ifdef([NORTHD_USE_PARALLELIZATION], [ -- > parallelization=NORTHD_USE_PARALLELIZATION])[]m4_ifdef([OVN_MONITOR_ALL], [ > -- ovn_monitor_all=OVN_MONITOR_ALL])) > + [OVS_AT_SETUP($@[]m4_ifdef([NORTHD_USE_PARALLELIZATION], [ -- > parallelization=NORTHD_USE_PARALLELIZATION])[]m4_ifdef([OVN_MONITOR_ALL], [ > -- > ovn_monitor_all=OVN_MONITOR_ALL])[]m4_ifdef([OVN_ENABLE_FLOW_BASED_TUNNELS], > [ -- flow_based_tunnels=OVN_ENABLE_FLOW_BASED_TUNNELS])) > m4_ifdef([NORTHD_USE_PARALLELIZATION], > [[NORTHD_USE_PARALLELIZATION]=NORTHD_USE_PARALLELIZATION > ])dnl > m4_ifdef([NORTHD_DUMMY_NUMA], [[NORTHD_DUMMY_NUMA]=NORTHD_DUMMY_NUMA > ])dnl > m4_ifdef([OVN_MONITOR_ALL], [[OVN_MONITOR_ALL]=OVN_MONITOR_ALL > ])dnl > +m4_ifdef([OVN_ENABLE_FLOW_BASED_TUNNELS], > [[OVN_ENABLE_FLOW_BASED_TUNNELS]=OVN_ENABLE_FLOW_BASED_TUNNELS > +])dnl > ovs_init > ]) > > -- > 2.38.1 > > _______________________________________________ > dev mailing list > [email protected] > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
