This patch introduces flow-based tunnels as an alternative to traditional port-based tunnels, significantly reducing tunnel port count in large deployments.
Flow-based tunnels use shared ports (one per tunnel type) with options:local_ip=flow and options:remote_ip=flow. OpenFlow flows dynamically set tunnel endpoints using set_field actions, reducing port count to O(T) where T is the number of tunnel types. The feature is experimental, and controlled by external_ids:ovn-enable-flow-based-tunnels (default: false). Some known limitations: - IPsec is not supported - BFD between tunnel endpoints is not supported, thus HA chassis not supported. Assisted-by: Cursor, with model: Claude Sonnet 4.5 Signed-off-by: Han Zhou <[email protected]> --- NEWS | 10 +- controller/bfd.c | 16 +- controller/bfd.h | 3 +- controller/encaps.c | 240 +++++++++++++++---- controller/encaps.h | 15 ++ controller/local_data.c | 188 ++++++++++++++- controller/local_data.h | 32 ++- controller/ovn-controller.8.xml | 30 +++ controller/ovn-controller.c | 26 ++- controller/physical.c | 402 +++++++++++++++++++++++++++----- controller/physical.h | 3 + lib/ovn-util.h | 3 +- tests/ovn-macros.at | 31 ++- tests/ovn.at | 135 +++++++++-- tests/ovs-macros.at | 4 +- 15 files changed, 992 insertions(+), 146 deletions(-) diff --git a/NEWS b/NEWS index a4c8557eecf0..22a9141e0fec 100644 --- a/NEWS +++ b/NEWS @@ -66,9 +66,13 @@ Post v25.09.0 specified LS. - Add ovn-nbctl lsp-add-localnet-port which will create localnet port on specified LS. - - Add a new experimental service - ovn-br-controller to program and - manage OVS bridges (not managed by ovn-controller) using OVN logical flows. - For more details see man ovn-br(5). + - Add a new experimental service - ovn-br-controller to program and + manage OVS bridges (not managed by ovn-controller) using OVN logical + flows. For more details see man ovn-br(5). + - Added experimental flow-based tunnel support. Enable via + external_ids:ovn-enable-flow-based-tunnels=true to use shared tunnel + ports instead of per-chassis ports, reducing port count for large scale + environments. Default is disabled. OVN v25.09.0 - xxx xx xxxx -------------------------- diff --git a/controller/bfd.c b/controller/bfd.c index 9889b7e7a4cb..3b0c3f6dae0f 100644 --- a/controller/bfd.c +++ b/controller/bfd.c @@ -70,7 +70,7 @@ bfd_calculate_active_tunnels(const struct ovsrec_bridge *br_int, "state"); if (status && !strcmp(status, "up")) { const char *id = smap_get(&port_rec->external_ids, - "ovn-chassis-id"); + OVN_TUNNEL_ID); if (id) { char *chassis_name = NULL; @@ -185,7 +185,8 @@ bfd_run(const struct ovsrec_interface_table *interface_table, const struct ovsrec_bridge *br_int, const struct sset *bfd_chassis, const struct sbrec_chassis *chassis_rec, - const struct sbrec_sb_global_table *sb_global_table) + const struct sbrec_sb_global_table *sb_global_table, + const struct ovsrec_open_vswitch_table *ovs_table) { if (!chassis_rec) { return; @@ -196,7 +197,7 @@ bfd_run(const struct ovsrec_interface_table *interface_table, struct sset bfd_ifaces = SSET_INITIALIZER(&bfd_ifaces); for (size_t k = 0; k < br_int->n_ports; k++) { const char *tunnel_id = smap_get(&br_int->ports[k]->external_ids, - "ovn-chassis-id"); + OVN_TUNNEL_ID); if (tunnel_id) { char *chassis_name = NULL; char *port_name = br_int->ports[k]->name; @@ -212,6 +213,15 @@ bfd_run(const struct ovsrec_interface_table *interface_table, } } + /* Warn if BFD is needed but flow-based tunnels are enabled */ + if (is_flow_based_tunnels_enabled(ovs_table, chassis_rec) + && !sset_is_empty(bfd_chassis)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "Flow-based tunnels are enabled but BFD is required " + "for HA chassis groups. BFD will not function correctly " + "with flow-based tunnels."); + } + const struct sbrec_sb_global *sb = sbrec_sb_global_table_first(sb_global_table); struct smap bfd = SMAP_INITIALIZER(&bfd); diff --git a/controller/bfd.h b/controller/bfd.h index be0d599b7cee..f8fece5a580b 100644 --- a/controller/bfd.h +++ b/controller/bfd.h @@ -33,7 +33,8 @@ void bfd_run(const struct ovsrec_interface_table *, const struct ovsrec_bridge *, const struct sset *, const struct sbrec_chassis *, - const struct sbrec_sb_global_table *); + const struct sbrec_sb_global_table *, + const struct ovsrec_open_vswitch_table *); void bfd_calculate_chassis( const struct sbrec_chassis *, diff --git a/controller/encaps.c b/controller/encaps.c index 3ed42534b36a..61f41bf3acb5 100644 --- a/controller/encaps.c +++ b/controller/encaps.c @@ -29,14 +29,6 @@ VLOG_DEFINE_THIS_MODULE(encaps); -/* - * Given there could be multiple tunnels with different IPs to the same - * chassis we annotate the external_ids:ovn-chassis-id in tunnel port with - * <chassis_name>@<remote IP>%<local IP>. The external_id key - * "ovn-chassis-id" is kept for backward compatibility. - */ -#define OVN_TUNNEL_ID "ovn-chassis-id" - static char *current_br_int_name = NULL; void @@ -550,6 +542,172 @@ create_evpn_tunnels(struct tunnel_ctx *tc) } +bool +is_flow_based_tunnels_enabled( + const struct ovsrec_open_vswitch_table *ovs_table, + const struct sbrec_chassis *chassis) +{ + const struct ovsrec_open_vswitch *cfg = + ovsrec_open_vswitch_table_first(ovs_table); + + return cfg ? get_chassis_external_id_value_bool( + &cfg->external_ids, chassis->name, + "ovn-enable-flow-based-tunnels", false) + : false; +} + +static char * +flow_based_tunnel_name(const char *tunnel_type, const char *chassis_idx) +{ + return xasprintf("ovn%s-%s", chassis_idx, tunnel_type); +} + +static void +flow_based_tunnel_ensure(struct tunnel_ctx *tc, const char *tunnel_type, + const char *port_name, + const struct sbrec_sb_global *sbg, + const struct ovsrec_open_vswitch_table *ovs_table) +{ + /* Check if flow-based tunnel already exists. */ + const struct ovsrec_port *existing_port = NULL; + for (size_t i = 0; i < tc->br_int->n_ports; i++) { + const struct ovsrec_port *port = tc->br_int->ports[i]; + if (!strcmp(port->name, port_name)) { + existing_port = port; + break; + } + } + + if (existing_port) { + return; + } + + /* Create flow-based tunnel port. */ + struct smap options = SMAP_INITIALIZER(&options); + smap_add(&options, "remote_ip", "flow"); + smap_add(&options, "local_ip", "flow"); + smap_add(&options, "key", "flow"); + + if (sbg->ipsec) { + /* For flow-based tunnels, we can't specify remote_name since + * remote chassis varies. IPsec will need to handle this differently. + */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); + VLOG_WARN_RL(&rl, "IPsec is not supported for flow-based tunnels. " + "Ignoring IPsec settings."); + } + + /* Add other tunnel options from OVS config. */ + const struct ovsrec_open_vswitch *cfg = + ovsrec_open_vswitch_table_first(ovs_table); + if (cfg) { + const char *encap_tos = + get_chassis_external_id_value(&cfg->external_ids, + tc->this_chassis->name, + "ovn-encap-tos", "none"); + if (encap_tos && strcmp(encap_tos, "none")) { + smap_add(&options, "tos", encap_tos); + } + + const char *encap_df = + get_chassis_external_id_value(&cfg->external_ids, + tc->this_chassis->name, + "ovn-encap-df_default", NULL); + if (encap_df) { + smap_add(&options, "df_default", encap_df); + } + } + + /* Create interface. */ + struct ovsrec_interface *iface = ovsrec_interface_insert(tc->ovs_txn); + ovsrec_interface_set_name(iface, port_name); + ovsrec_interface_set_type(iface, tunnel_type); + ovsrec_interface_set_options(iface, &options); + + /* Create port. */ + struct ovsrec_port *port = ovsrec_port_insert(tc->ovs_txn); + ovsrec_port_set_name(port, port_name); + ovsrec_port_set_interfaces(port, &iface, 1); + + /* Set external IDs to mark as flow-based tunnel using unified + * OVN_TUNNEL_ID. */ + const struct smap external_ids = SMAP_CONST2(&external_ids, + OVN_TUNNEL_ID, "flow", + "ovn-tunnel-type", + tunnel_type); + ovsrec_port_set_external_ids(port, &external_ids); + + /* Add to bridge. */ + ovsrec_bridge_update_ports_addvalue(tc->br_int, port); + + VLOG_INFO("Created flow-based %s tunnel port: %s", tunnel_type, port_name); + + smap_destroy(&options); +} + +static void +create_flow_based_tunnels(struct tunnel_ctx *tc, + const struct sbrec_sb_global *sbg) +{ + struct sset tunnel_types = SSET_INITIALIZER(&tunnel_types); + + for (size_t i = 0; i < tc->this_chassis->n_encaps; i++) { + sset_add(&tunnel_types, tc->this_chassis->encaps[i]->type); + } + + const char *tunnel_type; + SSET_FOR_EACH (tunnel_type, &tunnel_types) { + char *port_name = flow_based_tunnel_name(tunnel_type, + get_chassis_idx(tc->ovs_table)); + flow_based_tunnel_ensure(tc, tunnel_type, port_name, sbg, + tc->ovs_table); + /* Remove any existing tunnel with this name from tracking so it + * doesn't get deleted. */ + struct tunnel_node *exist_tun = shash_find_and_delete(&tc->tunnel, + port_name); + free(exist_tun); + free(port_name); + } + + sset_destroy(&tunnel_types); +} + +static void +create_port_based_tunnels(struct tunnel_ctx *tc, + const struct sbrec_chassis_table *chassis_table, + const struct sbrec_sb_global *sbg, + const struct sset *transport_zones) +{ + const struct sbrec_chassis *chassis_rec; + SBREC_CHASSIS_TABLE_FOR_EACH (chassis_rec, chassis_table) { + if (strcmp(chassis_rec->name, tc->this_chassis->name)) { + /* Create tunnels to the other Chassis belonging to the + * same transport zone */ + if (!chassis_tzones_overlap(transport_zones, chassis_rec)) { + VLOG_DBG("Skipping encap creation for Chassis '%s' because " + "it belongs to different transport zones", + chassis_rec->name); + continue; + } + + if (smap_get_bool(&chassis_rec->other_config, "is-remote", false) + && !smap_get_bool(&tc->this_chassis->other_config, + "is-interconn", false)) { + VLOG_DBG("Skipping encap creation for Chassis '%s' because " + "it is remote but this chassis is not interconn.", + chassis_rec->name); + continue; + } + + if (chassis_tunnel_add(chassis_rec, sbg, tc->ovs_table, tc, + tc->this_chassis) == 0) { + VLOG_INFO("Creating encap for '%s' failed", chassis_rec->name); + continue; + } + } + } +} + void encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, struct ovsdb_idl_txn *ovnsb_idl_txn, @@ -565,6 +723,11 @@ encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, return; } + bool use_flow_based = is_flow_based_tunnels_enabled(ovs_table, + this_chassis); + VLOG_DBG("Using %s tunnels for this chassis.", + use_flow_based ? "flow-based" : "port-based"); + if (!current_br_int_name) { /* The controller has just started, we need to look through all * bridges for old tunnel ports. */ @@ -594,8 +757,6 @@ encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, current_br_int_name = xstrdup(br_int->name); } - const struct sbrec_chassis *chassis_rec; - struct tunnel_ctx tc = { .tunnel = SHASH_INITIALIZER(&tc.tunnel), .port_names = SSET_INITIALIZER(&tc.port_names), @@ -610,24 +771,30 @@ encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, "ovn-controller: modifying OVS tunnels '%s'", this_chassis->name); - /* Collect all port names into tc.port_names. - * - * Collect all the OVN-created tunnels into tc.tunnel_hmap. */ + /* Collect existing port names and tunnel ports for cleanup. */ for (size_t i = 0; i < br_int->n_ports; i++) { const struct ovsrec_port *port = br_int->ports[i]; sset_add(&tc.port_names, port->name); const char *id = smap_get(&port->external_ids, OVN_TUNNEL_ID); if (id) { - if (!shash_find(&tc.tunnel, id)) { - struct tunnel_node *tunnel = xzalloc(sizeof *tunnel); - tunnel->bridge = br_int; - tunnel->port = port; - shash_add_assert(&tc.tunnel, id, tunnel); + struct tunnel_node *tunnel = xzalloc(sizeof *tunnel); + tunnel->bridge = br_int; + tunnel->port = port; + + if (use_flow_based) { + /* Flow-based: track by port name */ + shash_add(&tc.tunnel, port->name, tunnel); } else { - /* Duplicate port for tunnel-id. Arbitrarily choose - * to delete this one. */ - ovsrec_bridge_update_ports_delvalue(br_int, port); + /* Port-based: track by tunnel ID, handle duplicates */ + if (!shash_find(&tc.tunnel, id)) { + shash_add_assert(&tc.tunnel, id, tunnel); + } else { + /* Duplicate port for tunnel-id. Arbitrarily choose + * to delete this one. */ + ovsrec_bridge_update_ports_delvalue(br_int, port); + free(tunnel); + } } } @@ -636,32 +803,11 @@ encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, } } - SBREC_CHASSIS_TABLE_FOR_EACH (chassis_rec, chassis_table) { - if (strcmp(chassis_rec->name, this_chassis->name)) { - /* Create tunnels to the other Chassis belonging to the - * same transport zone */ - if (!chassis_tzones_overlap(transport_zones, chassis_rec)) { - VLOG_DBG("Skipping encap creation for Chassis '%s' because " - "it belongs to different transport zones", - chassis_rec->name); - continue; - } - - if (smap_get_bool(&chassis_rec->other_config, "is-remote", false) - && !smap_get_bool(&this_chassis->other_config, "is-interconn", - false)) { - VLOG_DBG("Skipping encap creation for Chassis '%s' because " - "it is remote but this chassis is not interconn.", - chassis_rec->name); - continue; - } - - if (chassis_tunnel_add(chassis_rec, sbg, ovs_table, &tc, - this_chassis) == 0) { - VLOG_INFO("Creating encap for '%s' failed", chassis_rec->name); - continue; - } - } + /* Create OVN tunnels (mode-specific). */ + if (use_flow_based) { + create_flow_based_tunnels(&tc, sbg); + } else { + create_port_based_tunnels(&tc, chassis_table, sbg, transport_zones); } create_evpn_tunnels(&tc); diff --git a/controller/encaps.h b/controller/encaps.h index 3f4a82d683f2..fa5dc17e5fdb 100644 --- a/controller/encaps.h +++ b/controller/encaps.h @@ -18,6 +18,17 @@ #include <stdbool.h> +/* + * Given there could be multiple tunnels with different IPs to the same + * chassis we annotate the external_ids:ovn-chassis-id in tunnel port with + * <chassis_name>@<remote IP>%<local IP>. The external_id key + * "ovn-chassis-id" is kept for backward compatibility. + * + * For flow-based tunnels, we use the special value "flow" to identify + * shared tunnel ports that handle dynamic endpoint resolution. + */ +#define OVN_TUNNEL_ID "ovn-chassis-id" + struct ovsdb_idl; struct ovsdb_idl_txn; struct ovsrec_bridge; @@ -39,6 +50,10 @@ void encaps_run(struct ovsdb_idl_txn *ovs_idl_txn, const struct sset *transport_zones, const struct ovsrec_bridge_table *bridge_table); +bool is_flow_based_tunnels_enabled( + const struct ovsrec_open_vswitch_table *ovs_table, + const struct sbrec_chassis *chassis); + bool encaps_cleanup(struct ovsdb_idl_txn *ovs_idl_txn, const struct ovsrec_bridge *br_int); diff --git a/controller/local_data.c b/controller/local_data.c index a35d3fa5ae37..dda746d73ef4 100644 --- a/controller/local_data.c +++ b/controller/local_data.c @@ -51,6 +51,10 @@ static struct tracked_datapath *tracked_datapath_create( enum en_tracked_resource_type tracked_type, struct hmap *tracked_datapaths); +static void track_flow_based_tunnel( + const struct ovsrec_port *, const struct sbrec_chassis *, + struct flow_based_tunnel *flow_tunnels); + static bool datapath_is_switch(const struct sbrec_datapath_binding *); static bool datapath_is_transit_switch(const struct sbrec_datapath_binding *); @@ -454,7 +458,8 @@ void local_nonvif_data_run(const struct ovsrec_bridge *br_int, const struct sbrec_chassis *chassis_rec, struct simap *patch_ofports, - struct hmap *chassis_tunnels) + struct hmap *chassis_tunnels, + struct flow_based_tunnel *flow_tunnels) { for (int i = 0; i < br_int->n_ports; i++) { const struct ovsrec_port *port_rec = br_int->ports[i]; @@ -463,13 +468,15 @@ local_nonvif_data_run(const struct ovsrec_bridge *br_int, } const char *tunnel_id = smap_get(&port_rec->external_ids, - "ovn-chassis-id"); + OVN_TUNNEL_ID); if (tunnel_id && encaps_tunnel_id_match(tunnel_id, chassis_rec->name, NULL, NULL)) { continue; } + track_flow_based_tunnel(port_rec, chassis_rec, flow_tunnels); + const char *localnet = smap_get(&port_rec->external_ids, "ovn-localnet-port"); const char *l2gateway = smap_get(&port_rec->external_ids, @@ -757,3 +764,180 @@ lb_is_local(const struct sbrec_load_balancer *sbrec_lb, return false; } + +/* Flow-based tunnel management functions. */ + +void +flow_based_tunnels_init(struct flow_based_tunnel *flow_tunnels) +{ + for (size_t i = 0; i < TUNNEL_TYPE_MAX; i++) { + flow_tunnels[i] = (struct flow_based_tunnel) { + .ofport = 0, + .is_ipv6 = false, + .port_name = NULL, + }; + } +} + +void +flow_based_tunnels_destroy(struct flow_based_tunnel *flow_tunnels) +{ + for (size_t i = 0; i < TUNNEL_TYPE_MAX; i++) { + free(flow_tunnels[i].port_name); + flow_tunnels[i].port_name = NULL; + flow_tunnels[i].ofport = 0; + } +} + +ofp_port_t +get_flow_based_tunnel_port(enum chassis_tunnel_type type, + const struct flow_based_tunnel *flow_tunnels) +{ + if (type < 0 || type >= TUNNEL_TYPE_MAX) { + return 0; + } + return flow_tunnels[type].ofport; +} + +/* Direct tunnel endpoint selection utilities. */ + +enum chassis_tunnel_type +select_preferred_tunnel_type(const struct sbrec_chassis *local_chassis, + const struct sbrec_chassis *remote_chassis) +{ + /* Determine what tunnel types both chassis support */ + bool local_supports_geneve = false; + bool local_supports_vxlan = false; + bool remote_supports_geneve = false; + bool remote_supports_vxlan = false; + + for (size_t i = 0; i < local_chassis->n_encaps; i++) { + const char *type = local_chassis->encaps[i]->type; + if (!strcmp(type, "geneve")) { + local_supports_geneve = true; + } else if (!strcmp(type, "vxlan")) { + local_supports_vxlan = true; + } + } + + for (size_t i = 0; i < remote_chassis->n_encaps; i++) { + const char *type = remote_chassis->encaps[i]->type; + if (!strcmp(type, "geneve")) { + remote_supports_geneve = true; + } else if (!strcmp(type, "vxlan")) { + remote_supports_vxlan = true; + } + } + + /* Return preferred common tunnel type: geneve > vxlan */ + if (local_supports_geneve && remote_supports_geneve) { + return GENEVE; + } else if (local_supports_vxlan && remote_supports_vxlan) { + return VXLAN; + } else { + return TUNNEL_TYPE_INVALID; /* No common tunnel type */ + } +} + +const char * +select_default_encap_ip(const struct sbrec_chassis *chassis, + enum chassis_tunnel_type tunnel_type) +{ + const char *default_ip = NULL; + const char *tunnel_type_str = tunnel_type == GENEVE ? "geneve" : "vxlan"; + + for (size_t i = 0; i < chassis->n_encaps; i++) { + const struct sbrec_encap *encap = chassis->encaps[i]; + + if (strcmp(encap->type, tunnel_type_str)) { + continue; + } + + if (!default_ip) { + default_ip = encap->ip; + } + + if (smap_get_bool(&encap->options, "default-encap-ip", false)) { + default_ip = encap->ip; + break; /* Found explicit default */ + } + } + + return default_ip; +} + +const char * +select_port_encap_ip(const struct sbrec_port_binding *binding, + enum chassis_tunnel_type tunnel_type) +{ + const char *tunnel_type_str = tunnel_type == GENEVE ? "geneve" : "vxlan"; + + if (binding->encap && !strcmp(binding->encap->type, tunnel_type_str)) { + VLOG_DBG("Using port-specific encap IP %s for binding %s", + binding->encap->ip, binding->logical_port); + return binding->encap->ip; + } + + /* Fall back to chassis default encap IP */ + return select_default_encap_ip(binding->chassis, tunnel_type); +} + +static void +track_flow_based_tunnel(const struct ovsrec_port *port_rec, + const struct sbrec_chassis *chassis_rec, + struct flow_based_tunnel *flow_tunnels) +{ + if (port_rec->n_interfaces != 1) { + return; + } + + const struct ovsrec_interface *iface_rec = port_rec->interfaces[0]; + + /* Check if this is a flow-based tunnel port using unified + * OVN_TUNNEL_ID. */ + const char *tunnel_id = smap_get(&port_rec->external_ids, OVN_TUNNEL_ID); + const char *tunnel_type_str = smap_get(&port_rec->external_ids, + "ovn-tunnel-type"); + + if (!tunnel_id || !tunnel_type_str || strcmp(tunnel_id, "flow")) { + return; + } + + /* Get tunnel type. */ + enum chassis_tunnel_type tunnel_type = get_tunnel_type(tunnel_type_str); + if (tunnel_type == TUNNEL_TYPE_INVALID) { + return; + } + + /* Check if we already track this tunnel type. */ + if (flow_tunnels[tunnel_type].ofport != 0) { + return; + } + + int64_t ofport = iface_rec->ofport ? *iface_rec->ofport : 0; + if (ofport <= 0 || ofport > UINT16_MAX) { + VLOG_INFO("Invalid ofport %"PRId64" for flow-based tunnel %s", + ofport, port_rec->name); + return; + } + + /* Detect if this tunnel will use IPv6 by checking chassis encap IPs. */ + bool is_ipv6 = false; + for (size_t i = 0; i < chassis_rec->n_encaps; i++) { + const struct sbrec_encap *encap = chassis_rec->encaps[i]; + if (!strcmp(encap->type, tunnel_type_str) && + addr_is_ipv6(encap->ip)) { + is_ipv6 = true; + break; + } + } + + /* Store in array using tunnel_type as index. */ + flow_tunnels[tunnel_type].ofport = u16_to_ofp(ofport); + flow_tunnels[tunnel_type].is_ipv6 = is_ipv6; + flow_tunnels[tunnel_type].port_name = xstrdup(port_rec->name); + + VLOG_DBG("Tracking flow-based tunnel: port=%s, type=%s, ofport=%"PRId64 + ", is_ipv6=%s", port_rec->name, tunnel_type_str, ofport, + is_ipv6 ? "true" : "false"); +} diff --git a/controller/local_data.h b/controller/local_data.h index 841829f2e071..948c1a935e0c 100644 --- a/controller/local_data.h +++ b/controller/local_data.h @@ -28,6 +28,7 @@ struct sbrec_datapath_binding; struct sbrec_port_binding; struct sbrec_chassis; +struct sbrec_chassis_table; struct ovsdb_idl_index; struct ovsrec_bridge; struct ovsrec_interface_table; @@ -147,10 +148,21 @@ struct chassis_tunnel { bool is_ipv6; }; +/* Flow-based tunnel that consolidates multiple endpoints into a single + * port. Array is indexed by tunnel type (VXLAN=0, GENEVE=1). + * The type is implicit from the array index, so not stored. */ +struct flow_based_tunnel { + ofp_port_t ofport; /* Single port for all endpoints */ + bool is_ipv6; + char *port_name; /* e.g., "ovn-geneve" */ +}; + + void local_nonvif_data_run(const struct ovsrec_bridge *br_int, - const struct sbrec_chassis *, + const struct sbrec_chassis *chassis, struct simap *patch_ofports, - struct hmap *chassis_tunnels); + struct hmap *chassis_tunnels, + struct flow_based_tunnel *flow_tunnels); bool local_nonvif_data_handle_ovs_iface_changes( const struct ovsrec_interface_table *); @@ -165,6 +177,22 @@ bool get_chassis_tunnel_ofport(const struct hmap *chassis_tunnels, ofp_port_t *ofport); void chassis_tunnels_destroy(struct hmap *chassis_tunnels); + +/* Flow-based tunnel management functions. */ +void flow_based_tunnels_init(struct flow_based_tunnel *); +void flow_based_tunnels_destroy(struct flow_based_tunnel *); +ofp_port_t get_flow_based_tunnel_port( + enum chassis_tunnel_type, const struct flow_based_tunnel *); + +/* Direct tunnel endpoint selection utilities. */ +enum chassis_tunnel_type select_preferred_tunnel_type( + const struct sbrec_chassis *local_chassis, + const struct sbrec_chassis *remote_chassis); +const char *select_default_encap_ip(const struct sbrec_chassis *, + enum chassis_tunnel_type); +const char *select_port_encap_ip(const struct sbrec_port_binding *, + enum chassis_tunnel_type); + void local_datapath_memory_usage(struct simap *usage); void add_local_datapath_external_port(struct local_datapath *ld, char *logical_port, const void *data); diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml index 32a1f7a6f658..dfc7cc2179ea 100644 --- a/controller/ovn-controller.8.xml +++ b/controller/ovn-controller.8.xml @@ -219,6 +219,36 @@ <code>false</code> to clear the DF flag. </dd> + <dt><code>external_ids:ovn-enable-flow-based-tunnels</code></dt> + <dd> + <p> + The boolean flag indicates if <code>ovn-controller</code> should + use flow-based tunnels instead of port-based tunnels for overlay + network connectivity. Default value is <code>false</code>. + </p> + <p> + <em>Port-based tunnels</em> (default mode) create a dedicated tunnel + port for each combination of remote chassis and local encap IP, + while <em>Flow-based tunnels</em> create a single shared tunnel port + for each tunnel type, and use OpenFlow <code>set_field</code> actions + to dynamically set tunnel source and destination IP addresses per + packet. + </p> + <p> + This feature is experimental and is disabled by default. There are + some known limitations to the feature: + <ul> + <li> + IPSec is not supported. + </li> + <li> + BFD between tunnel endpoints is not supported, thus HA chassis + (e.g. for Distributed Gateway Port redundancy) is not supported. + </li> + </ul> + </p> + </dd> + <dt><code>external_ids:ovn-bridge-mappings</code></dt> <dd> A list of key-value pairs that map a physical network name to a local diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index 2a99891d0c45..e5cffc94e685 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -3461,6 +3461,10 @@ struct ed_type_non_vif_data { struct simap patch_ofports; /* simap of patch ovs ports. */ struct hmap chassis_tunnels; /* hmap of 'struct chassis_tunnel' from the * tunnel OVS ports. */ + struct flow_based_tunnel flow_tunnels[TUNNEL_TYPE_MAX]; + /* Array of flow-based tunnels indexed by + * tunnel type. */ + bool use_flow_based_tunnels; /* Enable flow-based tunnels. */ }; static void * @@ -3470,6 +3474,8 @@ en_non_vif_data_init(struct engine_node *node OVS_UNUSED, struct ed_type_non_vif_data *data = xzalloc(sizeof *data); simap_init(&data->patch_ofports); hmap_init(&data->chassis_tunnels); + flow_based_tunnels_init(data->flow_tunnels); + data->use_flow_based_tunnels = false; return data; } @@ -3479,6 +3485,7 @@ en_non_vif_data_cleanup(void *data OVS_UNUSED) struct ed_type_non_vif_data *ed_non_vif_data = data; simap_destroy(&ed_non_vif_data->patch_ofports); chassis_tunnels_destroy(&ed_non_vif_data->chassis_tunnels); + flow_based_tunnels_destroy(ed_non_vif_data->flow_tunnels); } static enum engine_node_state @@ -3487,8 +3494,11 @@ en_non_vif_data_run(struct engine_node *node, void *data) struct ed_type_non_vif_data *ed_non_vif_data = data; simap_destroy(&ed_non_vif_data->patch_ofports); chassis_tunnels_destroy(&ed_non_vif_data->chassis_tunnels); + flow_based_tunnels_destroy(ed_non_vif_data->flow_tunnels); + simap_init(&ed_non_vif_data->patch_ofports); hmap_init(&ed_non_vif_data->chassis_tunnels); + flow_based_tunnels_init(ed_non_vif_data->flow_tunnels); const struct ovsrec_open_vswitch_table *ovs_table = EN_OVSDB_GET(engine_get_input("OVS_open_vswitch", node)); @@ -3508,8 +3518,14 @@ en_non_vif_data_run(struct engine_node *node, void *data) = chassis_lookup_by_name(sbrec_chassis_by_name, chassis_id); ovs_assert(chassis); - local_nonvif_data_run(br_int, chassis, &ed_non_vif_data->patch_ofports, - &ed_non_vif_data->chassis_tunnels); + ed_non_vif_data->use_flow_based_tunnels = + is_flow_based_tunnels_enabled(ovs_table, chassis); + + local_nonvif_data_run(br_int, chassis, + &ed_non_vif_data->patch_ofports, + &ed_non_vif_data->chassis_tunnels, + ed_non_vif_data->flow_tunnels); + return EN_UPDATED; } @@ -4618,6 +4634,7 @@ static void init_physical_ctx(struct engine_node *node, parse_encap_ips(ovs_table, &p_ctx->n_encap_ips, &p_ctx->encap_ips); p_ctx->sbrec_port_binding_by_name = sbrec_port_binding_by_name; p_ctx->sbrec_port_binding_by_datapath = sbrec_port_binding_by_datapath; + p_ctx->sbrec_chassis_by_name = sbrec_chassis_by_name; p_ctx->port_binding_table = port_binding_table; p_ctx->ovs_interface_table = ovs_interface_table; p_ctx->mc_group_table = multicast_group_table; @@ -4631,6 +4648,8 @@ static void init_physical_ctx(struct engine_node *node, p_ctx->local_bindings = &rt_data->lbinding_data.bindings; p_ctx->patch_ofports = &non_vif_data->patch_ofports; p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels; + p_ctx->flow_tunnels = non_vif_data->flow_tunnels; + p_ctx->use_flow_based_tunnels = non_vif_data->use_flow_based_tunnels; p_ctx->always_tunnel = n_opts->always_tunnel; p_ctx->evpn_bindings = &eb_data->bindings; p_ctx->evpn_multicast_groups = &eb_data->multicast_groups; @@ -7649,7 +7668,8 @@ main(int argc, char *argv[]) ovsrec_interface_table_get(ovs_idl_loop.idl), br_int, &bfd_chassis_data->bfd_chassis, chassis, sbrec_sb_global_table_get( - ovnsb_idl_loop.idl) + ovnsb_idl_loop.idl), + ovs_table ); stopwatch_stop( BFD_RUN_STOPWATCH_NAME, time_msec()); diff --git a/controller/physical.c b/controller/physical.c index 2c314b7da303..697a6d7b48c3 100644 --- a/controller/physical.c +++ b/controller/physical.c @@ -51,6 +51,7 @@ #include "openvswitch/shash.h" #include "simap.h" #include "smap.h" +#include "socket-util.h" #include "sset.h" #include "util.h" #include "vswitch-idl.h" @@ -225,6 +226,178 @@ match_set_chassis_flood_outport(struct match *match, } } +/* Flow-based tunnel helper function to set tunnel source or destination IP. */ + +static void +put_set_tunnel_ip(const char *ip, bool is_src, struct ofpbuf *ofpacts) +{ + struct in6_addr ip_addr; + if (!ip46_parse(ip, &ip_addr)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); + VLOG_WARN_RL(&rl, "Invalid tunnel IP address: %s", ip); + return; + } + + if (IN6_IS_ADDR_V4MAPPED(&ip_addr)) { + /* IPv4 */ + ovs_be32 ipv4 = in6_addr_get_mapped_ipv4(&ip_addr); + put_load_bytes(&ipv4, sizeof ipv4, + is_src ? MFF_TUN_SRC : MFF_TUN_DST, + 0, 32, ofpacts); + } else { + /* IPv6 */ + put_load_bytes(&ip_addr, sizeof ip_addr, + is_src ? MFF_TUN_IPV6_SRC : MFF_TUN_IPV6_DST, + 0, 128, ofpacts); + } +} + +/* Flow-based encapsulation that sets tunnel metadata and endpoint IPs. */ +static void +put_flow_based_encapsulation(enum mf_field_id mff_ovn_geneve, + enum chassis_tunnel_type tunnel_type, + const char *local_ip, const char *remote_ip, + const struct sbrec_datapath_binding *datapath, + uint16_t outport, bool is_ramp_switch, + struct ofpbuf *ofpacts) +{ + struct chassis_tunnel temp_tun = { + .type = tunnel_type, + }; + put_encapsulation(mff_ovn_geneve, &temp_tun, datapath, + outport, is_ramp_switch, ofpacts); + + /* Set tunnel source and destination IPs (flow-based specific) */ + put_set_tunnel_ip(local_ip, true, ofpacts); + put_set_tunnel_ip(remote_ip, false, ofpacts); +} + + +/* Generate flows for flow-based tunnel to a specific chassis. */ +static void +put_flow_based_remote_port_redirect_overlay( + const struct sbrec_port_binding *binding, + const enum en_lport_type type, + const struct physical_ctx *ctx, + uint32_t port_key, + struct match *match, + struct ofpbuf *ofpacts_p, + struct ovn_desired_flow_table *flow_table) +{ + if (!binding->chassis || binding->chassis == ctx->chassis) { + return; + } + + enum chassis_tunnel_type tunnel_type = + select_preferred_tunnel_type(ctx->chassis, binding->chassis); + if (tunnel_type == TUNNEL_TYPE_INVALID) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); + VLOG_WARN_RL(&rl, "No common tunnel type with chassis %s", + binding->chassis->name); + return; + } + + const char *tunnel_type_str = tunnel_type == GENEVE ? "geneve" : "vxlan"; + const char *remote_ip = select_port_encap_ip(binding, tunnel_type); + if (!remote_ip) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); + VLOG_WARN_RL(&rl, "No compatible encap IP for port %s on chassis %s " + "with type %s", binding->logical_port, + binding->chassis->name, tunnel_type_str); + return; + } + + ofp_port_t flow_port = get_flow_based_tunnel_port(tunnel_type, + ctx->flow_tunnels); + if (flow_port == 0) { + VLOG_DBG("No flow-based tunnel port found for type %s", + tunnel_type_str); + return; + } + + VLOG_DBG("Using flow-based tunnel: chassis=%s, tunnel_type=%s, " + "remote_ip=%s, flow_port=%d", binding->chassis->name, + tunnel_type_str, remote_ip, flow_port); + + /* Generate flows for each local encap IP. */ + for (size_t i = 0; i < ctx->n_encap_ips; i++) { + const char *local_encap_ip = ctx->encap_ips[i]; + + struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); + + /* Set encap ID for this local IP. */ + match_set_reg_masked(match, MFF_LOG_ENCAP_ID - MFF_REG0, i << 16, + (uint32_t) 0xFFFF << 16); + + bool is_vtep_port = type == LP_VTEP; + if (is_vtep_port) { + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_clone); + } + + /* Set flow-based tunnel encapsulation. */ + put_flow_based_encapsulation(ctx->mff_ovn_geneve, tunnel_type, + local_encap_ip, remote_ip, + binding->datapath, port_key, + is_vtep_port, ofpacts_clone); + + ofpact_put_OUTPUT(ofpacts_clone)->port = flow_port; + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts_clone); + + ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100, + binding->header_.uuid.parts[0], match, + ofpacts_clone, &binding->header_.uuid); + + ofpbuf_delete(ofpacts_clone); + } +} + +static void +add_tunnel_ingress_flows(const struct chassis_tunnel *tun, + enum mf_field_id mff_ovn_geneve, + struct ovn_desired_flow_table *flow_table, + struct ofpbuf *ofpacts) +{ + /* Main ingress flow (priority 100) */ + struct match match = MATCH_CATCHALL_INITIALIZER; + match_set_in_port(&match, tun->ofport); + + ofpbuf_clear(ofpacts); + put_decapsulation(mff_ovn_geneve, tun, ofpacts); + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts); + + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match, + ofpacts, hc_uuid); + + /* Set allow rx from tunnel bit */ + put_load(1, MFF_LOG_FLAGS, MLF_RX_FROM_TUNNEL_BIT, 1, ofpacts); + put_resubmit(OFTABLE_CT_ZONE_LOOKUP, ofpacts); + + /* Add specific flows for E/W ICMPv{4,6} packets if tunnelled packets + * do not fit path MTU. */ + + /* IPv4 ICMP flow (priority 120) */ + match_init_catchall(&match); + match_set_in_port(&match, tun->ofport); + match_set_dl_type(&match, htons(ETH_TYPE_IP)); + match_set_nw_proto(&match, IPPROTO_ICMP); + match_set_icmp_type(&match, 3); + match_set_icmp_code(&match, 4); + + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, + ofpacts, hc_uuid); + + /* IPv6 ICMP flow (priority 120) */ + match_init_catchall(&match); + match_set_in_port(&match, tun->ofport); + match_set_dl_type(&match, htons(ETH_TYPE_IPV6)); + match_set_nw_proto(&match, IPPROTO_ICMPV6); + match_set_icmp_type(&match, 2); + match_set_icmp_code(&match, 0); + + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, + ofpacts, hc_uuid); +} + static void put_stack(enum mf_field_id field, struct ofpact_stack *stack) { @@ -425,17 +598,18 @@ get_remote_tunnels(const struct sbrec_port_binding *binding, return tunnels; } +/* Generate flows for port-based tunnel to a specific chassis. */ static void -put_remote_port_redirect_overlay(const struct sbrec_port_binding *binding, - const enum en_lport_type type, - const struct physical_ctx *ctx, - uint32_t port_key, - struct match *match, - struct ofpbuf *ofpacts_p, - struct ovn_desired_flow_table *flow_table, - bool allow_hairpin) +put_port_based_remote_port_redirect_overlay( + const struct sbrec_port_binding *binding, + const enum en_lport_type type, + const struct physical_ctx *ctx, + uint32_t port_key, + struct match *match, + struct ofpbuf *ofpacts_p, + struct ovn_desired_flow_table *flow_table, + bool allow_hairpin) { - /* Setup encapsulation */ for (size_t i = 0; i < ctx->n_encap_ips; i++) { const char *encap_ip = ctx->encap_ips[i]; struct ofpbuf *ofpacts_clone = ofpbuf_clone(ofpacts_p); @@ -481,6 +655,28 @@ put_remote_port_redirect_overlay(const struct sbrec_port_binding *binding, } } +static void +put_remote_port_redirect_overlay(const struct sbrec_port_binding *binding, + const enum en_lport_type type, + const struct physical_ctx *ctx, + uint32_t port_key, + struct match *match, + struct ofpbuf *ofpacts_p, + struct ovn_desired_flow_table *flow_table, + bool allow_hairpin) +{ + if (ctx->use_flow_based_tunnels) { + put_flow_based_remote_port_redirect_overlay(binding, type, ctx, + port_key, match, + ofpacts_p, flow_table); + } else { + put_port_based_remote_port_redirect_overlay(binding, type, ctx, + port_key, match, + ofpacts_p, flow_table, + allow_hairpin); + } +} + static const struct sbrec_port_binding * get_binding_network_function_linked_port( struct ovsdb_idl_index *sbrec_port_binding_by_name, @@ -1903,6 +2099,7 @@ handle_pkt_too_big(struct ovn_desired_flow_table *flow_table, handle_pkt_too_big_for_ip_version(flow_table, binding, mcp, mtu, true); } +/* XXX: Need to support flow-based tunnel for this function. */ static void enforce_tunneling_for_multichassis_ports( struct local_datapath *ld, @@ -2516,14 +2713,101 @@ tunnel_to_chassis(enum mf_field_id mff_ovn_geneve, ofpact_put_OUTPUT(remote_ofpacts)->port = tun->ofport; } -/* Encapsulate and send to a set of remote chassis. */ +/* Flow-based tunnel version of fanout_to_chassis for multicast/broadcast. */ static void -fanout_to_chassis(enum mf_field_id mff_ovn_geneve, - struct sset *remote_chassis, - const struct hmap *chassis_tunnels, - const struct sbrec_datapath_binding *datapath, - uint16_t outport, bool is_ramp_switch, - struct ofpbuf *remote_ofpacts) +fanout_to_chassis_flow_based(const struct physical_ctx *ctx, + struct sset *remote_chassis, + const struct sbrec_datapath_binding *datapath, + uint16_t outport, bool is_ramp_switch, + struct ofpbuf *remote_ofpacts) +{ + VLOG_DBG("fanout_to_chassis_flow_based called with %"PRIuSIZE + " remote chassis", sset_count(remote_chassis)); + + if (!ctx->flow_tunnels) { + VLOG_DBG("fanout_to_chassis_flow_based: Missing flow_tunnels"); + return; + } + + if (!remote_chassis || sset_is_empty(remote_chassis)) { + VLOG_DBG("fanout_to_chassis_flow_based: No remote chassis " + "to send to"); + return; + } + + const char *local_encap_ip = NULL; + if (ctx->n_encap_ips <= 0) { + return; + } + local_encap_ip = ctx->encap_ips[0]; /* Use first/default local IP */ + + const char *chassis_name; + enum chassis_tunnel_type prev_type = TUNNEL_TYPE_INVALID; + + SSET_FOR_EACH (chassis_name, remote_chassis) { + const struct sbrec_chassis *remote_chassis_rec = + chassis_lookup_by_name(ctx->sbrec_chassis_by_name, chassis_name); + if (!remote_chassis_rec) { + VLOG_DBG("Chassis %s not found in SB", chassis_name); + continue; + } + + enum chassis_tunnel_type tunnel_type = + select_preferred_tunnel_type(ctx->chassis, remote_chassis_rec); + if (tunnel_type == TUNNEL_TYPE_INVALID) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); + VLOG_WARN_RL(&rl, "No common tunnel type with chassis %s", + chassis_name); + continue; + } + const char *tunnel_type_str = tunnel_type == GENEVE ? "geneve" + : "vxlan"; + + ofp_port_t flow_port = get_flow_based_tunnel_port(tunnel_type, + ctx->flow_tunnels); + if (flow_port == 0) { + VLOG_DBG("No flow-based tunnel port found for type %s", + tunnel_type_str); + continue; + } + + const char *remote_ip = select_default_encap_ip(remote_chassis_rec, + tunnel_type); + if (!remote_ip) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1); + VLOG_WARN_RL(&rl, "No compatible encap IP for chassis %s with type %s", + chassis_name, tunnel_type_str); + continue; + } + + /* Add encapsulation if tunnel type changed or this is the first + * chassis. */ + if (tunnel_type != prev_type) { + struct chassis_tunnel temp_tun = { + .chassis_id = CONST_CAST(char *, chassis_name), + .ofport = flow_port, + .type = tunnel_type + }; + put_encapsulation(ctx->mff_ovn_geneve, &temp_tun, datapath, + outport, is_ramp_switch, remote_ofpacts); + prev_type = tunnel_type; + } + + /* Set tunnel source and destination IPs for flow-based tunnels. */ + put_set_tunnel_ip(local_encap_ip, true, remote_ofpacts); + put_set_tunnel_ip(remote_ip, false, remote_ofpacts); + ofpact_put_OUTPUT(remote_ofpacts)->port = flow_port; + } +} + +/* Encapsulate and send to a set of remote chassis (port-based tunnels). */ +static void +fanout_to_chassis_port_based(enum mf_field_id mff_ovn_geneve, + struct sset *remote_chassis, + const struct hmap *chassis_tunnels, + const struct sbrec_datapath_binding *datapath, + uint16_t outport, bool is_ramp_switch, + struct ofpbuf *remote_ofpacts) { const char *chassis_name; const struct chassis_tunnel *prev = NULL; @@ -2757,12 +3041,29 @@ consider_mc_group(const struct physical_ctx *ctx, if (remote_ports) { put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &remote_ctx->ofpacts); } - fanout_to_chassis(ctx->mff_ovn_geneve, &remote_chassis, - ctx->chassis_tunnels, mc->datapath, mc->tunnel_key, - false, &remote_ctx->ofpacts); - fanout_to_chassis(ctx->mff_ovn_geneve, &vtep_chassis, - ctx->chassis_tunnels, mc->datapath, mc->tunnel_key, - true, &remote_ctx->ofpacts); + if (ctx->use_flow_based_tunnels) { + VLOG_DBG("Using flow-based tunnels for multicast group %s " + "(tunnel_key=%"PRId64") with %"PRIuSIZE" remote chassis", + mc->name, mc->tunnel_key, sset_count(&remote_chassis)); + fanout_to_chassis_flow_based(ctx, &remote_chassis, + mc->datapath, mc->tunnel_key, + false, &remote_ctx->ofpacts); + fanout_to_chassis_flow_based(ctx, &vtep_chassis, + mc->datapath, mc->tunnel_key, + true, &remote_ctx->ofpacts); + } else { + VLOG_DBG("Using port-based tunnels for multicast group %s " + "(tunnel_key=%"PRId64") with %"PRIuSIZE" remote chassis", + mc->name, mc->tunnel_key, sset_count(&remote_chassis)); + fanout_to_chassis_port_based(ctx->mff_ovn_geneve, &remote_chassis, + ctx->chassis_tunnels, mc->datapath, + mc->tunnel_key, false, + &remote_ctx->ofpacts); + fanout_to_chassis_port_based(ctx->mff_ovn_geneve, &vtep_chassis, + ctx->chassis_tunnels, mc->datapath, + mc->tunnel_key, true, + &remote_ctx->ofpacts); + } remote_ports = remote_ctx->ofpacts.size > 0; if (remote_ports) { @@ -3408,44 +3709,33 @@ physical_run(struct physical_ctx *p_ctx, * packets to the local hypervisor. */ struct chassis_tunnel *tun; HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) { - struct match match = MATCH_CATCHALL_INITIALIZER; - match_set_in_port(&match, tun->ofport); - - ofpbuf_clear(&ofpacts); - put_decapsulation(p_ctx->mff_ovn_geneve, tun, &ofpacts); + add_tunnel_ingress_flows(tun, p_ctx->mff_ovn_geneve, flow_table, + &ofpacts); + } - put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts); - ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match, - &ofpacts, hc_uuid); + /* Process packets that arrive from flow-based tunnels. */ + if (p_ctx->use_flow_based_tunnels && p_ctx->flow_tunnels) { + for (size_t i = 0; i < TUNNEL_TYPE_MAX; i++) { + if (p_ctx->flow_tunnels[i].ofport == 0) { + continue; /* Tunnel not configured */ + } - /* Set allow rx from tunnel bit. */ - put_load(1, MFF_LOG_FLAGS, MLF_RX_FROM_TUNNEL_BIT, 1, &ofpacts); + /* Flow-based tunnels use the same ingress flow logic as + * port-based. Create a temporary chassis_tunnel structure + * for compatibility. */ + struct chassis_tunnel temp_tunnel = { + .type = i, /* Array index is the tunnel type */ + .ofport = p_ctx->flow_tunnels[i].ofport, + .chassis_id = NULL /* Not needed for decapsulation */ + }; - /* Add specif flows for E/W ICMPv{4,6} packets if tunnelled packets - * do not fit path MTU. - */ - put_resubmit(OFTABLE_CT_ZONE_LOOKUP, &ofpacts); + VLOG_DBG("Adding flow-based tunnel ingress flow: in_port=%d, " + "type=%s", p_ctx->flow_tunnels[i].ofport, + i == GENEVE ? "geneve" : "vxlan"); - /* IPv4 */ - match_init_catchall(&match); - match_set_in_port(&match, tun->ofport); - match_set_dl_type(&match, htons(ETH_TYPE_IP)); - match_set_nw_proto(&match, IPPROTO_ICMP); - match_set_icmp_type(&match, 3); - match_set_icmp_code(&match, 4); - - ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, - &ofpacts, hc_uuid); - /* IPv6 */ - match_init_catchall(&match); - match_set_in_port(&match, tun->ofport); - match_set_dl_type(&match, htons(ETH_TYPE_IPV6)); - match_set_nw_proto(&match, IPPROTO_ICMPV6); - match_set_icmp_type(&match, 2); - match_set_icmp_code(&match, 0); - - ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, - &ofpacts, hc_uuid); + add_tunnel_ingress_flows(&temp_tunnel, p_ctx->mff_ovn_geneve, + flow_table, &ofpacts); + } } /* Add VXLAN specific rules to transform port keys diff --git a/controller/physical.h b/controller/physical.h index 0dc544823ad5..c7a33bd028a0 100644 --- a/controller/physical.h +++ b/controller/physical.h @@ -51,6 +51,7 @@ struct physical_debug { struct physical_ctx { struct ovsdb_idl_index *sbrec_port_binding_by_name; struct ovsdb_idl_index *sbrec_port_binding_by_datapath; + struct ovsdb_idl_index *sbrec_chassis_by_name; const struct sbrec_port_binding_table *port_binding_table; const struct ovsrec_interface_table *ovs_interface_table; const struct sbrec_multicast_group_table *mc_group_table; @@ -65,6 +66,8 @@ struct physical_ctx { struct shash *local_bindings; struct simap *patch_ofports; struct hmap *chassis_tunnels; + bool use_flow_based_tunnels; + struct flow_based_tunnel *flow_tunnels; size_t n_encap_ips; const char **encap_ips; struct physical_debug debug; diff --git a/lib/ovn-util.h b/lib/ovn-util.h index 1c9b62ebb682..daff019953d0 100644 --- a/lib/ovn-util.h +++ b/lib/ovn-util.h @@ -350,7 +350,8 @@ hash_add_in6_addr(uint32_t hash, const struct in6_addr *addr) enum chassis_tunnel_type { TUNNEL_TYPE_INVALID = -1, VXLAN = 0, - GENEVE = 1 + GENEVE = 1, + TUNNEL_TYPE_MAX = 2 /* Number of valid tunnel types */ }; enum chassis_tunnel_type get_tunnel_type(const char *name); diff --git a/tests/ovn-macros.at b/tests/ovn-macros.at index 214cb9b4bbd5..47cd27b9f8d2 100644 --- a/tests/ovn-macros.at +++ b/tests/ovn-macros.at @@ -109,7 +109,12 @@ m4_divert_text([PREPARE_TESTS], echo "$3: waiting for flows for remote output on $hv1" # Wait for a flow outputing to remote output OVS_WAIT_UNTIL([ - ofport=$(as $hv1 ovs-vsctl --bare --columns ofport find Interface name=ovn-${hv2}-0) + if test X$OVN_ENABLE_FLOW_BASED_TUNNELS = Xyes; then + tun_name=ovn-geneve + else + tun_name=ovn-${hv2}-0 + fi + ofport=$(as $hv1 ovs-vsctl --bare --columns ofport find Interface name=$tun_name) echo "tunnel port=$ofport" test -n "$ofport" && test 1 -le $(as $hv1 ovs-ofctl dump-flows br-int | grep -c "output:$ofport") ]) @@ -121,7 +126,12 @@ m4_divert_text([PREPARE_TESTS], echo "$3: waiting for flows for remote input on $hv1" # Wait for a flow outputing to remote input OVS_WAIT_UNTIL([ - ofport=$(as $hv1 ovs-vsctl --bare --columns ofport find Interface name=ovn-${hv2}-0) + if test X$OVN_ENABLE_FLOW_BASED_TUNNELS = Xyes; then + tun_name=ovn-geneve + else + tun_name=ovn-${hv2}-0 + fi + ofport=$(as $hv1 ovs-vsctl --bare --columns ofport find Interface name=$tun_name) echo "tunnel port=$ofport" test -n "$ofport" && test 1 -le $(as $hv1 ovs-ofctl dump-flows br-int | grep -c "in_port=$ofport") ]) @@ -780,6 +790,13 @@ ovn_az_attach() { ovs-vsctl set open . external_ids:ovn-monitor-all=true fi + # Configure flow-based tunnels if the test variable is set + if test X$OVN_ENABLE_FLOW_BASED_TUNNELS = Xyes; then + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=true + elif test X$OVN_ENABLE_FLOW_BASED_TUNNELS = Xno; then + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=false + fi + start_daemon ovn-controller --enable-dummy-vif-plug ${cli_args} || return 1 if test X"$az" = XNONE; then ovn_wait_for_encaps $expected_encap_id @@ -1459,6 +1476,16 @@ m4_define([OVN_FOR_EACH_NORTHD_NO_HV_PARALLELIZATION], [m4_foreach([NORTHD_USE_PARALLELIZATION], [yes, no], [$1 ])]) +# Defines versions of the test with all combinations of northd, +# parallelization enabled, conditional monitoring on/off, and flow-based +# tunnels on/off. Use this for tests that need to verify behavior with both +# port-based and flow-based tunnel implementations. +m4_define([OVN_FOR_EACH_NORTHD_FLOW_TUNNEL], + [m4_foreach([NORTHD_USE_PARALLELIZATION], [yes], + [m4_foreach([OVN_MONITOR_ALL], [yes, no], + [m4_foreach([OVN_ENABLE_FLOW_BASED_TUNNELS], [yes, no], [$1 +])])])]) + # OVN_NBCTL(NBCTL_COMMAND) adds NBCTL_COMMAND to list of commands to be run by RUN_OVN_NBCTL(). m4_define([OVN_NBCTL], [ command="${command} -- $1" diff --git a/tests/ovn.at b/tests/ovn.at index 6ea0d1de5092..e1c1a5532cd8 100644 --- a/tests/ovn.at +++ b/tests/ovn.at @@ -2508,7 +2508,7 @@ AT_CLEANUP ]) # 3 hypervisors, one logical switch, 3 logical ports per hypervisor -OVN_FOR_EACH_NORTHD([ +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ AT_SETUP([3 HVs, 1 LS, 3 lports/HV]) AT_KEYWORDS([ovnarp]) AT_KEYWORDS([slowtest]) @@ -2785,7 +2785,7 @@ AT_CLEANUP # 2 hypervisors, one logical switch, 2 logical ports per hypervisor # logical ports bound to chassis encap-ip. -OVN_FOR_EACH_NORTHD([ +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ AT_SETUP([2 HVs, 1 LS, 2 lports/HV]) AT_KEYWORDS([ovnarp]) ovn_start @@ -3188,7 +3188,7 @@ AT_CLEANUP # 2 hypervisors, 4 logical ports per HV # 2 locally attached networks (one flat, one vlan tagged over same device) # 2 ports per HV on each network -OVN_FOR_EACH_NORTHD([ +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ AT_SETUP([2 HVs, 4 lports/HV, localnet ports]) ovn_start @@ -3522,7 +3522,7 @@ OVN_CLEANUP([hv-1 AT_CLEANUP ]) -OVN_FOR_EACH_NORTHD([ +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ AT_SETUP([2 HVs, 2 LS, broadcast traffic with multiple localnet ports per switch]) ovn_start @@ -4756,7 +4756,7 @@ AT_CLEANUP ]) # Similar test to "hardware GW" -OVN_FOR_EACH_NORTHD([ +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ AT_SETUP([3 HVs, 1 VIFs/HV, 1 software GW, 1 LS]) ovn_start @@ -4915,7 +4915,7 @@ AT_CLEANUP ]) # 3 hypervisors, 3 logical switches with 3 logical ports each, 1 logical router -OVN_FOR_EACH_NORTHD([ +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ AT_SETUP([3 HVs, 3 LS, 3 lports/LS, 1 LR]) AT_KEYWORDS([slowtest]) AT_SKIP_IF([test $HAVE_SCAPY = no]) @@ -5352,7 +5352,7 @@ OVN_CLEANUP([hv1], [hv2], [hv3]) AT_CLEANUP ]) -OVN_FOR_EACH_NORTHD([ +OVN_FOR_EACH_NORTHD_FLOW_TUNNEL([ AT_SETUP([IP relocation using GARP request]) AT_SKIP_IF([test $HAVE_SCAPY = no]) ovn_start @@ -31364,6 +31364,7 @@ for i in 1 2; do ovs-vsctl add-br br-phys-$j ovn_attach n1 br-phys-$j 192.168.0.${i}1 ovs-vsctl set open . external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=false for j in 1 2; do ovs-vsctl add-port br-int vif$i$j -- set Interface vif$i$j \ @@ -31417,6 +31418,7 @@ vif_to_hv() { # tunnel that matches the VIFs' encap_ip configurations. check_packet_tunnel() { local src=$1 dst=$2 + local flow_based_tunnel=$3 local src_mac=f0:00:00:00:00:$src local dst_mac=f0:00:00:00:00:$dst local src_ip=10.0.0.$src @@ -31429,8 +31431,17 @@ check_packet_tunnel() { hv=`vif_to_hv vif$src` as $hv echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> $remote_encap_ip" - tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) + if test x$flow_based_tunnel == xtrue; then + tunnel_ofport=$(ovs-vsctl --bare --column=ofport list interface ovn-geneve) + else + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) + fi AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet | grep "output:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) + if test x$flow_based_tunnel == xtrue; then + trace_output=$(ovs-appctl ofproto/trace br-int in_port=vif$src $packet) + AT_CHECK([echo "$trace_output" | grep -q "set_field:$local_encap_ip->tun_src"]) + AT_CHECK([echo "$trace_output" | grep -q "set_field:$remote_encap_ip->tun_dst"]) + fi } for i in 1 2; do @@ -31439,6 +31450,20 @@ for i in 1 2; do done done +# Test flow-based tunnels +for i in 1 2; do + as hv$i + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=true +done +check ovn-nbctl --wait=hv sync + +for i in 1 2; do + for j in 1 2; do + check_packet_tunnel 1$i 2$j true + done +done + + OVN_CLEANUP([hv1],[hv2]) AT_CLEANUP ]) @@ -31460,6 +31485,7 @@ for i in 1 2; do ovs-vsctl add-br br-phys-$j ovn_attach n1 br-phys-$j 192.168.0.${i}1 ovs-vsctl set open . external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=false check ovn-nbctl ls-add ls$i -- \ lrp-add lr lrp-ls$i f0:00:00:00:88:0$i 10.0.$i.88/24 -- \ @@ -31496,38 +31522,37 @@ vif_to_ip() { # tunnel that matches the VIFs' encap_ip configurations. check_packet_tunnel() { local src=$1 dst=$2 + local local_encap_ip=$3 + local remote_encap_ip=$4 + local flow_based_tunnel=$5 local src_mac=f0:00:00:00:00:$src local dst_mac=f0:00:00:00:88:01 # lrp-ls1's MAC local src_ip=$(vif_to_ip vif$src) local dst_ip=$(vif_to_ip vif$dst) - local local_encap_ip - if test -n "$3"; then - local_encap_ip=$3 - else - local_encap_ip=192.168.0.$src - fi - - local remote_encap_ip - if test -n "$4"; then - remote_encap_ip=$4 - else - remote_encap_ip=192.168.0.$dst - fi - local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ IP(dst='${dst_ip}', src='${src_ip}')/ \ ICMP(type=8)") hv=`vif_to_hv vif$src` as $hv echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> $remote_encap_ip" - tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) + + if test x$flow_based_tunnel == xtrue; then + tunnel_ofport=$(ovs-vsctl --bare --column=ofport list interface ovn-geneve) + else + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) + fi AT_CHECK([test $(ovs-appctl ofproto/trace br-int in_port=vif$src $packet | grep "output:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) + if test x$flow_based_tunnel == xtrue; then + trace_output=$(ovs-appctl ofproto/trace br-int in_port=vif$src $packet) + AT_CHECK([echo "$trace_output" | grep -q "set_field:$local_encap_ip->tun_src"]) + AT_CHECK([echo "$trace_output" | grep -q "set_field:$remote_encap_ip->tun_dst"]) + fi } for i in 1 2; do for j in 1 2; do - check_packet_tunnel 1$i 2$j + check_packet_tunnel 1$i 2$j 192.168.0.1$i 192.168.0.2$j done done @@ -31551,6 +31576,36 @@ for i in 1 2; do done done +# Test flow-based tunnels +for i in 1 2; do + as hv$i + check ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=true + for j in 1 2; do + check ovs-vsctl set Interface vif$i$j external_ids:encap-ip=192.168.0.$i$j + done +done +check ovn-nbctl --wait=hv sync + +for i in 1 2; do + for j in 1 2; do + check_packet_tunnel 1$i 2$j 192.168.0.1$i 192.168.0.2$j true + done +done + +for i in 1 2; do + as hv$i + for j in 1 2; do + check ovs-vsctl remove Interface vif$i$j external_ids encap-ip + done +done +check ovn-nbctl --wait=hv sync + +for i in 1 2; do + for j in 1 2; do + check_packet_tunnel 1$i 2$j 192.168.0.12 192.168.0.22 true + done +done + OVN_CLEANUP([hv1],[hv2]) AT_CLEANUP ]) @@ -31578,6 +31633,7 @@ for i in 1 2; do ovs-vsctl add-br br-phys-$j ovn_attach n1 br-phys-$j 192.168.0.${i}1 ovs-vsctl set open . external_ids:ovn-encap-ip=192.168.0.${i}1,192.168.0.${i}2 + ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=false ovs-vsctl add-br br-ext ovs-vsctl add-port br-ext vif$i -- set Interface vif$i \ @@ -31623,14 +31679,26 @@ check_packet_tunnel() { local local_encap_ip=$3 local remote_encap_ip=$4 + local flow_based_tunnel=$5 + local packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ IP(dst='${dst_ip}', src='${src_ip}')/ \ ICMP(type=8)") hv=`vif_to_hv vif$src` as $hv echo "vif$src -> vif$dst should go through tunnel $local_encap_ip -> $remote_encap_ip" - tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) + if test x$flow_based_tunnel == xtrue; then + tunnel_ofport=$(ovs-vsctl --bare --column=ofport list interface ovn-geneve) + else + tunnel_ofport=$(ovs-vsctl --bare --column=ofport find interface options:local_ip=$local_encap_ip options:remote_ip=$remote_encap_ip) + fi + ovs-appctl ofproto/trace br-ext in_port=vif$src $packet AT_CHECK([test $(ovs-appctl ofproto/trace br-ext in_port=vif$src $packet | grep "output:" | awk -F ':' '{ print $2 }') == $tunnel_ofport]) + if test x$flow_based_tunnel == xtrue; then + trace_output=$(ovs-appctl ofproto/trace br-ext in_port=vif$src $packet) + AT_CHECK([echo "$trace_output" | grep -q "set_field:$local_encap_ip->tun_src"]) + AT_CHECK([echo "$trace_output" | grep -q "set_field:$remote_encap_ip->tun_dst"]) + fi } # Create mac-binding for the destination so that there is no need to trigger @@ -31652,6 +31720,23 @@ for e in 1 2; do check_packet_tunnel 1 2 192.168.0.1${e} 192.168.0.2${e} done +# Test flow-based tunnels +for i in 1 2; do + as hv$i + check ovs-vsctl set open . external_ids:ovn-enable-flow-based-tunnels=true +done +check ovn-nbctl --wait=hv sync + +for e in 1 2; do + for i in 1 2; do + as hv$i + check ovs-vsctl set open . external_ids:ovn-encap-ip-default=192.168.0.${i}${e} + done + check ovn-nbctl --wait=hv sync + + check_packet_tunnel 1 2 192.168.0.1${e} 192.168.0.2${e} true +done + OVN_CLEANUP([hv1],[hv2]) AT_CLEANUP ]) diff --git a/tests/ovs-macros.at b/tests/ovs-macros.at index 6a3dc51dc392..6e6e5208028e 100644 --- a/tests/ovs-macros.at +++ b/tests/ovs-macros.at @@ -9,13 +9,15 @@ dnl - If NORTHD_USE_DP_GROUPS is defined, then append it to the test name and dnl set it as a shell variable as well. m4_rename([AT_SETUP], [OVS_AT_SETUP]) m4_define([AT_SETUP], - [OVS_AT_SETUP($@[]m4_ifdef([NORTHD_USE_PARALLELIZATION], [ -- parallelization=NORTHD_USE_PARALLELIZATION])[]m4_ifdef([OVN_MONITOR_ALL], [ -- ovn_monitor_all=OVN_MONITOR_ALL])) + [OVS_AT_SETUP($@[]m4_ifdef([NORTHD_USE_PARALLELIZATION], [ -- parallelization=NORTHD_USE_PARALLELIZATION])[]m4_ifdef([OVN_MONITOR_ALL], [ -- ovn_monitor_all=OVN_MONITOR_ALL])[]m4_ifdef([OVN_ENABLE_FLOW_BASED_TUNNELS], [ -- flow_based_tunnels=OVN_ENABLE_FLOW_BASED_TUNNELS])) m4_ifdef([NORTHD_USE_PARALLELIZATION], [[NORTHD_USE_PARALLELIZATION]=NORTHD_USE_PARALLELIZATION ])dnl m4_ifdef([NORTHD_DUMMY_NUMA], [[NORTHD_DUMMY_NUMA]=NORTHD_DUMMY_NUMA ])dnl m4_ifdef([OVN_MONITOR_ALL], [[OVN_MONITOR_ALL]=OVN_MONITOR_ALL ])dnl +m4_ifdef([OVN_ENABLE_FLOW_BASED_TUNNELS], [[OVN_ENABLE_FLOW_BASED_TUNNELS]=OVN_ENABLE_FLOW_BASED_TUNNELS +])dnl ovs_init ]) -- 2.38.1 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
