On 1/29/25 12:15 PM, Felix Huettner via dev wrote: > northd allows us to announce host routes instead of connected routes of > LRs. > > For the host routes we also know the LSP that host the address. > We can then check if this LSP is pointing to a LRP that is also local to > the current chassis, so if a LR is chained behind an LR and both use > chassisredirect ports. > In this case we can announce the route with a more preferable priority > than otherwise. > > This helps in the following case: > * the backend router is bound on only a single chassis > * the frontend router is bound to multiple chassis (with multiple LRPs) > * one of the chassis of the frontend router matches the backend router > > In this case it would be preferable if the network fabric sends the > traffic to the chassis that hosts both the frontend and backend router. > Other chassis would work as well, but then OVN would redirect the > traffic from one chassis to the other. > So this allows us to skip tunneling traffic in one case. > > Signed-off-by: Felix Huettner <felix.huettner@stackit.cloud> > ---
Hi Felix, With the minor comments I left below fixed, feel free to add my ack to v6: Acked-by: Dumitru Ceara <dce...@redhat.com> > v3->v4: > - addressed review comments. > > controller/lport.c | 44 ++++++++++++++--- > controller/lport.h | 4 ++ > controller/ovn-controller.c | 56 ++++++++++++++++++++- > controller/route-exchange-netlink.c | 20 +++++--- > controller/route-exchange-netlink.h | 4 +- > controller/route.c | 33 ++++++++++--- > controller/route.h | 13 +++++ > tests/system-ovn.at | 76 +++++++++++++++++++++++------ > 8 files changed, 212 insertions(+), 38 deletions(-) > > diff --git a/controller/lport.c b/controller/lport.c > index b3721024b..addec7ac3 100644 > --- a/controller/lport.c > +++ b/controller/lport.c > @@ -79,14 +79,11 @@ lport_lookup_by_key(struct ovsdb_idl_index > *sbrec_datapath_binding_by_key, > port_key); > } > > -bool > -lport_is_chassis_resident(struct ovsdb_idl_index *sbrec_port_binding_by_name, > - const struct sbrec_chassis *chassis, > - const struct sset *active_tunnels, > - const char *port_name) > +static bool > +lport_pb_is_chassis_resident(const struct sbrec_chassis *chassis, > + const struct sset *active_tunnels, > + const struct sbrec_port_binding *pb) > { > - const struct sbrec_port_binding *pb > - = lport_lookup_by_name(sbrec_port_binding_by_name, port_name); > if (!pb || !pb->chassis) { > return false; > } > @@ -98,6 +95,39 @@ lport_is_chassis_resident(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > } > } > > +bool > +lport_is_chassis_resident(struct ovsdb_idl_index *sbrec_port_binding_by_name, > + const struct sbrec_chassis *chassis, > + const struct sset *active_tunnels, > + const char *port_name) > +{ > + const struct sbrec_port_binding *pb > + = lport_lookup_by_name(sbrec_port_binding_by_name, port_name); > + return lport_pb_is_chassis_resident(chassis, active_tunnels, pb); > +} > + > +bool > +lport_is_local(struct ovsdb_idl_index *sbrec_port_binding_by_name, > + const struct sbrec_chassis *chassis, > + const struct sset *active_tunnels, > + const char *port_name) > +{ > + const struct sbrec_port_binding *pb = lport_lookup_by_name( > + sbrec_port_binding_by_name, port_name); > + > + if (lport_pb_is_chassis_resident(chassis, active_tunnels, pb)) { > + return true; > + } > + > + const char *crp = smap_get(&pb->options, "chassis-redirect-port"); > + if (!crp) { > + return false; > + } > + > + return lport_is_chassis_resident(sbrec_port_binding_by_name, chassis, > + active_tunnels, crp); > +} > + > const struct sbrec_port_binding * > lport_get_peer(const struct sbrec_port_binding *pb, > struct ovsdb_idl_index *sbrec_port_binding_by_name) > diff --git a/controller/lport.h b/controller/lport.h > index 2f72aef5e..f02c47f3a 100644 > --- a/controller/lport.h > +++ b/controller/lport.h > @@ -68,6 +68,10 @@ lport_is_chassis_resident(struct ovsdb_idl_index > *sbrec_port_binding_by_name, > const struct sbrec_chassis *chassis, > const struct sset *active_tunnels, > const char *port_name); > +bool lport_is_local(struct ovsdb_idl_index *sbrec_port_binding_by_name, > + const struct sbrec_chassis *chassis, > + const struct sset *active_tunnels, > + const char *port_name); > const struct sbrec_port_binding *lport_get_peer( > const struct sbrec_port_binding *, > struct ovsdb_idl_index *sbrec_port_binding_by_name); > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > index 5b31f6fd2..612fccb42 100644 > --- a/controller/ovn-controller.c > +++ b/controller/ovn-controller.c > @@ -4852,7 +4852,15 @@ struct ed_type_route { > /* Contains struct tracked_datapath entries for local datapaths subject > to > * route exchange. */ > struct hmap tracked_route_datapaths; > - /* Contains struct advertise_datapath_entry */ > + > + /* Contains the tracked_ports that in the last run where bound locally. > */ > + struct sset tracked_ports_local; > + > + /* Contains the tracked_ports that in the last run where bound not > + * local. */ > + struct sset tracked_ports_remote; > + > + /* Contains struct advertise_datapath_entry. */ > struct hmap announce_routes; > }; > > @@ -4893,10 +4901,14 @@ en_route_run(struct engine_node *node, void *data) > > struct route_ctx_out r_ctx_out = { > .tracked_re_datapaths = &re_data->tracked_route_datapaths, > + .tracked_ports_local = &re_data->tracked_ports_local, > + .tracked_ports_remote = &re_data->tracked_ports_remote, > .announce_routes = &re_data->announce_routes, > }; > > tracked_datapaths_clear(r_ctx_out.tracked_re_datapaths); > + sset_clear(r_ctx_out.tracked_ports_local); > + sset_clear(r_ctx_out.tracked_ports_remote); > > route_run(&r_ctx_in, &r_ctx_out); > > @@ -4911,6 +4923,8 @@ en_route_init(struct engine_node *node OVS_UNUSED, > struct ed_type_route *data = xzalloc(sizeof *data); > > hmap_init(&data->tracked_route_datapaths); > + sset_init(&data->tracked_ports_local); > + sset_init(&data->tracked_ports_remote); > hmap_init(&data->announce_routes); > data->ovnsb_idl = arg->sb_idl; > > @@ -4923,6 +4937,8 @@ en_route_cleanup(void *data) > struct ed_type_route *re_data = data; > > tracked_datapaths_destroy(&re_data->tracked_route_datapaths); > + sset_destroy(&re_data->tracked_ports_local); > + sset_destroy(&re_data->tracked_ports_remote); > route_cleanup(&re_data->announce_routes); > hmap_destroy(&re_data->announce_routes); > } > @@ -4970,6 +4986,26 @@ route_sb_port_binding_data_handler(struct engine_node > *node, void *data) > const struct sbrec_port_binding_table *pb_table = > EN_OVSDB_GET(engine_get_input("SB_port_binding", node)); > > + const struct ovsrec_open_vswitch_table *ovs_table = > + EN_OVSDB_GET(engine_get_input("OVS_open_vswitch", node)); > + const char *chassis_id = get_ovs_chassis_id(ovs_table); > + ovs_assert(chassis_id); > + > + struct ovsdb_idl_index *sbrec_chassis_by_name = > + engine_ovsdb_node_get_index( > + engine_get_input("SB_chassis", node), > + "name"); > + const struct sbrec_chassis *chassis > + = chassis_lookup_by_name(sbrec_chassis_by_name, chassis_id); > + ovs_assert(chassis); > + > + struct ovsdb_idl_index *sbrec_port_binding_by_name = > + engine_ovsdb_node_get_index( > + engine_get_input("SB_port_binding", node), > + "name"); > + struct ed_type_runtime_data *rt_data = > + engine_get_input_data("runtime_data", node); > + > const struct sbrec_port_binding *sbrec_pb; > SBREC_PORT_BINDING_TABLE_FOR_EACH_TRACKED (sbrec_pb, pb_table) { > struct tracked_datapath *re_t_dp = > @@ -4987,6 +5023,24 @@ route_sb_port_binding_data_handler(struct engine_node > *node, void *data) > return false; > } > > + if (sset_contains(&re_data->tracked_ports_local, > + sbrec_pb->logical_port)) { > + if (!route_exchange_find_port(sbrec_port_binding_by_name, > chassis, > + &rt_data->active_tunnels, sbrec_pb)) > { > + /* The port was previously local but now it no longer is. */ > + return false; > + } > + } > + > + if (sset_contains(&re_data->tracked_ports_remote, > + sbrec_pb->logical_port)) { > + if (route_exchange_find_port(sbrec_port_binding_by_name, chassis, > + &rt_data->active_tunnels, sbrec_pb)) { > + /* The port was previously remote but now we bound it. */ > + return false; > + } > + } > + > } > return true; > } > diff --git a/controller/route-exchange-netlink.c > b/controller/route-exchange-netlink.c > index c5a77efe4..6a9612a7e 100644 > --- a/controller/route-exchange-netlink.c > +++ b/controller/route-exchange-netlink.c > @@ -102,7 +102,8 @@ re_nl_delete_vrf(const char *ifname) > > static int > modify_route(uint32_t type, uint32_t flags_arg, uint32_t table_id, > - const struct in6_addr *dst, unsigned int plen) > + const struct in6_addr *dst, unsigned int plen, > + unsigned int priority) > { > uint32_t flags = NLM_F_REQUEST | NLM_F_ACK; > bool is_ipv4 = IN6_IS_ADDR_V4MAPPED(dst); > @@ -128,6 +129,7 @@ modify_route(uint32_t type, uint32_t flags_arg, uint32_t > table_id, > rt->rtm_dst_len = plen; > > nl_msg_put_u32(&request, RTA_TABLE, table_id); > + nl_msg_put_u32(&request, RTA_PRIORITY, priority); > > if (is_ipv4) { > nl_msg_put_be32(&request, RTA_DST, in6_addr_get_mapped_ipv4(dst)); > @@ -167,7 +169,7 @@ modify_route(uint32_t type, uint32_t flags_arg, uint32_t > table_id, > > int > re_nl_add_route(uint32_t table_id, const struct in6_addr *dst, > - unsigned int plen) > + unsigned int plen, unsigned int priority) > { > uint32_t flags = NLM_F_CREATE | NLM_F_EXCL; > uint32_t type = RTM_NEWROUTE; > @@ -179,12 +181,12 @@ re_nl_add_route(uint32_t table_id, const struct > in6_addr *dst, > return EINVAL; > } > > - return modify_route(type, flags, table_id, dst, plen); > + return modify_route(type, flags, table_id, dst, plen, priority); > } > > int > re_nl_delete_route(uint32_t table_id, const struct in6_addr *dst, > - unsigned int plen) > + unsigned int plen, unsigned int priority) > { > if (!TABLE_ID_VALID(table_id)) { > VLOG_WARN_RL(&rl, > @@ -193,7 +195,7 @@ re_nl_delete_route(uint32_t table_id, const struct > in6_addr *dst, > return EINVAL; > } > > - return modify_route(RTM_DELROUTE, 0, table_id, dst, plen); > + return modify_route(RTM_DELROUTE, 0, table_id, dst, plen, priority); > } > > void > @@ -246,13 +248,15 @@ handle_route_msg(const struct route_table_msg *msg, > void *data) > HMAPX_FOR_EACH_SAFE (hn, handle_data->routes_to_advertise) { > ar = hn->data; > if (ipv6_addr_equals(&ar->addr, &rd->rta_dst) > - && ar->plen == rd->rtm_dst_len) { > + && ar->plen == rd->rtm_dst_len > + && ar->priority == rd->rta_priority) { > hmapx_delete(handle_data->routes_to_advertise, hn); > return; > } > } > + Nit: unrelated newline. > err = re_nl_delete_route(rd->rta_table_id, &rd->rta_dst, > - rd->rtm_dst_len); > + rd->rtm_dst_len, rd->rta_priority); > if (err) { > char addr_s[INET6_ADDRSTRLEN + 1]; > VLOG_WARN_RL(&rl, "Delete route table_id=%"PRIu32" dst=%s plen=%d: > %s", > @@ -291,7 +295,7 @@ re_nl_sync_routes(uint32_t table_id, const struct hmap > *routes, > struct hmapx_node *hn; > HMAPX_FOR_EACH (hn, &routes_to_advertise) { > ar = hn->data; > - int err = re_nl_add_route(table_id, &ar->addr, ar->plen); > + int err = re_nl_add_route(table_id, &ar->addr, ar->plen, > ar->priority); > if (err) { > char addr_s[INET6_ADDRSTRLEN + 1]; > VLOG_WARN_RL(&rl, "Add route table_id=%"PRIu32" dst=%s " > diff --git a/controller/route-exchange-netlink.h > b/controller/route-exchange-netlink.h > index e4fe81977..b930f05a2 100644 > --- a/controller/route-exchange-netlink.h > +++ b/controller/route-exchange-netlink.h > @@ -44,9 +44,9 @@ int re_nl_create_vrf(const char *ifname, uint32_t table_id); > int re_nl_delete_vrf(const char *ifname); > > int re_nl_add_route(uint32_t table_id, const struct in6_addr *dst, > - unsigned int plen); > + unsigned int plen, unsigned int priority); > int re_nl_delete_route(uint32_t table_id, const struct in6_addr *dst, > - unsigned int plen); > + unsigned int plen, unsigned int priority); > > void re_nl_dump(uint32_t table_id); > > diff --git a/controller/route.c b/controller/route.c > index 22f38976b..2d86edc97 100644 > --- a/controller/route.c > +++ b/controller/route.c > @@ -37,6 +37,9 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, > 20); > * in the corresponding VRF interface name. */ > #define MAX_TABLE_ID 1000000000 > > +#define PRIORITY_DEFAULT 1000 > +#define PRIORITY_LOCAL_BOUND 100 > + > bool > route_exchange_relevant_port(const struct sbrec_port_binding *pb) > { > @@ -50,11 +53,11 @@ advertise_route_hash(const struct in6_addr *dst, unsigned > int plen) > return hash_int(plen, hash); > } > > -static const struct sbrec_port_binding* > -find_route_exchange_pb(struct ovsdb_idl_index *sbrec_port_binding_by_name, > - const struct sbrec_chassis *chassis, > - const struct sset *active_tunnels, > - const struct sbrec_port_binding *pb) > +const struct sbrec_port_binding* > +route_exchange_find_port(struct ovsdb_idl_index *sbrec_port_binding_by_name, > + const struct sbrec_chassis *chassis, > + const struct sset *active_tunnels, > + const struct sbrec_port_binding *pb) > { > if (!pb) { > return NULL; > @@ -126,7 +129,7 @@ route_run(struct route_ctx_in *r_ctx_in, > for (size_t i = 0; i < ld->n_peer_ports; i++) { > const struct sbrec_port_binding *local_peer > = ld->peer_ports[i].local; > - const struct sbrec_port_binding *repb = find_route_exchange_pb( > + const struct sbrec_port_binding *repb = route_exchange_find_port( > r_ctx_in->sbrec_port_binding_by_name, > r_ctx_in->chassis, > r_ctx_in->active_tunnels, > @@ -188,11 +191,29 @@ cleanup: > continue; > } > > + unsigned int priority = PRIORITY_DEFAULT; > + > + if (route->tracked_port) { > + if (lport_is_local( > + r_ctx_in->sbrec_port_binding_by_name, > + r_ctx_in->chassis, > + r_ctx_in->active_tunnels, > + route->tracked_port->logical_port)) { Nit: in my opinion this looks better if we rewrite it as: if (lport_is_local(r_ctx_in->sbrec_port_binding_by_name, r_ctx_in->chassis, r_ctx_in->active_tunnels, route->tracked_port->logical_port)) { > + priority = PRIORITY_LOCAL_BOUND; > + sset_add(r_ctx_out->tracked_ports_local, > + route->tracked_port->logical_port); > + } else { > + sset_add(r_ctx_out->tracked_ports_remote, > + route->tracked_port->logical_port); > + } > + } > + > struct advertise_route_entry *ar = xzalloc(sizeof(*ar)); > hmap_insert(&ad->routes, &ar->node, > advertise_route_hash(&prefix, plen)); > ar->addr = prefix; > ar->plen = plen; > + ar->priority = priority; > } > } > > diff --git a/controller/route.h b/controller/route.h > index f23115abb..986c35ec0 100644 > --- a/controller/route.h > +++ b/controller/route.h > @@ -40,6 +40,13 @@ struct route_ctx_in { > > struct route_ctx_out { > struct hmap *tracked_re_datapaths; > + > + /* Contains the tracked_ports that in the last run where bound locally */ Typo: s/where/were. Missing period at the end. > + struct sset *tracked_ports_local; > + > + /* Contains the tracked_ports that in the last run where bound not local > */ I guess you meant "were not bound locally.", right? > + struct sset *tracked_ports_remote; > + > /* Contains struct advertise_datapath_entry */ > struct hmap *announce_routes; > }; > @@ -60,8 +67,14 @@ struct advertise_route_entry { > struct hmap_node node; > struct in6_addr addr; > unsigned int plen; > + unsigned int priority; > }; > > +const struct sbrec_port_binding *route_exchange_find_port( > + struct ovsdb_idl_index *sbrec_port_binding_by_name, > + const struct sbrec_chassis *chassis, > + const struct sset *active_tunnels, > + const struct sbrec_port_binding *pb); > bool route_exchange_relevant_port(const struct sbrec_port_binding *pb); > uint32_t advertise_route_hash(const struct in6_addr *dst, unsigned int plen); > void route_run(struct route_ctx_in *, struct route_ctx_out *); > diff --git a/tests/system-ovn.at b/tests/system-ovn.at > index e17d9534e..ef43e242e 100644 > --- a/tests/system-ovn.at > +++ b/tests/system-ovn.at > @@ -15029,8 +15029,8 @@ ovnvrf1337 1337 > # "ip route list" output has a trailing space on each line. > # The awk magic removes all trailing spaces. > OVS_WAIT_UNTIL_EQUAL([ip route list vrf ovnvrf1337 | awk '{$1=$1};1'], [dnl > -blackhole 192.0.2.0/24 proto 84 > -blackhole 198.51.100.0/24 proto 84]) > +blackhole 192.0.2.0/24 proto 84 metric 1000 > +blackhole 198.51.100.0/24 proto 84 metric 1000]) > > # We now switch to announcing host routes and expect 192.0.2.0/24 to be gone > # and the following to be added: > @@ -15038,15 +15038,39 @@ blackhole 198.51.100.0/24 proto 84]) > # * 192.0.2.2/32 > # * 192.0.2.3/32 > # * 192.0.2.10/32 > +# The last 3 of them are local to the current chassis so we expect a better > +# prio. > check ovn-nbctl --wait=hv set Logical_Router_Port internet-public \ > > options:dynamic-routing-connected-as-host-routes=true > > OVS_WAIT_UNTIL_EQUAL([ip route list vrf ovnvrf1337 | awk '{$1=$1};1'], [dnl > -blackhole 192.0.2.1 proto 84 > -blackhole 192.0.2.2 proto 84 > -blackhole 192.0.2.3 proto 84 > -blackhole 192.0.2.10 proto 84 > -blackhole 198.51.100.0/24 proto 84]) > +blackhole 192.0.2.1 proto 84 metric 1000 > +blackhole 192.0.2.2 proto 84 metric 100 > +blackhole 192.0.2.3 proto 84 metric 100 > +blackhole 192.0.2.10 proto 84 metric 100 > +blackhole 198.51.100.0/24 proto 84 metric 1000]) > + > +# If the pr1-public lrp is now removed from this hypervisor the route metric > +# will go back to the default. > +# For this we just schedule it on a non existing chassis. > +check ovn-nbctl lrp-del-gateway-chassis pr1-public hv1 > +check ovn-nbctl --wait=hv lrp-set-gateway-chassis pr1-public hv123 > +OVS_WAIT_UNTIL_EQUAL([ip route list vrf ovnvrf1337 | awk '{$1=$1};1'], [dnl > +blackhole 192.0.2.1 proto 84 metric 1000 > +blackhole 192.0.2.2 proto 84 metric 1000 > +blackhole 192.0.2.3 proto 84 metric 100 > +blackhole 192.0.2.10 proto 84 metric 1000 > +blackhole 198.51.100.0/24 proto 84 metric 1000]) > + > +# Moving pr1-public back will also change the route metrics again. > +check ovn-nbctl lrp-del-gateway-chassis pr1-public hv123 > +check ovn-nbctl --wait=hv lrp-set-gateway-chassis pr1-public hv1 > +OVS_WAIT_UNTIL_EQUAL([ip route list vrf ovnvrf1337 | awk '{$1=$1};1'], [dnl > +blackhole 192.0.2.1 proto 84 metric 1000 > +blackhole 192.0.2.2 proto 84 metric 100 > +blackhole 192.0.2.3 proto 84 metric 100 > +blackhole 192.0.2.10 proto 84 metric 100 > +blackhole 198.51.100.0/24 proto 84 metric 1000]) > > # Now we test route learning. > check_row_count Learned_Route 0 > @@ -15254,8 +15278,8 @@ ovnvrf1337 1337 > # "ip route list" output has a trailing space on each line. > # The awk magic removes all trailing spaces. > OVS_WAIT_UNTIL_EQUAL([ip route list vrf ovnvrf1337 | awk '{$1=$1};1'], [dnl > -blackhole 192.0.2.0/24 proto 84 > -blackhole 198.51.100.0/24 proto 84]) > +blackhole 192.0.2.0/24 proto 84 metric 1000 > +blackhole 198.51.100.0/24 proto 84 metric 1000]) > > # We now switch to announcing host routes and expect 192.0.2.0/24 to be gone > # and the following to be added: > @@ -15263,15 +15287,39 @@ blackhole 198.51.100.0/24 proto 84]) > # * 192.0.2.2/32 > # * 192.0.2.3/32 > # * 192.0.2.10/32 > +# The last 3 of them are local to the current chassis so we expect a better > +# prio. > check ovn-nbctl --wait=hv set Logical_Router_Port internet-public \ > > options:dynamic-routing-connected-as-host-routes=true > > OVS_WAIT_UNTIL_EQUAL([ip route list vrf ovnvrf1337 | awk '{$1=$1};1'], [dnl > -blackhole 192.0.2.1 proto 84 > -blackhole 192.0.2.2 proto 84 > -blackhole 192.0.2.3 proto 84 > -blackhole 192.0.2.10 proto 84 > -blackhole 198.51.100.0/24 proto 84]) > +blackhole 192.0.2.1 proto 84 metric 100 > +blackhole 192.0.2.2 proto 84 metric 100 > +blackhole 192.0.2.3 proto 84 metric 100 > +blackhole 192.0.2.10 proto 84 metric 100 > +blackhole 198.51.100.0/24 proto 84 metric 1000]) > + > +# If the pr1-public lrp is now removed from this hypervisor the route metric > +# will go back to the default. > +# For this we just schedule it on a non existing chassis. > +check ovn-nbctl lrp-del-gateway-chassis pr1-public hv1 > +check ovn-nbctl --wait=hv lrp-set-gateway-chassis pr1-public hv123 > +OVS_WAIT_UNTIL_EQUAL([ip route list vrf ovnvrf1337 | awk '{$1=$1};1'], [dnl > +blackhole 192.0.2.1 proto 84 metric 100 > +blackhole 192.0.2.2 proto 84 metric 1000 > +blackhole 192.0.2.3 proto 84 metric 100 > +blackhole 192.0.2.10 proto 84 metric 1000 > +blackhole 198.51.100.0/24 proto 84 metric 1000]) > + > +# Moving pr1-public back will also change the route metrics again. > +check ovn-nbctl lrp-del-gateway-chassis pr1-public hv123 > +check ovn-nbctl --wait=hv lrp-set-gateway-chassis pr1-public hv1 > +OVS_WAIT_UNTIL_EQUAL([ip route list vrf ovnvrf1337 | awk '{$1=$1};1'], [dnl > +blackhole 192.0.2.1 proto 84 metric 100 > +blackhole 192.0.2.2 proto 84 metric 100 > +blackhole 192.0.2.3 proto 84 metric 100 > +blackhole 192.0.2.10 proto 84 metric 100 > +blackhole 198.51.100.0/24 proto 84 metric 1000]) > > # Now we test route learning. > check_row_count Learned_Route 0 Thanks, Dumitru _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev