On 4/24/26 3:23 PM, Ales Musil wrote:
> Add nexthop_exchange engine node that handles nexthop updates
> keeping the internal nexthop mapping up to date. The noop handler
> for evpn_fdb will be replaced by later commit.
>
> Signed-off-by: Ales Musil <[email protected]>
> ---
Hi Ales,
> controller/nexthop-exchange-stub.c | 20 +++++++
> controller/nexthop-exchange.c | 82 ++++++++++++++++++++++-----
> controller/nexthop-exchange.h | 5 ++
> controller/ovn-controller.c | 89 ++++++++++++++++++++++++++++++
> tests/ovn-inc-proc-graph-dump.at | 3 +
> 5 files changed, 184 insertions(+), 15 deletions(-)
>
> diff --git a/controller/nexthop-exchange-stub.c
> b/controller/nexthop-exchange-stub.c
> index 2742dc7e2..52c1bf028 100644
> --- a/controller/nexthop-exchange-stub.c
> +++ b/controller/nexthop-exchange-stub.c
> @@ -18,6 +18,7 @@
> #include "lib/netlink.h"
> #include "openvswitch/hmap.h"
> #include "openvswitch/ofpbuf.h"
> +#include "vec.h"
>
> #include "nexthop-exchange.h"
>
> @@ -34,9 +35,28 @@ nexthop_entry_format(struct ds *ds OVS_UNUSED,
> {
> }
>
> +struct nexthop_entry *
> +nexthop_entry_find(const struct hmap *nexthops OVS_UNUSED,
> + uint32_t id OVS_UNUSED)
> +{
> + return NULL;
> +}
> +
> int
> nh_table_parse(struct ofpbuf *buf OVS_UNUSED,
> struct nh_table_msg *change OVS_UNUSED)
> {
> return 0;
> }
> +
> +bool
> +nexthops_handle_changes(struct hmap *nexthops OVS_UNUSED,
> + struct vector *msgs OVS_UNUSED)
> +{
> + return false;
> +}
> +
> +void
> +nexthops_destroy(struct hmap *nexthops OVS_UNUSED)
> +{
> +}
> diff --git a/controller/nexthop-exchange.c b/controller/nexthop-exchange.c
> index a2ad643a6..8718b893f 100644
> --- a/controller/nexthop-exchange.c
> +++ b/controller/nexthop-exchange.c
> @@ -19,9 +19,11 @@
>
> #include "lib/netlink.h"
> #include "lib/netlink-socket.h"
> +#include "hmapx.h"
> #include "openvswitch/ofpbuf.h"
> #include "openvswitch/vlog.h"
> #include "packets.h"
> +#include "vec.h"
>
> #include "nexthop-exchange.h"
>
> @@ -109,6 +111,20 @@ nexthop_entry_format(struct ds *ds, const struct
> nexthop_entry *nhe)
> }
> }
>
> +struct nexthop_entry *
> +nexthop_entry_find(const struct hmap *nexthops, uint32_t id)
> +{
> + uint32_t hash = nexthop_entry_hash(id);
> + struct nexthop_entry *nhe;
> + HMAP_FOR_EACH_WITH_HASH (nhe, hmap_node, hash, nexthops) {
> + if (nhe->id == id) {
> + return nhe;
> + }
> + }
> +
> + return NULL;
> +}
> +
> /* Parse Netlink message in buf, which is expected to contain a UAPI nhmsg
> * header and associated nexthop attributes. This will allocate
> * 'struct nexthop_entry' which needs to be freed by the caller.
> @@ -128,6 +144,56 @@ nh_table_parse(struct ofpbuf *buf, struct nh_table_msg
> *change)
> nlmsg, change);
> }
>
> +bool
> +nexthops_handle_changes(struct hmap *nexthops, struct vector *msgs)
> +{
> + if (vector_is_empty(msgs)) {
> + return false;
> + }
> +
> + struct hmapx updated_groups = HMAPX_INITIALIZER(&updated_groups);
> +
> + struct nh_table_msg *msg;
> + VECTOR_FOR_EACH_PTR (msgs, msg) {
> + struct nexthop_entry *nhe = nexthop_entry_find(nexthops,
> msg->nhe->id);
> + if (nhe) {
> + hmap_remove(nexthops, &nhe->hmap_node);
We assume here that if nhe used to be referenced by a different
nexthop_grp_entry->gateway, the same batch of changes will also include
an RTM_DELNEXTHOP for that nexthop too.
It feels a bit fragile but it's probably fine. The alternative would be
to store backrefs but that seems a bit excessive.
Let's leave it as is.
> + free(nhe);
> + }
> +
> + if (msg->nlmsg_type == RTM_NEWNEXTHOP) {
> + hmap_insert(nexthops, &msg->nhe->hmap_node,
> + nexthop_entry_hash(msg->nhe->id));
> +
> + if (msg->nhe->n_grps) {
> + hmapx_add(&updated_groups, msg->nhe);
> + }
> +
> + /* The nexthop entry moved into the hmap, prevent double free. */
> + msg->nhe = NULL;
> + }
> + }
> +
> + struct hmapx_node *hmapx_node;
> + HMAPX_FOR_EACH (hmapx_node, &updated_groups) {
> + struct nexthop_entry *nhe = hmapx_node->data;
> + nh_populate_grp_pointers(nhe, nexthops);
> + }
> +
> + hmapx_destroy(&updated_groups);
> +
> + return true;
> +}
> +
> +void
> +nexthops_destroy(struct hmap *nexthops)
> +{
> + struct nexthop_entry *entry;
> + HMAP_FOR_EACH_POP (entry, hmap_node, nexthops) {
> + free(entry);
> + }
> +}
> +
> static int
> nh_table_parse__(struct ofpbuf *buf, size_t ofs, const struct nlmsghdr
> *nlmsg,
> struct nh_table_msg *change)
> @@ -214,25 +280,11 @@ nexthop_entry_hash(uint32_t id)
> return hash_int(id, 0);
> }
>
> -static struct nexthop_entry *
> -nexthop_find(struct hmap *nexthops, uint32_t id)
> -{
> - uint32_t hash = nexthop_entry_hash(id);
> - struct nexthop_entry *nhe;
> - HMAP_FOR_EACH_WITH_HASH (nhe, hmap_node, hash, nexthops) {
> - if (nhe->id == id) {
> - return nhe;
> - }
> - }
> -
> - return NULL;
> -}
> -
> static void
> nh_populate_grp_pointers(struct nexthop_entry *nhe, struct hmap *nexthops)
> {
> for (size_t i = 0; i < nhe->n_grps; i++) {
> struct nexthop_grp_entry *grp = &nhe->grps[i];
> - grp->gateway = nexthop_find(nexthops, grp->id);
> + grp->gateway = nexthop_entry_find(nexthops, grp->id);
> }
> }
> diff --git a/controller/nexthop-exchange.h b/controller/nexthop-exchange.h
> index e94fdc73a..73f08c2fe 100644
> --- a/controller/nexthop-exchange.h
> +++ b/controller/nexthop-exchange.h
> @@ -23,6 +23,7 @@
>
> struct ds;
> struct ofpbuf;
> +struct vector;
>
> struct nexthop_grp_entry {
> /* The id of the nexthop gateway. */
> @@ -56,6 +57,10 @@ struct nh_table_msg {
>
> void nexthops_sync(struct hmap *nexthops);
> void nexthop_entry_format(struct ds *ds, const struct nexthop_entry *nhe);
> +struct nexthop_entry *nexthop_entry_find(const struct hmap *nexthops,
> + uint32_t id);
> int nh_table_parse(struct ofpbuf *, struct nh_table_msg *change);
> +bool nexthops_handle_changes(struct hmap *nexthops, struct vector *msgs);
> +void nexthops_destroy(struct hmap *nexthops);
>
> #endif /* NEXTHOP_EXCHANGE_H */
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index 35a5cd0b4..c46530f90 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c
> @@ -99,6 +99,7 @@
> #include "neighbor.h"
> #include "neighbor-exchange.h"
> #include "neighbor-exchange-netlink.h"
> +#include "nexthop-exchange.h"
> #include "evpn-arp.h"
> #include "evpn-binding.h"
> #include "evpn-fdb.h"
> @@ -6344,6 +6345,83 @@ en_neighbor_table_notify_run(struct engine_node *node
> OVS_UNUSED,
> return state;
> }
>
> +/* The nexthop_exchange node is an input node, but is enabled/disabled
> + * based on en_neighbor_exchange node. The reason being that engine
> + * periodically runs input nodes to check if there are updates, so it could
> + * be polled for updates without requiring other nodes to run first. */
> +struct ed_type_nexthop_exchange {
> + struct hmap nexthops;
> + bool enabled;
> + bool recompute;
> +};
> +
> +static void *
> +en_nexthop_exchange_init(struct engine_node *node OVS_UNUSED,
> + struct engine_arg *arg OVS_UNUSED)
> +{
> + struct ed_type_nexthop_exchange *nhe_data = xmalloc(sizeof *nhe_data);
> + *nhe_data = (struct ed_type_nexthop_exchange) {
> + .nexthops = HMAP_INITIALIZER(&nhe_data->nexthops),
> + .enabled = false,
> + .recompute = true,
> + };
> +
> + return nhe_data;
> +}
> +
> +static void
> +en_nexthop_exchange_cleanup(void *data)
> +{
> + struct ed_type_nexthop_exchange *nhe_data = data;
> + nexthops_destroy(&nhe_data->nexthops);
> + hmap_destroy(&nhe_data->nexthops);
> +}
> +
> +static enum engine_node_state
> +en_nexthop_exchange_run(struct engine_node *node OVS_UNUSED, void *data)
> +{
> + struct ed_type_nexthop_exchange *nhe_data = data;
> +
> + if (!nhe_data->enabled) {
> + return EN_UNCHANGED;
> + }
> +
> + if (nhe_data->recompute) {
> + nexthops_destroy(&nhe_data->nexthops);
> + nexthops_sync(&nhe_data->nexthops);
> + /* We are doing a full sync, let's clear any data
> + * that might accumulate in the meantime. */
> + ovn_netlink_notifier_flush(OVN_NL_NOTIFIER_NEXTHOP);
> +
> + nhe_data->recompute = false;
> + return EN_UPDATED;
> + }
> +
> + struct vector *msgs = ovn_netlink_get_msgs(OVN_NL_NOTIFIER_NEXTHOP);
> + bool updated = nexthops_handle_changes(&nhe_data->nexthops, msgs);
> + ovn_netlink_notifier_flush(OVN_NL_NOTIFIER_NEXTHOP);
> +
> + return updated ? EN_UPDATED : EN_UNCHANGED;
> +}
> +
> +static void
> +nexthop_exchange_update(struct ed_type_nexthop_exchange *nhe_data,
> + bool enabled)
> +{
> + if (nhe_data->enabled == enabled) {
> + return;
> + }
> +
> + if (nhe_data->enabled && !enabled) {
> + nexthops_destroy(&nhe_data->nexthops);
> + } else if (!nhe_data->enabled && enabled) {
> + nhe_data->recompute = true;
> + }
> +
> + nhe_data->enabled = enabled;
> + ovn_netlink_update_notifier(OVN_NL_NOTIFIER_NEXTHOP, enabled);
> +}
> +
> struct ed_type_neighbor_exchange {
> /* Contains 'struct evpn_remote_vtep'. */
> struct hmap remote_vteps;
> @@ -6389,6 +6467,8 @@ en_neighbor_exchange_run(struct engine_node *node, void
> *data_)
> engine_get_input_data("neighbor", node);
> struct ed_type_neighbor_table_notify *nt_notify =
> engine_get_input_data("neighbor_table_notify", node);
> + struct ed_type_nexthop_exchange *nhe_data =
> + engine_get_input_data("nexthop_exchange", node);
>
> evpn_remote_vteps_clear(&data->remote_vteps);
> evpn_static_entries_clear(&data->static_fdbs);
> @@ -6407,6 +6487,7 @@ en_neighbor_exchange_run(struct engine_node *node, void
> *data_)
>
> neighbor_exchange_run(&n_ctx_in, &n_ctx_out);
> neighbor_table_notify_update(&nt_notify->watches);
> + nexthop_exchange_update(nhe_data, !vector_is_empty(&nt_notify->watches));
This reads a bit weird to be honest. 'nhe_data' is the data of the
"nexthop_exchange" node, which is an input of the "neighbor_exchange"
node whose run() function we're executing here.
So we're changing the data owned by an input node inside a different
node's run callback. That is not really nice.
OTOH, we already broke that pattern with 5ec13ae6eff6 ("controller:
Consolidate the netlink notifiers.") as we're doing the same thing
(changing input node data) for rt_notify.
Let's keep it for now but maybe we should follow up with a finer grain
approach that doesn't require this kind of semantics?
>
> return EN_UPDATED;
> }
> @@ -6864,6 +6945,7 @@ static ENGINE_NODE(neighbor);
> static ENGINE_NODE(neighbor_table_notify);
> static ENGINE_NODE(neighbor_exchange);
> static ENGINE_NODE(neighbor_exchange_status);
> +static ENGINE_NODE(nexthop_exchange);
> static ENGINE_NODE(evpn_vtep_binding, CLEAR_TRACKED_DATA);
> static ENGINE_NODE(evpn_fdb, CLEAR_TRACKED_DATA);
> static ENGINE_NODE(evpn_arp, CLEAR_TRACKED_DATA);
> @@ -7117,6 +7199,10 @@ inc_proc_ovn_controller_init(
> engine_add_input(&en_neighbor_exchange, &en_neighbor_table_notify, NULL);
> engine_add_input(&en_neighbor_exchange, &en_neighbor_exchange_status,
> NULL);
> + /* We just need to enable/disable the nexthop exchange based on
> + * the neighbor status. */
> + engine_add_input(&en_neighbor_exchange, &en_nexthop_exchange,
> + engine_noop_handler);
>
> engine_add_input(&en_evpn_vtep_binding, &en_ovs_open_vswitch, NULL);
> engine_add_input(&en_evpn_vtep_binding, &en_ovs_bridge, NULL);
> @@ -7133,6 +7219,9 @@ inc_proc_ovn_controller_init(
> engine_add_input(&en_evpn_fdb, &en_neighbor_exchange, NULL);
> engine_add_input(&en_evpn_fdb, &en_evpn_vtep_binding,
> evpn_fdb_vtep_binding_handler);
> + /* XXX: This is just a place holder and it will be updated later on. */
> + engine_add_input(&en_evpn_fdb, &en_nexthop_exchange,
> + engine_noop_handler);
>
> engine_add_input(&en_evpn_arp, &en_neighbor_exchange, NULL);
> engine_add_input(&en_evpn_arp, &en_evpn_vtep_binding,
> diff --git a/tests/ovn-inc-proc-graph-dump.at
> b/tests/ovn-inc-proc-graph-dump.at
> index 178310978..6b4d94835 100644
> --- a/tests/ovn-inc-proc-graph-dump.at
> +++ b/tests/ovn-inc-proc-graph-dump.at
> @@ -401,11 +401,13 @@ digraph "Incremental-Processing-Engine" {
> host_if_monitor [[style=filled, shape=box, fillcolor=white,
> label="host_if_monitor"]];
> neighbor_table_notify [[style=filled, shape=box, fillcolor=white,
> label="neighbor_table_notify"]];
> neighbor_exchange_status [[style=filled, shape=box, fillcolor=white,
> label="neighbor_exchange_status"]];
> + nexthop_exchange [[style=filled, shape=box, fillcolor=white,
> label="nexthop_exchange"]];
> neighbor_exchange [[style=filled, shape=box, fillcolor=white,
> label="neighbor_exchange"]];
> neighbor -> neighbor_exchange [[label=""]];
> host_if_monitor -> neighbor_exchange [[label=""]];
> neighbor_table_notify -> neighbor_exchange [[label=""]];
> neighbor_exchange_status -> neighbor_exchange [[label=""]];
> + nexthop_exchange -> neighbor_exchange [[label="engine_noop_handler"]];
> evpn_vtep_binding [[style=filled, shape=box, fillcolor=white,
> label="evpn_vtep_binding"]];
> OVS_open_vswitch -> evpn_vtep_binding [[label=""]];
> OVS_bridge -> evpn_vtep_binding [[label=""]];
> @@ -416,6 +418,7 @@ digraph "Incremental-Processing-Engine" {
> evpn_fdb [[style=filled, shape=box, fillcolor=white, label="evpn_fdb"]];
> neighbor_exchange -> evpn_fdb [[label=""]];
> evpn_vtep_binding -> evpn_fdb [[label="evpn_fdb_vtep_binding_handler"]];
> + nexthop_exchange -> evpn_fdb [[label="engine_noop_handler"]];
> evpn_arp [[style=filled, shape=box, fillcolor=white, label="evpn_arp"]];
> neighbor_exchange -> evpn_arp [[label=""]];
> evpn_vtep_binding -> evpn_arp [[label="evpn_arp_vtep_binding_handler"]];
Applied to main, thanks!
Regards,
Dumitru
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev