On Wed, Sep 10, 2025 at 3:04 PM Ales Musil <amu...@redhat.com> wrote:

>
>
> On Wed, Sep 10, 2025 at 9:44 AM Dumitru Ceara <dce...@redhat.com> wrote:
>
>> On 9/10/25 7:59 AM, Ales Musil via dev wrote:
>> > We were removing routes while still iterating over them,
>> > while it might work it's not a good practice. Postpone
>> > the removal after the dump is done.
>> >
>> > Reported-at: https://issues.redhat.com/browse/FDP-1594
>> > Signed-off-by: Ales Musil <amu...@redhat.com>
>> > ---
>>
>> Hi Ales,
>>
>> Thanks for the fix!
>>
>> There's one CI test failure in the ovsrobot run of this patch but it's
>> not related to the change:
>>
>> https://github.com/ovsrobot/ovn/actions/runs/17604844865/job/50013775059#step:11:4605
>
>
> Thank you Dumitru,
>
> it seems that we have a new flake.
>
>
>>
>>
>> >  controller/route-exchange-netlink.c | 70 ++++++++++++++++++++---------
>> >  1 file changed, 50 insertions(+), 20 deletions(-)
>> >
>> > diff --git a/controller/route-exchange-netlink.c
>> b/controller/route-exchange-netlink.c
>> > index cec4b1ec3..91f059492 100644
>> > --- a/controller/route-exchange-netlink.c
>> > +++ b/controller/route-exchange-netlink.c
>> > @@ -200,9 +200,9 @@ struct route_msg_handle_data {
>> >      const struct sbrec_datapath_binding *db;
>> >      struct hmapx *routes_to_advertise;
>> >      struct vector *learned_routes;
>> > +    struct vector *stale_routes;
>> >      const struct hmap *routes;
>> >      uint32_t table_id; /* requested table id. */
>> > -    int ret;
>> >  };
>> >
>> >  static void
>> > @@ -211,7 +211,6 @@ handle_route_msg(const struct route_table_msg *msg,
>> void *data)
>> >      struct route_msg_handle_data *handle_data = data;
>> >      const struct route_data *rd = &msg->rd;
>> >      struct advertise_route_entry *ar;
>> > -    int err;
>> >
>> >      if (handle_data->table_id != rd->rta_table_id) {
>> >          /* We do not have the NLM_F_DUMP_FILTERED info here, so check
>> if the
>> > @@ -261,23 +260,37 @@ handle_route_msg(const struct route_table_msg
>> *msg, void *data)
>> >              }
>> >          }
>> >      }
>> > -    err = re_nl_delete_route(rd->rta_table_id, &rd->rta_dst,
>> > -                             rd->rtm_dst_len, rd->rta_priority);
>> > -    if (err) {
>> > -        char addr_s[INET6_ADDRSTRLEN + 1];
>> > -        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
>> > -        VLOG_WARN_RL(&rl, "Delete route table_id=%"PRIu32" dst=%s
>> plen=%d "
>> > -                     "failed: %s", rd->rta_table_id,
>> > -                     ipv6_string_mapped(
>> > -                         addr_s, &rd->rta_dst) ? addr_s : "(invalid)",
>> > -                     rd->rtm_dst_len,
>> > -                     ovs_strerror(err));
>> > -
>> > -        if (!handle_data->ret) {
>> > -            /* Report the first error value to the caller. */
>> > -            handle_data->ret = err;
>> > +
>> > +    if (handle_data->stale_routes) {
>> > +        vector_push(handle_data->stale_routes, rd);
>> > +    }
>> > +}
>> > +
>> > +static int
>> > +re_nl_delete_stale_routes(const struct vector *stale_routes)
>> > +{
>> > +    int ret = 0;
>> > +
>> > +    const struct route_data *rd;
>> > +    VECTOR_FOR_EACH_PTR (stale_routes, rd) {
>> > +        int err = re_nl_delete_route(rd->rta_table_id, &rd->rta_dst,
>> > +                                     rd->rtm_dst_len,
>> rd->rta_priority);
>> > +        if (err) {
>> > +            char addr_s[INET6_ADDRSTRLEN + 1];
>> > +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5,
>> 20);
>> > +            VLOG_WARN_RL(&rl, "Delete route table_id=%"PRIu32" dst=%s
>> plen=%d "
>> > +                         "failed: %s", rd->rta_table_id,
>> > +                         ipv6_string_mapped(
>> > +                             addr_s, &rd->rta_dst) ? addr_s :
>> "(invalid)",
>> > +                         rd->rtm_dst_len,
>> > +                         ovs_strerror(err));
>> > +            if (!ret) {
>> > +                ret = err;
>> > +            }
>> >          }
>> >      }
>> > +
>> > +    return ret;
>> >  }
>> >
>> >  int
>> > @@ -286,8 +299,9 @@ re_nl_sync_routes(uint32_t table_id, const struct
>> hmap *routes,
>> >                    const struct sbrec_datapath_binding *db)
>> >  {
>> >      struct hmapx routes_to_advertise =
>> HMAPX_INITIALIZER(&routes_to_advertise);
>> > +    struct vector stale_routes = VECTOR_EMPTY_INITIALIZER(struct
>> route_data);
>> >      struct advertise_route_entry *ar;
>> > -    int ret;
>> > +    int ret = 0;
>> >
>> >      HMAP_FOR_EACH (ar, node, routes) {
>> >          hmapx_add(&routes_to_advertise, ar);
>> > @@ -300,11 +314,11 @@ re_nl_sync_routes(uint32_t table_id, const struct
>> hmap *routes,
>> >          .routes = routes,
>> >          .routes_to_advertise = &routes_to_advertise,
>> >          .learned_routes = learned_routes,
>> > +        .stale_routes = &stale_routes,
>> >          .db = db,
>> >          .table_id = table_id,
>> >      };
>> >      route_table_dump_one_table(table_id, handle_route_msg, &data);
>> > -    ret = data.ret;
>> >
>> >      /* Add any remaining routes in the routes_to_advertise hmapx to the
>> >       * system routing table. */
>> > @@ -328,7 +342,14 @@ re_nl_sync_routes(uint32_t table_id, const struct
>> hmap *routes,
>> >              }
>> >          }
>> >      }
>> > +
>> > +    int err = re_nl_delete_stale_routes(&stale_routes);
>> > +    if (!ret) {
>> > +        ret = err;
>> > +    }
>> > +
>> >      hmapx_destroy(&routes_to_advertise);
>> > +    vector_destroy(&stale_routes);
>> >
>> >      return ret;
>> >  }
>> > @@ -336,15 +357,24 @@ re_nl_sync_routes(uint32_t table_id, const struct
>> hmap *routes,
>> >  int
>> >  re_nl_cleanup_routes(uint32_t table_id)
>> >  {
>> > +    int ret = 0;
>> > +    struct vector stale_routes = VECTOR_EMPTY_INITIALIZER(struct
>> route_data);
>> >      /* Remove routes from the system that are not in the host_routes
>> hmap and
>> >       * remove entries from host_routes hmap that match routes already
>> installed
>> >       * in the system. */
>> >      struct route_msg_handle_data data = {
>> >          .routes_to_advertise = NULL,
>> >          .learned_routes = NULL,
>> > +        .stale_routes = &stale_routes,
>> >          .table_id = table_id,
>> >      };
>> >      route_table_dump_one_table(table_id, handle_route_msg, &data);
>> >
>> > -    return data.ret;
>> > +    int err = re_nl_delete_stale_routes(&stale_routes);
>>
>> Nit: I think I would've just inlined the re_nl_delete_stale_routes()
>> code here like we do in ne_nl_sync_neigh().  It saves us the check below
>> and the function itself is not that large, IMO, it gives a better
>> overview of what's going on.
>>
>
> This one is actually called twice once in "re_nl_cleanup_routes()"
> and second in "re_nl_sync_routes()". That's why there is separate
> function. Which is different from "ne_nl_sync_neigh()" where deletion
> happens only once. With that said I'll leave it as is.
>
>
>> But I'll leave it up to you:
>>
>> Acked-by: Dumitru Ceara <dce...@redhat.com>
>>
>> > +    if (!ret) {
>> > +        ret = err;
>> > +    }
>> > +    vector_destroy(&stale_routes);
>> > +
>> > +    return ret;
>> >  }
>>
>> Regards,
>> Dumitru
>>
>>
> I went ahead and merged this into main and backported to 25.09. I will
> have to create a custom backport patch for 25.03, I'll send it out to the
> ML.
>
> Regards,
> Ales
>

As discussed offline this is the backport I'm going to apply, let me know
if it looks good to you.

https://github.com/almusil/ovn/commit/62722d8713c9ba4596c6976b29b7334aaa100dbf

Thanks,
Ales
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to