On 2025-08-01 10:43 PM, Ilya Maximets wrote: 
> On 7/23/25 3:12 PM, Dima Chumak via dev wrote:
>> For route lookup based on the destination address a single routing table
>> is sufficient. For a more advanced routing when a lookup needs to take
>> into consideration other parameters, such as a source address, a
>> multi-table lookup is needed.
>>
>> This change introduces infrastructure for using multiple routing
>> tables that can be added dynamically, as a pre-step towards importing
>> non-default routing tables from kernel.
>>
>> Signed-off-by: Dima Chumak <dchu...@nvidia.com>
>> ---
>>  lib/netdev-dummy.c     |   6 +-
>>  lib/ovs-router.c       | 135 ++++++++++++++++++++++++++++++++---------
>>  lib/ovs-router.h       |  13 +++-
>>  lib/route-table-bsd.c  |   3 +-
>>  lib/route-table-stub.c |   3 +-
>>  lib/route-table.c      |   4 +-
>>  6 files changed, 124 insertions(+), 40 deletions(-)
>>
>> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
>> index b72820fcc506..98efb9faf03a 100644
>> --- a/lib/netdev-dummy.c
>> +++ b/lib/netdev-dummy.c
>> @@ -2223,8 +2223,8 @@ netdev_dummy_ip4addr(struct unixctl_conn *conn, int 
>> argc OVS_UNUSED,
>>
>>              /* Insert local route entry for the new address. */
>>              in6_addr_set_mapped_ipv4(&ip6, ip.s_addr);
>> -            ovs_router_force_insert(0, &ip6, plen + 96, true, argv[1],
>> -                                    &in6addr_any, &ip6);
>> +            ovs_router_force_insert(CLS_MAIN, 0, &ip6, plen + 96, true,
>> +                                    argv[1], &in6addr_any, &ip6);
>>
>>              unixctl_command_reply(conn, "OK");
>>          } else {
>> @@ -2257,7 +2257,7 @@ netdev_dummy_ip6addr(struct unixctl_conn *conn, int 
>> argc OVS_UNUSED,
>>              netdev_dummy_add_in6(netdev, &ip6, &mask);
>>
>>              /* Insert local route entry for the new address. */
>> -            ovs_router_force_insert(0, &ip6, plen, true, argv[1],
>> +            ovs_router_force_insert(CLS_MAIN, 0, &ip6, plen, true, argv[1],
>>                                      &in6addr_any, &ip6);
>>
>>              unixctl_command_reply(conn, "OK");
>> diff --git a/lib/ovs-router.c b/lib/ovs-router.c
>> index 2827a6e43574..02dd94fd18c0 100644
>> --- a/lib/ovs-router.c
>> +++ b/lib/ovs-router.c
>> @@ -50,10 +50,17 @@
>>
>>  VLOG_DEFINE_THIS_MODULE(ovs_router);
>>
>> +struct clsmap_node {
>> +    struct hmap_node hash_node;
>> +    struct classifier cls;
>> +    uint32_t table;
>> +};
>> +
>>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>
>>  static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
>> -static struct classifier cls;
>> +static struct hmap clsmap OVS_GUARDED_BY(mutex) = HMAP_INITIALIZER(&clsmap);
>> +static struct classifier cls_main;
>>
>>  /* By default, use the system routing table.  For system-independent 
>> testing,
>>   * the unit tests disable using the system routing table. */
>> @@ -71,6 +78,59 @@ struct ovs_router_entry {
>>      uint32_t mark;
>>  };
>>
>> +static void
>> +rt_entry_delete__(const struct cls_rule *cr, struct classifier *cls);
>> +
>> +static struct classifier *
>> +cls_find(uint32_t table)
>> +{
>> +    struct clsmap_node *node;
>> +
>> +    if (route_table_is_standard_id(table)) {
>> +        return &cls_main;
>> +    }
> 
> Why do we need to special-case these tables?  If we just treat all the
> tables equally, this should save us a decent amount of code.
> 
>> +
>> +    ovs_mutex_lock(&mutex);
> 
> Taking a mutex while processing a packet can be problematic from the
> performance perspective.  We should likely use cmap instead of the
> simple hash map, so the mutex will be needed only when we add or remove
> a new table.
> 
>> +    HMAP_FOR_EACH_IN_BUCKET (node, hash_node, hash_int(table, 0), &clsmap) {
>> +        if (node->table == table) {
>> +            ovs_mutex_unlock(&mutex);
>> +            return &node->cls;
>> +        }
>> +    }
>> +    ovs_mutex_unlock(&mutex);
>> +
>> +    return NULL;
>> +}
>> +
>> +static struct classifier *
>> +cls_create(uint32_t table)
>> +{
>> +    struct clsmap_node *node;
>> +
>> +    node = xmalloc(sizeof *node);
>> +    classifier_init(&node->cls, NULL);
>> +    node->table = table;
>> +    ovs_mutex_lock(&mutex);
>> +    hmap_insert(&clsmap, &node->hash_node, hash_int(table, 0));
>> +    ovs_mutex_unlock(&mutex);
>> +
>> +    return &node->cls;
>> +}
>> +
>> +static void
>> +cls_destroy(struct classifier *cls, bool flush_all)
>> +{
>> +    struct ovs_router_entry *rt;
>> +
>> +    classifier_defer(cls);
>> +    CLS_FOR_EACH (rt, cr, cls) {
>> +        if (flush_all || rt->priority == rt->plen || rt->local) {
>> +            rt_entry_delete__(&rt->cr, cls);
>> +        }
>> +    }
>> +    classifier_publish(cls);
>> +}
>> +
>>  static struct ovs_router_entry *
>>  ovs_router_entry_cast(const struct cls_rule *cr)
>>  {
>> @@ -117,8 +177,8 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr 
>> *ip6_dst,
>>          const struct cls_rule *cr_src;
>>          struct flow flow_src = {.ipv6_dst = *src, .pkt_mark = mark};
>>
>> -        cr_src = classifier_lookup(&cls, OVS_VERSION_MAX, &flow_src, NULL,
>> -                                   NULL);
>> +        cr_src = classifier_lookup(&cls_main, OVS_VERSION_MAX, &flow_src,
>> +                                   NULL, NULL);
>>          if (cr_src) {
>>              struct ovs_router_entry *p_src = ovs_router_entry_cast(cr_src);
>>              if (!p_src->local) {
>> @@ -129,7 +189,7 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr 
>> *ip6_dst,
>>          }
>>      }
>>
>> -    cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL, NULL);
>> +    cr = classifier_lookup(&cls_main, OVS_VERSION_MAX, &flow, NULL, NULL);
>>      if (cr) {
>>          struct ovs_router_entry *p = ovs_router_entry_cast(cr);
>>
>> @@ -257,8 +317,8 @@ out:
>>  }
>>
>>  static int
>> -ovs_router_insert__(uint32_t mark, uint8_t priority, bool local,
>> -                    const struct in6_addr *ip6_dst,
>> +ovs_router_insert__(uint32_t table, uint32_t mark, uint8_t priority,
>> +                    bool local, const struct in6_addr *ip6_dst,
>>                      uint8_t plen, const char output_netdev[],
>>                      const struct in6_addr *gw,
>>                      const struct in6_addr *ip6_src)
>> @@ -268,6 +328,7 @@ ovs_router_insert__(uint32_t mark, uint8_t priority, 
>> bool local,
>>                          struct in6_addr *prefsrc);
>>      const struct cls_rule *cr;
>>      struct ovs_router_entry *p;
>> +    struct classifier *cls;
>>      struct match match;
>>      int err;
>>
>> @@ -307,8 +368,12 @@ ovs_router_insert__(uint32_t mark, uint8_t priority, 
>> bool local,
>>      /* Longest prefix matches first. */
>>      cls_rule_init(&p->cr, &match, priority);
>>
>> +    cls = cls_find(table);
>> +    if (!cls) {
>> +        cls = cls_create(table);
>> +    }
>>      ovs_mutex_lock(&mutex);
>> -    cr = classifier_replace(&cls, &p->cr, OVS_VERSION_MIN, NULL, 0);
>> +    cr = classifier_replace(cls, &p->cr, OVS_VERSION_MIN, NULL, 0);
>>      ovs_mutex_unlock(&mutex);
>>
>>      if (cr) {
>> @@ -321,13 +386,13 @@ ovs_router_insert__(uint32_t mark, uint8_t priority, 
>> bool local,
>>  }
>>
>>  void
>> -ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t 
>> plen,
>> -                  bool local, const char output_netdev[],
>> +ovs_router_insert(uint32_t table, uint32_t mark, const struct in6_addr 
>> *ip_dst,
>> +                  uint8_t plen, bool local, const char output_netdev[],
>>                    const struct in6_addr *gw, const struct in6_addr *prefsrc)
>>  {
>>      if (use_system_routing_table) {
>>          uint8_t priority = local ? plen + 64 : plen;
>> -        ovs_router_insert__(mark, priority, local, ip_dst, plen,
>> +        ovs_router_insert__(table, mark, priority, local, ip_dst, plen,
>>                              output_netdev, gw, prefsrc);
>>      }
>>  }
>> @@ -335,24 +400,25 @@ ovs_router_insert(uint32_t mark, const struct in6_addr 
>> *ip_dst, uint8_t plen,
>>  /* The same as 'ovs_router_insert', but it adds the route even if updates
>>   * from the system routing table are disabled.  Used for unit tests. */
>>  void
>> -ovs_router_force_insert(uint32_t mark, const struct in6_addr *ip_dst,
>> +ovs_router_force_insert(uint32_t table, uint32_t mark,
>> +                        const struct in6_addr *ip_dst,
>>                          uint8_t plen, bool local, const char 
>> output_netdev[],
>>                          const struct in6_addr *gw,
>>                          const struct in6_addr *prefsrc)
>>  {
>>      uint8_t priority = local ? plen + 64 : plen;
>>
>> -    ovs_router_insert__(mark, priority, local, ip_dst, plen,
>> +    ovs_router_insert__(table, mark, priority, local, ip_dst, plen,
>>                          output_netdev, gw, prefsrc);
>>  }
>>
>>  static void
>> -rt_entry_delete__(const struct cls_rule *cr)
>> +rt_entry_delete__(const struct cls_rule *cr, struct classifier *cls)
>>  {
>>      struct ovs_router_entry *p = ovs_router_entry_cast(cr);
>>
>>      tnl_port_map_delete_ipdev(p->output_netdev);
>> -    classifier_remove_assert(&cls, cr);
>> +    classifier_remove_assert(cls, cr);
>>      ovsrcu_postpone(rt_entry_free, ovs_router_entry_cast(cr));
>>  }
>>
>> @@ -370,10 +436,10 @@ rt_entry_delete(uint32_t mark, uint8_t priority,
>>      cls_rule_init(&rule, &match, priority);
>>
>>      /* Find the exact rule. */
>> -    cr = classifier_find_rule_exactly(&cls, &rule, OVS_VERSION_MAX);
>> +    cr = classifier_find_rule_exactly(&cls_main, &rule, OVS_VERSION_MAX);
>>      if (cr) {
>>          ovs_mutex_lock(&mutex);
>> -        rt_entry_delete__(cr);
>> +        rt_entry_delete__(cr, &cls_main);
>>          ovs_mutex_unlock(&mutex);
>>
>>          res = true;
>> @@ -469,8 +535,8 @@ ovs_router_add(struct unixctl_conn *conn, int argc,
>>          in6_addr_set_mapped_ipv4(&src6, src);
>>      }
>>
>> -    err = ovs_router_insert__(mark, plen + 32, false, &ip6, plen, argv[2],
>> -                              &gw6, &src6);
>> +    err = ovs_router_insert__(CLS_MAIN, mark, plen + 32, false, &ip6, plen,
>> +                              argv[2], &gw6, &src6);
>>      if (err) {
>>          unixctl_command_reply_error(conn, "Error while inserting route.");
>>      } else {
>> @@ -512,7 +578,7 @@ ovs_router_del(struct unixctl_conn *conn, int argc 
>> OVS_UNUSED,
>>  static void
>>  ovs_router_show_json(struct json **routes)
>>  {
>> -    int n_rules = classifier_count(&cls);
>> +    int n_rules = classifier_count(&cls_main);
>>      struct json **json_entries = NULL;
>>      struct ovs_router_entry *rt;
>>      struct ds ds;
>> @@ -525,7 +591,7 @@ ovs_router_show_json(struct json **routes)
>>      json_entries = xmalloc(n_rules * sizeof *json_entries);
>>      ds_init(&ds);
>>
>> -    CLS_FOR_EACH (rt, cr, &cls) {
>> +    CLS_FOR_EACH (rt, cr, &cls_main) {
>>          bool user = rt->priority != rt->plen && !rt->local;
>>          uint8_t plen = rt->plen;
>>          struct json *json, *nh;
>> @@ -581,7 +647,7 @@ ovs_router_show_text(struct ds *ds)
>>      struct ovs_router_entry *rt;
>>
>>      ds_put_format(ds, "Route Table:\n");
>> -    CLS_FOR_EACH (rt, cr, &cls) {
>> +    CLS_FOR_EACH (rt, cr, &cls_main) {
>>          uint8_t plen;
>>          if (rt->priority == rt->plen || rt->local) {
>>              ds_put_format(ds, "Cached: ");
>> @@ -669,18 +735,19 @@ ovs_router_lookup_cmd(struct unixctl_conn *conn, int 
>> argc,
>>  }
>>
>>  void
>> -ovs_router_flush(void)
>> +ovs_router_flush(bool flush_all)
>>  {
>> -    struct ovs_router_entry *rt;
>> +    struct clsmap_node *node;
>>
>>      ovs_mutex_lock(&mutex);
>> -    classifier_defer(&cls);
>> -    CLS_FOR_EACH(rt, cr, &cls) {
>> -        if (rt->priority == rt->plen || rt->local) {
>> -            rt_entry_delete__(&rt->cr);
>> +    HMAP_FOR_EACH_POP (node, hash_node, &clsmap) {
>> +        cls_destroy(&node->cls, flush_all);
>> +        if (!node->cls.n_rules) {
>> +            classifier_destroy(&node->cls);
>> +            free(node);
>>          }
> 
> This feels like there will be a memory leak in case there are still rules in
> this classifier.  It also feels strange that the function named cls_destroy
> doesn't actually destroy the classifier and can even leave some rules in it.
> 

Probably something like cls_flush() would be a better name to indicate
that some entries may not be deleted?

I tried to preserve the current behaviour that we have for standard
tables, where user added entries are not flushed, and apply the same
logic to custom tables as well.

The intention behind flush_all argument is to enable flushing
everything, including user entries, on ovs shutdown to avoid leaks that
address sanitizer reported for some router related unit tests.

At the moment ASAN check doesn't show any leaks but I can add a more
specific unit test to check potential leaks for user added routes and
rules, what do you think?

P.S. For all other comments they're all clear to me and I can start
preparing V3 with the changes. Thanks.

>>      }
>> -    classifier_publish(&cls);
>> +    cls_destroy(&cls_main, flush_all);
>>      ovs_mutex_unlock(&mutex);
>>      seq_change(tnl_conf_seq);
>>  }
>> @@ -688,7 +755,12 @@ ovs_router_flush(void)
>>  static void
>>  ovs_router_flush_handler(void *aux OVS_UNUSED)
>>  {
>> -    ovs_router_flush();
>> +    ovs_router_flush(true);
>> +
>> +    ovs_mutex_lock(&mutex);
>> +    hmap_destroy(&clsmap);
>> +    hmap_init(&clsmap);
>> +    ovs_mutex_unlock(&mutex);
>>  }
>>
>>  void
>> @@ -697,8 +769,11 @@ ovs_router_init(void)
>>      static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
>>
>>      if (ovsthread_once_start(&once)) {
>> +        ovs_mutex_lock(&mutex);
>> +        hmap_init(&clsmap);
>> +        classifier_init(&cls_main, NULL);
>> +        ovs_mutex_unlock(&mutex);
>>          fatal_signal_add_hook(ovs_router_flush_handler, NULL, NULL, true);
>> -        classifier_init(&cls, NULL);
>>          unixctl_command_register("ovs/route/add",
>>                                   "ip/plen dev [gw] "
>>                                   "[pkt_mark=mark] [src=src_ip]",
>> diff --git a/lib/ovs-router.h b/lib/ovs-router.h
>> index b61712707b1e..e2d995f3a8dc 100644
>> --- a/lib/ovs-router.h
>> +++ b/lib/ovs-router.h
>> @@ -26,20 +26,27 @@
>>  extern "C" {
>>  #endif
>>
>> +enum {
>> +    CLS_MAIN = 254,    /* RT_TABLE_MAIN */
>> +    CLS_ALL = UINT_MAX,
>> +};
>> +
>>  bool ovs_router_lookup(uint32_t mark, const struct in6_addr *ip_dst,
>>                         char output_netdev[],
>>                         struct in6_addr *src, struct in6_addr *gw);
>>  void ovs_router_init(void);
>> -void ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst,
>> +void ovs_router_insert(uint32_t table, uint32_t mark,
>> +                       const struct in6_addr *ip_dst,
>>                         uint8_t plen, bool local,
>>                         const char output_netdev[], const struct in6_addr 
>> *gw,
>>                         const struct in6_addr *prefsrc);
>> -void ovs_router_force_insert(uint32_t mark, const struct in6_addr *ip_dst,
>> +void ovs_router_force_insert(uint32_t table, uint32_t mark,
>> +                             const struct in6_addr *ip_dst,
>>                               uint8_t plen, bool local,
>>                               const char output_netdev[],
>>                               const struct in6_addr *gw,
>>                               const struct in6_addr *prefsrc);
>> -void ovs_router_flush(void);
>> +void ovs_router_flush(bool flush_all);
>>
>>  void ovs_router_disable_system_routing_table(void);
>>
>> diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
>> index a4eccb913916..6488df2637c8 100644
>> --- a/lib/route-table-bsd.c
>> +++ b/lib/route-table-bsd.c
>> @@ -46,7 +46,8 @@ VLOG_DEFINE_THIS_MODULE(route_table_bsd);
>>  bool
>>  route_table_is_standard_id(uint32_t table_id)
>>  {
>> -    return !table_id;
>> +    return !table_id
>> +           || table_id == CLS_MAIN;
>>  }
>>
>>  bool
>> diff --git a/lib/route-table-stub.c b/lib/route-table-stub.c
>> index 75e5bf37e3b5..5bf00e820796 100644
>> --- a/lib/route-table-stub.c
>> +++ b/lib/route-table-stub.c
>> @@ -21,7 +21,8 @@
>>  bool
>>  route_table_is_standard_id(uint32_t table_id)
>>  {
>> -    return !table_id;
>> +    return !table_id
>> +           || table_id == CLS_MAIN;
>>  }
>>
>>  bool
>> diff --git a/lib/route-table.c b/lib/route-table.c
>> index 4b93339c93c9..da7b276544e5 100644
>> --- a/lib/route-table.c
>> +++ b/lib/route-table.c
>> @@ -556,7 +556,7 @@ route_table_handle_msg(const struct route_table_msg 
>> *change,
>>          rdnh = CONTAINER_OF(ovs_list_front(&change->rd.nexthops),
>>                              const struct route_data_nexthop, nexthop_node);
>>
>> -        ovs_router_insert(rd->rta_mark, &rd->rta_dst,
>> +        ovs_router_insert(CLS_MAIN, rd->rta_mark, &rd->rta_dst,
>>                            IN6_IS_ADDR_V4MAPPED(&rd->rta_dst)
>>                            ? rd->rtm_dst_len + 96 : rd->rtm_dst_len,
>>                            rd->rtn_local, rdnh->ifname, &rdnh->addr,
>> @@ -567,7 +567,7 @@ route_table_handle_msg(const struct route_table_msg 
>> *change,
>>  static void
>>  route_map_clear(void)
>>  {
>> -    ovs_router_flush();
>> +    ovs_router_flush(false);
>>  }
>>
>>  bool
> 

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to