On 29 Jan 2026, at 19:57, Mike Pattrick via dev wrote:

> Currently there is no rate limit on tunnel hardware address ARP/ND
> lookups. Furthermote, there is no indication for how frequently action
> translation composes these packets.
>
> This patch implements a limit of generating one packet per destination
> per second and adds a counter for each time a lookup happens.
>
> Fixes: a36de779d739 ("openvswitch: Userspace tunneling.")
> Reported-at: https://issues.redhat.com/browse/FDP-2986
> Signed-off-by: Mike Pattrick <[email protected]>
> ---
>  NEWS                               |  2 +
>  lib/tnl-neigh-cache.c              | 97 ++++++++++++++++++++++++++----
>  lib/tnl-neigh-cache.h              |  2 +-
>  ofproto/ofproto-dpif-xlate-cache.c |  2 +-
>  ofproto/ofproto-dpif-xlate.c       | 12 +++-
>  tests/tunnel-push-pop-ipv6.at      | 19 +++++-
>  tests/tunnel-push-pop.at           | 15 ++++-
>  7 files changed, 130 insertions(+), 19 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index c3470b84e..809b4f114 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -1,5 +1,7 @@
>  Post-v3.7.0
>  --------------------
> +   - Userspace datapath:
> +     * ARP/ND lookups for native tunnel are now rate limited.
>
>
>  v3.7.0 - xx xxx xxxx
> diff --git a/lib/tnl-neigh-cache.c b/lib/tnl-neigh-cache.c
> index bdff1debc..78da2bc12 100644
> --- a/lib/tnl-neigh-cache.c
> +++ b/lib/tnl-neigh-cache.c
> @@ -47,6 +47,7 @@
>
>  #define NEIGH_ENTRY_DEFAULT_IDLE_TIME_MS  (15 * 60 * 1000)
>  #define NEIGH_ENTRY_MAX_AGING_TIME_S  3600
> +#define NEIGH_ENTRY_LOOKUP_HOLDOUT_MS   1000

Spacing looks odd here. I guess either a single space or align them.

>  struct tnl_neigh_entry {
>      struct cmap_node cmap_node;
> @@ -54,12 +55,16 @@ struct tnl_neigh_entry {
>      struct eth_addr mac;
>      atomic_llong expires;       /* Expiration time in ms. */
>      char br_name[IFNAMSIZ];
> +    atomic_bool complete;
>  };
>
>  static struct cmap table = CMAP_INITIALIZER;
>  static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
>  static atomic_uint32_t neigh_aging;
>
> +void
> +tnl_neigh_set_partial(const char name[IFNAMSIZ], const struct in6_addr *dst);
> +
>  static uint32_t
>  tnl_neigh_hash(const struct in6_addr *ip)
>  {
> @@ -85,6 +90,16 @@ tnl_neigh_get_aging(void)
>      return aging;
>  }
>
> +static bool
> +tnl_neigh_is_complete(struct tnl_neigh_entry *neigh)
> +{
> +    bool complete;
> +
> +    atomic_read_explicit(&neigh->complete, &complete, memory_order_acquire);
> +
> +    return complete;
> +}
> +
>  static struct tnl_neigh_entry *
>  tnl_neigh_lookup__(const char br_name[IFNAMSIZ], const struct in6_addr *dst)
>  {
> @@ -98,9 +113,12 @@ tnl_neigh_lookup__(const char br_name[IFNAMSIZ], const 
> struct in6_addr *dst)
>                  return NULL;
>              }
>
> -            atomic_store_explicit(&neigh->expires, time_msec() +
> -                                  tnl_neigh_get_aging(),
> -                                  memory_order_release);
> +            if (tnl_neigh_is_complete(neigh)) {
> +                atomic_store_explicit(&neigh->expires, time_msec() +
> +                                      tnl_neigh_get_aging(),
> +                                      memory_order_release);
Maybe;

                atomic_store_explicit(&neigh->expires,
                                      time_msec() + tnl_neigh_get_aging(),
                                      memory_order_release);

> +            }
> +
>              return neigh;
>          }
>      }
> @@ -109,16 +127,24 @@ tnl_neigh_lookup__(const char br_name[IFNAMSIZ], const 
> struct in6_addr *dst)
>
>  int
>  tnl_neigh_lookup(const char br_name[IFNAMSIZ], const struct in6_addr *dst,
> -                 struct eth_addr *mac)
> +                 struct eth_addr *mac, bool readonly)

nit; Would update_only make more sense than read_only?

>  {
>      struct tnl_neigh_entry *neigh;
>      int res = ENOENT;
>
>      neigh = tnl_neigh_lookup__(br_name, dst);
>      if (neigh) {
> -        *mac = neigh->mac;
> -        res = 0;
> +        if (tnl_neigh_is_complete(neigh)) {
> +            *mac = neigh->mac;
> +            res = 0;
> +        } else {
> +            res = EINPROGRESS;
> +        }
> +    } else if (!readonly) {
> +        /* Insert a partial entry. */
> +        tnl_neigh_set_partial(br_name, dst);
>      }
> +
>      return res;
>  }
>
> @@ -142,6 +168,7 @@ tnl_neigh_set(const char name[IFNAMSIZ], const struct 
> in6_addr *dst,
>  {
>      ovs_mutex_lock(&mutex);
>      struct tnl_neigh_entry *neigh = tnl_neigh_lookup__(name, dst);
> +    bool insert = true;
>      if (neigh) {
>          if (eth_addr_equals(neigh->mac, mac)) {
>              atomic_store_relaxed(&neigh->expires, time_msec() +
> @@ -149,18 +176,30 @@ tnl_neigh_set(const char name[IFNAMSIZ], const struct 
> in6_addr *dst,
>              ovs_mutex_unlock(&mutex);
>              return;
>          }
> -        tnl_neigh_delete(neigh);

I would add a new line here.

> +        if (tnl_neigh_is_complete(neigh)) {
> +            tnl_neigh_delete(neigh);
> +        } else {
> +            insert = false;
> +        }
>      }
> -    seq_change(tnl_conf_seq);
>
> -    neigh = xmalloc(sizeof *neigh);
> +    if (insert) {
> +        neigh = xmalloc(sizeof *neigh);
> +
> +        neigh->ip = *dst;
> +        ovs_strlcpy(neigh->br_name, name, sizeof neigh->br_name);
> +        insert = true;

Insert is already true in this path.

> +    }
> +    seq_change(tnl_conf_seq);
>
> -    neigh->ip = *dst;
>      neigh->mac = mac;
>      atomic_store_relaxed(&neigh->expires, time_msec() +
>                           tnl_neigh_get_aging());

See below, should only be relaxed if we insert it in the cmap.

> -    ovs_strlcpy(neigh->br_name, name, sizeof neigh->br_name);
> -    cmap_insert(&table, &neigh->cmap_node, tnl_neigh_hash(&neigh->ip));
> +    atomic_store_relaxed(&neigh->complete, true);

You are using relaxed here, but memory_order_acquire for reading? Should this 
be memory_order_release if you use this variable to synchronize?
I guess it should only be relaxed 'if (insert)'.

> +
> +    if (insert) {
> +        cmap_insert(&table, &neigh->cmap_node, tnl_neigh_hash(&neigh->ip));
> +    }
>      ovs_mutex_unlock(&mutex);
>  }
>
> @@ -172,6 +211,37 @@ tnl_arp_set(const char name[IFNAMSIZ], ovs_be32 dst,
>      tnl_neigh_set(name, &dst6, mac);
>  }
>
> +void
> +tnl_neigh_set_partial(const char name[IFNAMSIZ], const struct in6_addr *dst)
> +{
> +    ovs_mutex_lock(&mutex);
> +    struct tnl_neigh_entry *neigh = tnl_neigh_lookup__(name, dst);
> +    if (neigh) {
> +        if (!tnl_neigh_is_complete(neigh)) {
> +            atomic_store_relaxed(&neigh->expires, time_msec() +
> +                                 NEIGH_ENTRY_LOOKUP_HOLDOUT_MS);

This should not be relaxed as it’s already in the map.

> +            goto done;
> +        } else if (!tnl_neigh_expired(neigh)) {
> +            /* Case where initial lookup did not find a complete entry but it
> +             * was inserted before we aquired a lock. */

aquired  -> acquired

> +            goto done;
> +        }
> +        tnl_neigh_delete(neigh);
> +    }
> +    seq_change(tnl_conf_seq);
> +
> +    neigh = xmalloc(sizeof *neigh);
> +
> +    neigh->ip = *dst;
> +    neigh->complete = false;
> +    neigh->expires = time_msec() + NEIGH_ENTRY_LOOKUP_HOLDOUT_MS;

As the above two are atomic variables, we should use the appropriate setters.

> +    ovs_strlcpy(neigh->br_name, name, sizeof neigh->br_name);
> +    cmap_insert(&table, &neigh->cmap_node, tnl_neigh_hash(&neigh->ip));
> +
> +done:
> +    ovs_mutex_unlock(&mutex);
> +}
> +
>  static int
>  tnl_arp_snoop(const struct flow *flow, struct flow_wildcards *wc,
>                const char name[IFNAMSIZ], bool allow_update)
> @@ -381,6 +451,9 @@ tnl_neigh_cache_show(struct unixctl_conn *conn, int argc 
> OVS_UNUSED,
>      CMAP_FOR_EACH(neigh, cmap_node, &table) {
>          int start_len, need_ws;
>
> +        if (!tnl_neigh_is_complete(neigh)) {

I think we should display specific output here to make it clear that the entry 
is not complete.
Maybe something like this (and add a test case):

    ds_put_format(&ds, " INCOMPLETE");

> +            continue;
> +        }
>          start_len = ds.length;
>          ipv6_format_mapped(&neigh->ip, &ds);
>
> diff --git a/lib/tnl-neigh-cache.h b/lib/tnl-neigh-cache.h
> index 877bca312..03bd3e07c 100644
> --- a/lib/tnl-neigh-cache.h
> +++ b/lib/tnl-neigh-cache.h
> @@ -36,7 +36,7 @@ int tnl_neigh_snoop(const struct flow *flow, struct 
> flow_wildcards *wc,
>  void tnl_neigh_set(const char name[IFNAMSIZ], const struct in6_addr *dst,
>                     const struct eth_addr mac);
>  int tnl_neigh_lookup(const char dev_name[IFNAMSIZ], const struct in6_addr 
> *dst,
> -                     struct eth_addr *mac);
> +                     struct eth_addr *mac, bool readonly);
>  void tnl_neigh_cache_init(void);
>  void tnl_neigh_cache_run(void);
>  void tnl_neigh_flush(const char dev_name[IFNAMSIZ]);
> diff --git a/ofproto/ofproto-dpif-xlate-cache.c 
> b/ofproto/ofproto-dpif-xlate-cache.c
> index c6d935cf0..7dda67833 100644
> --- a/ofproto/ofproto-dpif-xlate-cache.c
> +++ b/ofproto/ofproto-dpif-xlate-cache.c
> @@ -152,7 +152,7 @@ xlate_push_stats_entry(struct xc_entry *entry,
>      case XC_TNL_NEIGH:
>          /* Lookup neighbor to avoid timeout. */
>          tnl_neigh_lookup(entry->tnl_neigh_cache.br_name,
> -                         &entry->tnl_neigh_cache.d_ipv6, &dmac);
> +                         &entry->tnl_neigh_cache.d_ipv6, &dmac, true);
>          break;
>      case XC_TUNNEL_HEADER:
>          if (entry->tunnel_hdr.operation == ADD) {
> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
> index f2db4723b..de068d6f4 100644
> --- a/ofproto/ofproto-dpif-xlate.c
> +++ b/ofproto/ofproto-dpif-xlate.c
> @@ -72,6 +72,7 @@
>  COVERAGE_DEFINE(xlate_actions);
>  COVERAGE_DEFINE(xlate_actions_oversize);
>  COVERAGE_DEFINE(xlate_actions_too_many_output);
> +COVERAGE_DEFINE(xlate_actions_sent_packet);

Should this be more specific? xlate_actions_neigh_sent
>
>  VLOG_DEFINE_THIS_MODULE(ofproto_dpif_xlate);
>
> @@ -3931,17 +3932,26 @@ native_tunnel_output(struct xlate_ctx *ctx, const 
> struct xport *xport,
>          s_ip = in6_addr_get_mapped_ipv4(&s_ip6);
>      }
>
> -    err = tnl_neigh_lookup(out_dev->xbridge->name, &d_ip6, &dmac);
> +    err = tnl_neigh_lookup(out_dev->xbridge->name, &d_ip6, &dmac, false);
>      if (err) {
>          struct in6_addr nh_s_ip6 = in6addr_any;
>
>          put_cloned_drop_action(ctx->xbridge->ofproto, ctx->odp_actions,
>                                 XLATE_TUNNEL_NEIGH_CACHE_MISS,
>                                 !is_last_action);
> +        if (err == EINPROGRESS) {
> +            xlate_report(ctx, OFT_DETAIL,
> +                         "neighbor cache miss for %s on bridge %s, "
> +                         "waiting on %s request",
> +                         buf_dip6, out_dev->xbridge->name,
> +                         d_ip ? "ARP" : "ND");
> +            return err;
> +        }
>          xlate_report(ctx, OFT_DETAIL,
>                       "neighbor cache miss for %s on bridge %s, "
>                       "sending %s request",
>                       buf_dip6, out_dev->xbridge->name, d_ip ? "ARP" : "ND");
> +        COVERAGE_INC(xlate_actions_sent_packet);
>
>          err = ovs_router_get_netdev_source_address(
>              &d_ip6, netdev_get_name(out_dev->netdev), &nh_s_ip6);
> diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at
> index b11387345..7263bf4f0 100644
> --- a/tests/tunnel-push-pop-ipv6.at
> +++ b/tests/tunnel-push-pop-ipv6.at
> @@ -344,7 +344,20 @@ AT_CHECK([ovs-ofctl add-flow br0 action=normal])
>  dnl Check Neighbour discovery.
>  AT_CHECK([ovs-vsctl -- set Interface p0 options:pcap=p0.pcap])
>
> -AT_CHECK([ovs-appctl netdev-dummy/receive int-br 
> 'in_port(2),eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),icmp(type=0,code=0)'])
> +dnl First trace should send only two ND lookups, for each destination IP
> +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(int-br),]dnl
> +  [eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),]dnl
> +  [ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),]dnl
> +  [icmp(type=0,code=0)"], [0], [stdout])
> +AT_CHECK([grep -q "sending ND request" stdout], [0])
> +
> +dnl Second trace should not send any ND lookups
> +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(int-br),]dnl
> +  [eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),]dnl
> +  [ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),]dnl
> +  [icmp(type=0,code=0)"], [0], [stdout])
> +AT_CHECK([grep -q "waiting on ND request" stdout], [0])
> +AT_CHECK([grep -qv "sending ND request" stdout], [0])
>
>  dnl Wait for the two Neighbor Solicitation packets to be sent.
>  dnl Sometimes the system can be slow (e.g. under valgrind)
> @@ -577,8 +590,8 @@ 
> icmp,vlan_tci=0x0000,dl_src=be:b6:f4:e1:49:4a,dl_dst=fe:71:d8:83:72:4f,nw_src=30
>  AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port  5'], [0], [dnl
>    port  5: rx pkts=1, bytes=98, drop=?, errs=?, frame=?, over=?, crc=?
>  ])
> -AT_CHECK([ovs-appctl dpif/dump-flows int-br | grep 'in_port(6081)'], [0], 
> [dnl
> -recirc_id(0),tunnel(tun_id=0x7b,ipv6_src=2001:cafe::92,ipv6_dst=2001:cafe::88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(+key)),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
>  packets:0, bytes:0, used:never, 
> actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=3,rule_cookie=0,controller_id=0,max_len=65535))
> +AT_CHECK([ovs-appctl dpif/dump-flows int-br | grep 'in_port(6081)' | 
> strip_recirc], [0], [dnl
> +recirc_id(<recirc>),tunnel(tun_id=0x7b,ipv6_src=2001:cafe::92,ipv6_dst=2001:cafe::88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(+key)),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
>  packets:0, bytes:0, used:never, 
> actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=<recirc>,rule_cookie=0,controller_id=0,max_len=65535))
>  ])
>
>  dnl Receive VXLAN with different MAC and verify that the neigh cache gets 
> updated
> diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at
> index f22a37570..e7211149b 100644
> --- a/tests/tunnel-push-pop.at
> +++ b/tests/tunnel-push-pop.at
> @@ -256,7 +256,20 @@ AT_CHECK([ovs-appctl revalidator/wait])
>  dnl Check ARP request
>  AT_CHECK([ovs-vsctl -- set Interface p0 options:pcap=p0.pcap])
>
> -AT_CHECK([ovs-appctl netdev-dummy/receive int-br 
> 'in_port(2),eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),icmp(type=0,code=0)'])
> +dnl First trace should send only two ARP lookups, for each destination IP
> +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(int-br),]dnl
> +  [eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),]dnl
> +  [ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),]dnl
> +  [icmp(type=0,code=0)"], [0], [stdout])
> +AT_CHECK([grep -q "sending ARP request" stdout], [0])
> +
> +dnl Second trace should not send any ARP lookups
> +AT_CHECK([[ovs-appctl ofproto/trace ovs-dummy "in_port(int-br),]dnl
> +  [eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),]dnl
> +  [ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),]dnl
> +  [icmp(type=0,code=0)"]], [0], [stdout])
> +AT_CHECK([grep -q "waiting on ARP request" stdout], [0])
> +AT_CHECK([grep -qv "sending ARP request" stdout], [0])
>
>  dnl Wait for the two ARP requests to be sent. Sometimes the system
>  dnl can be slow (e.g. under valgrind)
> -- 
> 2.52.0
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to