On 29 Jan 2026, at 19:57, Mike Pattrick via dev wrote:
> Currently there is no rate limit on tunnel hardware address ARP/ND
> lookups. Furthermote, there is no indication for how frequently action
> translation composes these packets.
>
> This patch implements a limit of generating one packet per destination
> per second and adds a counter for each time a lookup happens.
>
> Fixes: a36de779d739 ("openvswitch: Userspace tunneling.")
> Reported-at: https://issues.redhat.com/browse/FDP-2986
> Signed-off-by: Mike Pattrick <[email protected]>
> ---
> NEWS | 2 +
> lib/tnl-neigh-cache.c | 97 ++++++++++++++++++++++++++----
> lib/tnl-neigh-cache.h | 2 +-
> ofproto/ofproto-dpif-xlate-cache.c | 2 +-
> ofproto/ofproto-dpif-xlate.c | 12 +++-
> tests/tunnel-push-pop-ipv6.at | 19 +++++-
> tests/tunnel-push-pop.at | 15 ++++-
> 7 files changed, 130 insertions(+), 19 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index c3470b84e..809b4f114 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -1,5 +1,7 @@
> Post-v3.7.0
> --------------------
> + - Userspace datapath:
> + * ARP/ND lookups for native tunnel are now rate limited.
>
>
> v3.7.0 - xx xxx xxxx
> diff --git a/lib/tnl-neigh-cache.c b/lib/tnl-neigh-cache.c
> index bdff1debc..78da2bc12 100644
> --- a/lib/tnl-neigh-cache.c
> +++ b/lib/tnl-neigh-cache.c
> @@ -47,6 +47,7 @@
>
> #define NEIGH_ENTRY_DEFAULT_IDLE_TIME_MS (15 * 60 * 1000)
> #define NEIGH_ENTRY_MAX_AGING_TIME_S 3600
> +#define NEIGH_ENTRY_LOOKUP_HOLDOUT_MS 1000
Spacing looks odd here. I guess either a single space or align them.
> struct tnl_neigh_entry {
> struct cmap_node cmap_node;
> @@ -54,12 +55,16 @@ struct tnl_neigh_entry {
> struct eth_addr mac;
> atomic_llong expires; /* Expiration time in ms. */
> char br_name[IFNAMSIZ];
> + atomic_bool complete;
> };
>
> static struct cmap table = CMAP_INITIALIZER;
> static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
> static atomic_uint32_t neigh_aging;
>
> +void
> +tnl_neigh_set_partial(const char name[IFNAMSIZ], const struct in6_addr *dst);
> +
> static uint32_t
> tnl_neigh_hash(const struct in6_addr *ip)
> {
> @@ -85,6 +90,16 @@ tnl_neigh_get_aging(void)
> return aging;
> }
>
> +static bool
> +tnl_neigh_is_complete(struct tnl_neigh_entry *neigh)
> +{
> + bool complete;
> +
> + atomic_read_explicit(&neigh->complete, &complete, memory_order_acquire);
> +
> + return complete;
> +}
> +
> static struct tnl_neigh_entry *
> tnl_neigh_lookup__(const char br_name[IFNAMSIZ], const struct in6_addr *dst)
> {
> @@ -98,9 +113,12 @@ tnl_neigh_lookup__(const char br_name[IFNAMSIZ], const
> struct in6_addr *dst)
> return NULL;
> }
>
> - atomic_store_explicit(&neigh->expires, time_msec() +
> - tnl_neigh_get_aging(),
> - memory_order_release);
> + if (tnl_neigh_is_complete(neigh)) {
> + atomic_store_explicit(&neigh->expires, time_msec() +
> + tnl_neigh_get_aging(),
> + memory_order_release);
Maybe;
atomic_store_explicit(&neigh->expires,
time_msec() + tnl_neigh_get_aging(),
memory_order_release);
> + }
> +
> return neigh;
> }
> }
> @@ -109,16 +127,24 @@ tnl_neigh_lookup__(const char br_name[IFNAMSIZ], const
> struct in6_addr *dst)
>
> int
> tnl_neigh_lookup(const char br_name[IFNAMSIZ], const struct in6_addr *dst,
> - struct eth_addr *mac)
> + struct eth_addr *mac, bool readonly)
nit; Would update_only make more sense than read_only?
> {
> struct tnl_neigh_entry *neigh;
> int res = ENOENT;
>
> neigh = tnl_neigh_lookup__(br_name, dst);
> if (neigh) {
> - *mac = neigh->mac;
> - res = 0;
> + if (tnl_neigh_is_complete(neigh)) {
> + *mac = neigh->mac;
> + res = 0;
> + } else {
> + res = EINPROGRESS;
> + }
> + } else if (!readonly) {
> + /* Insert a partial entry. */
> + tnl_neigh_set_partial(br_name, dst);
> }
> +
> return res;
> }
>
> @@ -142,6 +168,7 @@ tnl_neigh_set(const char name[IFNAMSIZ], const struct
> in6_addr *dst,
> {
> ovs_mutex_lock(&mutex);
> struct tnl_neigh_entry *neigh = tnl_neigh_lookup__(name, dst);
> + bool insert = true;
> if (neigh) {
> if (eth_addr_equals(neigh->mac, mac)) {
> atomic_store_relaxed(&neigh->expires, time_msec() +
> @@ -149,18 +176,30 @@ tnl_neigh_set(const char name[IFNAMSIZ], const struct
> in6_addr *dst,
> ovs_mutex_unlock(&mutex);
> return;
> }
> - tnl_neigh_delete(neigh);
I would add a new line here.
> + if (tnl_neigh_is_complete(neigh)) {
> + tnl_neigh_delete(neigh);
> + } else {
> + insert = false;
> + }
> }
> - seq_change(tnl_conf_seq);
>
> - neigh = xmalloc(sizeof *neigh);
> + if (insert) {
> + neigh = xmalloc(sizeof *neigh);
> +
> + neigh->ip = *dst;
> + ovs_strlcpy(neigh->br_name, name, sizeof neigh->br_name);
> + insert = true;
Insert is already true in this path.
> + }
> + seq_change(tnl_conf_seq);
>
> - neigh->ip = *dst;
> neigh->mac = mac;
> atomic_store_relaxed(&neigh->expires, time_msec() +
> tnl_neigh_get_aging());
See below, should only be relaxed if we insert it in the cmap.
> - ovs_strlcpy(neigh->br_name, name, sizeof neigh->br_name);
> - cmap_insert(&table, &neigh->cmap_node, tnl_neigh_hash(&neigh->ip));
> + atomic_store_relaxed(&neigh->complete, true);
You are using relaxed here, but memory_order_acquire for reading? Should this
be memory_order_release if you use this variable to synchronize?
I guess it should only be relaxed 'if (insert)'.
> +
> + if (insert) {
> + cmap_insert(&table, &neigh->cmap_node, tnl_neigh_hash(&neigh->ip));
> + }
> ovs_mutex_unlock(&mutex);
> }
>
> @@ -172,6 +211,37 @@ tnl_arp_set(const char name[IFNAMSIZ], ovs_be32 dst,
> tnl_neigh_set(name, &dst6, mac);
> }
>
> +void
> +tnl_neigh_set_partial(const char name[IFNAMSIZ], const struct in6_addr *dst)
> +{
> + ovs_mutex_lock(&mutex);
> + struct tnl_neigh_entry *neigh = tnl_neigh_lookup__(name, dst);
> + if (neigh) {
> + if (!tnl_neigh_is_complete(neigh)) {
> + atomic_store_relaxed(&neigh->expires, time_msec() +
> + NEIGH_ENTRY_LOOKUP_HOLDOUT_MS);
This should not be relaxed as it’s already in the map.
> + goto done;
> + } else if (!tnl_neigh_expired(neigh)) {
> + /* Case where initial lookup did not find a complete entry but it
> + * was inserted before we aquired a lock. */
aquired -> acquired
> + goto done;
> + }
> + tnl_neigh_delete(neigh);
> + }
> + seq_change(tnl_conf_seq);
> +
> + neigh = xmalloc(sizeof *neigh);
> +
> + neigh->ip = *dst;
> + neigh->complete = false;
> + neigh->expires = time_msec() + NEIGH_ENTRY_LOOKUP_HOLDOUT_MS;
As the above two are atomic variables, we should use the appropriate setters.
> + ovs_strlcpy(neigh->br_name, name, sizeof neigh->br_name);
> + cmap_insert(&table, &neigh->cmap_node, tnl_neigh_hash(&neigh->ip));
> +
> +done:
> + ovs_mutex_unlock(&mutex);
> +}
> +
> static int
> tnl_arp_snoop(const struct flow *flow, struct flow_wildcards *wc,
> const char name[IFNAMSIZ], bool allow_update)
> @@ -381,6 +451,9 @@ tnl_neigh_cache_show(struct unixctl_conn *conn, int argc
> OVS_UNUSED,
> CMAP_FOR_EACH(neigh, cmap_node, &table) {
> int start_len, need_ws;
>
> + if (!tnl_neigh_is_complete(neigh)) {
I think we should display specific output here to make it clear that the entry
is not complete.
Maybe something like this (and add a test case):
ds_put_format(&ds, " INCOMPLETE");
> + continue;
> + }
> start_len = ds.length;
> ipv6_format_mapped(&neigh->ip, &ds);
>
> diff --git a/lib/tnl-neigh-cache.h b/lib/tnl-neigh-cache.h
> index 877bca312..03bd3e07c 100644
> --- a/lib/tnl-neigh-cache.h
> +++ b/lib/tnl-neigh-cache.h
> @@ -36,7 +36,7 @@ int tnl_neigh_snoop(const struct flow *flow, struct
> flow_wildcards *wc,
> void tnl_neigh_set(const char name[IFNAMSIZ], const struct in6_addr *dst,
> const struct eth_addr mac);
> int tnl_neigh_lookup(const char dev_name[IFNAMSIZ], const struct in6_addr
> *dst,
> - struct eth_addr *mac);
> + struct eth_addr *mac, bool readonly);
> void tnl_neigh_cache_init(void);
> void tnl_neigh_cache_run(void);
> void tnl_neigh_flush(const char dev_name[IFNAMSIZ]);
> diff --git a/ofproto/ofproto-dpif-xlate-cache.c
> b/ofproto/ofproto-dpif-xlate-cache.c
> index c6d935cf0..7dda67833 100644
> --- a/ofproto/ofproto-dpif-xlate-cache.c
> +++ b/ofproto/ofproto-dpif-xlate-cache.c
> @@ -152,7 +152,7 @@ xlate_push_stats_entry(struct xc_entry *entry,
> case XC_TNL_NEIGH:
> /* Lookup neighbor to avoid timeout. */
> tnl_neigh_lookup(entry->tnl_neigh_cache.br_name,
> - &entry->tnl_neigh_cache.d_ipv6, &dmac);
> + &entry->tnl_neigh_cache.d_ipv6, &dmac, true);
> break;
> case XC_TUNNEL_HEADER:
> if (entry->tunnel_hdr.operation == ADD) {
> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
> index f2db4723b..de068d6f4 100644
> --- a/ofproto/ofproto-dpif-xlate.c
> +++ b/ofproto/ofproto-dpif-xlate.c
> @@ -72,6 +72,7 @@
> COVERAGE_DEFINE(xlate_actions);
> COVERAGE_DEFINE(xlate_actions_oversize);
> COVERAGE_DEFINE(xlate_actions_too_many_output);
> +COVERAGE_DEFINE(xlate_actions_sent_packet);
Should this be more specific? xlate_actions_neigh_sent
>
> VLOG_DEFINE_THIS_MODULE(ofproto_dpif_xlate);
>
> @@ -3931,17 +3932,26 @@ native_tunnel_output(struct xlate_ctx *ctx, const
> struct xport *xport,
> s_ip = in6_addr_get_mapped_ipv4(&s_ip6);
> }
>
> - err = tnl_neigh_lookup(out_dev->xbridge->name, &d_ip6, &dmac);
> + err = tnl_neigh_lookup(out_dev->xbridge->name, &d_ip6, &dmac, false);
> if (err) {
> struct in6_addr nh_s_ip6 = in6addr_any;
>
> put_cloned_drop_action(ctx->xbridge->ofproto, ctx->odp_actions,
> XLATE_TUNNEL_NEIGH_CACHE_MISS,
> !is_last_action);
> + if (err == EINPROGRESS) {
> + xlate_report(ctx, OFT_DETAIL,
> + "neighbor cache miss for %s on bridge %s, "
> + "waiting on %s request",
> + buf_dip6, out_dev->xbridge->name,
> + d_ip ? "ARP" : "ND");
> + return err;
> + }
> xlate_report(ctx, OFT_DETAIL,
> "neighbor cache miss for %s on bridge %s, "
> "sending %s request",
> buf_dip6, out_dev->xbridge->name, d_ip ? "ARP" : "ND");
> + COVERAGE_INC(xlate_actions_sent_packet);
>
> err = ovs_router_get_netdev_source_address(
> &d_ip6, netdev_get_name(out_dev->netdev), &nh_s_ip6);
> diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at
> index b11387345..7263bf4f0 100644
> --- a/tests/tunnel-push-pop-ipv6.at
> +++ b/tests/tunnel-push-pop-ipv6.at
> @@ -344,7 +344,20 @@ AT_CHECK([ovs-ofctl add-flow br0 action=normal])
> dnl Check Neighbour discovery.
> AT_CHECK([ovs-vsctl -- set Interface p0 options:pcap=p0.pcap])
>
> -AT_CHECK([ovs-appctl netdev-dummy/receive int-br
> 'in_port(2),eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),icmp(type=0,code=0)'])
> +dnl First trace should send only two ND lookups, for each destination IP
> +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(int-br),]dnl
> + [eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),]dnl
> + [ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),]dnl
> + [icmp(type=0,code=0)"], [0], [stdout])
> +AT_CHECK([grep -q "sending ND request" stdout], [0])
> +
> +dnl Second trace should not send any ND lookups
> +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(int-br),]dnl
> + [eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),]dnl
> + [ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),]dnl
> + [icmp(type=0,code=0)"], [0], [stdout])
> +AT_CHECK([grep -q "waiting on ND request" stdout], [0])
> +AT_CHECK([grep -qv "sending ND request" stdout], [0])
>
> dnl Wait for the two Neighbor Solicitation packets to be sent.
> dnl Sometimes the system can be slow (e.g. under valgrind)
> @@ -577,8 +590,8 @@
> icmp,vlan_tci=0x0000,dl_src=be:b6:f4:e1:49:4a,dl_dst=fe:71:d8:83:72:4f,nw_src=30
> AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 5'], [0], [dnl
> port 5: rx pkts=1, bytes=98, drop=?, errs=?, frame=?, over=?, crc=?
> ])
> -AT_CHECK([ovs-appctl dpif/dump-flows int-br | grep 'in_port(6081)'], [0],
> [dnl
> -recirc_id(0),tunnel(tun_id=0x7b,ipv6_src=2001:cafe::92,ipv6_dst=2001:cafe::88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(+key)),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> packets:0, bytes:0, used:never,
> actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=3,rule_cookie=0,controller_id=0,max_len=65535))
> +AT_CHECK([ovs-appctl dpif/dump-flows int-br | grep 'in_port(6081)' |
> strip_recirc], [0], [dnl
> +recirc_id(<recirc>),tunnel(tun_id=0x7b,ipv6_src=2001:cafe::92,ipv6_dst=2001:cafe::88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(+key)),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no),
> packets:0, bytes:0, used:never,
> actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=<recirc>,rule_cookie=0,controller_id=0,max_len=65535))
> ])
>
> dnl Receive VXLAN with different MAC and verify that the neigh cache gets
> updated
> diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at
> index f22a37570..e7211149b 100644
> --- a/tests/tunnel-push-pop.at
> +++ b/tests/tunnel-push-pop.at
> @@ -256,7 +256,20 @@ AT_CHECK([ovs-appctl revalidator/wait])
> dnl Check ARP request
> AT_CHECK([ovs-vsctl -- set Interface p0 options:pcap=p0.pcap])
>
> -AT_CHECK([ovs-appctl netdev-dummy/receive int-br
> 'in_port(2),eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),icmp(type=0,code=0)'])
> +dnl First trace should send only two ARP lookups, for each destination IP
> +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(int-br),]dnl
> + [eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),]dnl
> + [ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),]dnl
> + [icmp(type=0,code=0)"], [0], [stdout])
> +AT_CHECK([grep -q "sending ARP request" stdout], [0])
> +
> +dnl Second trace should not send any ARP lookups
> +AT_CHECK([[ovs-appctl ofproto/trace ovs-dummy "in_port(int-br),]dnl
> + [eth(src=aa:55:aa:55:00:00,dst=f8:bc:12:ff:ff:ff),eth_type(0x0800),]dnl
> + [ipv4(src=1.1.3.92,dst=1.1.3.88,proto=1,tos=0,ttl=64,frag=no),]dnl
> + [icmp(type=0,code=0)"]], [0], [stdout])
> +AT_CHECK([grep -q "waiting on ARP request" stdout], [0])
> +AT_CHECK([grep -qv "sending ARP request" stdout], [0])
>
> dnl Wait for the two ARP requests to be sent. Sometimes the system
> dnl can be slow (e.g. under valgrind)
> --
> 2.52.0
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev