OVS enables NETLINK_LISTEN_ALL_NSID on its RTNL notification socket so
that it can receive link and address events that originate in other
network namespaces. A side effect of this option is that the kernel
tags every broadcast notification with the sender's nsid as looked up in
the receiver's nsid table.
Normally, events that originate in the local namespace carry no nsid
ancillary data, which OVS interprets as NETNSID_LOCAL. However, if a
self-referential nsid mapping exists for the local namespace, the kernel
tags local events with that nsid instead. Such a mapping can be created
by other processes on the system (for example as a side effect of
cross-namespace link queries by some container runtimes), at which point
OVS starts treating its own local events as coming from a remote
namespace and silently ignores them.
To handle this deterministically, create a self-referential nsid mapping
for the local namespace at startup, before the notification socket is set
up and record the assigned id. The namespace is identified by our own
pid (NETNSA_PID). A self-referential nsid mapping is permanent once
created: the kernel only removes nsid entries when the peer namespace
is destroyed, which can never happen for our own namespace. So, the
value never changes and no runtime monitoring is required.
The recorded self nsid is stored in a global variable accessed through
netnsid_is_local(), so that any RTM event tagged with either the absence
of an nsid or our own self nsid is treated as local and processed
normally.
This kernel behavior was fixed in commit 88b126b39f97 ("net: netlink:
don't set nsid on local notifications"), which stops tagging local
events with the self-referential nsid. This workaround is only needed
for older kernels that lack that fix and can be removed once such kernels
are no longer supported.
Signed-off-by: Matteo Perin <[email protected]>
---
lib/automake.mk | 1 +
lib/netdev-linux.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++
lib/netnsid.c | 17 +++++++++
lib/netnsid.h | 12 +++++-
4 files changed, 121 insertions(+), 1 deletion(-)
create mode 100644 lib/netnsid.c
diff --git a/lib/automake.mk b/lib/automake.mk
index 9b0f6fe0a..2dfca79da 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -168,6 +168,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/netflow.h \
lib/netlink.c \
lib/netlink.h \
+ lib/netnsid.c \
lib/netnsid.h \
lib/nx-match.c \
lib/nx-match.h \
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index c694dc1c5..d7447fa06 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -586,6 +586,84 @@ is_tap_netdev(const struct netdev *netdev)
return netdev_get_class(netdev) == &netdev_tap_class;
}
+/* Ensures that our own network namespace has a self-referential nsid mapping
+ * and records its value through netnsid_set_self() so that netnsid_is_local()
+ * treats it as the local namespace.
+ *
+ * NETLINK_LISTEN_ALL_NSID workaround: OVS enables this option on its RTNL
+ * notification socket so that it can receive events from remote namespaces.
+ * A side-effect of this option is that the kernel tags every broadcast
+ * (including locally-originated RTM events) with the sender nsid as looked up
+ * in the receiver nsid table. Normally local events carry no nsid cmsg
+ * (which OVS interprets as NETNSID_LOCAL), but if a self-referential nsid
+ * mapping exists for the local namespace the kernel tags them with that nsid
+ * instead, causing OVS to silently reject them. Some container runtimes
+ * create such a mapping as a side-effect of cross-namespace link queries.
+ *
+ * To make this deterministic, we create the mapping ourselves (or read back
+ * the existing one) at startup, before the notification socket exists and
+ * record the resulting nsid as local. A self-referential nsid mapping is
+ * permanent once created, the kernel only removes nsid entries when the
+ * peer namespace is destroyed, which can never happen for our own namespace.
+ * Therefore, the value never changes and no runtime monitoring is required.
+ *
+ * The namespace is identified by our own pid (NETNSA_PID).
+ *
+ * This kernel behavior was fixed in commit 88b126b39f97 ("net: netlink: don't
+ * set nsid on local notifications"), which stops tagging local events with the
+ * self-referential nsid. This workaround (and the self-nsid check in
+ * netnsid_is_local()) is only needed for older kernels that lack that fix and
+ * can be removed once such kernels are no longer supported. */
+static void
+netdev_linux_init_self_nsid(void)
+{
+#ifdef HAVE_LINUX_NET_NAMESPACE_H
+ const int rta_offset = NLMSG_ALIGN(sizeof(struct rtgenmsg));
+ struct ofpbuf request;
+ struct ofpbuf *reply = NULL;
+ uint32_t pid = getpid();
+ int error;
+
+ /* Create a self-referential nsid mapping for our own namespace. Passing
+ * NETNSA_NSID == -1 lets the kernel allocate a free id. If the mapping
+ * already exists the request fails with EEXIST, which is harmless. */
+ ofpbuf_init(&request, 0);
+ nl_msg_put_nlmsghdr(&request,
+ rta_offset + 2 * NL_ATTR_SIZE(sizeof(uint32_t)),
+ RTM_NEWNSID, NLM_F_REQUEST | NLM_F_ACK);
+ ofpbuf_put_zeros(&request, rta_offset);
+ nl_msg_put_u32(&request, NETNSA_PID, pid);
+ nl_msg_put_u32(&request, NETNSA_NSID, NETNSID_LOCAL);
+ nl_transact(NETLINK_ROUTE, &request, NULL);
+ ofpbuf_uninit(&request);
+
+ /* Read back the nsid that the kernel assigned to our namespace. */
+ ofpbuf_init(&request, 0);
+ nl_msg_put_nlmsghdr(&request, rta_offset + NL_ATTR_SIZE(sizeof(uint32_t)),
+ RTM_GETNSID, NLM_F_REQUEST);
+ ofpbuf_put_zeros(&request, rta_offset);
+ nl_msg_put_u32(&request, NETNSA_PID, pid);
+
+ error = nl_transact(NETLINK_ROUTE, &request, &reply);
+ if (!error && reply) {
+ const struct nlattr *a;
+
+ a = nl_attr_find(reply, NLMSG_HDRLEN + rta_offset, NETNSA_NSID);
+ if (a) {
+ int nsid = nl_attr_get_u32(a);
+
+ if (nsid >= 0) {
+ netnsid_set_self(nsid);
+ VLOG_DBG("local network namespace has nsid %d", nsid);
+ }
+ }
+ }
+
+ ofpbuf_uninit(&request);
+ ofpbuf_delete(reply);
+#endif
+}
+
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
@@ -665,6 +743,11 @@ netdev_linux_notify_sock(void)
if (ovsthread_once_start(&once)) {
int error;
+ /* Discover (creating it if necessary) the nsid that the kernel uses
+ * for our own namespace, before the notification socket starts
+ * receiving namespace-tagged events. */
+ netdev_linux_init_self_nsid();
+
error = nl_sock_create(NETLINK_ROUTE, &sock);
if (!error) {
size_t i;
@@ -936,6 +1019,15 @@ netdev_linux_update(struct netdev_linux *dev, int nsid,
const struct rtnetlink_change *change)
OVS_REQUIRES(dev->mutex)
{
+ /* NETLINK_LISTEN_ALL_NSID workaround: the kernel may tag locally
+ * originated events with our own self-referential nsid instead of
+ * omitting the cmsg. netnsid_is_local() knows about that self-nsid (set
+ * once at startup, see netdev_linux_init_self_nsid()), so normalize such
+ * events back to the local namespace id before matching. */
+ if (netnsid_is_local(nsid)) {
+ netnsid_set_local(&nsid);
+ }
+
if (netdev_linux_netnsid_is_eq(dev, nsid)) {
netdev_linux_update__(dev, change);
}
diff --git a/lib/netnsid.c b/lib/netnsid.c
new file mode 100644
index 000000000..043637e12
--- /dev/null
+++ b/lib/netnsid.c
@@ -0,0 +1,17 @@
+#include <config.h>
+#include "netnsid.h"
+
+/* The nsid that the local kernel assigns to our own network namespace when a
+ * self-referential mapping exists, or NETNSID_LOCAL when none does.
+ *
+ * It is set once at startup by the platform-specific netdev code (see
+ * netdev-linux.c) before any namespace-tagged netlink event can be received,
+ * and is read afterwards by netnsid_is_local(). A self-referential nsid
+ * mapping is permanent once created, so this value never changes again. */
+int netnsid_self = NETNSID_LOCAL;
+
+void
+netnsid_set_self(int nsid)
+{
+ netnsid_self = nsid;
+}
diff --git a/lib/netnsid.h b/lib/netnsid.h
index 1d5ab83c5..0f558c79c 100644
--- a/lib/netnsid.h
+++ b/lib/netnsid.h
@@ -19,6 +19,8 @@
#include <stdbool.h>
+#include "util.h"
+
#ifdef HAVE_LINUX_NET_NAMESPACE_H
#include <linux/net_namespace.h>
#endif
@@ -67,6 +69,14 @@
#endif
#define NETNSID_UNSET (NETNSID_LOCAL - 1)
+/* The nsid that the local kernel assigns to our own network namespace when a
+ * self-referential mapping exists, or NETNSID_LOCAL otherwise. It is defined
+ * in netnsid.c and set once at startup by the platform-specific netdev code
+ * (see netdev-linux.c). netnsid_is_local() treats this value as equivalent
+ * to the local namespace. */
+extern int netnsid_self;
+void netnsid_set_self(int nsid);
+
/* Prototypes */
static inline void netnsid_set_local(int *nsid);
static inline bool netnsid_is_local(int nsid);
@@ -86,7 +96,7 @@ netnsid_set_local(int *nsid)
static inline bool
netnsid_is_local(int nsid)
{
- return nsid == NETNSID_LOCAL;
+ return nsid == NETNSID_LOCAL || nsid == netnsid_self;
}
static inline void
--
2.43.0
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev