[PATCH net-next 1/7] net: inetdevice: provide replacement iterators for in_ifaddr walk

2019-05-29 Thread Florian Westphal
The ifa_list is protected either by rcu or rtnl lock, but the
current iterators do not account for this.

This adds two iterators as replacement, a later patch in
the series will update them with the needed rcu/rtnl_dereference calls.

Its not done in this patch yet to avoid sparse warnings -- the fields
lack the proper __rcu annotation.

Signed-off-by: Florian Westphal 
---
 include/linux/inetdevice.h | 10 +-
 net/ipv4/devinet.c | 31 ---
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 367dc2a0f84a..d5d05503a04b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -186,7 +186,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device 
*in_dev, __be32 dst,
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask);
 struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
-static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa)
+static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa)
 {
return !((addr^ifa->ifa_address)&ifa->ifa_mask);
 }
@@ -215,6 +215,14 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr)
 
 #define endfor_ifa(in_dev) }
 
+#define in_dev_for_each_ifa_rtnl(ifa, in_dev)  \
+   for (ifa = (in_dev)->ifa_list; ifa; \
+ifa = ifa->ifa_next)
+
+#define in_dev_for_each_ifa_rcu(ifa, in_dev)   \
+   for (ifa = (in_dev)->ifa_list; ifa; \
+ifa = ifa->ifa_next)
+
 static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
 {
return rcu_dereference(dev->ip_ptr);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 701c5d113a34..7803a4d2951c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -873,13 +873,12 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, 
struct nlmsghdr *nlh,
 static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
 {
struct in_device *in_dev = ifa->ifa_dev;
-   struct in_ifaddr *ifa1, **ifap;
+   struct in_ifaddr *ifa1;
 
if (!ifa->ifa_local)
return NULL;
 
-   for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
-ifap = &ifa1->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa) &&
ifa1->ifa_local == ifa->ifa_local)
@@ -1208,7 +1207,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, 
struct ifreq *ifr)
 static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int 
size)
 {
struct in_device *in_dev = __in_dev_get_rtnl(dev);
-   struct in_ifaddr *ifa;
+   const struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
 
@@ -1218,7 +1217,7 @@ static int inet_gifconf(struct net_device *dev, char 
__user *buf, int len, int s
if (!in_dev)
goto out;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (!buf) {
done += size;
continue;
@@ -1321,10 +1320,11 @@ EXPORT_SYMBOL(inet_select_addr);
 static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
  __be32 local, int scope)
 {
-   int same = 0;
+   const struct in_ifaddr *ifa;
__be32 addr = 0;
+   int same = 0;
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (!addr &&
(local == ifa->ifa_local || !local) &&
ifa->ifa_scope <= scope) {
@@ -1350,7 +1350,7 @@ static __be32 confirm_addr_indev(struct in_device 
*in_dev, __be32 dst,
same = 0;
}
}
-   } endfor_ifa(in_dev);
+   }
 
return same ? addr : 0;
 }
@@ -1424,7 +1424,7 @@ static void inetdev_changename(struct net_device *dev, 
struct in_device *in_dev)
struct in_ifaddr *ifa;
int named = 0;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
char old[IFNAMSIZ], *dot;
 
memcpy(old, ifa->ifa_label, IFNAMSIZ);
@@ -1454,10 +1454,9 @@ static void inetdev_send_gratuitous_arp(struct 
net_device *dev,
struct in_device *in_dev)
 
 {
-   struct in_ifaddr *ifa;
+   const struct in_ifaddr *ifa;
 
-   for (ifa = in_dev->ifa_list; ifa;
-ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
arp_

[PATCH net-next 2/7] devinet: use in_dev_for_each_ifa_rcu in more places

2019-05-29 Thread Florian Westphal
This also replaces spots that used for_primary_ifa().

for_primary_ifa() aborts the loop on the first secondary address seen.

Replace it with either the rcu or rtnl variant of in_dev_for_each_ifa(),
but two places will now also consider secondary addresses too:
inet_addr_onlink() and inet_ifa_byprefix().

I do not understand why they should ignore secondary addresses.

Why would a secondary address not be considered 'on link'?
When matching a prefix, why ignore a matching secondary address?

Other places get converted as well, but gain "->flags & SECONDARY" check.

Signed-off-by: Florian Westphal 
---
 net/ipv4/devinet.c | 27 +++
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7803a4d2951c..b45421b2b734 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -327,15 +327,17 @@ static void inetdev_destroy(struct in_device *in_dev)
 
 int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 {
+   const struct in_ifaddr *ifa;
+
rcu_read_lock();
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(a, ifa)) {
if (!b || inet_ifa_match(b, ifa)) {
rcu_read_unlock();
return 1;
}
}
-   } endfor_ifa(in_dev);
+   }
rcu_read_unlock();
return 0;
 }
@@ -580,12 +582,14 @@ EXPORT_SYMBOL(inetdev_by_index);
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask)
 {
+   struct in_ifaddr *ifa;
+
ASSERT_RTNL();
 
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
return ifa;
-   } endfor_ifa(in_dev);
+   }
return NULL;
 }
 
@@ -1245,17 +1249,22 @@ static int inet_gifconf(struct net_device *dev, char 
__user *buf, int len, int s
 static __be32 in_dev_select_addr(const struct in_device *in_dev,
 int scope)
 {
-   for_primary_ifa(in_dev) {
+   const struct in_ifaddr *ifa;
+
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope)
return ifa->ifa_local;
-   } endfor_ifa(in_dev);
+   }
 
return 0;
 }
 
 __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 {
+   const struct in_ifaddr *ifa;
__be32 addr = 0;
struct in_device *in_dev;
struct net *net = dev_net(dev);
@@ -1266,7 +1275,9 @@ __be32 inet_select_addr(const struct net_device *dev, 
__be32 dst, int scope)
if (!in_dev)
goto no_in_dev;
 
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
if (ifa->ifa_scope > scope)
continue;
if (!dst || inet_ifa_match(dst, ifa)) {
@@ -1275,7 +1286,7 @@ __be32 inet_select_addr(const struct net_device *dev, 
__be32 dst, int scope)
}
if (!addr)
addr = ifa->ifa_local;
-   } endfor_ifa(in_dev);
+   }
 
if (addr)
goto out_unlock;
-- 
2.21.0



[PATCH net-next 7/7] net: ipv4: provide __rcu annotation for ifa_list

2019-05-29 Thread Florian Westphal
ifa_list is protected by rcu, yet code doesn't reflect this.

Add the __rcu annotations and fix up all places that are now reported by
sparse.

I've done this in the same commit to not add intermediate patches that
result in new warnings.

Reported-by: Eric Dumazet 
Signed-off-by: Florian Westphal 
---
 drivers/infiniband/hw/i40iw/i40iw_utils.c | 12 ++-
 drivers/infiniband/hw/nes/nes.c   |  8 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c   | 15 ++--
 drivers/isdn/hysdn/hysdn_net.c|  6 +-
 drivers/isdn/i4l/isdn_net.c   | 20 -
 drivers/net/ethernet/via/via-velocity.h   |  2 +-
 drivers/net/plip/plip.c   |  4 +-
 drivers/net/vmxnet3/vmxnet3_drv.c | 19 ++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c|  4 +-
 .../net/wireless/marvell/mwifiex/cfg80211.c   |  2 +-
 include/linux/inetdevice.h| 21 ++---
 net/core/netpoll.c| 10 ++-
 net/core/pktgen.c |  8 +-
 net/ipv4/devinet.c| 88 ---
 net/mac80211/main.c   |  4 +-
 net/netfilter/nf_nat_redirect.c   | 12 +--
 16 files changed, 150 insertions(+), 85 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c 
b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index 337410f40860..016524683e17 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -174,10 +174,14 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
rcu_read_lock();
in = __in_dev_get_rcu(upper_dev);
 
-   if (!in->ifa_list)
-   local_ipaddr = 0;
-   else
-   local_ipaddr = ntohl(in->ifa_list->ifa_address);
+   local_ipaddr = 0;
+   if (in) {
+   struct in_ifaddr *ifa;
+
+   ifa = rcu_dereference(in->ifa_list);
+   if (ifa)
+   local_ipaddr = ntohl(ifa->ifa_address);
+   }
 
rcu_read_unlock();
} else {
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index e00add6d78ec..29b324726ea6 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -183,7 +183,13 @@ static int nes_inetaddr_event(struct notifier_block 
*notifier,
 
rcu_read_lock();
in = 
__in_dev_get_rcu(upper_dev);
-   nesvnic->local_ipaddr = 
in->ifa_list->ifa_address;
+   if (in) {
+   struct in_ifaddr *ifa;
+
+   ifa = 
rcu_dereference(in->ifa_list);
+   if (ifa)
+   
nesvnic->local_ipaddr = ifa->ifa_address;
+   }
rcu_read_unlock();
} else {
nesvnic->local_ipaddr = 
ifa->ifa_address;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c 
b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index d88d9f8a7f9a..34c1f9d6c915 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -427,11 +427,16 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
if (netif_carrier_ok(us_ibdev->netdev))
usnic_fwd_carrier_up(us_ibdev->ufdev);
 
-   ind = in_dev_get(netdev);
-   if (ind->ifa_list)
-   usnic_fwd_add_ipaddr(us_ibdev->ufdev,
-ind->ifa_list->ifa_address);
-   in_dev_put(ind);
+   rcu_read_lock();
+   ind = __in_dev_get_rcu(netdev);
+   if (ind) {
+   const struct in_ifaddr *ifa;
+
+   ifa = rcu_dereference(ind->ifa_list);
+   if (ifa)
+   usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
+   }
+   rcu_read_unlock();
 
usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr,
us_ibdev->ufdev->inaddr, &gid.raw[0]);
diff --git a/drivers/isdn/hysdn/hysdn_net.c b/drivers/isdn/hysdn/hysdn_net.c
index 8e9c34f33d86..bea37ae30ebb 100644
--- a/drivers/isdn/hysdn/hysdn_net.c
+++ b/drivers/isdn/hysdn/hysdn_net.c
@@ -70,9 +70,13 @@ net_open(struct net_device *dev)
for (i = 0; i < ETH_ALEN; i++)
dev->dev_addr[i] = 0xfc;
if ((in_dev = dev->ip_ptr) != NULL) {
-   

[PATCH net-next 3/7] afs: switch to in_dev_for_each_ifa_rcu

2019-05-29 Thread Florian Westphal
The in_dev_for_each_ifa_rcu helper gets used so sparse won't
complain when we add the proper __rcu annotation to the ifa_list
member in struct in_device later.

While doing this I realized the helper only has one call site,
so move it to where its needed.

This then revealed that we allocate a temporary buffer needlessly
and pass an always-false bool argument.

So fold this into the calling function and fill dst buffer directly.

Compile tested only.

Cc: David Howells 
Cc: linux-...@lists.infradead.org
Signed-off-by: Florian Westphal 
---
 fs/afs/Makefile |  1 -
 fs/afs/cmservice.c  | 49 +++--
 fs/afs/internal.h   | 15 --
 fs/afs/netdevices.c | 48 
 4 files changed, 29 insertions(+), 84 deletions(-)
 delete mode 100644 fs/afs/netdevices.c

diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index cbf31f6cd177..10359bea7070 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -29,7 +29,6 @@ kafs-y := \
server.o \
server_list.o \
super.o \
-   netdevices.o \
vlclient.o \
vl_list.o \
vl_probe.o \
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 01437cfe5432..054590a6b1e2 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -14,6 +14,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "internal.h"
 #include "afs_cm.h"
 #include "protocol_yfs.h"
@@ -584,9 +586,10 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
  */
 static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 {
-   struct afs_interface *ifs;
+   struct net_device *dev;
+   struct in_device *idev;
struct afs_call *call = container_of(work, struct afs_call, work);
-   int loop, nifs;
+   int loop, nifs = 0;
 
struct {
struct /* InterfaceAddr */ {
@@ -604,19 +607,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct 
work_struct *work)
 
_enter("");
 
-   nifs = 0;
-   ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
-   if (ifs) {
-   nifs = afs_get_ipv4_interfaces(call->net, ifs, 32, false);
-   if (nifs < 0) {
-   kfree(ifs);
-   ifs = NULL;
-   nifs = 0;
-   }
-   }
-
memset(&reply, 0, sizeof(reply));
-   reply.ia.nifs = htonl(nifs);
 
reply.ia.uuid[0] = call->net->uuid.time_low;
reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid));
@@ -626,15 +617,33 @@ static void SRXAFSCB_TellMeAboutYourself(struct 
work_struct *work)
for (loop = 0; loop < 6; loop++)
reply.ia.uuid[loop + 5] = htonl((s8) 
call->net->uuid.node[loop]);
 
-   if (ifs) {
-   for (loop = 0; loop < nifs; loop++) {
-   reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
-   reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
-   reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
+   rcu_read_lock();
+   for_each_netdev_rcu(call->net->net, dev) {
+   const struct in_ifaddr *ifa;
+
+   if (dev->flags & IFF_LOOPBACK)
+   continue;
+
+   idev = __in_dev_get_rcu(dev);
+   if (!idev)
+   continue;
+
+   in_dev_for_each_ifa_rcu(ifa, idev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   break;
+
+   reply.ia.ifaddr[nifs] = ifa->ifa_address;
+   reply.ia.netmask[nifs] = ifa->ifa_mask;
+   reply.ia.mtu[nifs] = htonl(dev->mtu);
+   nifs++;
+   break;
}
-   kfree(ifs);
+   if (nifs >= 32)
+   break;
}
+   rcu_read_unlock();
 
+   reply.ia.nifs = htonl(nifs);
reply.cap.capcount = htonl(1);
reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
afs_send_simple_reply(call, &reply, sizeof(reply));
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 2073c1a3ab4b..a22fa3b77b3c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -724,15 +724,6 @@ struct afs_permits {
struct afs_permit   permits[];  /* List of permits sorted by 
key pointer */
 };
 
-/*
- * record of one of a system's set of network interfaces
- */
-struct afs_interface {
-   struct in_addr  address;/* IPv4 address bound to interface */
-   struct in_addr  netmask;/* netmask applied to address */
-   unsignedmtu;/* MTU of interface */
-};
-
 /*
  * Error prioritisation and accumulation.
  */
@@ -1095,12 +1086,6 @@ extern const struct file_operations 
afs_mntpt_file_operations;
 extern struct vf

[PATCH net-next 0/7] net: add rcu annotations for ifa_list

2019-05-29 Thread Florian Westphal
Eric Dumazet reported follwing problem:

  It looks that unless RTNL is held, accessing ifa_list needs proper RCU
  protection.  indev->ifa_list can be changed under us by another cpu
  (which owns RTNL) [..]

  A proper rcu_dereference() with an happy sparse support would require
  adding __rcu attribute.

This patch series does that: add __rcu to the ifa_list pointers.
That makes sparse complain, so the series also adds the required
rcu_assign_pointer/dereference helpers where needed.

All patches except the last one are preparation work.
Two new macros are introduced for in_ifaddr walks.

Last patch adds the __rcu annotations and the assign_pointer/dereference
helper calls.

This patch is a bit large, but I found no better way -- other
approaches (annotate-first or add helpers-first) all result in
mid-series sparse warnings.

This series is submitted vs. net-next rather than net for several
reasons:

1. Its (mostly) compile-tested only
2. Second patch changes behaviour wrt. secondary addresses
   (see changelog)
3. the afs change removes one un-needed compilation unit
4. The problem exists for a very long time (2004), so it doesn't
   seem to be urgent to fix this -- rcu use to free ifa_list
   predates the git era.

Florian Westphal (7):
  net: inetdevice: provide replacement iterators for in_ifaddr walk
  devinet: use in_dev_for_each_ifa_rcu in more places
  afs: switch to in_dev_for_each_ifa_rcu
  netfilter: use in_dev_for_each_ifa_rcu
  net: use new in_dev_ifa iterators
  drivers: use in_dev_for_each_ifa_rtnl/rcu
  net: ipv4: provide __rcu annotation for ifa_list

 drivers/infiniband/core/roce_gid_mgmt.c  |5 
 drivers/infiniband/hw/cxgb4/cm.c |9 -
 drivers/infiniband/hw/i40iw/i40iw_cm.c   |7 
 drivers/infiniband/hw/i40iw/i40iw_main.c |6 
 drivers/infiniband/hw/i40iw/i40iw_utils.c|   12 -
 drivers/infiniband/hw/nes/nes.c  |8 
 drivers/infiniband/hw/usnic/usnic_ib_main.c  |   15 +
 drivers/isdn/hysdn/hysdn_net.c   |6 
 drivers/isdn/i4l/isdn_net.c  |   20 +-
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |8 
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |5 
 drivers/net/ethernet/via/via-velocity.h  |2 
 drivers/net/plip/plip.c  |4 
 drivers/net/vmxnet3/vmxnet3_drv.c|   19 +-
 drivers/net/wan/hdlc_cisco.c |   11 -
 drivers/net/wireless/ath/ath6kl/cfg80211.c   |4 
 drivers/net/wireless/marvell/mwifiex/cfg80211.c  |2 
 fs/afs/Makefile  |1 
 fs/afs/cmservice.c   |   49 +++--
 fs/afs/internal.h|   15 -
 include/linux/inetdevice.h   |   19 +-
 net/core/netpoll.c   |   10 -
 net/core/pktgen.c|8 
 net/ipv4/devinet.c   |  146 ++---
 net/ipv4/fib_frontend.c  |   24 +-
 net/ipv4/igmp.c  |5 
 net/ipv4/netfilter/nf_tproxy_ipv4.c  |9 -
 net/ipv6/addrconf.c  |4 
 net/mac80211/main.c  |4 
 net/netfilter/nf_conntrack_broadcast.c   |9 -
 net/netfilter/nf_nat_redirect.c  |   12 -
 net/netfilter/nfnetlink_osf.c|5 
 net/sctp/protocol.c  |2 
 net/smc/smc_clc.c|   11 -
 fs/afs/netdevices.c|   48 -
 35 files changed, 294 insertions(+), 230 deletions(-)



[PATCH net-next 4/7] netfilter: use in_dev_for_each_ifa_rcu

2019-05-29 Thread Florian Westphal
Netfilter hooks are always running under rcu read lock, use
the new iterator macro so sparse won't complain once we add
proper __rcu annotations.

Signed-off-by: Florian Westphal 
---
 net/ipv4/netfilter/nf_tproxy_ipv4.c| 9 +++--
 net/netfilter/nf_conntrack_broadcast.c | 9 +++--
 net/netfilter/nfnetlink_osf.c  | 5 ++---
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c 
b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 164714104965..40c93b3bd731 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -53,6 +53,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
 
 __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 {
+   const struct in_ifaddr *ifa;
struct in_device *indev;
__be32 laddr;
 
@@ -61,10 +62,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 
user_laddr, __be32 daddr)
 
laddr = 0;
indev = __in_dev_get_rcu(skb->dev);
-   for_primary_ifa(indev) {
+
+   in_dev_for_each_ifa_rcu(ifa, indev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
+
laddr = ifa->ifa_local;
break;
-   } endfor_ifa(indev);
+   }
 
return laddr ? laddr : daddr;
 }
diff --git a/net/netfilter/nf_conntrack_broadcast.c 
b/net/netfilter/nf_conntrack_broadcast.c
index 5423b197d98a..a5dbc3676a4f 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -41,12 +41,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
 
in_dev = __in_dev_get_rcu(rt->dst.dev);
if (in_dev != NULL) {
-   for_primary_ifa(in_dev) {
+   const struct in_ifaddr *ifa;
+
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
+
if (ifa->ifa_broadcast == iph->daddr) {
mask = ifa->ifa_mask;
break;
}
-   } endfor_ifa(in_dev);
+   }
}
 
if (mask == 0)
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index f42326b40d6f..9f5dea0064ea 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
 {
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
const struct iphdr *ip = ip_hdr(skb);
+   const struct in_ifaddr *ifa;
int ret = 0;
 
if (ttl_check == NF_OSF_TTL_TRUE)
@@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
else if (ip->ttl <= f_ttl)
return 1;
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(ip->saddr, ifa)) {
ret = (ip->ttl == f_ttl);
break;
}
}
 
-   endfor_ifa(in_dev);
-
return ret;
 }
 
-- 
2.21.0



[PATCH net-next 5/7] net: use new in_dev_ifa iterators

2019-05-29 Thread Florian Westphal
Use in_dev_for_each_ifa_rcu/rtnl instead.
This prevents sparse warnings once proper __rcu annotations are added.

Signed-off-by: Florian Westphal 

t di# Last commands done (6 commands done):
---
 net/ipv4/fib_frontend.c | 24 +---
 net/ipv4/igmp.c |  5 +++--
 net/ipv6/addrconf.c |  4 +---
 net/sctp/protocol.c |  2 +-
 net/smc/smc_clc.c   | 11 +++
 5 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 76055c66326a..c7cdb8d0d164 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -540,14 +540,22 @@ static int rtentry_to_fib_config(struct net *net, int 
cmd, struct rtentry *rt,
cfg->fc_oif = dev->ifindex;
cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
-   struct in_ifaddr *ifa;
-   struct in_device *in_dev = __in_dev_get_rtnl(dev);
+   const struct in_ifaddr *ifa;
+   struct in_device *in_dev;
+
+   in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return -ENODEV;
+
*colon = ':';
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
+   }
+   rcu_read_unlock();
+
if (!ifa)
return -ENODEV;
cfg->fc_prefsrc = ifa->ifa_local;
@@ -1177,8 +1185,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct 
in_ifaddr *iprim)
 *
 * Scan address list to be sure that addresses are really gone.
 */
-
-   for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa1, in_dev) {
if (ifa1 == ifa) {
/* promotion, keep the IP */
gone = 0;
@@ -1246,6 +1254,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct 
in_ifaddr *iprim)
}
}
}
+   rcu_read_unlock();
 
 no_promotions:
if (!(ok & BRD_OK))
@@ -1415,6 +1424,7 @@ static int fib_netdev_event(struct notifier_block *this, 
unsigned long event, vo
struct netdev_notifier_info_ext *info_ext = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+   struct in_ifaddr *ifa;
unsigned int flags;
 
if (event == NETDEV_UNREGISTER) {
@@ -1429,9 +1439,9 @@ static int fib_netdev_event(struct notifier_block *this, 
unsigned long event, vo
 
switch (event) {
case NETDEV_UP:
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
fib_add_ifaddr(ifa);
-   } endfor_ifa(in_dev);
+   }
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev, RTNH_F_DEAD);
 #endif
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6c2febc39dca..719bd8e4eea4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -325,14 +325,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev,
 const struct flowi4 *fl4)
 {
struct in_device *in_dev = __in_dev_get_rcu(dev);
+   const struct in_ifaddr *ifa;
 
if (!in_dev)
return htonl(INADDR_ANY);
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
-   } endfor_ifa(in_dev);
+   }
 
return htonl(INADDR_ANY);
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 683613e7355b..40b154d45ab4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3127,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
struct in_ifaddr *ifa;
-
int flag = scope;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
addr.s6_addr32[3] = ifa->ifa_local;
 
if (ifa->ifa_scope == RT_SCOPE_LINK)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f0631bf486b6..e29cf27cb633 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -96,7 +96,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist,
return;
}
 
-   for (ifa = in_dev-&g

[PATCH net-next 6/7] drivers: use in_dev_for_each_ifa_rtnl/rcu

2019-05-29 Thread Florian Westphal
Like previous patches, use the new iterator macros to avoid sparse
warnings once proper __rcu annotations are added.

Compile tested only.

Signed-off-by: Florian Westphal 
---
 drivers/infiniband/core/roce_gid_mgmt.c  |  5 +++--
 drivers/infiniband/hw/cxgb4/cm.c |  9 +++--
 drivers/infiniband/hw/i40iw/i40iw_cm.c   |  7 +--
 drivers/infiniband/hw/i40iw/i40iw_main.c |  6 --
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |  8 +---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  5 +++--
 drivers/net/wan/hdlc_cisco.c | 11 +--
 7 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/core/roce_gid_mgmt.c 
b/drivers/infiniband/core/roce_gid_mgmt.c
index 558de0b9895c..2860def84f4d 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -330,6 +330,7 @@ static void bond_delete_netdev_default_gids(struct 
ib_device *ib_dev,
 static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
 u8 port, struct net_device *ndev)
 {
+   const struct in_ifaddr *ifa;
struct in_device *in_dev;
struct sin_list {
struct list_headlist;
@@ -349,7 +350,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
return;
}
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 
if (!entry)
@@ -359,7 +360,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
entry->ip.sin_addr.s_addr = ifa->ifa_address;
list_add_tail(&entry->list, &sin_list);
}
-   endfor_ifa(in_dev);
+
rcu_read_unlock();
 
list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) {
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 0f3b1193d5f8..09fcfc9e052d 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3230,17 +3230,22 @@ static int pick_local_ipaddrs(struct c4iw_dev *dev, 
struct iw_cm_id *cm_id)
int found = 0;
struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
+   const struct in_ifaddr *ifa;
 
ind = in_dev_get(dev->rdev.lldi.ports[0]);
if (!ind)
return -EADDRNOTAVAIL;
-   for_primary_ifa(ind) {
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa, ind) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
laddr->sin_addr.s_addr = ifa->ifa_address;
raddr->sin_addr.s_addr = ifa->ifa_address;
found = 1;
break;
}
-   endfor_ifa(ind);
+   rcu_read_unlock();
+
in_dev_put(ind);
return found ? 0 : -EADDRNOTAVAIL;
 }
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c 
b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 8233f5a4e623..700a5d06b60c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -1773,8 +1773,11 @@ static enum i40iw_status_code i40iw_add_mqh_4(
if rdma_vlan_dev_vlan_id(dev) < I40IW_NO_VLAN) &&
  (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
(dev == iwdev->netdev)) && (dev->flags & IFF_UP)) {
+   const struct in_ifaddr *ifa;
+
idev = in_dev_get(dev);
-   for_ifa(idev) {
+
+   in_dev_for_each_ifa_rtnl(ifa, idev) {
i40iw_debug(&iwdev->sc_dev,
I40IW_DEBUG_CM,
"Allocating child CM Listener 
forIP=%pI4, vlan_id=%d, MAC=%pM\n",
@@ -1819,7 +1822,7 @@ static enum i40iw_status_code i40iw_add_mqh_4(

cm_parent_listen_node->cm_core->stats_listen_nodes_created--;
}
}
-   endfor_ifa(idev);
+
in_dev_put(idev);
}
}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c 
b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 10932baee279..d44cf33df81a 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1222,8 +1222,10 @@ static void i40iw_add_ipv4_addr(struct i40iw_device 
*iwdev)
if rdma_vlan_dev_vlan_id(dev) < 0x) &&
  (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
(dev == iwdev->netdev))

Re: [PATCH net-next 3/7] afs: switch to in_dev_for_each_ifa_rcu

2019-05-29 Thread Florian Westphal
David Howells  wrote:
> Actually, whilst thanks are due for doing the work - it looks nicer now - I'm
> told that there's not really any point populating the list.  Current OpenAFS
> ignores it, as does AuriStor - and IBM AFS 3.6 will do the right thing.

[..]

> On that basis, feel free to make it an empty list and remove all the interface
> enumeration.

Ok, will wait for others to comment before doing this in v2.


[PATCH net-next v2 0/7] net: add rcu annotations for ifa_list

2019-05-31 Thread Florian Westphal
v2:
 - remove ifa_list iteration in afs instead of conversion
 All other patches are unchanged.

Eric Dumazet reported following problem:

  It looks that unless RTNL is held, accessing ifa_list needs proper RCU
  protection.  indev->ifa_list can be changed under us by another cpu
  (which owns RTNL) [..]

  A proper rcu_dereference() with an happy sparse support would require
  adding __rcu attribute.

This patch series does that: add __rcu to the ifa_list pointers.
That makes sparse complain, so the series also adds the required
rcu_assign_pointer/dereference helpers where needed.

All patches except the last one are preparation work.
Two new macros are introduced for in_ifaddr walks.

Last patch adds the __rcu annotations and the assign_pointer/dereference
helper calls.

This patch is a bit large, but I found no better way -- other
approaches (annotate-first or add helpers-first) all result in
mid-series sparse warnings.

This series is submitted vs. net-next rather than net for several
reasons:

1. Its (mostly) compile-tested only
2. 3rd patch changes behaviour wrt. secondary addresses
   (see changelog)
3. The problem exists for a very long time (2004), so it doesn't
   seem to be urgent to fix this -- rcu use to free ifa_list
   predates the git era.

Florian Westphal (7):
  afs: do not send list of client addresses
  net: inetdevice: provide replacement iterators for in_ifaddr walk
  devinet: use in_dev_for_each_ifa_rcu in more places
  netfilter: use in_dev_for_each_ifa_rcu
  net: use new in_dev_ifa iterators
  drivers: use in_dev_for_each_ifa_rtnl/rcu
  net: ipv4: provide __rcu annotation for ifa_list

 drivers/infiniband/core/roce_gid_mgmt.c  |5 
 drivers/infiniband/hw/cxgb4/cm.c |9 -
 drivers/infiniband/hw/i40iw/i40iw_cm.c   |7 
 drivers/infiniband/hw/i40iw/i40iw_main.c |6 
 drivers/infiniband/hw/i40iw/i40iw_utils.c|   12 -
 drivers/infiniband/hw/nes/nes.c  |8 
 drivers/infiniband/hw/usnic/usnic_ib_main.c  |   15 +
 drivers/isdn/hysdn/hysdn_net.c   |6 
 drivers/isdn/i4l/isdn_net.c  |   20 +-
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |8 
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |5 
 drivers/net/ethernet/via/via-velocity.h  |2 
 drivers/net/plip/plip.c  |4 
 drivers/net/vmxnet3/vmxnet3_drv.c|   19 +-
 drivers/net/wan/hdlc_cisco.c |   11 -
 drivers/net/wireless/ath/ath6kl/cfg80211.c   |4 
 drivers/net/wireless/marvell/mwifiex/cfg80211.c  |2 
 fs/afs/Makefile  |1 
 fs/afs/cmservice.c   |   24 --
 fs/afs/internal.h|   15 -
 fs/afs/netdevices.c  |   48 -
 include/linux/inetdevice.h   |   19 +-
 net/core/netpoll.c   |   10 -
 net/core/pktgen.c|8 
 net/ipv4/devinet.c   |  146 ++---
 net/ipv4/fib_frontend.c  |   24 +-
 net/ipv4/igmp.c  |5 
 net/ipv4/netfilter/nf_tproxy_ipv4.c  |9 -
 net/ipv6/addrconf.c  |4 
 net/mac80211/main.c  |4 
 net/netfilter/nf_conntrack_broadcast.c   |9 -
 net/netfilter/nf_nat_redirect.c  |   12 -
 net/netfilter/nfnetlink_osf.c|5 
 net/sctp/protocol.c  |2 
 net/smc/smc_clc.c|   11 -
 35 files changed, 266 insertions(+), 233 deletions(-)




[PATCH net-next v2 1/7] afs: do not send list of client addresses

2019-05-31 Thread Florian Westphal
David Howell says:
  I'm told that there's not really any point populating the list.
  Current OpenAFS ignores it, as does AuriStor - and IBM AFS 3.6 will
  do the right thing.
  The list is actually useless as it's the client's view of the world,
  not the servers, so if there's any NAT in the way its contents are
  invalid.  Further, it doesn't support IPv6 addresses.

  On that basis, feel free to make it an empty list and remove all the
  interface enumeration.

V1 of this patch reworked the function to use a new helper for the
ifa_list iteration to avoid sparse warnings once the proper __rcu
annotations get added in struct in_device later.

But, in light of the above, just remove afs_get_ipv4_interfaces.

Compile tested only.

Cc: David Howells 
Cc: linux-...@lists.infradead.org
Signed-off-by: Florian Westphal 
---
 fs/afs/Makefile |  1 -
 fs/afs/cmservice.c  | 24 +--
 fs/afs/internal.h   | 15 --
 fs/afs/netdevices.c | 48 -
 4 files changed, 1 insertion(+), 87 deletions(-)
 delete mode 100644 fs/afs/netdevices.c

diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index cbf31f6cd177..10359bea7070 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -29,7 +29,6 @@ kafs-y := \
server.o \
server_list.o \
super.o \
-   netdevices.o \
vlclient.o \
vl_list.o \
vl_probe.o \
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 01437cfe5432..a61d2058c468 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -584,9 +584,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
  */
 static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 {
-   struct afs_interface *ifs;
struct afs_call *call = container_of(work, struct afs_call, work);
-   int loop, nifs;
+   int loop;
 
struct {
struct /* InterfaceAddr */ {
@@ -604,19 +603,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct 
work_struct *work)
 
_enter("");
 
-   nifs = 0;
-   ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
-   if (ifs) {
-   nifs = afs_get_ipv4_interfaces(call->net, ifs, 32, false);
-   if (nifs < 0) {
-   kfree(ifs);
-   ifs = NULL;
-   nifs = 0;
-   }
-   }
-
memset(&reply, 0, sizeof(reply));
-   reply.ia.nifs = htonl(nifs);
 
reply.ia.uuid[0] = call->net->uuid.time_low;
reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid));
@@ -626,15 +613,6 @@ static void SRXAFSCB_TellMeAboutYourself(struct 
work_struct *work)
for (loop = 0; loop < 6; loop++)
reply.ia.uuid[loop + 5] = htonl((s8) 
call->net->uuid.node[loop]);
 
-   if (ifs) {
-   for (loop = 0; loop < nifs; loop++) {
-   reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
-   reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
-   reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
-   }
-   kfree(ifs);
-   }
-
reply.cap.capcount = htonl(1);
reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
afs_send_simple_reply(call, &reply, sizeof(reply));
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 2073c1a3ab4b..a22fa3b77b3c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -724,15 +724,6 @@ struct afs_permits {
struct afs_permit   permits[];  /* List of permits sorted by 
key pointer */
 };
 
-/*
- * record of one of a system's set of network interfaces
- */
-struct afs_interface {
-   struct in_addr  address;/* IPv4 address bound to interface */
-   struct in_addr  netmask;/* netmask applied to address */
-   unsignedmtu;/* MTU of interface */
-};
-
 /*
  * Error prioritisation and accumulation.
  */
@@ -1095,12 +1086,6 @@ extern const struct file_operations 
afs_mntpt_file_operations;
 extern struct vfsmount *afs_d_automount(struct path *);
 extern void afs_mntpt_kill_timer(void);
 
-/*
- * netdevices.c
- */
-extern int afs_get_ipv4_interfaces(struct afs_net *, struct afs_interface *,
-  size_t, bool);
-
 /*
  * proc.c
  */
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
deleted file mode 100644
index 2a009d1939d7..
--- a/fs/afs/netdevices.c
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* AFS network device helpers
- *
- * Copyright (c) 2007 Patrick McHardy 
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include "internal.h"
-
-/*
- * get a list of this system's interface IPv4 addresses, netmasks and MTUs
- * - maxbufs must be at least 1
- * - returns the number of interface records in the buffer
- */
-int afs_get_ipv4_interfaces

[PATCH net-next v2 3/7] devinet: use in_dev_for_each_ifa_rcu in more places

2019-05-31 Thread Florian Westphal
This also replaces spots that used for_primary_ifa().

for_primary_ifa() aborts the loop on the first secondary address seen.

Replace it with either the rcu or rtnl variant of in_dev_for_each_ifa(),
but two places will now also consider secondary addresses too:
inet_addr_onlink() and inet_ifa_byprefix().

I do not understand why they should ignore secondary addresses.

Why would a secondary address not be considered 'on link'?
When matching a prefix, why ignore a matching secondary address?

Other places get converted as well, but gain "->flags & SECONDARY" check.

Signed-off-by: Florian Westphal 
---
 net/ipv4/devinet.c | 27 +++
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7803a4d2951c..b45421b2b734 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -327,15 +327,17 @@ static void inetdev_destroy(struct in_device *in_dev)
 
 int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 {
+   const struct in_ifaddr *ifa;
+
rcu_read_lock();
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(a, ifa)) {
if (!b || inet_ifa_match(b, ifa)) {
rcu_read_unlock();
return 1;
}
}
-   } endfor_ifa(in_dev);
+   }
rcu_read_unlock();
return 0;
 }
@@ -580,12 +582,14 @@ EXPORT_SYMBOL(inetdev_by_index);
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask)
 {
+   struct in_ifaddr *ifa;
+
ASSERT_RTNL();
 
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
return ifa;
-   } endfor_ifa(in_dev);
+   }
return NULL;
 }
 
@@ -1245,17 +1249,22 @@ static int inet_gifconf(struct net_device *dev, char 
__user *buf, int len, int s
 static __be32 in_dev_select_addr(const struct in_device *in_dev,
 int scope)
 {
-   for_primary_ifa(in_dev) {
+   const struct in_ifaddr *ifa;
+
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope)
return ifa->ifa_local;
-   } endfor_ifa(in_dev);
+   }
 
return 0;
 }
 
 __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 {
+   const struct in_ifaddr *ifa;
__be32 addr = 0;
struct in_device *in_dev;
struct net *net = dev_net(dev);
@@ -1266,7 +1275,9 @@ __be32 inet_select_addr(const struct net_device *dev, 
__be32 dst, int scope)
if (!in_dev)
goto no_in_dev;
 
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
if (ifa->ifa_scope > scope)
continue;
if (!dst || inet_ifa_match(dst, ifa)) {
@@ -1275,7 +1286,7 @@ __be32 inet_select_addr(const struct net_device *dev, 
__be32 dst, int scope)
}
if (!addr)
addr = ifa->ifa_local;
-   } endfor_ifa(in_dev);
+   }
 
if (addr)
goto out_unlock;
-- 
2.21.0



[PATCH net-next v2 4/7] netfilter: use in_dev_for_each_ifa_rcu

2019-05-31 Thread Florian Westphal
Netfilter hooks are always running under rcu read lock, use
the new iterator macro so sparse won't complain once we add
proper __rcu annotations.

Signed-off-by: Florian Westphal 
---
 net/ipv4/netfilter/nf_tproxy_ipv4.c| 9 +++--
 net/netfilter/nf_conntrack_broadcast.c | 9 +++--
 net/netfilter/nfnetlink_osf.c  | 5 ++---
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c 
b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 164714104965..40c93b3bd731 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -53,6 +53,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
 
 __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 {
+   const struct in_ifaddr *ifa;
struct in_device *indev;
__be32 laddr;
 
@@ -61,10 +62,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 
user_laddr, __be32 daddr)
 
laddr = 0;
indev = __in_dev_get_rcu(skb->dev);
-   for_primary_ifa(indev) {
+
+   in_dev_for_each_ifa_rcu(ifa, indev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
+
laddr = ifa->ifa_local;
break;
-   } endfor_ifa(indev);
+   }
 
return laddr ? laddr : daddr;
 }
diff --git a/net/netfilter/nf_conntrack_broadcast.c 
b/net/netfilter/nf_conntrack_broadcast.c
index 5423b197d98a..a5dbc3676a4f 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -41,12 +41,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
 
in_dev = __in_dev_get_rcu(rt->dst.dev);
if (in_dev != NULL) {
-   for_primary_ifa(in_dev) {
+   const struct in_ifaddr *ifa;
+
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
+
if (ifa->ifa_broadcast == iph->daddr) {
mask = ifa->ifa_mask;
break;
}
-   } endfor_ifa(in_dev);
+   }
}
 
if (mask == 0)
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index f42326b40d6f..9f5dea0064ea 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
 {
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
const struct iphdr *ip = ip_hdr(skb);
+   const struct in_ifaddr *ifa;
int ret = 0;
 
if (ttl_check == NF_OSF_TTL_TRUE)
@@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
else if (ip->ttl <= f_ttl)
return 1;
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(ip->saddr, ifa)) {
ret = (ip->ttl == f_ttl);
break;
}
}
 
-   endfor_ifa(in_dev);
-
return ret;
 }
 
-- 
2.21.0



[PATCH net-next v2 7/7] net: ipv4: provide __rcu annotation for ifa_list

2019-05-31 Thread Florian Westphal
ifa_list is protected by rcu, yet code doesn't reflect this.

Add the __rcu annotations and fix up all places that are now reported by
sparse.

I've done this in the same commit to not add intermediate patches that
result in new warnings.

Reported-by: Eric Dumazet 
Signed-off-by: Florian Westphal 
---
 drivers/infiniband/hw/i40iw/i40iw_utils.c | 12 ++-
 drivers/infiniband/hw/nes/nes.c   |  8 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c   | 15 ++--
 drivers/isdn/hysdn/hysdn_net.c|  6 +-
 drivers/isdn/i4l/isdn_net.c   | 20 -
 drivers/net/ethernet/via/via-velocity.h   |  2 +-
 drivers/net/plip/plip.c   |  4 +-
 drivers/net/vmxnet3/vmxnet3_drv.c | 19 ++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c|  4 +-
 .../net/wireless/marvell/mwifiex/cfg80211.c   |  2 +-
 include/linux/inetdevice.h| 21 ++---
 net/core/netpoll.c| 10 ++-
 net/core/pktgen.c |  8 +-
 net/ipv4/devinet.c| 88 ---
 net/mac80211/main.c   |  4 +-
 net/netfilter/nf_nat_redirect.c   | 12 +--
 16 files changed, 150 insertions(+), 85 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c 
b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index 337410f40860..016524683e17 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -174,10 +174,14 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
rcu_read_lock();
in = __in_dev_get_rcu(upper_dev);
 
-   if (!in->ifa_list)
-   local_ipaddr = 0;
-   else
-   local_ipaddr = ntohl(in->ifa_list->ifa_address);
+   local_ipaddr = 0;
+   if (in) {
+   struct in_ifaddr *ifa;
+
+   ifa = rcu_dereference(in->ifa_list);
+   if (ifa)
+   local_ipaddr = ntohl(ifa->ifa_address);
+   }
 
rcu_read_unlock();
} else {
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index e00add6d78ec..29b324726ea6 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -183,7 +183,13 @@ static int nes_inetaddr_event(struct notifier_block 
*notifier,
 
rcu_read_lock();
in = 
__in_dev_get_rcu(upper_dev);
-   nesvnic->local_ipaddr = 
in->ifa_list->ifa_address;
+   if (in) {
+   struct in_ifaddr *ifa;
+
+   ifa = 
rcu_dereference(in->ifa_list);
+   if (ifa)
+   
nesvnic->local_ipaddr = ifa->ifa_address;
+   }
rcu_read_unlock();
} else {
nesvnic->local_ipaddr = 
ifa->ifa_address;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c 
b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index d88d9f8a7f9a..34c1f9d6c915 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -427,11 +427,16 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
if (netif_carrier_ok(us_ibdev->netdev))
usnic_fwd_carrier_up(us_ibdev->ufdev);
 
-   ind = in_dev_get(netdev);
-   if (ind->ifa_list)
-   usnic_fwd_add_ipaddr(us_ibdev->ufdev,
-ind->ifa_list->ifa_address);
-   in_dev_put(ind);
+   rcu_read_lock();
+   ind = __in_dev_get_rcu(netdev);
+   if (ind) {
+   const struct in_ifaddr *ifa;
+
+   ifa = rcu_dereference(ind->ifa_list);
+   if (ifa)
+   usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
+   }
+   rcu_read_unlock();
 
usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr,
us_ibdev->ufdev->inaddr, &gid.raw[0]);
diff --git a/drivers/isdn/hysdn/hysdn_net.c b/drivers/isdn/hysdn/hysdn_net.c
index 8e9c34f33d86..bea37ae30ebb 100644
--- a/drivers/isdn/hysdn/hysdn_net.c
+++ b/drivers/isdn/hysdn/hysdn_net.c
@@ -70,9 +70,13 @@ net_open(struct net_device *dev)
for (i = 0; i < ETH_ALEN; i++)
dev->dev_addr[i] = 0xfc;
if ((in_dev = dev->ip_ptr) != NULL) {
-   

[PATCH net-next v2 6/7] drivers: use in_dev_for_each_ifa_rtnl/rcu

2019-05-31 Thread Florian Westphal
Like previous patches, use the new iterator macros to avoid sparse
warnings once proper __rcu annotations are added.

Compile tested only.

Signed-off-by: Florian Westphal 
---
 drivers/infiniband/core/roce_gid_mgmt.c  |  5 +++--
 drivers/infiniband/hw/cxgb4/cm.c |  9 +++--
 drivers/infiniband/hw/i40iw/i40iw_cm.c   |  7 +--
 drivers/infiniband/hw/i40iw/i40iw_main.c |  6 --
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |  8 +---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  5 +++--
 drivers/net/wan/hdlc_cisco.c | 11 +--
 7 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/core/roce_gid_mgmt.c 
b/drivers/infiniband/core/roce_gid_mgmt.c
index 558de0b9895c..2860def84f4d 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -330,6 +330,7 @@ static void bond_delete_netdev_default_gids(struct 
ib_device *ib_dev,
 static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
 u8 port, struct net_device *ndev)
 {
+   const struct in_ifaddr *ifa;
struct in_device *in_dev;
struct sin_list {
struct list_headlist;
@@ -349,7 +350,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
return;
}
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 
if (!entry)
@@ -359,7 +360,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
entry->ip.sin_addr.s_addr = ifa->ifa_address;
list_add_tail(&entry->list, &sin_list);
}
-   endfor_ifa(in_dev);
+
rcu_read_unlock();
 
list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) {
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 0f3b1193d5f8..09fcfc9e052d 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3230,17 +3230,22 @@ static int pick_local_ipaddrs(struct c4iw_dev *dev, 
struct iw_cm_id *cm_id)
int found = 0;
struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
+   const struct in_ifaddr *ifa;
 
ind = in_dev_get(dev->rdev.lldi.ports[0]);
if (!ind)
return -EADDRNOTAVAIL;
-   for_primary_ifa(ind) {
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa, ind) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
laddr->sin_addr.s_addr = ifa->ifa_address;
raddr->sin_addr.s_addr = ifa->ifa_address;
found = 1;
break;
}
-   endfor_ifa(ind);
+   rcu_read_unlock();
+
in_dev_put(ind);
return found ? 0 : -EADDRNOTAVAIL;
 }
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c 
b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 8233f5a4e623..700a5d06b60c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -1773,8 +1773,11 @@ static enum i40iw_status_code i40iw_add_mqh_4(
if rdma_vlan_dev_vlan_id(dev) < I40IW_NO_VLAN) &&
  (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
(dev == iwdev->netdev)) && (dev->flags & IFF_UP)) {
+   const struct in_ifaddr *ifa;
+
idev = in_dev_get(dev);
-   for_ifa(idev) {
+
+   in_dev_for_each_ifa_rtnl(ifa, idev) {
i40iw_debug(&iwdev->sc_dev,
I40IW_DEBUG_CM,
"Allocating child CM Listener 
forIP=%pI4, vlan_id=%d, MAC=%pM\n",
@@ -1819,7 +1822,7 @@ static enum i40iw_status_code i40iw_add_mqh_4(

cm_parent_listen_node->cm_core->stats_listen_nodes_created--;
}
}
-   endfor_ifa(idev);
+
in_dev_put(idev);
}
}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c 
b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 10932baee279..d44cf33df81a 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1222,8 +1222,10 @@ static void i40iw_add_ipv4_addr(struct i40iw_device 
*iwdev)
if rdma_vlan_dev_vlan_id(dev) < 0x) &&
  (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
(dev == iwdev->netdev))

[PATCH net-next v2 5/7] net: use new in_dev_ifa iterators

2019-05-31 Thread Florian Westphal
Use in_dev_for_each_ifa_rcu/rtnl instead.
This prevents sparse warnings once proper __rcu annotations are added.

Signed-off-by: Florian Westphal 

t di# Last commands done (6 commands done):
---
 net/ipv4/fib_frontend.c | 24 +---
 net/ipv4/igmp.c |  5 +++--
 net/ipv6/addrconf.c |  4 +---
 net/sctp/protocol.c |  2 +-
 net/smc/smc_clc.c   | 11 +++
 5 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 76055c66326a..c7cdb8d0d164 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -540,14 +540,22 @@ static int rtentry_to_fib_config(struct net *net, int 
cmd, struct rtentry *rt,
cfg->fc_oif = dev->ifindex;
cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
-   struct in_ifaddr *ifa;
-   struct in_device *in_dev = __in_dev_get_rtnl(dev);
+   const struct in_ifaddr *ifa;
+   struct in_device *in_dev;
+
+   in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return -ENODEV;
+
*colon = ':';
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
+   }
+   rcu_read_unlock();
+
if (!ifa)
return -ENODEV;
cfg->fc_prefsrc = ifa->ifa_local;
@@ -1177,8 +1185,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct 
in_ifaddr *iprim)
 *
 * Scan address list to be sure that addresses are really gone.
 */
-
-   for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa1, in_dev) {
if (ifa1 == ifa) {
/* promotion, keep the IP */
gone = 0;
@@ -1246,6 +1254,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct 
in_ifaddr *iprim)
}
}
}
+   rcu_read_unlock();
 
 no_promotions:
if (!(ok & BRD_OK))
@@ -1415,6 +1424,7 @@ static int fib_netdev_event(struct notifier_block *this, 
unsigned long event, vo
struct netdev_notifier_info_ext *info_ext = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+   struct in_ifaddr *ifa;
unsigned int flags;
 
if (event == NETDEV_UNREGISTER) {
@@ -1429,9 +1439,9 @@ static int fib_netdev_event(struct notifier_block *this, 
unsigned long event, vo
 
switch (event) {
case NETDEV_UP:
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
fib_add_ifaddr(ifa);
-   } endfor_ifa(in_dev);
+   }
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev, RTNH_F_DEAD);
 #endif
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6c2febc39dca..719bd8e4eea4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -325,14 +325,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev,
 const struct flowi4 *fl4)
 {
struct in_device *in_dev = __in_dev_get_rcu(dev);
+   const struct in_ifaddr *ifa;
 
if (!in_dev)
return htonl(INADDR_ANY);
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
-   } endfor_ifa(in_dev);
+   }
 
return htonl(INADDR_ANY);
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 683613e7355b..40b154d45ab4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3127,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
struct in_ifaddr *ifa;
-
int flag = scope;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
addr.s6_addr32[3] = ifa->ifa_local;
 
if (ifa->ifa_scope == RT_SCOPE_LINK)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f0631bf486b6..e29cf27cb633 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -96,7 +96,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist,
return;
}
 
-   for (ifa = in_dev-&g

[PATCH net-next v2 2/7] net: inetdevice: provide replacement iterators for in_ifaddr walk

2019-05-31 Thread Florian Westphal
The ifa_list is protected either by rcu or rtnl lock, but the
current iterators do not account for this.

This adds two iterators as replacement, a later patch in
the series will update them with the needed rcu/rtnl_dereference calls.

Its not done in this patch yet to avoid sparse warnings -- the fields
lack the proper __rcu annotation.

Signed-off-by: Florian Westphal 
---
 include/linux/inetdevice.h | 10 +-
 net/ipv4/devinet.c | 31 ---
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 367dc2a0f84a..d5d05503a04b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -186,7 +186,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device 
*in_dev, __be32 dst,
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask);
 struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
-static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa)
+static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa)
 {
return !((addr^ifa->ifa_address)&ifa->ifa_mask);
 }
@@ -215,6 +215,14 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr)
 
 #define endfor_ifa(in_dev) }
 
+#define in_dev_for_each_ifa_rtnl(ifa, in_dev)  \
+   for (ifa = (in_dev)->ifa_list; ifa; \
+ifa = ifa->ifa_next)
+
+#define in_dev_for_each_ifa_rcu(ifa, in_dev)   \
+   for (ifa = (in_dev)->ifa_list; ifa; \
+ifa = ifa->ifa_next)
+
 static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
 {
return rcu_dereference(dev->ip_ptr);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 701c5d113a34..7803a4d2951c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -873,13 +873,12 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, 
struct nlmsghdr *nlh,
 static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
 {
struct in_device *in_dev = ifa->ifa_dev;
-   struct in_ifaddr *ifa1, **ifap;
+   struct in_ifaddr *ifa1;
 
if (!ifa->ifa_local)
return NULL;
 
-   for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
-ifap = &ifa1->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa) &&
ifa1->ifa_local == ifa->ifa_local)
@@ -1208,7 +1207,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, 
struct ifreq *ifr)
 static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int 
size)
 {
struct in_device *in_dev = __in_dev_get_rtnl(dev);
-   struct in_ifaddr *ifa;
+   const struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
 
@@ -1218,7 +1217,7 @@ static int inet_gifconf(struct net_device *dev, char 
__user *buf, int len, int s
if (!in_dev)
goto out;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (!buf) {
done += size;
continue;
@@ -1321,10 +1320,11 @@ EXPORT_SYMBOL(inet_select_addr);
 static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
  __be32 local, int scope)
 {
-   int same = 0;
+   const struct in_ifaddr *ifa;
__be32 addr = 0;
+   int same = 0;
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (!addr &&
(local == ifa->ifa_local || !local) &&
ifa->ifa_scope <= scope) {
@@ -1350,7 +1350,7 @@ static __be32 confirm_addr_indev(struct in_device 
*in_dev, __be32 dst,
same = 0;
}
}
-   } endfor_ifa(in_dev);
+   }
 
return same ? addr : 0;
 }
@@ -1424,7 +1424,7 @@ static void inetdev_changename(struct net_device *dev, 
struct in_device *in_dev)
struct in_ifaddr *ifa;
int named = 0;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
char old[IFNAMSIZ], *dot;
 
memcpy(old, ifa->ifa_label, IFNAMSIZ);
@@ -1454,10 +1454,9 @@ static void inetdev_send_gratuitous_arp(struct 
net_device *dev,
struct in_device *in_dev)
 
 {
-   struct in_ifaddr *ifa;
+   const struct in_ifaddr *ifa;
 
-   for (ifa = in_dev->ifa_list; ifa;
-ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
arp_

[PATCH net-next v3] net: add rcu annotations for ifa_list

2019-05-31 Thread Florian Westphal
v3: fix typo in patch1 commit message
All other patches are unchanged.
v2: remove ifa_list iteration in afs instead of conversion

Eric Dumazet reported following problem:

  It looks that unless RTNL is held, accessing ifa_list needs proper RCU
  protection.  indev->ifa_list can be changed under us by another cpu
  (which owns RTNL) [..]

  A proper rcu_dereference() with an happy sparse support would require
  adding __rcu attribute.

This patch series does that: add __rcu to the ifa_list pointers.
That makes sparse complain, so the series also adds the required
rcu_assign_pointer/dereference helpers where needed.

All patches except the last one are preparation work.
Two new macros are introduced for in_ifaddr walks.

Last patch adds the __rcu annotations and the assign_pointer/dereference
helper calls.

This patch is a bit large, but I found no better way -- other
approaches (annotate-first or add helpers-first) all result in
mid-series sparse warnings.

This series is submitted vs. net-next rather than net for several
reasons:

1. Its (mostly) compile-tested only
2. 3rd patch changes behaviour wrt. secondary addresses
   (see changelog)
3. The problem exists for a very long time (2004), so it doesn't
   seem to be urgent to fix this -- rcu use to free ifa_list
   predates the git era.

Florian Westphal (7):
  afs: do not send list of client addresses
  net: inetdevice: provide replacement iterators for in_ifaddr walk
  devinet: use in_dev_for_each_ifa_rcu in more places
  netfilter: use in_dev_for_each_ifa_rcu
  net: use new in_dev_ifa iterators
  drivers: use in_dev_for_each_ifa_rtnl/rcu
  net: ipv4: provide __rcu annotation for ifa_list

drivers/infiniband/core/roce_gid_mgmt.c  |5 
drivers/infiniband/hw/cxgb4/cm.c |9 -
drivers/infiniband/hw/i40iw/i40iw_cm.c   |7 
drivers/infiniband/hw/i40iw/i40iw_main.c |6 
drivers/infiniband/hw/i40iw/i40iw_utils.c|   12 -
drivers/infiniband/hw/nes/nes.c  |8 
drivers/infiniband/hw/usnic/usnic_ib_main.c  |   15 +
drivers/isdn/hysdn/hysdn_net.c   |6 
drivers/isdn/i4l/isdn_net.c  |   20 +-
drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |8 
drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |5 
drivers/net/ethernet/via/via-velocity.h  |2 
drivers/net/plip/plip.c  |4 
drivers/net/vmxnet3/vmxnet3_drv.c|   19 +-
drivers/net/wan/hdlc_cisco.c |   11 -
drivers/net/wireless/ath/ath6kl/cfg80211.c   |4 
drivers/net/wireless/marvell/mwifiex/cfg80211.c  |2 
fs/afs/Makefile  |1 
fs/afs/cmservice.c   |   24 --
fs/afs/internal.h|   15 -
fs/afs/netdevices.c  |   48 -
include/linux/inetdevice.h   |   19 +-
net/core/netpoll.c   |   10 -
net/core/pktgen.c|8 
net/ipv4/devinet.c   |  146 ++---
net/ipv4/fib_frontend.c  |   24 +-
net/ipv4/igmp.c  |5 
net/ipv4/netfilter/nf_tproxy_ipv4.c  |9 -
net/ipv6/addrconf.c  |4 
net/mac80211/main.c  |4 
net/netfilter/nf_conntrack_broadcast.c   |9 -
net/netfilter/nf_nat_redirect.c  |   12 -
net/netfilter/nfnetlink_osf.c|5 
net/sctp/protocol.c  |2 
net/smc/smc_clc.c|   11 -
 35 files changed, 266 insertions(+), 233 deletions(-)




[PATCH net-next v3 1/7] afs: do not send list of client addresses

2019-05-31 Thread Florian Westphal
David Howells says:
  I'm told that there's not really any point populating the list.
  Current OpenAFS ignores it, as does AuriStor - and IBM AFS 3.6 will
  do the right thing.
  The list is actually useless as it's the client's view of the world,
  not the servers, so if there's any NAT in the way its contents are
  invalid.  Further, it doesn't support IPv6 addresses.

  On that basis, feel free to make it an empty list and remove all the
  interface enumeration.

V1 of this patch reworked the function to use a new helper for the
ifa_list iteration to avoid sparse warnings once the proper __rcu
annotations get added in struct in_device later.

But, in light of the above, just remove afs_get_ipv4_interfaces.

Compile tested only.

Cc: David Howells 
Cc: linux-...@lists.infradead.org
Signed-off-by: Florian Westphal 
Tested-by: David Howells 
---
 fs/afs/Makefile |  1 -
 fs/afs/cmservice.c  | 24 +--
 fs/afs/internal.h   | 15 --
 fs/afs/netdevices.c | 48 -
 4 files changed, 1 insertion(+), 87 deletions(-)
 delete mode 100644 fs/afs/netdevices.c

diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index cbf31f6cd177..10359bea7070 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -29,7 +29,6 @@ kafs-y := \
server.o \
server_list.o \
super.o \
-   netdevices.o \
vlclient.o \
vl_list.o \
vl_probe.o \
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 01437cfe5432..a61d2058c468 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -584,9 +584,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
  */
 static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 {
-   struct afs_interface *ifs;
struct afs_call *call = container_of(work, struct afs_call, work);
-   int loop, nifs;
+   int loop;
 
struct {
struct /* InterfaceAddr */ {
@@ -604,19 +603,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct 
work_struct *work)
 
_enter("");
 
-   nifs = 0;
-   ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
-   if (ifs) {
-   nifs = afs_get_ipv4_interfaces(call->net, ifs, 32, false);
-   if (nifs < 0) {
-   kfree(ifs);
-   ifs = NULL;
-   nifs = 0;
-   }
-   }
-
memset(&reply, 0, sizeof(reply));
-   reply.ia.nifs = htonl(nifs);
 
reply.ia.uuid[0] = call->net->uuid.time_low;
reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid));
@@ -626,15 +613,6 @@ static void SRXAFSCB_TellMeAboutYourself(struct 
work_struct *work)
for (loop = 0; loop < 6; loop++)
reply.ia.uuid[loop + 5] = htonl((s8) 
call->net->uuid.node[loop]);
 
-   if (ifs) {
-   for (loop = 0; loop < nifs; loop++) {
-   reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
-   reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
-   reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
-   }
-   kfree(ifs);
-   }
-
reply.cap.capcount = htonl(1);
reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
afs_send_simple_reply(call, &reply, sizeof(reply));
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 2073c1a3ab4b..a22fa3b77b3c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -724,15 +724,6 @@ struct afs_permits {
struct afs_permit   permits[];  /* List of permits sorted by 
key pointer */
 };
 
-/*
- * record of one of a system's set of network interfaces
- */
-struct afs_interface {
-   struct in_addr  address;/* IPv4 address bound to interface */
-   struct in_addr  netmask;/* netmask applied to address */
-   unsignedmtu;/* MTU of interface */
-};
-
 /*
  * Error prioritisation and accumulation.
  */
@@ -1095,12 +1086,6 @@ extern const struct file_operations 
afs_mntpt_file_operations;
 extern struct vfsmount *afs_d_automount(struct path *);
 extern void afs_mntpt_kill_timer(void);
 
-/*
- * netdevices.c
- */
-extern int afs_get_ipv4_interfaces(struct afs_net *, struct afs_interface *,
-  size_t, bool);
-
 /*
  * proc.c
  */
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
deleted file mode 100644
index 2a009d1939d7..
--- a/fs/afs/netdevices.c
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* AFS network device helpers
- *
- * Copyright (c) 2007 Patrick McHardy 
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include "internal.h"
-
-/*
- * get a list of this system's interface IPv4 addresses, netmasks and MTUs
- * - maxbufs must be at least 1
- * - returns the number of interface recor

[PATCH net-next v3 3/7] devinet: use in_dev_for_each_ifa_rcu in more places

2019-05-31 Thread Florian Westphal
This also replaces spots that used for_primary_ifa().

for_primary_ifa() aborts the loop on the first secondary address seen.

Replace it with either the rcu or rtnl variant of in_dev_for_each_ifa(),
but two places will now also consider secondary addresses too:
inet_addr_onlink() and inet_ifa_byprefix().

I do not understand why they should ignore secondary addresses.

Why would a secondary address not be considered 'on link'?
When matching a prefix, why ignore a matching secondary address?

Other places get converted as well, but gain "->flags & SECONDARY" check.

Signed-off-by: Florian Westphal 
---
 net/ipv4/devinet.c | 27 +++
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7803a4d2951c..b45421b2b734 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -327,15 +327,17 @@ static void inetdev_destroy(struct in_device *in_dev)
 
 int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 {
+   const struct in_ifaddr *ifa;
+
rcu_read_lock();
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(a, ifa)) {
if (!b || inet_ifa_match(b, ifa)) {
rcu_read_unlock();
return 1;
}
}
-   } endfor_ifa(in_dev);
+   }
rcu_read_unlock();
return 0;
 }
@@ -580,12 +582,14 @@ EXPORT_SYMBOL(inetdev_by_index);
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask)
 {
+   struct in_ifaddr *ifa;
+
ASSERT_RTNL();
 
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
return ifa;
-   } endfor_ifa(in_dev);
+   }
return NULL;
 }
 
@@ -1245,17 +1249,22 @@ static int inet_gifconf(struct net_device *dev, char 
__user *buf, int len, int s
 static __be32 in_dev_select_addr(const struct in_device *in_dev,
 int scope)
 {
-   for_primary_ifa(in_dev) {
+   const struct in_ifaddr *ifa;
+
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope)
return ifa->ifa_local;
-   } endfor_ifa(in_dev);
+   }
 
return 0;
 }
 
 __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 {
+   const struct in_ifaddr *ifa;
__be32 addr = 0;
struct in_device *in_dev;
struct net *net = dev_net(dev);
@@ -1266,7 +1275,9 @@ __be32 inet_select_addr(const struct net_device *dev, 
__be32 dst, int scope)
if (!in_dev)
goto no_in_dev;
 
-   for_primary_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
if (ifa->ifa_scope > scope)
continue;
if (!dst || inet_ifa_match(dst, ifa)) {
@@ -1275,7 +1286,7 @@ __be32 inet_select_addr(const struct net_device *dev, 
__be32 dst, int scope)
}
if (!addr)
addr = ifa->ifa_local;
-   } endfor_ifa(in_dev);
+   }
 
if (addr)
goto out_unlock;
-- 
2.21.0



[PATCH net-next v3 4/7] netfilter: use in_dev_for_each_ifa_rcu

2019-05-31 Thread Florian Westphal
Netfilter hooks are always running under rcu read lock, use
the new iterator macro so sparse won't complain once we add
proper __rcu annotations.

Signed-off-by: Florian Westphal 
---
 net/ipv4/netfilter/nf_tproxy_ipv4.c| 9 +++--
 net/netfilter/nf_conntrack_broadcast.c | 9 +++--
 net/netfilter/nfnetlink_osf.c  | 5 ++---
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c 
b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 164714104965..40c93b3bd731 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -53,6 +53,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
 
 __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 {
+   const struct in_ifaddr *ifa;
struct in_device *indev;
__be32 laddr;
 
@@ -61,10 +62,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 
user_laddr, __be32 daddr)
 
laddr = 0;
indev = __in_dev_get_rcu(skb->dev);
-   for_primary_ifa(indev) {
+
+   in_dev_for_each_ifa_rcu(ifa, indev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
+
laddr = ifa->ifa_local;
break;
-   } endfor_ifa(indev);
+   }
 
return laddr ? laddr : daddr;
 }
diff --git a/net/netfilter/nf_conntrack_broadcast.c 
b/net/netfilter/nf_conntrack_broadcast.c
index 5423b197d98a..a5dbc3676a4f 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -41,12 +41,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
 
in_dev = __in_dev_get_rcu(rt->dst.dev);
if (in_dev != NULL) {
-   for_primary_ifa(in_dev) {
+   const struct in_ifaddr *ifa;
+
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
+
if (ifa->ifa_broadcast == iph->daddr) {
mask = ifa->ifa_mask;
break;
}
-   } endfor_ifa(in_dev);
+   }
}
 
if (mask == 0)
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index f42326b40d6f..9f5dea0064ea 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
 {
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
const struct iphdr *ip = ip_hdr(skb);
+   const struct in_ifaddr *ifa;
int ret = 0;
 
if (ttl_check == NF_OSF_TTL_TRUE)
@@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
else if (ip->ttl <= f_ttl)
return 1;
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(ip->saddr, ifa)) {
ret = (ip->ttl == f_ttl);
break;
}
}
 
-   endfor_ifa(in_dev);
-
return ret;
 }
 
-- 
2.21.0



[PATCH net-next v3 6/7] drivers: use in_dev_for_each_ifa_rtnl/rcu

2019-05-31 Thread Florian Westphal
Like previous patches, use the new iterator macros to avoid sparse
warnings once proper __rcu annotations are added.

Compile tested only.

Signed-off-by: Florian Westphal 
---
 drivers/infiniband/core/roce_gid_mgmt.c  |  5 +++--
 drivers/infiniband/hw/cxgb4/cm.c |  9 +++--
 drivers/infiniband/hw/i40iw/i40iw_cm.c   |  7 +--
 drivers/infiniband/hw/i40iw/i40iw_main.c |  6 --
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |  8 +---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  5 +++--
 drivers/net/wan/hdlc_cisco.c | 11 +--
 7 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/core/roce_gid_mgmt.c 
b/drivers/infiniband/core/roce_gid_mgmt.c
index 558de0b9895c..2860def84f4d 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -330,6 +330,7 @@ static void bond_delete_netdev_default_gids(struct 
ib_device *ib_dev,
 static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
 u8 port, struct net_device *ndev)
 {
+   const struct in_ifaddr *ifa;
struct in_device *in_dev;
struct sin_list {
struct list_headlist;
@@ -349,7 +350,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
return;
}
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 
if (!entry)
@@ -359,7 +360,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
entry->ip.sin_addr.s_addr = ifa->ifa_address;
list_add_tail(&entry->list, &sin_list);
}
-   endfor_ifa(in_dev);
+
rcu_read_unlock();
 
list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) {
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 0f3b1193d5f8..09fcfc9e052d 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3230,17 +3230,22 @@ static int pick_local_ipaddrs(struct c4iw_dev *dev, 
struct iw_cm_id *cm_id)
int found = 0;
struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
+   const struct in_ifaddr *ifa;
 
ind = in_dev_get(dev->rdev.lldi.ports[0]);
if (!ind)
return -EADDRNOTAVAIL;
-   for_primary_ifa(ind) {
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa, ind) {
+   if (ifa->ifa_flags & IFA_F_SECONDARY)
+   continue;
laddr->sin_addr.s_addr = ifa->ifa_address;
raddr->sin_addr.s_addr = ifa->ifa_address;
found = 1;
break;
}
-   endfor_ifa(ind);
+   rcu_read_unlock();
+
in_dev_put(ind);
return found ? 0 : -EADDRNOTAVAIL;
 }
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c 
b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 8233f5a4e623..700a5d06b60c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -1773,8 +1773,11 @@ static enum i40iw_status_code i40iw_add_mqh_4(
if rdma_vlan_dev_vlan_id(dev) < I40IW_NO_VLAN) &&
  (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
(dev == iwdev->netdev)) && (dev->flags & IFF_UP)) {
+   const struct in_ifaddr *ifa;
+
idev = in_dev_get(dev);
-   for_ifa(idev) {
+
+   in_dev_for_each_ifa_rtnl(ifa, idev) {
i40iw_debug(&iwdev->sc_dev,
I40IW_DEBUG_CM,
"Allocating child CM Listener 
forIP=%pI4, vlan_id=%d, MAC=%pM\n",
@@ -1819,7 +1822,7 @@ static enum i40iw_status_code i40iw_add_mqh_4(

cm_parent_listen_node->cm_core->stats_listen_nodes_created--;
}
}
-   endfor_ifa(idev);
+
in_dev_put(idev);
}
}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c 
b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 10932baee279..d44cf33df81a 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1222,8 +1222,10 @@ static void i40iw_add_ipv4_addr(struct i40iw_device 
*iwdev)
if rdma_vlan_dev_vlan_id(dev) < 0x) &&
  (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
(dev == iwdev->netdev))

[PATCH net-next v3 5/7] net: use new in_dev_ifa iterators

2019-05-31 Thread Florian Westphal
Use in_dev_for_each_ifa_rcu/rtnl instead.
This prevents sparse warnings once proper __rcu annotations are added.

Signed-off-by: Florian Westphal 

t di# Last commands done (6 commands done):
---
 net/ipv4/fib_frontend.c | 24 +---
 net/ipv4/igmp.c |  5 +++--
 net/ipv6/addrconf.c |  4 +---
 net/sctp/protocol.c |  2 +-
 net/smc/smc_clc.c   | 11 +++
 5 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 76055c66326a..c7cdb8d0d164 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -540,14 +540,22 @@ static int rtentry_to_fib_config(struct net *net, int 
cmd, struct rtentry *rt,
cfg->fc_oif = dev->ifindex;
cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
-   struct in_ifaddr *ifa;
-   struct in_device *in_dev = __in_dev_get_rtnl(dev);
+   const struct in_ifaddr *ifa;
+   struct in_device *in_dev;
+
+   in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return -ENODEV;
+
*colon = ':';
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
+   }
+   rcu_read_unlock();
+
if (!ifa)
return -ENODEV;
cfg->fc_prefsrc = ifa->ifa_local;
@@ -1177,8 +1185,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct 
in_ifaddr *iprim)
 *
 * Scan address list to be sure that addresses are really gone.
 */
-
-   for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+   rcu_read_lock();
+   in_dev_for_each_ifa_rcu(ifa1, in_dev) {
if (ifa1 == ifa) {
/* promotion, keep the IP */
gone = 0;
@@ -1246,6 +1254,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct 
in_ifaddr *iprim)
}
}
}
+   rcu_read_unlock();
 
 no_promotions:
if (!(ok & BRD_OK))
@@ -1415,6 +1424,7 @@ static int fib_netdev_event(struct notifier_block *this, 
unsigned long event, vo
struct netdev_notifier_info_ext *info_ext = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+   struct in_ifaddr *ifa;
unsigned int flags;
 
if (event == NETDEV_UNREGISTER) {
@@ -1429,9 +1439,9 @@ static int fib_netdev_event(struct notifier_block *this, 
unsigned long event, vo
 
switch (event) {
case NETDEV_UP:
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
fib_add_ifaddr(ifa);
-   } endfor_ifa(in_dev);
+   }
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev, RTNH_F_DEAD);
 #endif
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6c2febc39dca..719bd8e4eea4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -325,14 +325,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev,
 const struct flowi4 *fl4)
 {
struct in_device *in_dev = __in_dev_get_rcu(dev);
+   const struct in_ifaddr *ifa;
 
if (!in_dev)
return htonl(INADDR_ANY);
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
-   } endfor_ifa(in_dev);
+   }
 
return htonl(INADDR_ANY);
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 683613e7355b..40b154d45ab4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3127,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
struct in_ifaddr *ifa;
-
int flag = scope;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
addr.s6_addr32[3] = ifa->ifa_local;
 
if (ifa->ifa_scope == RT_SCOPE_LINK)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f0631bf486b6..e29cf27cb633 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -96,7 +96,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist,
return;
}
 
-   for (ifa = in_dev-&g

[PATCH net-next v3 2/7] net: inetdevice: provide replacement iterators for in_ifaddr walk

2019-05-31 Thread Florian Westphal
The ifa_list is protected either by rcu or rtnl lock, but the
current iterators do not account for this.

This adds two iterators as replacement, a later patch in
the series will update them with the needed rcu/rtnl_dereference calls.

Its not done in this patch yet to avoid sparse warnings -- the fields
lack the proper __rcu annotation.

Signed-off-by: Florian Westphal 
---
 include/linux/inetdevice.h | 10 +-
 net/ipv4/devinet.c | 31 ---
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 367dc2a0f84a..d5d05503a04b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -186,7 +186,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device 
*in_dev, __be32 dst,
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask);
 struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
-static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa)
+static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa)
 {
return !((addr^ifa->ifa_address)&ifa->ifa_mask);
 }
@@ -215,6 +215,14 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr)
 
 #define endfor_ifa(in_dev) }
 
+#define in_dev_for_each_ifa_rtnl(ifa, in_dev)  \
+   for (ifa = (in_dev)->ifa_list; ifa; \
+ifa = ifa->ifa_next)
+
+#define in_dev_for_each_ifa_rcu(ifa, in_dev)   \
+   for (ifa = (in_dev)->ifa_list; ifa; \
+ifa = ifa->ifa_next)
+
 static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
 {
return rcu_dereference(dev->ip_ptr);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 701c5d113a34..7803a4d2951c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -873,13 +873,12 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, 
struct nlmsghdr *nlh,
 static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
 {
struct in_device *in_dev = ifa->ifa_dev;
-   struct in_ifaddr *ifa1, **ifap;
+   struct in_ifaddr *ifa1;
 
if (!ifa->ifa_local)
return NULL;
 
-   for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
-ifap = &ifa1->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa) &&
ifa1->ifa_local == ifa->ifa_local)
@@ -1208,7 +1207,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, 
struct ifreq *ifr)
 static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int 
size)
 {
struct in_device *in_dev = __in_dev_get_rtnl(dev);
-   struct in_ifaddr *ifa;
+   const struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
 
@@ -1218,7 +1217,7 @@ static int inet_gifconf(struct net_device *dev, char 
__user *buf, int len, int s
if (!in_dev)
goto out;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (!buf) {
done += size;
continue;
@@ -1321,10 +1320,11 @@ EXPORT_SYMBOL(inet_select_addr);
 static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
  __be32 local, int scope)
 {
-   int same = 0;
+   const struct in_ifaddr *ifa;
__be32 addr = 0;
+   int same = 0;
 
-   for_ifa(in_dev) {
+   in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (!addr &&
(local == ifa->ifa_local || !local) &&
ifa->ifa_scope <= scope) {
@@ -1350,7 +1350,7 @@ static __be32 confirm_addr_indev(struct in_device 
*in_dev, __be32 dst,
same = 0;
}
}
-   } endfor_ifa(in_dev);
+   }
 
return same ? addr : 0;
 }
@@ -1424,7 +1424,7 @@ static void inetdev_changename(struct net_device *dev, 
struct in_device *in_dev)
struct in_ifaddr *ifa;
int named = 0;
 
-   for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
char old[IFNAMSIZ], *dot;
 
memcpy(old, ifa->ifa_label, IFNAMSIZ);
@@ -1454,10 +1454,9 @@ static void inetdev_send_gratuitous_arp(struct 
net_device *dev,
struct in_device *in_dev)
 
 {
-   struct in_ifaddr *ifa;
+   const struct in_ifaddr *ifa;
 
-   for (ifa = in_dev->ifa_list; ifa;
-ifa = ifa->ifa_next) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
arp_

[PATCH net-next v3 7/7] net: ipv4: provide __rcu annotation for ifa_list

2019-05-31 Thread Florian Westphal
ifa_list is protected by rcu, yet code doesn't reflect this.

Add the __rcu annotations and fix up all places that are now reported by
sparse.

I've done this in the same commit to not add intermediate patches that
result in new warnings.

Reported-by: Eric Dumazet 
Signed-off-by: Florian Westphal 
---
 drivers/infiniband/hw/i40iw/i40iw_utils.c | 12 ++-
 drivers/infiniband/hw/nes/nes.c   |  8 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c   | 15 ++--
 drivers/isdn/hysdn/hysdn_net.c|  6 +-
 drivers/isdn/i4l/isdn_net.c   | 20 -
 drivers/net/ethernet/via/via-velocity.h   |  2 +-
 drivers/net/plip/plip.c   |  4 +-
 drivers/net/vmxnet3/vmxnet3_drv.c | 19 ++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c|  4 +-
 .../net/wireless/marvell/mwifiex/cfg80211.c   |  2 +-
 include/linux/inetdevice.h| 21 ++---
 net/core/netpoll.c| 10 ++-
 net/core/pktgen.c |  8 +-
 net/ipv4/devinet.c| 88 ---
 net/mac80211/main.c   |  4 +-
 net/netfilter/nf_nat_redirect.c   | 12 +--
 16 files changed, 150 insertions(+), 85 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c 
b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index 337410f40860..016524683e17 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -174,10 +174,14 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
rcu_read_lock();
in = __in_dev_get_rcu(upper_dev);
 
-   if (!in->ifa_list)
-   local_ipaddr = 0;
-   else
-   local_ipaddr = ntohl(in->ifa_list->ifa_address);
+   local_ipaddr = 0;
+   if (in) {
+   struct in_ifaddr *ifa;
+
+   ifa = rcu_dereference(in->ifa_list);
+   if (ifa)
+   local_ipaddr = ntohl(ifa->ifa_address);
+   }
 
rcu_read_unlock();
} else {
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index e00add6d78ec..29b324726ea6 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -183,7 +183,13 @@ static int nes_inetaddr_event(struct notifier_block 
*notifier,
 
rcu_read_lock();
in = 
__in_dev_get_rcu(upper_dev);
-   nesvnic->local_ipaddr = 
in->ifa_list->ifa_address;
+   if (in) {
+   struct in_ifaddr *ifa;
+
+   ifa = 
rcu_dereference(in->ifa_list);
+   if (ifa)
+   
nesvnic->local_ipaddr = ifa->ifa_address;
+   }
rcu_read_unlock();
} else {
nesvnic->local_ipaddr = 
ifa->ifa_address;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c 
b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index d88d9f8a7f9a..34c1f9d6c915 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -427,11 +427,16 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
if (netif_carrier_ok(us_ibdev->netdev))
usnic_fwd_carrier_up(us_ibdev->ufdev);
 
-   ind = in_dev_get(netdev);
-   if (ind->ifa_list)
-   usnic_fwd_add_ipaddr(us_ibdev->ufdev,
-ind->ifa_list->ifa_address);
-   in_dev_put(ind);
+   rcu_read_lock();
+   ind = __in_dev_get_rcu(netdev);
+   if (ind) {
+   const struct in_ifaddr *ifa;
+
+   ifa = rcu_dereference(ind->ifa_list);
+   if (ifa)
+   usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
+   }
+   rcu_read_unlock();
 
usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr,
us_ibdev->ufdev->inaddr, &gid.raw[0]);
diff --git a/drivers/isdn/hysdn/hysdn_net.c b/drivers/isdn/hysdn/hysdn_net.c
index 8e9c34f33d86..bea37ae30ebb 100644
--- a/drivers/isdn/hysdn/hysdn_net.c
+++ b/drivers/isdn/hysdn/hysdn_net.c
@@ -70,9 +70,13 @@ net_open(struct net_device *dev)
for (i = 0; i < ETH_ALEN; i++)
dev->dev_addr[i] = 0xfc;
if ((in_dev = dev->ip_ptr) != NULL) {
-   

Re: [PATCH net-next v2 1/7] afs: do not send list of client addresses

2019-05-31 Thread Florian Westphal
David Howells  wrote:
> Florian Westphal  wrote:
> 
> > David Howell says:
> 
> "Howells"

My bad.

> Apart from that:
> 
> Tested-by: David Howells 

Thanks, a lot, I've re-submitted this as v3 retaining your tested-by.


Re: [PATCH nf-next] netfilter: add support for matching IPv4 options

2019-06-01 Thread Florian Westphal
Pablo Neira Ayuso  wrote:
> > »   iph = skb_header_pointer(skb, *offset, sizeof(_iph), &_iph);
> > »   if (!iph || skb->protocol != htons(ETH_P_IP))
> > »   »   return -EBADMSG;
> 
> I mean, you make this check upfront from the _eval() path, ie.
> 
> static void nft_exthdr_ipv4_eval(const struct nft_expr *expr,
>  ...
> {
> ...
> 
> if (skb->protocol != htons(ETH_P_IP))
> goto err;

Wouldn't it be preferable to just use nft_pf() != NFPROTO_IPV4?


Re: [PATCH nf-next] netfilter: add support for matching IPv4 options

2019-06-01 Thread Florian Westphal
Pablo Neira Ayuso  wrote:
> > > if (skb->protocol != htons(ETH_P_IP))
> > > goto err;
> > 
> > Wouldn't it be preferable to just use nft_pf() != NFPROTO_IPV4?
> 
> Then IPv4 options extension won't work from bridge and netdev families
> too, right?

Ah, right.


[PATCH net-next] net: ipv4: fix rcu lockdep splat due to wrong annotation

2019-06-03 Thread Florian Westphal
syzbot triggered following splat when strict netlink
validation is enabled:

net/ipv4/devinet.c:1766 suspicious rcu_dereference_check() usage!

This occurs because we hold RTNL mutex, but no rcu read lock.
The second call site holds both, so just switch to the _rtnl variant.

Reported-by: syzbot+bad6e32808a3a97b1...@syzkaller.appspotmail.com
Fixes: 2638eb8b50cf ("net: ipv4: provide __rcu annotation for ifa_list")
Signed-off-by: Florian Westphal 
---
 net/ipv4/devinet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ebaea05b4033..ed2e2dc745cd 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1763,7 +1763,7 @@ static int in_dev_dump_addr(struct in_device *in_dev, 
struct sk_buff *skb,
int ip_idx = 0;
int err;
 
-   in_dev_for_each_ifa_rcu(ifa, in_dev) {
+   in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (ip_idx < s_ip_idx) {
ip_idx++;
continue;
-- 
2.21.0



Re: [ipsec-next:testing 4/6] net/xfrm/xfrm_state.c:1792:9: error: '__xfrm6_tmpl_sort_cmp' undeclared; did you mean 'xfrm_tmpl_sort'?

2019-06-05 Thread Florian Westphal
kbuild test robot  wrote:
> tree:   
> https://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git 
> testing
> head:   ca78a3eaad69bd08ba41c144c21881dc694d4a32
> commit: 8dc6e3891a4be64c0cca5e8fe2c3ad33bc06543e [4/6] xfrm: remove state and 
> template sort indirections from xfrm_state_afinfo
> config: i386-randconfig-x003-201922 (attached as .config)
> compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
> reproduce:
> git checkout 8dc6e3891a4be64c0cca5e8fe2c3ad33bc06543e
> # save the attached .config to linux build tree
> make ARCH=i386 
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot 
> 
> All errors (new ones prefixed by >>):
> 
>net/xfrm/xfrm_state.c: In function 'xfrm_tmpl_sort':
> >> net/xfrm/xfrm_state.c:1792:9: error: '__xfrm6_tmpl_sort_cmp' undeclared 
> >> (first use in this function); did you mean 'xfrm_tmpl_sort'?
> __xfrm6_tmpl_sort_cmp, 5);
> ^
> xfrm_tmpl_sort
>net/xfrm/xfrm_state.c:1792:9: note: each undeclared identifier is reported 
> only once for each function it appears in
>net/xfrm/xfrm_state.c: In function 'xfrm_state_sort':
> >> net/xfrm/xfrm_state.c:1806:9: error: '__xfrm6_state_sort_cmp' undeclared 
> >> (first use in this function); did you mean '__xfrm6_state_addr_cmp'?
> __xfrm6_state_sort_cmp, 6);
> ^~
> __xfrm6_state_addr_cmp

this lacks stubs for CONFIG_IPV6=n case.

Steffen, as this is still only in your testing branch, I suggest you
squash this snipped into commit 8dc6e3891a4be64c0cca5e8fe2c3ad33bc06543e
("xfrm: remove state and template sort indirections from xfrm_state_afinfo"),
it resolves this problem for me.  Otherwise, I can make a formal submit,
just let me know.

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1845,6 +1845,9 @@ static int __xfrm6_tmpl_sort_cmp(const void *p)
return 4;
 }
 #else
+static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; }
+static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; }
+
 static inline void
 __xfrm6_sort(void **dst, void **src, int n,
 int (*cmp)(const void *p), int maxclass)


Re: [ipsec-next:testing 4/6] net/xfrm/xfrm_state.c:1792:9: error: '__xfrm6_tmpl_sort_cmp' undeclared; did you mean 'xfrm_tmpl_sort'?

2019-06-06 Thread Florian Westphal
Steffen Klassert  wrote:
> On Wed, Jun 05, 2019 at 02:40:45PM +0200, Florian Westphal wrote:
> > 
> > Steffen, as this is still only in your testing branch, I suggest you
> > squash this snipped into commit 8dc6e3891a4be64c0cca5e8fe2c3ad33bc06543e
> > ("xfrm: remove state and template sort indirections from 
> > xfrm_state_afinfo"),
> > it resolves this problem for me.
> 
> Ok, I did that. Please doublecheck my work.

Looks good to me, also, the updated ipsec-next/testing now builds with
CONFIG_IPV6=n.

Thanks,
Florian


Re: [RFC net-next v2 1/1] net: sched: protect against loops in TC filter hooks

2019-06-06 Thread Florian Westphal
John Hurley  wrote:
> TC hooks allow the application of filters and actions to packets at both
> ingress and egress of the network stack. It is possible, with poor
> configuration, that this can produce loops whereby an ingress hook calls
> a mirred egress action that has an egress hook that redirects back to
> the first ingress etc. The TC core classifier protects against loops when
> doing reclassifies but there is no protection against a packet looping
> between multiple hooks. This can lead to stack overflow panics among other
> things.
> 
> Previous versions of the kernel (<4.2) had a TTL count in the tc_verd skb
> member that protected against loops. This was removed and the tc_verd
> variable replaced by bit fields.
> 
> Extend the TC fields in the skb with an additional 2 bits to store the TC
> hop count. This should use existing allocated memory in the skb.
> 
> Add the checking and setting of the new hop count to the act_mirred file
> given that it is the source of the loops. This means that the code
> additions are not in the main datapath.
> 
> v1->v2
> - change from per cpu counter to per skb tracking (Jamal)
> - move check/update from fast path to act_mirred (Daniel)
> 
> Signed-off-by: John Hurley 
> ---
>  include/linux/skbuff.h | 2 ++
>  net/sched/act_mirred.c | 9 +
>  2 files changed, 11 insertions(+)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 2ee5e63..f0dbc5b 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -645,6 +645,7 @@ typedef unsigned char *sk_buff_data_t;
>   *   @tc_at_ingress: used within tc_classify to distinguish in/egress
>   *   @tc_redirected: packet was redirected by a tc action
>   *   @tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect
> + *   @tc_hop_count: hop counter to prevent packet loops
>   *   @peeked: this packet has been seen already, so stats have been
>   *   done for it, don't do them again
>   *   @nf_trace: netfilter packet trace flag
> @@ -827,6 +828,7 @@ struct sk_buff {
>   __u8tc_at_ingress:1;
>   __u8tc_redirected:1;
>   __u8tc_from_ingress:1;
> + __u8tc_hop_count:2;

I dislike this, why can't we just use a pcpu counter?

The only problem is with recursion/nesting; whenever we
hit something that queues the skb for later we're safe.

We can't catch loops in real (physical) setups either,
e.g. bridge looping back on itself.


Re: [RFC net-next v2 1/1] net: sched: protect against loops in TC filter hooks

2019-06-06 Thread Florian Westphal
John Hurley  wrote:
> On Thu, Jun 6, 2019 at 1:58 PM Florian Westphal  wrote:
> > I dislike this, why can't we just use a pcpu counter?
> >
> > The only problem is with recursion/nesting; whenever we
> > hit something that queues the skb for later we're safe.
> >
> 
> Hi Florian,
> The per cpu counter (initial proposal) should protect against
> recursion through loops and the potential stack overflows.
> It will not protect against a packet infinitely looping through a poor
> configuration if (as you say) the packet is queued at some point and
> the cpu counter reset.

Yes, it won't help, but thats not harmful, such cycle will be
broken/resolved as soon as the configuration is fixed.

> The per skb tracking seems to accommodate both issues.

Yes, but I do not see the 'looping with queueing' as a problem,
it can also occur for different reasons.

> Do you see the how the cpu counter could stop infinite loops in the
> case of queuing?

No, but I don't think it has to.

> Or perhaps these are 2 different issues and should be treated differently?

The recursion is a problem, so, yes, I think these are different issues.

> > We can't catch loops in real (physical) setups either,
> > e.g. bridge looping back on itself.
> 
> yes, this is only targeted at 'internal' loops.

Right, however, I'm not sure we should bother with those, we can't
prevent this (looping packets) from happening for other reasons.

I'm sure you can make packets go in circles without tc, e.g. via
veth+bridge+netns.


Re: [RFC net-next v2 1/1] net: sched: protect against loops in TC filter hooks

2019-06-06 Thread Florian Westphal
David Miller  wrote:
> From: Florian Westphal 
> Date: Thu, 6 Jun 2019 14:58:18 +0200
> 
> >> @@ -827,6 +828,7 @@ struct sk_buff {
> >>__u8tc_at_ingress:1;
> >>__u8tc_redirected:1;
> >>__u8tc_from_ingress:1;
> >> +  __u8tc_hop_count:2;
> > 
> > I dislike this, why can't we just use a pcpu counter?
> 
> I understand that it's because the only precise context is per-SKB not
> per-cpu doing packet processing.  This has been discussed before.

I don't think its worth it, and it won't work with physical-world
loops (e.g. a bridge setup with no spanning tree and a closed loop).

Also I fear that if we start to do this for tc, we will also have to
followup later with more l2 hopcounts for other users, e.g. veth,
bridge, ovs, and so on.


Re: [PATCH] netfilter: nft_paylaod: add base type NFT_PAYLOAD_LL_HEADER_NO_TAG

2019-06-10 Thread Florian Westphal
we...@ucloud.cn  wrote:
> From: wenxu 
> 
> nft add rule bridge firewall rule-100-ingress ip protocol icmp drop

nft --debug=netlink add rule bridge firewall rule-100-ingress ip protocol icmp 
drop
bridge firewall rule-100-ingress
  [ payload load 2b @ link header + 12 => reg 1 ]
  [ cmp eq reg 1 0x0008 ]
  [ payload load 1b @ network header + 9 => reg 1 ]
  [ cmp eq reg 1 0x0001 ]
  [ immediate reg 0 drop ]

so problem is that nft inserts a dependency on the ethernet protocol
type (0x800).

But when vlan is involved, that will fail to compare.

It would also fail for qinq etc.

Because of vlan tag offload, the rule about will probably already work
just fine when nft userspace is patched to insert the dependency based
on 'meta protocol'.  Can you see if this patch works?

Subject: Change bridge l3 dependency to meta protocol

This examines skb->protocol instead of ethernet header type, which
might be different when vlan is involved.

nft payload expression will re-insert the vlan tag so ether type
will not be ETH_P_IP.

---
 src/meta.c|  6 +-
 src/payload.c | 20 
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/meta.c b/src/meta.c
index 583e790ff47d..1e8964eb48c4 100644
--- a/src/meta.c
+++ b/src/meta.c
@@ -539,7 +539,11 @@ static void meta_expr_pctx_update(struct proto_ctx *ctx,
proto_ctx_update(ctx, PROTO_BASE_TRANSPORT_HDR, 
&expr->location, desc);
break;
case NFT_META_PROTOCOL:
-   if (h->base < PROTO_BASE_NETWORK_HDR && ctx->family != 
NFPROTO_NETDEV)
+   if (h->base != PROTO_BASE_LL_HDR)
+   return;
+
+   if (ctx->family != NFPROTO_NETDEV &&
+   ctx->family != NFPROTO_BRIDGE)
return;
 
desc = proto_find_upper(h->desc, 
ntohs(mpz_get_uint16(right->value)));
diff --git a/src/payload.c b/src/payload.c
index 6a8118ece890..c99bb2f69977 100644
--- a/src/payload.c
+++ b/src/payload.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -307,6 +308,19 @@ payload_gen_special_dependency(struct eval_ctx *ctx, const 
struct expr *expr)
return NULL;
 }
 
+static const struct proto_desc proto_metaeth = {
+   .name   = "ethmeta",
+   .base   = PROTO_BASE_LL_HDR,
+   .protocols  = {
+   PROTO_LINK(__constant_htons(ETH_P_IP),   &proto_ip),
+   PROTO_LINK(__constant_htons(ETH_P_ARP),  &proto_arp),
+   PROTO_LINK(__constant_htons(ETH_P_IPV6), &proto_ip6),
+   },
+   .templates  = {
+   [0] = PROTO_META_TEMPLATE("protocol", ðertype_type, 
NFT_META_PROTOCOL, 16),
+   },
+};
+
 /**
  * payload_gen_dependency - generate match expression on payload dependency
  *
@@ -369,6 +383,12 @@ int payload_gen_dependency(struct eval_ctx *ctx, const 
struct expr *expr,
  "no %s protocol specified",
  proto_base_names[expr->payload.base - 1]);
 
+   if (ctx->pctx.family == NFPROTO_BRIDGE && desc == &proto_eth) {
+   if (expr->payload.desc == &proto_ip ||
+   expr->payload.desc == &proto_ip6)
+   desc = &proto_metaeth;
+   }
+
return payload_add_dependency(ctx, desc, expr->payload.desc, expr, res);
 }
 
-- 
2.21.0



Re: [PATCH] Fix dumping vlan rules

2019-07-13 Thread Florian Westphal
michael-...@fami-braun.de  wrote:
> From: "M. Braun" 
> 
> Given the following bridge rules:
> 1. ip protocol icmp accept
> 2. ether type vlan vlan type ip ip protocol icmp accept
> 
> The are currently both dumped by "nft list ruleset" as
> 1. ip protocol icmp accept
> 2. ip protocol icmp accept

Yes, thats a bug, the dependency removal is incorrect.

> +++ b/src/payload.c
> @@ -506,6 +506,18 @@ static bool payload_may_dependency_kill(struct 
> payload_dep_ctx *ctx,
>dep->left->payload.desc == &proto_ip6) &&
>   expr->payload.base == PROTO_BASE_TRANSPORT_HDR)
>   return false;
> + /* Do not kill
> +  *  ether type vlan and vlan type ip and ip protocol icmp
> +  * into
> +  *  ip protocol icmp
> +  * as this lacks ether type vlan.
> +  * More generally speaking, do not kill protocol type
> +  * for stacked protocols if we only have protcol type matches.
> +  */
> + if (dep->left->etype == EXPR_PAYLOAD && dep->op == OP_EQ &&
> + expr->flags & EXPR_F_PROTOCOL &&
> + expr->payload.base == dep->left->payload.base)
> + return false;

Can you please add a test case for this problem to
tests/py/bridge/vlan.t so we catch this when messing with dependency
handling in the future?

Also, please submit v2 directly to netfilter-devel@.

Thanks!


[RFC net] net: generate icmp redirects after netfilter forward hook

2019-07-18 Thread Florian Westphal
Quoting from
https://bugzilla.kernel.org/show_bug.cgi?id=204155

 
1. Configure an PPPoE interface (e.g., ppp0) with an IPv6 address and
   a prefixlen of 64; e.g., 2001:0DB8::1/64.
2. Configure a Netfilter ip6table rule to drop IN to OUT forwarding of traffic
   across the PPP interface: ip6tables -A FORWARD -i ppp+ -o ppp+ -j DROP
3. Create a Netfilter NFLOG rule to log all outbound traffic.
4. From an Internet IPv6 Address initiate TCP traffic to an IPv6 address
   within the /64 IPv6 address space from step 1, but an IPv6 address that
   is NOT configured on that interface; e.g., ` nc 2001:0DB8::2 80`
5. Observe the NFLOG showing the Netfilter ip6table filter FORWARD rule
   is matched and therefore the traffic should be dropped.
6. Observe the traffic from step 4, that should have been dropped,
   resulted in an outbound ICMPv6 Redirect with a source IPv6 address of
   the PPP interface’s Local Link to the Internet IPv6 Address.
 

Problem is that we emit the redirect before passing the packet to the
netfilter FORWARD hook.   The same "problem" exists in ipv4.

There are various counter-arguments to changing this, e.g.  we would
still emit such redirect when packet is dropped later in the stack
(e.g. in POSTROUTING or qdisc).

We will also still emit e.g. ICMPV6 PKTTOOBIG error, as that occurs
before FORWARD as well, and moving that seems wrong (packet
has to be dropped).

The only argument that I can think of in favor of this change
is the lack of a proper alternative to filtering such traffic.

PREROUTING would work, but at that point we lack the
"packet will be forwarded from ppp0 to ppp0" information that
we only have available in FORWARD.

Compile tested only.

Cc: Jason Muskat 
Signed-off-by: Florian Westphal 
---
 net/ipv4/ip_forward.c | 16 +--
 net/ipv6/ip6_output.c | 63 ---
 2 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 06f6f280b9ff..33c46470c966 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -69,6 +69,14 @@ static int ip_forward_finish(struct net *net, struct sock 
*sk, struct sk_buff *s
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
 
+   /*
+*  We now generate an ICMP HOST REDIRECT giving the route
+*  we calculated.
+*/
+   if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
+   !skb_sec_path(skb))
+   ip_rt_send_redirect(skb);
+
 #ifdef CONFIG_NET_SWITCHDEV
if (skb->offload_l3_fwd_mark) {
consume_skb(skb);
@@ -143,14 +151,6 @@ int ip_forward(struct sk_buff *skb)
/* Decrease ttl after skb cow done */
ip_decrease_ttl(iph);
 
-   /*
-*  We now generate an ICMP HOST REDIRECT giving the route
-*  we calculated.
-*/
-   if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
-   !skb_sec_path(skb))
-   ip_rt_send_redirect(skb);
-
if (net->ipv4.sysctl_ip_fwd_update_priority)
skb->priority = rt_tos2priority(iph->tos);
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8e49fd62eea9..2dafd2da2926 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -383,11 +383,45 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 struct sk_buff *skb)
 {
+   struct inet6_skb_parm *opt = IP6CB(skb);
struct dst_entry *dst = skb_dst(skb);
 
__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, 
skb->len);
 
+   /* IPv6 specs say nothing about it, but it is clear that we cannot
+  send redirects to source routed frames.
+  We don't send redirects to frames decapsulated from IPsec.
+*/
+   if (IP6CB(skb)->iif == dst->dev->ifindex &&
+   opt->srcrt == 0 && !skb_sec_path(skb)) {
+   struct ipv6hdr *hdr = ipv6_hdr(skb);
+   struct in6_addr *target = NULL;
+   struct inet_peer *peer;
+   struct rt6_info *rt;
+
+   /*
+*  incoming and outgoing devices are the same
+*  send a redirect.
+*/
+
+   rt = (struct rt6_info *) dst;
+   if (rt->rt6i_flags & RTF_GATEWAY)
+   target = &rt->rt6i_gateway;
+   else
+   target = &hdr->daddr;
+
+   peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
+
+   /* Limit redirects both by destination (here)
+  and by sour

Re: [PATCH net-next 1/3] flow_offload: move tc indirect block to flow offload

2019-07-25 Thread Florian Westphal
we...@ucloud.cn  wrote:
> From: wenxu 
> 
> move tc indirect block to flow_offload.c. The nf_tables
> can use the indr block architecture.

... to do what?  Can you please illustrate how this is going to be
used/useful?


Re: [PATCH net-next 2/3] flow_offload: Support get tcf block immediately

2019-07-25 Thread Florian Westphal
we...@ucloud.cn  wrote:
> From: wenxu 
> 
> It provide a callback to find the tcf block in
> the flow_indr_block_dev_get

Can you explain why you're making this change?
This will help us understand the concept/idea of your series.

The above describes what the patch does, but it should
explain why this is callback is added.


Re: [PATCH net] inet: frags: re-introduce skb coalescing for local delivery

2019-08-06 Thread Florian Westphal
Guillaume Nault  wrote:
> Before commit d4289fcc9b16 ("net: IP6 defrag: use rbtrees for IPv6
> defrag"), a netperf UDP_STREAM test[0] using big IPv6 datagrams (thus
> generating many fragments) and running over an IPsec tunnel, reported
> more than 6Gbps throughput. After that patch, the same test gets only
> 9Mbps when receiving on a be2net nic (driver can make a big difference
> here, for example, ixgbe doesn't seem to be affected).
> 
> By reusing the IPv4 defragmentation code, IPv6 lost fragment coalescing
> (IPv4 fragment coalescing was dropped by commit 14fe22e33462 ("Revert
> "ipv4: use skb coalescing in defragmentation"")).

[..]

> This patch is quite conservative and only coalesces skbs for local
> IPv4 and IPv6 delivery (in order to avoid changing skb geometry when
> forwarding). Coalescing could be extended in the future if need be, as
> more scenarios would probably benefit from it.

No objections from my side, so:

Acked-by: Florian Westphal 


[PATCH ipsec] xfrm: policy: avoid warning splat when merging nodes

2019-08-12 Thread Florian Westphal
syzbot reported a splat:
 xfrm_policy_inexact_list_reinsert+0x625/0x6e0 net/xfrm/xfrm_policy.c:877
 CPU: 1 PID: 6756 Comm: syz-executor.1 Not tainted 5.3.0-rc2+ #57
 Call Trace:
  xfrm_policy_inexact_node_reinsert net/xfrm/xfrm_policy.c:922 [inline]
  xfrm_policy_inexact_node_merge net/xfrm/xfrm_policy.c:958 [inline]
  xfrm_policy_inexact_insert_node+0x537/0xb50 net/xfrm/xfrm_policy.c:1023
  xfrm_policy_inexact_alloc_chain+0x62b/0xbd0 net/xfrm/xfrm_policy.c:1139
  xfrm_policy_inexact_insert+0xe8/0x1540 net/xfrm/xfrm_policy.c:1182
  xfrm_policy_insert+0xdf/0xce0 net/xfrm/xfrm_policy.c:1574
  xfrm_add_policy+0x4cf/0x9b0 net/xfrm/xfrm_user.c:1670
  xfrm_user_rcv_msg+0x46b/0x720 net/xfrm/xfrm_user.c:2676
  netlink_rcv_skb+0x1f0/0x460 net/netlink/af_netlink.c:2477
  xfrm_netlink_rcv+0x74/0x90 net/xfrm/xfrm_user.c:2684
  netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
  netlink_unicast+0x809/0x9a0 net/netlink/af_netlink.c:1328
  netlink_sendmsg+0xa70/0xd30 net/netlink/af_netlink.c:1917
  sock_sendmsg_nosec net/socket.c:637 [inline]
  sock_sendmsg net/socket.c:657 [inline]

There is no reproducer, however, the warning can be reproduced
by adding rules with ever smaller prefixes.

The sanity check ("does the policy match the node") uses the prefix value
of the node before its updated to the smaller value.

To fix this, update the prefix earlier.  The bug has no impact on tree
correctness, this is only to prevent a false warning.

Reported-by: syzbot+8cc27ace5f6972910...@syzkaller.appspotmail.com
Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_policy.c | 6 --
 tools/testing/selftests/net/xfrm_policy.sh | 7 +++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8ca637a72697..0fa7c5ce3b2c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -912,6 +912,7 @@ static void xfrm_policy_inexact_node_reinsert(struct net 
*net,
} else if (delta > 0) {
p = &parent->rb_right;
} else {
+   bool same_prefixlen = node->prefixlen == n->prefixlen;
struct xfrm_policy *tmp;
 
hlist_for_each_entry(tmp, &n->hhead, bydst) {
@@ -919,9 +920,11 @@ static void xfrm_policy_inexact_node_reinsert(struct net 
*net,
hlist_del_rcu(&tmp->bydst);
}
 
+   node->prefixlen = prefixlen;
+
xfrm_policy_inexact_list_reinsert(net, node, family);
 
-   if (node->prefixlen == n->prefixlen) {
+   if (same_prefixlen) {
kfree_rcu(n, rcu);
return;
}
@@ -929,7 +932,6 @@ static void xfrm_policy_inexact_node_reinsert(struct net 
*net,
rb_erase(*p, new);
kfree_rcu(n, rcu);
n = node;
-   n->prefixlen = prefixlen;
goto restart;
}
}
diff --git a/tools/testing/selftests/net/xfrm_policy.sh 
b/tools/testing/selftests/net/xfrm_policy.sh
index 5445943bf07f..7a1bf94c5bd3 100755
--- a/tools/testing/selftests/net/xfrm_policy.sh
+++ b/tools/testing/selftests/net/xfrm_policy.sh
@@ -106,6 +106,13 @@ do_overlap()
 #
 # 10.0.0.0/24 and 10.0.1.0/24 nodes have been merged as 10.0.0.0/23.
 ip -net $ns xfrm policy add src 10.1.0.0/24 dst 10.0.0.0/23 dir fwd 
priority 200 action block
+
+# similar to above: add policies (with partially random address), with 
shrinking prefixes.
+for p in 29 28 27;do
+  for k in $(seq 1 32); do
+   ip -net $ns xfrm policy add src 10.253.1.$((RANDOM%255))/$p dst 
10.254.1.$((RANDOM%255))/$p dir fwd priority $((200+k)) action block 2>/dev/null
+  done
+done
 }
 
 do_esp_policy_get_check() {
-- 
2.21.0



Re: [PATCH net] netfilter: ebtables: Fix argument order to ADD_COUNTER

2019-08-12 Thread Florian Westphal
Todd Seidelmann  wrote:
> The ordering of arguments to the x_tables ADD_COUNTER macro
> appears to be wrong in ebtables (cf. ip_tables.c, ip6_tables.c,
> and arp_tables.c).
> 
> This causes data corruption in the ebtables userspace tools
> because they get incorrect packet & byte counts from the kernel.

Please send netfilter patches to netfilter-de...@vger.kernel.org .

Fixes: d72133e628803 ("netfilter: ebtables: use ADD_COUNTER macro")

Acked-by: Florian Westphal 


Re: [PATCH net-next 3/3] net: core: Namespace-ify sysctl_rmem_max and sysctl_wmem_max

2021-01-20 Thread Florian Westphal
menglong8.d...@gmail.com  wrote:
> From: Menglong Dong 
> 
> For now, sysctl_wmem_max and sysctl_rmem_max are globally unified.
> It's not convenient in some case. For example, when we use docker
> and try to control the default udp socket receive buffer for each
> container.
> 
> For that reason, make sysctl_wmem_max and sysctl_rmem_max
> per-namespace.

I think having those values be restricted by init netns is a desirable
property.


Re: kernel BUG at lib/string.c:LINE! (6)

2020-12-22 Thread Florian Westphal
Linus Torvalds  wrote:
> On Tue, Dec 22, 2020 at 6:44 AM syzbot
>  wrote:
> >
> > The issue was bisected to:
> >
> > commit 2f78788b55ba ("ilog2: improve ilog2 for constant arguments")
> 
> That looks unlikely, although possibly some constant folding
> improvement might make the fortify code notice something with it.
> 
> > detected buffer overflow in strlen
> > [ cut here ]
> > kernel BUG at lib/string.c:1149!
> > Call Trace:
> >  strlen include/linux/string.h:325 [inline]
> >  strlcpy include/linux/string.h:348 [inline]
> >  xt_rateest_tg_checkentry+0x2a5/0x6b0 net/netfilter/xt_RATEEST.c:143
> 
> Honestly, this just looks like the traditional bug in "strlcpy()".

Yes, thats exactly what this is, no idea why the bisection points
at ilog2 changes.

> That BSD function is complete garbage, exactly because it doesn't
> limit the source length. People tend to _think_ it does ("what's that
> size_t argument for?") but strlcpy() only limits the *destination*
> size, and the source is always read fully.

Right, I'll send a patch shortly.


[PATCH nf] netfilter: xt_RATEEST: reject non-null terminated string from userspace

2020-12-22 Thread Florian Westphal
syzbot reports:
detected buffer overflow in strlen
[..]
Call Trace:
 strlen include/linux/string.h:325 [inline]
 strlcpy include/linux/string.h:348 [inline]
 xt_rateest_tg_checkentry+0x2a5/0x6b0 net/netfilter/xt_RATEEST.c:143

strlcpy assumes src is a c-string. Check info->name before its used.

Reported-by: syzbot+e86f7c428c8c50db6...@syzkaller.appspotmail.com
Fixes: 5859034d7eb8793 ("[NETFILTER]: x_tables: add RATEEST target")
Signed-off-by: Florian Westphal 
---
RATEEST test in iptables.git still passes, syzbot repro setsockopt
fails with -ENAMETOOLONG.

diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 37253d399c6b..0d5c422f8745 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct 
xt_tgchk_param *par)
} cfg;
int ret;
 
+   if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name))
+   return -ENAMETOOLONG;
+
net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
 
mutex_lock(&xn->hash_lock);
-- 
2.26.2



Re: [PATCH nf] netfilter: xt_RATEEST: reject non-null terminated string from userspace

2020-12-22 Thread Florian Westphal
Linus Torvalds  wrote:
> On Tue, Dec 22, 2020 at 2:24 PM Florian Westphal  wrote:
> >
> > strlcpy assumes src is a c-string. Check info->name before its used.
> 
> If strlcpy is the only problem, then the fix is to use strscpy(),
> which doesn't have the design mistake that strlcpy has.

It would silence the reproducer, but the checkentry function calls
__xt_rateest_lookup which may 'strcmp(..., maybe_not_zero_terminated)'.


[PATCH net-next 0/3] mptcp: reject invalid mp_join requests right away

2020-11-30 Thread Florian Westphal
At the moment MPTCP can detect an invalid join request (invalid token,
max number of subflows reached, and so on) right away but cannot reject
the connection until the 3WHS has completed.
Instead the connection will complete and the subflow is reset afterwards.

To send the reset most information is already available, but we don't have
good spot where the reset could be sent:

1. The ->init_req callback is too early and also doesn't allow to return an
   error that could be used to inform the TCP stack that the SYN should be
   dropped.

2. The ->route_req callback lacks the skb needed to send a reset.

3. The ->send_synack callback is the best fit from the available hooks,
   but its called after the request socket has been inserted into the queue
   already. This means we'd have to remove it again right away.

>From a technical point of view, the second hook would be best:
 1. Its before insertion into listener queue.
 2. If it returns NULL TCP will drop the packet for us.

Problem is that we'd have to pass the skb to the function just for MPTCP.

Paolo suggested to merge init_req and route_req callbacks instead:
This makes all info available to MPTCP -- a return value of NULL drops the
packet and MPTCP can send the reset if needed.

Because 'route_req' has a 'const struct sock *', this means either removal
of const qualifier, or a bit of code churn to pass 'const' in security land.

This does the latter; I did not find any spots that need write access to struct
sock.

To recap, the two alternatives are:
1. Solve it entirely in MPTCP: use the ->send_synack callback to
   unlink the request socket from the listener & drop it.
2. Avoid 'security' churn by removing the const qualifier.




[PATCH net-next 1/3] security: add const qualifier to struct sock in various places

2020-11-30 Thread Florian Westphal
A followup change to tcp_request_sock_op would have to drop the 'const'
qualifier from the 'route_req' function as the
'security_inet_conn_request' call is moved there - and that function
expects a 'struct sock *'.

However, it turns out its also possible to add a const qualifier to
security_inet_conn_request instead.

Signed-off-by: Florian Westphal 
---
 The code churn is unfortunate.  Alternative would be to change
 the function signature of ->route_req:
 struct dst_entry *(*route_req)(struct sock *sk, ...
 [ i.e., drop 'const' ].  Thoughts?

 include/linux/lsm_audit.h   | 2 +-
 include/linux/lsm_hook_defs.h   | 2 +-
 include/linux/security.h| 4 ++--
 security/apparmor/include/net.h | 2 +-
 security/apparmor/lsm.c | 2 +-
 security/apparmor/net.c | 6 +++---
 security/lsm_audit.c| 4 ++--
 security/security.c | 2 +-
 security/selinux/hooks.c| 2 +-
 security/smack/smack_lsm.c  | 4 ++--
 10 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 28f23b341c1c..cd23355d2271 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -26,7 +26,7 @@
 
 struct lsm_network_audit {
int netif;
-   struct sock *sk;
+   const struct sock *sk;
u16 family;
__be16 dport;
__be16 sport;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 32a940117e7a..acc0494cceba 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -301,7 +301,7 @@ LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const 
struct sock *sk,
 struct sock *newsk)
 LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, struct sock *sk, u32 *secid)
 LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket 
*parent)
-LSM_HOOK(int, 0, inet_conn_request, struct sock *sk, struct sk_buff *skb,
+LSM_HOOK(int, 0, inet_conn_request, const struct sock *sk, struct sk_buff *skb,
 struct request_sock *req)
 LSM_HOOK(void, LSM_RET_VOID, inet_csk_clone, struct sock *newsk,
 const struct request_sock *req)
diff --git a/include/linux/security.h b/include/linux/security.h
index bc2725491560..0df62735651b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1358,7 +1358,7 @@ void security_sk_clone(const struct sock *sk, struct sock 
*newsk);
 void security_sk_classify_flow(struct sock *sk, struct flowi *fl);
 void security_req_classify_flow(const struct request_sock *req, struct flowi 
*fl);
 void security_sock_graft(struct sock*sk, struct socket *parent);
-int security_inet_conn_request(struct sock *sk,
+int security_inet_conn_request(const struct sock *sk,
struct sk_buff *skb, struct request_sock *req);
 void security_inet_csk_clone(struct sock *newsk,
const struct request_sock *req);
@@ -1519,7 +1519,7 @@ static inline void security_sock_graft(struct sock *sk, 
struct socket *parent)
 {
 }
 
-static inline int security_inet_conn_request(struct sock *sk,
+static inline int security_inet_conn_request(const struct sock *sk,
struct sk_buff *skb, struct request_sock *req)
 {
return 0;
diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h
index 2431c011800d..aadb4b29fb66 100644
--- a/security/apparmor/include/net.h
+++ b/security/apparmor/include/net.h
@@ -107,6 +107,6 @@ int aa_sock_file_perm(struct aa_label *label, const char 
*op, u32 request,
  struct socket *sock);
 
 int apparmor_secmark_check(struct aa_label *label, char *op, u32 request,
-  u32 secid, struct sock *sk);
+  u32 secid, const struct sock *sk);
 
 #endif /* __AA_NET_H */
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index ffeaee5ed968..1b0aba8eb723 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -1147,7 +1147,7 @@ static void apparmor_sock_graft(struct sock *sk, struct 
socket *parent)
 }
 
 #ifdef CONFIG_NETWORK_SECMARK
-static int apparmor_inet_conn_request(struct sock *sk, struct sk_buff *skb,
+static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff 
*skb,
  struct request_sock *req)
 {
struct aa_sk_ctx *ctx = SK_CTX(sk);
diff --git a/security/apparmor/net.c b/security/apparmor/net.c
index fa0e85568450..e0c1b50d6edd 100644
--- a/security/apparmor/net.c
+++ b/security/apparmor/net.c
@@ -211,7 +211,7 @@ static int apparmor_secmark_init(struct aa_secmark *secmark)
 }
 
 static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid,
-  struct common_audit_data *sa, struct sock *sk)
+  struct common_audit_data *sa)
 {
int i, ret;
struct aa_perms perms = { };
@@ -244,13 +244,13 @@ static int aa_secmark_perm(struct aa

[PATCH net-next 2/3] tcp: merge 'init_req' and 'route_req' functions

2020-11-30 Thread Florian Westphal
The Multipath-TCP standard (RFC 8684) says that an MPTCP host should send
a TCP reset if the token in a MP_JOIN request is unknown.

At this time we don't do this, the 3whs completes and the 'new subflow'
is reset afterwards.  There are two ways to allow MPTCP to send the
reset.

1. override 'send_synack' callback and emit the rst from there.
   The drawback is that the request socket gets inserted into the
   listeners queue just to get removed again right away.

2. Send the reset from the 'route_req' function instead.
   This avoids the 'add&remove request socket', but route_req lacks the
   skb that is required to send the TCP reset.

Instead of just adding the skb to that function for MPTCP sake alone,
Paolo suggested to merge init_req and route_req functions.

This saves one indirection from syn processing path and provides the skb
to the merged function at the same time.

'send reset on unknown mptcp join token' is added in next patch.

Suggested-by: Paolo Abeni 
Cc: Eric Dumazet 
Signed-off-by: Florian Westphal 
---
 include/net/tcp.h|  9 -
 net/ipv4/tcp_input.c |  9 ++---
 net/ipv4/tcp_ipv4.c  |  9 +++--
 net/ipv6/tcp_ipv6.c  |  9 +++--
 net/mptcp/subflow.c  | 36 
 5 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d5d22c411918..4525d6256321 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2007,15 +2007,14 @@ struct tcp_request_sock_ops {
  const struct sock *sk,
  const struct sk_buff *skb);
 #endif
-   void (*init_req)(struct request_sock *req,
-const struct sock *sk_listener,
-struct sk_buff *skb);
 #ifdef CONFIG_SYN_COOKIES
__u32 (*cookie_init_seq)(const struct sk_buff *skb,
 __u16 *mss);
 #endif
-   struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
-  const struct request_sock *req);
+   struct dst_entry *(*route_req)(const struct sock *sk,
+  struct sk_buff *skb,
+  struct flowi *fl,
+  struct request_sock *req);
u32 (*init_seq)(const struct sk_buff *skb);
u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb3a7750f623..9e8a6c1aa019 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6799,18 +6799,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
 
-   af_ops->init_req(req, sk, skb);
-
-   if (security_inet_conn_request(sk, skb, req))
+   dst = af_ops->route_req(sk, skb, &fl, req);
+   if (!dst)
goto drop_and_free;
 
if (tmp_opt.tstamp_ok)
tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
 
-   dst = af_ops->route_req(sk, &fl, req);
-   if (!dst)
-   goto drop_and_free;
-
if (!want_cookie && !isn) {
/* Kill the following clause, if you dislike this way. */
if (!net->ipv4.sysctl_tcp_syncookies &&
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e4b31e70bd30..af2338294598 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1444,9 +1444,15 @@ static void tcp_v4_init_req(struct request_sock *req,
 }
 
 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+ struct sk_buff *skb,
  struct flowi *fl,
- const struct request_sock *req)
+ struct request_sock *req)
 {
+   tcp_v4_init_req(req, sk, skb);
+
+   if (security_inet_conn_request(sk, skb, req))
+   return NULL;
+
return inet_csk_route_req(sk, &fl->u.ip4, req);
 }
 
@@ -1466,7 +1472,6 @@ const struct tcp_request_sock_ops 
tcp_request_sock_ipv4_ops = {
.req_md5_lookup =   tcp_v4_md5_lookup,
.calc_md5_hash  =   tcp_v4_md5_hash_skb,
 #endif
-   .init_req   =   tcp_v4_init_req,
 #ifdef CONFIG_SYN_COOKIES
.cookie_init_seq =  cookie_v4_init_sequence,
 #endif
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 992cbf3eb9e3..1a1510513739 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -828,9 +828,15 @@ static void tcp_v6_init_req(struct request_sock *req,
 }
 
 static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
+ 

[PATCH net-next 3/3] mptcp: emit tcp reset when a join request fails

2020-11-30 Thread Florian Westphal
RFC 8684 says:
 If the token is unknown or the host wants to refuse subflow establishment
 (for example, due to a limit on the number of subflows it will permit),
 the receiver will send back a reset (RST) signal, analogous to an unknown
 port in TCP, containing an MP_TCPRST option (Section 3.6) with an
 "MPTCP specific error" reason code.

mptcp-next doesn't support MP_TCPRST yet, this can be added in another
change.

Signed-off-by: Florian Westphal 
---
 net/mptcp/subflow.c | 47 ++---
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index c55b8f176746..5a8005746bc8 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -112,9 +112,14 @@ static int __subflow_init_req(struct request_sock *req, 
const struct sock *sk_li
return 0;
 }
 
-static void subflow_init_req(struct request_sock *req,
-const struct sock *sk_listener,
-struct sk_buff *skb)
+/* Init mptcp request socket.
+ *
+ * Returns an error code if a JOIN has failed and a TCP reset
+ * should be sent.
+ */
+static int subflow_init_req(struct request_sock *req,
+   const struct sock *sk_listener,
+   struct sk_buff *skb)
 {
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@ -125,7 +130,7 @@ static void subflow_init_req(struct request_sock *req,
 
ret = __subflow_init_req(req, sk_listener);
if (ret)
-   return;
+   return 0;
 
mptcp_get_options(skb, &mp_opt);
 
@@ -133,7 +138,7 @@ static void subflow_init_req(struct request_sock *req,
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
 
if (mp_opt.mp_join)
-   return;
+   return 0;
} else if (mp_opt.mp_join) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
}
@@ -157,7 +162,7 @@ static void subflow_init_req(struct request_sock *req,
} else {
subflow_req->mp_capable = 1;
}
-   return;
+   return 0;
}
 
err = mptcp_token_new_request(req);
@@ -175,7 +180,11 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->remote_nonce = mp_opt.nonce;
subflow_req->msk = subflow_token_join_request(req, skb);
 
-   if (unlikely(req->syncookie) && subflow_req->msk) {
+   /* Can't fall back to TCP in this case. */
+   if (!subflow_req->msk)
+   return -EPERM;
+
+   if (unlikely(req->syncookie)) {
if (mptcp_can_accept_new_subflow(subflow_req->msk))
subflow_init_req_cookie_join_save(subflow_req, 
skb);
}
@@ -183,6 +192,8 @@ static void subflow_init_req(struct request_sock *req,
pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
 subflow_req->remote_nonce, subflow_req->msk);
}
+
+   return 0;
 }
 
 int mptcp_subflow_init_cookie_req(struct request_sock *req,
@@ -234,6 +245,7 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
  struct request_sock *req)
 {
struct dst_entry *dst;
+   int err;
 
tcp_rsk(req)->is_mptcp = 1;
 
@@ -241,8 +253,14 @@ static struct dst_entry *subflow_v4_route_req(const struct 
sock *sk,
if (!dst)
return NULL;
 
-   subflow_init_req(req, sk, skb);
-   return dst;
+   err = subflow_init_req(req, sk, skb);
+   if (err == 0)
+   return dst;
+
+   dst_release(dst);
+   if (!req->syncookie)
+   tcp_request_sock_ops.send_reset(sk, skb);
+   return NULL;
 }
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -252,6 +270,7 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
  struct request_sock *req)
 {
struct dst_entry *dst;
+   int err;
 
tcp_rsk(req)->is_mptcp = 1;
 
@@ -259,8 +278,14 @@ static struct dst_entry *subflow_v6_route_req(const struct 
sock *sk,
if (!dst)
return NULL;
 
-   subflow_init_req(req, sk, skb);
-   return dst;
+   err = subflow_init_req(req, sk, skb);
+   if (err == 0)
+   return dst;
+
+   dst_release(dst);
+   if (!req->syncookie)
+   tcp6_request_sock_ops.send_reset(sk, skb);
+   return NULL;
 }
 #endif
 
-- 
2.26.2



Re: [Race] data race between eth_heder_cache_update() and neigh_hh_output()

2020-11-30 Thread Florian Westphal
Gong, Sishuai  wrote:
> Hi,
> 
> We found a data race in linux kernel 5.3.11 that we are able to reproduce in 
> x86 under specific interleavings. We are not sure about the consequence of 
> this race now but it seems that the two memcpy() can lead to some 
> inconsistency. We also noticed that both the writer and reader are protected 
> by locks, but the writer is protected using seqlock while the reader is 
> protected by rculock.

AFAICS reader uses same seqlock as the writer.

> --
> Write site
> 
>  /tmp/tmp.B7zb7od2zE-5.3.11/extract/linux-5.3.11/net/ethernet/eth.c:264
> 252  /**
> 253   * eth_header_cache_update - update cache entry
> 254   * @hh: destination cache entry
> 255   * @dev: network device
> 256   * @haddr: new hardware address
> 257   *
> 258   * Called by Address Resolution module to notify changes in 
> address.
> 259   */
> 260  void eth_header_cache_update(struct hh_cache *hh,
> 261   const struct net_device *dev,
> 262   const unsigned char *haddr)
> 263  {
>  ==>264  memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct 
> ethhdr)),
> 265 haddr, ETH_ALEN);

This is called under write_seqlock_bh(hh->hh_lock)

> /tmp/tmp.B7zb7od2zE-5.3.11/extract/linux-5.3.11/include/net/neighbour.h:481
> 463  static inline int neigh_hh_output(const struct hh_cache *hh, 
> struct sk_buff *skb)
> 464  {
> 465  unsigned int hh_alen = 0;
> 466  unsigned int seq;
> 467  unsigned int hh_len;
> 468
> 469  do {
> 470  seq = read_seqbegin(&hh->hh_lock); 

This samples the seqcount.

> 471  hh_len = hh->hh_len;
> 472  if (likely(hh_len <= HH_DATA_MOD)) {
> 473  hh_alen = HH_DATA_MOD;
> 474
> 475  /* skb_push() would proceed silently if we have room 
> for
> 476   * the unaligned size but not for the aligned size:
> 477   * check headroom explicitly.
> 478   */
> 479  if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
> 480  /* this is inlined by gcc */
>  ==>481  memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
> 482 HH_DATA_MOD);

[..]

> 492  } while (read_seqretry(&hh->hh_lock, seq));

... so this retries unless seqcount was stable.


Re: [PATCH] mptcp: print new line in mptcp_seq_show() if mptcp isn't in use

2020-12-04 Thread Florian Westphal
Jianguo Wu  wrote:
> From: Jianguo Wu 

A brief explanation would have helped.
This is for net tree.

> Signed-off-by: Jianguo Wu 

Fixes: fc518953bc9c8d7d ("mptcp: add and use MIB counter infrastructure")
Acked-by: Florian Westphal 


Re: [PATCH net-next,v4 3/9] net: resolve forwarding path from virtual netdevice and HW destination address

2020-11-18 Thread Florian Westphal
Pablo Neira Ayuso  wrote:
> +#define NET_DEVICE_PATH_STACK_MAX5
> +
> +struct net_device_path_stack {
> + int num_paths;
> + struct net_device_path  path[NET_DEVICE_PATH_STACK_MAX];
> +};

[..]

> +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
> +   struct net_device_path_stack *stack)
> +{
> + const struct net_device *last_dev;
> + struct net_device_path_ctx ctx = {
> + .dev= dev,
> + .daddr  = daddr,
> + };
> + struct net_device_path *path;
> + int ret = 0, k;
> +
> + stack->num_paths = 0;
> + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
> + last_dev = ctx.dev;
> + k = stack->num_paths++;
> + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
> + return -1;

This guarantees k < NET_DEVICE_PATH_STACK_MAX, so we can fill
entire path[].

> + path = &stack->path[k];
> + memset(path, 0, sizeof(struct net_device_path));
> +
> + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
> + if (ret < 0)
> + return -1;
> +
> + if (WARN_ON_ONCE(last_dev == ctx.dev))
> + return -1;
> + }

... but this means that stack->num_paths == NET_DEVICE_PATH_STACK_MAX
is possible, with k being last element.

> + path = &stack->path[stack->num_paths++];

... so this may result in a off by one?


Re: [PATCH net-next,v4 2/9] netfilter: flowtable: add xmit path types

2020-11-18 Thread Florian Westphal
Pablo Neira Ayuso  wrote:
> - if (unlikely(dst_xfrm(&rt->dst))) {
> + rt = (struct rtable *)tuplehash->tuple.dst_cache;
> +
> + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
>   memset(skb->cb, 0, sizeof(struct inet_skb_parm));
>   IPCB(skb)->iif = skb->dev->ifindex;
>   IPCB(skb)->flags = IPSKB_FORWARDED;
>   return nf_flow_xmit_xfrm(skb, state, &rt->dst);
>   }
[..]

> +
> + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH))

This needs to be XMIT_XFRM too.


Re: [PATCH net] netfilter: ipset: prevent uninit-value in hash_ip6_add

2020-11-19 Thread Florian Westphal
Eric Dumazet  wrote:
> From: Eric Dumazet 
> 
> syzbot found that we are not validating user input properly
> before copying 16 bytes [1].
> Using NLA_BINARY in ipaddr_policy[] for IPv6 address is not correct,
> since it ensures at most 16 bytes were provided.

Thanks Eric. Looks like this is the only case in ipset, the other 3
NLA_BINARY users do a

nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN))

before copying.


Re: [PATCH v2] aquantia: Remove the build_skb path

2020-11-19 Thread Florian Westphal
Ramsay, Lincoln  wrote:
> The build_skb path fails to allow for an SKB header, but the hardware
> buffer it is built around won't allow for this anyway.

What problem is being resolved here?


Re: [PATCH v2] aquantia: Remove the build_skb path

2020-11-19 Thread Florian Westphal
Ramsay, Lincoln  wrote:
> > Ramsay, Lincoln  wrote:
> > > The build_skb path fails to allow for an SKB header, but the hardware
> > > buffer it is built around won't allow for this anyway.
> > 
> > What problem is being resolved here?
> 
> Sorry... Do I need to re-post the context? (I thought the reply headers would 
> have kept this with the original patch that included the justification, plus 
> the discussion that led to this revised patch).

This is the only text that gets recorded in git, see

https://patchwork.kernel.org/project/netdevbpf/patch/cy4pr1001mb2311f01c543420e5f89c0f4de8...@cy4pr1001mb2311.namprd10.prod.outlook.com/

so, yes, please include this information in the patch description and
post a v3.

Thank you.


Re: [PATCH v3] aquantia: Remove the build_skb path

2020-11-19 Thread Florian Westphal
Ramsay, Lincoln  wrote:
> When performing IPv6 forwarding, there is an expectation that SKBs
> will have some headroom. When forwarding a packet from the aquantia
> driver, this does not always happen, triggering a kernel warning.
> 
> The build_skb path fails to allow for an SKB header, but the hardware

For build_skb path to work the buffer scheme would need to be changed
to reserve headroom, so yes, I think that the proposed patch is the
most convenient solution.

> buffer it is built around won't allow for this anyway. Just always use the
> slower codepath that copies memory into an allocated SKB.

I thought this changes the driver to always copy the entire packet, but
thats not true, see below.

> It seems that skb_headroom is only 14, when it is expected to be >= 16.

Yes, kernel expects to have some headroom in skbs.

> aq_ring.c has this code (edited slightly for brevity):
> 
> if (buff->is_eop && buff->len <= AQ_CFG_RX_FRAME_MAX - AQ_SKB_ALIGN) {
> skb = build_skb(aq_buf_vaddr(&buff->rxdata), AQ_CFG_RX_FRAME_MAX);
> skb_put(skb, buff->len);
> } else {
> skb = napi_alloc_skb(napi, AQ_CFG_RX_HDR_SIZE);
> 
> There is a significant difference between the SKB produced by these 2 code 
> paths. When napi_alloc_skb creates an SKB, there is a certain amount of 
> headroom reserved. The same pattern appears to be used in all of the other 
> ethernet drivers I have looked at. However, this is not done in the build_skb 
> codepath.

I think the above should be part of the commit message rather than this
meta-space (which gets removed by git-am).

> + skb = napi_alloc_skb(napi, AQ_CFG_RX_HDR_SIZE);
> + if (unlikely(!skb)) {

AQ_CFG_RX_HDR_SIZE is 256 byte, so for larger packets ...

> + memcpy(__skb_put(skb, hdr_len), aq_buf_vaddr(&buff->rxdata),
> + ALIGN(hdr_len, sizeof(long)));

This only copies the initial part and then...
> + if (buff->len - hdr_len > 0) {
> + skb_add_rx_frag(skb, 0, buff->rxdata.page,
> + buff->rxdata.pg_off + hdr_len,
> + buff->len - hdr_len,
>   AQ_CFG_RX_FRAME_MAX);

The rest is added as a frag.

IOW, this patch looks good to me, but could you update the
commit message so it becomes clear that this doesn't result in a full
copy?

Perhaps something like:
'Just always use the napi_alloc_skb() code path that passes the buffer
 as a page fragment', or similar.

Thanks.


Re: [PATCH v4] aquantia: Remove the build_skb path

2020-11-19 Thread Florian Westphal
Ramsay, Lincoln  wrote:

[ patch looks good to me, I have no further comments ]

> > For build_skb path to work the buffer scheme would need to be changed
> > to reserve headroom, so yes, I think that the proposed patch is the
> > most convenient solution.
> 
> I don't know about benefits/feasibility, but I did wonder if (in the event 
> that the "fast path" is possible), the dma_mapping could use an offset? The 
> page would include the skb header but the dma mapping would not. If that was 
> done though, only 1 RX frame would fit into the page (at least on my system, 
> where the RX frame seems to be 2k and the page is 4k). Also, there's a 
> possibility to set the "order" variable, so that multiple pages are created 
> at once and I'm not sure if this would work in that case.

Yes, this is what some drivers do, they allocate a page, pass
pageaddr + headroom_offset everywhere, except build_skb() which gets the
pageaddr followed by skb_reserve(skb, headroom_offset).

> > This only copies the initial part and then the rest is added as a frag.
> 
> Oh yeah. That's not as bad as I had thought then :)
> 
> I wonder though... if the "fast path" is possible, could the whole packet 
> (including header) be added as a frag, avoiding the header copy? Or is that 
> not how SKBs work?

No, you can either have skb->head point to the page (build_skb), or
skb->head needs to be kmalloc'd (napi_alloc_skb for example).

Both can have page frags. In the second case, at least L2 header
needs to be in skb->head (and ip stack would pull in L3 header as well
later on anyway).


Re: [PATCH net-next v2] net: openvswitch: Be liberal in tcp conntrack.

2020-11-19 Thread Florian Westphal
Jakub Kicinski  wrote:
> On Mon, 16 Nov 2020 18:31:26 +0530 nusid...@redhat.com wrote:
> > From: Numan Siddique 
> > 
> > There is no easy way to distinguish if a conntracked tcp packet is
> > marked invalid because of tcp_in_window() check error or because
> > it doesn't belong to an existing connection. With this patch,
> > openvswitch sets liberal tcp flag for the established sessions so
> > that out of window packets are not marked invalid.
> > 
> > A helper function - nf_ct_set_tcp_be_liberal(nf_conn) is added which
> > sets this flag for both the directions of the nf_conn.
> > 
> > Suggested-by: Florian Westphal 
> > Signed-off-by: Numan Siddique 
> 
> Florian, LGTY?

Sorry, this one sailed past me.

Acked-by: Florian Westphal 


Re: [PATCH 015/141] netfilter: Fix fall-through warnings for Clang

2020-11-20 Thread Florian Westphal
Gustavo A. R. Silva  wrote:
> In preparation to enable -Wimplicit-fallthrough for Clang, fix multiple
> warnings by explicitly adding multiple break statements instead of just
> letting the code fall through to the next case.

Acked-by: Florian Westphal 

Feel free to carry this in next iteration of series, if any.


Re: [PATCH 108/141] netfilter: ipt_REJECT: Fix fall-through warnings for Clang

2020-11-20 Thread Florian Westphal
Gustavo A. R. Silva  wrote:
> In preparation to enable -Wimplicit-fallthrough for Clang, fix a warning
> by explicitly adding a break statement instead of letting the code fall
> through to the next case.

Acked-by: Florian Westphal 


Re: [PATCH v5 2/3] net: add kcov handle to skb extensions

2020-11-21 Thread Florian Westphal
Ido Schimmel  wrote:
> On Thu, Oct 29, 2020 at 05:36:19PM +, Aleksandr Nogikh wrote:
> > From: Aleksandr Nogikh 
> > 
> > Remote KCOV coverage collection enables coverage-guided fuzzing of the
> > code that is not reachable during normal system call execution. It is
> > especially helpful for fuzzing networking subsystems, where it is
> > common to perform packet handling in separate work queues even for the
> > packets that originated directly from the user space.
> > 
> > Enable coverage-guided frame injection by adding kcov remote handle to
> > skb extensions. Default initialization in __alloc_skb and
> > __build_skb_around ensures that no socket buffer that was generated
> > during a system call will be missed.
> > 
> > Code that is of interest and that performs packet processing should be
> > annotated with kcov_remote_start()/kcov_remote_stop().
> > 
> > An alternative approach is to determine kcov_handle solely on the
> > basis of the device/interface that received the specific socket
> > buffer. However, in this case it would be impossible to distinguish
> > between packets that originated during normal background network
> > processes or were intentionally injected from the user space.
> > 
> > Signed-off-by: Aleksandr Nogikh 
> > Acked-by: Willem de Bruijn 
> 
> [...]
> 
> > @@ -249,6 +249,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
> > gfp_mask,
> >  
> > fclones->skb2.fclone = SKB_FCLONE_CLONE;
> > }
> > +
> > +   skb_set_kcov_handle(skb, kcov_common_handle());
> 
> Hi,
> 
> This causes skb extensions to be allocated for the allocated skb, but
> there are instances that blindly overwrite 'skb->extensions' by invoking
> skb_copy_header() after __alloc_skb(). For example, skb_copy(),
> __pskb_copy_fclone() and skb_copy_expand(). This results in the skb
> extensions being leaked [1].

[..]
> Other suggestions?

Aleksandr, why was this made into an skb extension in the first place?

AFAIU this feature is usually always disabled at build time.
For debug builds (test farm /debug kernel etc) its always needed.

If thats the case this u64 should be an sk_buff member, not an
extension.


Re: [Patch stable] netfilter: clear skb->next in NF_HOOK_LIST()

2020-11-21 Thread Florian Westphal
Cong Wang  wrote:
> From: Cong Wang 
> 
> NF_HOOK_LIST() uses list_del() to remove skb from the linked list,
> however, it is not sufficient as skb->next still points to other
> skb. We should just call skb_list_del_init() to clear skb->next,
> like the rest places which using skb list.
> 
> This has been fixed in upstream by commit ca58fbe06c54
> ("netfilter: add and use nf_hook_slow_list()").

Thanks Cong, agree with this change, afaics its applicable to 4.19.y and 5.4.y.



Re: [PATCH net-next] netfilter: bridge: reset skb->pkt_type after NF_INET_POST_ROUTING traversal

2020-11-23 Thread Florian Westphal
Antoine Tenart  wrote:
> Netfilter changes PACKET_OTHERHOST to PACKET_HOST before invoking the
> hooks as, while it's an expected value for a bridge, routing expects
> PACKET_HOST. The change is undone later on after hook traversal. This
> can be seen with pairs of functions updating skb>pkt_type and then
> reverting it to its original value:
> 
> For hook NF_INET_PRE_ROUTING:
>   setup_pre_routing / br_nf_pre_routing_finish
> 
> For hook NF_INET_FORWARD:
>   br_nf_forward_ip / br_nf_forward_finish
> 
> But the third case where netfilter does this, for hook
> NF_INET_POST_ROUTING, the packet type is changed in br_nf_post_routing
> but never reverted. A comment says:
> 
>   /* We assume any code from br_dev_queue_push_xmit onwards doesn't care
>* about the value of skb->pkt_type. */

[..]
> But when having a tunnel (say vxlan) attached to a bridge we have the
> following call trace:

> In this specific case, this creates issues such as when an ICMPv6 PTB
> should be sent back. When CONFIG_BRIDGE_NETFILTER is enabled, the PTB
> isn't sent (as skb_tunnel_check_pmtu checks if pkt_type is PACKET_HOST
> and returns early).
> 
> If the comment is right and no one cares about the value of
> skb->pkt_type after br_dev_queue_push_xmit (which isn't true), resetting
> it to its original value should be safe.

That comment is 18 years old, safe bet noone thought of
ipv6-in-tunnel-interface-added-as-bridge-port back then.

Reviewed-by: Florian Westphal 


[PATCH net-next] mptcp: put reference in mptcp timeout timer

2020-11-24 Thread Florian Westphal
On close this timer might be scheduled. mptcp uses sk_reset_timer for
this, so the a reference on the mptcp socket is taken.

This causes a refcount leak which can for example be reproduced
with 'mp_join_server_v4.pkt' from the mptcp-packetdrill repo.

The leak has nothing to do with join requests, v1_mp_capable_bind_no_cs.pkt
works too when replacing the last ack mpcapable to v1 instead of v0.

unreferenced object 0x888109bba040 (size 2744):
  comm "packetdrill", [..]
  backtrace:
[..] sk_prot_alloc.isra.0+0x2b/0xc0
[..] sk_clone_lock+0x2f/0x740
[..] mptcp_sk_clone+0x33/0x1a0
[..] subflow_syn_recv_sock+0x2b1/0x690 [..]

Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close")
Cc: Paolo Abeni 
Cc: Davide Caratti 
Signed-off-by: Florian Westphal 
---
 net/mptcp/protocol.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4b7794835fea..dc979571f561 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1710,6 +1710,7 @@ static void mptcp_timeout_timer(struct timer_list *t)
struct sock *sk = from_timer(sk, t, sk_timer);
 
mptcp_schedule_work(sk);
+   sock_put(sk);
 }
 
 /* Find an idle subflow.  Return NULL if there is unacked data at tcp
-- 
2.26.2



Re: [PATCH v6 0/3] net, mac80211, kernel: enable KCOV remote coverage collection for 802.11 frame handling

2020-11-25 Thread Florian Westphal
Marco Elver  wrote:
[..]

> v6:
> * Revert usage of skb extensions due to potential memory leak. Patch 2/3 is 
> now
>   idential to that in v2.
> * Patches 1/3 and 3/3 are otherwise identical to v5.

The earlier series was already applied to net-next, so you need to
rebase on top of net-next and include a revert of the patch that added
the kcov skb extension.

Also, please indicate the git tree that you want this applied to in the
subject.


Re: [PATCH net-next] netfilter: bridge: reset skb->pkt_type after NF_INET_POST_ROUTING traversal

2020-11-28 Thread Florian Westphal
Jakub Kicinski  wrote:
> On Mon, 23 Nov 2020 19:32:53 +0100 Florian Westphal wrote:
> > That comment is 18 years old, safe bet noone thought of
> > ipv6-in-tunnel-interface-added-as-bridge-port back then.
> > 
> > Reviewed-by: Florian Westphal 
> 
> Sounds like a fix. Probably hard to pin point which commit to blame,
> but this should go to net, not net-next, right?

The commit predates git history, so probably a good idea to add
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")

... and apply it to net tree.


Re: [net-next] netfiler: conntrack: Add the option to set ct tcp flag - BE_LIBERAL per-ct basis.

2020-11-09 Thread Florian Westphal
nusid...@redhat.com  wrote:
> From: Numan Siddique 
> 
> Before calling nf_conntrack_in(), caller can set this flag in the
> connection template for a tcp packet and any errors in the
> tcp_in_window() will be ignored.
> 
> A helper function - nf_ct_set_tcp_be_liberal(nf_conn) is added which
> sets this flag for both the directions of the nf_conn.
> 
> openvswitch makes use of this feature so that any out of window tcp
> packets are not marked invalid. Prior to this there was no easy way
> to distinguish if conntracked packet is marked invalid because of
> tcp_in_window() check error or because it doesn't belong to an
> existing connection.
> 
> An earlier attempt (see the link) tried to solve this problem for
> openvswitch in a different way. Florian Westphal instead suggested
> to be liberal in openvswitch for tcp packets.
> 
> Link: 
> https://patchwork.ozlabs.org/project/netdev/patch/20201006083355.121018-1-nusid...@redhat.com/
> 
> Suggested-by: Florian Westphal 
> Signed-off-by: Numan Siddique 
> ---
>  include/net/netfilter/nf_conntrack_l4proto.h |  6 ++
>  net/netfilter/nf_conntrack_core.c| 13 +++--
>  net/openvswitch/conntrack.c  |  1 +
>  3 files changed, 18 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/netfilter/nf_conntrack_l4proto.h 
> b/include/net/netfilter/nf_conntrack_l4proto.h
> index 88186b95b3c2..572ae8d2a622 100644
> --- a/include/net/netfilter/nf_conntrack_l4proto.h
> +++ b/include/net/netfilter/nf_conntrack_l4proto.h
> @@ -203,6 +203,12 @@ static inline struct nf_icmp_net 
> *nf_icmpv6_pernet(struct net *net)
>  {
>   return &net->ct.nf_ct_proto.icmpv6;
>  }
> +
> +static inline void nf_ct_set_tcp_be_liberal(struct nf_conn *ct)
> +{
> + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
> + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
> +}
>  #endif
>  
>  #ifdef CONFIG_NF_CT_PROTO_DCCP
> diff --git a/net/netfilter/nf_conntrack_core.c 
> b/net/netfilter/nf_conntrack_core.c
> index 234b7cab37c3..8290c5b04e88 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -1748,10 +1748,18 @@ static int nf_conntrack_handle_packet(struct nf_conn 
> *ct,
> struct sk_buff *skb,
> unsigned int dataoff,
> enum ip_conntrack_info ctinfo,
> -   const struct nf_hook_state *state)
> +   const struct nf_hook_state *state,
> +   union nf_conntrack_proto *tmpl_proto)

I would prefer if we could avoid adding yet another function argument.

AFAICS its enough to call the new nf_ct_set_tcp_be_liberal() helper
before nf_conntrack_confirm() in ovs_ct_commit(), e.g.:

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1235,10 +1235,13 @@ static int ovs_ct_commit(struct net *net, struct 
sw_flow_key *key,
if (!nf_ct_is_confirmed(ct)) {
err = ovs_ct_init_labels(ct, key, &info->labels.value,
 &info->labels.mask);
if (err)
return err;
+
+   if (nf_ct_protonum(ct) == IPPROTO_TCP)
+   nf_ct_set_tcp_be_liberal(ct);



Re: [net-next] netfiler: conntrack: Add the option to set ct tcp flag - BE_LIBERAL per-ct basis.

2020-11-10 Thread Florian Westphal
Numan Siddique  wrote:
> On Tue, Nov 10, 2020 at 3:06 AM Florian Westphal  wrote:
> Thanks for the comments. I actually tried this approach first, but it
> doesn't seem to work.
> I noticed that for the committed connections, the ct tcp flag -
> IP_CT_TCP_FLAG_BE_LIBERAL is
> not set when nf_conntrack_in() calls resolve_normal_ct().

Yes, it won't be set during nf_conntrack_in, thats why I suggested
to do it before confirming the connection.

> Would you expect that the tcp ct flags should have been preserved once
> the connection is committed ?

Yes, they are preserved when you set them after nf_conntrack_in(), else
we would already have trouble with hw flow offloading which needs to
turn off window checks as well.


Re: [net-next] netfiler: conntrack: Add the option to set ct tcp flag - BE_LIBERAL per-ct basis.

2020-11-10 Thread Florian Westphal
Numan Siddique  wrote:
> On Tue, Nov 10, 2020 at 5:55 PM Florian Westphal  wrote:
> >
> > Numan Siddique  wrote:
> > > On Tue, Nov 10, 2020 at 3:06 AM Florian Westphal  wrote:
> > > Thanks for the comments. I actually tried this approach first, but it
> > > doesn't seem to work.
> > > I noticed that for the committed connections, the ct tcp flag -
> > > IP_CT_TCP_FLAG_BE_LIBERAL is
> > > not set when nf_conntrack_in() calls resolve_normal_ct().
> >
> > Yes, it won't be set during nf_conntrack_in, thats why I suggested
> > to do it before confirming the connection.
> 
> Sorry for the confusion. What I mean is - I tested  your suggestion - i.e 
> called
> nf_ct_set_tcp_be_liberal()  before calling nf_conntrack_confirm().
> 
>  Once the connection is established, for subsequent packets, openvswitch
>  calls nf_conntrack_in() [1] to see if the packet is part of the
> existing connection or not (i.e ct.new or ct.est )
> and if the packet happens to be out-of-window then skb->_nfct is set
> to NULL. And the tcp
> be flags set during confirmation are not reflected when
> nf_conntrack_in() calls resolve_normal_ct().

Can you debug where this happens?  This looks very very wrong.
resolve_normal_ct() has no business to check any of those flags
(and I don't see where it uses them, it should only deal with the
 tuples).

The flags come into play when nf_conntrack_handle_packet() gets called
after resolve_normal_ct has found an entry, since that will end up
calling the tcp conntrack part.

The entry found/returned by resolve_normal_ct should be the same
nf_conn entry that was confirmed earlier, i.e. it should be in "liberal"
mode.


Re: [PATCH net-next v4] net: linux/skbuff.h: combine SKB_EXTENSIONS + KCOV handling

2020-11-16 Thread Florian Westphal
Matthieu Baerts  wrote:
> > --- linux-next-20201113.orig/include/linux/skbuff.h
> > +++ linux-next-20201113/include/linux/skbuff.h
> > @@ -4137,7 +4137,6 @@ static inline void skb_set_nfct(struct s
> >   #endif
> >   }
> > -#ifdef CONFIG_SKB_EXTENSIONS
> >   enum skb_ext_id {
> >   #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
> > SKB_EXT_BRIDGE_NF,
> > @@ -4151,12 +4150,11 @@ enum skb_ext_id {
> >   #if IS_ENABLED(CONFIG_MPTCP)
> > SKB_EXT_MPTCP,
> >   #endif
> > -#if IS_ENABLED(CONFIG_KCOV)
> > SKB_EXT_KCOV_HANDLE,
> > -#endif
> 
> I don't think we should remove this #ifdef: the number of extensions are
> currently limited to 8, we might not want to always have KCOV there even if
> we don't want it. I think adding items in this enum only when needed was the
> intension of Florian (+cc) when creating these SKB extensions.
> Also, this will increase a tiny bit some structures, see "struct skb_ext()".

Yes, I would also prefer to retrain the ifdef.

Another reason was to make sure that any skb_ext_add(..., MY_EXT) gives
a compile error if the extension is not enabled.

> So if we think it is better to remove these #ifdef here, we should be OK.
> But if we prefer not to do that, we should then not add stubs for
> skb_ext_{add,find}() and keep the ones for skb_[gs]et_kcov_handle().

Yes, exactly, I did not add these stubs because I could not figure out
a case where an empty skb_ext_{add,find} would be wanted.

If your code calls skb_ext_add() but no skb extensions exist you forgot
a SELECT/DEPENDS SKB_EXTENSIONS in Kconfig & compiler error would tell
you that.


Re: [PATCH net-next v4] net: linux/skbuff.h: combine SKB_EXTENSIONS + KCOV handling

2020-11-16 Thread Florian Westphal
Randy Dunlap  wrote:
> On 11/16/20 7:30 AM, Jakub Kicinski wrote:
> > On Mon, 16 Nov 2020 15:31:21 +0100 Florian Westphal wrote:
> >>>> @@ -4151,12 +4150,11 @@ enum skb_ext_id {
> >>>>   #if IS_ENABLED(CONFIG_MPTCP)
> >>>>  SKB_EXT_MPTCP,
> >>>>   #endif
> >>>> -#if IS_ENABLED(CONFIG_KCOV)
> >>>>  SKB_EXT_KCOV_HANDLE,
> >>>> -#endif  
> >>>
> >>> I don't think we should remove this #ifdef: the number of extensions are
> >>> currently limited to 8, we might not want to always have KCOV there even 
> >>> if
> >>> we don't want it. I think adding items in this enum only when needed was 
> >>> the
> >>> intension of Florian (+cc) when creating these SKB extensions.
> >>> Also, this will increase a tiny bit some structures, see "struct 
> >>> skb_ext()".  
> >>
> >> Yes, I would also prefer to retrain the ifdef.
> >>
> >> Another reason was to make sure that any skb_ext_add(..., MY_EXT) gives
> >> a compile error if the extension is not enabled.
> > 
> > Oh well, sorry for taking you down the wrong path Randy!
> 
> No problem.
> So we are back to v2, right?

Yes, you can still drop the line

>> +#if IS_ENABLED(CONFIG_KCOV) && IS_ENABLED(CONFIG_SKB_EXTENSIONS)

for enum skb_ext_id (alreadyt under SKB_EXTENSIONS).

Other than that v2 looks good to me.

Thanks!


Re: [PATCH net-next v5] net: linux/skbuff.h: combine SKB_EXTENSIONS + KCOV handling

2020-11-16 Thread Florian Westphal
Randy Dunlap  wrote:
> The previous Kconfig patch led to some other build errors as
> reported by the 0day bot and my own overnight build testing.
> 
> These are all in  when KCOV is enabled but
> SKB_EXTENSIONS is not enabled, so fix those by combining those conditions
> in the header file.

Thanks Randy.

Acked-by: Florian Westphal 


Re: [PATCH] xfrm: Fix wraparound in xfrm_policy_addr_delta()

2020-12-29 Thread Florian Westphal
Visa Hankala  wrote:
> Use three-way comparison for address elements to avoid integer
> wraparound in the result of xfrm_policy_addr_delta().
> 
> This ensures that the search trees are built and traversed correctly
> when the difference between compared address elements is larger
> than INT_MAX.

Please provide an update to tools/testing/selftests/net/xfrm_policy.sh
that shows that this is a problem.

>   switch (family) {
>   case AF_INET:
> - if (sizeof(long) == 4 && prefixlen == 0)
> - return ntohl(a->a4) - ntohl(b->a4);
> - return (ntohl(a->a4) & ((~0UL << (32 - prefixlen -
> -(ntohl(b->a4) & ((~0UL << (32 - prefixlen;
> + mask = ~0U << (32 - prefixlen);
> + ma = ntohl(a->a4) & mask;
> + mb = ntohl(b->a4) & mask;

This is suspicious.  Is prefixlen == 0 impossible?

If not, then after patch
mask = ~0U << 32;

... and function returns 0.


Re: [PATCH] xfrm: Fix wraparound in xfrm_policy_addr_delta()

2020-12-30 Thread Florian Westphal
Visa Hankala  wrote:
> On Tue, Dec 29, 2020 at 05:01:27PM +0100, Florian Westphal wrote:
> > This is suspicious.  Is prefixlen == 0 impossible?
> > 
> > If not, then after patch
> > mask = ~0U << 32;
> > 
> > ... and function returns 0.
> 
> With prefixlen == 0, there is only one equivalence class, so
> returning 0 seems reasonable to me.

Right, that seems reasonable indeed.

> Is there a reason why the function has treated /0 prefix as /32
> with IPv4? IPv6 does not have this treatment.

Not that I recall, looks like a bug.


Re: [PATCH] selftests: xfrm: fix test return value override issue in xfrm_policy.sh

2020-12-30 Thread Florian Westphal
Po-Hsu Lin  wrote:
> When running this xfrm_policy.sh test script, even with some cases
> marked as FAIL, the overall test result will still be PASS:
> 
> $ sudo ./xfrm_policy.sh
> PASS: policy before exception matches
> FAIL: expected ping to .254 to fail (exceptions)
> PASS: direct policy matches (exceptions)
> PASS: policy matches (exceptions)
> FAIL: expected ping to .254 to fail (exceptions and block policies)
> PASS: direct policy matches (exceptions and block policies)
> PASS: policy matches (exceptions and block policies)
> FAIL: expected ping to .254 to fail (exceptions and block policies after 
> hresh changes)
> PASS: direct policy matches (exceptions and block policies after hresh 
> changes)
> PASS: policy matches (exceptions and block policies after hresh changes)
> FAIL: expected ping to .254 to fail (exceptions and block policies after 
> hthresh change in ns3)
> PASS: direct policy matches (exceptions and block policies after hthresh 
> change in ns3)
> PASS: policy matches (exceptions and block policies after hthresh change in 
> ns3)
> FAIL: expected ping to .254 to fail (exceptions and block policies after 
> htresh change to normal)
> PASS: direct policy matches (exceptions and block policies after htresh 
> change to normal)
> PASS: policy matches (exceptions and block policies after htresh change to 
> normal)
> PASS: policies with repeated htresh change
> $ echo $?
> 0
> 
> This is because the $lret in check_xfrm() is not a local variable.

Acked-by: Florian Westphal 


Re: [PATCH v2] xfrm: Fix wraparound in xfrm_policy_addr_delta()

2020-12-30 Thread Florian Westphal
Visa Hankala  wrote:
> Use three-way comparison for address components to avoid integer
> wraparound in the result of xfrm_policy_addr_delta(). This ensures
> that the search trees are built and traversed correctly.
> 
> Treat IPv4 and IPv6 similarly by returning 0 when prefixlen == 0.
> Prefix /0 has only one equivalence class.

Acked-by: Florian Westphal 


[PATCH net 2/3] net: fix pmtu check in nopmtudisc mode

2021-01-05 Thread Florian Westphal
For some reason ip_tunnel insist on setting the DF bit anyway when the
inner header has the DF bit set, EVEN if the tunnel was configured with
'nopmtudisc'.

This means that the script added in the previous commit
cannot be made to work by adding the 'nopmtudisc' flag to the
ip tunnel configuration. Doing so breaks connectivity even for the
without-conntrack/netfilter scenario.

When nopmtudisc is set, the tunnel will skip the mtu check, so no
icmp error is sent to client. Then, because inner header has DF set,
the outer header gets added with DF bit set as well.

IP stack then sends an error to itself because the packet exceeds
the device MTU.

Fixes: 23a3647bc4f93 ("ip_tunnels: Use skb-len to PMTU check.")
Cc: Stefano Brivio 
Signed-off-by: Florian Westphal 
---
 net/ipv4/ip_tunnel.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ee65c9225178..64594aa755f0 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -759,8 +759,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device 
*dev,
goto tx_error;
}
 
-   if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
-   0, 0, false)) {
+   df = tnl_params->frag_off;
+   if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
+   df |= (inner_iph->frag_off & htons(IP_DF));
+
+   if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
ip_rt_put(rt);
goto tx_error;
}
@@ -788,10 +791,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device 
*dev,
ttl = ip4_dst_hoplimit(&rt->dst);
}
 
-   df = tnl_params->frag_off;
-   if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
-   df |= (inner_iph->frag_off&htons(IP_DF));
-
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
if (max_headroom > dev->needed_headroom)
-- 
2.26.2



[PATCH net 0/3] net: fix netfilter defrag/ip tunnel pmtu blackhole

2021-01-05 Thread Florian Westphal
Christian Perle reported a PMTU blackhole due to unexpected interaction
between the ip defragmentation that comes with connection tracking and
ip tunnels.

Unfortunately setting 'nopmtudisc' on the tunnel breaks the test
scenario even without netfilter.

Christinas setup looks like this:
 ++   +-+   ++
 |Router A|---|Wanrouter|---|Router B|
 ||.IPIP..| |..IPIP.||
 ++   +-+   ++
  / mtu 1400   \
 /  \
 ++  ++
 |Client A|  |Client B|
 ++  ++

MTU is 1500 everywhere, except on Router A to Wanrouter and
Wanrouter to Router B.

Router A and Router B use IPIP tunnel interfaces to tunnel traffic
between Client A and Client B over WAN.

Client A sends a 1400 byte UDP datagram to Client B.
This packet gets encapsulated in the IPIP tunnel.

This works, packet is received on client B.

When conntrack (or anything else that forces ip defragmentation) is
enabled on Router A, the packet gets dropped on Router A after
encapsulation because they exceed the link MTU.

Setting the 'nopmtudisc' flag on the IPIP tunnel makes things worse,
no packets pass even in the no-netfilter scenario.

Patch one is a reproducer script for selftest infra.

Patch two is a fix for 'nopmtudisc' behaviour so ip_tunnel will send
an icmp error to Client A.  This allows 'nopmtudisc' tunnel to forward
the UDP datagrams.

Patch three enables ip refragmentation for all reassembled packets, just
like ipv6.



[PATCH net 3/3] net: ip: always refragment ip defragmented packets

2021-01-05 Thread Florian Westphal
Conntrack reassembly records the largest fragment size seen in IPCB.
However, when this gets forwarded/transmitted, fragmentation will only
be forced if one of the fragmented packets had the DF bit set.

In that case, a flag in IPCB will force fragmentation even if the
MTU is large enough.

This should work fine, but this breaks with ip tunnels.
Consider client that sends a UDP datagram of size X to another host.

The client fragments the datagram, so two packets, of size y and z, are
sent. DF bit is not set on any of these packets.

Middlebox netfilter reassembles those packets back to single size-X
packet, before routing decision.

packet-size-vs-mtu checks in ip_forward are irrelevant, because DF bit
isn't set.  At output time, ip refragmentation is skipped as well
because x is still smaller than the mtu of the output device.

If ttransmit device is an ip tunnel, the packet size increases to
x+overhead.

Also, tunnel might be configured to force DF bit on outer header.

In this case, packet will be dropped (exceeds MTU) and an ICMP error is
generated back to sender.

But sender already respects the announced MTU, all the packets that
it sent did fit the announced mtu.

Force refragmentation as per original sizes unconditionally so ip tunnel
will encapsulate the fragments instead.

The only other solution I see is to place ip refragmentation in
the ip_tunnel code to handle this case.

Fixes: d6b915e29f4ad ("ip_fragment: don't forward defragmented DF packet")
Reported-by: Christian Perle 
Signed-off-by: Florian Westphal 
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 89fff5f59eea..2ed0b01f72f0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -302,7 +302,7 @@ static int __ip_finish_output(struct net *net, struct sock 
*sk, struct sk_buff *
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
 
-   if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+   if (skb->len > mtu || IPCB(skb)->frag_max_size)
return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
 
return ip_finish_output2(net, sk, skb);
-- 
2.26.2



[PATCH net 1/3] selftests: netfilter: add selftest for ipip pmtu discovery with enabled connection tracking

2021-01-05 Thread Florian Westphal
Convert Christians bug description into a reproducer.

Cc: Shuah Khan 
Cc: Pablo Neira Ayuso 
Reported-by: Christian Perle 
Signed-off-by: Florian Westphal 
---
 tools/testing/selftests/netfilter/Makefile|   3 +-
 .../selftests/netfilter/ipip-conntrack-mtu.sh | 206 ++
 2 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh

diff --git a/tools/testing/selftests/netfilter/Makefile 
b/tools/testing/selftests/netfilter/Makefile
index a374e10ef506..3006a8e5b41a 100644
--- a/tools/testing/selftests/netfilter/Makefile
+++ b/tools/testing/selftests/netfilter/Makefile
@@ -4,7 +4,8 @@
 TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \
conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
nft_concat_range.sh nft_conntrack_helper.sh \
-   nft_queue.sh nft_meta.sh
+   nft_queue.sh nft_meta.sh \
+   ipip-conntrack-mtu.sh
 
 LDLIBS = -lmnl
 TEST_GEN_FILES =  nf-queue
diff --git a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh 
b/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh
new file mode 100755
index ..4a6f5c3b3215
--- /dev/null
+++ b/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh
@@ -0,0 +1,206 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Conntrack needs to reassemble fragments in order to have complete
+# packets for rule matching.  Reassembly can lead to packet loss.
+
+# Consider the following setup:
+#++   +-+   ++
+#|Router A|---|Wanrouter|---|Router B|
+#||.IPIP..| |..IPIP.||
+#++   +-+   ++
+#   /  mtu 1400   \
+#  /   \
+#++ ++
+#|Client A| |Client B|
+#|| ||
+#++ ++
+
+# Router A and Router B use IPIP tunnel interfaces to tunnel traffic
+# between Client A and Client B over WAN. Wanrouter has MTU 1400 set
+# on its interfaces.
+
+rnd=$(mktemp -u )
+rx=$(mktemp)
+
+r_a="ns-ra-$rnd"
+r_b="ns-rb-$rnd"
+r_w="ns-rw-$rnd"
+c_a="ns-ca-$rnd"
+c_b="ns-cb-$rnd"
+
+checktool (){
+   if ! $1 > /dev/null 2>&1; then
+   echo "SKIP: Could not $2"
+   exit $ksft_skip
+   fi
+}
+
+checktool "iptables --version" "run test without iptables"
+checktool "ip -Version" "run test without ip tool"
+checktool "which nc" "run test without nc (netcat)"
+checktool "ip netns add ${r_a}" "create net namespace"
+
+for n in ${r_b} ${r_w} ${c_a} ${c_b};do
+   ip netns add ${n}
+done
+
+cleanup() {
+   for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do
+   ip netns del ${n}
+   done
+   rm -f ${rx}
+}
+
+trap cleanup EXIT
+
+test_path() {
+   msg="$1"
+
+   ip netns exec ${c_b} nc -n -w 3 -q 3 -u -l -p 5000 > ${rx} < /dev/null &
+
+   sleep 1
+   for i in 1 2 3; do
+   head -c1400 /dev/zero | tr "\000" "a" | ip netns exec ${c_a} nc 
-n -w 1 -u 192.168.20.2 5000
+   done
+
+   wait
+
+   bytes=$(wc -c < ${rx})
+
+   if [ $bytes -eq 1400 ];then
+   echo "OK: PMTU $msg connection tracking"
+   else
+   echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 
1400"
+   exit 1
+   fi
+}
+
+# Detailed setup for Router A
+# ---
+# Interfaces:
+# eth0: 10.2.2.1/24
+# eth1: 192.168.10.1/24
+# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1
+# Routes:
+# 192.168.20.0/24 dev ipip0(192.168.20.0/24 is subnet of Client B)
+# 10.4.4.1 via 10.2.2.254  (Router B via Wanrouter)
+# No iptables rules at all.
+
+ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w}
+ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a}
+
+l_addr="10.2.2.1"
+r_addr="10.4.4.1"
+ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote 
${r_addr} mode ipip || exit $ksft_skip
+
+for dev in lo veth0 veth1 ipip0; do
+ip -net ${r_a} link set $dev up
+done
+
+ip -net ${r_a} addr add 10.2.2.1/24 dev veth0
+ip -net ${r_a} addr add 192.168.10.1/24 dev veth1
+
+ip -net ${r_a} route add 192.168.20.0/24 dev ipip0
+ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254
+
+ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
+
+# Detailed setup for Router B
+# ---
+# 

Re: [PATCH] tcp: remove obsolete paramter sysctl_tcp_low_latency

2021-01-07 Thread Florian Westphal
Jakub Kicinski  wrote:
> > Got it. But a question: why tcp_tw_recycle can be removed totally?
> > it is also part of uAPI
> 
> Good question, perhaps with tcp_tw_recycle we wanted to make sure users
> who depended on it notice removal, since the feature was broken by
> design? 
> 
> tcp_low_latency is an optimization, not functionality which users may
> depend on.
> 
> But I may be wrong so CCing authors.

I guess it was just a case of 'noone noticed'.
I'm not sure if anyone would notice dropping lowlatency sysctl, was just
a case of 'overly careful'.  Personally I'd rather have them gone so
'sysctl tcp.bla' shows if the feature exists/does something.


Re: 5.10.4+ hang with 'rmmod nf_conntrack'

2021-01-07 Thread Florian Westphal
Ben Greear  wrote:
> I noticed my system has a hung process trying to 'rmmod nf_conntrack'.
> 
> I've generally been doing the script that calls rmmod forever,
> but only extensively tested on 5.4 kernel and earlier.
> 
> If anyone has any ideas, please let me know.  This is from 'sysrq t'.  I 
> don't see
> any hung-task splats in dmesg.

rmmod on conntrack loops forever until the active conntrack object count 
reaches 0.
(plus a walk of the conntrack table to evict/put all entries).

> I'll see if it is reproducible and if so will try
> with lockdep enabled...

No idea, there was a regression in 5.6, but that was fixed by the time
5.7 was released.

Can't reproduce hangs with a script that injects a few dummy entries
and then removes the module:

added=0

add_and_rmmod()
{
while [ $added -lt 1000 ]; do
conntrack -I -s 
$(($RANDOM%256)).$(($RANDOM%256)).$(($RANDOM%256)).$(($RANDOM%255+1)) \
-d 
$(($RANDOM%256)).$(($RANDOM%256)).$(($RANDOM%256)).$(($RANDOM%255+1)) \
 --protonum 6 --timeout $(((RANDOM%120) + 240)) --state 
ESTABLISHED --sport $RANDOM --dport $RANDOM 2> /dev/null || break

added=$((added + 1))
if [ $((added % 1000)) -eq 0 ];then
echo $added
fi
done

echo rmmod after adding $added entries
conntrack -C
rmmod nf_conntrack_netlink
rmmod nf_conntrack
}

add_and_rmmod

I don't see how it would make a difference, but do you have any special 
conntrack features enabled
at run time, e.g. reliable netlink events? (If you don't know what I mean the 
answer is no).


Re: [PATCH net] netfilter: conntrack: fix reading nf_conntrack_buckets

2021-01-08 Thread Florian Westphal
Jesper Dangaard Brouer  wrote:
> The old way of changing the conntrack hashsize runtime was through changing
> the module param via file /sys/module/nf_conntrack/parameters/hashsize. This
> was extended to sysctl change in commit 3183ab8997a4 ("netfilter: conntrack:
> allow increasing bucket size via sysctl too").
> 
> The commit introduced second "user" variable nf_conntrack_htable_size_user
> which shadow actual variable nf_conntrack_htable_size. When hashsize is
> changed via module param this "user" variable isn't updated. This results in
> sysctl net/netfilter/nf_conntrack_buckets shows the wrong value when users
> update via the old way.

Oh, right!

Acked-by: Florian Westphal 


Re: [PATCH] netfilter: Fix memleak in nf_nat_init

2021-01-09 Thread Florian Westphal
Dinghao Liu  wrote:
> When register_pernet_subsys() fails, nf_nat_bysource
> should be freed just like when nf_ct_extend_register()
> fails.

Acked-by: Florian Westphal 


Re: [PATCH nf 2/2] netfilter: use actual socket sk rather than skb sk when routing harder

2020-10-29 Thread Florian Westphal
Jason A. Donenfeld  wrote:
> If netfilter changes the packet mark when mangling, the packet is
> rerouted using the route_me_harder set of functions. Prior to this
> commit, there's one big difference between route_me_harder and the
> ordinary initial routing functions, described in the comment above
> __ip_queue_xmit():
> 
>/* Note: skb->sk can be different from sk, in case of tunnels */
>int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
> 
> That function goes on to correctly make use of sk->sk_bound_dev_if,
> rather than skb->sk->sk_bound_dev_if. And indeed the comment is true: a
> tunnel will receive a packet in ndo_start_xmit with an initial skb->sk.
> It will make some transformations to that packet, and then it will send
> the encapsulated packet out of a *new* socket. That new socket will
> basically always have a different sk_bound_dev_if (otherwise there'd be
> a routing loop). So for the purposes of routing the encapsulated packet,
> the routing information as it pertains to the socket should come from
> that socket's sk, rather than the packet's original skb->sk. For that
> reason __ip_queue_xmit() and related functions all do the right thing.
> 
> One might argue that all tunnels should just call skb_orphan(skb) before
> transmitting the encapsulated packet into the new socket. But tunnels do
> *not* do this -- and this is wisely avoided in skb_scrub_packet() too --
> because features like TSQ rely on skb->destructor() being called when
> that buffer space is truely available again. Calling skb_orphan(skb) too
> early would result in buffers filling up unnecessarily and accounting
> info being all wrong. Instead, additional routing must take into account
> the new sk, just as __ip_queue_xmit() notes.
> 
> So, this commit addresses the problem by fishing the correct sk out of
> state->sk -- it's already set properly in the call to nf_hook() in
> __ip_local_out(), which receives the sk as part of its normal
> functionality. So we make sure to plumb state->sk through the various
> route_me_harder functions, and then make correct use of it following the
> example of __ip_queue_xmit().

Reviewed-by: Florian Westphal 


Re: WARNING in dst_release

2021-02-18 Thread Florian Westphal
syzbot  wrote:
> Hello,
> 
> syzbot has tested the proposed patch and the reproducer did not trigger any 
> issue:
> 
> Reported-and-tested-by: syzbot+b53bbea2ad64f9cf8...@syzkaller.appspotmail.com

#syz-fix: mptcp: reset last_snd on subflow close

[ This patch is currently in mptcp-next ]


Re: [PATCH] xfrm: Fix incorrect types in assignment

2021-02-19 Thread Florian Westphal
Yang Li  wrote:
> Fix the following sparse warnings:
> net/xfrm/xfrm_policy.c:1303:22: warning: incorrect type in assignment
> (different address spaces)
> Reported-by: Abaci Robot 
> Signed-off-by: Yang Li 
> ---
>  net/xfrm/xfrm_policy.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
> index b74f28c..5c67407 100644
> --- a/net/xfrm/xfrm_policy.c
> +++ b/net/xfrm/xfrm_policy.c
> @@ -1225,7 +1225,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
>   struct xfrm_policy *pol;
>   struct xfrm_policy *policy;
>   struct hlist_head *chain;
> - struct hlist_head *odst;
> + struct hlist_head __rcu *odst;

This doesn't look right.  Try something like

- odst = net->xfrm.policy_bydst[dir].table;
+ odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
   
lockdep_is_held(&net->xfrm.xfrm_policy_lock));


Re: [PATCH][next] netfilter: nf_log_bridge: Fix missing assignment of ret on a call to nf_log_register

2021-03-31 Thread Florian Westphal
Colin King  wrote:
> From: Colin Ian King 
> 
> Currently the call to nf_log_register is returning an error code that
> is not being assigned to ret and yet ret is being checked. Fix this by
> adding in the missing assignment.

Thanks for catching this.

Acked-by: Florian Westphal 


Re: [PATCH net 2/2] mptcp: revert "mptcp: provide subflow aware release function"

2021-04-01 Thread Florian Westphal
Paolo Abeni  wrote:
> This change reverts commit ad98dd37051e ("mptcp: provide subflow aware
> release function"). The latter introduced a deadlock spotted by
> syzkaller and is not needed anymore after the previous commit.
> 
> Fixes: ad98dd37051e ("mptcp: provide subflow aware release function")
> Signed-off-by: Paolo Abeni 

Paolo, thanks for passing this to -net.

Acked-by: Florian Westphal 


Re: [PATCH netfilter] netfilter: xt_IDLETIMER: fix idletimer_tg_helper non-kosher casts

2021-04-02 Thread Florian Westphal
Maciej Żenczykowski  wrote:
> From: Maciej Żenczykowski 
> 
> The code is relying on the identical layout of the beginning
> of the v0 and v1 structs, but this can easily lead to code bugs
> if one were to try to extend this further...

What is the concern?  These structs are part of ABI, they
cannot be changed.


[PATCH net-next] net: dccp: use net_generic storage

2021-04-08 Thread Florian Westphal
DCCP is virtually never used, so no need to use space in struct net for it.

Put the pernet ipv4/v6 socket in the dccp ipv4/ipv6 modules instead.

Signed-off-by: Florian Westphal 
---
 include/net/net_namespace.h |  4 
 include/net/netns/dccp.h| 12 
 net/dccp/ipv4.c | 24 
 net/dccp/ipv6.c | 24 
 4 files changed, 40 insertions(+), 24 deletions(-)
 delete mode 100644 include/net/netns/dccp.h

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 3802c8322ab0..fa5887143f0d 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -22,7 +22,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@ -130,9 +129,6 @@ struct net {
 #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
struct netns_sctp   sctp;
 #endif
-#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
-   struct netns_dccp   dccp;
-#endif
 #ifdef CONFIG_NETFILTER
struct netns_nf nf;
struct netns_xt xt;
diff --git a/include/net/netns/dccp.h b/include/net/netns/dccp.h
deleted file mode 100644
index cdbc4f5b8390..
--- a/include/net/netns/dccp.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __NETNS_DCCP_H__
-#define __NETNS_DCCP_H__
-
-struct sock;
-
-struct netns_dccp {
-   struct sock *v4_ctl_sk;
-   struct sock *v6_ctl_sk;
-};
-
-#endif
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 2455b0c0e486..ffc601a3b329 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -23,14 +23,21 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ackvec.h"
 #include "ccid.h"
 #include "dccp.h"
 #include "feat.h"
 
+struct dccp_v4_pernet {
+   struct sock *v4_ctl_sk;
+};
+
+static unsigned int dccp_v4_pernet_id __read_mostly;
+
 /*
- * The per-net dccp.v4_ctl_sk socket is used for responding to
+ * The per-net v4_ctl_sk socket is used for responding to
  * the Out-of-the-blue (OOTB) packets. A control sock will be created
  * for this socket at the initialization time.
  */
@@ -513,7 +520,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, 
struct sk_buff *rxskb)
struct sk_buff *skb;
struct dst_entry *dst;
struct net *net = dev_net(skb_dst(rxskb)->dev);
-   struct sock *ctl_sk = net->dccp.v4_ctl_sk;
+   struct dccp_v4_pernet *pn;
+   struct sock *ctl_sk;
 
/* Never send a reset in response to a reset. */
if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
@@ -522,6 +530,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, 
struct sk_buff *rxskb)
if (skb_rtable(rxskb)->rt_type != RTN_LOCAL)
return;
 
+   pn = net_generic(net, dccp_v4_pernet_id);
+   ctl_sk = pn->v4_ctl_sk;
dst = dccp_v4_route_skb(net, ctl_sk, rxskb);
if (dst == NULL)
return;
@@ -1005,16 +1015,20 @@ static struct inet_protosw dccp_v4_protosw = {
 
 static int __net_init dccp_v4_init_net(struct net *net)
 {
+   struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id);
+
if (dccp_hashinfo.bhash == NULL)
return -ESOCKTNOSUPPORT;
 
-   return inet_ctl_sock_create(&net->dccp.v4_ctl_sk, PF_INET,
+   return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET,
SOCK_DCCP, IPPROTO_DCCP, net);
 }
 
 static void __net_exit dccp_v4_exit_net(struct net *net)
 {
-   inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
+   struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id);
+
+   inet_ctl_sock_destroy(pn->v4_ctl_sk);
 }
 
 static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
@@ -1026,6 +1040,8 @@ static struct pernet_operations dccp_v4_ops = {
.init   = dccp_v4_init_net,
.exit   = dccp_v4_exit_net,
.exit_batch = dccp_v4_exit_batch,
+   .id = &dccp_v4_pernet_id,
+   .size   = sizeof(struct dccp_v4_pernet),
 };
 
 static int __init dccp_v4_init(void)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 2be5c69824f9..6f5304db5a67 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -27,13 +27,20 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "dccp.h"
 #include "ipv6.h"
 #include "feat.h"
 
-/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */
+struct dccp_v6_pernet {
+   struct sock *v6_ctl_sk;
+};
+
+static unsigned int dccp_v6_pernet_id __read_mostly;
+
+/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */
 
 static const struct inet_connection_sock_af_ops dccp_ipv6_mapped;
 static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
@@ -254,7 +261,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, 
str

Re: [BUG / question] in routing rules, some options (e.g. ipproto, sport) cause rules to be ignored in presence of packet marks

2021-04-09 Thread Florian Westphal
Michal Soltys  wrote:
> On 3/29/21 10:52 PM, Ido Schimmel wrote:
> > 
> > ip_route_me_harder() does not set source / destination port in the
> > flow key, so it explains why fib rules that use them are not hit after
> > mangling the packet. These keys were added in 4.17, but I
> > don't think this use case every worked. You have a different experience?
> > 
> 
> So all the more recent additions to routing rules - src port, dst port, uid
> range and ipproto - are not functioning correctly with the second routing
> check.
>
> Are there plans to eventually fix that ?
> 
> While I just adjusted/rearranged my stuff to not rely on those, it should
> probably be at least documented otherwise (presumably in ip-rule manpage and
> perhaps in `ip rule help` as well).

Fixing this would be better. As Ido implies it should be enough to fully
populate the flow keys in ip(6)_route_me_harder.


Re: linux-next: build failure after merge of the net-next tree

2021-04-12 Thread Florian Westphal
Stephen Rothwell  wrote:
> net/bridge/netfilter/ebtables.c:1248:33: error: 'struct netns_xt' has no 
> member named 'tables'
>  1248 |  list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
>   | ^
> include/linux/list.h:619:20: note: in definition of macro 'list_entry_is_head'
>   619 |  (&pos->member == (head))
>   |^~~~
> net/bridge/netfilter/ebtables.c:1248:2: note: in expansion of macro 
> 'list_for_each_entry'
>  1248 |  list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
>   |  ^~~
> 
> Caused by commit
> 
>   5b53951cfc85 ("netfilter: ebtables: use net_generic infra")
> 
> interacting with commit
> 
>   7ee3c61dcd28 ("netfilter: bridge: add pre_exit hooks for ebtable 
> unregistration")

Right, the fixup is correct.


Re: [PATCH] netfilter: nf_conntrack: Add conntrack helper for ESP/IPsec

2021-04-14 Thread Florian Westphal
Cole Dishington  wrote:
> Introduce changes to add ESP connection tracking helper to netfilter
> conntrack. The connection tracking of ESP is based on IPsec SPIs. The
> underlying motivation for this patch was to allow multiple VPN ESP
> clients to be distinguished when using NAT.
>
> Added config flag CONFIG_NF_CT_PROTO_ESP to enable the ESP/IPsec
> conntrack helper.

Thanks for the effort to upstream out of tree code.

A couple of comments and questions below.

Preface: AFAIU this tracker aims to 'soft-splice' two independent
ESP connections, i.e.:

saddr:spi1 -> daddr
daddr:spi2 <- saddr

So that we basically get this conntrack:

saddr,daddr,spi1 (original)   daddr,saddr,spi2 (remote)

This can't be done as-is, because we don't know spi2 at the time the
first ESP packet is received.

The solution implemented here is introduction of a 'virtual esp id',
computed when first ESP packet is received, so conntrack really stores:

saddr,daddr,ID (original)   daddr,saddr,ID (remote)

Because the ID is never carried on the wire, this tracker hooks into
pkt_to_tuple() infra so that the conntrack tuple gets populated
as-needed.

If I got that right, I think it would be good to place some description
like this in the source code, this is unlike all the other trackers.

> index ..2441e031c68e
> --- /dev/null
> +++ b/include/linux/netfilter/nf_conntrack_proto_esp.h
> @@ -0,0 +1,25 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _CONNTRACK_PROTO_ESP_H
> +#define _CONNTRACK_PROTO_ESP_H
> +#include 
> +
> +/* ESP PROTOCOL HEADER */
> +
> +struct esphdr {
> + __u32 spi;
> +};
> +
> +struct nf_ct_esp {
> + unsigned int stream_timeout;
> + unsigned int timeout;
> +};
> +
> +#ifdef __KERNEL__
> +#include 
> +
> +void destroy_esp_conntrack_entry(struct nf_conn *ct);
> +
> +bool esp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
> +   struct net *net, struct nf_conntrack_tuple *tuple);
> +#endif /* __KERNEL__ */

No need for the __KERNEL__, this header is not exposed to userspace
(only those in include/uapi/).

>   struct {
>   __be16 key;
>   } gre;
> + struct {
> + __be16 spi;

__be32 ?

I now see that this "spi" seems to be allocated by the esp tracker.
Maybe 'esp_id' or something like that?

It doesn't appear to be related to the ESP header SPI value.

> --- a/include/net/netns/conntrack.h
> +++ b/include/net/netns/conntrack.h
> @@ -69,6 +69,27 @@ struct nf_gre_net {
>  };
>  #endif
>  
> +#ifdef CONFIG_NF_CT_PROTO_ESP
> +#define ESP_MAX_PORTS  1000
> +#define HASH_TAB_SIZE  ESP_MAX_PORTS

ESP? Ports?  Should this be 'slots'?  Maybe a comment helps, I don't
expect to see ports in an ESP tracker.

> +enum esp_conntrack {
> + ESP_CT_UNREPLIED,
> + ESP_CT_REPLIED,
> + ESP_CT_MAX
> +};
> +
> +struct nf_esp_net {
> + rwlock_t esp_table_lock;

This uses a rwlock but i only see writer locks being taken.
So this either should use a spinlock, or reader-parts should
take readlock, not wrlock.

(but also see below).

> + struct hlist_head ltable[HASH_TAB_SIZE];
> + struct hlist_head rtable[HASH_TAB_SIZE];
> + /* Initial lookup for remote end until rspi is known */
> + struct hlist_head incmpl_rtable[HASH_TAB_SIZE];
> + struct _esp_table *esp_table[ESP_MAX_PORTS];
> + unsigned int esp_timeouts[ESP_CT_MAX];
> +};

This is large structure -- >32kb.

Could this be moved to nf_conntrack_net?

It would also be good to not allocate these hash slots until after conntrack
is needed.

The esp_timeouts[] can be kept to avoid module dep problems.

(But also see below, I'm not sure homegrown hash table is the way to go).

>  struct ct_pcpu {
> diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h 
> b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
> index 64390fac6f7e..9bbd76c325d2 100644
> --- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
> +++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
> @@ -39,6 +39,9 @@ union nf_conntrack_man_proto {
..
> +#if 0
> +#define ESP_DEBUG 1
> +#define DEBUGP(format, args...) printk(KERN_DEBUG "%s: " format, __func__, 
> ## args)
> +#else
> +#undef ESP_DEBUG
> +#define DEBUGP(x, args...)
> +#endif

I suggest to get rid of all of DEBUGP(), either drop them, or, in cases
where they are useful, switch to pr_debug().

> +#define TEMP_SPI_START 1500
> +#define TEMP_SPI_MAX   (TEMP_SPI_START + ESP_MAX_PORTS - 1)

I think this could use an explanation.

> +struct _esp_table {
> + /* Hash table nodes for each required lookup
> +  * lnode: l_spi, l_ip, r_ip
> +  * rnode: r_spi, r_ip
> +  * incmpl_rnode: r_ip
> +  */
> + struct hlist_node lnode;
> + struct hlist_node rnode;
> + struct hlist_node incmpl_rnode;
> +
> + u32 l_spi;
> + u32 r_spi;
> + u32 l_ip;
> + u32 r_ip;

Hmm, ipv4 only.  Could this be chan

[PATCH ipsec-next 2/3] xfrm: remove stray synchronize_rcu from xfrm_init

2021-04-14 Thread Florian Westphal
This function is called during boot, from ipv4 stack, there is no need
to set the pointer to NULL (static storage duration, so already NULL).

No need for the synchronize_rcu either.  Remove both.

Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_policy.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c49f20657cdb..59691611a9ab 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -4134,9 +4134,6 @@ void __init xfrm_init(void)
 #ifdef CONFIG_XFRM_ESPINTCP
espintcp_init();
 #endif
-
-   RCU_INIT_POINTER(xfrm_if_cb, NULL);
-   synchronize_rcu();
 }
 
 #ifdef CONFIG_AUDITSYSCALL
-- 
2.26.3



[PATCH ipsec-next 1/3] flow: remove spi key from flowi struct

2021-04-14 Thread Florian Westphal
xfrm session decode ipv4 path (but not ipv6) sets this, but there are no
consumers.  Remove it.

Signed-off-by: Florian Westphal 
---
 include/net/flow.h |  3 ---
 net/xfrm/xfrm_policy.c | 39 ---
 2 files changed, 42 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 39d0cedcddee..6f5e70240071 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -59,7 +59,6 @@ union flowi_uli {
__le16  sport;
} dnports;
 
-   __be32  spi;
__be32  gre_key;
 
struct {
@@ -90,7 +89,6 @@ struct flowi4 {
 #define fl4_dport  uli.ports.dport
 #define fl4_icmp_type  uli.icmpt.type
 #define fl4_icmp_code  uli.icmpt.code
-#define fl4_ipsec_spi  uli.spi
 #define fl4_mh_typeuli.mht.type
 #define fl4_gre_keyuli.gre_key
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
@@ -150,7 +148,6 @@ struct flowi6 {
 #define fl6_dport  uli.ports.dport
 #define fl6_icmp_type  uli.icmpt.type
 #define fl6_icmp_code  uli.icmpt.code
-#define fl6_ipsec_spi  uli.spi
 #define fl6_mh_typeuli.mht.type
 #define fl6_gre_keyuli.gre_key
__u32   mp_hash;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b74f28cabe24..c49f20657cdb 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -3326,39 +3326,6 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, 
bool reverse)
fl4->fl4_icmp_code = icmp[1];
}
break;
-   case IPPROTO_ESP:
-   if (xprth + 4 < skb->data ||
-   pskb_may_pull(skb, xprth + 4 - skb->data)) {
-   __be32 *ehdr;
-
-   xprth = skb_network_header(skb) + ihl * 4;
-   ehdr = (__be32 *)xprth;
-
-   fl4->fl4_ipsec_spi = ehdr[0];
-   }
-   break;
-   case IPPROTO_AH:
-   if (xprth + 8 < skb->data ||
-   pskb_may_pull(skb, xprth + 8 - skb->data)) {
-   __be32 *ah_hdr;
-
-   xprth = skb_network_header(skb) + ihl * 4;
-   ah_hdr = (__be32 *)xprth;
-
-   fl4->fl4_ipsec_spi = ah_hdr[1];
-   }
-   break;
-   case IPPROTO_COMP:
-   if (xprth + 4 < skb->data ||
-   pskb_may_pull(skb, xprth + 4 - skb->data)) {
-   __be16 *ipcomp_hdr;
-
-   xprth = skb_network_header(skb) + ihl * 4;
-   ipcomp_hdr = (__be16 *)xprth;
-
-   fl4->fl4_ipsec_spi = 
htonl(ntohs(ipcomp_hdr[1]));
-   }
-   break;
case IPPROTO_GRE:
if (xprth + 12 < skb->data ||
pskb_may_pull(skb, xprth + 12 - skb->data)) {
@@ -3377,7 +3344,6 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, 
bool reverse)
}
break;
default:
-   fl4->fl4_ipsec_spi = 0;
break;
}
}
@@ -3470,12 +3436,7 @@ decode_session6(struct sk_buff *skb, struct flowi *fl, 
bool reverse)
fl6->flowi6_proto = nexthdr;
return;
 #endif
-   /* XXX Why are there these headers? */
-   case IPPROTO_AH:
-   case IPPROTO_ESP:
-   case IPPROTO_COMP:
default:
-   fl6->fl6_ipsec_spi = 0;
fl6->flowi6_proto = nexthdr;
return;
}
-- 
2.26.3



[PATCH ipsec-next 0/3] xfrm: minor cleanup and synchronize_rcu removal

2021-04-14 Thread Florian Westphal
First patch gets rid of SPI key from flowi struct.
xfrm_policy populates this but there are no consumers.

This is part of a different patch (not part of this) to replace
xfrm_decode_session internals with the flow dissector.

Second patch removes a synchronize_rcu/initialisation in the init path.
Third patch avoids a synchronize_rcu during netns destruction.

Florian Westphal (3):
  flow: remove spi key from flowi struct
  xfrm: remove stray synchronize_rcu from xfrm_init
  xfrm: avoid synchronize_rcu during netns destruction

 include/net/flow.h |  3 ---
 net/xfrm/xfrm_policy.c | 42 --
 net/xfrm/xfrm_user.c   | 10 +++---
 3 files changed, 7 insertions(+), 48 deletions(-)

-- 
2.26.3



[PATCH ipsec-next 3/3] xfrm: avoid synchronize_rcu during netns destruction

2021-04-14 Thread Florian Westphal
Use the new exit_pre hook to NULL the netlink socket.
The net namespace core will do a synchronize_rcu() between the exit_pre
and exit/exit_batch handlers.

Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_user.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 5a0ef4361e43..9313592fa01f 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -3480,18 +3480,22 @@ static int __net_init xfrm_user_net_init(struct net 
*net)
return 0;
 }
 
+static void __net_exit xfrm_user_net_pre_exit(struct net *net)
+{
+   RCU_INIT_POINTER(net->xfrm.nlsk, NULL);
+}
+
 static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
 {
struct net *net;
-   list_for_each_entry(net, net_exit_list, exit_list)
-   RCU_INIT_POINTER(net->xfrm.nlsk, NULL);
-   synchronize_net();
+
list_for_each_entry(net, net_exit_list, exit_list)
netlink_kernel_release(net->xfrm.nlsk_stash);
 }
 
 static struct pernet_operations xfrm_user_net_ops = {
.init   = xfrm_user_net_init,
+   .pre_exit   = xfrm_user_net_pre_exit,
.exit_batch = xfrm_user_net_exit,
 };
 
-- 
2.26.3



  1   2   3   4   5   6   7   8   9   10   >