[PATCH net-next] macvlan: fix memory hole in macvlan_dev

2017-12-08 Thread Girish Moodalbail
Move 'macaddr_count' from after 'netpoll' to after 'nest_level' to pack
and reduce a memory hole.

Fixes: 88ca59d1aaf28c25 (macvlan: remove unused fields in struct macvlan_dev)
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 include/linux/if_macvlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index bedf54b..4cb7aee 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -30,10 +30,10 @@ struct macvlan_dev {
enum macvlan_mode   mode;
u16 flags;
int nest_level;
+   unsigned intmacaddr_count;
 #ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll  *netpoll;
 #endif
-   unsigned intmacaddr_count;
 };
 
 static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
-- 
1.8.3.1



[PATCH net v2 0/1] NULL pointer dereference in ipvlan_port_destroy

2017-11-16 Thread Girish Moodalbail
>From code inspection it appeared that there is a possibility where in
ipvlan_port_destroy() might be dealing with a port (struct ipvl_port)
that has already been destroyed and is therefore already NULL. However,
we don't check for NULL and continue to access the fields which results
in a kernel panic.

When call to register_netdevice() (called from ipvlan_link_new()) fails,
inside that function we call ipvlan_uninit() (through ndo_uninit()) to
destroy the ipvlan port. Upon returning unsuccessfully from
register_netdevice() we go ahead and call ipvlan_port_destroy() again
which causes NULL pointer dereference panic.

To test this theory, I loaded up netdev-notifier-error-inject.ko and did 

$ sudo echo -22 > /sys/kernel/debug/notifier-error-inject/\
  netdev/actions/NETDEV_POST_INIT/error
$ sudo  ip li add ipvl0 link enp7s0 type ipvlan
...system panics...
BUG: unable to handle kernel NULL pointer dereference at 0820
IP: ipvlan_port_destroy+0x2a/0xf0 [ipvlan]

Similar issue exists in macvlan_port_destroy() and it will be addressed
by a separate patch. The following patch fixes the ipvlan case. I tested
my changes for regression by running LTP's ipvlan test case.

Girish Moodalbail (1):
  ipvlan: NULL pointer dereference panic in ipvlan_port_destroy

 drivers/net/ipvlan/ipvlan_main.c | 104 +--
 1 file changed, 55 insertions(+), 49 deletions(-)

-- 
1.8.3.1



[PATCH net v2 1/1] ipvlan: NULL pointer dereference panic in ipvlan_port_destroy

2017-11-16 Thread Girish Moodalbail
When call to register_netdevice() (called from ipvlan_link_new()) fails,
we call ipvlan_uninit() (through ndo_uninit()) to destroy the ipvlan
port. After returning unsuccessfully from register_netdevice() we go
ahead and call ipvlan_port_destroy() again which causes NULL pointer
dereference panic. Fix the issue by making ipvlan_init() and
ipvlan_uninit() call symmetric.

The ipvlan port will now be created inside ipvlan_init() and will be
destroyed in ipvlan_uninit().

Fixes: 2ad7bf363841 (ipvlan: Initial check-in of the IPVLAN driver)
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
v1 -> v2:
  - took care of David Miller's comment on ipvlan_init() and
ipvlan_uninit() not being symmetric.
---
---
 drivers/net/ipvlan/ipvlan_main.c | 104 +--
 1 file changed, 55 insertions(+), 49 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index a266aa4..30cb803 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -107,16 +107,6 @@ static int ipvlan_port_create(struct net_device *dev)
struct ipvl_port *port;
int err, idx;
 
-   if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) {
-   netdev_err(dev, "Master is either lo or non-ether device\n");
-   return -EINVAL;
-   }
-
-   if (netdev_is_rx_handler_busy(dev)) {
-   netdev_err(dev, "Device is already in use.\n");
-   return -EBUSY;
-   }
-
port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL);
if (!port)
return -ENOMEM;
@@ -179,8 +169,9 @@ static void ipvlan_port_destroy(struct net_device *dev)
 static int ipvlan_init(struct net_device *dev)
 {
struct ipvl_dev *ipvlan = netdev_priv(dev);
-   const struct net_device *phy_dev = ipvlan->phy_dev;
-   struct ipvl_port *port = ipvlan->port;
+   struct net_device *phy_dev = ipvlan->phy_dev;
+   struct ipvl_port *port;
+   int err;
 
dev->state = (dev->state & ~IPVLAN_STATE_MASK) |
 (phy_dev->state & IPVLAN_STATE_MASK);
@@ -196,18 +187,27 @@ static int ipvlan_init(struct net_device *dev)
if (!ipvlan->pcpu_stats)
return -ENOMEM;
 
+   if (!netif_is_ipvlan_port(phy_dev)) {
+   err = ipvlan_port_create(phy_dev);
+   if (err < 0) {
+   free_percpu(ipvlan->pcpu_stats);
+   return err;
+   }
+   }
+   port = ipvlan_port_get_rtnl(phy_dev);
port->count += 1;
-
return 0;
 }
 
 static void ipvlan_uninit(struct net_device *dev)
 {
struct ipvl_dev *ipvlan = netdev_priv(dev);
-   struct ipvl_port *port = ipvlan->port;
+   struct net_device *phy_dev = ipvlan->phy_dev;
+   struct ipvl_port *port;
 
free_percpu(ipvlan->pcpu_stats);
 
+   port = ipvlan_port_get_rtnl(phy_dev);
port->count -= 1;
if (!port->count)
ipvlan_port_destroy(port->dev);
@@ -554,7 +554,6 @@ int ipvlan_link_new(struct net *src_net, struct net_device 
*dev,
struct net_device *phy_dev;
int err;
u16 mode = IPVLAN_MODE_L3;
-   bool create = false;
 
if (!tb[IFLA_LINK])
return -EINVAL;
@@ -568,28 +567,41 @@ int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
 
phy_dev = tmp->phy_dev;
} else if (!netif_is_ipvlan_port(phy_dev)) {
-   err = ipvlan_port_create(phy_dev);
-   if (err < 0)
-   return err;
-   create = true;
-   }
+   /* Exit early if the underlying link is invalid or busy */
+   if (phy_dev->type != ARPHRD_ETHER ||
+   phy_dev->flags & IFF_LOOPBACK) {
+   netdev_err(phy_dev,
+  "Master is either lo or non-ether device\n");
+   return -EINVAL;
+   }
 
-   if (data && data[IFLA_IPVLAN_MODE])
-   mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+   if (netdev_is_rx_handler_busy(phy_dev)) {
+   netdev_err(phy_dev, "Device is already in use.\n");
+   return -EBUSY;
+   }
+   }
 
-   port = ipvlan_port_get_rtnl(phy_dev);
ipvlan->phy_dev = phy_dev;
ipvlan->dev = dev;
-   ipvlan->port = port;
ipvlan->sfeatures = IPVLAN_FEATURES;
ipvlan_adjust_mtu(ipvlan, phy_dev);
INIT_LIST_HEAD(>addrs);
 
-   /* Flags are per port and latest update overrides. User has
-* to be consistent in setting it just like the mode attribute.
+   /* TODO Probably put random address here to be presented to the
+* world but keep using 

Re: [PATCH net v2] ipv6: set all.accept_dad to 0 by default

2017-11-14 Thread Girish Moodalbail

On 11/14/17 11:10 AM, Stefano Brivio wrote:

On Tue, 14 Nov 2017 10:30:33 -0800
Girish Moodalbail <girish.moodalb...@oracle.com> wrote:


On 11/14/17 5:21 AM, Nicolas Dichtel wrote:

With commits 35e015e1f577 and a2d3f3e33853, the global 'accept_dad' flag
is also taken into account (default value is 1). If either global or
per-interface flag is non-zero, DAD will be enabled on a given interface.

This is not backward compatible: before those patches, the user could
disable DAD just by setting the per-interface flag to 0. Now, the
user instead needs to set both flags to 0 to actually disable DAD.

Restore the previous behaviour by setting the default for the global
'accept_dad' flag to 0. This way, DAD is still enabled by default,
as per-interface flags are set to 1 on device creation, but setting
them to 0 is enough to disable DAD on a given interface.

- Before 35e015e1f57a7 and a2d3f3e33853:
globalper-interfaceDAD enabled
[default]   1 1  yes
  X 0  no
  X 1  yes

- After 35e015e1f577 and a2d3f3e33853:
globalper-interfaceDAD enabled
[default]   1 1  yes
  0 0  no
  0 1  yes
  1 0  yes

- After this fix:
globalper-interfaceDAD enabled
  1 1  yes
  0 0  no
[default]   0 1  yes
  1 0  yes


Above table can be summarized to..

- After this fix:
globalper-interfaceDAD enabled
  1 X  yes
  0 0  no
[default]   0 1  yes

So, if global is set to '1', then irrespective of what the per-interface value
is DAD will be enabled. Is it not confusing. Shouldn't the more specific value
override the general value?


Might be a bit confusing, yes, but in order to implement an overriding
mechanism you would need to implement a tristate option as Eric K.
proposed. That is, by default you would have -1 (meaning "don't care")
on per-interface flags, and if this value is changed then the
per-interface value wins over the global one.

Sensible, but I think it's outside of the scope of this patch, which is
just intended to restore a specific pre-existing userspace expectation.


On the other hand, if the global is set to '0', then per-interface value will be
honored (overrides global). So, the meaning of global varies based on its value.
Isn't that confusing as well.


I don't find this confusing though. Setting the global flag always has
the meaning of "force enabling DAD on all interfaces".

You would have the same problem if you chose a logical AND between
global and per-interface flag. There, setting the global flag would mean
"force disabling DAD on all interfaces".

So the only indisputable improvement I see here would be to implement a
"don't care" value (either for global or for per-interface flags). But
I'd rather agree with Nicolas that we should fix a potentially broken
userspace assumption first.


Agree.

Thanks,
~Girish




Re: [PATCH net v2] ipv6: set all.accept_dad to 0 by default

2017-11-14 Thread Girish Moodalbail

On 11/14/17 5:21 AM, Nicolas Dichtel wrote:

With commits 35e015e1f577 and a2d3f3e33853, the global 'accept_dad' flag
is also taken into account (default value is 1). If either global or
per-interface flag is non-zero, DAD will be enabled on a given interface.

This is not backward compatible: before those patches, the user could
disable DAD just by setting the per-interface flag to 0. Now, the
user instead needs to set both flags to 0 to actually disable DAD.

Restore the previous behaviour by setting the default for the global
'accept_dad' flag to 0. This way, DAD is still enabled by default,
as per-interface flags are set to 1 on device creation, but setting
them to 0 is enough to disable DAD on a given interface.

- Before 35e015e1f57a7 and a2d3f3e33853:
   globalper-interfaceDAD enabled
[default]   1 1  yes
 X 0  no
 X 1  yes

- After 35e015e1f577 and a2d3f3e33853:
   globalper-interfaceDAD enabled
[default]   1 1  yes
 0 0  no
 0 1  yes
 1 0  yes

- After this fix:
   globalper-interfaceDAD enabled
 1 1  yes
 0 0  no
[default]   0 1  yes
 1 0  yes


Above table can be summarized to..

- After this fix:
  globalper-interfaceDAD enabled
1 X  yes
0 0  no
[default]   0 1  yes

So, if global is set to '1', then irrespective of what the per-interface value 
is DAD will be enabled. Is it not confusing. Shouldn't the more specific value 
override the general value?


On the other hand, if the global is set to '0', then per-interface value will be 
honored (overrides global). So, the meaning of global varies based on its value. 
Isn't that confusing as well.


thanks,
~Girish




Re: KASAN: use-after-free Read in rds_tcp_dev_event

2017-11-14 Thread Girish Moodalbail

On 11/14/17 5:22 AM, Sowmini Varadhan wrote:



A few questions.

- First off, why am I not seeing the original mail in this thread
   even when I search the mail archives, e.g.,
 https://lkml.org/lkml/2017/11/13/954

-  Girish Moodalbail writes:


The issue here is that we are trying to reference a network namespace
(struct net *) that is long gone (i.e., L532 below -- c_net is the culprit).


   The netns is not "long gone", we are still processing
   the NETDEV_UNREGISTER_FINAL for loopback.


Obviously, I was not talking about the current namespace.

Say there are two namespaces - ns1 and ns2 and that both have RDS connections. 
Deletion of ns1 will be fine. However when ns2 is being deleted, in the 
rds_tcp_dev_event() callback we walk through the global list and some nodes in 
that list will be referring to ns1 (that is "long gone"). If you read my earlier 
email, I was talking about ns1 which is already gone, and we are trying to 
access it from ns2.


~Girish



As I said in my
   earlier mail, the idea is to extract the list of unique conns
   that belong to the netns and then destroy both the conn, and
   all associated paths. Thus there can only be a single thread
   going through rds_tcp_kill_sock at any time (since we should
   only get the unregister_final/loopback one time for the netns).
   (See alos comment block in rds_tcp_dev_event about network activity
   quiescing). Thus there should be no concurrency issue.

   However when I just ehecked this, there may be some rds connection
   refcounting bug. When I quickly tested this, I'm not seeing the
   expected calls to conn_path_destroy. I'll need some time to take
   a look, this has been known to work, so something got broken along
   the way
  

I think we should move away from global list to a per-namespace list. The
global list are used only in two places (both of which are per-namespace
operations):


let's first understand the real root-cause before we start
redesigning data-structures.

--Sowmini







Re: KASAN: use-after-free Read in rds_tcp_dev_event

2017-11-13 Thread Girish Moodalbail

On 11/7/17 12:28 PM, syzbot wrote:

Hello,

syzkaller hit the following crash on 287683d027a3ff83feb6c7044430c79881664ecf
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
compiler: gcc (GCC) 7.1.1 20170620
.config is attached
Raw console output is attached.




==
BUG: KASAN: use-after-free in rds_tcp_kill_sock net/rds/tcp.c:530 [inline]
BUG: KASAN: use-after-free in rds_tcp_dev_event+0xc01/0xc90 net/rds/tcp.c:568
Read of size 8 at addr 8801cd879200 by task kworker/u4:3/147

CPU: 0 PID: 147 Comm: kworker/u4:3 Not tainted 4.14.0-rc7+ #156
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011

Workqueue: netns cleanup_net
Call Trace:
  __dump_stack lib/dump_stack.c:16 [inline]
  dump_stack+0x194/0x257 lib/dump_stack.c:52
  print_address_description+0x73/0x250 mm/kasan/report.c:252
  kasan_report_error mm/kasan/report.c:351 [inline]
  kasan_report+0x25b/0x340 mm/kasan/report.c:409
  __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
  rds_tcp_kill_sock net/rds/tcp.c:530 [inline]
  rds_tcp_dev_event+0xc01/0xc90 net/rds/tcp.c:568


The issue here is that we are trying to reference a network namespace (struct 
net *) that is long gone (i.e., L532 below -- c_net is the culprit).


528 spin_lock_irq(_tcp_conn_lock);
529 list_for_each_entry_safe(tc, _tc, _tcp_conn_list,
 t_tcp_node) {
530 struct net *c_net = tc->t_cpath->cp_conn->c_net;
531
532 if (net != c_net || !tc->t_sock)
533 continue;
534 if (!list_has_conn(_list, tc->t_cpath->cp_conn))
535 list_move_tail(>t_tcp_node, _list);
536 }
537 spin_unlock_irq(_tcp_conn_lock);
538 list_for_each_entry_safe(tc, _tc, _list, t_tcp_node) {
539 rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn);
540 rds_conn_destroy(tc->t_cpath->cp_conn);
541 }

When a network namespace is deleted, devices within that namespace are 
unregistered and removed one by one. RDS is notified about this event through 
rds_tcp_dev_event() callback. When the loopback device is removed from the 
namespace, the above RDS callback function destroys all the RDS connections in 
that namespace.


The loop@L529 above walks through each of the rds_tcp connection in the global 
list (rds_tcp_conn_list) to see if that connection belongs to the namespace in 
question. It collects all such connections and destroys them (L538-540). 
However, it leaves behind some of the rds_tcp connections that shared the same 
underlying RDS connection (L534 and 535). These connections with pointer to 
stale network namespace are left behind in the global list. When the 2nd network 
namespace is deleted, we will hit the above stale pointer and hit UAF panic.


I think we should move away from global list to a per-namespace list. The global 
list are used only in two places (both of which are per-namespace operations):


 - to destroy all the RDS connections belonging to a namespace when the
   network namespace is being deleted.
 - to reset all the RDS connections  when socket parameters for a namespace are
   modified using sysctl

Thanks,
~Girish


Re: [Patch net] vlan: fix a use-after-free in vlan_device_event()

2017-11-09 Thread Girish Moodalbail

On 11/9/17 4:43 PM, Cong Wang wrote:

After refcnt reaches zero, vlan_vid_del() could free
dev->vlan_info via RCU:

RCU_INIT_POINTER(dev->vlan_info, NULL);
call_rcu(_info->rcu, vlan_info_rcu_free);

However, the pointer 'grp' still points to that memory
since it is set before vlan_vid_del():

 vlan_info = rtnl_dereference(dev->vlan_info);
 if (!vlan_info)
 goto out;
 grp = _info->grp;

Depends on when that RCU callback is scheduled, we could
trigger a use-after-free in vlan_group_for_each_dev()
right following this vlan_vid_del().

Fix it by moving vlan_vid_del() before setting grp. This
is also symmetric to the vlan_vid_add() we call in
vlan_device_event().

Reported-by: Fengguang Wu <fengguang...@intel.com>
Fixes: efc73f4bbc23 ("net: Fix memory leak - vlan_info struct")
Cc: Alexander Duyck <alexander.du...@gmail.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Girish Moodalbail <girish.moodalb...@oracle.com>
Signed-off-by: Cong Wang <xiyou.wangc...@gmail.com>


LGTM.

Reviewed-by: Girish Moodalbail <girish.moodalb...@oracle.com>

Thanks,
~Girish



---
  net/8021q/vlan.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 9649579b5b9f..4a72ee4e2ae9 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -376,6 +376,9 @@ static int vlan_device_event(struct notifier_block *unused, 
unsigned long event,
dev->name);
vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
}
+   if (event == NETDEV_DOWN &&
+   (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+   vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
  
  	vlan_info = rtnl_dereference(dev->vlan_info);

if (!vlan_info)
@@ -423,9 +426,6 @@ static int vlan_device_event(struct notifier_block *unused, 
unsigned long event,
struct net_device *tmp;
LIST_HEAD(close_list);
  
-		if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)

-   vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
-
/* Put all VLANs for this dev in the down state too.  */
vlan_group_for_each_dev(grp, i, vlandev) {
flgs = vlandev->flags;





Re: [vlan_device_event] BUG: unable to handle kernel paging request at 6b6b6ccf

2017-11-09 Thread Girish Moodalbail

On 11/8/17 10:34 PM, Cong Wang wrote:

On Wed, Nov 8, 2017 at 7:12 PM, Fengguang Wu  wrote:

Hi Alex,


So looking over the trace the panic seems to be happening after a
decnet interface is getting deleted. Is there any chance we could try
compiling the kernel without decnet support to see if that is the
source of these issues? I don't know if anyone on the Intel Wired Lan
team is testing with that enabled so if we can eliminate that as a
possible cause that would be useful.



Sure and thank you for the suggestion!

It looks disabling DECNET still triggers the vlan_device_event BUG.
However when looking at the dmesgs, I find another warning just before
the vlan_device_event BUG. Not sure if it's related one or independent
now-fixed issue.


Those decnet symbols are probably noises.


Right. This is a 32-bit Kernel compiled with CONFIG_PREEMPT=y (I am guessing 
that this has exposed some lock bug). Also, VLAN (8021q) is compiled into the 
kernel, so it registers a vlan_device_event() callback on boot. There may not be 
a VLAN device per-se.


Upon receiving NETDEV_DOWN event, we are calling

vlan_vid_del(dev, htons(ETH_P_8021Q), 0);

which in turn calls call_rcu() to queue vlan_info_free_rcu() to be called at 
some point. This free function frees the array[] 
(vlan_info.vlan_grp.vn_devices_array).  My guess is that vlan_info_free_rcu() is 
being called first and then the array[] is being accessed in vlan_device_event().


The netifd daemon in OpenWRT is marking the interface down and that is why it is 
generating NETDEV_DOWN event. And it uses ioctl(SIOCSIFFLAGS, ~IFF_UP) on a 
AF_UNIX socket. This results in a call to dev_ifsioc() in the kernel with only 
rtnl_lock() held and it is not in RCU read critical section.


~Girish




How do you reproduce it? And what is your setup? Vlan device on
top of your eth0 (e1000)?





[PATCH net] net: fix incorrect comment with regard to VLAN packet handling

2017-11-07 Thread Girish Moodalbail
The commit bcc6d4790361 ("net: vlan: make non-hw-accel rx path similar
to hw-accel") unified accel and non-accel path for VLAN RX. With that
fix we do not register any packet_type handler for VLANs anymore, so fix
the incorrect comment.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 include/linux/netdevice.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2eaac7d..7098978 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4411,15 +4411,7 @@ void netdev_printk(const char *level, const struct 
net_device *dev,
  * Why 16. Because with 16 the only overlap we get on a hash of the
  * low nibble of the protocol value is RARP/SNAP/X.25.
  *
- *  NOTE:  That is no longer true with the addition of VLAN tags.  Not
- * sure which should go first, but I bet it won't make much
- * difference if we are running VLANs.  The good news is that
- * this protocol won't be in the list unless compiled in, so
- * the average user (w/out VLANs) will not be adversely affected.
- * --BLG
- *
  * 0800IP
- * 8100802.1Q VLAN
  * 0001802.3
  * 0002AX.25
  * 0004802.2
-- 
1.8.3.1



Re: [PATCH net 0/2] NULL pointer dereference in {ipvlan|macvlan}_port_destroy

2017-11-03 Thread Girish Moodalbail

On 11/2/17 10:05 PM, David Miller wrote:

From: Girish Moodalbail <girish.moodalb...@oracle.com>
Date: Tue, 31 Oct 2017 09:39:45 -0700


When call to register_netdevice() (called from ipvlan_link_new())
fails, inside that function we call ipvlan_uninit() (through
ndo_uninit()) to destroy the ipvlan port. Upon returning
unsuccessfully from register_netdevice() we go ahead and call
ipvlan_port_destroy() again which causes NULL pointer dereference
panic.


The problem is that ipvlan doesn't follow the proper convention that
->ndo_uninit() must only release resources allocated by ->ndo_init().

What needs to happen is that the port allocation occur in
->ndo_init().


I agree, will send out V2. I initially started off making them (ndo_init and 
ndo_uninit) symmetric by moving the port destruction out of ndo_uninit(), but I 
hit some WARN() errors. Will figure it out.


thanks,
~Girish



Your fix, while solving some cases, does not fully cover all of the
posibiities due to this bug.

Please fix this correctly by moving the port allocation and related
setup from link creation to ->ndo_init().

Thank you.





[PATCH net 1/2] ipvlan: NULL pointer dereference panic in ipvlan_port_destroy

2017-10-31 Thread Girish Moodalbail
When call to register_netdevice() (called from ipvlan_link_new()) fails,
we call ipvlan_uninit() (through ndo_uninit()) to destroy the ipvlan
port. Upon returning unsuccessfully from register_netdevice() we go
ahead and call ipvlan_port_destroy() again which causes NULL pointer
dereference panic. Fix it.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index c74893c..00a62a1 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -602,6 +602,12 @@ int ipvlan_link_new(struct net *src_net, struct net_device 
*dev,
 unregister_netdev:
unregister_netdevice(dev);
 remove_ida:
+   /* Through the call to ipvlan_uninit (ndo_uninit callback) IPvlan port
+* might be already destroyed in failure path in register_netdevice()
+* or the above call in unregister_netdevice().
+*/
+   if (!ipvlan_port_get_rtnl(phy_dev))
+   return err;
ida_simple_remove(>ida, dev->dev_id);
 destroy_ipvlan_port:
if (create)
-- 
1.8.3.1



[PATCH net 0/2] NULL pointer dereference in {ipvlan|macvlan}_port_destroy

2017-10-31 Thread Girish Moodalbail
>From code inspection it appeared that there is a possibility where in
ipvlan_port_destroy() might be dealing with a port (struct ipvl_port)
that has already been destroyed and is therefore already NULL. However,
we don't check for NULL and continue to access the fields which results
in a kernel panic.

When call to register_netdevice() (called from ipvlan_link_new()) fails,
inside that function we call ipvlan_uninit() (through ndo_uninit()) to
destroy the ipvlan port. Upon returning unsuccessfully from
register_netdevice() we go ahead and call ipvlan_port_destroy() again
which causes NULL pointer dereference panic.

To test this theory, I loaded up netdev-notifier-error-inject.ko and did 

# echo -22 > /sys/kernel/debug/notifier-error-inject/\
  netdev/actions/NETDEV_POST_INIT/error

# ip li add ipvl0 link enp7s0 type ipvlan

BUG: unable to handle kernel NULL pointer dereference at 0820
IP: ipvlan_port_destroy+0x2a/0xf0 [ipvlan]

Similar issue exists in macvlan_port_destroy(). The following two
patches fixes ipvlan and macvlan module.

-
Girish Moodalbail (2):
  ipvlan: NULL pointer dereference panic in ipvlan_port_destroy
  macvlan: NULL pointer dereference panic in macvlan_port_destroy

 drivers/net/ipvlan/ipvlan_main.c | 6 ++
 drivers/net/macvlan.c| 5 -
 2 files changed, 10 insertions(+), 1 deletion(-)

-- 
1.8.3.1



[PATCH net 2/2] macvlan: NULL pointer dereference panic in macvlan_port_destroy

2017-10-31 Thread Girish Moodalbail
When call to register_netdevice() (called from macvlan_common_newlink())
fails, we call macvlan_uninit() (through ndo_uninit()) to destroy the
macvlan port. Upon returning unsuccessfully from register_netdevice() we
go ahead and call macvlan_port_destroy() again which causes NULL pointer
dereference panic. Fix it.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/macvlan.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d2aea96..2520de8 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1189,6 +1189,9 @@ static void macvlan_port_destroy(struct net_device *dev)
struct macvlan_port *port = macvlan_port_get_rtnl(dev);
struct sk_buff *skb;
 
+   if (!port)
+   return;
+
dev->priv_flags &= ~IFF_MACVLAN_PORT;
netdev_rx_handler_unregister(dev);
 
@@ -1444,7 +1447,7 @@ int macvlan_common_newlink(struct net *src_net, struct 
net_device *dev,
unregister_netdevice(dev);
 destroy_macvlan_port:
if (create)
-   macvlan_port_destroy(port->dev);
+   macvlan_port_destroy(lowerdev);
return err;
 }
 EXPORT_SYMBOL_GPL(macvlan_common_newlink);
-- 
1.8.3.1



[PATCH] tap: reference to KVA of an unloaded module causes kernel panic

2017-10-27 Thread Girish Moodalbail
The commit 9a393b5d5988 ("tap: tap as an independent module") created a
separate tap module that implements tap functionality and exports
interfaces that will be used by macvtap and ipvtap modules to create
create respective tap devices.

However, that patch introduced a regression wherein the modules macvtap
and ipvtap can be removed (through modprobe -r) while there are
applications using the respective /dev/tapX devices. These applications
cause kernel to hold reference to /dev/tapX through 'struct cdev
macvtap_cdev' and 'struct cdev ipvtap_dev' defined in macvtap and ipvtap
modules respectively. So,  when the application is later closed the
kernel panics because we are referencing KVA that is present in the
unloaded modules.

--8<--- Example --8<--
$ sudo ip li add name mv0 link enp7s0 type macvtap
$ sudo ip li show mv0 |grep mv0| awk -e '{print $1 $2}'
  14:mv0@enp7s0:
$ cat /dev/tap14 &
$ lsmod |egrep -i 'tap|vlan'
macvtap16384  0
macvlan24576  1 macvtap
tap24576  3 macvtap
$ sudo modprobe -r macvtap
$ fg
cat /dev/tap14
^C

<...system panics...>
BUG: unable to handle kernel paging request at a038c500
IP: cdev_put+0xf/0x30
--8<-8<--

The fix is to set cdev.owner to the module that creates the tap device
(either macvtap or ipvtap). With this set, the operations (in
fs/char_dev.c) on char device holds and releases the module through
cdev_get() and cdev_put() and will not allow the module to unload
prematurely.

Fixes: 9a393b5d5988ea4e (tap: tap as an independent module)
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/ipvlan/ipvtap.c | 4 ++--
 drivers/net/macvtap.c   | 4 ++--
 drivers/net/tap.c   | 5 +++--
 include/linux/if_tap.h  | 4 ++--
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ipvlan/ipvtap.c b/drivers/net/ipvlan/ipvtap.c
index 5dea206..0bcc07f 100644
--- a/drivers/net/ipvlan/ipvtap.c
+++ b/drivers/net/ipvlan/ipvtap.c
@@ -197,8 +197,8 @@ static int ipvtap_init(void)
 {
int err;
 
-   err = tap_create_cdev(_cdev, _major, "ipvtap");
-
+   err = tap_create_cdev(_cdev, _major, "ipvtap",
+ THIS_MODULE);
if (err)
goto out1;
 
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index c2d0ea2..cba5cb3 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -204,8 +204,8 @@ static int macvtap_init(void)
 {
int err;
 
-   err = tap_create_cdev(_cdev, _major, "macvtap");
-
+   err = tap_create_cdev(_cdev, _major, "macvtap",
+ THIS_MODULE);
if (err)
goto out1;
 
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 98ee6cc..1b10fcc 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1249,8 +1249,8 @@ static int tap_list_add(dev_t major, const char 
*device_name)
return 0;
 }
 
-int tap_create_cdev(struct cdev *tap_cdev,
-   dev_t *tap_major, const char *device_name)
+int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major,
+   const char *device_name, struct module *module)
 {
int err;
 
@@ -1259,6 +1259,7 @@ int tap_create_cdev(struct cdev *tap_cdev,
goto out1;
 
cdev_init(tap_cdev, _fops);
+   tap_cdev->owner = module;
err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS);
if (err)
goto out2;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 4837157..9ae41cd 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -73,8 +73,8 @@ struct tap_queue {
 int tap_get_minor(dev_t major, struct tap_dev *tap);
 void tap_free_minor(dev_t major, struct tap_dev *tap);
 int tap_queue_resize(struct tap_dev *tap);
-int tap_create_cdev(struct cdev *tap_cdev,
-   dev_t *tap_major, const char *device_name);
+int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major,
+   const char *device_name, struct module *module);
 void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev);
 
 #endif /*_LINUX_IF_TAP_H_*/
-- 
1.8.3.1



[PATCH net-next] macvlan: remove unused fields in struct macvlan_dev

2017-10-25 Thread Girish Moodalbail
commit 635b8c8ecdd2 ("tap: Renaming tap related APIs, data structures,
macros") captured all the tap related fields into a new struct tap_dev.
However, it failed to remove those fields from struct macvlan_dev.
Those fields are currently unused and must be removed. While there
I moved the comment for MAX_TAP_QUEUES to the right place.

Fixes: 635b8c8ecdd27142 (tap: Renaming tap related APIs, data structures, 
macros)
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 include/linux/if_macvlan.h | 15 ---
 include/linux/if_tap.h |  4 
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 10e319f..e13b369 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -10,13 +10,6 @@
 #include 
 
 struct macvlan_port;
-struct macvtap_queue;
-
-/*
- * Maximum times a macvtap device can be opened. This can be used to
- * configure the number of receive queue, e.g. for multiqueue virtio.
- */
-#define MAX_TAP_QUEUES 256
 
 #define MACVLAN_MC_FILTER_BITS 8
 #define MACVLAN_MC_FILTER_SZ   (1 << MACVLAN_MC_FILTER_BITS)
@@ -35,14 +28,6 @@ struct macvlan_dev {
netdev_features_t   set_features;
enum macvlan_mode   mode;
u16 flags;
-   /* This array tracks active taps. */
-   struct tap_queue__rcu *taps[MAX_TAP_QUEUES];
-   /* This list tracks all taps (both enabled and disabled) */
-   struct list_headqueue_list;
-   int numvtaps;
-   int numqueues;
-   netdev_features_t   tap_features;
-   int minor;
int nest_level;
 #ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll  *netpoll;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 4837157..d1b5173 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -22,6 +22,10 @@ static inline struct skb_array *tap_get_skb_array(struct 
file *f)
 #include 
 #include 
 
+/*
+ * Maximum times a tap device can be opened. This can be used to
+ * configure the number of receive queue, e.g. for multiqueue virtio.
+ */
 #define MAX_TAP_QUEUES 256
 
 struct tap_queue;
-- 
1.8.3.1



[PATCH v2] tap: double-free in error path in tap_open()

2017-10-25 Thread Girish Moodalbail
Double free of skb_array in tap module is causing kernel panic. When
tap_set_queue() fails we free skb_array right away by calling
skb_array_cleanup(). However, later on skb_array_cleanup() is called
again by tap_sock_destruct through sock_put(). This patch fixes that
issue.

Fixes: 362899b8725b35e3 (macvtap: switch to use skb array)
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
v1 -> v2:
  - took care of an another issue in failure path of skb_array_init
---
 drivers/net/tap.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 21b71ae..98ee6cc 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -517,6 +517,10 @@ static int tap_open(struct inode *inode, struct file *file)
 _proto, 0);
if (!q)
goto err;
+   if (skb_array_init(>skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) {
+   sk_free(>sk);
+   goto err;
+   }
 
RCU_INIT_POINTER(q->sock.wq, >wq);
init_waitqueue_head(>wq.wait);
@@ -540,22 +544,18 @@ static int tap_open(struct inode *inode, struct file 
*file)
if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & 
NETIF_F_SG))
sock_set_flag(>sk, SOCK_ZEROCOPY);
 
-   err = -ENOMEM;
-   if (skb_array_init(>skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
-   goto err_array;
-
err = tap_set_queue(tap, file, q);
-   if (err)
-   goto err_queue;
+   if (err) {
+   /* tap_sock_destruct() will take care of freeing skb_array */
+   goto err_put;
+   }
 
dev_put(tap->dev);
 
rtnl_unlock();
return err;
 
-err_queue:
-   skb_array_cleanup(>skb_array);
-err_array:
+err_put:
sock_put(>sk);
 err:
if (tap)
-- 
1.8.3.1



Re: [PATCH] tap: double-free in error path in tap_open()

2017-10-24 Thread Girish Moodalbail

On 10/24/17 6:55 PM, Jason Wang wrote:



On 2017年10月25日 05:41, Girish Moodalbail wrote:

Double free of skb_array in tap module is causing kernel panic. When
tap_set_queue() fails we free skb_array right away by calling
skb_array_cleanup(). However, later on skb_array_cleanup() is called
again by tap_sock_destruct through sock_put(). This patch fixes that
issue.

Fixes: 362899b8725b35e3 (macvtap: switch to use skb array)
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
  drivers/net/tap.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 21b71ae..878520b 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -542,20 +542,20 @@ static int tap_open(struct inode *inode, struct file 
*file)
  err = -ENOMEM;
  if (skb_array_init(>skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
-    goto err_array;
+    goto err_put;


Looks like this will cause skb_array_cleanup() to be called when skb array was 
uninitialized?


Thanks Jason.

This is an existing issue outside of my code. I will send a v2 of the patch that 
incorporates the fix for this issue as well. I am considering moving 
skb_array_init() to the beginning of tap_open() (near to where we allocate 
memory for the tap_queue itself).


Thanks again.

regards,
~Girish




Thanks


  err = tap_set_queue(tap, file, q);
-    if (err)
-    goto err_queue;
+    if (err) {
+    /* tap_sock_destruct() will take care of freeing skb_array */
+    goto err_put;
+    }
  dev_put(tap->dev);
  rtnl_unlock();
  return err;
-err_queue:
-    skb_array_cleanup(>skb_array);
-err_array:
+err_put:
  sock_put(>sk);
  err:
  if (tap)






[PATCH] tap: double-free in error path in tap_open()

2017-10-24 Thread Girish Moodalbail
Double free of skb_array in tap module is causing kernel panic. When
tap_set_queue() fails we free skb_array right away by calling
skb_array_cleanup(). However, later on skb_array_cleanup() is called
again by tap_sock_destruct through sock_put(). This patch fixes that
issue.

Fixes: 362899b8725b35e3 (macvtap: switch to use skb array)
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/tap.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 21b71ae..878520b 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -542,20 +542,20 @@ static int tap_open(struct inode *inode, struct file 
*file)
 
err = -ENOMEM;
if (skb_array_init(>skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
-   goto err_array;
+   goto err_put;
 
err = tap_set_queue(tap, file, q);
-   if (err)
-   goto err_queue;
+   if (err) {
+   /* tap_sock_destruct() will take care of freeing skb_array */
+   goto err_put;
+   }
 
dev_put(tap->dev);
 
rtnl_unlock();
return err;
 
-err_queue:
-   skb_array_cleanup(>skb_array);
-err_array:
+err_put:
sock_put(>sk);
 err:
if (tap)
-- 
1.8.3.1



[RFC] ip: introduce IFA_F_DHCP flag

2017-10-18 Thread Girish Moodalbail
This flag identifies that the address was obtained through DHCP.

Today there is no easy way to find out whether an address on an
interface is DHCP controlled or is static. Either you will need to
grep for 'dhclient' process (or something else in case one is using a
different DHCP client) or if you are using NetworkManager (or some
such), then you will need to query through their interface to find out
if an address is DHCP or not.

This flag will be set by the DHCP clients in the userspace when it
brings up the DHCP address on an interface. For example: ISC DHCP
client (aka dhclient) today brings up the address on an interface by
running ip-address(8) command (in dhclient-script). This command can
be extended to include 'dhcp' keyword in its 'add' or 'replace'
subcommand. Once this flag is set, the show subcommand can display the
keyword 'dhcp' against the address to indicate that the address was
obtained through DHCP.

This flag can also be set and obtained programmatically using
AF_NETLINK. Besides providing observability, this flag will be useful
for applications that need to prevent/allow certain settings on
addresses based on whether they are DHCP or not.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 include/uapi/linux/if_addr.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index 4318ab1..61aa8f8 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -52,6 +52,7 @@ enum {
 #define IFA_F_NOPREFIXROUTE0x200
 #define IFA_F_MCAUTOJOIN   0x400
 #define IFA_F_STABLE_PRIVACY   0x800
+#define IFA_F_DHCP 0x1000
 
 struct ifa_cacheinfo {
__u32   ifa_prefered;
-- 
1.8.3.1



Re: [RFC] Support for UNARP (RFC 1868)

2017-10-13 Thread Girish Moodalbail

On 10/12/17 5:53 PM, Mahesh Bandewar (महेश बंडेवार) wrote:

On Thu, Oct 12, 2017 at 4:06 PM, Girish Moodalbail
<girish.moodalb...@oracle.com> wrote:

Hello Eric,

The basic idea is to mark the ARP entry either FAILED or STALE as soon as we
can so that the subsequent packets that depend on that ARP entry will take
the slow path (neigh_resolve_output()).

Say, if base_reachable_time is 30 seconds, then an ARP entry will be in
reachable state somewhere between 15 to 45 seconds. Assuming the worst case,
the ARP entry will be in REACHABLE state for 45 seconds and the packets
continue to traverse the network towards the source machine and gets dropped
their since the VM has moved to destination machine.

Instead, based on the received UNARP packet if we mark the ARP entry

(a) FAILED
- we move to INCOMPLETE state and start the address resolution by sending
  out ARP packets (up to allowed maximum number) until we get ARP
response
  back at which point we move the ARP entry state to reachable.

(b) STALE
- we move to DELAY state and set the next timer to DELAY_PROBE_TIME
  (1 second) and continue to send queued packets in arp_queue.
- After 1 sec we move to PROBE state and start the address resolution
like
  in the case(a) above.

I was leaning towards (a).

One could arbitrarily increase the stale timeout (by changing no of
probes). So sender
will continue sending traffic to something that has already gone away.
STALE doesn't
mean bad but here the sender is clearly indicating it's going away so
FAILED seems to
be the only logical option.


I agree.




Will TCP flows be terminated, instead
of being smoothly migrated (TCP_REPAIR)



The TCP flows will not be terminated. Upon receiving UNARP packet, the ARP
entry will be marked FAILED. The subsequent TCP packets from the client
(towards that IP) will be queued (the first 3 packets in arp_queue and then
other packets get dropped) until the IP address is resolved again through
the slow path neigh_resolve_output().

The slow path marks the entry as INCOMPLETE and will start sending several
ARP requests (ucast_solicit + app_solicit + mcast_solicit) to resolve the
IP. If the resolution is successful, then the TCP packets will be sent out.
If not, we will invalidate the ARP entry and call arp_error_report() on the
queued packets (which will end up sending ICMP_HOST_UNREACH error). This
behavior is same as what will occur if TCP server disappears in the middle
of a connection.



What about IPv6 ? Or maybe more abruptly, do we still need to add
features to IPv4 in 2017,  22 years after this RFC came ? ;)



Legit question :). Well one thing I have seen in Networking is that an old
idea circles back around later and turns out to be useful in new contexts
and use cases. Like I enumerated in my initial email there are certain use
cases in Cloud that might benefit from UNARP.


It doesn't make sense to have this implemented only for IPv4. At this time if
equivalent IPv6 feature is missing, I don't see it being useful / acceptable.


Obviously UNARP is IPv4 only. I am not aware of any standard way of doing the 
same for IPv6. If such a standard doesn't exist, then we will have to go through 
IETF to get one done? If that is the case, can we please do this in a phased 
manner? This will atleast help the Cloud that are IPv4 only.


thanks,
~Girish


Re: [RFC] Support for UNARP (RFC 1868)

2017-10-12 Thread Girish Moodalbail

Hello Eric,

The basic idea is to mark the ARP entry either FAILED or STALE as soon as we can 
so that the subsequent packets that depend on that ARP entry will take the slow 
path (neigh_resolve_output()).


Say, if base_reachable_time is 30 seconds, then an ARP entry will be in 
reachable state somewhere between 15 to 45 seconds. Assuming the worst case, the 
ARP entry will be in REACHABLE state for 45 seconds and the packets continue to 
traverse the network towards the source machine and gets dropped their since the 
VM has moved to destination machine.


Instead, based on the received UNARP packet if we mark the ARP entry

(a) FAILED
   - we move to INCOMPLETE state and start the address resolution by sending
 out ARP packets (up to allowed maximum number) until we get ARP response
 back at which point we move the ARP entry state to reachable.

(b) STALE
   - we move to DELAY state and set the next timer to DELAY_PROBE_TIME
 (1 second) and continue to send queued packets in arp_queue.
   - After 1 sec we move to PROBE state and start the address resolution like
 in the case(a) above.

I was leaning towards (a). Please see in-line..





Hi Girish

Your description (or patch title) is misleading. You apparently
implement the receive side of the RFC.


You are right, it implements only the receive side of the RFC. If this RFC is 
accepted, then we can change arping(8) to generate UNARP requests. We could also 
add an option to ip-address(8) delete subcommand to generate UNARP whenever an 
address is deleted from the interface.



And the RFC had Proxy ARP in mind.

What about security implications ? 


Yes, this feature will extend the attack surface for L2 networks. However, the 
attack vectors for this feature should be same as that of the gratuitous ARP, 
right? The same attack mitigation techniques for gratuitous ARPs is equally 
applicable here.



Will TCP flows be terminated, instead
of being smoothly migrated (TCP_REPAIR)


The TCP flows will not be terminated. Upon receiving UNARP packet, the ARP entry 
will be marked FAILED. The subsequent TCP packets from the client (towards that 
IP) will be queued (the first 3 packets in arp_queue and then other packets get 
dropped) until the IP address is resolved again through the slow path 
neigh_resolve_output().


The slow path marks the entry as INCOMPLETE and will start sending several ARP 
requests (ucast_solicit + app_solicit + mcast_solicit) to resolve the IP. If the 
resolution is successful, then the TCP packets will be sent out. If not, we will 
invalidate the ARP entry and call arp_error_report() on the queued packets 
(which will end up sending ICMP_HOST_UNREACH error). This behavior is same as 
what will occur if TCP server disappears in the middle of a connection.




What about IPv6 ? Or maybe more abruptly, do we still need to add
features to IPv4 in 2017,  22 years after this RFC came ? ;)


Legit question :). Well one thing I have seen in Networking is that an old idea 
circles back around later and turns out to be useful in new contexts and use 
cases. Like I enumerated in my initial email there are certain use cases in 
Cloud that might benefit from UNARP.


regards,
~Girish



Thanks.






[RFC] Support for UNARP (RFC 1868)

2017-10-11 Thread Girish Moodalbail
Add support for UNARP, as detailed in the IETF RFC 1868 (ARP Extension -
UNARP). The central idea here is for a node to announce that it is
leaving the network and that all the nodes on the L2 broadcast domain to
update their ARP tables accordingly (i.e., mark the neighbor entry state
to FAILED). Even though the ARP timers on nodes would eventually  mark
such entries as FAILED it will be more robust if those entries gets
marked FAILED sooner with the help from the host that is going away.

Besides providing a solution for an usecase, as captured in RFC, of an
IP address moving across a proxy server, this feature is even more
important for certain use cases in the Cloud. Imagine a tenant who is
bringing up and down VM instances for some workload of theirs. If these
instances are part of a small subnet, then the new VM instances may be
assigned the same IP address (since the subnet pool is small) but with a
different MAC address. So, if there is a client which has a stale
mapping of the IP address to the old MAC address, then that client will
fail to communicate with the new VM instance for some time.

Another usecase that comes to mind is that of the Live VM
Migration. Imagine a client that is communicating with a VM. Now, let us
migrate this VM to a destination machine. The IP address to MAC address
mapping for a VM doesn't change after the Live Migration. However, there
will be a small amount of time (till the VM sends gratuitous ARP from
the destination machine) during which packets from a client will be
forwarded to the source machine. This occurs because:

 - the ARP entry in the client is not invalidated yet and it continues
   to use the same MAC address and

 - the MAC address table of all of the intermediate switches between the
   client and the source machine are not updated yet for the MAC address
   move.

This issue of forwarding the packets to wrong target could be avoided by
sending UNARP packets from the source machine. This would invalidate the
ARP entry on the client and forces it to resolve the IP address again by
broadcasting an ARP request to the network. The VM on the destination
machine would then respond back with an ARP response. The ARP response
back from the VM should also clean up the MAC address table of the
intermediate switches.

The following changes implements the UNARP receive processing in the
kernel. Once the changes are in the kernel, arping(8) program can be
updated to send UNARP packets.

Any Thoughts/Comments?

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---

Compile-tested only.

 net/ipv4/arp.c | 46 +++---
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7c45b88..8cb9aa1 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -686,6 +686,7 @@ static int arp_process(struct net *net, struct sock *sk, 
struct sk_buff *skb)
struct neighbour *n;
struct dst_entry *reply_dst = NULL;
bool is_garp = false;
+   bool is_unarp;
 
/* arp_rcv below verifies the ARP header and verifies the device
 * is ARP'able.
@@ -695,6 +696,8 @@ static int arp_process(struct net *net, struct sock *sk, 
struct sk_buff *skb)
goto out_free_skb;
 
arp = arp_hdr(skb);
+   /* arp_rcv has already verified the header for the UNARP case */
+   is_unarp = arp->ar_hln == 0;
 
switch (dev_type) {
default:
@@ -741,8 +744,8 @@ static int arp_process(struct net *net, struct sock *sk, 
struct sk_buff *skb)
  * Extract fields
  */
arp_ptr = (unsigned char *)(arp + 1);
-   sha = arp_ptr;
-   arp_ptr += dev->addr_len;
+   sha = is_unarp ? NULL : arp_ptr;
+   arp_ptr += arp->ar_hln;
memcpy(, arp_ptr, 4);
arp_ptr += 4;
switch (dev_type) {
@@ -751,8 +754,8 @@ static int arp_process(struct net *net, struct sock *sk, 
struct sk_buff *skb)
break;
 #endif
default:
-   tha = arp_ptr;
-   arp_ptr += dev->addr_len;
+   tha = is_unarp ? NULL : arp_ptr;
+   arp_ptr += arp->ar_hln;
}
memcpy(, arp_ptr, 4);
 /*
@@ -874,7 +877,10 @@ static int arp_process(struct net *net, struct sock *sk, 
struct sk_buff *skb)
   It is possible, that this option should be enabled for some
   devices (strip is candidate)
 */
-   if (!n &&
+   /* If the packet is UNARP and we don't have the corresponding
+* neighbour entry, then there is nothing to do.
+*/
+   if (!n && !is_unarp &&
(is_garp ||
 (arp->ar_op == htons(ARPOP_REPLY) &&
  (addr_type == RTN_UNICAST ||
@@ -899,12 +905,15 @@ static int arp_process(struct net *net, struct sock *sk, 
struct sk_buff *skb)
  

Re: [PATCH] ipv6: gso: fix payload length when gso_size is zero

2017-10-05 Thread Girish Moodalbail

On 10/5/17 10:06 AM, Alexey Kodanev wrote:

When gso_size reset to zero for the tail segment in skb_segment(), later
in ipv6_gso_segment(), we will get incorrect payload_len for that segment.
inet_gso_segment() already has a check for gso_size before calculating
payload so fixing only IPv6 part.

The issue was found with LTP vxlan & gre tests over ixgbe NIC.

Fixes: 07b26c9454a2 ("gso: Support partial splitting at the frag_list pointer")
Signed-off-by: Alexey Kodanev <alexey.koda...@oracle.com>
---
  net/ipv6/ip6_offload.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index cdb3728..4a87f94 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
  
  	for (skb = segs; skb; skb = skb->next) {

ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
-   if (gso_partial)
+   if (gso_partial && skb_is_gso(skb))
payload_len = skb_shinfo(skb)->gso_size +
  SKB_GSO_CB(skb)->data_offset +
  skb->head - (unsigned char *)(ipv6h + 1);



Reviewed-by: Girish Moodalbail <girish.moodalb...@oracle.com>


[PATCH net-next v3] vxlan: change vxlan_[config_]validate() to use netlink_ext_ack for error reporting

2017-08-11 Thread Girish Moodalbail
The kernel log is not where users expect error messages for netlink
requests; as we have extended acks now, we can replace pr_debug() with
NL_SET_ERR_MSG_ATTR().

Signed-off-by: Matthias Schiffer <mschif...@universe-factory.net>
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>

---
v2 -> v1:
   - improved messages based on the comments from Jiri Benc,
 Roopa Prabhu, and David Ahern

v1 -> v2:
   - addressed, error messages rewording, comments from Jiri Benc
   - started off with what Matthias had, and I covered error reporting
 for all of the unsuccessful returns in vxlan_validate() and
 vxlan_config_validate()
---
 drivers/net/vxlan.c | 99 +++--
 1 file changed, 73 insertions(+), 26 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 35e84a9e..ae3a1da 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2729,12 +2729,14 @@ static int vxlan_validate(struct nlattr *tb[], struct 
nlattr *data[],
 {
if (tb[IFLA_ADDRESS]) {
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
-   pr_debug("invalid link address (not ethernet)\n");
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
+   "Provided link layer address is not 
Ethernet");
return -EINVAL;
}
 
if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
-   pr_debug("invalid all zero ethernet address\n");
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
+   "Provided Ethernet address is not 
unicast");
return -EADDRNOTAVAIL;
}
}
@@ -2742,18 +2744,27 @@ static int vxlan_validate(struct nlattr *tb[], struct 
nlattr *data[],
if (tb[IFLA_MTU]) {
u32 mtu = nla_get_u32(tb[IFLA_MTU]);
 
-   if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)
+   if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
+   "MTU must be between 68 and 65535");
return -EINVAL;
+   }
}
 
-   if (!data)
+   if (!data) {
+   NL_SET_ERR_MSG(extack,
+  "Required attributes not provided to perform the 
operation");
return -EINVAL;
+   }
 
if (data[IFLA_VXLAN_ID]) {
u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
 
-   if (id >= VXLAN_N_VID)
+   if (id >= VXLAN_N_VID) {
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID],
+   "VXLAN ID must be lower than 
16777216");
return -ERANGE;
+   }
}
 
if (data[IFLA_VXLAN_PORT_RANGE]) {
@@ -2761,8 +2772,8 @@ static int vxlan_validate(struct nlattr *tb[], struct 
nlattr *data[],
= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
 
if (ntohs(p->high) < ntohs(p->low)) {
-   pr_debug("port range %u .. %u not valid\n",
-ntohs(p->low), ntohs(p->high));
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
+   "Invalid source port range");
return -EINVAL;
}
}
@@ -2919,7 +2930,8 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan)
 
 static int vxlan_config_validate(struct net *src_net, struct vxlan_config 
*conf,
 struct net_device **lower,
-struct vxlan_dev *old)
+struct vxlan_dev *old,
+struct netlink_ext_ack *extack)
 {
struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
struct vxlan_dev *tmp;
@@ -2933,6 +2945,8 @@ static int vxlan_config_validate(struct net *src_net, 
struct vxlan_config *conf,
 */
if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
+   NL_SET_ERR_MSG(extack,
+  "VXLAN GPE does not support this 
combination of attributes");
return -EINVAL;
}
}
@@ -2947,15 +2961,23 @@ static int vxlan_config_validate(struct net *src_net, 
struct vxlan_config *conf,
conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
}
 
-   if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family)
+   if (conf->saddr.sa.sa_family != conf->re

Re: [PATCH net-next v2] vxlan: change vxlan_[config_]validate() to use netlink_ext_ack for error reporting

2017-08-11 Thread Girish Moodalbail

[snip]


@@ -2933,6 +2945,8 @@ static int vxlan_config_validate(struct net *src_net, 
struct vxlan_config *conf,
  */
 if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
 !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
+   NL_SET_ERR_MSG(extack,
+  "VXLAN GPE does not support this combination 
of attributes");
 return -EINVAL;
 }


"collect metadata not supported with vxlan gpe"


Actually, VXLAN GPE is only supported together with external keyword. From 
ip-link(8)...


  gpe - enables the Generic Protocol extension (VXLAN-GPE). Currently, this
   is only supported together with the external keyword.


That's completely wrong message. Not saying that the capitalization is
wrong, too. Girish's wording precisely explains what went wrong.


Furthermore, the condition not only checks for collect metadata but also it 
checks to see if any attribute specified is outside of what is allowed by 
VXLAN_F_ALLOWED_GPE.





 lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
-   if (!lowerdev)
+   if (!lowerdev) {
+   NL_SET_ERR_MSG(extack,
+  "Specified interface for tunnel endpoint 
communications not found");
 return -ENODEV;


"invalid vxlan remote link interface, device not found"


Finally one that looks better :-) Modulo the missing capitalization,
though.


It is not the remote link interface. It is the local interface itself that needs 
to be specified.





-   if (vxlan_addr_multicast(>remote_ip))
+   if (vxlan_addr_multicast(>remote_ip)) {
+   NL_SET_ERR_MSG(extack,
+  "Interface need to be specified for multicast 
destination");


"vxlan remote link interface required for multicast remote destination"


I like this one better, too.


  #if IS_ENABLED(CONFIG_IPV6)
-   if (conf->flags & VXLAN_F_IPV6_LINKLOCAL)
+   if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
+   NL_SET_ERR_MSG(extack,
+  "Interface need to be specified for 
link-local local/remote addresses");
 return -EINVAL;


"vxlan link interface required for link-local local/remote addresses"


Okay but to be consistent (and more clear), it should be "remote link
interface".


There is no remote link while configuring VXLAN. It is the local interface 
through which VXLAN packets will be send out.


Thanks all for the review,
~Girish


[PATCH net-next v2] vxlan: change vxlan_[config_]validate() to use netlink_ext_ack for error reporting

2017-08-10 Thread Girish Moodalbail
The kernel log is not where users expect error messages for netlink
requests; as we have extended acks now, we can replace pr_debug() with
NL_SET_ERR_MSG_ATTR().

Signed-off-by: Matthias Schiffer <mschif...@universe-factory.net>
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>

---
v1 -> v2:
   - addressed, error messages rewording, comments from Jiri Benc
   - started off with what Matthias had, and I covered error reporting
 for all of the unsuccessful returns
---
 drivers/net/vxlan.c | 98 +++--
 1 file changed, 72 insertions(+), 26 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 35e84a9e..ec302cd7 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2729,12 +2729,14 @@ static int vxlan_validate(struct nlattr *tb[], struct 
nlattr *data[],
 {
if (tb[IFLA_ADDRESS]) {
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
-   pr_debug("invalid link address (not ethernet)\n");
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
+   "Provided link layer address is not 
Ethernet");
return -EINVAL;
}
 
if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
-   pr_debug("invalid all zero ethernet address\n");
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
+   "Provided Ethernet address is not 
unicast");
return -EADDRNOTAVAIL;
}
}
@@ -2742,18 +2744,27 @@ static int vxlan_validate(struct nlattr *tb[], struct 
nlattr *data[],
if (tb[IFLA_MTU]) {
u32 mtu = nla_get_u32(tb[IFLA_MTU]);
 
-   if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)
+   if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
+   "MTU must be between 68 and 65535");
return -EINVAL;
+   }
}
 
-   if (!data)
+   if (!data) {
+   NL_SET_ERR_MSG(extack,
+  "Not enough attributes provided to perform the 
operation");
return -EINVAL;
+   }
 
if (data[IFLA_VXLAN_ID]) {
u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
 
-   if (id >= VXLAN_N_VID)
+   if (id >= VXLAN_N_VID) {
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID],
+   "VXLAN ID must be lower than 
16777216");
return -ERANGE;
+   }
}
 
if (data[IFLA_VXLAN_PORT_RANGE]) {
@@ -2761,8 +2772,8 @@ static int vxlan_validate(struct nlattr *tb[], struct 
nlattr *data[],
= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
 
if (ntohs(p->high) < ntohs(p->low)) {
-   pr_debug("port range %u .. %u not valid\n",
-ntohs(p->low), ntohs(p->high));
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
+   "Provided source port range bounds 
is invalid");
return -EINVAL;
}
}
@@ -2919,7 +2930,8 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan)
 
 static int vxlan_config_validate(struct net *src_net, struct vxlan_config 
*conf,
 struct net_device **lower,
-struct vxlan_dev *old)
+struct vxlan_dev *old,
+struct netlink_ext_ack *extack)
 {
struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
struct vxlan_dev *tmp;
@@ -2933,6 +2945,8 @@ static int vxlan_config_validate(struct net *src_net, 
struct vxlan_config *conf,
 */
if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
+   NL_SET_ERR_MSG(extack,
+  "VXLAN GPE does not support this 
combination of attributes");
return -EINVAL;
}
}
@@ -2947,15 +2961,23 @@ static int vxlan_config_validate(struct net *src_net, 
struct vxlan_config *conf,
conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
}
 
-   if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family)
+   if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
+   NL_SET_ERR_MSG(extack,
+  "Local and remote address must be from the same 

Re: [PATCH net] geneve: maximum value of VNI cannot be used

2017-08-10 Thread Girish Moodalbail

On 8/9/17 10:41 PM, David Miller wrote:

From: Girish Moodalbail <girish.moodalb...@oracle.com>
Date: Tue,  8 Aug 2017 17:26:24 -0700


Geneve's Virtual Network Identifier (VNI) is 24 bit long, so the range
of values for it would be from 0 to 16777215 (2^24 -1).  However, one
cannot create a geneve device with VNI set to 16777215. This patch fixes
this issue.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>


I always worry that someone, somewhere, might be using this in some
way and this will break things.

But I'll apply this for now.

Thanks David. As per the output of 'ip link help geneve', 16777215 is a valid 
value. However, due to incorrect check in the kernel that value was not supported.


$ ip link help geneve |egrep -A1 VNI
Usage: ... geneve id VNI
  remote ADDR
--
Where: VNI   := 0-16777215
   ADDR  := IP_ADDRESS

So, this is an off-by-one bug and no one had tripped over it until now.

regards,
~Girish



[PATCH net-next] geneve: use netlink_ext_ack for error reporting in rtnl operations

2017-08-09 Thread Girish Moodalbail
Add extack error messages for failure paths while creating/modifying
geneve devices. Once extack support is added to iproute2, more
meaningful and helpful error messages will be displayed making it easy
for users to discern what went wrong.

Before:
===
$ ip link add gen1 address 0:1:2:3:4:5:6 type geneve id 200 \
  remote 192.168.13.2
RTNETLINK answers: Invalid argument

After:
==
$ ip link add gen1 address 0:1:2:3:4:5:6 type geneve id 200 \
  remote 192.168.13.2
Error: Provided link layer address is not Ethernet

Also, netdev_dbg() calls used to log errors associated with Netlink
request have been removed.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/geneve.c | 128 ---
 1 file changed, 92 insertions(+), 36 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 745d57ae..977dbcc 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1086,21 +1086,33 @@ static int geneve_validate(struct nlattr *tb[], struct 
nlattr *data[],
   struct netlink_ext_ack *extack)
 {
if (tb[IFLA_ADDRESS]) {
-   if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+   if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
+   "Provided link layer address is not 
Ethernet");
return -EINVAL;
+   }
 
-   if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+   if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+   NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
+   "Provided Ethernet address is not 
unicast");
return -EADDRNOTAVAIL;
+   }
}
 
-   if (!data)
+   if (!data) {
+   NL_SET_ERR_MSG(extack,
+  "Not enough attributes provided to perform the 
operation");
return -EINVAL;
+   }
 
if (data[IFLA_GENEVE_ID]) {
__u32 vni =  nla_get_u32(data[IFLA_GENEVE_ID]);
 
-   if (vni >= GENEVE_VID_MASK)
+   if (vni >= GENEVE_VID_MASK) {
+   NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_ID],
+   "Geneve ID must be lower than 
16777216");
return -ERANGE;
+   }
}
 
return 0;
@@ -1158,6 +1170,7 @@ static bool geneve_dst_addr_equal(struct ip_tunnel_info 
*a,
 }
 
 static int geneve_configure(struct net *net, struct net_device *dev,
+   struct netlink_ext_ack *extack,
const struct ip_tunnel_info *info,
bool metadata, bool ipv6_rx_csum)
 {
@@ -1166,8 +1179,11 @@ static int geneve_configure(struct net *net, struct 
net_device *dev,
bool tun_collect_md, tun_on_same_port;
int err, encap_len;
 
-   if (metadata && !is_tnl_info_zero(info))
+   if (metadata && !is_tnl_info_zero(info)) {
+   NL_SET_ERR_MSG(extack,
+  "Device is externally controlled, so attributes 
(VNI, Port, and so on) must not be specified");
return -EINVAL;
+   }
 
geneve->net = net;
geneve->dev = dev;
@@ -1188,11 +1204,17 @@ static int geneve_configure(struct net *net, struct 
net_device *dev,
dev->needed_headroom = encap_len + ETH_HLEN;
 
if (metadata) {
-   if (tun_on_same_port)
+   if (tun_on_same_port) {
+   NL_SET_ERR_MSG(extack,
+  "There can be only one externally 
controlled device on a destination port");
return -EPERM;
+   }
} else {
-   if (tun_collect_md)
+   if (tun_collect_md) {
+   NL_SET_ERR_MSG(extack,
+  "There already exists an externally 
controlled device on this destination port");
return -EPERM;
+   }
}
 
dst_cache_reset(>info.dst_cache);
@@ -1214,31 +1236,41 @@ static void init_tnl_info(struct ip_tunnel_info *info, 
__u16 dst_port)
info->key.tp_dst = htons(dst_port);
 }
 
-static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[], struct ip_tunnel_info *info,
- bool *metadata, bool *use_udp6_rx_checksums,
- bool changelink)
+static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack,
+ struct ip_tunnel_info *info, bool *metadata,
+ 

Re: [PATCH v2 iproute2] lib: Dump ext-ack string by default

2017-08-09 Thread Girish Moodalbail

On 8/8/17 7:30 AM, David Ahern wrote:

In time, errfn can be implemented for link, route, etc commands to
give a much more detailed response (e.g., point to the attribute
that failed). Doing so is much more complicated to process the
message and convert attribute ids to names.

In any case the error string returned by the kernel should be dumped
to the user, so make that happen now.

Signed-off-by: David Ahern 
---
v2
- check that errmsg is non-null

  lib/libnetlink.c | 15 ++-
  1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 145de2cb0ccf..063f5cd6c7b1 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -61,7 +61,6 @@ static int err_attr_cb(const struct nlattr *attr, void *data)
return MNL_CB_OK;
  }
  
-

  /* dump netlink extended ack error message */
  static int nl_dump_ext_err(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn)
  {
@@ -72,9 +71,6 @@ static int nl_dump_ext_err(const struct nlmsghdr *nlh, 
nl_ext_ack_fn_t errfn)
const char *errmsg = NULL;
uint32_t off = 0;
  
-	if (!errfn)

-   return 0;
-
/* no TLVs, nothing to do here */
if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS))
return 0;
@@ -99,7 +95,16 @@ static int nl_dump_ext_err(const struct nlmsghdr *nlh, 
nl_ext_ack_fn_t errfn)
err_nlh = >msg;
}
  
-	return errfn(errmsg, off, err_nlh);

+   if (errfn)
+   return errfn(errmsg, off, err_nlh);
+
+   if (errmsg) {
+   fprintf(stderr, "Error: %s\n", errmsg);


Should the above output end with a period '.'? All the error messages in the 
Kernel are statements without a terminating period, so the output will look 
something like this..


Error: Minimally Geneve ID and Remote IP address are required
Error: Provided Ethernet address is not unicast
Error: Provided link layer address is not Ethernet

Thanks,
~Girish





+
+   return 1;
+   }
+
+   return 0;
  }
  #else
  #warning "libmnl required for error support"





[PATCH net] geneve: maximum value of VNI cannot be used

2017-08-08 Thread Girish Moodalbail
Geneve's Virtual Network Identifier (VNI) is 24 bit long, so the range
of values for it would be from 0 to 16777215 (2^24 -1).  However, one
cannot create a geneve device with VNI set to 16777215. This patch fixes
this issue.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/geneve.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 745d57ae..8b8565d 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1099,7 +1099,7 @@ static int geneve_validate(struct nlattr *tb[], struct 
nlattr *data[],
if (data[IFLA_GENEVE_ID]) {
__u32 vni =  nla_get_u32(data[IFLA_GENEVE_ID]);
 
-   if (vni >= GENEVE_VID_MASK)
+   if (vni >= GENEVE_N_VID)
return -ERANGE;
}
 
-- 
1.8.3.1



[PATCH iproute2] geneve: support for modifying geneve device

2017-07-25 Thread Girish Moodalbail
Ability to change geneve device attributes was added to kernel through
commit 5b861f6baa3a ("geneve: add rtnl changelink support"), however one
cannot do the same through ip-link(8) command.  Changing the allowed
geneve device attributes using 'ip link set  type geneve id
 ' currently fails with 'operation not
supported' error.  This patch adds support for it.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 ip/iplink_geneve.c | 82 +++---
 1 file changed, 60 insertions(+), 22 deletions(-)

diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 2c510fc..594a3e5 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -15,6 +15,8 @@
 #include "utils.h"
 #include "ip_common.h"
 
+#define GENEVE_ATTRSET(attrs, type) (((attrs) & (1L << (type))) != 0)
+
 static void print_explain(FILE *f)
 {
fprintf(f,
@@ -42,11 +44,20 @@ static void explain(void)
print_explain(stderr);
 }
 
+static void check_duparg(__u64 *attrs, int type, const char *key,
+const char *argv)
+{
+   if (!GENEVE_ATTRSET(*attrs, type)) {
+   *attrs |= (1L << type);
+   return;
+   }
+   duparg2(key, argv);
+}
+
 static int geneve_parse_opt(struct link_util *lu, int argc, char **argv,
  struct nlmsghdr *n)
 {
__u32 vni = 0;
-   int vni_set = 0;
__u32 daddr = 0;
struct in6_addr daddr6 = IN6ADDR_ANY_INIT;
__u32 label = 0;
@@ -55,22 +66,24 @@ static int geneve_parse_opt(struct link_util *lu, int argc, 
char **argv,
__u16 dstport = 0;
bool metadata = 0;
__u8 udpcsum = 0;
-   bool udpcsum_set = false;
__u8 udp6zerocsumtx = 0;
-   bool udp6zerocsumtx_set = false;
__u8 udp6zerocsumrx = 0;
-   bool udp6zerocsumrx_set = false;
+   __u64 attrs = 0;
+   bool set_op = (n->nlmsg_type == RTM_NEWLINK &&
+  !(n->nlmsg_flags & NLM_F_CREATE));
 
while (argc > 0) {
if (!matches(*argv, "id") ||
!matches(*argv, "vni")) {
NEXT_ARG();
+   check_duparg(, IFLA_GENEVE_ID, "id", *argv);
if (get_u32(, *argv, 0) ||
vni >= 1u << 24)
invarg("invalid id", *argv);
-   vni_set = 1;
} else if (!matches(*argv, "remote")) {
NEXT_ARG();
+   check_duparg(, IFLA_GENEVE_REMOTE, "remote",
+*argv);
if (!inet_get_addr(*argv, , )) {
fprintf(stderr, "Invalid address \"%s\"\n", 
*argv);
return -1;
@@ -82,6 +95,7 @@ static int geneve_parse_opt(struct link_util *lu, int argc, 
char **argv,
unsigned int uval;
 
NEXT_ARG();
+   check_duparg(, IFLA_GENEVE_TTL, "ttl", *argv);
if (strcmp(*argv, "inherit") != 0) {
if (get_unsigned(, *argv, 0))
invarg("invalid TTL", *argv);
@@ -94,6 +108,7 @@ static int geneve_parse_opt(struct link_util *lu, int argc, 
char **argv,
__u32 uval;
 
NEXT_ARG();
+   check_duparg(, IFLA_GENEVE_TOS, "tos", *argv);
if (strcmp(*argv, "inherit") != 0) {
if (rtnl_dsfield_a2n(, *argv))
invarg("bad TOS value", *argv);
@@ -105,36 +120,50 @@ static int geneve_parse_opt(struct link_util *lu, int 
argc, char **argv,
__u32 uval;
 
NEXT_ARG();
+   check_duparg(, IFLA_GENEVE_LABEL, "flowlabel",
+*argv);
if (get_u32(, *argv, 0) ||
(uval & ~LABEL_MAX_MASK))
invarg("invalid flowlabel", *argv);
label = htonl(uval);
} else if (!matches(*argv, "dstport")) {
NEXT_ARG();
+   check_duparg(, IFLA_GENEVE_PORT, "dstport",
+*argv);
if (get_u16(, *argv, 0))
invarg("dstport", *argv);
} else if (!matches(*argv, "external")) {
+   check_duparg(, IFLA_GENEVE_COLLECT_METADATA,
+*argv, *argv);
metad

[PATCH net-next v3 1/1] geneve: add rtnl changelink support

2017-07-20 Thread Girish Moodalbail
This patch adds changelink rtnl operation support for geneve devices
and the code changes involve:

  - added geneve_quiesce() which quiesces the geneve device data path
for both TX and RX. This lets us perform the changelink operation
atomically w.r.t data path. Also added geneve_unquiesce() to
reverse the operation of geneve_quiesce().

  - refactor geneve_newlink into geneve_nl2info to be used by both
geneve_newlink and geneve_changelink

  - geneve_nl2info takes a changelink boolean argument to isolate
changelink checks.

  - Allow changing only a few attributes (ttl, tos, and remote tunnel
endpoint IP address (within the same address family)):
- return -EOPNOTSUPP for attributes that cannot be changed for
  now. Incremental patches can make the non-supported one
  available in the future if needed.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
v2 -> v3:
   - removed the use of inline for new functions in my patch
   - removed an extra check for socket in the datapath and instead
 I am piggybacking on an already existing check
   - added more comments to quiesce/unquiesce functions

v1 -> v2:
   - added geneve_quiesce() and geneve_unquiesce() functions to
 perform the changelink operation atomically w.r.t data path
---
---
 drivers/net/geneve.c | 218 +--
 1 file changed, 176 insertions(+), 42 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index de8156c..0436a42 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -715,6 +715,7 @@ static int geneve_build_skb(struct dst_entry *dst, struct 
sk_buff *skb,
 
 static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
   struct net_device *dev,
+  struct geneve_sock *gs4,
   struct flowi4 *fl4,
   const struct ip_tunnel_info *info)
 {
@@ -724,7 +725,7 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
struct rtable *rt = NULL;
__u8 tos;
 
-   if (!rcu_dereference(geneve->sock4))
+   if (!gs4)
return ERR_PTR(-EIO);
 
memset(fl4, 0, sizeof(*fl4));
@@ -764,6 +765,7 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
   struct net_device *dev,
+  struct geneve_sock *gs6,
   struct flowi6 *fl6,
   const struct ip_tunnel_info *info)
 {
@@ -771,10 +773,8 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff 
*skb,
struct geneve_dev *geneve = netdev_priv(dev);
struct dst_entry *dst = NULL;
struct dst_cache *dst_cache;
-   struct geneve_sock *gs6;
__u8 prio;
 
-   gs6 = rcu_dereference(geneve->sock6);
if (!gs6)
return ERR_PTR(-EIO);
 
@@ -827,7 +827,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct 
net_device *dev,
__be16 df;
int err;
 
-   rt = geneve_get_v4_rt(skb, dev, , info);
+   rt = geneve_get_v4_rt(skb, dev, gs4, , info);
if (IS_ERR(rt))
return PTR_ERR(rt);
 
@@ -866,7 +866,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct 
net_device *dev,
__be16 sport;
int err;
 
-   dst = geneve_get_v6_dst(skb, dev, , info);
+   dst = geneve_get_v6_dst(skb, dev, gs6, , info);
if (IS_ERR(dst))
return PTR_ERR(dst);
 
@@ -951,8 +951,9 @@ static int geneve_fill_metadata_dst(struct net_device *dev, 
struct sk_buff *skb)
if (ip_tunnel_info_af(info) == AF_INET) {
struct rtable *rt;
struct flowi4 fl4;
+   struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
 
-   rt = geneve_get_v4_rt(skb, dev, , info);
+   rt = geneve_get_v4_rt(skb, dev, gs4, , info);
if (IS_ERR(rt))
return PTR_ERR(rt);
 
@@ -962,8 +963,9 @@ static int geneve_fill_metadata_dst(struct net_device *dev, 
struct sk_buff *skb)
} else if (ip_tunnel_info_af(info) == AF_INET6) {
struct dst_entry *dst;
struct flowi6 fl6;
+   struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
 
-   dst = geneve_get_v6_dst(skb, dev, , info);
+   dst = geneve_get_v6_dst(skb, dev, gs6, , info);
if (IS_ERR(dst))
return PTR_ERR(dst);
 
@@ -1140,6 +1142,15 @@ static bool is_tnl_info_zero(const struct ip_tunnel_info 
*info)
return true;
 }
 
+static bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
+ struct ip_tunnel_info *b)
+{
+

Re: [PATCH net-next v2 1/1] geneve: add rtnl changelink support

2017-07-20 Thread Girish Moodalbail

Hello Pravin,


+/* Quiesces the geneve device data path for both TX and RX. */
+static inline void geneve_quiesce(struct geneve_dev *geneve,
+ struct geneve_sock **gs4,
+ struct geneve_sock **gs6)
+{
+   *gs4 = rtnl_dereference(geneve->sock4);
+   rcu_assign_pointer(geneve->sock4, NULL);
+
+#if IS_ENABLED(CONFIG_IPV6)
+   *gs6 = rtnl_dereference(geneve->sock6);
+   rcu_assign_pointer(geneve->sock6, NULL);
+#else
+   *gs6 = NULL;
+#endif
+   synchronize_net();
+}
+
+/* Resumes the geneve device data path for both TX and RX. */
+static inline void geneve_unquiesce(struct geneve_dev *geneve,
+   struct geneve_sock *gs4,
+   struct geneve_sock __maybe_unused *gs6)
+{
+   rcu_assign_pointer(geneve->sock4, gs4);
+#if IS_ENABLED(CONFIG_IPV6)
+   rcu_assign_pointer(geneve->sock6, gs6);
+#endif
+   synchronize_net();
+}
+
+static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
+struct nlattr *data[],
+struct netlink_ext_ack *extack)
+{
+   struct geneve_dev *geneve = netdev_priv(dev);
+   struct geneve_sock *gs4, *gs6;
+   struct ip_tunnel_info info;
+   bool metadata;
+   bool use_udp6_rx_checksums;
+   int err;
+
+   /* If the geneve device is configured for metadata (or externally
+* controlled, for example, OVS), then nothing can be changed.
+*/
+   if (geneve->collect_md)
+   return -EOPNOTSUPP;
+
+   /* Start with the existing info. */
+   memcpy(, >info, sizeof(info));
+   metadata = geneve->collect_md;
+   use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
+   err = geneve_nl2info(dev, tb, data, , ,
+_udp6_rx_checksums, true);
+   if (err)
+   return err;
+
+   if (!geneve_dst_addr_equal(>info, ))
+   dst_cache_reset(_cache);
+
+   geneve_quiesce(geneve, , );
+   geneve->info = info;
+   geneve->collect_md = metadata;
+   geneve->use_udp6_rx_checksums = use_udp6_rx_checksums;
+   geneve_unquiesce(geneve, gs4, gs6);
+

This is nice trick. But it adds check for the socket in datapath. did
you explore updating entire device state in single atomic transaction?


I did explore, however what I have now seemed like a more concise method to 
perform changelink operation atomically w.r.t the datapath.


That said, there is one thing I could do. Today we already check for the socket 
in datapath like below:


(a) ipv4 datapath today:


  geneve_xmit_skb(...)
  |
  +->geneve_get_v4_rt()
 |
 +---> if (!rcu_dereference(geneve->sock4))
return ERR_PTR(-EIO);

(b) ipv4 datapath with my current patch:


  geneve_xmit_skb(...)
  |
  +->if (!rcu_dereference(geneve->sock4))
  |  return -EIO;
  |
  +->geneve_get_v4_rt()
 |
 +---> if (!rcu_dereference(geneve->sock4))
return ERR_PTR(-EIO);

Perhaps, I could piggyback on the already existing check for NULL socket in 
geneve_get_v4_rt() and avoid the additional check I have added in datapath. (The 
situation is same for IPv6)


regards,
~Girish


Re: [PATCH net-next v2 1/1] geneve: add rtnl changelink support

2017-07-19 Thread Girish Moodalbail

On 7/19/17 4:51 PM, David Miller wrote:

From: Girish Moodalbail <girish.moodalb...@oracle.com>
Date: Tue, 18 Jul 2017 16:33:06 -0700


+static inline bool geneve_dst_addr_equal(struct ip_tunnel_info *a,

  ...

+static inline void geneve_quiesce(struct geneve_dev *geneve,

  ...

+static inline void geneve_unquiesce(struct geneve_dev *geneve,


Please no inline functions in foo.c files, let the compiler
decide.


Sure thing. Will do.

regards,
~Girish



Thanks.





[PATCH net-next v2 1/1] geneve: add rtnl changelink support

2017-07-18 Thread Girish Moodalbail
This patch adds changelink rtnl operation support for geneve devices
and the code changes involve:

  - add geneve_quiesce() which quiesces the geneve device data path
for both TX and RX. This lets us perform the changelink operation
atomically w.r.t data path. Also add geneve_unquiesce() to
reverse the operation of geneve_quiesce().

  - refactor geneve_newlink into geneve_nl2info to be used by both
geneve_newlink and geneve_changelink

  - geneve_nl2info takes a changelink boolean argument to isolate
changelink checks.

  - Allow changing only a few attributes (ttl, tos, and remote tunnel
endpoint IP address (within the same address family)):
- return -EOPNOTSUPP for attributes that cannot be changed for
  now. Incremental patches can make the non-supported one
  available in the future if needed.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
v0 -> v1:
   - added geneve_quiesce() and geneve_unquiesce() functions to
 perform the changelink operation atomically w.r.t data path
---
 drivers/net/geneve.c | 192 +--
 1 file changed, 157 insertions(+), 35 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index de8156c..829f541 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -827,6 +827,9 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct 
net_device *dev,
__be16 df;
int err;
 
+   if (!gs4)
+   return -EIO;
+
rt = geneve_get_v4_rt(skb, dev, , info);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -866,6 +869,9 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct 
net_device *dev,
__be16 sport;
int err;
 
+   if (!gs6)
+   return -EIO;
+
dst = geneve_get_v6_dst(skb, dev, , info);
if (IS_ERR(dst))
return PTR_ERR(dst);
@@ -1140,6 +1146,15 @@ static bool is_tnl_info_zero(const struct ip_tunnel_info 
*info)
return true;
 }
 
+static inline bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
+struct ip_tunnel_info *b)
+{
+   if (ip_tunnel_info_af(a) == AF_INET)
+   return a->key.u.ipv4.dst == b->key.u.ipv4.dst;
+   else
+   return ipv6_addr_equal(>key.u.ipv6.dst, >key.u.ipv6.dst);
+}
+
 static int geneve_configure(struct net *net, struct net_device *dev,
const struct ip_tunnel_info *info,
bool metadata, bool ipv6_rx_csum)
@@ -1197,24 +1212,22 @@ static void init_tnl_info(struct ip_tunnel_info *info, 
__u16 dst_port)
info->key.tp_dst = htons(dst_port);
 }
 
-static int geneve_newlink(struct net *net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
- struct netlink_ext_ack *extack)
+static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[], struct ip_tunnel_info *info,
+ bool *metadata, bool *use_udp6_rx_checksums,
+ bool changelink)
 {
-   bool use_udp6_rx_checksums = false;
-   struct ip_tunnel_info info;
-   bool metadata = false;
-
-   init_tnl_info(, GENEVE_UDP_PORT);
-
if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6])
return -EINVAL;
 
if (data[IFLA_GENEVE_REMOTE]) {
-   info.key.u.ipv4.dst =
+   if (changelink && (ip_tunnel_info_af(info) == AF_INET6))
+   return -EOPNOTSUPP;
+
+   info->key.u.ipv4.dst =
nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
 
-   if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
+   if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) {
netdev_dbg(dev, "multicast remote is unsupported\n");
return -EINVAL;
}
@@ -1222,21 +1235,24 @@ static int geneve_newlink(struct net *net, struct 
net_device *dev,
 
if (data[IFLA_GENEVE_REMOTE6]) {
  #if IS_ENABLED(CONFIG_IPV6)
-   info.mode = IP_TUNNEL_INFO_IPV6;
-   info.key.u.ipv6.dst =
+   if (changelink && (ip_tunnel_info_af(info) == AF_INET))
+   return -EOPNOTSUPP;
+
+   info->mode = IP_TUNNEL_INFO_IPV6;
+   info->key.u.ipv6.dst =
nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);
 
-   if (ipv6_addr_type() &
+   if (ipv6_addr_type(>key.u.ipv6.dst) &
IPV6_ADDR_LINKLOCAL) {
netdev_dbg(dev, "link-local remote is unsupported\n");
return -EINVAL;
}
-   if (ipv6_addr_is_multicast()) {
+   if (ipv6_addr_is_multicast(>

Re: [PATCH net 0/4] macvlan: Fix some issues with changing mac addresses

2017-06-16 Thread Girish Moodalbail

On 6/16/17 6:36 AM, Vladislav Yasevich wrote:

There are some issues in macvlan wrt to changing it's mac address.
* An error is returned in the specified address is the same as an already
  assigned address.
* In passthru mode, the mac address of the macvlan device doesn't change.
* After changing the mac address of a passthru macvlan and then removing it,
  the mac address of the physical device remains changed.

This patch series attempts to resolve these issues.

Thanks
-vlad

Vladislav Yasevich (4):
  macvlan: Do not return error when setting the same mac address
  macvlan: Fix passthru macvlan mac address inheritance
  macvlan: convert port passthru to flags.


Above 3 patches looks good to me, so

Reviewed-by: Girish Moodalbail <girish.moodalb...@oracle.com>



  macvlan: Let passthru macvlan correctly restore lower mac address


However, I have few questions/comments on the above patch.

thanks,
~Girish



 drivers/net/macvlan.c | 85 ++-
 1 file changed, 71 insertions(+), 14 deletions(-)





Re: [PATCH net 4/4] macvlan: Let passthru macvlan correctly restore lower mac address

2017-06-16 Thread Girish Moodalbail
Sorry, it took sometime to wrap around this patch series since they all change 
one file and at times the same function :).



On 6/16/17 6:36 AM, Vladislav Yasevich wrote:

Passthru macvlans directly change the mac address of the lower
level device.  That's OK, but after the macvlan is deleted,
the lower device is left with changed address and one needs to
reboot to bring back the origina HW addresses.


s/origina/original/



This scenario is actually quite common with passthru macvtap devices.

This patch attempts to solve this, by storing the mac address
of the lower device in macvlan_port structure and keeping track of
it through the changes.

After this patch, any changes to the lower device mac address
done trough the macvlan device, will be reverted back.  Any
changes done directly to the lower device mac address will be kept.

Signed-off-by: Vladislav Yasevich 
---
 drivers/net/macvlan.c | 47 ---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index eb956ff..c551165 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -40,6 +40,7 @@
 #define MACVLAN_BC_QUEUE_LEN   1000

 #define MACVLAN_F_PASSTHRU 1
+#define MACVLAN_F_ADDRCHANGE   2

 struct macvlan_port {
struct net_device   *dev;
@@ -51,6 +52,7 @@ struct macvlan_port {
int count;
struct hlist_head   vlan_source_hash[MACVLAN_HASH_SIZE];
DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
+   unsigned char   perm_addr[ETH_ALEN];
 };

 struct macvlan_source_entry {
@@ -78,6 +80,21 @@ static inline void macvlan_set_passthru(struct macvlan_port 
*port)
port->flags |= MACVLAN_F_PASSTHRU;
 }

+static inline bool macvlan_addr_change(const struct macvlan_port *port)
+{
+   return port->flags & MACVLAN_F_ADDRCHANGE;
+}
+
+static inline void macvlan_set_addr_change(struct macvlan_port *port)
+{
+   port->flags |= MACVLAN_F_ADDRCHANGE;
+}
+
+static inline void macvlan_clear_addr_change(struct macvlan_port *port)
+{
+   port->flags &= ~MACVLAN_F_ADDRCHANGE;
+}
+
 /* Hash Ethernet address */
 static u32 macvlan_eth_hash(const unsigned char *addr)
 {
@@ -193,11 +210,11 @@ static void macvlan_hash_change_addr(struct macvlan_dev 
*vlan,
 static bool macvlan_addr_busy(const struct macvlan_port *port,
  const unsigned char *addr)
 {
-   /* Test to see if the specified multicast address is
+   /* Test to see if the specified address is
 * currently in use by the underlying device or
 * another macvlan.
 */
-   if (!macvlan_passthru(port) &&
+   if (!macvlan_passthru(port) && !macvlan_addr_change(port) &&
ether_addr_equal_64bits(port->dev->dev_addr, addr))
return true;

@@ -685,6 +702,7 @@ static int macvlan_sync_address(struct net_device *dev, 
unsigned char *addr)
 {
struct macvlan_dev *vlan = netdev_priv(dev);
struct net_device *lowerdev = vlan->lowerdev;
+   struct macvlan_port *port = vlan->port;
int err;

if (!(dev->flags & IFF_UP)) {
@@ -695,7 +713,7 @@ static int macvlan_sync_address(struct net_device *dev, 
unsigned char *addr)
if (macvlan_addr_busy(vlan->port, addr))
return -EBUSY;

-   if (!macvlan_passthru(vlan->port)) {
+   if (!macvlan_passthru(port)) {
err = dev_uc_add(lowerdev, addr);
if (err)
return err;
@@ -705,6 +723,15 @@ static int macvlan_sync_address(struct net_device *dev, 
unsigned char *addr)

macvlan_hash_change_addr(vlan, addr);
}
+   if (macvlan_passthru(port) && !macvlan_addr_change(port)) {
+   /* Since addr_change isn't set, we are here due to lower
+* device change.  Save the lower-dev address so we can
+* restore it later.
+*/
+   ether_addr_copy(vlan->port->perm_addr,
+   dev->dev_addr);


Did you meant to copy `addr' here? Since dev->dev_addr is that of the macvlan 
device whilst `addr' is from the lower parent device.



Thanks,
~Girish




[PATCH net-next v2] geneve: add missing rx stats accounting

2017-06-08 Thread Girish Moodalbail
There are few places on the receive path where packet drops and packet
errors were not accounted for. This patch fixes that issue.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
v0 -> v1:
   -modified to use canonical post-increment "x++"
---
 drivers/net/geneve.c | 36 
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6ebb0f5..ff626db 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -212,6 +212,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
struct genevehdr *gnvh = geneve_hdr(skb);
struct metadata_dst *tun_dst = NULL;
struct pcpu_sw_netstats *stats;
+   unsigned int len;
int err = 0;
void *oiph;
 
@@ -225,8 +226,10 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags,
 vni_to_tunnel_id(gnvh->vni),
 gnvh->opt_len * 4);
-   if (!tun_dst)
+   if (!tun_dst) {
+   geneve->dev->stats.rx_dropped++;
goto drop;
+   }
/* Update tunnel dst according to Geneve options. */
ip_tunnel_info_opts_set(_dst->u.tun_info,
gnvh->options, gnvh->opt_len * 4);
@@ -234,8 +237,11 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
/* Drop packets w/ critical options,
 * since we don't support any...
 */
-   if (gnvh->critical)
+   if (gnvh->critical) {
+   geneve->dev->stats.rx_frame_errors++;
+   geneve->dev->stats.rx_errors++;
goto drop;
+   }
}
 
skb_reset_mac_header(skb);
@@ -246,8 +252,10 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
skb_dst_set(skb, _dst->dst);
 
/* Ignore packet loops (and multicast echo) */
-   if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr))
+   if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
+   geneve->dev->stats.rx_errors++;
goto drop;
+   }
 
oiph = skb_network_header(skb);
skb_reset_network_header(skb);
@@ -279,13 +287,15 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
}
}
 
-   stats = this_cpu_ptr(geneve->dev->tstats);
-   u64_stats_update_begin(>syncp);
-   stats->rx_packets++;
-   stats->rx_bytes += skb->len;
-   u64_stats_update_end(>syncp);
-
-   gro_cells_receive(>gro_cells, skb);
+   len = skb->len;
+   err = gro_cells_receive(>gro_cells, skb);
+   if (likely(err == NET_RX_SUCCESS)) {
+   stats = this_cpu_ptr(geneve->dev->tstats);
+   u64_stats_update_begin(>syncp);
+   stats->rx_packets++;
+   stats->rx_bytes += len;
+   u64_stats_update_end(>syncp);
+   }
return;
 drop:
/* Consume bad packet */
@@ -334,7 +344,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
struct geneve_sock *gs;
int opts_len;
 
-   /* Need Geneve and inner Ethernet header to be present */
+   /* Need UDP and Geneve header to be present */
if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
goto drop;
 
@@ -357,8 +367,10 @@ static int geneve_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
opts_len = geneveh->opt_len * 4;
if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
 htons(ETH_P_TEB),
-!net_eq(geneve->net, dev_net(geneve->dev
+!net_eq(geneve->net, dev_net(geneve->dev {
+   geneve->dev->stats.rx_dropped++;
goto drop;
+   }
 
geneve_rx(geneve, gs, skb);
return 0;
-- 
1.8.3.1



[PATCH] geneve: add missing rx stats accounting

2017-06-08 Thread Girish Moodalbail
There are few places on the receive path where packet drops and packet
errors were not accounted for. This patch fixes that issue.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/geneve.c | 36 
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6ebb0f5..34a4d45 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -212,6 +212,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
struct genevehdr *gnvh = geneve_hdr(skb);
struct metadata_dst *tun_dst = NULL;
struct pcpu_sw_netstats *stats;
+   unsigned int len;
int err = 0;
void *oiph;
 
@@ -225,8 +226,10 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags,
 vni_to_tunnel_id(gnvh->vni),
 gnvh->opt_len * 4);
-   if (!tun_dst)
+   if (!tun_dst) {
+   ++geneve->dev->stats.rx_dropped;
goto drop;
+   }
/* Update tunnel dst according to Geneve options. */
ip_tunnel_info_opts_set(_dst->u.tun_info,
gnvh->options, gnvh->opt_len * 4);
@@ -234,8 +237,11 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
/* Drop packets w/ critical options,
 * since we don't support any...
 */
-   if (gnvh->critical)
+   if (gnvh->critical) {
+   ++geneve->dev->stats.rx_frame_errors;
+   ++geneve->dev->stats.rx_errors;
goto drop;
+   }
}
 
skb_reset_mac_header(skb);
@@ -246,8 +252,10 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
skb_dst_set(skb, _dst->dst);
 
/* Ignore packet loops (and multicast echo) */
-   if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr))
+   if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
+   ++geneve->dev->stats.rx_errors;
goto drop;
+   }
 
oiph = skb_network_header(skb);
skb_reset_network_header(skb);
@@ -279,13 +287,15 @@ static void geneve_rx(struct geneve_dev *geneve, struct 
geneve_sock *gs,
}
}
 
-   stats = this_cpu_ptr(geneve->dev->tstats);
-   u64_stats_update_begin(>syncp);
-   stats->rx_packets++;
-   stats->rx_bytes += skb->len;
-   u64_stats_update_end(>syncp);
-
-   gro_cells_receive(>gro_cells, skb);
+   len = skb->len;
+   err = gro_cells_receive(>gro_cells, skb);
+   if (likely(err == NET_RX_SUCCESS)) {
+   stats = this_cpu_ptr(geneve->dev->tstats);
+   u64_stats_update_begin(>syncp);
+   stats->rx_packets++;
+   stats->rx_bytes += len;
+   u64_stats_update_end(>syncp);
+   }
return;
 drop:
/* Consume bad packet */
@@ -334,7 +344,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
struct geneve_sock *gs;
int opts_len;
 
-   /* Need Geneve and inner Ethernet header to be present */
+   /* Need UDP and Geneve header to be present */
if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
goto drop;
 
@@ -357,8 +367,10 @@ static int geneve_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
opts_len = geneveh->opt_len * 4;
if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
 htons(ETH_P_TEB),
-!net_eq(geneve->net, dev_net(geneve->dev
+!net_eq(geneve->net, dev_net(geneve->dev {
+   ++geneve->dev->stats.rx_dropped;
goto drop;
+   }
 
geneve_rx(geneve, gs, skb);
return 0;
-- 
1.8.3.1



[PATCH net-next] macsec: double accounting of dropped rx/tx packets

2017-05-19 Thread Girish Moodalbail
The macsec implementation shouldn't account for rx/tx packets that are
dropped in the netdev framework. The netdev framework itself accounts
for such packets by atomically updating struct net_device`rx_dropped and
struct net_device`tx_dropped fields. Later on when the stats for macsec
link is retrieved, the packets dropped in netdev framework will be
included in dev_get_stats() after calling macsec.c`macsec_get_stats64()

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/macsec.c | 15 +++
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index cdc347b..91642fd 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -588,8 +588,6 @@ static void count_tx(struct net_device *dev, int ret, int 
len)
stats->tx_packets++;
stats->tx_bytes += len;
u64_stats_update_end(>syncp);
-   } else {
-   dev->stats.tx_dropped++;
}
 }
 
@@ -883,7 +881,7 @@ static void macsec_decrypt_done(struct crypto_async_request 
*base, int err)
struct macsec_dev *macsec = macsec_priv(dev);
struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa;
struct macsec_rx_sc *rx_sc = rx_sa->sc;
-   int len, ret;
+   int len;
u32 pn;
 
aead_request_free(macsec_skb_cb(skb)->req);
@@ -904,11 +902,8 @@ static void macsec_decrypt_done(struct 
crypto_async_request *base, int err)
macsec_reset_skb(skb, macsec->secy.netdev);
 
len = skb->len;
-   ret = gro_cells_receive(>gro_cells, skb);
-   if (ret == NET_RX_SUCCESS)
+   if (gro_cells_receive(>gro_cells, skb) == NET_RX_SUCCESS)
count_rx(dev, len);
-   else
-   macsec->secy.netdev->stats.rx_dropped++;
 
rcu_read_unlock_bh();
 
@@ -1037,7 +1032,6 @@ static void handle_not_macsec(struct sk_buff *skb)
 */
list_for_each_entry_rcu(macsec, >secys, secys) {
struct sk_buff *nskb;
-   int ret;
struct pcpu_secy_stats *secy_stats = 
this_cpu_ptr(macsec->stats);
 
if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) {
@@ -1054,13 +1048,10 @@ static void handle_not_macsec(struct sk_buff *skb)
 
nskb->dev = macsec->secy.netdev;
 
-   ret = netif_rx(nskb);
-   if (ret == NET_RX_SUCCESS) {
+   if (netif_rx(nskb) == NET_RX_SUCCESS) {
u64_stats_update_begin(_stats->syncp);
secy_stats->stats.InPktsUntagged++;
u64_stats_update_end(_stats->syncp);
-   } else {
-   macsec->secy.netdev->stats.rx_dropped++;
}
}
 
-- 
1.8.3.1



Re: [PATCH net-next] geneve: add rtnl changelink support

2017-05-18 Thread Girish Moodalbail
TL DR; There is indeed a race between geneve_changelink() and geneve transmit 
path w.r.t attributes being changed and the old value of those attributes being 
used in the transmit patch. I will resubmit V2 of the patch with those issues 
addressed. Thanks!


Please see in-line for my other comments..




Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/geneve.c | 149 ---
 1 file changed, 117 insertions(+), 32 deletions(-)


...

@@ -1169,45 +1181,58 @@ static void init_tnl_info(struct ip_tunnel_info *info, 
__u16 dst_port)
info->key.tp_dst = htons(dst_port);
 }

-static int geneve_newlink(struct net *net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[], struct ip_tunnel_info *info,
+ bool *metadata, bool *use_udp6_rx_checksums,
+ bool changelink)
 {
-   bool use_udp6_rx_checksums = false;
-   struct ip_tunnel_info info;
-   bool metadata = false;
+   struct geneve_dev *geneve = netdev_priv(dev);

-   init_tnl_info(, GENEVE_UDP_PORT);
+   if (changelink) {
+   /* if changelink operation, start with old existing info */
+   memcpy(info, >info, sizeof(*info));
+   *metadata = geneve->collect_md;
+   *use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
+   } else {
+   init_tnl_info(info, GENEVE_UDP_PORT);
+   }

if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6])
return -EINVAL;

if (data[IFLA_GENEVE_REMOTE]) {
-   info.key.u.ipv4.dst =
+   info->key.u.ipv4.dst =
nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);

-   if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
+   if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) {
netdev_dbg(dev, "multicast remote is unsupported\n");
return -EINVAL;
}
+   if (changelink &&
+   ip_tunnel_info_af(>info) == AF_INET6) {
+   info->mode &= ~IP_TUNNEL_INFO_IPV6;
+   info->key.tun_flags &= ~TUNNEL_CSUM;
+   *use_udp6_rx_checksums = false;
+   }

This allows changelink to change ipv4 address but there are no changes
made to the geneve tunnel port hash table after this update.


The following code in geneve_changelink() does what you are asking for

+if (!geneve_dst_addr_equal(>info, ))
+dst_cache_reset(_cache);

geneve_nl2info() accrues all the allowed changes to be made and captures it in 
ip_tunnel_info structure and then the above code in geneve_changelink() ensures 
that all the route cache associated with the old remote address are released 
when the next lookup occurs.



We also
need to check to see if there is any conflicts with existing ports.


This is not needed since we don't support changing the remote port.



What is the barrier between the rx/tx threads and changelink process?


There is an issue here like you pointed out (thanks!). Will fix that issue.




}

if (data[IFLA_GENEVE_REMOTE6]) {
  #if IS_ENABLED(CONFIG_IPV6)
-   info.mode = IP_TUNNEL_INFO_IPV6;
-   info.key.u.ipv6.dst =
+   info->mode = IP_TUNNEL_INFO_IPV6;
+   info->key.u.ipv6.dst =
nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);

-   if (ipv6_addr_type() &
+   if (ipv6_addr_type(>key.u.ipv6.dst) &
IPV6_ADDR_LINKLOCAL) {
netdev_dbg(dev, "link-local remote is unsupported\n");
return -EINVAL;
}
-   if (ipv6_addr_is_multicast()) {
+   if (ipv6_addr_is_multicast(>key.u.ipv6.dst)) {
netdev_dbg(dev, "multicast remote is unsupported\n");
return -EINVAL;
}
-   info.key.tun_flags |= TUNNEL_CSUM;
-   use_udp6_rx_checksums = true;
+   info->key.tun_flags |= TUNNEL_CSUM;
+   *use_udp6_rx_checksums = true;

Same here. We need to check/fix the geneve tunnel hash table according
to new IP address.


This is taken care by the call to dst_cache_reset() whenever the remote address 
changes. This function already takes care of races and contentions


8<-8<--
/**
 *  dst_cache_reset - invalidate the cache contents
 *  @dst_cache: the cache
 *
 *  This do not free the cached dst to avoid races and contentions.
 *  the dst will be freed on later cache lookup.
 */
s

Re: [PATCH net-next] geneve: add rtnl changelink support

2017-05-16 Thread Girish Moodalbail

On 5/16/17 12:31 PM, David Miller wrote:

From: Girish Moodalbail <girish.moodalb...@oracle.com>
Date: Mon, 15 May 2017 10:47:04 -0700


if (data[IFLA_GENEVE_REMOTE]) {
-   info.key.u.ipv4.dst =
+   info->key.u.ipv4.dst =
nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);

-   if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
+   if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) {
netdev_dbg(dev, "multicast remote is unsupported\n");
return -EINVAL;
}
+   if (changelink &&
+   ip_tunnel_info_af(>info) == AF_INET6) {
+   info->mode &= ~IP_TUNNEL_INFO_IPV6;
+   info->key.tun_flags &= ~TUNNEL_CSUM;
+   *use_udp6_rx_checksums = false;
+   }
}


I don't understand this "changelink" guarded code, why do you need to
clear all of this state out if the existing tunnel type if AF_INET6
and only when doing a changelink?

In any event, I think you need to add a comment explaining it.



If geneve link was overlayed over IPv6 network and now the user modifies the 
link to be over IPv4 network by doing


# ip link set gen0 type geneve id 100 remote 192.168.13.2

Then we will need to

 - reset info->mode to be not IPv6 type
 - the default for UDP checksum over IPv4 is 'no', so reset that and
 - set use_udp6_rx_checksums to its default value which is false.

I will capture the above information concisely in a comment around that 
'changelink' guard.


thanks,
~Girish



[PATCH net-next] geneve: add rtnl changelink support

2017-05-15 Thread Girish Moodalbail
This patch adds changelink rtnl operation support for geneve devices.
Code changes involve:
  - refactor geneve_newlink into geneve_nl2info to be used by both
geneve_newlink and geneve_changelink
  - geneve_nl2info takes a changelink boolean argument to isolate
changelink checks and updates.
  - Allow changing only a few attributes:
- return -EOPNOTSUPP for attributes that cannot be changed for
  now. Incremental patches can make the non-supported one
  available in the future if needed.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/geneve.c | 149 ---
 1 file changed, 117 insertions(+), 32 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index dec5d56..6528910 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1112,6 +1112,18 @@ static bool is_tnl_info_zero(const struct ip_tunnel_info 
*info)
return true;
 }
 
+static inline bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
+struct ip_tunnel_info *b)
+{
+   if (ip_tunnel_info_af(a) != ip_tunnel_info_af(b))
+   return false;
+
+   if (ip_tunnel_info_af(a) == AF_INET)
+   return a->key.u.ipv4.dst == b->key.u.ipv4.dst;
+   else
+   return ipv6_addr_equal(>key.u.ipv6.dst, >key.u.ipv6.dst);
+}
+
 static int geneve_configure(struct net *net, struct net_device *dev,
const struct ip_tunnel_info *info,
bool metadata, bool ipv6_rx_csum)
@@ -1169,45 +1181,58 @@ static void init_tnl_info(struct ip_tunnel_info *info, 
__u16 dst_port)
info->key.tp_dst = htons(dst_port);
 }
 
-static int geneve_newlink(struct net *net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[], struct ip_tunnel_info *info,
+ bool *metadata, bool *use_udp6_rx_checksums,
+ bool changelink)
 {
-   bool use_udp6_rx_checksums = false;
-   struct ip_tunnel_info info;
-   bool metadata = false;
+   struct geneve_dev *geneve = netdev_priv(dev);
 
-   init_tnl_info(, GENEVE_UDP_PORT);
+   if (changelink) {
+   /* if changelink operation, start with old existing info */
+   memcpy(info, >info, sizeof(*info));
+   *metadata = geneve->collect_md;
+   *use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
+   } else {
+   init_tnl_info(info, GENEVE_UDP_PORT);
+   }
 
if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6])
return -EINVAL;
 
if (data[IFLA_GENEVE_REMOTE]) {
-   info.key.u.ipv4.dst =
+   info->key.u.ipv4.dst =
nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
 
-   if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
+   if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) {
netdev_dbg(dev, "multicast remote is unsupported\n");
return -EINVAL;
}
+   if (changelink &&
+   ip_tunnel_info_af(>info) == AF_INET6) {
+   info->mode &= ~IP_TUNNEL_INFO_IPV6;
+   info->key.tun_flags &= ~TUNNEL_CSUM;
+   *use_udp6_rx_checksums = false;
+   }
}
 
if (data[IFLA_GENEVE_REMOTE6]) {
  #if IS_ENABLED(CONFIG_IPV6)
-   info.mode = IP_TUNNEL_INFO_IPV6;
-   info.key.u.ipv6.dst =
+   info->mode = IP_TUNNEL_INFO_IPV6;
+   info->key.u.ipv6.dst =
nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);
 
-   if (ipv6_addr_type() &
+   if (ipv6_addr_type(>key.u.ipv6.dst) &
IPV6_ADDR_LINKLOCAL) {
netdev_dbg(dev, "link-local remote is unsupported\n");
return -EINVAL;
}
-   if (ipv6_addr_is_multicast()) {
+   if (ipv6_addr_is_multicast(>key.u.ipv6.dst)) {
netdev_dbg(dev, "multicast remote is unsupported\n");
return -EINVAL;
}
-   info.key.tun_flags |= TUNNEL_CSUM;
-   use_udp6_rx_checksums = true;
+   info->key.tun_flags |= TUNNEL_CSUM;
+   *use_udp6_rx_checksums = true;
 #else
return -EPFNOSUPPORT;
 #endif
@@ -1216,48 +1241,107 @@ static int geneve_newlink(struct net *net, struct 
net_device *dev,
if (data[IFLA_GENEVE_ID]) {
__u32 vni;
__u8 tvni[3];
+   _

Re: [PATCH iproute2 v2 1/1] vxlan: Add support for modifying vxlan device attributes

2017-05-10 Thread Girish Moodalbail

On 5/7/17 7:40 AM, Roman Mashak wrote:

Girish Moodalbail <girish.moodalb...@oracle.com> writes:


[...]


 ip/iplink_vxlan.c | 251 +++---
 1 file changed, 143 insertions(+), 108 deletions(-)

diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index b4ebb13..2bd619d 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -21,6 +21,8 @@
 #include "utils.h"
 #include "ip_common.h"

+#define VXLAN_ATTRSET(attrs, type) (((attrs) & (1L << (type))) != 0)


I think you can drop '!= 0' part in the macro.


Sure it can be done that way as well. However, I have seen both forms in use in 
iproute2 code base, so I would like to keep this as is unless you feel strongly 
about it. Furthermore, running 'checkpatch.pl --strict' on the patch didn't 
complain too.


regards,
~Girish



[...]



+static void check_duparg(__u64 *attrs, int type, const char *key,
+const char *argv)
+{
+   if (!VXLAN_ATTRSET(*attrs, type)) {
+   *attrs |= (1L << type);
+   return;
+   }
+   duparg2(key, argv);
+}


[...]





Re: [PATCH] drivers: net: wimax: i2400m: i2400m-usb: Use time_after for time comparison

2017-05-08 Thread Girish Moodalbail

On 5/8/17 2:26 PM, David Miller wrote:

From: Karim Eshapa 
Date: Mon,  8 May 2017 18:58:02 +0200


diff --git a/drivers/net/wimax/i2400m/i2400m-usb.h 
b/drivers/net/wimax/i2400m/i2400m-usb.h
index 649ecad..6fc941c 100644
--- a/drivers/net/wimax/i2400m/i2400m-usb.h
+++ b/drivers/net/wimax/i2400m/i2400m-usb.h
@@ -131,7 +131,7 @@ static inline int edc_inc(struct edc *edc, u16 max_err, u16 
timeframe)
unsigned long now;

now = jiffies;
-   if (now - edc->timestart > timeframe) {
+   if (time_after(now - edc->timestart, (unsigned long)timeframe)) {


This is far from correct.

time_after() compares two "jiffies" timestamp values.  The second
argument here is not a jiffies timestamp value.



Perhaps, what is needed here is:

+   if (time_after(jiffies, edc->timestart + timeframe)) {

where in 'timeframe' is set in HZ in all the callers I checked (for the most 
part it is set to EDC_ERROR_TIMEFRAME which is 1HZ).


~Girish



[PATCH net-next] geneve: add rtnl changelink support

2017-05-08 Thread Girish Moodalbail
This patch adds changelink rtnl operation support for geneve devices.
Code changes involve:
  - refactor geneve_newlink into geneve_nl2info to be used by both
geneve_newlink and geneve_changelink
  - geneve_nl2info takes a changelink boolean argument to isolate
changelink checks and updates.
  - Allow changing only a few attributes:
- return -EOPNOTSUPP for attributes that cannot be changed for
  now. Incremental patches can make the non-supported one
  available in the future if needed.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/geneve.c | 149 ---
 1 file changed, 117 insertions(+), 32 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index dec5d56..6528910 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1112,6 +1112,18 @@ static bool is_tnl_info_zero(const struct ip_tunnel_info 
*info)
return true;
 }
 
+static inline bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
+struct ip_tunnel_info *b)
+{
+   if (ip_tunnel_info_af(a) != ip_tunnel_info_af(b))
+   return false;
+
+   if (ip_tunnel_info_af(a) == AF_INET)
+   return a->key.u.ipv4.dst == b->key.u.ipv4.dst;
+   else
+   return ipv6_addr_equal(>key.u.ipv6.dst, >key.u.ipv6.dst);
+}
+
 static int geneve_configure(struct net *net, struct net_device *dev,
const struct ip_tunnel_info *info,
bool metadata, bool ipv6_rx_csum)
@@ -1169,45 +1181,58 @@ static void init_tnl_info(struct ip_tunnel_info *info, 
__u16 dst_port)
info->key.tp_dst = htons(dst_port);
 }
 
-static int geneve_newlink(struct net *net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[], struct ip_tunnel_info *info,
+ bool *metadata, bool *use_udp6_rx_checksums,
+ bool changelink)
 {
-   bool use_udp6_rx_checksums = false;
-   struct ip_tunnel_info info;
-   bool metadata = false;
+   struct geneve_dev *geneve = netdev_priv(dev);
 
-   init_tnl_info(, GENEVE_UDP_PORT);
+   if (changelink) {
+   /* if changelink operation, start with old existing info */
+   memcpy(info, >info, sizeof(*info));
+   *metadata = geneve->collect_md;
+   *use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
+   } else {
+   init_tnl_info(info, GENEVE_UDP_PORT);
+   }
 
if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6])
return -EINVAL;
 
if (data[IFLA_GENEVE_REMOTE]) {
-   info.key.u.ipv4.dst =
+   info->key.u.ipv4.dst =
nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
 
-   if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
+   if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) {
netdev_dbg(dev, "multicast remote is unsupported\n");
return -EINVAL;
}
+   if (changelink &&
+   ip_tunnel_info_af(>info) == AF_INET6) {
+   info->mode &= ~IP_TUNNEL_INFO_IPV6;
+   info->key.tun_flags &= ~TUNNEL_CSUM;
+   *use_udp6_rx_checksums = false;
+   }
}
 
if (data[IFLA_GENEVE_REMOTE6]) {
  #if IS_ENABLED(CONFIG_IPV6)
-   info.mode = IP_TUNNEL_INFO_IPV6;
-   info.key.u.ipv6.dst =
+   info->mode = IP_TUNNEL_INFO_IPV6;
+   info->key.u.ipv6.dst =
nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);
 
-   if (ipv6_addr_type() &
+   if (ipv6_addr_type(>key.u.ipv6.dst) &
IPV6_ADDR_LINKLOCAL) {
netdev_dbg(dev, "link-local remote is unsupported\n");
return -EINVAL;
}
-   if (ipv6_addr_is_multicast()) {
+   if (ipv6_addr_is_multicast(>key.u.ipv6.dst)) {
netdev_dbg(dev, "multicast remote is unsupported\n");
return -EINVAL;
}
-   info.key.tun_flags |= TUNNEL_CSUM;
-   use_udp6_rx_checksums = true;
+   info->key.tun_flags |= TUNNEL_CSUM;
+   *use_udp6_rx_checksums = true;
 #else
return -EPFNOSUPPORT;
 #endif
@@ -1216,48 +1241,107 @@ static int geneve_newlink(struct net *net, struct 
net_device *dev,
if (data[IFLA_GENEVE_ID]) {
__u32 vni;
__u8 tvni[3];
+   _

[PATCH iproute2 v2 0/1] vxlan: support for modifying vxlan device

2017-05-06 Thread Girish Moodalbail
Hello all,

This patch adds support for modifying VXLAN device attributes. I have
refactored the vxlan_parse_opt() function to be more readable and not
use lot of bool variables.

I have tested my changes by running Linux Test Project's VXLAN
testcases, and I didn't see any regression.
---
v1->v2
- refactored vxlan_parse_opt() to not to use a bunch of
  foo_set variables

Girish Moodalbail (1):
  vxlan: Add support for modifying vxlan device attributes

 ip/iplink_vxlan.c | 251 +++---
 1 file changed, 143 insertions(+), 108 deletions(-)

-- 
1.8.3.1



[PATCH iproute2 v2 1/1] vxlan: Add support for modifying vxlan device attributes

2017-05-06 Thread Girish Moodalbail
Ability to change vxlan device attributes was added to kernel through
commit 8bcdc4f3a20b ("vxlan: add changelink support"), however one
cannot do the same through ip(8) command.  Changing the allowed vxlan
device attributes using 'ip link set dev  type vxlan
' currently fails with 'operation not supported'
error.  This failure is due to the incorrect rtnetlink message
construction for the 'ip link set' operation.

The vxlan_parse_opt() callback function is called for parsing options
for both 'ip link add' and 'ip link set'. For the 'add' case, we pass
down default values for those attributes that were not provided as CLI
options. However, for the 'set' case we should be only passing down the
explicitly provided attributes and not any other (default) attributes.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 ip/iplink_vxlan.c | 251 +++---
 1 file changed, 143 insertions(+), 108 deletions(-)

diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index b4ebb13..2bd619d 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -21,6 +21,8 @@
 #include "utils.h"
 #include "ip_common.h"
 
+#define VXLAN_ATTRSET(attrs, type) (((attrs) & (1L << (type))) != 0)
+
 static void print_explain(FILE *f)
 {
fprintf(f,
@@ -59,54 +61,50 @@ static void explain(void)
print_explain(stderr);
 }
 
+static void check_duparg(__u64 *attrs, int type, const char *key,
+const char *argv)
+{
+   if (!VXLAN_ATTRSET(*attrs, type)) {
+   *attrs |= (1L << type);
+   return;
+   }
+   duparg2(key, argv);
+}
+
 static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv,
  struct nlmsghdr *n)
 {
__u32 vni = 0;
-   int vni_set = 0;
-   __u32 saddr = 0;
__u32 gaddr = 0;
__u32 daddr = 0;
-   struct in6_addr saddr6 = IN6ADDR_ANY_INIT;
struct in6_addr gaddr6 = IN6ADDR_ANY_INIT;
struct in6_addr daddr6 = IN6ADDR_ANY_INIT;
-   unsigned int link = 0;
-   __u8 tos = 0;
-   __u8 ttl = 0;
-   __u32 label = 0;
__u8 learning = 1;
-   __u8 proxy = 0;
-   __u8 rsc = 0;
-   __u8 l2miss = 0;
-   __u8 l3miss = 0;
-   __u8 noage = 0;
-   __u32 age = 0;
-   __u32 maxaddr = 0;
__u16 dstport = 0;
-   __u8 udpcsum = 0;
-   bool udpcsum_set = false;
-   __u8 udp6zerocsumtx = 0;
-   bool udp6zerocsumtx_set = false;
-   __u8 udp6zerocsumrx = 0;
-   bool udp6zerocsumrx_set = false;
-   __u8 remcsumtx = 0;
-   __u8 remcsumrx = 0;
__u8 metadata = 0;
-   __u8 gbp = 0;
-   __u8 gpe = 0;
-   int dst_port_set = 0;
-   struct ifla_vxlan_port_range range = { 0, 0 };
+   __u64 attrs = 0;
+   bool set_op = (n->nlmsg_type == RTM_NEWLINK &&
+  !(n->nlmsg_flags & NLM_F_CREATE));
 
while (argc > 0) {
if (!matches(*argv, "id") ||
!matches(*argv, "vni")) {
+   /* We will add ID attribute outside of the loop since we
+* need to consider metadata information as well.
+*/
NEXT_ARG();
+   check_duparg(, IFLA_VXLAN_ID, "id", *argv);
if (get_u32(, *argv, 0) ||
vni >= 1u << 24)
invarg("invalid id", *argv);
-   vni_set = 1;
} else if (!matches(*argv, "group")) {
+   if (daddr || !IN6_IS_ADDR_UNSPECIFIED()) {
+   fprintf(stderr, "vxlan: both group and remote");
+   fprintf(stderr, " cannot be specified\n");
+   return -1;
+   }
NEXT_ARG();
+   check_duparg(, IFLA_VXLAN_GROUP, "group", *argv);
if (!inet_get_addr(*argv, , )) {
fprintf(stderr, "Invalid address \"%s\"\n", 
*argv);
return -1;
@@ -114,7 +112,13 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
if (!IN6_IS_ADDR_MULTICAST() && 
!IN_MULTICAST(ntohl(gaddr)))
invarg("invalid group address", *argv);
} else if (!matches(*argv, "remote")) {
+   if (gaddr || !IN6_IS_ADDR_UNSPECIFIED()) {
+   fprintf(stderr, "vxlan: both group and remote");
+   fprintf(stderr, " cannot be specified\n");
+   return -1;
+   

Re: [PATCH iproute2] vxlan: Add support for modifying vxlan device attributes

2017-05-04 Thread Girish Moodalbail

On 5/4/17 5:07 PM, Stephen Hemminger wrote:

On Thu,  4 May 2017 14:46:34 -0700
Girish Moodalbail <girish.moodalb...@oracle.com> wrote:


Ability to change vxlan device attributes was added to kernel through
commit 8bcdc4f3a20b ("vxlan: add changelink support"), however one
cannot do the same through ip(8) command.  Changing the allowed vxlan
device attributes using 'ip link set dev  type vxlan
' currently fails with 'operation not supported'
error.  This failure is due to the incorrect rtnetlink message
construction for the 'ip link set' operation.

The vxlan_parse_opt() callback function is called for parsing options
for both 'ip link add' and 'ip link set'. For the 'add' case, we pass
down default values for those attributes that were not provided as CLI
options. However, for the 'set' case we should be only passing down the
explicitly provided attributes and not any other (default) attributes.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---


All these foo_set variables are ugly. This looks almost like machine
generated code. It doesn't read well.


I thought about it, however I wasn't sure if refactoring that whole routine will 
be well received so I decided to follow the current model that already existed 
in iplink_vxlan.c. I will re-submit a patch cleaning up that whole routine.


thanks,
~Girish



[PATCH iproute2] vxlan: Add support for modifying vxlan device attributes

2017-05-04 Thread Girish Moodalbail
Ability to change vxlan device attributes was added to kernel through
commit 8bcdc4f3a20b ("vxlan: add changelink support"), however one
cannot do the same through ip(8) command.  Changing the allowed vxlan
device attributes using 'ip link set dev  type vxlan
' currently fails with 'operation not supported'
error.  This failure is due to the incorrect rtnetlink message
construction for the 'ip link set' operation.

The vxlan_parse_opt() callback function is called for parsing options
for both 'ip link add' and 'ip link set'. For the 'add' case, we pass
down default values for those attributes that were not provided as CLI
options. However, for the 'set' case we should be only passing down the
explicitly provided attributes and not any other (default) attributes.

Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 ip/iplink_vxlan.c | 73 ---
 1 file changed, 59 insertions(+), 14 deletions(-)

diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index b4ebb13..c8959aa 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -72,16 +72,25 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
struct in6_addr daddr6 = IN6ADDR_ANY_INIT;
unsigned int link = 0;
__u8 tos = 0;
+   bool tos_set = false;
__u8 ttl = 0;
+   bool ttl_set = false;
__u32 label = 0;
+   bool label_set = false;
__u8 learning = 1;
+   bool learning_set = false;
__u8 proxy = 0;
+   bool proxy_set = false;
__u8 rsc = 0;
+   bool rsc_set = false;
__u8 l2miss = 0;
+   bool l2miss_set = false;
__u8 l3miss = 0;
+   bool l3miss_set = false;
__u8 noage = 0;
__u32 age = 0;
__u32 maxaddr = 0;
+   bool maxaddr_set = false;
__u16 dstport = 0;
__u8 udpcsum = 0;
bool udpcsum_set = false;
@@ -90,12 +99,17 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
__u8 udp6zerocsumrx = 0;
bool udp6zerocsumrx_set = false;
__u8 remcsumtx = 0;
+   bool remcsumtx_set = false;
__u8 remcsumrx = 0;
+   bool remcsumrx_set = false;
__u8 metadata = 0;
+   bool metadata_set = false;
__u8 gbp = 0;
__u8 gpe = 0;
int dst_port_set = 0;
struct ifla_vxlan_port_range range = { 0, 0 };
+   bool set_op = (n->nlmsg_type == RTM_NEWLINK &&
+  !(n->nlmsg_flags & NLM_F_CREATE));
 
while (argc > 0) {
if (!matches(*argv, "id") ||
@@ -152,6 +166,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
invarg("TTL must be <= 255", *argv);
ttl = uval;
}
+   ttl_set = true;
} else if (!matches(*argv, "tos") ||
   !matches(*argv, "dsfield")) {
__u32 uval;
@@ -163,6 +178,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
tos = uval;
} else
tos = 1;
+   tos_set = true;
} else if (!matches(*argv, "label") ||
   !matches(*argv, "flowlabel")) {
__u32 uval;
@@ -172,6 +188,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
(uval & ~LABEL_MAX_MASK))
invarg("invalid flowlabel", *argv);
label = htonl(uval);
+   label_set = true;
} else if (!matches(*argv, "ageing")) {
NEXT_ARG();
if (strcmp(*argv, "none") == 0)
@@ -184,6 +201,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
maxaddr = 0;
else if (get_u32(, *argv, 0))
invarg("max addresses", *argv);
+   maxaddr_set = true;
} else if (!matches(*argv, "port") ||
   !matches(*argv, "srcport")) {
NEXT_ARG();
@@ -199,24 +217,34 @@ static int vxlan_parse_opt(struct link_util *lu, int 
argc, char **argv,
dst_port_set = 1;
} else if (!matches(*argv, "nolearning")) {
learning = 0;
+   learning_set = true;
} else if (!matches(*argv, "learning")) {
learning = 1;
+   learning_set = true;
} else if (!matches(*argv, "noproxy")) {
  

Re: struct ip vs struct iphdr

2017-05-04 Thread Girish Moodalbail

On 5/4/17 9:42 AM, Oleg wrote:

  Hi, all.

It seems struct ip and struct iphdr are similar: struct ip, despite of
it name, doesn't contain anything but ip header.

So, my noob question, what is the difference between them?


Also, see this:

http://stackoverflow.com/questions/42840636/difference-between-struct-ip-and-struct-iphdr



Thanks.





[PATCH net-next] geneve: fix incorrect setting of UDP checksum flag

2017-04-27 Thread Girish Moodalbail
Creating a geneve link with 'udpcsum' set results in a creation of link
for which UDP checksum will NOT be computed on outbound packets, as can
be seen below.

11: gen0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN
link/ether c2:85:27:b6:b4:15 brd ff:ff:ff:ff:ff:ff promiscuity 0
geneve id 200 remote 192.168.13.1 dstport 6081 noudpcsum

Similarly, creating a link with 'noudpcsum' set results in a creation
of link for which UDP checksum will be computed on outbound packets.

Fixes: 9b4437a5b870 ("geneve: Unify LWT and netdev handling.")
Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com>
---
 drivers/net/geneve.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 7074b40..dec5d56 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1244,7 +1244,7 @@ static int geneve_newlink(struct net *net, struct 
net_device *dev,
metadata = true;
 
if (data[IFLA_GENEVE_UDP_CSUM] &&
-   !nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
+   nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
info.key.tun_flags |= TUNNEL_CSUM;
 
if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] &&
-- 
1.8.3.1