[PATCH net-next] macvlan: fix memory hole in macvlan_dev
Move 'macaddr_count' from after 'netpoll' to after 'nest_level' to pack and reduce a memory hole. Fixes: 88ca59d1aaf28c25 (macvlan: remove unused fields in struct macvlan_dev) Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- include/linux/if_macvlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index bedf54b..4cb7aee 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -30,10 +30,10 @@ struct macvlan_dev { enum macvlan_mode mode; u16 flags; int nest_level; + unsigned intmacaddr_count; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif - unsigned intmacaddr_count; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, -- 1.8.3.1
[PATCH net v2 0/1] NULL pointer dereference in ipvlan_port_destroy
>From code inspection it appeared that there is a possibility where in ipvlan_port_destroy() might be dealing with a port (struct ipvl_port) that has already been destroyed and is therefore already NULL. However, we don't check for NULL and continue to access the fields which results in a kernel panic. When call to register_netdevice() (called from ipvlan_link_new()) fails, inside that function we call ipvlan_uninit() (through ndo_uninit()) to destroy the ipvlan port. Upon returning unsuccessfully from register_netdevice() we go ahead and call ipvlan_port_destroy() again which causes NULL pointer dereference panic. To test this theory, I loaded up netdev-notifier-error-inject.ko and did $ sudo echo -22 > /sys/kernel/debug/notifier-error-inject/\ netdev/actions/NETDEV_POST_INIT/error $ sudo ip li add ipvl0 link enp7s0 type ipvlan ...system panics... BUG: unable to handle kernel NULL pointer dereference at 0820 IP: ipvlan_port_destroy+0x2a/0xf0 [ipvlan] Similar issue exists in macvlan_port_destroy() and it will be addressed by a separate patch. The following patch fixes the ipvlan case. I tested my changes for regression by running LTP's ipvlan test case. Girish Moodalbail (1): ipvlan: NULL pointer dereference panic in ipvlan_port_destroy drivers/net/ipvlan/ipvlan_main.c | 104 +-- 1 file changed, 55 insertions(+), 49 deletions(-) -- 1.8.3.1
[PATCH net v2 1/1] ipvlan: NULL pointer dereference panic in ipvlan_port_destroy
When call to register_netdevice() (called from ipvlan_link_new()) fails, we call ipvlan_uninit() (through ndo_uninit()) to destroy the ipvlan port. After returning unsuccessfully from register_netdevice() we go ahead and call ipvlan_port_destroy() again which causes NULL pointer dereference panic. Fix the issue by making ipvlan_init() and ipvlan_uninit() call symmetric. The ipvlan port will now be created inside ipvlan_init() and will be destroyed in ipvlan_uninit(). Fixes: 2ad7bf363841 (ipvlan: Initial check-in of the IPVLAN driver) Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- v1 -> v2: - took care of David Miller's comment on ipvlan_init() and ipvlan_uninit() not being symmetric. --- --- drivers/net/ipvlan/ipvlan_main.c | 104 +-- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index a266aa4..30cb803 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -107,16 +107,6 @@ static int ipvlan_port_create(struct net_device *dev) struct ipvl_port *port; int err, idx; - if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) { - netdev_err(dev, "Master is either lo or non-ether device\n"); - return -EINVAL; - } - - if (netdev_is_rx_handler_busy(dev)) { - netdev_err(dev, "Device is already in use.\n"); - return -EBUSY; - } - port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); if (!port) return -ENOMEM; @@ -179,8 +169,9 @@ static void ipvlan_port_destroy(struct net_device *dev) static int ipvlan_init(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); - const struct net_device *phy_dev = ipvlan->phy_dev; - struct ipvl_port *port = ipvlan->port; + struct net_device *phy_dev = ipvlan->phy_dev; + struct ipvl_port *port; + int err; dev->state = (dev->state & ~IPVLAN_STATE_MASK) | (phy_dev->state & IPVLAN_STATE_MASK); @@ -196,18 +187,27 @@ static int ipvlan_init(struct net_device *dev) if (!ipvlan->pcpu_stats) return -ENOMEM; + if (!netif_is_ipvlan_port(phy_dev)) { + err = ipvlan_port_create(phy_dev); + if (err < 0) { + free_percpu(ipvlan->pcpu_stats); + return err; + } + } + port = ipvlan_port_get_rtnl(phy_dev); port->count += 1; - return 0; } static void ipvlan_uninit(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); - struct ipvl_port *port = ipvlan->port; + struct net_device *phy_dev = ipvlan->phy_dev; + struct ipvl_port *port; free_percpu(ipvlan->pcpu_stats); + port = ipvlan_port_get_rtnl(phy_dev); port->count -= 1; if (!port->count) ipvlan_port_destroy(port->dev); @@ -554,7 +554,6 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct net_device *phy_dev; int err; u16 mode = IPVLAN_MODE_L3; - bool create = false; if (!tb[IFLA_LINK]) return -EINVAL; @@ -568,28 +567,41 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, phy_dev = tmp->phy_dev; } else if (!netif_is_ipvlan_port(phy_dev)) { - err = ipvlan_port_create(phy_dev); - if (err < 0) - return err; - create = true; - } + /* Exit early if the underlying link is invalid or busy */ + if (phy_dev->type != ARPHRD_ETHER || + phy_dev->flags & IFF_LOOPBACK) { + netdev_err(phy_dev, + "Master is either lo or non-ether device\n"); + return -EINVAL; + } - if (data && data[IFLA_IPVLAN_MODE]) - mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + if (netdev_is_rx_handler_busy(phy_dev)) { + netdev_err(phy_dev, "Device is already in use.\n"); + return -EBUSY; + } + } - port = ipvlan_port_get_rtnl(phy_dev); ipvlan->phy_dev = phy_dev; ipvlan->dev = dev; - ipvlan->port = port; ipvlan->sfeatures = IPVLAN_FEATURES; ipvlan_adjust_mtu(ipvlan, phy_dev); INIT_LIST_HEAD(>addrs); - /* Flags are per port and latest update overrides. User has -* to be consistent in setting it just like the mode attribute. + /* TODO Probably put random address here to be presented to the +* world but keep using
Re: [PATCH net v2] ipv6: set all.accept_dad to 0 by default
On 11/14/17 11:10 AM, Stefano Brivio wrote: On Tue, 14 Nov 2017 10:30:33 -0800 Girish Moodalbail <girish.moodalb...@oracle.com> wrote: On 11/14/17 5:21 AM, Nicolas Dichtel wrote: With commits 35e015e1f577 and a2d3f3e33853, the global 'accept_dad' flag is also taken into account (default value is 1). If either global or per-interface flag is non-zero, DAD will be enabled on a given interface. This is not backward compatible: before those patches, the user could disable DAD just by setting the per-interface flag to 0. Now, the user instead needs to set both flags to 0 to actually disable DAD. Restore the previous behaviour by setting the default for the global 'accept_dad' flag to 0. This way, DAD is still enabled by default, as per-interface flags are set to 1 on device creation, but setting them to 0 is enough to disable DAD on a given interface. - Before 35e015e1f57a7 and a2d3f3e33853: globalper-interfaceDAD enabled [default] 1 1 yes X 0 no X 1 yes - After 35e015e1f577 and a2d3f3e33853: globalper-interfaceDAD enabled [default] 1 1 yes 0 0 no 0 1 yes 1 0 yes - After this fix: globalper-interfaceDAD enabled 1 1 yes 0 0 no [default] 0 1 yes 1 0 yes Above table can be summarized to.. - After this fix: globalper-interfaceDAD enabled 1 X yes 0 0 no [default] 0 1 yes So, if global is set to '1', then irrespective of what the per-interface value is DAD will be enabled. Is it not confusing. Shouldn't the more specific value override the general value? Might be a bit confusing, yes, but in order to implement an overriding mechanism you would need to implement a tristate option as Eric K. proposed. That is, by default you would have -1 (meaning "don't care") on per-interface flags, and if this value is changed then the per-interface value wins over the global one. Sensible, but I think it's outside of the scope of this patch, which is just intended to restore a specific pre-existing userspace expectation. On the other hand, if the global is set to '0', then per-interface value will be honored (overrides global). So, the meaning of global varies based on its value. Isn't that confusing as well. I don't find this confusing though. Setting the global flag always has the meaning of "force enabling DAD on all interfaces". You would have the same problem if you chose a logical AND between global and per-interface flag. There, setting the global flag would mean "force disabling DAD on all interfaces". So the only indisputable improvement I see here would be to implement a "don't care" value (either for global or for per-interface flags). But I'd rather agree with Nicolas that we should fix a potentially broken userspace assumption first. Agree. Thanks, ~Girish
Re: [PATCH net v2] ipv6: set all.accept_dad to 0 by default
On 11/14/17 5:21 AM, Nicolas Dichtel wrote: With commits 35e015e1f577 and a2d3f3e33853, the global 'accept_dad' flag is also taken into account (default value is 1). If either global or per-interface flag is non-zero, DAD will be enabled on a given interface. This is not backward compatible: before those patches, the user could disable DAD just by setting the per-interface flag to 0. Now, the user instead needs to set both flags to 0 to actually disable DAD. Restore the previous behaviour by setting the default for the global 'accept_dad' flag to 0. This way, DAD is still enabled by default, as per-interface flags are set to 1 on device creation, but setting them to 0 is enough to disable DAD on a given interface. - Before 35e015e1f57a7 and a2d3f3e33853: globalper-interfaceDAD enabled [default] 1 1 yes X 0 no X 1 yes - After 35e015e1f577 and a2d3f3e33853: globalper-interfaceDAD enabled [default] 1 1 yes 0 0 no 0 1 yes 1 0 yes - After this fix: globalper-interfaceDAD enabled 1 1 yes 0 0 no [default] 0 1 yes 1 0 yes Above table can be summarized to.. - After this fix: globalper-interfaceDAD enabled 1 X yes 0 0 no [default] 0 1 yes So, if global is set to '1', then irrespective of what the per-interface value is DAD will be enabled. Is it not confusing. Shouldn't the more specific value override the general value? On the other hand, if the global is set to '0', then per-interface value will be honored (overrides global). So, the meaning of global varies based on its value. Isn't that confusing as well. thanks, ~Girish
Re: KASAN: use-after-free Read in rds_tcp_dev_event
On 11/14/17 5:22 AM, Sowmini Varadhan wrote: A few questions. - First off, why am I not seeing the original mail in this thread even when I search the mail archives, e.g., https://lkml.org/lkml/2017/11/13/954 - Girish Moodalbail writes: The issue here is that we are trying to reference a network namespace (struct net *) that is long gone (i.e., L532 below -- c_net is the culprit). The netns is not "long gone", we are still processing the NETDEV_UNREGISTER_FINAL for loopback. Obviously, I was not talking about the current namespace. Say there are two namespaces - ns1 and ns2 and that both have RDS connections. Deletion of ns1 will be fine. However when ns2 is being deleted, in the rds_tcp_dev_event() callback we walk through the global list and some nodes in that list will be referring to ns1 (that is "long gone"). If you read my earlier email, I was talking about ns1 which is already gone, and we are trying to access it from ns2. ~Girish As I said in my earlier mail, the idea is to extract the list of unique conns that belong to the netns and then destroy both the conn, and all associated paths. Thus there can only be a single thread going through rds_tcp_kill_sock at any time (since we should only get the unregister_final/loopback one time for the netns). (See alos comment block in rds_tcp_dev_event about network activity quiescing). Thus there should be no concurrency issue. However when I just ehecked this, there may be some rds connection refcounting bug. When I quickly tested this, I'm not seeing the expected calls to conn_path_destroy. I'll need some time to take a look, this has been known to work, so something got broken along the way I think we should move away from global list to a per-namespace list. The global list are used only in two places (both of which are per-namespace operations): let's first understand the real root-cause before we start redesigning data-structures. --Sowmini
Re: KASAN: use-after-free Read in rds_tcp_dev_event
On 11/7/17 12:28 PM, syzbot wrote: Hello, syzkaller hit the following crash on 287683d027a3ff83feb6c7044430c79881664ecf git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master compiler: gcc (GCC) 7.1.1 20170620 .config is attached Raw console output is attached. == BUG: KASAN: use-after-free in rds_tcp_kill_sock net/rds/tcp.c:530 [inline] BUG: KASAN: use-after-free in rds_tcp_dev_event+0xc01/0xc90 net/rds/tcp.c:568 Read of size 8 at addr 8801cd879200 by task kworker/u4:3/147 CPU: 0 PID: 147 Comm: kworker/u4:3 Not tainted 4.14.0-rc7+ #156 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: netns cleanup_net Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430 rds_tcp_kill_sock net/rds/tcp.c:530 [inline] rds_tcp_dev_event+0xc01/0xc90 net/rds/tcp.c:568 The issue here is that we are trying to reference a network namespace (struct net *) that is long gone (i.e., L532 below -- c_net is the culprit). 528 spin_lock_irq(_tcp_conn_lock); 529 list_for_each_entry_safe(tc, _tc, _tcp_conn_list, t_tcp_node) { 530 struct net *c_net = tc->t_cpath->cp_conn->c_net; 531 532 if (net != c_net || !tc->t_sock) 533 continue; 534 if (!list_has_conn(_list, tc->t_cpath->cp_conn)) 535 list_move_tail(>t_tcp_node, _list); 536 } 537 spin_unlock_irq(_tcp_conn_lock); 538 list_for_each_entry_safe(tc, _tc, _list, t_tcp_node) { 539 rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn); 540 rds_conn_destroy(tc->t_cpath->cp_conn); 541 } When a network namespace is deleted, devices within that namespace are unregistered and removed one by one. RDS is notified about this event through rds_tcp_dev_event() callback. When the loopback device is removed from the namespace, the above RDS callback function destroys all the RDS connections in that namespace. The loop@L529 above walks through each of the rds_tcp connection in the global list (rds_tcp_conn_list) to see if that connection belongs to the namespace in question. It collects all such connections and destroys them (L538-540). However, it leaves behind some of the rds_tcp connections that shared the same underlying RDS connection (L534 and 535). These connections with pointer to stale network namespace are left behind in the global list. When the 2nd network namespace is deleted, we will hit the above stale pointer and hit UAF panic. I think we should move away from global list to a per-namespace list. The global list are used only in two places (both of which are per-namespace operations): - to destroy all the RDS connections belonging to a namespace when the network namespace is being deleted. - to reset all the RDS connections when socket parameters for a namespace are modified using sysctl Thanks, ~Girish
Re: [Patch net] vlan: fix a use-after-free in vlan_device_event()
On 11/9/17 4:43 PM, Cong Wang wrote: After refcnt reaches zero, vlan_vid_del() could free dev->vlan_info via RCU: RCU_INIT_POINTER(dev->vlan_info, NULL); call_rcu(_info->rcu, vlan_info_rcu_free); However, the pointer 'grp' still points to that memory since it is set before vlan_vid_del(): vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = _info->grp; Depends on when that RCU callback is scheduled, we could trigger a use-after-free in vlan_group_for_each_dev() right following this vlan_vid_del(). Fix it by moving vlan_vid_del() before setting grp. This is also symmetric to the vlan_vid_add() we call in vlan_device_event(). Reported-by: Fengguang Wu <fengguang...@intel.com> Fixes: efc73f4bbc23 ("net: Fix memory leak - vlan_info struct") Cc: Alexander Duyck <alexander.du...@gmail.com> Cc: Linus Torvalds <torva...@linux-foundation.org> Cc: Girish Moodalbail <girish.moodalb...@oracle.com> Signed-off-by: Cong Wang <xiyou.wangc...@gmail.com> LGTM. Reviewed-by: Girish Moodalbail <girish.moodalb...@oracle.com> Thanks, ~Girish --- net/8021q/vlan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9649579b5b9f..4a72ee4e2ae9 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -376,6 +376,9 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } + if (event == NETDEV_DOWN && + (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) + vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) @@ -423,9 +426,6 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, struct net_device *tmp; LIST_HEAD(close_list); - if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) - vlan_vid_del(dev, htons(ETH_P_8021Q), 0); - /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags;
Re: [vlan_device_event] BUG: unable to handle kernel paging request at 6b6b6ccf
On 11/8/17 10:34 PM, Cong Wang wrote: On Wed, Nov 8, 2017 at 7:12 PM, Fengguang Wuwrote: Hi Alex, So looking over the trace the panic seems to be happening after a decnet interface is getting deleted. Is there any chance we could try compiling the kernel without decnet support to see if that is the source of these issues? I don't know if anyone on the Intel Wired Lan team is testing with that enabled so if we can eliminate that as a possible cause that would be useful. Sure and thank you for the suggestion! It looks disabling DECNET still triggers the vlan_device_event BUG. However when looking at the dmesgs, I find another warning just before the vlan_device_event BUG. Not sure if it's related one or independent now-fixed issue. Those decnet symbols are probably noises. Right. This is a 32-bit Kernel compiled with CONFIG_PREEMPT=y (I am guessing that this has exposed some lock bug). Also, VLAN (8021q) is compiled into the kernel, so it registers a vlan_device_event() callback on boot. There may not be a VLAN device per-se. Upon receiving NETDEV_DOWN event, we are calling vlan_vid_del(dev, htons(ETH_P_8021Q), 0); which in turn calls call_rcu() to queue vlan_info_free_rcu() to be called at some point. This free function frees the array[] (vlan_info.vlan_grp.vn_devices_array). My guess is that vlan_info_free_rcu() is being called first and then the array[] is being accessed in vlan_device_event(). The netifd daemon in OpenWRT is marking the interface down and that is why it is generating NETDEV_DOWN event. And it uses ioctl(SIOCSIFFLAGS, ~IFF_UP) on a AF_UNIX socket. This results in a call to dev_ifsioc() in the kernel with only rtnl_lock() held and it is not in RCU read critical section. ~Girish How do you reproduce it? And what is your setup? Vlan device on top of your eth0 (e1000)?
[PATCH net] net: fix incorrect comment with regard to VLAN packet handling
The commit bcc6d4790361 ("net: vlan: make non-hw-accel rx path similar to hw-accel") unified accel and non-accel path for VLAN RX. With that fix we do not register any packet_type handler for VLANs anymore, so fix the incorrect comment. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- include/linux/netdevice.h | 8 1 file changed, 8 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2eaac7d..7098978 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4411,15 +4411,7 @@ void netdev_printk(const char *level, const struct net_device *dev, * Why 16. Because with 16 the only overlap we get on a hash of the * low nibble of the protocol value is RARP/SNAP/X.25. * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversely affected. - * --BLG - * * 0800IP - * 8100802.1Q VLAN * 0001802.3 * 0002AX.25 * 0004802.2 -- 1.8.3.1
Re: [PATCH net 0/2] NULL pointer dereference in {ipvlan|macvlan}_port_destroy
On 11/2/17 10:05 PM, David Miller wrote: From: Girish Moodalbail <girish.moodalb...@oracle.com> Date: Tue, 31 Oct 2017 09:39:45 -0700 When call to register_netdevice() (called from ipvlan_link_new()) fails, inside that function we call ipvlan_uninit() (through ndo_uninit()) to destroy the ipvlan port. Upon returning unsuccessfully from register_netdevice() we go ahead and call ipvlan_port_destroy() again which causes NULL pointer dereference panic. The problem is that ipvlan doesn't follow the proper convention that ->ndo_uninit() must only release resources allocated by ->ndo_init(). What needs to happen is that the port allocation occur in ->ndo_init(). I agree, will send out V2. I initially started off making them (ndo_init and ndo_uninit) symmetric by moving the port destruction out of ndo_uninit(), but I hit some WARN() errors. Will figure it out. thanks, ~Girish Your fix, while solving some cases, does not fully cover all of the posibiities due to this bug. Please fix this correctly by moving the port allocation and related setup from link creation to ->ndo_init(). Thank you.
[PATCH net 1/2] ipvlan: NULL pointer dereference panic in ipvlan_port_destroy
When call to register_netdevice() (called from ipvlan_link_new()) fails, we call ipvlan_uninit() (through ndo_uninit()) to destroy the ipvlan port. Upon returning unsuccessfully from register_netdevice() we go ahead and call ipvlan_port_destroy() again which causes NULL pointer dereference panic. Fix it. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/ipvlan/ipvlan_main.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index c74893c..00a62a1 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -602,6 +602,12 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, unregister_netdev: unregister_netdevice(dev); remove_ida: + /* Through the call to ipvlan_uninit (ndo_uninit callback) IPvlan port +* might be already destroyed in failure path in register_netdevice() +* or the above call in unregister_netdevice(). +*/ + if (!ipvlan_port_get_rtnl(phy_dev)) + return err; ida_simple_remove(>ida, dev->dev_id); destroy_ipvlan_port: if (create) -- 1.8.3.1
[PATCH net 0/2] NULL pointer dereference in {ipvlan|macvlan}_port_destroy
>From code inspection it appeared that there is a possibility where in ipvlan_port_destroy() might be dealing with a port (struct ipvl_port) that has already been destroyed and is therefore already NULL. However, we don't check for NULL and continue to access the fields which results in a kernel panic. When call to register_netdevice() (called from ipvlan_link_new()) fails, inside that function we call ipvlan_uninit() (through ndo_uninit()) to destroy the ipvlan port. Upon returning unsuccessfully from register_netdevice() we go ahead and call ipvlan_port_destroy() again which causes NULL pointer dereference panic. To test this theory, I loaded up netdev-notifier-error-inject.ko and did # echo -22 > /sys/kernel/debug/notifier-error-inject/\ netdev/actions/NETDEV_POST_INIT/error # ip li add ipvl0 link enp7s0 type ipvlan BUG: unable to handle kernel NULL pointer dereference at 0820 IP: ipvlan_port_destroy+0x2a/0xf0 [ipvlan] Similar issue exists in macvlan_port_destroy(). The following two patches fixes ipvlan and macvlan module. - Girish Moodalbail (2): ipvlan: NULL pointer dereference panic in ipvlan_port_destroy macvlan: NULL pointer dereference panic in macvlan_port_destroy drivers/net/ipvlan/ipvlan_main.c | 6 ++ drivers/net/macvlan.c| 5 - 2 files changed, 10 insertions(+), 1 deletion(-) -- 1.8.3.1
[PATCH net 2/2] macvlan: NULL pointer dereference panic in macvlan_port_destroy
When call to register_netdevice() (called from macvlan_common_newlink()) fails, we call macvlan_uninit() (through ndo_uninit()) to destroy the macvlan port. Upon returning unsuccessfully from register_netdevice() we go ahead and call macvlan_port_destroy() again which causes NULL pointer dereference panic. Fix it. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/macvlan.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d2aea96..2520de8 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1189,6 +1189,9 @@ static void macvlan_port_destroy(struct net_device *dev) struct macvlan_port *port = macvlan_port_get_rtnl(dev); struct sk_buff *skb; + if (!port) + return; + dev->priv_flags &= ~IFF_MACVLAN_PORT; netdev_rx_handler_unregister(dev); @@ -1444,7 +1447,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, unregister_netdevice(dev); destroy_macvlan_port: if (create) - macvlan_port_destroy(port->dev); + macvlan_port_destroy(lowerdev); return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); -- 1.8.3.1
[PATCH] tap: reference to KVA of an unloaded module causes kernel panic
The commit 9a393b5d5988 ("tap: tap as an independent module") created a separate tap module that implements tap functionality and exports interfaces that will be used by macvtap and ipvtap modules to create create respective tap devices. However, that patch introduced a regression wherein the modules macvtap and ipvtap can be removed (through modprobe -r) while there are applications using the respective /dev/tapX devices. These applications cause kernel to hold reference to /dev/tapX through 'struct cdev macvtap_cdev' and 'struct cdev ipvtap_dev' defined in macvtap and ipvtap modules respectively. So, when the application is later closed the kernel panics because we are referencing KVA that is present in the unloaded modules. --8<--- Example --8<-- $ sudo ip li add name mv0 link enp7s0 type macvtap $ sudo ip li show mv0 |grep mv0| awk -e '{print $1 $2}' 14:mv0@enp7s0: $ cat /dev/tap14 & $ lsmod |egrep -i 'tap|vlan' macvtap16384 0 macvlan24576 1 macvtap tap24576 3 macvtap $ sudo modprobe -r macvtap $ fg cat /dev/tap14 ^C <...system panics...> BUG: unable to handle kernel paging request at a038c500 IP: cdev_put+0xf/0x30 --8<-8<-- The fix is to set cdev.owner to the module that creates the tap device (either macvtap or ipvtap). With this set, the operations (in fs/char_dev.c) on char device holds and releases the module through cdev_get() and cdev_put() and will not allow the module to unload prematurely. Fixes: 9a393b5d5988ea4e (tap: tap as an independent module) Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/ipvlan/ipvtap.c | 4 ++-- drivers/net/macvtap.c | 4 ++-- drivers/net/tap.c | 5 +++-- include/linux/if_tap.h | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ipvlan/ipvtap.c b/drivers/net/ipvlan/ipvtap.c index 5dea206..0bcc07f 100644 --- a/drivers/net/ipvlan/ipvtap.c +++ b/drivers/net/ipvlan/ipvtap.c @@ -197,8 +197,8 @@ static int ipvtap_init(void) { int err; - err = tap_create_cdev(_cdev, _major, "ipvtap"); - + err = tap_create_cdev(_cdev, _major, "ipvtap", + THIS_MODULE); if (err) goto out1; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index c2d0ea2..cba5cb3 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -204,8 +204,8 @@ static int macvtap_init(void) { int err; - err = tap_create_cdev(_cdev, _major, "macvtap"); - + err = tap_create_cdev(_cdev, _major, "macvtap", + THIS_MODULE); if (err) goto out1; diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 98ee6cc..1b10fcc 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1249,8 +1249,8 @@ static int tap_list_add(dev_t major, const char *device_name) return 0; } -int tap_create_cdev(struct cdev *tap_cdev, - dev_t *tap_major, const char *device_name) +int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, + const char *device_name, struct module *module) { int err; @@ -1259,6 +1259,7 @@ int tap_create_cdev(struct cdev *tap_cdev, goto out1; cdev_init(tap_cdev, _fops); + tap_cdev->owner = module; err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS); if (err) goto out2; diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 4837157..9ae41cd 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -73,8 +73,8 @@ struct tap_queue { int tap_get_minor(dev_t major, struct tap_dev *tap); void tap_free_minor(dev_t major, struct tap_dev *tap); int tap_queue_resize(struct tap_dev *tap); -int tap_create_cdev(struct cdev *tap_cdev, - dev_t *tap_major, const char *device_name); +int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, + const char *device_name, struct module *module); void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev); #endif /*_LINUX_IF_TAP_H_*/ -- 1.8.3.1
[PATCH net-next] macvlan: remove unused fields in struct macvlan_dev
commit 635b8c8ecdd2 ("tap: Renaming tap related APIs, data structures, macros") captured all the tap related fields into a new struct tap_dev. However, it failed to remove those fields from struct macvlan_dev. Those fields are currently unused and must be removed. While there I moved the comment for MAX_TAP_QUEUES to the right place. Fixes: 635b8c8ecdd27142 (tap: Renaming tap related APIs, data structures, macros) Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- include/linux/if_macvlan.h | 15 --- include/linux/if_tap.h | 4 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 10e319f..e13b369 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -10,13 +10,6 @@ #include struct macvlan_port; -struct macvtap_queue; - -/* - * Maximum times a macvtap device can be opened. This can be used to - * configure the number of receive queue, e.g. for multiqueue virtio. - */ -#define MAX_TAP_QUEUES 256 #define MACVLAN_MC_FILTER_BITS 8 #define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS) @@ -35,14 +28,6 @@ struct macvlan_dev { netdev_features_t set_features; enum macvlan_mode mode; u16 flags; - /* This array tracks active taps. */ - struct tap_queue__rcu *taps[MAX_TAP_QUEUES]; - /* This list tracks all taps (both enabled and disabled) */ - struct list_headqueue_list; - int numvtaps; - int numqueues; - netdev_features_t tap_features; - int minor; int nest_level; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 4837157..d1b5173 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -22,6 +22,10 @@ static inline struct skb_array *tap_get_skb_array(struct file *f) #include #include +/* + * Maximum times a tap device can be opened. This can be used to + * configure the number of receive queue, e.g. for multiqueue virtio. + */ #define MAX_TAP_QUEUES 256 struct tap_queue; -- 1.8.3.1
[PATCH v2] tap: double-free in error path in tap_open()
Double free of skb_array in tap module is causing kernel panic. When tap_set_queue() fails we free skb_array right away by calling skb_array_cleanup(). However, later on skb_array_cleanup() is called again by tap_sock_destruct through sock_put(). This patch fixes that issue. Fixes: 362899b8725b35e3 (macvtap: switch to use skb array) Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- v1 -> v2: - took care of an another issue in failure path of skb_array_init --- drivers/net/tap.c | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 21b71ae..98ee6cc 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -517,6 +517,10 @@ static int tap_open(struct inode *inode, struct file *file) _proto, 0); if (!q) goto err; + if (skb_array_init(>skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) { + sk_free(>sk); + goto err; + } RCU_INIT_POINTER(q->sock.wq, >wq); init_waitqueue_head(>wq.wait); @@ -540,22 +544,18 @@ static int tap_open(struct inode *inode, struct file *file) if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG)) sock_set_flag(>sk, SOCK_ZEROCOPY); - err = -ENOMEM; - if (skb_array_init(>skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) - goto err_array; - err = tap_set_queue(tap, file, q); - if (err) - goto err_queue; + if (err) { + /* tap_sock_destruct() will take care of freeing skb_array */ + goto err_put; + } dev_put(tap->dev); rtnl_unlock(); return err; -err_queue: - skb_array_cleanup(>skb_array); -err_array: +err_put: sock_put(>sk); err: if (tap) -- 1.8.3.1
Re: [PATCH] tap: double-free in error path in tap_open()
On 10/24/17 6:55 PM, Jason Wang wrote: On 2017年10月25日 05:41, Girish Moodalbail wrote: Double free of skb_array in tap module is causing kernel panic. When tap_set_queue() fails we free skb_array right away by calling skb_array_cleanup(). However, later on skb_array_cleanup() is called again by tap_sock_destruct through sock_put(). This patch fixes that issue. Fixes: 362899b8725b35e3 (macvtap: switch to use skb array) Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/tap.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 21b71ae..878520b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -542,20 +542,20 @@ static int tap_open(struct inode *inode, struct file *file) err = -ENOMEM; if (skb_array_init(>skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) - goto err_array; + goto err_put; Looks like this will cause skb_array_cleanup() to be called when skb array was uninitialized? Thanks Jason. This is an existing issue outside of my code. I will send a v2 of the patch that incorporates the fix for this issue as well. I am considering moving skb_array_init() to the beginning of tap_open() (near to where we allocate memory for the tap_queue itself). Thanks again. regards, ~Girish Thanks err = tap_set_queue(tap, file, q); - if (err) - goto err_queue; + if (err) { + /* tap_sock_destruct() will take care of freeing skb_array */ + goto err_put; + } dev_put(tap->dev); rtnl_unlock(); return err; -err_queue: - skb_array_cleanup(>skb_array); -err_array: +err_put: sock_put(>sk); err: if (tap)
[PATCH] tap: double-free in error path in tap_open()
Double free of skb_array in tap module is causing kernel panic. When tap_set_queue() fails we free skb_array right away by calling skb_array_cleanup(). However, later on skb_array_cleanup() is called again by tap_sock_destruct through sock_put(). This patch fixes that issue. Fixes: 362899b8725b35e3 (macvtap: switch to use skb array) Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/tap.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 21b71ae..878520b 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -542,20 +542,20 @@ static int tap_open(struct inode *inode, struct file *file) err = -ENOMEM; if (skb_array_init(>skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) - goto err_array; + goto err_put; err = tap_set_queue(tap, file, q); - if (err) - goto err_queue; + if (err) { + /* tap_sock_destruct() will take care of freeing skb_array */ + goto err_put; + } dev_put(tap->dev); rtnl_unlock(); return err; -err_queue: - skb_array_cleanup(>skb_array); -err_array: +err_put: sock_put(>sk); err: if (tap) -- 1.8.3.1
[RFC] ip: introduce IFA_F_DHCP flag
This flag identifies that the address was obtained through DHCP. Today there is no easy way to find out whether an address on an interface is DHCP controlled or is static. Either you will need to grep for 'dhclient' process (or something else in case one is using a different DHCP client) or if you are using NetworkManager (or some such), then you will need to query through their interface to find out if an address is DHCP or not. This flag will be set by the DHCP clients in the userspace when it brings up the DHCP address on an interface. For example: ISC DHCP client (aka dhclient) today brings up the address on an interface by running ip-address(8) command (in dhclient-script). This command can be extended to include 'dhcp' keyword in its 'add' or 'replace' subcommand. Once this flag is set, the show subcommand can display the keyword 'dhcp' against the address to indicate that the address was obtained through DHCP. This flag can also be set and obtained programmatically using AF_NETLINK. Besides providing observability, this flag will be useful for applications that need to prevent/allow certain settings on addresses based on whether they are DHCP or not. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- include/uapi/linux/if_addr.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index 4318ab1..61aa8f8 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -52,6 +52,7 @@ enum { #define IFA_F_NOPREFIXROUTE0x200 #define IFA_F_MCAUTOJOIN 0x400 #define IFA_F_STABLE_PRIVACY 0x800 +#define IFA_F_DHCP 0x1000 struct ifa_cacheinfo { __u32 ifa_prefered; -- 1.8.3.1
Re: [RFC] Support for UNARP (RFC 1868)
On 10/12/17 5:53 PM, Mahesh Bandewar (महेश बंडेवार) wrote: On Thu, Oct 12, 2017 at 4:06 PM, Girish Moodalbail <girish.moodalb...@oracle.com> wrote: Hello Eric, The basic idea is to mark the ARP entry either FAILED or STALE as soon as we can so that the subsequent packets that depend on that ARP entry will take the slow path (neigh_resolve_output()). Say, if base_reachable_time is 30 seconds, then an ARP entry will be in reachable state somewhere between 15 to 45 seconds. Assuming the worst case, the ARP entry will be in REACHABLE state for 45 seconds and the packets continue to traverse the network towards the source machine and gets dropped their since the VM has moved to destination machine. Instead, based on the received UNARP packet if we mark the ARP entry (a) FAILED - we move to INCOMPLETE state and start the address resolution by sending out ARP packets (up to allowed maximum number) until we get ARP response back at which point we move the ARP entry state to reachable. (b) STALE - we move to DELAY state and set the next timer to DELAY_PROBE_TIME (1 second) and continue to send queued packets in arp_queue. - After 1 sec we move to PROBE state and start the address resolution like in the case(a) above. I was leaning towards (a). One could arbitrarily increase the stale timeout (by changing no of probes). So sender will continue sending traffic to something that has already gone away. STALE doesn't mean bad but here the sender is clearly indicating it's going away so FAILED seems to be the only logical option. I agree. Will TCP flows be terminated, instead of being smoothly migrated (TCP_REPAIR) The TCP flows will not be terminated. Upon receiving UNARP packet, the ARP entry will be marked FAILED. The subsequent TCP packets from the client (towards that IP) will be queued (the first 3 packets in arp_queue and then other packets get dropped) until the IP address is resolved again through the slow path neigh_resolve_output(). The slow path marks the entry as INCOMPLETE and will start sending several ARP requests (ucast_solicit + app_solicit + mcast_solicit) to resolve the IP. If the resolution is successful, then the TCP packets will be sent out. If not, we will invalidate the ARP entry and call arp_error_report() on the queued packets (which will end up sending ICMP_HOST_UNREACH error). This behavior is same as what will occur if TCP server disappears in the middle of a connection. What about IPv6 ? Or maybe more abruptly, do we still need to add features to IPv4 in 2017, 22 years after this RFC came ? ;) Legit question :). Well one thing I have seen in Networking is that an old idea circles back around later and turns out to be useful in new contexts and use cases. Like I enumerated in my initial email there are certain use cases in Cloud that might benefit from UNARP. It doesn't make sense to have this implemented only for IPv4. At this time if equivalent IPv6 feature is missing, I don't see it being useful / acceptable. Obviously UNARP is IPv4 only. I am not aware of any standard way of doing the same for IPv6. If such a standard doesn't exist, then we will have to go through IETF to get one done? If that is the case, can we please do this in a phased manner? This will atleast help the Cloud that are IPv4 only. thanks, ~Girish
Re: [RFC] Support for UNARP (RFC 1868)
Hello Eric, The basic idea is to mark the ARP entry either FAILED or STALE as soon as we can so that the subsequent packets that depend on that ARP entry will take the slow path (neigh_resolve_output()). Say, if base_reachable_time is 30 seconds, then an ARP entry will be in reachable state somewhere between 15 to 45 seconds. Assuming the worst case, the ARP entry will be in REACHABLE state for 45 seconds and the packets continue to traverse the network towards the source machine and gets dropped their since the VM has moved to destination machine. Instead, based on the received UNARP packet if we mark the ARP entry (a) FAILED - we move to INCOMPLETE state and start the address resolution by sending out ARP packets (up to allowed maximum number) until we get ARP response back at which point we move the ARP entry state to reachable. (b) STALE - we move to DELAY state and set the next timer to DELAY_PROBE_TIME (1 second) and continue to send queued packets in arp_queue. - After 1 sec we move to PROBE state and start the address resolution like in the case(a) above. I was leaning towards (a). Please see in-line.. Hi Girish Your description (or patch title) is misleading. You apparently implement the receive side of the RFC. You are right, it implements only the receive side of the RFC. If this RFC is accepted, then we can change arping(8) to generate UNARP requests. We could also add an option to ip-address(8) delete subcommand to generate UNARP whenever an address is deleted from the interface. And the RFC had Proxy ARP in mind. What about security implications ? Yes, this feature will extend the attack surface for L2 networks. However, the attack vectors for this feature should be same as that of the gratuitous ARP, right? The same attack mitigation techniques for gratuitous ARPs is equally applicable here. Will TCP flows be terminated, instead of being smoothly migrated (TCP_REPAIR) The TCP flows will not be terminated. Upon receiving UNARP packet, the ARP entry will be marked FAILED. The subsequent TCP packets from the client (towards that IP) will be queued (the first 3 packets in arp_queue and then other packets get dropped) until the IP address is resolved again through the slow path neigh_resolve_output(). The slow path marks the entry as INCOMPLETE and will start sending several ARP requests (ucast_solicit + app_solicit + mcast_solicit) to resolve the IP. If the resolution is successful, then the TCP packets will be sent out. If not, we will invalidate the ARP entry and call arp_error_report() on the queued packets (which will end up sending ICMP_HOST_UNREACH error). This behavior is same as what will occur if TCP server disappears in the middle of a connection. What about IPv6 ? Or maybe more abruptly, do we still need to add features to IPv4 in 2017, 22 years after this RFC came ? ;) Legit question :). Well one thing I have seen in Networking is that an old idea circles back around later and turns out to be useful in new contexts and use cases. Like I enumerated in my initial email there are certain use cases in Cloud that might benefit from UNARP. regards, ~Girish Thanks.
[RFC] Support for UNARP (RFC 1868)
Add support for UNARP, as detailed in the IETF RFC 1868 (ARP Extension - UNARP). The central idea here is for a node to announce that it is leaving the network and that all the nodes on the L2 broadcast domain to update their ARP tables accordingly (i.e., mark the neighbor entry state to FAILED). Even though the ARP timers on nodes would eventually mark such entries as FAILED it will be more robust if those entries gets marked FAILED sooner with the help from the host that is going away. Besides providing a solution for an usecase, as captured in RFC, of an IP address moving across a proxy server, this feature is even more important for certain use cases in the Cloud. Imagine a tenant who is bringing up and down VM instances for some workload of theirs. If these instances are part of a small subnet, then the new VM instances may be assigned the same IP address (since the subnet pool is small) but with a different MAC address. So, if there is a client which has a stale mapping of the IP address to the old MAC address, then that client will fail to communicate with the new VM instance for some time. Another usecase that comes to mind is that of the Live VM Migration. Imagine a client that is communicating with a VM. Now, let us migrate this VM to a destination machine. The IP address to MAC address mapping for a VM doesn't change after the Live Migration. However, there will be a small amount of time (till the VM sends gratuitous ARP from the destination machine) during which packets from a client will be forwarded to the source machine. This occurs because: - the ARP entry in the client is not invalidated yet and it continues to use the same MAC address and - the MAC address table of all of the intermediate switches between the client and the source machine are not updated yet for the MAC address move. This issue of forwarding the packets to wrong target could be avoided by sending UNARP packets from the source machine. This would invalidate the ARP entry on the client and forces it to resolve the IP address again by broadcasting an ARP request to the network. The VM on the destination machine would then respond back with an ARP response. The ARP response back from the VM should also clean up the MAC address table of the intermediate switches. The following changes implements the UNARP receive processing in the kernel. Once the changes are in the kernel, arping(8) program can be updated to send UNARP packets. Any Thoughts/Comments? Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- Compile-tested only. net/ipv4/arp.c | 46 +++--- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7c45b88..8cb9aa1 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -686,6 +686,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; + bool is_unarp; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. @@ -695,6 +696,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) goto out_free_skb; arp = arp_hdr(skb); + /* arp_rcv has already verified the header for the UNARP case */ + is_unarp = arp->ar_hln == 0; switch (dev_type) { default: @@ -741,8 +744,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); - sha = arp_ptr; - arp_ptr += dev->addr_len; + sha = is_unarp ? NULL : arp_ptr; + arp_ptr += arp->ar_hln; memcpy(, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { @@ -751,8 +754,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) break; #endif default: - tha = arp_ptr; - arp_ptr += dev->addr_len; + tha = is_unarp ? NULL : arp_ptr; + arp_ptr += arp->ar_hln; } memcpy(, arp_ptr, 4); /* @@ -874,7 +877,10 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ - if (!n && + /* If the packet is UNARP and we don't have the corresponding +* neighbour entry, then there is nothing to do. +*/ + if (!n && !is_unarp && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || @@ -899,12 +905,15 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
Re: [PATCH] ipv6: gso: fix payload length when gso_size is zero
On 10/5/17 10:06 AM, Alexey Kodanev wrote: When gso_size reset to zero for the tail segment in skb_segment(), later in ipv6_gso_segment(), we will get incorrect payload_len for that segment. inet_gso_segment() already has a check for gso_size before calculating payload so fixing only IPv6 part. The issue was found with LTP vxlan & gre tests over ixgbe NIC. Fixes: 07b26c9454a2 ("gso: Support partial splitting at the frag_list pointer") Signed-off-by: Alexey Kodanev <alexey.koda...@oracle.com> --- net/ipv6/ip6_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index cdb3728..4a87f94 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (gso_partial) + if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); Reviewed-by: Girish Moodalbail <girish.moodalb...@oracle.com>
[PATCH net-next v3] vxlan: change vxlan_[config_]validate() to use netlink_ext_ack for error reporting
The kernel log is not where users expect error messages for netlink requests; as we have extended acks now, we can replace pr_debug() with NL_SET_ERR_MSG_ATTR(). Signed-off-by: Matthias Schiffer <mschif...@universe-factory.net> Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- v2 -> v1: - improved messages based on the comments from Jiri Benc, Roopa Prabhu, and David Ahern v1 -> v2: - addressed, error messages rewording, comments from Jiri Benc - started off with what Matthias had, and I covered error reporting for all of the unsuccessful returns in vxlan_validate() and vxlan_config_validate() --- drivers/net/vxlan.c | 99 +++-- 1 file changed, 73 insertions(+), 26 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 35e84a9e..ae3a1da 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2729,12 +2729,14 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { - pr_debug("invalid link address (not ethernet)\n"); + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], + "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { - pr_debug("invalid all zero ethernet address\n"); + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], + "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } @@ -2742,18 +2744,27 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); - if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) + if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], + "MTU must be between 68 and 65535"); return -EINVAL; + } } - if (!data) + if (!data) { + NL_SET_ERR_MSG(extack, + "Required attributes not provided to perform the operation"); return -EINVAL; + } if (data[IFLA_VXLAN_ID]) { u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); - if (id >= VXLAN_N_VID) + if (id >= VXLAN_N_VID) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], + "VXLAN ID must be lower than 16777216"); return -ERANGE; + } } if (data[IFLA_VXLAN_PORT_RANGE]) { @@ -2761,8 +2772,8 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], = nla_data(data[IFLA_VXLAN_PORT_RANGE]); if (ntohs(p->high) < ntohs(p->low)) { - pr_debug("port range %u .. %u not valid\n", -ntohs(p->low), ntohs(p->high)); + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE], + "Invalid source port range"); return -EINVAL; } } @@ -2919,7 +2930,8 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan) static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, struct net_device **lower, -struct vxlan_dev *old) +struct vxlan_dev *old, +struct netlink_ext_ack *extack) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *tmp; @@ -2933,6 +2945,8 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, */ if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || !(conf->flags & VXLAN_F_COLLECT_METADATA)) { + NL_SET_ERR_MSG(extack, + "VXLAN GPE does not support this combination of attributes"); return -EINVAL; } } @@ -2947,15 +2961,23 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family; } - if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) + if (conf->saddr.sa.sa_family != conf->re
Re: [PATCH net-next v2] vxlan: change vxlan_[config_]validate() to use netlink_ext_ack for error reporting
[snip] @@ -2933,6 +2945,8 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, */ if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || !(conf->flags & VXLAN_F_COLLECT_METADATA)) { + NL_SET_ERR_MSG(extack, + "VXLAN GPE does not support this combination of attributes"); return -EINVAL; } "collect metadata not supported with vxlan gpe" Actually, VXLAN GPE is only supported together with external keyword. From ip-link(8)... gpe - enables the Generic Protocol extension (VXLAN-GPE). Currently, this is only supported together with the external keyword. That's completely wrong message. Not saying that the capitalization is wrong, too. Girish's wording precisely explains what went wrong. Furthermore, the condition not only checks for collect metadata but also it checks to see if any attribute specified is outside of what is allowed by VXLAN_F_ALLOWED_GPE. lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); - if (!lowerdev) + if (!lowerdev) { + NL_SET_ERR_MSG(extack, + "Specified interface for tunnel endpoint communications not found"); return -ENODEV; "invalid vxlan remote link interface, device not found" Finally one that looks better :-) Modulo the missing capitalization, though. It is not the remote link interface. It is the local interface itself that needs to be specified. - if (vxlan_addr_multicast(>remote_ip)) + if (vxlan_addr_multicast(>remote_ip)) { + NL_SET_ERR_MSG(extack, + "Interface need to be specified for multicast destination"); "vxlan remote link interface required for multicast remote destination" I like this one better, too. #if IS_ENABLED(CONFIG_IPV6) - if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) + if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) { + NL_SET_ERR_MSG(extack, + "Interface need to be specified for link-local local/remote addresses"); return -EINVAL; "vxlan link interface required for link-local local/remote addresses" Okay but to be consistent (and more clear), it should be "remote link interface". There is no remote link while configuring VXLAN. It is the local interface through which VXLAN packets will be send out. Thanks all for the review, ~Girish
[PATCH net-next v2] vxlan: change vxlan_[config_]validate() to use netlink_ext_ack for error reporting
The kernel log is not where users expect error messages for netlink requests; as we have extended acks now, we can replace pr_debug() with NL_SET_ERR_MSG_ATTR(). Signed-off-by: Matthias Schiffer <mschif...@universe-factory.net> Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- v1 -> v2: - addressed, error messages rewording, comments from Jiri Benc - started off with what Matthias had, and I covered error reporting for all of the unsuccessful returns --- drivers/net/vxlan.c | 98 +++-- 1 file changed, 72 insertions(+), 26 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 35e84a9e..ec302cd7 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2729,12 +2729,14 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { - pr_debug("invalid link address (not ethernet)\n"); + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], + "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { - pr_debug("invalid all zero ethernet address\n"); + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], + "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } @@ -2742,18 +2744,27 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); - if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) + if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], + "MTU must be between 68 and 65535"); return -EINVAL; + } } - if (!data) + if (!data) { + NL_SET_ERR_MSG(extack, + "Not enough attributes provided to perform the operation"); return -EINVAL; + } if (data[IFLA_VXLAN_ID]) { u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); - if (id >= VXLAN_N_VID) + if (id >= VXLAN_N_VID) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], + "VXLAN ID must be lower than 16777216"); return -ERANGE; + } } if (data[IFLA_VXLAN_PORT_RANGE]) { @@ -2761,8 +2772,8 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], = nla_data(data[IFLA_VXLAN_PORT_RANGE]); if (ntohs(p->high) < ntohs(p->low)) { - pr_debug("port range %u .. %u not valid\n", -ntohs(p->low), ntohs(p->high)); + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE], + "Provided source port range bounds is invalid"); return -EINVAL; } } @@ -2919,7 +2930,8 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan) static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, struct net_device **lower, -struct vxlan_dev *old) +struct vxlan_dev *old, +struct netlink_ext_ack *extack) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *tmp; @@ -2933,6 +2945,8 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, */ if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || !(conf->flags & VXLAN_F_COLLECT_METADATA)) { + NL_SET_ERR_MSG(extack, + "VXLAN GPE does not support this combination of attributes"); return -EINVAL; } } @@ -2947,15 +2961,23 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family; } - if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) + if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) { + NL_SET_ERR_MSG(extack, + "Local and remote address must be from the same
Re: [PATCH net] geneve: maximum value of VNI cannot be used
On 8/9/17 10:41 PM, David Miller wrote: From: Girish Moodalbail <girish.moodalb...@oracle.com> Date: Tue, 8 Aug 2017 17:26:24 -0700 Geneve's Virtual Network Identifier (VNI) is 24 bit long, so the range of values for it would be from 0 to 16777215 (2^24 -1). However, one cannot create a geneve device with VNI set to 16777215. This patch fixes this issue. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> I always worry that someone, somewhere, might be using this in some way and this will break things. But I'll apply this for now. Thanks David. As per the output of 'ip link help geneve', 16777215 is a valid value. However, due to incorrect check in the kernel that value was not supported. $ ip link help geneve |egrep -A1 VNI Usage: ... geneve id VNI remote ADDR -- Where: VNI := 0-16777215 ADDR := IP_ADDRESS So, this is an off-by-one bug and no one had tripped over it until now. regards, ~Girish
[PATCH net-next] geneve: use netlink_ext_ack for error reporting in rtnl operations
Add extack error messages for failure paths while creating/modifying geneve devices. Once extack support is added to iproute2, more meaningful and helpful error messages will be displayed making it easy for users to discern what went wrong. Before: === $ ip link add gen1 address 0:1:2:3:4:5:6 type geneve id 200 \ remote 192.168.13.2 RTNETLINK answers: Invalid argument After: == $ ip link add gen1 address 0:1:2:3:4:5:6 type geneve id 200 \ remote 192.168.13.2 Error: Provided link layer address is not Ethernet Also, netdev_dbg() calls used to log errors associated with Netlink request have been removed. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/geneve.c | 128 --- 1 file changed, 92 insertions(+), 36 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 745d57ae..977dbcc 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1086,21 +1086,33 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], + "Provided link layer address is not Ethernet"); return -EINVAL; + } - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], + "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; + } } - if (!data) + if (!data) { + NL_SET_ERR_MSG(extack, + "Not enough attributes provided to perform the operation"); return -EINVAL; + } if (data[IFLA_GENEVE_ID]) { __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]); - if (vni >= GENEVE_VID_MASK) + if (vni >= GENEVE_VID_MASK) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_ID], + "Geneve ID must be lower than 16777216"); return -ERANGE; + } } return 0; @@ -1158,6 +1170,7 @@ static bool geneve_dst_addr_equal(struct ip_tunnel_info *a, } static int geneve_configure(struct net *net, struct net_device *dev, + struct netlink_ext_ack *extack, const struct ip_tunnel_info *info, bool metadata, bool ipv6_rx_csum) { @@ -1166,8 +1179,11 @@ static int geneve_configure(struct net *net, struct net_device *dev, bool tun_collect_md, tun_on_same_port; int err, encap_len; - if (metadata && !is_tnl_info_zero(info)) + if (metadata && !is_tnl_info_zero(info)) { + NL_SET_ERR_MSG(extack, + "Device is externally controlled, so attributes (VNI, Port, and so on) must not be specified"); return -EINVAL; + } geneve->net = net; geneve->dev = dev; @@ -1188,11 +1204,17 @@ static int geneve_configure(struct net *net, struct net_device *dev, dev->needed_headroom = encap_len + ETH_HLEN; if (metadata) { - if (tun_on_same_port) + if (tun_on_same_port) { + NL_SET_ERR_MSG(extack, + "There can be only one externally controlled device on a destination port"); return -EPERM; + } } else { - if (tun_collect_md) + if (tun_collect_md) { + NL_SET_ERR_MSG(extack, + "There already exists an externally controlled device on this destination port"); return -EPERM; + } } dst_cache_reset(>info.dst_cache); @@ -1214,31 +1236,41 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) info->key.tp_dst = htons(dst_port); } -static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], struct ip_tunnel_info *info, - bool *metadata, bool *use_udp6_rx_checksums, - bool changelink) +static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack, + struct ip_tunnel_info *info, bool *metadata, +
Re: [PATCH v2 iproute2] lib: Dump ext-ack string by default
On 8/8/17 7:30 AM, David Ahern wrote: In time, errfn can be implemented for link, route, etc commands to give a much more detailed response (e.g., point to the attribute that failed). Doing so is much more complicated to process the message and convert attribute ids to names. In any case the error string returned by the kernel should be dumped to the user, so make that happen now. Signed-off-by: David Ahern--- v2 - check that errmsg is non-null lib/libnetlink.c | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 145de2cb0ccf..063f5cd6c7b1 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -61,7 +61,6 @@ static int err_attr_cb(const struct nlattr *attr, void *data) return MNL_CB_OK; } - /* dump netlink extended ack error message */ static int nl_dump_ext_err(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) { @@ -72,9 +71,6 @@ static int nl_dump_ext_err(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) const char *errmsg = NULL; uint32_t off = 0; - if (!errfn) - return 0; - /* no TLVs, nothing to do here */ if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS)) return 0; @@ -99,7 +95,16 @@ static int nl_dump_ext_err(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) err_nlh = >msg; } - return errfn(errmsg, off, err_nlh); + if (errfn) + return errfn(errmsg, off, err_nlh); + + if (errmsg) { + fprintf(stderr, "Error: %s\n", errmsg); Should the above output end with a period '.'? All the error messages in the Kernel are statements without a terminating period, so the output will look something like this.. Error: Minimally Geneve ID and Remote IP address are required Error: Provided Ethernet address is not unicast Error: Provided link layer address is not Ethernet Thanks, ~Girish + + return 1; + } + + return 0; } #else #warning "libmnl required for error support"
[PATCH net] geneve: maximum value of VNI cannot be used
Geneve's Virtual Network Identifier (VNI) is 24 bit long, so the range of values for it would be from 0 to 16777215 (2^24 -1). However, one cannot create a geneve device with VNI set to 16777215. This patch fixes this issue. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/geneve.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 745d57ae..8b8565d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1099,7 +1099,7 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_GENEVE_ID]) { __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]); - if (vni >= GENEVE_VID_MASK) + if (vni >= GENEVE_N_VID) return -ERANGE; } -- 1.8.3.1
[PATCH iproute2] geneve: support for modifying geneve device
Ability to change geneve device attributes was added to kernel through commit 5b861f6baa3a ("geneve: add rtnl changelink support"), however one cannot do the same through ip-link(8) command. Changing the allowed geneve device attributes using 'ip link set type geneve id ' currently fails with 'operation not supported' error. This patch adds support for it. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- ip/iplink_geneve.c | 82 +++--- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c index 2c510fc..594a3e5 100644 --- a/ip/iplink_geneve.c +++ b/ip/iplink_geneve.c @@ -15,6 +15,8 @@ #include "utils.h" #include "ip_common.h" +#define GENEVE_ATTRSET(attrs, type) (((attrs) & (1L << (type))) != 0) + static void print_explain(FILE *f) { fprintf(f, @@ -42,11 +44,20 @@ static void explain(void) print_explain(stderr); } +static void check_duparg(__u64 *attrs, int type, const char *key, +const char *argv) +{ + if (!GENEVE_ATTRSET(*attrs, type)) { + *attrs |= (1L << type); + return; + } + duparg2(key, argv); +} + static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, struct nlmsghdr *n) { __u32 vni = 0; - int vni_set = 0; __u32 daddr = 0; struct in6_addr daddr6 = IN6ADDR_ANY_INIT; __u32 label = 0; @@ -55,22 +66,24 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, __u16 dstport = 0; bool metadata = 0; __u8 udpcsum = 0; - bool udpcsum_set = false; __u8 udp6zerocsumtx = 0; - bool udp6zerocsumtx_set = false; __u8 udp6zerocsumrx = 0; - bool udp6zerocsumrx_set = false; + __u64 attrs = 0; + bool set_op = (n->nlmsg_type == RTM_NEWLINK && + !(n->nlmsg_flags & NLM_F_CREATE)); while (argc > 0) { if (!matches(*argv, "id") || !matches(*argv, "vni")) { NEXT_ARG(); + check_duparg(, IFLA_GENEVE_ID, "id", *argv); if (get_u32(, *argv, 0) || vni >= 1u << 24) invarg("invalid id", *argv); - vni_set = 1; } else if (!matches(*argv, "remote")) { NEXT_ARG(); + check_duparg(, IFLA_GENEVE_REMOTE, "remote", +*argv); if (!inet_get_addr(*argv, , )) { fprintf(stderr, "Invalid address \"%s\"\n", *argv); return -1; @@ -82,6 +95,7 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, unsigned int uval; NEXT_ARG(); + check_duparg(, IFLA_GENEVE_TTL, "ttl", *argv); if (strcmp(*argv, "inherit") != 0) { if (get_unsigned(, *argv, 0)) invarg("invalid TTL", *argv); @@ -94,6 +108,7 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, __u32 uval; NEXT_ARG(); + check_duparg(, IFLA_GENEVE_TOS, "tos", *argv); if (strcmp(*argv, "inherit") != 0) { if (rtnl_dsfield_a2n(, *argv)) invarg("bad TOS value", *argv); @@ -105,36 +120,50 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, __u32 uval; NEXT_ARG(); + check_duparg(, IFLA_GENEVE_LABEL, "flowlabel", +*argv); if (get_u32(, *argv, 0) || (uval & ~LABEL_MAX_MASK)) invarg("invalid flowlabel", *argv); label = htonl(uval); } else if (!matches(*argv, "dstport")) { NEXT_ARG(); + check_duparg(, IFLA_GENEVE_PORT, "dstport", +*argv); if (get_u16(, *argv, 0)) invarg("dstport", *argv); } else if (!matches(*argv, "external")) { + check_duparg(, IFLA_GENEVE_COLLECT_METADATA, +*argv, *argv); metad
[PATCH net-next v3 1/1] geneve: add rtnl changelink support
This patch adds changelink rtnl operation support for geneve devices and the code changes involve: - added geneve_quiesce() which quiesces the geneve device data path for both TX and RX. This lets us perform the changelink operation atomically w.r.t data path. Also added geneve_unquiesce() to reverse the operation of geneve_quiesce(). - refactor geneve_newlink into geneve_nl2info to be used by both geneve_newlink and geneve_changelink - geneve_nl2info takes a changelink boolean argument to isolate changelink checks. - Allow changing only a few attributes (ttl, tos, and remote tunnel endpoint IP address (within the same address family)): - return -EOPNOTSUPP for attributes that cannot be changed for now. Incremental patches can make the non-supported one available in the future if needed. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- v2 -> v3: - removed the use of inline for new functions in my patch - removed an extra check for socket in the datapath and instead I am piggybacking on an already existing check - added more comments to quiesce/unquiesce functions v1 -> v2: - added geneve_quiesce() and geneve_unquiesce() functions to perform the changelink operation atomically w.r.t data path --- --- drivers/net/geneve.c | 218 +-- 1 file changed, 176 insertions(+), 42 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index de8156c..0436a42 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -715,6 +715,7 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, struct net_device *dev, + struct geneve_sock *gs4, struct flowi4 *fl4, const struct ip_tunnel_info *info) { @@ -724,7 +725,7 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, struct rtable *rt = NULL; __u8 tos; - if (!rcu_dereference(geneve->sock4)) + if (!gs4) return ERR_PTR(-EIO); memset(fl4, 0, sizeof(*fl4)); @@ -764,6 +765,7 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, #if IS_ENABLED(CONFIG_IPV6) static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, struct net_device *dev, + struct geneve_sock *gs6, struct flowi6 *fl6, const struct ip_tunnel_info *info) { @@ -771,10 +773,8 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, struct geneve_dev *geneve = netdev_priv(dev); struct dst_entry *dst = NULL; struct dst_cache *dst_cache; - struct geneve_sock *gs6; __u8 prio; - gs6 = rcu_dereference(geneve->sock6); if (!gs6) return ERR_PTR(-EIO); @@ -827,7 +827,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 df; int err; - rt = geneve_get_v4_rt(skb, dev, , info); + rt = geneve_get_v4_rt(skb, dev, gs4, , info); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -866,7 +866,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - dst = geneve_get_v6_dst(skb, dev, , info); + dst = geneve_get_v6_dst(skb, dev, gs6, , info); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -951,8 +951,9 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) if (ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; struct flowi4 fl4; + struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); - rt = geneve_get_v4_rt(skb, dev, , info); + rt = geneve_get_v4_rt(skb, dev, gs4, , info); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -962,8 +963,9 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct flowi6 fl6; + struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); - dst = geneve_get_v6_dst(skb, dev, , info); + dst = geneve_get_v6_dst(skb, dev, gs6, , info); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -1140,6 +1142,15 @@ static bool is_tnl_info_zero(const struct ip_tunnel_info *info) return true; } +static bool geneve_dst_addr_equal(struct ip_tunnel_info *a, + struct ip_tunnel_info *b) +{ +
Re: [PATCH net-next v2 1/1] geneve: add rtnl changelink support
Hello Pravin, +/* Quiesces the geneve device data path for both TX and RX. */ +static inline void geneve_quiesce(struct geneve_dev *geneve, + struct geneve_sock **gs4, + struct geneve_sock **gs6) +{ + *gs4 = rtnl_dereference(geneve->sock4); + rcu_assign_pointer(geneve->sock4, NULL); + +#if IS_ENABLED(CONFIG_IPV6) + *gs6 = rtnl_dereference(geneve->sock6); + rcu_assign_pointer(geneve->sock6, NULL); +#else + *gs6 = NULL; +#endif + synchronize_net(); +} + +/* Resumes the geneve device data path for both TX and RX. */ +static inline void geneve_unquiesce(struct geneve_dev *geneve, + struct geneve_sock *gs4, + struct geneve_sock __maybe_unused *gs6) +{ + rcu_assign_pointer(geneve->sock4, gs4); +#if IS_ENABLED(CONFIG_IPV6) + rcu_assign_pointer(geneve->sock6, gs6); +#endif + synchronize_net(); +} + +static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], +struct nlattr *data[], +struct netlink_ext_ack *extack) +{ + struct geneve_dev *geneve = netdev_priv(dev); + struct geneve_sock *gs4, *gs6; + struct ip_tunnel_info info; + bool metadata; + bool use_udp6_rx_checksums; + int err; + + /* If the geneve device is configured for metadata (or externally +* controlled, for example, OVS), then nothing can be changed. +*/ + if (geneve->collect_md) + return -EOPNOTSUPP; + + /* Start with the existing info. */ + memcpy(, >info, sizeof(info)); + metadata = geneve->collect_md; + use_udp6_rx_checksums = geneve->use_udp6_rx_checksums; + err = geneve_nl2info(dev, tb, data, , , +_udp6_rx_checksums, true); + if (err) + return err; + + if (!geneve_dst_addr_equal(>info, )) + dst_cache_reset(_cache); + + geneve_quiesce(geneve, , ); + geneve->info = info; + geneve->collect_md = metadata; + geneve->use_udp6_rx_checksums = use_udp6_rx_checksums; + geneve_unquiesce(geneve, gs4, gs6); + This is nice trick. But it adds check for the socket in datapath. did you explore updating entire device state in single atomic transaction? I did explore, however what I have now seemed like a more concise method to perform changelink operation atomically w.r.t the datapath. That said, there is one thing I could do. Today we already check for the socket in datapath like below: (a) ipv4 datapath today: geneve_xmit_skb(...) | +->geneve_get_v4_rt() | +---> if (!rcu_dereference(geneve->sock4)) return ERR_PTR(-EIO); (b) ipv4 datapath with my current patch: geneve_xmit_skb(...) | +->if (!rcu_dereference(geneve->sock4)) | return -EIO; | +->geneve_get_v4_rt() | +---> if (!rcu_dereference(geneve->sock4)) return ERR_PTR(-EIO); Perhaps, I could piggyback on the already existing check for NULL socket in geneve_get_v4_rt() and avoid the additional check I have added in datapath. (The situation is same for IPv6) regards, ~Girish
Re: [PATCH net-next v2 1/1] geneve: add rtnl changelink support
On 7/19/17 4:51 PM, David Miller wrote: From: Girish Moodalbail <girish.moodalb...@oracle.com> Date: Tue, 18 Jul 2017 16:33:06 -0700 +static inline bool geneve_dst_addr_equal(struct ip_tunnel_info *a, ... +static inline void geneve_quiesce(struct geneve_dev *geneve, ... +static inline void geneve_unquiesce(struct geneve_dev *geneve, Please no inline functions in foo.c files, let the compiler decide. Sure thing. Will do. regards, ~Girish Thanks.
[PATCH net-next v2 1/1] geneve: add rtnl changelink support
This patch adds changelink rtnl operation support for geneve devices and the code changes involve: - add geneve_quiesce() which quiesces the geneve device data path for both TX and RX. This lets us perform the changelink operation atomically w.r.t data path. Also add geneve_unquiesce() to reverse the operation of geneve_quiesce(). - refactor geneve_newlink into geneve_nl2info to be used by both geneve_newlink and geneve_changelink - geneve_nl2info takes a changelink boolean argument to isolate changelink checks. - Allow changing only a few attributes (ttl, tos, and remote tunnel endpoint IP address (within the same address family)): - return -EOPNOTSUPP for attributes that cannot be changed for now. Incremental patches can make the non-supported one available in the future if needed. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- v0 -> v1: - added geneve_quiesce() and geneve_unquiesce() functions to perform the changelink operation atomically w.r.t data path --- drivers/net/geneve.c | 192 +-- 1 file changed, 157 insertions(+), 35 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index de8156c..829f541 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -827,6 +827,9 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 df; int err; + if (!gs4) + return -EIO; + rt = geneve_get_v4_rt(skb, dev, , info); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -866,6 +869,9 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; + if (!gs6) + return -EIO; + dst = geneve_get_v6_dst(skb, dev, , info); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -1140,6 +1146,15 @@ static bool is_tnl_info_zero(const struct ip_tunnel_info *info) return true; } +static inline bool geneve_dst_addr_equal(struct ip_tunnel_info *a, +struct ip_tunnel_info *b) +{ + if (ip_tunnel_info_af(a) == AF_INET) + return a->key.u.ipv4.dst == b->key.u.ipv4.dst; + else + return ipv6_addr_equal(>key.u.ipv6.dst, >key.u.ipv6.dst); +} + static int geneve_configure(struct net *net, struct net_device *dev, const struct ip_tunnel_info *info, bool metadata, bool ipv6_rx_csum) @@ -1197,24 +1212,22 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) info->key.tp_dst = htons(dst_port); } -static int geneve_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) +static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], struct ip_tunnel_info *info, + bool *metadata, bool *use_udp6_rx_checksums, + bool changelink) { - bool use_udp6_rx_checksums = false; - struct ip_tunnel_info info; - bool metadata = false; - - init_tnl_info(, GENEVE_UDP_PORT); - if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) return -EINVAL; if (data[IFLA_GENEVE_REMOTE]) { - info.key.u.ipv4.dst = + if (changelink && (ip_tunnel_info_af(info) == AF_INET6)) + return -EOPNOTSUPP; + + info->key.u.ipv4.dst = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); - if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) { + if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) { netdev_dbg(dev, "multicast remote is unsupported\n"); return -EINVAL; } @@ -1222,21 +1235,24 @@ static int geneve_newlink(struct net *net, struct net_device *dev, if (data[IFLA_GENEVE_REMOTE6]) { #if IS_ENABLED(CONFIG_IPV6) - info.mode = IP_TUNNEL_INFO_IPV6; - info.key.u.ipv6.dst = + if (changelink && (ip_tunnel_info_af(info) == AF_INET)) + return -EOPNOTSUPP; + + info->mode = IP_TUNNEL_INFO_IPV6; + info->key.u.ipv6.dst = nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); - if (ipv6_addr_type() & + if (ipv6_addr_type(>key.u.ipv6.dst) & IPV6_ADDR_LINKLOCAL) { netdev_dbg(dev, "link-local remote is unsupported\n"); return -EINVAL; } - if (ipv6_addr_is_multicast()) { + if (ipv6_addr_is_multicast(>
Re: [PATCH net 0/4] macvlan: Fix some issues with changing mac addresses
On 6/16/17 6:36 AM, Vladislav Yasevich wrote: There are some issues in macvlan wrt to changing it's mac address. * An error is returned in the specified address is the same as an already assigned address. * In passthru mode, the mac address of the macvlan device doesn't change. * After changing the mac address of a passthru macvlan and then removing it, the mac address of the physical device remains changed. This patch series attempts to resolve these issues. Thanks -vlad Vladislav Yasevich (4): macvlan: Do not return error when setting the same mac address macvlan: Fix passthru macvlan mac address inheritance macvlan: convert port passthru to flags. Above 3 patches looks good to me, so Reviewed-by: Girish Moodalbail <girish.moodalb...@oracle.com> macvlan: Let passthru macvlan correctly restore lower mac address However, I have few questions/comments on the above patch. thanks, ~Girish drivers/net/macvlan.c | 85 ++- 1 file changed, 71 insertions(+), 14 deletions(-)
Re: [PATCH net 4/4] macvlan: Let passthru macvlan correctly restore lower mac address
Sorry, it took sometime to wrap around this patch series since they all change one file and at times the same function :). On 6/16/17 6:36 AM, Vladislav Yasevich wrote: Passthru macvlans directly change the mac address of the lower level device. That's OK, but after the macvlan is deleted, the lower device is left with changed address and one needs to reboot to bring back the origina HW addresses. s/origina/original/ This scenario is actually quite common with passthru macvtap devices. This patch attempts to solve this, by storing the mac address of the lower device in macvlan_port structure and keeping track of it through the changes. After this patch, any changes to the lower device mac address done trough the macvlan device, will be reverted back. Any changes done directly to the lower device mac address will be kept. Signed-off-by: Vladislav Yasevich--- drivers/net/macvlan.c | 47 --- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index eb956ff..c551165 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -40,6 +40,7 @@ #define MACVLAN_BC_QUEUE_LEN 1000 #define MACVLAN_F_PASSTHRU 1 +#define MACVLAN_F_ADDRCHANGE 2 struct macvlan_port { struct net_device *dev; @@ -51,6 +52,7 @@ struct macvlan_port { int count; struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); + unsigned char perm_addr[ETH_ALEN]; }; struct macvlan_source_entry { @@ -78,6 +80,21 @@ static inline void macvlan_set_passthru(struct macvlan_port *port) port->flags |= MACVLAN_F_PASSTHRU; } +static inline bool macvlan_addr_change(const struct macvlan_port *port) +{ + return port->flags & MACVLAN_F_ADDRCHANGE; +} + +static inline void macvlan_set_addr_change(struct macvlan_port *port) +{ + port->flags |= MACVLAN_F_ADDRCHANGE; +} + +static inline void macvlan_clear_addr_change(struct macvlan_port *port) +{ + port->flags &= ~MACVLAN_F_ADDRCHANGE; +} + /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { @@ -193,11 +210,11 @@ static void macvlan_hash_change_addr(struct macvlan_dev *vlan, static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { - /* Test to see if the specified multicast address is + /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. */ - if (!macvlan_passthru(port) && + if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; @@ -685,6 +702,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; + struct macvlan_port *port = vlan->port; int err; if (!(dev->flags & IFF_UP)) { @@ -695,7 +713,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) if (macvlan_addr_busy(vlan->port, addr)) return -EBUSY; - if (!macvlan_passthru(vlan->port)) { + if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; @@ -705,6 +723,15 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) macvlan_hash_change_addr(vlan, addr); } + if (macvlan_passthru(port) && !macvlan_addr_change(port)) { + /* Since addr_change isn't set, we are here due to lower +* device change. Save the lower-dev address so we can +* restore it later. +*/ + ether_addr_copy(vlan->port->perm_addr, + dev->dev_addr); Did you meant to copy `addr' here? Since dev->dev_addr is that of the macvlan device whilst `addr' is from the lower parent device. Thanks, ~Girish
[PATCH net-next v2] geneve: add missing rx stats accounting
There are few places on the receive path where packet drops and packet errors were not accounted for. This patch fixes that issue. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- v0 -> v1: -modified to use canonical post-increment "x++" --- drivers/net/geneve.c | 36 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6ebb0f5..ff626db 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -212,6 +212,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, struct genevehdr *gnvh = geneve_hdr(skb); struct metadata_dst *tun_dst = NULL; struct pcpu_sw_netstats *stats; + unsigned int len; int err = 0; void *oiph; @@ -225,8 +226,10 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags, vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); - if (!tun_dst) + if (!tun_dst) { + geneve->dev->stats.rx_dropped++; goto drop; + } /* Update tunnel dst according to Geneve options. */ ip_tunnel_info_opts_set(_dst->u.tun_info, gnvh->options, gnvh->opt_len * 4); @@ -234,8 +237,11 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, /* Drop packets w/ critical options, * since we don't support any... */ - if (gnvh->critical) + if (gnvh->critical) { + geneve->dev->stats.rx_frame_errors++; + geneve->dev->stats.rx_errors++; goto drop; + } } skb_reset_mac_header(skb); @@ -246,8 +252,10 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, skb_dst_set(skb, _dst->dst); /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) + if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { + geneve->dev->stats.rx_errors++; goto drop; + } oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -279,13 +287,15 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } } - stats = this_cpu_ptr(geneve->dev->tstats); - u64_stats_update_begin(>syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(>syncp); - - gro_cells_receive(>gro_cells, skb); + len = skb->len; + err = gro_cells_receive(>gro_cells, skb); + if (likely(err == NET_RX_SUCCESS)) { + stats = this_cpu_ptr(geneve->dev->tstats); + u64_stats_update_begin(>syncp); + stats->rx_packets++; + stats->rx_bytes += len; + u64_stats_update_end(>syncp); + } return; drop: /* Consume bad packet */ @@ -334,7 +344,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct geneve_sock *gs; int opts_len; - /* Need Geneve and inner Ethernet header to be present */ + /* Need UDP and Geneve header to be present */ if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) goto drop; @@ -357,8 +367,10 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, htons(ETH_P_TEB), -!net_eq(geneve->net, dev_net(geneve->dev +!net_eq(geneve->net, dev_net(geneve->dev { + geneve->dev->stats.rx_dropped++; goto drop; + } geneve_rx(geneve, gs, skb); return 0; -- 1.8.3.1
[PATCH] geneve: add missing rx stats accounting
There are few places on the receive path where packet drops and packet errors were not accounted for. This patch fixes that issue. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/geneve.c | 36 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6ebb0f5..34a4d45 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -212,6 +212,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, struct genevehdr *gnvh = geneve_hdr(skb); struct metadata_dst *tun_dst = NULL; struct pcpu_sw_netstats *stats; + unsigned int len; int err = 0; void *oiph; @@ -225,8 +226,10 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags, vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); - if (!tun_dst) + if (!tun_dst) { + ++geneve->dev->stats.rx_dropped; goto drop; + } /* Update tunnel dst according to Geneve options. */ ip_tunnel_info_opts_set(_dst->u.tun_info, gnvh->options, gnvh->opt_len * 4); @@ -234,8 +237,11 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, /* Drop packets w/ critical options, * since we don't support any... */ - if (gnvh->critical) + if (gnvh->critical) { + ++geneve->dev->stats.rx_frame_errors; + ++geneve->dev->stats.rx_errors; goto drop; + } } skb_reset_mac_header(skb); @@ -246,8 +252,10 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, skb_dst_set(skb, _dst->dst); /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) + if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { + ++geneve->dev->stats.rx_errors; goto drop; + } oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -279,13 +287,15 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } } - stats = this_cpu_ptr(geneve->dev->tstats); - u64_stats_update_begin(>syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(>syncp); - - gro_cells_receive(>gro_cells, skb); + len = skb->len; + err = gro_cells_receive(>gro_cells, skb); + if (likely(err == NET_RX_SUCCESS)) { + stats = this_cpu_ptr(geneve->dev->tstats); + u64_stats_update_begin(>syncp); + stats->rx_packets++; + stats->rx_bytes += len; + u64_stats_update_end(>syncp); + } return; drop: /* Consume bad packet */ @@ -334,7 +344,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct geneve_sock *gs; int opts_len; - /* Need Geneve and inner Ethernet header to be present */ + /* Need UDP and Geneve header to be present */ if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) goto drop; @@ -357,8 +367,10 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, htons(ETH_P_TEB), -!net_eq(geneve->net, dev_net(geneve->dev +!net_eq(geneve->net, dev_net(geneve->dev { + ++geneve->dev->stats.rx_dropped; goto drop; + } geneve_rx(geneve, gs, skb); return 0; -- 1.8.3.1
[PATCH net-next] macsec: double accounting of dropped rx/tx packets
The macsec implementation shouldn't account for rx/tx packets that are dropped in the netdev framework. The netdev framework itself accounts for such packets by atomically updating struct net_device`rx_dropped and struct net_device`tx_dropped fields. Later on when the stats for macsec link is retrieved, the packets dropped in netdev framework will be included in dev_get_stats() after calling macsec.c`macsec_get_stats64() Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/macsec.c | 15 +++ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index cdc347b..91642fd 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -588,8 +588,6 @@ static void count_tx(struct net_device *dev, int ret, int len) stats->tx_packets++; stats->tx_bytes += len; u64_stats_update_end(>syncp); - } else { - dev->stats.tx_dropped++; } } @@ -883,7 +881,7 @@ static void macsec_decrypt_done(struct crypto_async_request *base, int err) struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; struct macsec_rx_sc *rx_sc = rx_sa->sc; - int len, ret; + int len; u32 pn; aead_request_free(macsec_skb_cb(skb)->req); @@ -904,11 +902,8 @@ static void macsec_decrypt_done(struct crypto_async_request *base, int err) macsec_reset_skb(skb, macsec->secy.netdev); len = skb->len; - ret = gro_cells_receive(>gro_cells, skb); - if (ret == NET_RX_SUCCESS) + if (gro_cells_receive(>gro_cells, skb) == NET_RX_SUCCESS) count_rx(dev, len); - else - macsec->secy.netdev->stats.rx_dropped++; rcu_read_unlock_bh(); @@ -1037,7 +1032,6 @@ static void handle_not_macsec(struct sk_buff *skb) */ list_for_each_entry_rcu(macsec, >secys, secys) { struct sk_buff *nskb; - int ret; struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { @@ -1054,13 +1048,10 @@ static void handle_not_macsec(struct sk_buff *skb) nskb->dev = macsec->secy.netdev; - ret = netif_rx(nskb); - if (ret == NET_RX_SUCCESS) { + if (netif_rx(nskb) == NET_RX_SUCCESS) { u64_stats_update_begin(_stats->syncp); secy_stats->stats.InPktsUntagged++; u64_stats_update_end(_stats->syncp); - } else { - macsec->secy.netdev->stats.rx_dropped++; } } -- 1.8.3.1
Re: [PATCH net-next] geneve: add rtnl changelink support
TL DR; There is indeed a race between geneve_changelink() and geneve transmit path w.r.t attributes being changed and the old value of those attributes being used in the transmit patch. I will resubmit V2 of the patch with those issues addressed. Thanks! Please see in-line for my other comments.. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/geneve.c | 149 --- 1 file changed, 117 insertions(+), 32 deletions(-) ... @@ -1169,45 +1181,58 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) info->key.tp_dst = htons(dst_port); } -static int geneve_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], struct ip_tunnel_info *info, + bool *metadata, bool *use_udp6_rx_checksums, + bool changelink) { - bool use_udp6_rx_checksums = false; - struct ip_tunnel_info info; - bool metadata = false; + struct geneve_dev *geneve = netdev_priv(dev); - init_tnl_info(, GENEVE_UDP_PORT); + if (changelink) { + /* if changelink operation, start with old existing info */ + memcpy(info, >info, sizeof(*info)); + *metadata = geneve->collect_md; + *use_udp6_rx_checksums = geneve->use_udp6_rx_checksums; + } else { + init_tnl_info(info, GENEVE_UDP_PORT); + } if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) return -EINVAL; if (data[IFLA_GENEVE_REMOTE]) { - info.key.u.ipv4.dst = + info->key.u.ipv4.dst = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); - if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) { + if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) { netdev_dbg(dev, "multicast remote is unsupported\n"); return -EINVAL; } + if (changelink && + ip_tunnel_info_af(>info) == AF_INET6) { + info->mode &= ~IP_TUNNEL_INFO_IPV6; + info->key.tun_flags &= ~TUNNEL_CSUM; + *use_udp6_rx_checksums = false; + } This allows changelink to change ipv4 address but there are no changes made to the geneve tunnel port hash table after this update. The following code in geneve_changelink() does what you are asking for +if (!geneve_dst_addr_equal(>info, )) +dst_cache_reset(_cache); geneve_nl2info() accrues all the allowed changes to be made and captures it in ip_tunnel_info structure and then the above code in geneve_changelink() ensures that all the route cache associated with the old remote address are released when the next lookup occurs. We also need to check to see if there is any conflicts with existing ports. This is not needed since we don't support changing the remote port. What is the barrier between the rx/tx threads and changelink process? There is an issue here like you pointed out (thanks!). Will fix that issue. } if (data[IFLA_GENEVE_REMOTE6]) { #if IS_ENABLED(CONFIG_IPV6) - info.mode = IP_TUNNEL_INFO_IPV6; - info.key.u.ipv6.dst = + info->mode = IP_TUNNEL_INFO_IPV6; + info->key.u.ipv6.dst = nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); - if (ipv6_addr_type() & + if (ipv6_addr_type(>key.u.ipv6.dst) & IPV6_ADDR_LINKLOCAL) { netdev_dbg(dev, "link-local remote is unsupported\n"); return -EINVAL; } - if (ipv6_addr_is_multicast()) { + if (ipv6_addr_is_multicast(>key.u.ipv6.dst)) { netdev_dbg(dev, "multicast remote is unsupported\n"); return -EINVAL; } - info.key.tun_flags |= TUNNEL_CSUM; - use_udp6_rx_checksums = true; + info->key.tun_flags |= TUNNEL_CSUM; + *use_udp6_rx_checksums = true; Same here. We need to check/fix the geneve tunnel hash table according to new IP address. This is taken care by the call to dst_cache_reset() whenever the remote address changes. This function already takes care of races and contentions 8<-8<-- /** * dst_cache_reset - invalidate the cache contents * @dst_cache: the cache * * This do not free the cached dst to avoid races and contentions. * the dst will be freed on later cache lookup. */ s
Re: [PATCH net-next] geneve: add rtnl changelink support
On 5/16/17 12:31 PM, David Miller wrote: From: Girish Moodalbail <girish.moodalb...@oracle.com> Date: Mon, 15 May 2017 10:47:04 -0700 if (data[IFLA_GENEVE_REMOTE]) { - info.key.u.ipv4.dst = + info->key.u.ipv4.dst = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); - if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) { + if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) { netdev_dbg(dev, "multicast remote is unsupported\n"); return -EINVAL; } + if (changelink && + ip_tunnel_info_af(>info) == AF_INET6) { + info->mode &= ~IP_TUNNEL_INFO_IPV6; + info->key.tun_flags &= ~TUNNEL_CSUM; + *use_udp6_rx_checksums = false; + } } I don't understand this "changelink" guarded code, why do you need to clear all of this state out if the existing tunnel type if AF_INET6 and only when doing a changelink? In any event, I think you need to add a comment explaining it. If geneve link was overlayed over IPv6 network and now the user modifies the link to be over IPv4 network by doing # ip link set gen0 type geneve id 100 remote 192.168.13.2 Then we will need to - reset info->mode to be not IPv6 type - the default for UDP checksum over IPv4 is 'no', so reset that and - set use_udp6_rx_checksums to its default value which is false. I will capture the above information concisely in a comment around that 'changelink' guard. thanks, ~Girish
[PATCH net-next] geneve: add rtnl changelink support
This patch adds changelink rtnl operation support for geneve devices. Code changes involve: - refactor geneve_newlink into geneve_nl2info to be used by both geneve_newlink and geneve_changelink - geneve_nl2info takes a changelink boolean argument to isolate changelink checks and updates. - Allow changing only a few attributes: - return -EOPNOTSUPP for attributes that cannot be changed for now. Incremental patches can make the non-supported one available in the future if needed. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/geneve.c | 149 --- 1 file changed, 117 insertions(+), 32 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index dec5d56..6528910 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1112,6 +1112,18 @@ static bool is_tnl_info_zero(const struct ip_tunnel_info *info) return true; } +static inline bool geneve_dst_addr_equal(struct ip_tunnel_info *a, +struct ip_tunnel_info *b) +{ + if (ip_tunnel_info_af(a) != ip_tunnel_info_af(b)) + return false; + + if (ip_tunnel_info_af(a) == AF_INET) + return a->key.u.ipv4.dst == b->key.u.ipv4.dst; + else + return ipv6_addr_equal(>key.u.ipv6.dst, >key.u.ipv6.dst); +} + static int geneve_configure(struct net *net, struct net_device *dev, const struct ip_tunnel_info *info, bool metadata, bool ipv6_rx_csum) @@ -1169,45 +1181,58 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) info->key.tp_dst = htons(dst_port); } -static int geneve_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], struct ip_tunnel_info *info, + bool *metadata, bool *use_udp6_rx_checksums, + bool changelink) { - bool use_udp6_rx_checksums = false; - struct ip_tunnel_info info; - bool metadata = false; + struct geneve_dev *geneve = netdev_priv(dev); - init_tnl_info(, GENEVE_UDP_PORT); + if (changelink) { + /* if changelink operation, start with old existing info */ + memcpy(info, >info, sizeof(*info)); + *metadata = geneve->collect_md; + *use_udp6_rx_checksums = geneve->use_udp6_rx_checksums; + } else { + init_tnl_info(info, GENEVE_UDP_PORT); + } if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) return -EINVAL; if (data[IFLA_GENEVE_REMOTE]) { - info.key.u.ipv4.dst = + info->key.u.ipv4.dst = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); - if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) { + if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) { netdev_dbg(dev, "multicast remote is unsupported\n"); return -EINVAL; } + if (changelink && + ip_tunnel_info_af(>info) == AF_INET6) { + info->mode &= ~IP_TUNNEL_INFO_IPV6; + info->key.tun_flags &= ~TUNNEL_CSUM; + *use_udp6_rx_checksums = false; + } } if (data[IFLA_GENEVE_REMOTE6]) { #if IS_ENABLED(CONFIG_IPV6) - info.mode = IP_TUNNEL_INFO_IPV6; - info.key.u.ipv6.dst = + info->mode = IP_TUNNEL_INFO_IPV6; + info->key.u.ipv6.dst = nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); - if (ipv6_addr_type() & + if (ipv6_addr_type(>key.u.ipv6.dst) & IPV6_ADDR_LINKLOCAL) { netdev_dbg(dev, "link-local remote is unsupported\n"); return -EINVAL; } - if (ipv6_addr_is_multicast()) { + if (ipv6_addr_is_multicast(>key.u.ipv6.dst)) { netdev_dbg(dev, "multicast remote is unsupported\n"); return -EINVAL; } - info.key.tun_flags |= TUNNEL_CSUM; - use_udp6_rx_checksums = true; + info->key.tun_flags |= TUNNEL_CSUM; + *use_udp6_rx_checksums = true; #else return -EPFNOSUPPORT; #endif @@ -1216,48 +1241,107 @@ static int geneve_newlink(struct net *net, struct net_device *dev, if (data[IFLA_GENEVE_ID]) { __u32 vni; __u8 tvni[3]; + _
Re: [PATCH iproute2 v2 1/1] vxlan: Add support for modifying vxlan device attributes
On 5/7/17 7:40 AM, Roman Mashak wrote: Girish Moodalbail <girish.moodalb...@oracle.com> writes: [...] ip/iplink_vxlan.c | 251 +++--- 1 file changed, 143 insertions(+), 108 deletions(-) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index b4ebb13..2bd619d 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -21,6 +21,8 @@ #include "utils.h" #include "ip_common.h" +#define VXLAN_ATTRSET(attrs, type) (((attrs) & (1L << (type))) != 0) I think you can drop '!= 0' part in the macro. Sure it can be done that way as well. However, I have seen both forms in use in iproute2 code base, so I would like to keep this as is unless you feel strongly about it. Furthermore, running 'checkpatch.pl --strict' on the patch didn't complain too. regards, ~Girish [...] +static void check_duparg(__u64 *attrs, int type, const char *key, +const char *argv) +{ + if (!VXLAN_ATTRSET(*attrs, type)) { + *attrs |= (1L << type); + return; + } + duparg2(key, argv); +} [...]
Re: [PATCH] drivers: net: wimax: i2400m: i2400m-usb: Use time_after for time comparison
On 5/8/17 2:26 PM, David Miller wrote: From: Karim EshapaDate: Mon, 8 May 2017 18:58:02 +0200 diff --git a/drivers/net/wimax/i2400m/i2400m-usb.h b/drivers/net/wimax/i2400m/i2400m-usb.h index 649ecad..6fc941c 100644 --- a/drivers/net/wimax/i2400m/i2400m-usb.h +++ b/drivers/net/wimax/i2400m/i2400m-usb.h @@ -131,7 +131,7 @@ static inline int edc_inc(struct edc *edc, u16 max_err, u16 timeframe) unsigned long now; now = jiffies; - if (now - edc->timestart > timeframe) { + if (time_after(now - edc->timestart, (unsigned long)timeframe)) { This is far from correct. time_after() compares two "jiffies" timestamp values. The second argument here is not a jiffies timestamp value. Perhaps, what is needed here is: + if (time_after(jiffies, edc->timestart + timeframe)) { where in 'timeframe' is set in HZ in all the callers I checked (for the most part it is set to EDC_ERROR_TIMEFRAME which is 1HZ). ~Girish
[PATCH net-next] geneve: add rtnl changelink support
This patch adds changelink rtnl operation support for geneve devices. Code changes involve: - refactor geneve_newlink into geneve_nl2info to be used by both geneve_newlink and geneve_changelink - geneve_nl2info takes a changelink boolean argument to isolate changelink checks and updates. - Allow changing only a few attributes: - return -EOPNOTSUPP for attributes that cannot be changed for now. Incremental patches can make the non-supported one available in the future if needed. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/geneve.c | 149 --- 1 file changed, 117 insertions(+), 32 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index dec5d56..6528910 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1112,6 +1112,18 @@ static bool is_tnl_info_zero(const struct ip_tunnel_info *info) return true; } +static inline bool geneve_dst_addr_equal(struct ip_tunnel_info *a, +struct ip_tunnel_info *b) +{ + if (ip_tunnel_info_af(a) != ip_tunnel_info_af(b)) + return false; + + if (ip_tunnel_info_af(a) == AF_INET) + return a->key.u.ipv4.dst == b->key.u.ipv4.dst; + else + return ipv6_addr_equal(>key.u.ipv6.dst, >key.u.ipv6.dst); +} + static int geneve_configure(struct net *net, struct net_device *dev, const struct ip_tunnel_info *info, bool metadata, bool ipv6_rx_csum) @@ -1169,45 +1181,58 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) info->key.tp_dst = htons(dst_port); } -static int geneve_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int geneve_nl2info(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], struct ip_tunnel_info *info, + bool *metadata, bool *use_udp6_rx_checksums, + bool changelink) { - bool use_udp6_rx_checksums = false; - struct ip_tunnel_info info; - bool metadata = false; + struct geneve_dev *geneve = netdev_priv(dev); - init_tnl_info(, GENEVE_UDP_PORT); + if (changelink) { + /* if changelink operation, start with old existing info */ + memcpy(info, >info, sizeof(*info)); + *metadata = geneve->collect_md; + *use_udp6_rx_checksums = geneve->use_udp6_rx_checksums; + } else { + init_tnl_info(info, GENEVE_UDP_PORT); + } if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) return -EINVAL; if (data[IFLA_GENEVE_REMOTE]) { - info.key.u.ipv4.dst = + info->key.u.ipv4.dst = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); - if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) { + if (IN_MULTICAST(ntohl(info->key.u.ipv4.dst))) { netdev_dbg(dev, "multicast remote is unsupported\n"); return -EINVAL; } + if (changelink && + ip_tunnel_info_af(>info) == AF_INET6) { + info->mode &= ~IP_TUNNEL_INFO_IPV6; + info->key.tun_flags &= ~TUNNEL_CSUM; + *use_udp6_rx_checksums = false; + } } if (data[IFLA_GENEVE_REMOTE6]) { #if IS_ENABLED(CONFIG_IPV6) - info.mode = IP_TUNNEL_INFO_IPV6; - info.key.u.ipv6.dst = + info->mode = IP_TUNNEL_INFO_IPV6; + info->key.u.ipv6.dst = nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); - if (ipv6_addr_type() & + if (ipv6_addr_type(>key.u.ipv6.dst) & IPV6_ADDR_LINKLOCAL) { netdev_dbg(dev, "link-local remote is unsupported\n"); return -EINVAL; } - if (ipv6_addr_is_multicast()) { + if (ipv6_addr_is_multicast(>key.u.ipv6.dst)) { netdev_dbg(dev, "multicast remote is unsupported\n"); return -EINVAL; } - info.key.tun_flags |= TUNNEL_CSUM; - use_udp6_rx_checksums = true; + info->key.tun_flags |= TUNNEL_CSUM; + *use_udp6_rx_checksums = true; #else return -EPFNOSUPPORT; #endif @@ -1216,48 +1241,107 @@ static int geneve_newlink(struct net *net, struct net_device *dev, if (data[IFLA_GENEVE_ID]) { __u32 vni; __u8 tvni[3]; + _
[PATCH iproute2 v2 0/1] vxlan: support for modifying vxlan device
Hello all, This patch adds support for modifying VXLAN device attributes. I have refactored the vxlan_parse_opt() function to be more readable and not use lot of bool variables. I have tested my changes by running Linux Test Project's VXLAN testcases, and I didn't see any regression. --- v1->v2 - refactored vxlan_parse_opt() to not to use a bunch of foo_set variables Girish Moodalbail (1): vxlan: Add support for modifying vxlan device attributes ip/iplink_vxlan.c | 251 +++--- 1 file changed, 143 insertions(+), 108 deletions(-) -- 1.8.3.1
[PATCH iproute2 v2 1/1] vxlan: Add support for modifying vxlan device attributes
Ability to change vxlan device attributes was added to kernel through commit 8bcdc4f3a20b ("vxlan: add changelink support"), however one cannot do the same through ip(8) command. Changing the allowed vxlan device attributes using 'ip link set dev type vxlan ' currently fails with 'operation not supported' error. This failure is due to the incorrect rtnetlink message construction for the 'ip link set' operation. The vxlan_parse_opt() callback function is called for parsing options for both 'ip link add' and 'ip link set'. For the 'add' case, we pass down default values for those attributes that were not provided as CLI options. However, for the 'set' case we should be only passing down the explicitly provided attributes and not any other (default) attributes. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- ip/iplink_vxlan.c | 251 +++--- 1 file changed, 143 insertions(+), 108 deletions(-) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index b4ebb13..2bd619d 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -21,6 +21,8 @@ #include "utils.h" #include "ip_common.h" +#define VXLAN_ATTRSET(attrs, type) (((attrs) & (1L << (type))) != 0) + static void print_explain(FILE *f) { fprintf(f, @@ -59,54 +61,50 @@ static void explain(void) print_explain(stderr); } +static void check_duparg(__u64 *attrs, int type, const char *key, +const char *argv) +{ + if (!VXLAN_ATTRSET(*attrs, type)) { + *attrs |= (1L << type); + return; + } + duparg2(key, argv); +} + static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, struct nlmsghdr *n) { __u32 vni = 0; - int vni_set = 0; - __u32 saddr = 0; __u32 gaddr = 0; __u32 daddr = 0; - struct in6_addr saddr6 = IN6ADDR_ANY_INIT; struct in6_addr gaddr6 = IN6ADDR_ANY_INIT; struct in6_addr daddr6 = IN6ADDR_ANY_INIT; - unsigned int link = 0; - __u8 tos = 0; - __u8 ttl = 0; - __u32 label = 0; __u8 learning = 1; - __u8 proxy = 0; - __u8 rsc = 0; - __u8 l2miss = 0; - __u8 l3miss = 0; - __u8 noage = 0; - __u32 age = 0; - __u32 maxaddr = 0; __u16 dstport = 0; - __u8 udpcsum = 0; - bool udpcsum_set = false; - __u8 udp6zerocsumtx = 0; - bool udp6zerocsumtx_set = false; - __u8 udp6zerocsumrx = 0; - bool udp6zerocsumrx_set = false; - __u8 remcsumtx = 0; - __u8 remcsumrx = 0; __u8 metadata = 0; - __u8 gbp = 0; - __u8 gpe = 0; - int dst_port_set = 0; - struct ifla_vxlan_port_range range = { 0, 0 }; + __u64 attrs = 0; + bool set_op = (n->nlmsg_type == RTM_NEWLINK && + !(n->nlmsg_flags & NLM_F_CREATE)); while (argc > 0) { if (!matches(*argv, "id") || !matches(*argv, "vni")) { + /* We will add ID attribute outside of the loop since we +* need to consider metadata information as well. +*/ NEXT_ARG(); + check_duparg(, IFLA_VXLAN_ID, "id", *argv); if (get_u32(, *argv, 0) || vni >= 1u << 24) invarg("invalid id", *argv); - vni_set = 1; } else if (!matches(*argv, "group")) { + if (daddr || !IN6_IS_ADDR_UNSPECIFIED()) { + fprintf(stderr, "vxlan: both group and remote"); + fprintf(stderr, " cannot be specified\n"); + return -1; + } NEXT_ARG(); + check_duparg(, IFLA_VXLAN_GROUP, "group", *argv); if (!inet_get_addr(*argv, , )) { fprintf(stderr, "Invalid address \"%s\"\n", *argv); return -1; @@ -114,7 +112,13 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, if (!IN6_IS_ADDR_MULTICAST() && !IN_MULTICAST(ntohl(gaddr))) invarg("invalid group address", *argv); } else if (!matches(*argv, "remote")) { + if (gaddr || !IN6_IS_ADDR_UNSPECIFIED()) { + fprintf(stderr, "vxlan: both group and remote"); + fprintf(stderr, " cannot be specified\n"); + return -1; +
Re: [PATCH iproute2] vxlan: Add support for modifying vxlan device attributes
On 5/4/17 5:07 PM, Stephen Hemminger wrote: On Thu, 4 May 2017 14:46:34 -0700 Girish Moodalbail <girish.moodalb...@oracle.com> wrote: Ability to change vxlan device attributes was added to kernel through commit 8bcdc4f3a20b ("vxlan: add changelink support"), however one cannot do the same through ip(8) command. Changing the allowed vxlan device attributes using 'ip link set dev type vxlan ' currently fails with 'operation not supported' error. This failure is due to the incorrect rtnetlink message construction for the 'ip link set' operation. The vxlan_parse_opt() callback function is called for parsing options for both 'ip link add' and 'ip link set'. For the 'add' case, we pass down default values for those attributes that were not provided as CLI options. However, for the 'set' case we should be only passing down the explicitly provided attributes and not any other (default) attributes. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- All these foo_set variables are ugly. This looks almost like machine generated code. It doesn't read well. I thought about it, however I wasn't sure if refactoring that whole routine will be well received so I decided to follow the current model that already existed in iplink_vxlan.c. I will re-submit a patch cleaning up that whole routine. thanks, ~Girish
[PATCH iproute2] vxlan: Add support for modifying vxlan device attributes
Ability to change vxlan device attributes was added to kernel through commit 8bcdc4f3a20b ("vxlan: add changelink support"), however one cannot do the same through ip(8) command. Changing the allowed vxlan device attributes using 'ip link set dev type vxlan ' currently fails with 'operation not supported' error. This failure is due to the incorrect rtnetlink message construction for the 'ip link set' operation. The vxlan_parse_opt() callback function is called for parsing options for both 'ip link add' and 'ip link set'. For the 'add' case, we pass down default values for those attributes that were not provided as CLI options. However, for the 'set' case we should be only passing down the explicitly provided attributes and not any other (default) attributes. Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- ip/iplink_vxlan.c | 73 --- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index b4ebb13..c8959aa 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -72,16 +72,25 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, struct in6_addr daddr6 = IN6ADDR_ANY_INIT; unsigned int link = 0; __u8 tos = 0; + bool tos_set = false; __u8 ttl = 0; + bool ttl_set = false; __u32 label = 0; + bool label_set = false; __u8 learning = 1; + bool learning_set = false; __u8 proxy = 0; + bool proxy_set = false; __u8 rsc = 0; + bool rsc_set = false; __u8 l2miss = 0; + bool l2miss_set = false; __u8 l3miss = 0; + bool l3miss_set = false; __u8 noage = 0; __u32 age = 0; __u32 maxaddr = 0; + bool maxaddr_set = false; __u16 dstport = 0; __u8 udpcsum = 0; bool udpcsum_set = false; @@ -90,12 +99,17 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, __u8 udp6zerocsumrx = 0; bool udp6zerocsumrx_set = false; __u8 remcsumtx = 0; + bool remcsumtx_set = false; __u8 remcsumrx = 0; + bool remcsumrx_set = false; __u8 metadata = 0; + bool metadata_set = false; __u8 gbp = 0; __u8 gpe = 0; int dst_port_set = 0; struct ifla_vxlan_port_range range = { 0, 0 }; + bool set_op = (n->nlmsg_type == RTM_NEWLINK && + !(n->nlmsg_flags & NLM_F_CREATE)); while (argc > 0) { if (!matches(*argv, "id") || @@ -152,6 +166,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, invarg("TTL must be <= 255", *argv); ttl = uval; } + ttl_set = true; } else if (!matches(*argv, "tos") || !matches(*argv, "dsfield")) { __u32 uval; @@ -163,6 +178,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, tos = uval; } else tos = 1; + tos_set = true; } else if (!matches(*argv, "label") || !matches(*argv, "flowlabel")) { __u32 uval; @@ -172,6 +188,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, (uval & ~LABEL_MAX_MASK)) invarg("invalid flowlabel", *argv); label = htonl(uval); + label_set = true; } else if (!matches(*argv, "ageing")) { NEXT_ARG(); if (strcmp(*argv, "none") == 0) @@ -184,6 +201,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, maxaddr = 0; else if (get_u32(, *argv, 0)) invarg("max addresses", *argv); + maxaddr_set = true; } else if (!matches(*argv, "port") || !matches(*argv, "srcport")) { NEXT_ARG(); @@ -199,24 +217,34 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, dst_port_set = 1; } else if (!matches(*argv, "nolearning")) { learning = 0; + learning_set = true; } else if (!matches(*argv, "learning")) { learning = 1; + learning_set = true; } else if (!matches(*argv, "noproxy")) {
Re: struct ip vs struct iphdr
On 5/4/17 9:42 AM, Oleg wrote: Hi, all. It seems struct ip and struct iphdr are similar: struct ip, despite of it name, doesn't contain anything but ip header. So, my noob question, what is the difference between them? Also, see this: http://stackoverflow.com/questions/42840636/difference-between-struct-ip-and-struct-iphdr Thanks.
[PATCH net-next] geneve: fix incorrect setting of UDP checksum flag
Creating a geneve link with 'udpcsum' set results in a creation of link for which UDP checksum will NOT be computed on outbound packets, as can be seen below. 11: gen0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN link/ether c2:85:27:b6:b4:15 brd ff:ff:ff:ff:ff:ff promiscuity 0 geneve id 200 remote 192.168.13.1 dstport 6081 noudpcsum Similarly, creating a link with 'noudpcsum' set results in a creation of link for which UDP checksum will be computed on outbound packets. Fixes: 9b4437a5b870 ("geneve: Unify LWT and netdev handling.") Signed-off-by: Girish Moodalbail <girish.moodalb...@oracle.com> --- drivers/net/geneve.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 7074b40..dec5d56 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1244,7 +1244,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev, metadata = true; if (data[IFLA_GENEVE_UDP_CSUM] && - !nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) + nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) info.key.tun_flags |= TUNNEL_CSUM; if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] && -- 1.8.3.1