date:20180131

[vhost:vhost 11/22] drivers//firmware/qemu_fw_cfg.c:379:23: error: 'VMCOREINFO_NOTE_SIZE' undeclared; did you mean 'MEI_CL_NAME_SIZE'?

2018-01-31 Thread kbuild test robot

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost
head:   9991b95cb712cdd9a40240bde6274d5415476fb5
commit: 860c7fa9d6d433011e82a4f7f896893e914ce4a9 [11/22] fw_cfg: write 
vmcoreinfo details
config: i386-randconfig-c0-02011330 (attached as .config)
compiler: gcc-7 (Debian 7.2.0-12) 7.2.1 20171025
reproduce:
git checkout 860c7fa9d6d433011e82a4f7f896893e914ce4a9
# save the attached .config to linux build tree
make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/byteorder/little_endian.h:5:0,
from arch/x86/include/uapi/asm/byteorder.h:5,
from include/asm-generic/bitops/le.h:6,
from arch/x86/include/asm/bitops.h:518,
from include/linux/bitops.h:38,
from include/linux/kernel.h:11,
from include/linux/list.h:9,
from include/linux/module.h:9,
from drivers//firmware/qemu_fw_cfg.c:30:
   drivers//firmware/qemu_fw_cfg.c: In function 'write_vmcoreinfo':
>> drivers//firmware/qemu_fw_cfg.c:379:23: error: 'VMCOREINFO_NOTE_SIZE' 
>> undeclared (first use in this function); did you mean 'MEI_CL_NAME_SIZE'?
  .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
  ^
   include/uapi/linux/byteorder/little_endian.h:33:51: note: in definition of 
macro '__cpu_to_le32'
#define __cpu_to_le32(x) ((__force __le32)(__u32)(x))
  ^
>> drivers//firmware/qemu_fw_cfg.c:379:11: note: in expansion of macro 
>> 'cpu_to_le32'
  .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
  ^~~
   drivers//firmware/qemu_fw_cfg.c:379:23: note: each undeclared identifier is 
reported only once for each function it appears in
  .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
  ^
   include/uapi/linux/byteorder/little_endian.h:33:51: note: in definition of 
macro '__cpu_to_le32'
#define __cpu_to_le32(x) ((__force __le32)(__u32)(x))
  ^
>> drivers//firmware/qemu_fw_cfg.c:379:11: note: in expansion of macro 
>> 'cpu_to_le32'
  .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
  ^~~
>> drivers//firmware/qemu_fw_cfg.c:380:24: error: implicit declaration of 
>> function 'paddr_vmcoreinfo_note'; did you mean 'write_vmcoreinfo'? 
>> [-Werror=implicit-function-declaration]
  .paddr = cpu_to_le64(paddr_vmcoreinfo_note())
   ^
   include/uapi/linux/byteorder/little_endian.h:31:51: note: in definition of 
macro '__cpu_to_le64'
#define __cpu_to_le64(x) ((__force __le64)(__u64)(x))
  ^
>> drivers//firmware/qemu_fw_cfg.c:380:12: note: in expansion of macro 
>> 'cpu_to_le64'
  .paddr = cpu_to_le64(paddr_vmcoreinfo_note())
   ^~~
   cc1: some warnings being treated as errors

vim +379 drivers//firmware/qemu_fw_cfg.c

   360  
   361  #ifdef CONFIG_CRASH_CORE
   362  static ssize_t write_vmcoreinfo(struct device *dev, const struct 
fw_cfg_file *f)
   363  {
   364  struct vmci {
   365  __le16 host_format;
   366  __le16 guest_format;
   367  __le32 size;
   368  __le64 paddr;
   369  } __packed;
   370  static struct vmci *data;
   371  ssize_t ret;
   372  
   373  data = kmalloc(sizeof(struct vmci), GFP_KERNEL);
   374  if (!data)
   375  return -ENOMEM;
   376  
   377  *data = (struct vmci) {
   378  .guest_format = cpu_to_le16(VMCOREINFO_FORMAT_ELF),
 > 379  .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
 > 380  .paddr = cpu_to_le64(paddr_vmcoreinfo_note())
   381  };
   382  /* spare ourself reading host format support for now since we
   383   * don't know what else to format - host may ignore ours
   384   */
   385  ret = fw_cfg_write_blob(dev, f->select, data, 0, sizeof(struct 
vmci));
   386  
   387  kfree(data);
   388  return ret;
   389  }
   390  #endif /* CONFIG_CRASH_CORE */
   391  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip

Re: general protection fault in __netlink_ns_capable

2018-01-31 Thread Eric Biggers

On Thu, Jan 04, 2018 at 10:14:38AM -0800, Andrei Vagin wrote:
> On Thu, Jan 04, 2018 at 01:01:17PM +0100, Dmitry Vyukov wrote:
> > On Wed, Jan 3, 2018 at 8:37 AM, Andrei Vagin  wrote:
> > >> > Hello,
> > >> >
> > >> > syzkaller hit the following crash on
> > >> > 75aa5540627fdb3d8f86229776ea87f995275351
> > >> > git://git.cmpxchg.org/linux-mmots.git/master
> > >> > compiler: gcc (GCC) 7.1.1 20170620
> > >> > .config is attached
> > >> > Raw console output is attached.
> > >> > C reproducer is attached
> > >> > syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> > >> > for information about syzkaller reproducers
> > >> >
> > >> >
> > >> > IMPORTANT: if you fix the bug, please add the following tag to the 
> > >> > commit:
> > >> > Reported-by: syzbot+e432865c29eb4c48c...@syzkaller.appspotmail.com
> > >> > It will help syzbot understand when the bug is fixed. See footer for
> > >> > details.
> > >> > If you forward the report, please keep this part and the footer.
> > >> >
> > >> > netlink: 3 bytes leftover after parsing attributes in process
> > >> > `syzkaller140561'.
> > >> > netlink: 3 bytes leftover after parsing attributes in process
> > >> > `syzkaller140561'.
> > >> > netlink: 3 bytes leftover after parsing attributes in process
> > >> > `syzkaller140561'.
> > >> > kasan: CONFIG_KASAN_INLINE enabled
> > >> > kasan: GPF could be caused by NULL-ptr deref or user memory access
> > >> > general protection fault:  [#1] SMP KASAN
> > >> > Dumping ftrace buffer:
> > >> >(ftrace buffer empty)
> > >> > Modules linked in:
> > >> > CPU: 1 PID: 3149 Comm: syzkaller140561 Not tainted 4.15.0-rc4-mm1+ #47
> > >> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> > >> > Google 01/01/2011
> > >> > RIP: 0010:__netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868
> > >>
> > >> NETLINK_CB(skb).sk is NULL here. It looks like we have to use
> > >> sk_ns_capable instead of netlink_ns_capable:
> > >>
> > >> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> > >> index c688dc564b11..408c75de52ea 100644
> > >> --- a/net/core/rtnetlink.c
> > >> +++ b/net/core/rtnetlink.c
> > >> @@ -1762,7 +1762,7 @@ static struct net *get_target_net(struct sk_buff
> > >> *skb, int netnsid)
> > >> /* For now, the caller is required to have CAP_NET_ADMIN in
> > >>  * the user namespace owning the target net ns.
> > >>  */
> > >> -   if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
> > >> +   if (!sk_ns_capable(skb->sk, net->user_ns, CAP_NET_ADMIN)) {
> > >> put_net(net);
> > >> return ERR_PTR(-EACCES);
> > >> }
> > >>
> > >
> > > get_target_net() is used twice in the code. In rtnl_getlink(), we need
> > > to use netlink_ns_capable(skb, ...), but in rtnl_dump_ifinfo, we need to
> > > use sk_ns_capable(skb->sk, ...).
> > >
> > > Pls, take a look at this patch:
> > > https://patchwork.ozlabs.org/patch/854896/
> > > Subject: rtnetlink: give a user socket to get_target_net()
> > 
> > 
> > Please include this tag into the commit:
> > 
> 
> I sent v2 with this tag. Sorry for inconvenience.
> https://patchwork.ozlabs.org/patch/855147/
> 
> > > > IMPORTANT: if you fix the bug, please add the following tag to the 
> > > > commit:
> > > > Reported-by: syzbot+e432865c29eb4c48c...@syzkaller.appspotmail.com
> > > > It will help syzbot understand when the bug is fixed.

This crash is no longer occurring, thanks Andrei!  The Reported-by tag didn't
actually make it into the commit (f428fe4a04cc), so I'll tell syzbot instead:

#syz fix: rtnetlink: give a user socket to get_target_net()

- Eric

Re: KASAN: double-free or invalid-free in skb_free_head

2018-01-31 Thread Eric Biggers

On Sun, Dec 17, 2017 at 09:52:01PM -0800, syzbot wrote:
> Hello,
> 
> syzkaller hit the following crash on
> f3b5ad89de16f5d42e8ad36fbdf85f705c1ae051
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> C reproducer is attached
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers
> 
> 
> ==
> BUG: KASAN: double-free or invalid-free in skb_free_head+0x74/0xb0
> net/core/skbuff.c:550
> 
> CPU: 0 PID: 3142 Comm: sshd Not tainted 4.15.0-rc3+ #225
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:53
>  print_address_description+0x73/0x250 mm/kasan/report.c:252
>  kasan_report_double_free+0x55/0x80 mm/kasan/report.c:333
>  kasan_slab_free+0xa3/0xc0 mm/kasan/kasan.c:514
>  __cache_free mm/slab.c:3488 [inline]
>  kfree+0xca/0x250 mm/slab.c:3803
>  skb_free_head+0x74/0xb0 net/core/skbuff.c:550
>  skb_release_data+0x58c/0x790 net/core/skbuff.c:570
>  skb_release_all+0x4a/0x60 net/core/skbuff.c:627
>  __kfree_skb net/core/skbuff.c:641 [inline]
>  consume_skb+0x153/0x490 net/core/skbuff.c:701
>  __dev_kfree_skb_any+0x85/0xa0 net/core/dev.c:2515
>  dev_consume_skb_any include/linux/netdevice.h:3276 [inline]
>  free_old_xmit_skbs.isra.28+0x15a/0x2c0 drivers/net/virtio_net.c:1167
>  start_xmit+0x1b9/0x1650 drivers/net/virtio_net.c:1314
>  __netdev_start_xmit include/linux/netdevice.h:4042 [inline]
>  netdev_start_xmit include/linux/netdevice.h:4051 [inline]
>  xmit_one net/core/dev.c:2991 [inline]
>  dev_hard_start_xmit+0x248/0xac0 net/core/dev.c:3007
>  sch_direct_xmit+0x31d/0x6d0 net/sched/sch_generic.c:187
>  __dev_xmit_skb net/core/dev.c:3189 [inline]
>  __dev_queue_xmit+0x16f4/0x2070 net/core/dev.c:3456
>  dev_queue_xmit+0x17/0x20 net/core/dev.c:3521
>  neigh_hh_output include/net/neighbour.h:472 [inline]
>  neigh_output include/net/neighbour.h:480 [inline]
>  ip_finish_output2+0xece/0x1460 net/ipv4/ip_output.c:229
>  ip_finish_output+0x85e/0xd10 net/ipv4/ip_output.c:317
>  NF_HOOK_COND include/linux/netfilter.h:239 [inline]
>  ip_output+0x1cc/0x860 net/ipv4/ip_output.c:405
>  dst_output include/net/dst.h:460 [inline]
>  ip_local_out+0x95/0x160 net/ipv4/ip_output.c:124
>  ip_queue_xmit+0x8c6/0x18e0 net/ipv4/ip_output.c:504
>  tcp_transmit_skb+0x1b12/0x38b0 net/ipv4/tcp_output.c:1176
>  tcp_write_xmit+0x680/0x5190 net/ipv4/tcp_output.c:2367
>  __tcp_push_pending_frames+0xa0/0x250 net/ipv4/tcp_output.c:2543
>  tcp_push+0x538/0x770 net/ipv4/tcp.c:730
>  tcp_sendmsg_locked+0x2663/0x3b30 net/ipv4/tcp.c:1424
>  tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1461
>  inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
>  sock_sendmsg_nosec net/socket.c:636 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:646
>  sock_write_iter+0x31a/0x5d0 net/socket.c:915
>  call_write_iter include/linux/fs.h:1772 [inline]
>  new_sync_write fs/read_write.c:469 [inline]
>  __vfs_write+0x684/0x970 fs/read_write.c:482
>  vfs_write+0x189/0x510 fs/read_write.c:544
>  SYSC_write fs/read_write.c:589 [inline]
>  SyS_write+0xef/0x220 fs/read_write.c:581
>  entry_SYSCALL_64_fastpath+0x1f/0x96
> RIP: 0033:0x7f7a61d22370
> RSP: 002b:7ffc93b90b98 EFLAGS: 0246 ORIG_RAX: 0001
> RAX: ffda RBX: 563815b397d0 RCX: 7f7a61d22370
> RDX: 0038 RSI: 563815b2b460 RDI: 0003
> RBP: 7ffc93b90b20 R08: 0001 R09: 0101010101010101
> R10: 0008 R11: 0246 R12: 0040
> R13: 005e R14: 563815b304e0 R15: 7ffc93b90b70
> 
> Allocated by task 3142:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>  __do_kmalloc_node mm/slab.c:3672 [inline]
>  __kmalloc_node_track_caller+0x47/0x70 mm/slab.c:3686
>  __kmalloc_reserve.isra.41+0x41/0xd0 net/core/skbuff.c:137
>  __alloc_skb+0x13b/0x780 net/core/skbuff.c:205
>  alloc_skb_fclone include/linux/skbuff.h:1025 [inline]
>  sk_stream_alloc_skb+0x11d/0x900 net/ipv4/tcp.c:870
>  tcp_sendmsg_locked+0x1341/0x3b30 net/ipv4/tcp.c:1299
>  tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1461
>  inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
>  sock_sendmsg_nosec net/socket.c:636 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:646
>  sock_write_iter+0x31a/0x5d0 net/socket.c:915
>  call_write_iter include/linux/fs.h:1772 [inline]
>  new_sync_write fs/read_write.c:469 [inline]
>  __vfs_write+0x684/0x970 fs/read_write.c:482
>  vfs_write+0x189/0x510 fs/read_write.c:544
>  SYSC_write fs/read_write.c:589 [inline]
>  SyS_write+0xef/0x220 fs/read_write.c:581
>  entry_SYSCALL_64_fastpath+0x1f/0x96
> 
> Freed by task 3142:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track

[PATCH v2] iommu/vt-d: add NUMA awareness to intel_alloc_coherent()

2018-01-31 Thread Eric Dumazet

From: Eric Dumazet 

Some devices (like mlx4) try hard to allocate memory on selected
NUMA node, but it turns out intel_alloc_coherent() is not NUMA
aware yet.

Note that dma_generic_alloc_coherent() in arch/x86/kernel/pci-dma.c
gets this right.

Signed-off-by: Eric Dumazet 
Cc: Benjamin Serebrin 
Cc: David Woodhouse 
Cc: Joerg Roedel 
---
v2: no fallback to alloc_pages(), this is not needed and might even
hurt in OOM cases.

 drivers/iommu/intel-iommu.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 
a1373cf343269455808f66ad18dc0a2fb7aa73f2..3c538466a98bdb8fffdca688462b1350d536791b
 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3735,7 +3735,7 @@ static void *intel_alloc_coherent(struct device *dev, 
size_t size,
}
 
if (!page)
-   page = alloc_pages(flags, order);
+   page = alloc_pages_node(dev_to_node(dev), flags, order);
if (!page)
return NULL;
memset(page_address(page), 0, size);

RE: [PATCHv2] tls: Add support for encryption using async offload accelerator

2018-01-31 Thread Vakul Garg



> -Original Message-
> From: Dave Watson [mailto:davejwat...@fb.com]
> Sent: Wednesday, January 31, 2018 11:05 PM
> To: Vakul Garg 
> Cc: linux-cry...@vger.kernel.org; il...@mellanox.com;
> avia...@mellanox.com; da...@davemloft.net; netdev@vger.kernel.org;
> Gilad Ben-Yossef 
> Subject: Re: [PATCHv2] tls: Add support for encryption using async offload
> accelerator
> 
> On 01/31/18 05:22 PM, Vakul Garg wrote:
> > > > On second though in stable we should probably just disable async
> > > > tfm allocations.
> > > > It's simpler. But this approach is still good for -next
> > > >
> > > >
> > > > Gilad
> > >
> > > I agree with Gilad, just disable async for now.
> > >
> >
> > How to do it? Can you help with the api name?
> 
> *aead = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
> 
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit
> hub.com%2Fktls%2Fnet_next_ktls%2Fcommit%2Ff3b9b402e755e4b0623fa8
> 3f88137173fc249f2d=02%7C01%7Cvakul.garg%40nxp.com%7Cf1c707
> 9af97e48c9e89308d568d1633a%7C686ea1d3bc2b4c6fa92cd99c5c301635%
> 7C0%7C0%7C636530170887502735=qTdwbuDsCRmK1UHAYiOcwfPC
> U%2FPXgKg%2BICh%2F2N0b9Nw%3D=0
> 
 
The above patch you authored works fine. I tested it on my test platform.
I have nothing to contribute here.
Would you send it to stable kernel yourself or still want me to do it?


> > > If the flag MSG_DONTWAIT is set, we should be returning -EINPROGRESS
> > > and not wait for a response.  I had started working on a patch for
> > > that, but it's pretty tricky to get right.
> >
> > Can you point me to your WIP code branch for this?
> 
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit
> hub.com%2Fktls%2Fnet_next_ktls%2Fcommit%2F9cc839aa551ed972d148ec
> ebf353b25ee93543b9=02%7C01%7Cvakul.garg%40nxp.com%7Cf1c70
> 79af97e48c9e89308d568d1633a%7C686ea1d3bc2b4c6fa92cd99c5c301635
> %7C0%7C0%7C636530170887502735=0ASwmPPXXSGCXpBGes9vBma
> y8Gojv0wSUGAOOOjBExc%3D=0
> 
> > If MSG_DONTWAIT is not used, will it be sane if enqueue the crypto
> > request to accelerator and return to user space back so that user
> > space can send more plaintext data while crypto accelerator is working in
> parallel?
> 
> Right, that's roughly what the above does.  I believe the tricky unfinished 
> part
> was getting poll() to work correctly if there is an async crypto request
> outstanding.  Currently the tls poll() just relies on backpressure from
> do_tcp_sendpages.
 
I wanted to add async accelerator support for ktls where multiple crypto 
requests can be pipelined.
But since you are already doing it, I won't duplicate the efforts.
I would leverage your work on my platform, test it and try to contribute from 
here.

Re: [Intel-wired-lan] [RFC v2 net-next 01/10] net: Add a new socket option for a future transmit time.

2018-01-31 Thread Richard Cochran

On Wed, Jan 31, 2018 at 04:49:36PM -0800, Jesus Sanchez-Palencia wrote:
> While implementing this today it crossed my mind that why don't we have the
> clockid_t set per socket (e.g. as an argument to SO_TXTIME) instead of per 
> packet?

Sounds good to me.

Thanks,
Richard

Re: [PATCH v4] openvswitch: Remove padding from packet before L3+ conntrack processing

2018-01-31 Thread Pravin Shelar

On Wed, Jan 31, 2018 at 6:48 PM, Ed Swierk  wrote:
> IPv4 and IPv6 packets may arrive with lower-layer padding that is not
> included in the L3 length. For example, a short IPv4 packet may have
> up to 6 bytes of padding following the IP payload when received on an
> Ethernet device with a minimum packet length of 64 bytes.
>
> Higher-layer processing functions in netfilter (e.g. nf_ip_checksum(),
> and help() in nf_conntrack_ftp) assume skb->len reflects the length of
> the L3 header and payload, rather than referring back to
> ip_hdr->tot_len or ipv6_hdr->payload_len, and get confused by
> lower-layer padding.
>
> In the normal IPv4 receive path, ip_rcv() trims the packet to
> ip_hdr->tot_len before invoking netfilter hooks. In the IPv6 receive
> path, ip6_rcv() does the same using ipv6_hdr->payload_len. Similarly
> in the br_netfilter receive path, br_validate_ipv4() and
> br_validate_ipv6() trim the packet to the L3 length before invoking
> netfilter hooks.
>
> Currently in the OVS conntrack receive path, ovs_ct_execute() pulls
> the skb to the L3 header but does not trim it to the L3 length before
> calling nf_conntrack_in(NF_INET_PRE_ROUTING). When
> nf_conntrack_proto_tcp encounters a packet with lower-layer padding,
> nf_ip_checksum() fails causing a "nf_ct_tcp: bad TCP checksum" log
> message. While extra zero bytes don't affect the checksum, the length
> in the IP pseudoheader does. That length is based on skb->len, and
> without trimming, it doesn't match the length the sender used when
> computing the checksum.
>
> In ovs_ct_execute(), trim the skb to the L3 length before higher-layer
> processing.
>
> Signed-off-by: Ed Swierk 

Acked-by: Pravin B Shelar

Re: [PATCH iproute2-next v2 0/6] ipaddress: Get rid of print_linkinfo_brief()

2018-01-31 Thread David Ahern

On 1/30/18 11:09 AM, Serhey Popovych wrote:
> With this series I propose to get rid of custom print_linkinfo_brief()
> in favor of print_linkinfo() to avoid code duplication.
> 

Cover letter needs updating in light of the change to patch 6

Re: [PATCH iproute2-next v2 5/6] utils: Introduce and use print_name_and_link() to print name@link

2018-01-31 Thread David Ahern

On 1/30/18 11:09 AM, Serhey Popovych wrote:
> There is at least three places implementing same things: two in
> ipaddress.c print_linkinfo() & print_linkinfo_brief() and one in
> bridge/link.c.
> 
> These two implementations diverge from each other very little:
> bridge/link.c does not support JSON output at the moment and
> print_linkinfo_brief() does not handle IFLA_LINK_NETNS case.
> 
> Introduce and use print_name_and_link() routine to handle name@link
> output in all possible variations; respect IFLA_LINK_NETNS attribute to
> handle case when link is in different namespace; use "if%d" template
> for interface name instead of "" to share logic with other
> code (e.g. ll_name_to_index() and ll_index_to_name()) supporting such
> template.
> 
> Signed-off-by: Serhey Popovych 
> ---
>  bridge/link.c   |   13 +++--
>  include/utils.h |4 
>  ip/ipaddress.c  |   48 ++--
>  lib/utils.c |   51 +++
>  4 files changed, 60 insertions(+), 56 deletions(-)
> 
> diff --git a/bridge/link.c b/bridge/link.c
> index a11cbb1..90c9734 100644
> --- a/bridge/link.c
> +++ b/bridge/link.c
> @@ -125,20 +125,13 @@ int print_linkinfo(const struct sockaddr_nl *who,
>   if (n->nlmsg_type == RTM_DELLINK)
>   fprintf(fp, "Deleted ");
>  
> - fprintf(fp, "%d: %s ", ifi->ifi_index,
> - tb[IFLA_IFNAME] ? rta_getattr_str(tb[IFLA_IFNAME]) : "");
> + fprintf(fp, "%d: ", ifi->ifi_index);
> +
> + print_name_and_link("%s: ", COLOR_NONE, name, tb);

It only needs tb[IFLA_LINK] so just pass it. Makes the arg list
consistent with the function name too.

Re: [PATCH iproute2-next v2 4/6] utils: Introduce and use get_ifname_rta()

2018-01-31 Thread David Ahern

On 1/30/18 11:09 AM, Serhey Popovych wrote:
> Be consistent in handling of IFLA_IFNAME attribute in all places: if
> there is no attribute report bug to stderr and use ll_index_to_name() as
> last measure to get name in "if%d" format instead of "".
> 
> Use check_ifname() to validate network device name: this catches both
> unexpected return from kernel and ll_index_to_name().

same comment as patch 1. IFLA_IFNAME is always sent by the kernel and
doing a lookup in a cache is wrong.

Re: [PATCH iproute2-next v2 2/6] ipaddress: Simplify print_linkinfo_brief() and it's usage

2018-01-31 Thread David Ahern

On 1/30/18 11:09 AM, Serhey Popovych wrote:
> Improve print_linkinfo_brief() and it's callers:
> 
>   1) Get rid of custom @struct filter pointer @pfilter: it is NULL in
>  all callers anyway and global @filter is used.
> 

Looks like I somehow dropped a vrf change that was going to use that.
Send a standalone patch to revert 63891c70137f (straight revert does not
work so some fixup is needed) and then a follow on for the second part
of this change.

>   2) Simplify calling code in ipaddr_list_flush_or_save() by
>  introducing intermediate variable of @struct nlmsghdr, drop
>  duplicated code: print_linkinfo_brief() never returns values other
>  than <= 0 so we can move print_selected_addrinfo() outside of each
>  block.
> 
> Signed-off-by: Serhey Popovych 
> ---
>  ip/ip_common.h |3 +--
>  ip/ipaddress.c |   60 
> 
>  ip/iplink.c|2 +-
>  3 files changed, 28 insertions(+), 37 deletions(-)
>

Re: [PATCH iproute2-next v2 1/6] ipaddress: Improve print_linkinfo()

2018-01-31 Thread David Ahern

On 1/30/18 11:09 AM, Serhey Popovych wrote:

> diff --git a/ip/ipaddress.c b/ip/ipaddress.c
> index 051a05f..f8fd392 100644
> --- a/ip/ipaddress.c
> +++ b/ip/ipaddress.c
> @@ -948,14 +948,14 @@ int print_linkinfo_brief(const struct sockaddr_nl *who,
>   parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
>   if (tb[IFLA_IFNAME] == NULL) {
>   fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
> ifi->ifi_index);
> - name = "";
> + name = ll_index_to_name(ifi->ifi_index);

This is one of those "should never happen checks" since the kernel
always adds IFLA_IFNAME. Going to a cache to get a name for the index
when the existing message is missing that attribute seems wrong. I
realize the expectation is that the cache is empty today so if%d is
returned, but that could change and it is the idea of consulting a cache
that I think is wrong.

If that intention is to have a name I think it is safer to just have it
set to if%d here (or in a helper that the cache also uses).

>   } else {
>   name = rta_getattr_str(tb[IFLA_IFNAME]);
>   }
>  
>   if (pfilter->label &&
>   (!pfilter->family || pfilter->family == AF_PACKET) &&
> - fnmatch(pfilter->label, RTA_DATA(tb[IFLA_IFNAME]), 0))
> + fnmatch(pfilter->label, name, 0))
>   return -1;
>  
>   if (tb[IFLA_GROUP]) {
> @@ -1057,6 +1057,7 @@ int print_linkinfo(const struct sockaddr_nl *who,
>   struct ifinfomsg *ifi = NLMSG_DATA(n);
>   struct rtattr *tb[IFLA_MAX+1];
>   int len = n->nlmsg_len;
> + const char *name;
>   unsigned int m_flag = 0;
>  
>   if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
> @@ -1067,18 +1068,22 @@ int print_linkinfo(const struct sockaddr_nl *who,
>   return -1;
>  
>   if (filter.ifindex && ifi->ifi_index != filter.ifindex)
> - return 0;
> + return -1;
>   if (filter.up && !(ifi->ifi_flags_UP))
> - return 0;
> + return -1;
>  
>   parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
> - if (tb[IFLA_IFNAME] == NULL)
> + if (tb[IFLA_IFNAME] == NULL) {
>   fprintf(stderr, "BUG: device with ifindex %d has nil ifname\n", 
> ifi->ifi_index);
> + name = ll_index_to_name(ifi->ifi_index);

same here


> + } else {
> + name = rta_getattr_str(tb[IFLA_IFNAME]);
> + }
>  
>   if (filter.label &&
>   (!filter.family || filter.family == AF_PACKET) &&
> - fnmatch(filter.label, RTA_DATA(tb[IFLA_IFNAME]), 0))
> - return 0;
> + fnmatch(filter.label, name, 0))
> + return -1;
>  
>   if (tb[IFLA_GROUP]) {
>   int group = rta_getattr_u32(tb[IFLA_GROUP]);
> @@ -1105,16 +1110,7 @@ int print_linkinfo(const struct sockaddr_nl *who,
>   print_bool(PRINT_ANY, "deleted", "Deleted ", true);
>  
>   print_int(PRINT_ANY, "ifindex", "%d: ", ifi->ifi_index);
> - if (tb[IFLA_IFNAME]) {
> - print_color_string(PRINT_ANY,
> -COLOR_IFNAME,
> -"ifname", "%s",
> -rta_getattr_str(tb[IFLA_IFNAME]));
> - } else {
> - print_null(PRINT_JSON, "ifname", NULL, NULL);
> - print_color_null(PRINT_FP, COLOR_IFNAME,
> -  "ifname", "%s", "");
> - }
> + print_color_string(PRINT_ANY, COLOR_IFNAME, "ifname", "%s", name);
>  
>   if (tb[IFLA_LINK]) {
>   int iflink = rta_getattr_u32(tb[IFLA_LINK]);
>

Re: [PATCH iproute2-next 0/4] ip: Introduce and use helper to read /proc/net/dev

2018-01-31 Thread David Ahern

On 1/31/18 4:04 PM, Stephen Hemminger wrote:
> 
> /proc/net/dev is legacy and unextensible.
> 
> I would rather see netlink used everywhere and not /proc/net/dev or sysfs!
> 

agreed. ll_init_map is already called in some places so a device cache
already exists.

[PATCH v4] openvswitch: Remove padding from packet before L3+ conntrack processing

2018-01-31 Thread Ed Swierk

IPv4 and IPv6 packets may arrive with lower-layer padding that is not
included in the L3 length. For example, a short IPv4 packet may have
up to 6 bytes of padding following the IP payload when received on an
Ethernet device with a minimum packet length of 64 bytes.

Higher-layer processing functions in netfilter (e.g. nf_ip_checksum(),
and help() in nf_conntrack_ftp) assume skb->len reflects the length of
the L3 header and payload, rather than referring back to
ip_hdr->tot_len or ipv6_hdr->payload_len, and get confused by
lower-layer padding.

In the normal IPv4 receive path, ip_rcv() trims the packet to
ip_hdr->tot_len before invoking netfilter hooks. In the IPv6 receive
path, ip6_rcv() does the same using ipv6_hdr->payload_len. Similarly
in the br_netfilter receive path, br_validate_ipv4() and
br_validate_ipv6() trim the packet to the L3 length before invoking
netfilter hooks.

Currently in the OVS conntrack receive path, ovs_ct_execute() pulls
the skb to the L3 header but does not trim it to the L3 length before
calling nf_conntrack_in(NF_INET_PRE_ROUTING). When
nf_conntrack_proto_tcp encounters a packet with lower-layer padding,
nf_ip_checksum() fails causing a "nf_ct_tcp: bad TCP checksum" log
message. While extra zero bytes don't affect the checksum, the length
in the IP pseudoheader does. That length is based on skb->len, and
without trimming, it doesn't match the length the sender used when
computing the checksum.

In ovs_ct_execute(), trim the skb to the L3 length before higher-layer
processing.

Signed-off-by: Ed Swierk 
---
 net/openvswitch/conntrack.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index d558e88..285f879 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1097,6 +1097,36 @@ static int ovs_ct_commit(struct net *net, struct 
sw_flow_key *key,
return 0;
 }
 
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ */
+static int ovs_skb_network_trim(struct sk_buff *skb)
+{
+   unsigned int len;
+   int err;
+
+   switch (skb->protocol) {
+   case htons(ETH_P_IP):
+   len = ntohs(ip_hdr(skb)->tot_len);
+   break;
+   case htons(ETH_P_IPV6):
+   len = sizeof(struct ipv6hdr)
+   + ntohs(ipv6_hdr(skb)->payload_len);
+   break;
+   default:
+   len = skb->len;
+   }
+
+   err = pskb_trim_rcsum(skb, len);
+   if (err)
+   kfree_skb(skb);
+
+   return err;
+}
+
 /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
  * value if 'skb' is freed.
  */
@@ -,6 +1141,10 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
nh_ofs = skb_network_offset(skb);
skb_pull_rcsum(skb, nh_ofs);
 
+   err = ovs_skb_network_trim(skb);
+   if (err)
+   return err;
+
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
err = handle_fragments(net, key, info->zone.id, skb);
if (err)
-- 
1.9.1

Re: [PATCH] tcp_lp: use 64-bit arithmetic instead of 32-bit

2018-01-31 Thread Andrew Lunn

On Wed, Jan 31, 2018 at 07:07:49PM -0600, Gustavo A. R. Silva wrote:
> 
> Hi Alan,
> 
> Quoting Alan Cox :
> 
> >On Wed, 31 Jan 2018 18:24:07 -0600
> >"Gustavo A. R. Silva"  wrote:
> >
> >>Cast to s64 some variables and a macro in order to give the
> >>compiler complete information about the proper arithmetic to
> >>use. Notice that these elements are used in contexts that
> >>expect expressions of type s64 (64 bits, signed).
> >>
> >>Currently such expression are being evaluated using 32-bit
> >>arithmetic.
> >
> >The question you need to ask is 'can it overflow 32bit maths', otherwise
> >you are potentially making the system do extra work for no reason.
> >
> 
> Yeah, I get your point and it seems that in this particular case there is no
> risk of a 32bit overflow, but in general and IMHO as the code evolves, the
> use of incorrect arithmetic may have security implications in the future, so
> I advocate for code correctness in this case.

Hi Gustavo

Is this on the hotpath? How much overhead does it add to 32 bit
architectures which don't have 64 bit arithmetic in hardware? There
are a lot of embedded systems which are 32 bit.

Andrew

Re: [PATCH] iommu/vt-d: add NUMA awareness to intel_alloc_coherent()

2018-01-31 Thread Eric Dumazet

On Wed, 2018-01-31 at 14:45 -0800, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> Some devices (like mlx4) try hard to allocate memory on selected
> NUMA node, but it turns out intel_alloc_coherent() is not NUMA
> aware yet.
> 
> Note that dma_generic_alloc_coherent() in arch/x86/kernel/pci-dma.c
> gets this right.
> 
> Signed-off-by: Eric Dumazet 
> Cc: Benjamin Serebrin 
> Cc: David Woodhouse 
> Cc: Joerg Roedel 
> ---
>  drivers/iommu/intel-iommu.c |7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index 
> a1373cf343269455808f66ad18dc0a2fb7aa73f2..0efef077abc099eb29ebc5cefdd1b996f025dffd
>  100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -3734,8 +3734,11 @@ static void *intel_alloc_coherent(struct device *dev, 
> size_t size,
>   }
>   }
>  
> - if (!page)
> - page = alloc_pages(flags, order);
> + if (!page) {
> + page = alloc_pages_node(dev_to_node(dev), flags, order);
> + if (!page)
> + page = alloc_pages(flags, order);

I'll send a V2 without the fallback to alloc_pages()

This seems not necessary at all.


> + }
>   if (!page)
>   return NULL;
>   memset(page_address(page), 0, size);

Re: [Patch net] xt_RATEEST: acquire xt_rateest_mutex for hash insert

2018-01-31 Thread Eric Dumazet

On Wed, 2018-01-31 at 16:26 -0800, Cong Wang wrote:
> rateest_hash is supposed to be protected by xt_rateest_mutex.
> 
> Reported-by: 
> Fixes: 5859034d7eb8 ("[NETFILTER]: x_tables: add RATEEST target")
> Cc: Pablo Neira Ayuso 
> Signed-off-by: Cong Wang 
> ---
>  net/netfilter/xt_RATEEST.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
> index 498b54fd04d7..83ec3a282755 100644
> --- a/net/netfilter/xt_RATEEST.c
> +++ b/net/netfilter/xt_RATEEST.c
> @@ -36,7 +36,9 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)
>   unsigned int h;
>  
>   h = xt_rateest_hash(est->name);
> + mutex_lock(_rateest_mutex);
>   hlist_add_head(>list, _hash[h]);
> + mutex_unlock(_rateest_mutex);
>  }

We probably should make this module netns aware, otherwise bad things
will happen.

(It seems multiple threads could run, so getting the mutex twice 
(xt_rateest_lookup then xt_rateest_hash_insert() is racy)

Re: [PATCH] tcp_lp: use 64-bit arithmetic instead of 32-bit

2018-01-31 Thread Gustavo A. R. Silva



Hi Alan,

Quoting Alan Cox :


On Wed, 31 Jan 2018 18:24:07 -0600
"Gustavo A. R. Silva"  wrote:


Cast to s64 some variables and a macro in order to give the
compiler complete information about the proper arithmetic to
use. Notice that these elements are used in contexts that
expect expressions of type s64 (64 bits, signed).

Currently such expression are being evaluated using 32-bit
arithmetic.


The question you need to ask is 'can it overflow 32bit maths', otherwise
you are potentially making the system do extra work for no reason.



Yeah, I get your point and it seems that in this particular case there  
is no risk of a 32bit overflow, but in general and IMHO as the code  
evolves, the use of incorrect arithmetic may have security  
implications in the future, so I advocate for code correctness in this  
case.


Thanks
--
Gustavo

Re: [Intel-wired-lan] [RFC v2 net-next 01/10] net: Add a new socket option for a future transmit time.

2018-01-31 Thread Jesus Sanchez-Palencia

Hi,

On 01/18/2018 09:13 AM, Richard Cochran wrote:
> On Thu, Jan 18, 2018 at 09:42:27AM +0100, Miroslav Lichvar wrote:
>> In the discussion about the v1 patchset, there was a question if the
>> cmsg should include a clockid_t. Without that, how can an application
>> prevent the packet from being sent using an incorrect clock, e.g.
>> the system clock when it expects it to be a PHC, or a different PHC
>> when the socket is not bound to a specific interface?
> 
> Right, the clockid_t should be passed in through the CMSG along with
> the time.

While implementing this today it crossed my mind that why don't we have the
clockid_t set per socket (e.g. as an argument to SO_TXTIME) instead of per 
packet?

The only use-case that we could think of that would be 'blocked' was using
sendmmsg() to send a packet to different interfaces with a single syscall, but
I'm not sure how common that is.

What do you think?

Thanks,
Jesus

>  
> Thanks,
> Richard
>

Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-01-31 Thread Xin Long

On Tue, Jan 30, 2018 at 11:59 PM, David Ahern  wrote:
> On 1/30/18 1:57 PM, David Ahern wrote:
>> On 1/30/18 1:08 PM, Daniel Borkmann wrote:
>>> On 01/30/2018 07:32 PM, Cong Wang wrote:
 On Tue, Jan 30, 2018 at 4:09 AM, Dmitry Vyukov  wrote:
> Hello,
>
> The following program creates a hang in unregister_netdevice.
> cleanup_net work hangs there forever periodically printing
> "unregister_netdevice: waiting for lo to become free. Usage count = 3"
> and creation of any new network namespaces hangs forever.

 Interestingly, this is not reproducible on net-next.
>>>
>>> The most recent change on netns refcnt was 4ee806d51176 ("net: tcp: close
>>> sock if net namespace is exiting") in net/net-next from 5 days ago, maybe
>>> fixed due to that?
>>>
>>
>> This appears to be the commit introducing the refcnt leak:
>>
>> $ git bisect bad
>> dbc2b5e9a09e9a6664679a667ff81cff6e5f2641 is the first bad commit
>> commit dbc2b5e9a09e9a6664679a667ff81cff6e5f2641
>> Author: Xin Long 
>> Date:   Fri May 12 14:39:52 2017 +0800
>>
>> sctp: fix src address selection if using secondary addresses for ipv6
>>
>>
>> v4.14 is bad. Running bisect in the background while doing other things
>>
>
> Interesting. The commit that avoids the refcnt leak is
>
> commit 955ec4cb3b54c7c389a9f830be7d3ae2056b9212
> Author: David Ahern 
> Date:   Wed Jan 24 19:45:29 2018 -0800
>
> net/ipv6: Do not allow route add with a device that is down
>
> That commit does not intentionally address the problem so it is just
> masking the problematic code introduced by the commit above.
Thanks, David A.

I'm still on a trip. will look into this asap.

[PATCH] tcp_lp: use 64-bit arithmetic instead of 32-bit

2018-01-31 Thread Gustavo A. R. Silva

Cast to s64 some variables and a macro in order to give the
compiler complete information about the proper arithmetic to
use. Notice that these elements are used in contexts that
expect expressions of type s64 (64 bits, signed).

Currently such expression are being evaluated using 32-bit
arithmetic.

Addresses-Coverity-ID: 200687
Addresses-Coverity-ID: 200688
Addresses-Coverity-ID: 200689
Signed-off-by: Gustavo A. R. Silva 
---
 net/ipv4/tcp_lp.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index ae10ed6..4999111 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -134,7 +134,7 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
 {
struct tcp_sock *tp = tcp_sk(sk);
struct lp *lp = inet_csk_ca(sk);
-   s64 rhz = lp->remote_hz << 6;   /* remote HZ << 6 */
+   s64 rhz = (s64)lp->remote_hz << 6;  /* remote HZ << 6 */
s64 m = 0;
 
/* not yet record reference time
@@ -147,7 +147,7 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
tp->rx_opt.rcv_tsecr == lp->local_ref_time)
goto out;
 
-   m = TCP_TS_HZ *
+   m = (s64)TCP_TS_HZ *
(tp->rx_opt.rcv_tsval - lp->remote_ref_time) /
(tp->rx_opt.rcv_tsecr - lp->local_ref_time);
if (m < 0)
@@ -193,8 +193,8 @@ static u32 tcp_lp_owd_calculator(struct sock *sk)
 
if (lp->flag & LP_VALID_RHZ) {
owd =
-   tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
-   tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ);
+   (s64)tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
+   (s64)tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ);
if (owd < 0)
owd = -owd;
}
-- 
2.7.4

Re: [PATCH] tcp_lp: use 64-bit arithmetic instead of 32-bit

2018-01-31 Thread Alan Cox

On Wed, 31 Jan 2018 18:24:07 -0600
"Gustavo A. R. Silva"  wrote:

> Cast to s64 some variables and a macro in order to give the
> compiler complete information about the proper arithmetic to
> use. Notice that these elements are used in contexts that
> expect expressions of type s64 (64 bits, signed).
> 
> Currently such expression are being evaluated using 32-bit
> arithmetic.

The question you need to ask is 'can it overflow 32bit maths', otherwise
you are potentially making the system do extra work for no reason.

Alan

[Patch net] xt_RATEEST: acquire xt_rateest_mutex for hash insert

2018-01-31 Thread Cong Wang

rateest_hash is supposed to be protected by xt_rateest_mutex.

Reported-by: 
Fixes: 5859034d7eb8 ("[NETFILTER]: x_tables: add RATEEST target")
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/xt_RATEEST.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 498b54fd04d7..83ec3a282755 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -36,7 +36,9 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)
unsigned int h;
 
h = xt_rateest_hash(est->name);
+   mutex_lock(_rateest_mutex);
hlist_add_head(>list, _hash[h]);
+   mutex_unlock(_rateest_mutex);
 }
 
 struct xt_rateest *xt_rateest_lookup(const char *name)
-- 
2.13.0

[RFC v2 06/14] tcp_smc: Make SMC use TCP extra-option framework

2018-01-31 Thread Christoph Paasch

Adopt the extra-option framework for SMC.
It allows us to entirely remove SMC-code out of the TCP-stack.

The static key is gone, as this is now covered by the static key of the
extra-option framework.

We allocate state (struct tcp_smc_opt) that indicates whether SMC was
successfully negotiated or not and check this state in the relevant
functions.

Cc: Ursula Braun 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 include/linux/tcp.h  |   3 +-
 include/net/inet_sock.h  |   3 +-
 include/net/tcp.h|   4 -
 net/ipv4/tcp.c   |   5 --
 net/ipv4/tcp_input.c |  36 -
 net/ipv4/tcp_minisocks.c |  18 -
 net/ipv4/tcp_output.c|  54 --
 net/smc/af_smc.c | 190 +--
 8 files changed, 186 insertions(+), 127 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6e1f0f29bf24..0958b3760cfc 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -257,8 +257,7 @@ struct tcp_sock {
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
save_syn:1, /* Save headers of SYN packet */
-   is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
-   syn_smc:1;  /* SYN includes SMC */
+   is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
u32 tlp_high_seq;   /* snd_nxt at the time of TLP retransmit. */
 
 /* RTT measurement */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0a671c32d6b9..4efa6cb14705 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -90,8 +90,7 @@ struct inet_request_sock {
wscale_ok  : 1,
ecn_ok : 1,
acked  : 1,
-   no_srccheck: 1,
-   smc_ok : 1;
+   no_srccheck: 1;
u32 ir_mark;
union {
struct ip_options_rcu __rcu *ireq_opt;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index be6709e380a6..2a565883e2ef 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2093,10 +2093,6 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
-#if IS_ENABLED(CONFIG_SMC)
-extern struct static_key_false tcp_have_smc;
-#endif
-
 struct tcp_extopt_store;
 
 struct tcp_extopt_ops {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ffb5f4fbd935..f08542d91e1c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -292,11 +292,6 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
 atomic_long_t tcp_memory_allocated;/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
-#if IS_ENABLED(CONFIG_SMC)
-DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
-EXPORT_SYMBOL(tcp_have_smc);
-#endif
-
 /*
  * Current number of TCP sockets.
  */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 187e3fa761c8..fd2693baee4a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3691,24 +3691,6 @@ static void tcp_parse_fastopen_option(int len, const 
unsigned char *cookie,
foc->exp = exp_opt;
 }
 
-static int smc_parse_options(const struct tcphdr *th,
-struct tcp_options_received *opt_rx,
-const unsigned char *ptr,
-int opsize)
-{
-#if IS_ENABLED(CONFIG_SMC)
-   if (static_branch_unlikely(_have_smc)) {
-   if (th->syn && !(opsize & 1) &&
-   opsize >= TCPOLEN_EXP_SMC_BASE &&
-   get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
-   opt_rx->smc_ok = 1;
-   return 1;
-   }
-   }
-#endif
-   return 0;
-}
-
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
  * But, this can also be called on packets in the established flow when
  * the fast version below fails.
@@ -3816,9 +3798,6 @@ void tcp_parse_options(const struct net *net,
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
-   else if (smc_parse_options(th, opt_rx, ptr,
-  opsize))
-   break;
else if (opsize >= TCPOLEN_EXP_BASE)

tcp_extopt_parse(get_unaligned_be32(ptr),
 opsize, ptr, skb,
@@ -5595,16 +5574,6 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, 
struct sk_buff *synack,

[RFC v2 00/14] Generic TCP-option framework and adoption for TCP-SMC and TCP-MD5

2018-01-31 Thread Christoph Paasch

Resubmit as v2 RFC to get some feedback before net-next opens up again.
Only minor changes (see below).


This patchset introduces a generic framework for handling TCP-options.

TCP-options like TCP_MD5 and SMC are rather rare use-cases, but their
implementation is rather intrusive to the TCP-stack. Other, more recent
TCP extensions like TCP-crypt, MPTCP or TCP-AO would make this situation
even worse.

This new framework allows to add these TCP-options in a modular way. Writing,
reading and acting upon these options is done through callbacks that get
registered to a TCP-socket. A TCP-socket has a list of "extra" TCP-options
that it will use.

We make TCP-SMC and TCP-MD5SIG adopt this new framework. As can be seen, there
is now no more TCP-SMC code in the TCP-files and the TCP-MD5 code has been
reduced to a bare minimum.

This patchset is admittedly rather big, but we wanted to show where the
framework will lead to and what it enables. Suggestions as to how to better
structure the patchset is appreciated.

There is still work to be done to more efficiently check for extra TCP options
in performance-sensitive code paths. A rate-limited static key would nearly
eliminate overhead if no extra TCP options are in use system-wide, or a flag
in a likely-hot cache line could work well.

For now we opted for a simple if (unlikely(!hlist_empty(...)) check.

Feedback is very welcome!

Thanks,
Mat & Christoph


Changelog:
===
v1 -> v2:
* Some minor fixes thanks to the buildbot when certain configs
  are disabled (Patch 5 and 12)
* Add spdx-header in the new files (Patch 11)
* Added Ivan Delande to the CC-list as he did some TCP-MD5
  changes in the past.


Christoph Paasch (13):
  tcp: Write options after the header has been fully done
  tcp: Pass sock and skb to tcp_options_write
  tcp: Allow tcp_fast_parse_options to drop segments
  tcp_smc: Make smc_parse_options return 1 on success
  tcp_smc: Make SMC use TCP extra-option framework
  tcp_md5: Don't pass along md5-key
  tcp_md5: Detect key inside tcp_v4_send_ack instead of passing it as an
argument
  tcp_md5: Detect key inside tcp_v6_send_response instead of passing it
as an argument
  tcp_md5: Check for TCP_MD5 after TCP Timestamps in
tcp_established_options
  tcp_md5: Move TCP-MD5 code out of TCP itself
  tcp_md5: Use tcp_extra_options in output path
  tcp_md5: Cleanup TCP-code
  tcp_md5: Use TCP extra-options on the input path

Mat Martineau (1):
  tcp: Register handlers for extra TCP options

 drivers/infiniband/hw/cxgb4/cm.c |2 +-
 include/linux/inet_diag.h|1 +
 include/linux/tcp.h  |   43 +-
 include/linux/tcp_md5.h  |   40 ++
 include/net/inet_sock.h  |3 +-
 include/net/tcp.h|  213 +++---
 net/ipv4/Makefile|1 +
 net/ipv4/syncookies.c|6 +-
 net/ipv4/tcp.c   |  391 ---
 net/ipv4/tcp_diag.c  |   81 +--
 net/ipv4/tcp_input.c |  137 ++--
 net/ipv4/tcp_ipv4.c  |  556 ++--
 net/ipv4/tcp_md5.c   | 1359 ++
 net/ipv4/tcp_minisocks.c |   75 +--
 net/ipv4/tcp_output.c|  182 +
 net/ipv6/syncookies.c|6 +-
 net/ipv6/tcp_ipv6.c  |  390 ++-
 net/smc/af_smc.c |  190 +-
 18 files changed, 2228 insertions(+), 1448 deletions(-)
 create mode 100644 include/linux/tcp_md5.h
 create mode 100644 net/ipv4/tcp_md5.c

-- 
2.16.1

[RFC v2 13/14] tcp_md5: Cleanup TCP-code

2018-01-31 Thread Christoph Paasch

Now that we have consolidated the TCP_MD5 output path, we can cleanup
TCP and its callbacks to MD5.

These callbacks are solely there to handle the different
address-familiese (v4, v6 and v4mapped).

Now that we have isolated the TCP_MD5-code it is acceptable to add a bit
more complexity inside tcp_md5.c to handle these address-families at the
benefit of getting rid of these callbacks in tcp_sock, together with its
assignments in tcp_v4/6_connect,...

Cc: Ivan Delalande 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 include/linux/tcp.h |   5 -
 include/linux/tcp_md5.h |  18 +--
 include/net/tcp.h   |  24 
 net/ipv4/tcp.c  |   2 +-
 net/ipv4/tcp_ipv4.c |   8 --
 net/ipv4/tcp_md5.c  | 340 ++--
 net/ipv6/tcp_ipv6.c |  17 ---
 7 files changed, 155 insertions(+), 259 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d4d22b9c19be..36f9bedeb6b1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -388,11 +388,6 @@ struct tcp_sock {
   * while socket was owned by user.
   */
 
-#ifdef CONFIG_TCP_MD5SIG
-/* TCP AF-Specific parts; only used by MD5 Signature support so far */
-   const struct tcp_sock_af_ops*af_specific;
-#endif
-
 /* TCP fastopen related information */
struct tcp_fastopen_request *fastopen_req;
/* fastopen_rsk points to request_sock that resulted in this big
diff --git a/include/linux/tcp_md5.h b/include/linux/tcp_md5.h
index 94a29c4f6fd1..441be65ec893 100644
--- a/include/linux/tcp_md5.h
+++ b/include/linux/tcp_md5.h
@@ -27,28 +27,14 @@ struct tcp_md5sig_key {
struct rcu_head rcu;
 };
 
-extern const struct tcp_sock_af_ops tcp_sock_ipv4_specific;
-extern const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
-extern const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
-
 /* - functions */
-int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
-   const struct sock *sk, const struct sk_buff *skb);
 
-struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
-const struct sock *addr_sk);
+int tcp_md5_parse_keys(struct sock *sk, int optname, char __user *optval,
+  int optlen);
 
 bool tcp_v4_inbound_md5_hash(const struct sock *sk,
 const struct sk_buff *skb);
 
-struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
-const struct sock *addr_sk);
-
-int tcp_v6_md5_hash_skb(char *md5_hash,
-   const struct tcp_md5sig_key *key,
-   const struct sock *sk,
-   const struct sk_buff *skb);
-
 bool tcp_v6_inbound_md5_hash(const struct sock *sk,
 const struct sk_buff *skb);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d2738cb01cf2..ceb8ac1e17bd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1730,32 +1730,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 const struct tcp_request_sock_ops *af_ops,
 struct sock *sk, struct sk_buff *skb);
 
-/* TCP af-specific functions */
-struct tcp_sock_af_ops {
-#ifdef CONFIG_TCP_MD5SIG
-   struct tcp_md5sig_key   *(*md5_lookup) (const struct sock *sk,
-   const struct sock *addr_sk);
-   int (*calc_md5_hash)(char *location,
-const struct tcp_md5sig_key *md5,
-const struct sock *sk,
-const struct sk_buff *skb);
-   int (*md5_parse)(struct sock *sk,
-int optname,
-char __user *optval,
-int optlen);
-#endif
-};
-
 struct tcp_request_sock_ops {
u16 mss_clamp;
-#ifdef CONFIG_TCP_MD5SIG
-   struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk,
-const struct sock *addr_sk);
-   int (*calc_md5_hash) (char *location,
- const struct tcp_md5sig_key *md5,
- const struct sock *sk,
- const struct sk_buff *skb);
-#endif
void (*init_req)(struct request_sock *req,
 const struct sock *sk_listener,
 struct sk_buff *skb);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fc5c9cb19b9b..795f2dfb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2828,7 +2828,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_MD5SIG:
case TCP_MD5SIG_EXT:
/* Read

[RFC v2 04/14] tcp_smc: Make smc_parse_options return 1 on success

2018-01-31 Thread Christoph Paasch

As we allow a generic TCP-option parser that also parses experimental
TCP options, we need to add a return-value to smc_parse_options() that
indicates whether the option actually matched or not.

Cc: Ursula Braun 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 net/ipv4/tcp_input.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1fbabcc99b62..94ba88b2246b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3691,19 +3691,22 @@ static void tcp_parse_fastopen_option(int len, const 
unsigned char *cookie,
foc->exp = exp_opt;
 }
 
-static void smc_parse_options(const struct tcphdr *th,
- struct tcp_options_received *opt_rx,
- const unsigned char *ptr,
- int opsize)
+static int smc_parse_options(const struct tcphdr *th,
+struct tcp_options_received *opt_rx,
+const unsigned char *ptr,
+int opsize)
 {
 #if IS_ENABLED(CONFIG_SMC)
if (static_branch_unlikely(_have_smc)) {
if (th->syn && !(opsize & 1) &&
opsize >= TCPOLEN_EXP_SMC_BASE &&
-   get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+   get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
opt_rx->smc_ok = 1;
+   return 1;
+   }
}
 #endif
+   return 0;
 }
 
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
-- 
2.16.1

[RFC v2 02/14] tcp: Pass sock and skb to tcp_options_write

2018-01-31 Thread Christoph Paasch

An upcoming patch adds a configurable, per-socket list of TCP options to
populate in the TCP header. This requires tcp_options_write() to know the
socket (to use the options list) and the skb (to provide visibility to the
packet data for options like TCP_MD5SIG).

Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 net/ipv4/tcp_output.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index df50c7dc1a43..e598bf54e3fb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -444,10 +444,14 @@ struct tcp_out_options {
  * At least SACK_PERM as the first option is known to lead to a disaster
  * (but it may well be that other scenarios fail similarly).
  */
-static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+static void tcp_options_write(__be32 *ptr, struct sk_buff *skb, struct sock 
*sk,
  struct tcp_out_options *opts)
 {
u16 options = opts->options;/* mungable copy */
+   struct tcp_sock *tp = NULL;
+
+   if (sk_fullsock(sk))
+   tp = tcp_sk(sk);
 
if (unlikely(OPTION_MD5 & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
@@ -1136,7 +1140,7 @@ static int tcp_transmit_skb(struct sock *sk, struct 
sk_buff *skb, int clone_it,
 */
th->window  = htons(min(tp->rcv_wnd, 65535U));
}
-   tcp_options_write((__be32 *)(th + 1), tp, );
+   tcp_options_write((__be32 *)(th + 1), skb, sk, );
 #ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
@@ -3248,7 +3252,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, 
struct dst_entry *dst,
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
th->doff = (tcp_header_size >> 2);
-   tcp_options_write((__be32 *)(th + 1), NULL, );
+   tcp_options_write((__be32 *)(th + 1), skb, req_to_sk(req), );
__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
-- 
2.16.1

[RFC v2 12/14] tcp_md5: Use tcp_extra_options in output path

2018-01-31 Thread Christoph Paasch

This patch starts making use of the extra_option framework for TCP_MD5.

One tricky part is that extra_options are called at the end of the
tcp_syn_options(), while TCP_MD5 is called at the beginning.

TCP_MD5 is called at the beginning because it wants to disable
TCP-timestamps (for option-space reasons). So, in the _prepare-function
of the extra options we need to undo the work that was done when
enabling TCP timestamps.

Another thing to note is that in tcp_v4_send_reset (and its IPv6
counterpart), we were looking previously for the listening-socket (if sk
== NULL) in case there was an MD5 signature in the TCP-option space of
the incoming packet.

With the extra-option framework we can't do this anymore, because
extra-options are part of the TCP-socket's tcp_option_list. If there is
no socket, it means we can't parse the option.

This shouldn't have an impact, because when we receive a segment and
there is not established socket, we will match on the listening socket
(if it's still there). Then, when we decide to respond with a RST in
tcp_rcv_state_process, we will give to tcp_v4_send_reset() the
listening-socket and thus will parse the TCP_MD5 option.

Cc: Ivan Delalande 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---

Notes:
v2: * Fix compiler warning about unused variable when RCU-debugging is 
disabled

 include/linux/tcp.h  |  10 +-
 include/linux/tcp_md5.h  |  62 -
 net/ipv4/tcp_ipv4.c  |  55 
 net/ipv4/tcp_md5.c   | 695 +--
 net/ipv4/tcp_minisocks.c |  12 -
 net/ipv4/tcp_output.c|  68 +
 net/ipv6/tcp_ipv6.c  |  23 --
 7 files changed, 487 insertions(+), 438 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ef0279194ef9..d4d22b9c19be 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -127,11 +127,11 @@ struct tcp_out_options {
u16 mss;/* 0 to disable */
u8 ws;  /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
-   u8 hash_size;   /* bytes in hash_location */
-   __u8 *hash_location;/* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie;/* Fast open cookie */
+#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *md5; /* TCP_MD5 signature key */
+#endif
 };
 
 /* This is the max number of SACKS that we'll generate and process. It's safe
@@ -391,9 +391,6 @@ struct tcp_sock {
 #ifdef CONFIG_TCP_MD5SIG
 /* TCP AF-Specific parts; only used by MD5 Signature support so far */
const struct tcp_sock_af_ops*af_specific;
-
-/* TCP MD5 Signature Option information */
-   struct tcp_md5sig_info  __rcu *md5sig_info;
 #endif
 
 /* TCP fastopen related information */
@@ -451,9 +448,6 @@ struct tcp_timewait_sock {
long  tw_ts_recent_stamp;
 
struct hlist_head tcp_option_list;
-#ifdef CONFIG_TCP_MD5SIG
-   struct tcp_md5sig_key *tw_md5_key;
-#endif
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
diff --git a/include/linux/tcp_md5.h b/include/linux/tcp_md5.h
index d4a2175030d0..94a29c4f6fd1 100644
--- a/include/linux/tcp_md5.h
+++ b/include/linux/tcp_md5.h
@@ -27,25 +27,6 @@ struct tcp_md5sig_key {
struct rcu_head rcu;
 };
 
-/* - sock block */
-struct tcp_md5sig_info {
-   struct hlist_head   head;
-   struct rcu_head rcu;
-};
-
-union tcp_md5sum_block {
-   struct tcp4_pseudohdr ip4;
-#if IS_ENABLED(CONFIG_IPV6)
-   struct tcp6_pseudohdr ip6;
-#endif
-};
-
-/* - pool: digest algorithm, hash description and scratch buffer */
-struct tcp_md5sig_pool {
-   struct ahash_request*md5_req;
-   void*scratch;
-};
-
 extern const struct tcp_sock_af_ops tcp_sock_ipv4_specific;
 extern const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
 extern const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
@@ -57,37 +38,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct 
tcp_md5sig_key *key,
 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 const struct sock *addr_sk);
 
-void tcp_v4_md5_destroy_sock(struct sock *sk);
-
-int tcp_v4_md5_send_response_prepare(struct sk_buff *skb, u8 flags,
-unsigned int remaining,
-struct tcp_out_options *opts,
-const struct sock *sk);
-
-void tcp_v4_md5_send_response_write(__be32 *topt, struct sk_buff *skb,
-   struct tcphdr *t1,
-   struct tcp_out_options *opts,
-   const struct sock *sk);
-
-int tcp_v6_md5_send_response_prepare(struct

[RFC v2 08/14] tcp_md5: Detect key inside tcp_v4_send_ack instead of passing it as an argument

2018-01-31 Thread Christoph Paasch

This will simplify to consolidate the TCP_MD5-code into a single place.

Cc: Ivan Delalande 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 net/ipv4/tcp_ipv4.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4112594d04be..4211f8e38ef9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -764,7 +764,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb)
 static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
-   struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
 {
const struct tcphdr *th = tcp_hdr(skb);
@@ -773,6 +772,9 @@ static void tcp_v4_send_ack(const struct sock *sk,
__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
} rep;
struct hlist_head *extopt_list = NULL;
+#ifdef CONFIG_TCP_MD5SIG
+   struct tcp_md5sig_key *key;
+#endif
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
int offset = 0;
@@ -803,6 +805,17 @@ static void tcp_v4_send_ack(const struct sock *sk,
rep.th.ack = 1;
rep.th.window  = htons(win);
 
+#ifdef CONFIG_TCP_MD5SIG
+   if (sk->sk_state == TCP_TIME_WAIT) {
+   key = tcp_twsk_md5_key(tcp_twsk(sk));
+   } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
+   key = tcp_md5_do_lookup(sk, (union tcp_md5_addr 
*)_hdr(skb)->saddr,
+   AF_INET);
+   } else {
+   key = NULL; /* Should not happen */
+   }
+#endif
+
if (unlikely(extopt_list && !hlist_empty(extopt_list))) {
unsigned int remaining;
struct tcp_out_options opts;
@@ -872,7 +885,6 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct 
sk_buff *skb)
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
tw->tw_bound_dev_if,
-   tcp_twsk_md5_key(tcptw),
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
tw->tw_tos
);
@@ -900,8 +912,6 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, 
struct sk_buff *skb,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent,
0,
-   tcp_md5_do_lookup(sk, (union tcp_md5_addr 
*)_hdr(skb)->saddr,
- AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 
0,
ip_hdr(skb)->tos);
 }
-- 
2.16.1

[RFC v2 09/14] tcp_md5: Detect key inside tcp_v6_send_response instead of passing it as an argument

2018-01-31 Thread Christoph Paasch

We want to move all the TCP-MD5 code to a single place which enables us
to factor the TCP-MD5 code out of the TCP-stack into the extra-option
framework.

Detection of whether or not to drop the segment (as done in
tcp_v6_send_reset()) has now been moved to tcp_v6_send_response().
So we needed to adapt the latter so that it can handle the case where we
want to exit without sending anything.

Cc: Ivan Delalande 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 net/ipv6/tcp_ipv6.c | 119 +---
 1 file changed, 57 insertions(+), 62 deletions(-)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 202bf011f462..8c6d0362299e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -82,12 +82,6 @@ static const struct inet_connection_sock_af_ops 
ipv6_specific;
 #ifdef CONFIG_TCP_MD5SIG
 static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
 static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
-#else
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
-  const struct in6_addr *addr)
-{
-   return NULL;
-}
 #endif
 
 static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
@@ -779,12 +773,11 @@ static const struct tcp_request_sock_ops 
tcp_request_sock_ipv6_ops = {
 
 static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, 
u32 seq,
 u32 ack, u32 win, u32 tsval, u32 tsecr,
-int oif, struct tcp_md5sig_key *key, int rst,
-u8 tclass, __be32 label)
+int oif, int rst, u8 tclass, __be32 label)
 {
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
-   struct sk_buff *buff;
+   struct sk_buff *buff = NULL;
struct flowi6 fl6;
struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
struct sock *ctl_sk = net->ipv6.tcp_sk;
@@ -793,10 +786,54 @@ static void tcp_v6_send_response(const struct sock *sk, 
struct sk_buff *skb, u32
__be32 *topt;
struct hlist_head *extopt_list = NULL;
struct tcp_out_options extraopts;
+#ifdef CONFIG_TCP_MD5SIG
+   struct tcp_md5sig_key *key = NULL;
+   const __u8 *hash_location = NULL;
+   struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+#endif
 
if (tsecr)
tot_len += TCPOLEN_TSTAMP_ALIGNED;
 #ifdef CONFIG_TCP_MD5SIG
+   rcu_read_lock();
+   hash_location = tcp_parse_md5sig_option(th);
+   if (sk && sk_fullsock(sk)) {
+   key = tcp_v6_md5_do_lookup(sk, >saddr);
+   } else if (sk && sk->sk_state == TCP_TIME_WAIT) {
+   struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+
+   key = tcp_twsk_md5_key(tcptw);
+   } else if (sk && sk->sk_state == TCP_NEW_SYN_RECV) {
+   key = tcp_v6_md5_do_lookup(sk, >saddr);
+   } else if (hash_location) {
+   unsigned char newhash[16];
+   struct sock *sk1 = NULL;
+   int genhash;
+
+   /* active side is lost. Try to find listening socket through
+* source port, and then find md5 key through listening socket.
+* we are not loose security here:
+* Incoming packet is checked with md5 hash with finding key,
+* no RST generated if md5 hash doesn't match.
+*/
+   sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
+   _hashinfo, NULL, 0,
+   >saddr,
+   th->source, >daddr,
+   ntohs(th->source), tcp_v6_iif(skb),
+   tcp_v6_sdif(skb));
+   if (!sk1)
+   goto out;
+
+   key = tcp_v6_md5_do_lookup(sk1, >saddr);
+   if (!key)
+   goto out;
+
+   genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb);
+   if (genhash || memcmp(hash_location, newhash, 16) != 0)
+   goto out;
+   }
+
if (key)
tot_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
@@ -823,7 +860,7 @@ static void tcp_v6_send_response(const struct sock *sk, 
struct sk_buff *skb, u32
buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
 GFP_ATOMIC);
if (!buff)
-   return;
+   goto out;
 
skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
 
@@ -900,24 +937,21 @@ static void tcp_v6_send_response(const struct sock *sk, 
struct sk_buff *skb, u32
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net,

[RFC v2 10/14] tcp_md5: Check for TCP_MD5 after TCP Timestamps in tcp_established_options

2018-01-31 Thread Christoph Paasch

It really does not matter, because we never use TCP timestamps when
TCP_MD5 is enabled (see tcp_syn_options).

Moving TCP_MD5 a bit lower allows for easier adoption of the
tcp_extra_option framework.

Cc: Ivan Delalande 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 net/ipv4/tcp_output.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index facbdf4fe9be..97e6aecc03eb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -662,6 +662,13 @@ static unsigned int tcp_established_options(struct sock 
*sk, struct sk_buff *skb
 
opts->options = 0;
 
+   if (likely(tp->rx_opt.tstamp_ok)) {
+   opts->options |= OPTION_TS;
+   opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
+   opts->tsecr = tp->rx_opt.ts_recent;
+   size += TCPOLEN_TSTAMP_ALIGNED;
+   }
+
 #ifdef CONFIG_TCP_MD5SIG
opts->md5 = tp->af_specific->md5_lookup(sk, sk);
if (unlikely(opts->md5)) {
@@ -672,13 +679,6 @@ static unsigned int tcp_established_options(struct sock 
*sk, struct sk_buff *skb
opts->md5 = NULL;
 #endif
 
-   if (likely(tp->rx_opt.tstamp_ok)) {
-   opts->options |= OPTION_TS;
-   opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
-   opts->tsecr = tp->rx_opt.ts_recent;
-   size += TCPOLEN_TSTAMP_ALIGNED;
-   }
-
if (unlikely(!hlist_empty(>tcp_option_list)))
size += tcp_extopt_prepare(skb, 0, MAX_TCP_OPTION_SPACE - size,
   opts, tcp_to_sk(tp));
-- 
2.16.1

[RFC v2 11/14] tcp_md5: Move TCP-MD5 code out of TCP itself

2018-01-31 Thread Christoph Paasch

This is all just copy-pasting the TCP_MD5-code into functions that are
placed in net/ipv4/tcp_md5.c.

Cc: Ivan Delalande 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---

Notes:
v2: * Add SPDX-identifier (Mat Martineau's feedback)

 include/linux/inet_diag.h |1 +
 include/linux/tcp_md5.h   |  137 ++
 include/net/tcp.h |   77 
 net/ipv4/Makefile |1 +
 net/ipv4/tcp.c|  133 +-
 net/ipv4/tcp_diag.c   |   81 +---
 net/ipv4/tcp_input.c  |   38 --
 net/ipv4/tcp_ipv4.c   |  520 ++---
 net/ipv4/tcp_md5.c| 1103 +
 net/ipv4/tcp_minisocks.c  |   23 +-
 net/ipv4/tcp_output.c |4 +-
 net/ipv6/tcp_ipv6.c   |  318 +
 12 files changed, 1304 insertions(+), 1132 deletions(-)
 create mode 100644 include/linux/tcp_md5.h
 create mode 100644 net/ipv4/tcp_md5.c

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 39faaaf843e1..1ef6727e41c9 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -2,6 +2,7 @@
 #ifndef _INET_DIAG_H_
 #define _INET_DIAG_H_ 1
 
+#include 
 #include 
 
 struct net;
diff --git a/include/linux/tcp_md5.h b/include/linux/tcp_md5.h
new file mode 100644
index ..d4a2175030d0
--- /dev/null
+++ b/include/linux/tcp_md5.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TCP_MD5_H
+#define _LINUX_TCP_MD5_H
+
+#include 
+
+#ifdef CONFIG_TCP_MD5SIG
+#include 
+
+#include 
+
+union tcp_md5_addr {
+   struct in_addr  a4;
+#if IS_ENABLED(CONFIG_IPV6)
+   struct in6_addr a6;
+#endif
+};
+
+/* - key database */
+struct tcp_md5sig_key {
+   struct hlist_node   node;
+   u8  keylen;
+   u8  family; /* AF_INET or AF_INET6 */
+   union tcp_md5_addr  addr;
+   u8  prefixlen;
+   u8  key[TCP_MD5SIG_MAXKEYLEN];
+   struct rcu_head rcu;
+};
+
+/* - sock block */
+struct tcp_md5sig_info {
+   struct hlist_head   head;
+   struct rcu_head rcu;
+};
+
+union tcp_md5sum_block {
+   struct tcp4_pseudohdr ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+   struct tcp6_pseudohdr ip6;
+#endif
+};
+
+/* - pool: digest algorithm, hash description and scratch buffer */
+struct tcp_md5sig_pool {
+   struct ahash_request*md5_req;
+   void*scratch;
+};
+
+extern const struct tcp_sock_af_ops tcp_sock_ipv4_specific;
+extern const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
+extern const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
+
+/* - functions */
+int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+   const struct sock *sk, const struct sk_buff *skb);
+
+struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
+const struct sock *addr_sk);
+
+void tcp_v4_md5_destroy_sock(struct sock *sk);
+
+int tcp_v4_md5_send_response_prepare(struct sk_buff *skb, u8 flags,
+unsigned int remaining,
+struct tcp_out_options *opts,
+const struct sock *sk);
+
+void tcp_v4_md5_send_response_write(__be32 *topt, struct sk_buff *skb,
+   struct tcphdr *t1,
+   struct tcp_out_options *opts,
+   const struct sock *sk);
+
+int tcp_v6_md5_send_response_prepare(struct sk_buff *skb, u8 flags,
+unsigned int remaining,
+struct tcp_out_options *opts,
+const struct sock *sk);
+
+void tcp_v6_md5_send_response_write(__be32 *topt, struct sk_buff *skb,
+   struct tcphdr *t1,
+   struct tcp_out_options *opts,
+   const struct sock *sk);
+
+bool tcp_v4_inbound_md5_hash(const struct sock *sk,
+const struct sk_buff *skb);
+
+void tcp_v4_md5_syn_recv_sock(const struct sock *listener, struct sock *sk);
+
+void tcp_v6_md5_syn_recv_sock(const struct sock *listener, struct sock *sk);
+
+void tcp_md5_time_wait(struct sock *sk, struct inet_timewait_sock *tw);
+
+struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
+const struct sock *addr_sk);
+
+int tcp_v6_md5_hash_skb(char *md5_hash,
+   const struct tcp_md5sig_key *key,
+   const struct sock *sk,
+   const struct sk_buff *skb);
+
+bool tcp_v6_inbound_md5_hash(const struct sock *sk,
+const struct sk_buff *skb);
+
+static inline void tcp_md5_twsk_destructor(struct tcp_timewait_sock

[RFC v2 03/14] tcp: Allow tcp_fast_parse_options to drop segments

2018-01-31 Thread Christoph Paasch

After parsing the TCP-options, some option-kinds might trigger a drop of
the segment (e.g., as is the case for TCP_MD5). As we are moving to
consolidate the TCP_MD5-code in follow-up patches, we need to add the
capability to drop a segment right after parsing the options in
tcp_fast_parse_options().

Originally, tcp_fast_parse_options() returned false, when there is no
timestamp option, except in the case of the slow-path processing through
tcp_parse_options() where it always returns true.

So, the return-value of tcp_fast_parse_options() was kind of
inconsistent. With this patch, we make it return true when the segment
should get dropped based on the parsed options, and false otherwise.

In tcp_validate_incoming, we will then just check for
tp->rx_opt.saw_tstamp to see if we should verify PAWS.

The goto will be used in a follow-up patch to check whether one of the
options triggers a drop of the segment.

Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 net/ipv4/tcp_input.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cfa51cfd2d99..1fbabcc99b62 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3847,6 +3847,8 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock 
*tp, const struct tcphdr
 
 /* Fast parse options. This hopes to only see timestamps.
  * If it is wrong it falls back on tcp_parse_options().
+ *
+ * Returns true if we should drop this packet based on present TCP-options.
  */
 static bool tcp_fast_parse_options(const struct net *net,
   const struct sk_buff *skb,
@@ -3857,18 +3859,19 @@ static bool tcp_fast_parse_options(const struct net 
*net,
 */
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
-   return false;
+   goto extra_opt_check;
} else if (tp->rx_opt.tstamp_ok &&
   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
-   return true;
+   goto extra_opt_check;
}
 
tcp_parse_options(net, skb, >rx_opt, 1, NULL);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
 
-   return true;
+extra_opt_check:
+   return false;
 }
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -5188,9 +5191,11 @@ static bool tcp_validate_incoming(struct sock *sk, 
struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
bool rst_seq_match = false;
 
+   if (tcp_fast_parse_options(sock_net(sk), skb, th, tp))
+   goto discard;
+
/* RFC1323: H1. Apply PAWS check first. */
-   if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
-   tp->rx_opt.saw_tstamp &&
+   if (tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS(sock_net(sk), 
LINUX_MIB_PAWSESTABREJECTED);
-- 
2.16.1

[RFC v2 01/14] tcp: Write options after the header has been fully done

2018-01-31 Thread Christoph Paasch

The generic TCP-option framework will need to have access to the full
TCP-header (e.g., if we want to compute a checksum for TCP-MD5).

Thus, we move the call to tcp_options_write() to after all the fields in
the header have been filled out.

Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 net/ipv4/tcp_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e9f985e42405..df50c7dc1a43 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1126,7 +1126,6 @@ static int tcp_transmit_skb(struct sock *sk, struct 
sk_buff *skb, int clone_it,
}
}
 
-   tcp_options_write((__be32 *)(th + 1), tp, );
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
th->window  = htons(tcp_select_window(sk));
@@ -1137,6 +1136,7 @@ static int tcp_transmit_skb(struct sock *sk, struct 
sk_buff *skb, int clone_it,
 */
th->window  = htons(min(tp->rcv_wnd, 65535U));
}
+   tcp_options_write((__be32 *)(th + 1), tp, );
 #ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
@@ -3247,8 +3247,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, 
struct dst_entry *dst,
 
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-   tcp_options_write((__be32 *)(th + 1), NULL, );
th->doff = (tcp_header_size >> 2);
+   tcp_options_write((__be32 *)(th + 1), NULL, );
__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
-- 
2.16.1

[RFC v2 14/14] tcp_md5: Use TCP extra-options on the input path

2018-01-31 Thread Christoph Paasch

The checks are now being done through the extra-option framework. For
TCP MD5 this means that the check happens a bit later than usual.

Cc: Ivan Delalande 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 include/linux/tcp_md5.h | 23 +--
 net/ipv4/tcp_input.c|  8 
 net/ipv4/tcp_ipv4.c |  9 -
 net/ipv4/tcp_md5.c  | 29 -
 net/ipv6/tcp_ipv6.c |  9 -
 5 files changed, 25 insertions(+), 53 deletions(-)

diff --git a/include/linux/tcp_md5.h b/include/linux/tcp_md5.h
index 441be65ec893..fe84c706299c 100644
--- a/include/linux/tcp_md5.h
+++ b/include/linux/tcp_md5.h
@@ -32,30 +32,9 @@ struct tcp_md5sig_key {
 int tcp_md5_parse_keys(struct sock *sk, int optname, char __user *optval,
   int optlen);
 
-bool tcp_v4_inbound_md5_hash(const struct sock *sk,
-const struct sk_buff *skb);
-
-bool tcp_v6_inbound_md5_hash(const struct sock *sk,
-const struct sk_buff *skb);
-
 int tcp_md5_diag_get_aux(struct sock *sk, bool net_admin, struct sk_buff *skb);
 
 int tcp_md5_diag_get_aux_size(struct sock *sk, bool net_admin);
 
-#else
-
-static inline bool tcp_v4_inbound_md5_hash(const struct sock *sk,
-  const struct sk_buff *skb)
-{
-   return false;
-}
-
-static inline bool tcp_v6_inbound_md5_hash(const struct sock *sk,
-  const struct sk_buff *skb)
-{
-   return false;
-}
-
-#endif
-
+#endif /* CONFIG_TCP_MD5SIG */
 #endif /* _LINUX_TCP_MD5_H */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1ac1d8d431ad..56cdc3093d6a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3774,14 +3774,6 @@ void tcp_parse_options(const struct net *net,
TCP_SKB_CB(skb)->sacked = (ptr - 2) - 
(unsigned char *)th;
}
break;
-#ifdef CONFIG_TCP_MD5SIG
-   case TCPOPT_MD5SIG:
-   /*
-* The MD5 Hash has already been
-* checked (see tcp_v{4,6}_do_rcv()).
-*/
-   break;
-#endif
case TCPOPT_FASTOPEN:
tcp_parse_fastopen_option(
opsize - TCPOLEN_FASTOPEN_BASE,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6a839c1280b3..c5405bd62322 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -62,7 +62,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
@@ -1249,11 +1248,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
struct sock *nsk;
 
sk = req->rsk_listener;
-   if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
-   sk_drops_add(sk, skb);
-   reqsk_put(req);
-   goto discard_it;
-   }
if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
@@ -1293,9 +1287,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
 
-   if (tcp_v4_inbound_md5_hash(sk, skb))
-   goto discard_and_relse;
-
nf_reset(skb);
 
if (tcp_filter(sk, skb))
diff --git a/net/ipv4/tcp_md5.c b/net/ipv4/tcp_md5.c
index e05db5af06ee..ad41b9fd6f88 100644
--- a/net/ipv4/tcp_md5.c
+++ b/net/ipv4/tcp_md5.c
@@ -30,6 +30,10 @@ static DEFINE_PER_CPU(struct tcp_md5sig_pool, 
tcp_md5sig_pool);
 static DEFINE_MUTEX(tcp_md5sig_mutex);
 static bool tcp_md5sig_pool_populated;
 
+static bool tcp_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb,
+struct tcp_options_received *opt_rx,
+struct tcp_extopt_store *store);
+
 static unsigned int tcp_md5_extopt_prepare(struct sk_buff *skb, u8 flags,
   unsigned int remaining,
   struct tcp_out_options *opts,
@@ -77,6 +81,7 @@ struct tcp_md5_extopt {
 
 static const struct tcp_extopt_ops tcp_md5_extra_ops = {
.option_kind= TCPOPT_MD5SIG,
+   .check  = tcp_inbound_md5_hash,
.prepare= tcp_md5_extopt_prepare,
.write  = tcp_md5_extopt_write,
.response_prepare   = tcp_md5_send_response_prepare,
@@ -863,8 +868,8 @@ static struct tcp_md5sig_key *tcp_v4_md5_lookup(const 
struct sock *sk,
 }
 
 /* Called with rcu_read_lock() */
-bool tcp_v4_inbound_md5_hash(const struct sock *sk,
-const struct sk_buff

[RFC v2 05/14] tcp: Register handlers for extra TCP options

2018-01-31 Thread Christoph Paasch

From: Mat Martineau 

Allow additional TCP options to be handled by registered hook
functions.

Registered options have a priority that determines the order in which
options are prepared and written. Lower priority numbers are handled
first.

Option parsing will call the provided 'parse' function when a TCP option
number is not recognized by the normal option parsing code.

After parsing, there are two places where we post-process the options.
First, a 'check' callback that allows to drop the packet based on the
parsed options (e.g., useful for TCP MD5SIG). Then, a 'post_process'
function that gets called after other validity checks (aka, in-window,
PAWS,...). This post_process function can then update other state for
this particular extra-option.

In the output-path, the 'prepare' function determines the required space
for registered options and store associated data. 'write' adds the option
to the TCP header.

These additional TCP-options are stored in hlists of the TCP-socket. To
pass the state and options around during the 3-way handshake and in
time-wait state, the hlists are also on the tcp_request_sock and
tcp_timewait_sock.
The list is copied from the listener to the request-socket (calling into
the 'copy' callback). Then, moved from the request-socket to the
TCP-socket and finally to the time-wait socket.

Signed-off-by: Mat Martineau 
Signed-off-by: Christoph Paasch 
---

Notes:
v2: * Fix a compiler error in tcp_twsk_destructor when TCP_MD5SIG is 
disabled

 drivers/infiniband/hw/cxgb4/cm.c |   2 +-
 include/linux/tcp.h  |  28 
 include/net/tcp.h| 110 -
 net/ipv4/syncookies.c|   6 +-
 net/ipv4/tcp.c   | 327 ++-
 net/ipv4/tcp_input.c |  49 +-
 net/ipv4/tcp_ipv4.c  |  98 +---
 net/ipv4/tcp_minisocks.c |  34 +++-
 net/ipv4/tcp_output.c|  40 ++---
 net/ipv6/syncookies.c|   6 +-
 net/ipv6/tcp_ipv6.c  |  32 
 11 files changed, 677 insertions(+), 55 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 21db3b48a617..a1ea5583f07b 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3746,7 +3746,7 @@ static void build_cpl_pass_accept_req(struct sk_buff 
*skb, int stid , u8 tos)
 */
memset(_opt, 0, sizeof(tmp_opt));
tcp_clear_options(_opt);
-   tcp_parse_options(_net, skb, _opt, 0, NULL);
+   tcp_parse_options(_net, skb, _opt, 0, NULL, NULL);
 
req = __skb_push(skb, sizeof(*req));
memset(req, 0, sizeof(*req));
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8f4c54986f97..6e1f0f29bf24 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -115,6 +115,24 @@ static inline void tcp_clear_options(struct 
tcp_options_received *rx_opt)
 #endif
 }
 
+#define OPTION_SACK_ADVERTISE  (1 << 0)
+#define OPTION_TS  (1 << 1)
+#define OPTION_MD5 (1 << 2)
+#define OPTION_WSCALE  (1 << 3)
+#define OPTION_FAST_OPEN_COOKIE(1 << 8)
+#define OPTION_SMC (1 << 9)
+
+struct tcp_out_options {
+   u16 options;/* bit field of OPTION_* */
+   u16 mss;/* 0 to disable */
+   u8 ws;  /* window scale, 0 to disable */
+   u8 num_sack_blocks; /* number of SACK blocks to include */
+   u8 hash_size;   /* bytes in hash_location */
+   __u8 *hash_location;/* temporary pointer, overloaded */
+   __u32 tsval, tsecr; /* need to include OPTION_TS */
+   struct tcp_fastopen_cookie *fastopen_cookie;/* Fast open cookie */
+};
+
 /* This is the max number of SACKS that we'll generate and process. It's safe
  * to increase this, although since:
  *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
@@ -137,6 +155,7 @@ struct tcp_request_sock {
  * FastOpen it's the seq#
  * after data-in-SYN.
  */
+   struct hlist_head   tcp_option_list;
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
@@ -384,6 +403,8 @@ struct tcp_sock {
 */
struct request_sock *fastopen_rsk;
u32 *saved_syn;
+
+   struct hlist_head tcp_option_list;
 };
 
 enum tsq_enum {
@@ -411,6 +432,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock 
*sk)
return (struct tcp_sock *)sk;
 }
 
+static inline struct sock *tcp_to_sk(const struct tcp_sock *tp)
+{
+   return (struct sock *)tp;
+}
+
 struct tcp_timewait_sock {
struct inet_timewait_sock tw_sk;
 #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
@@ -423,6 +449,8 @@ struct

[RFC v2 07/14] tcp_md5: Don't pass along md5-key

2018-01-31 Thread Christoph Paasch

It is much cleaner to store the key-pointer in tcp_out_options. It
allows to remove some MD5-specific code out of the function-arguments
and paves the way to adopting the TCP-option framework with TCP-MD5.

Cc: Ivan Delalande 
Signed-off-by: Christoph Paasch 
Reviewed-by: Mat Martineau 
---
 include/linux/tcp.h   |  1 +
 net/ipv4/tcp_output.c | 46 +++---
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0958b3760cfc..ef0279194ef9 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -131,6 +131,7 @@ struct tcp_out_options {
__u8 *hash_location;/* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie;/* Fast open cookie */
+   struct tcp_md5sig_key *md5; /* TCP_MD5 signature key */
 };
 
 /* This is the max number of SACKS that we'll generate and process. It's safe
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 549e33a30b41..facbdf4fe9be 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -520,21 +520,18 @@ static void tcp_options_write(__be32 *ptr, struct sk_buff 
*skb, struct sock *sk,
  * network wire format yet.
  */
 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
-   struct tcp_out_options *opts,
-   struct tcp_md5sig_key **md5)
+   struct tcp_out_options *opts)
 {
struct tcp_sock *tp = tcp_sk(sk);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
 #ifdef CONFIG_TCP_MD5SIG
-   *md5 = tp->af_specific->md5_lookup(sk, sk);
-   if (*md5) {
+   opts->md5 = tp->af_specific->md5_lookup(sk, sk);
+   if (opts->md5) {
opts->options |= OPTION_MD5;
remaining -= TCPOLEN_MD5SIG_ALIGNED;
}
-#else
-   *md5 = NULL;
 #endif
 
/* We always get an MSS option.  The option bytes which will be seen in
@@ -549,7 +546,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct 
sk_buff *skb,
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
 
-   if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
+   if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !opts->md5)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
@@ -593,14 +590,13 @@ static unsigned int tcp_synack_options(const struct sock 
*sk,
   struct request_sock *req,
   unsigned int mss, struct sk_buff *skb,
   struct tcp_out_options *opts,
-  const struct tcp_md5sig_key *md5,
   struct tcp_fastopen_cookie *foc)
 {
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
 
 #ifdef CONFIG_TCP_MD5SIG
-   if (md5) {
+   if (opts->md5) {
opts->options |= OPTION_MD5;
remaining -= TCPOLEN_MD5SIG_ALIGNED;
 
@@ -658,8 +654,7 @@ static unsigned int tcp_synack_options(const struct sock 
*sk,
  * final wire format yet.
  */
 static unsigned int tcp_established_options(struct sock *sk, struct sk_buff 
*skb,
-   struct tcp_out_options *opts,
-   struct tcp_md5sig_key **md5)
+   struct tcp_out_options *opts)
 {
struct tcp_sock *tp = tcp_sk(sk);
unsigned int size = 0;
@@ -668,13 +663,13 @@ static unsigned int tcp_established_options(struct sock 
*sk, struct sk_buff *skb
opts->options = 0;
 
 #ifdef CONFIG_TCP_MD5SIG
-   *md5 = tp->af_specific->md5_lookup(sk, sk);
-   if (unlikely(*md5)) {
+   opts->md5 = tp->af_specific->md5_lookup(sk, sk);
+   if (unlikely(opts->md5)) {
opts->options |= OPTION_MD5;
size += TCPOLEN_MD5SIG_ALIGNED;
}
 #else
-   *md5 = NULL;
+   opts->md5 = NULL;
 #endif
 
if (likely(tp->rx_opt.tstamp_ok)) {
@@ -992,7 +987,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff 
*skb, int clone_it,
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
struct sk_buff *oskb = NULL;
-   struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
 
@@ -1021,10 +1015,9 @@ static int tcp_transmit_skb(struct sock *sk, struct 
sk_buff *skb, int clone_it,
memset(, 0, sizeof(opts));
 
if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
-   tcp_options_size =

Re: [PATCH iproute2-next 0/4] ip: Introduce and use helper to read /proc/net/dev

2018-01-31 Thread Stephen Hemminger

On Wed, 31 Jan 2018 21:49:45 +0200
Serhey Popovych  wrote:

> Currently there is two places in ip(8) where /proc/net/dev is read line
> by line with nearly identical steps: iptunnel.c and ip6tunnel.c
> 
> On the other hand we have iptuntap.c that uses /sys/class/net that could
> be problematic in case of unshare(1)d network namespace without sysfs
> being mounted.
> 
> Introduce and use do_each_proc_net_dev() helper to read data from
> /proc/net/dev line by line and pass this information to implementation
> specific callback function.
> 
> See individual patch description message for more details.
> 
> Series is open for reviews and comments.
> 
> Tested only by compiling and executing ip [-46] [-s] [-d] tunnel in
> various combinations: no problem so far. More can be done by request.
> 
> Thanks,
> Serhii
> 
> Serhey Popovych (4):
>   utils: Introduce do_each_proc_net_dev() helper
>   iptunnel: Use do_each_proc_net_dev()
>   ip6tunnel: Use do_each_proc_net_dev()
>   tuntap: Use do_each_proc_net_dev()
> 
>  include/utils.h |   10 ++
>  ip/ip6tunnel.c  |   94 --
>  ip/iptunnel.c   |  102 
> +--
>  ip/iptuntap.c   |   59 ++--
>  lib/utils.c |   51 
>  5 files changed, 170 insertions(+), 146 deletions(-)
> 

/proc/net/dev is legacy and unextensible.

I would rather see netlink used everywhere and not /proc/net/dev or sysfs!

[Patch net] xt_cgroup: initialize info->priv in cgroup_mt_check_v1()

2018-01-31 Thread Cong Wang

xt_cgroup_info_v1->priv is an internal pointer only used for kernel,
we should not trust what user-space provides.

Reported-by: 
Fixes: c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/xt_cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 1db1ce59079f..891f4e7e8ea7 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -52,6 +52,7 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param 
*par)
return -EINVAL;
}
 
+   info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
if (IS_ERR(cgrp)) {
-- 
2.13.0

Re: [RFC PATCH 1/2] hv_netvsc: Split netvsc_revoke_buf() and netvsc_teardown_gpadl()

2018-01-31 Thread Stephen Hemminger

On Wed, 31 Jan 2018 12:16:49 +0100
Mohammed Gamal  wrote:

> On Tue, 2018-01-30 at 11:29 -0800, Stephen Hemminger wrote:
> > On Tue, 23 Jan 2018 10:34:04 +0100
> > Mohammed Gamal  wrote:
> >   
> > > Split each of the functions into two for each of send/recv buffers
> > > 
> > > Signed-off-by: Mohammed Gamal   
> > 
> > Splitting these functions is not necessary  
> 
> How so? We need to send each message independently, and hence the split
> (see cover letter). Is there another way?

This is all that is needed.


Subject: [PATCH] hv_netvsc: work around for gpadl teardown on older windows
 server

On WS2012 the host ignores messages after vmbus channel is closed.
Workaround this by doing what Windows does and send the teardown
before close on older versions of NVSP protocol.

Reported-by: Mohammed Gamal 
Fixes: 0cf737808ae7 ("hv_netvsc: netvsc_teardown_gpadl() split")
Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 17e529af79dc..1a3df0eff42f 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -574,10 +574,17 @@ void netvsc_device_remove(struct hv_device *device)
 */
netdev_dbg(ndev, "net device safe to remove\n");
 
+   /* Workaround for older versions of Windows require that
+* buffer be revoked before channel is disabled
+*/
+   if (net_device->nvsp_version < NVSP_PROTOCOL_VERSION_4)
+   netvsc_teardown_gpadl(device, net_device);
+
/* Now, we can close the channel safely */
vmbus_close(device->channel);
 
-   netvsc_teardown_gpadl(device, net_device);
+   if (net_device->nvsp_version >= NVSP_PROTOCOL_VERSION_4)
+   netvsc_teardown_gpadl(device, net_device);
 
/* And dissassociate NAPI context from device */
for (i = 0; i < net_device->num_chn; i++)
-- 
2.15.1

[PATCH] iommu/vt-d: add NUMA awareness to intel_alloc_coherent()

2018-01-31 Thread Eric Dumazet

From: Eric Dumazet 

Some devices (like mlx4) try hard to allocate memory on selected
NUMA node, but it turns out intel_alloc_coherent() is not NUMA
aware yet.

Note that dma_generic_alloc_coherent() in arch/x86/kernel/pci-dma.c
gets this right.

Signed-off-by: Eric Dumazet 
Cc: Benjamin Serebrin 
Cc: David Woodhouse 
Cc: Joerg Roedel 
---
 drivers/iommu/intel-iommu.c |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 
a1373cf343269455808f66ad18dc0a2fb7aa73f2..0efef077abc099eb29ebc5cefdd1b996f025dffd
 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3734,8 +3734,11 @@ static void *intel_alloc_coherent(struct device *dev, 
size_t size,
}
}
 
-   if (!page)
-   page = alloc_pages(flags, order);
+   if (!page) {
+   page = alloc_pages_node(dev_to_node(dev), flags, order);
+   if (!page)
+   page = alloc_pages(flags, order);
+   }
if (!page)
return NULL;
memset(page_address(page), 0, size);

RE: [net-next 2/7] fm10k: cleanup unnecessary parenthesis in fm10k_iov.c

2018-01-31 Thread Keller, Jacob E

> -Original Message-
> From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org]
> On Behalf Of Joe Perches
> Sent: Wednesday, January 31, 2018 3:35 AM
> To: Kirsher, Jeffrey T ; da...@davemloft.net
> Cc: Keller, Jacob E ; netdev@vger.kernel.org;
> nhor...@redhat.com; sassm...@redhat.com; jogre...@redhat.com
> Subject: Re: [net-next 2/7] fm10k: cleanup unnecessary parenthesis in
> fm10k_iov.c
> 
> On Wed, 2018-01-24 at 14:45 -0800, Jeff Kirsher wrote:
> > This fixes a few warnings found by checkpatch.pl --strict
> []
> > diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
> b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
> []
> > @@ -353,7 +353,7 @@ int fm10k_iov_resume(struct pci_dev *pdev)
> > struct fm10k_vf_info *vf_info = _data->vf_info[i];
> >
> > /* allocate all but the last GLORT to the VFs */
> > -   if (i == ((~hw->mac.dglort_map) >>
> FM10K_DGLORTMAP_MASK_SHIFT))
> > +   if (i == (~hw->mac.dglort_map >>
> FM10K_DGLORTMAP_MASK_SHIFT))
> 
> Strictly, only the if needs parentheses here, but I
> think it reads better before this change.
> 

I don't really find the extra paranthesis around ~hw->mac.dglort_map to be that 
useful here.

Thanks,
Jake

> > @@ -511,7 +511,7 @@ int fm10k_iov_configure(struct pci_dev *pdev, int
> num_vfs)
> > return err;
> >
> > /* allocate VFs if not already allocated */
> > -   if (num_vfs && (num_vfs != current_vfs)) {
> > +   if (num_vfs && num_vfs != current_vfs) {
>

Re: [PATCH net] net: memcontrol: charge allocated memory after mem_cgroup_sk_alloc()

2018-01-31 Thread Roman Gushchin

On Thu, Jan 25, 2018 at 12:03:02PM -0500, David Miller wrote:
> From: Roman Gushchin 
> Date: Thu, 25 Jan 2018 00:19:11 +
> 
> > @@ -476,6 +477,10 @@ struct sock *inet_csk_accept(struct sock *sk, int 
> > flags, int *err, bool kern)
> > spin_unlock_bh(>fastopenq.lock);
> > }
> > mem_cgroup_sk_alloc(newsk);
> > +   amt = sk_memory_allocated(newsk);
> > +   if (amt && newsk->sk_memcg)
> > +   mem_cgroup_charge_skmem(newsk->sk_memcg, amt);
> > +
> 
> This looks confusing to me.
> 
> sk_memory_allocated() is the total amount of memory used by all
> sockets for a particular "struct proto", not just for that specific
> socket.
> 
> Maybe I don't understand how this socket memcg stuff works, but it
> seems like you should be looking instead at how much memory is
> allocated to this specific socket.

So, the patch below takes the per-socket charge into account,
and it _almost_ works: css leak is weaker by a couple orders
of magnitude, but still exists. I believe, the problem is
that we need additional synchronization for sk_memcg and
sk_forward_alloc fields; and I'm really out of ideas how
to do it without heavy artillery like introducing a new
field for unaccounted memcg charge. As I can see, we
check the sk_memcg field without socket lock; and we
do set it from a concurrent context.
Most likely, I do miss something...

So I really start thinking that reverting 9f1c2674b328
("net: memcontrol: defer call to mem_cgroup_sk_alloc()")
and fixing the original issue differently might be easier
and a proper way to go. Does it makes sense?

Thanks!

--

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4ca46dc08e63..287de1501a30 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -476,6 +476,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, 
int *err, bool kern)
spin_unlock_bh(>fastopenq.lock);
}
mem_cgroup_sk_alloc(newsk);
+   if (mem_cgroup_sockets_enabled && newsk->sk_memcg) {
+   int amt = sk_mem_pages(newsk->sk_forward_alloc);
+   if (amt > 0)
+   mem_cgroup_charge_skmem(newsk->sk_memcg, amt);
+   }
+
 out:
release_sock(sk);
if (req)

Re: [PATCH v4 2/2] bnx2x: disable GSO where gso_size is too big for hardware

2018-01-31 Thread Eric Dumazet

On Wed, 2018-01-31 at 14:15 +1100, Daniel Axtens wrote:
> If a bnx2x card is passed a GSO packet with a gso_size larger than
> ~9700 bytes, it will cause a firmware error that will bring the card
> down:
> 
> bnx2x: [bnx2x_attn_int_deasserted3:4323(enP24p1s0f0)]MC assert!
> bnx2x: [bnx2x_mc_assert:720(enP24p1s0f0)]XSTORM_ASSERT_LIST_INDEX 0x2
> bnx2x: [bnx2x_mc_assert:736(enP24p1s0f0)]XSTORM_ASSERT_INDEX 0x0 = 0x 
> 0x25e43e47 0x00463e01 0x00010052
> bnx2x: [bnx2x_mc_assert:750(enP24p1s0f0)]Chip Revision: everest3, FW Version: 
> 7_13_1
> ... (dump of values continues) ...
> 
> Detect when the mac length of a GSO packet is greater than the maximum
> packet size (9700 bytes) and disable GSO.
> 
> Signed-off-by: Daniel Axtens 
> 
> ---
> 
> v4: Only call the slow check if the gso_size is large.
> Eric - I think this is what you had in mind?
> Manish - do you think this is an acceptable performance trade-off?
>  GSO will work for any packet size, and only jumbo frames will
>have to do the slower test.

Yes this looks good to me, thanks !

Reviewed-by: Eric Dumazet

RE: [PATCH v4 2/2] bnx2x: disable GSO where gso_size is too big for hardware

2018-01-31 Thread Daniel Axtens

"Chopra, Manish"  writes:

>> -Original Message-
>> From: Daniel Axtens [mailto:d...@axtens.net]
>> Sent: Wednesday, January 31, 2018 8:46 AM
>> To: netdev@vger.kernel.org
>> Cc: Daniel Axtens ; Eric Dumazet ;
>> Chopra, Manish ; Jason Wang
>> ; Pravin Shelar ; Marcelo Ricardo
>> Leitner 
>> Subject: [PATCH v4 2/2] bnx2x: disable GSO where gso_size is too big for
>> hardware
>> 
>> If a bnx2x card is passed a GSO packet with a gso_size larger than
>> ~9700 bytes, it will cause a firmware error that will bring the card
>> down:
>> 
>> bnx2x: [bnx2x_attn_int_deasserted3:4323(enP24p1s0f0)]MC assert!
>> bnx2x: [bnx2x_mc_assert:720(enP24p1s0f0)]XSTORM_ASSERT_LIST_INDEX 0x2
>> bnx2x: [bnx2x_mc_assert:736(enP24p1s0f0)]XSTORM_ASSERT_INDEX 0x0 =
>> 0x 0x25e43e47 0x00463e01 0x00010052
>> bnx2x: [bnx2x_mc_assert:750(enP24p1s0f0)]Chip Revision: everest3, FW
>> Version: 7_13_1 ... (dump of values continues) ...
>> 
>> Detect when the mac length of a GSO packet is greater than the maximum
>> packet size (9700 bytes) and disable GSO.
>> 
>> Signed-off-by: Daniel Axtens 
>> 
>> ---
>> 
>> v4: Only call the slow check if the gso_size is large.
>> Eric - I think this is what you had in mind?
>> Manish - do you think this is an acceptable performance trade-off?
>>  GSO will work for any packet size, and only jumbo frames will
>>   have to do the slower test.
>> 
>> Again, only build-tested.
>> ---
>>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 18
>> ++
>>  1 file changed, 18 insertions(+)
>> 
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>> index 7b08323e3f3d..74fc9af4aadb 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
>> @@ -12934,6 +12934,24 @@ static netdev_features_t
>> bnx2x_features_check(struct sk_buff *skb,
>>struct net_device *dev,
>>netdev_features_t features)
>>  {
>> +/*
>> + * A skb with gso_size + header length > 9700 will cause a
>> + * firmware panic. Drop GSO support.
>> + *
>> + * Eventually the upper layer should not pass these packets down.
>> + *
>> + * For speed, if the gso_size is <= 9000, assume there will
>> + * not be 700 bytes of headers and pass it through. Only do a
>> + * full (slow) validation if the gso_size is > 9000.
>> + *
>> + * (Due to the way SKB_BY_FRAGS works this will also do a full
>> + * validation in that case.)
>> + */
>> +if (unlikely(skb_is_gso(skb) &&
>> + (skb_shinfo(skb)->gso_size > 9000) &&
>> + !skb_gso_validate_mac_len(skb, 9700)))
>> +features &= ~NETIF_F_GSO_MASK;
>
> Hi Daniel,
>
> Obviously, it could be bad from performance perspective since every gso 
> packet has to do these check.
> When running iperf/netperf performance benchmark, where GSO is likely to 
> occur.
>
> Why do you have to put two checks for skb_is_gso() and gso_size ? Isn't 
> gso_size > anything means GSO skb ?

Yes, the check is redundant. I do it to make my logic clearer.

The compiler will optimise it into one check. I compiled with gcc-7.2
and gcc-4.8, both targeting amd64, and in both cases I could only find
one relevant cmp:

skb_is_gso():
/home/dja/dev/linux/linux/include/linux/skbuff.h:4032
3686:   48 8b 8f d0 00 00 00mov0xd0(%rdi),%rcx
bnx2x_features_check():
/home/dja/dev/linux/linux/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c:12950
368d:   66 81 7c 01 04 28 23cmpw   $0x2328,0x4(%rcx,%rax,1)
3694:   0f 87 ea 01 00 00   ja 3884 

0x2328 is decimal 9000.

> I assume it won't cause disabling the offload if "headers [L2 + L3 + L4] + 
> gso_size" is still <= 9700. ?

That is correct. The flow is:

If the gso_size is less than or equal to 9000, offload is enabled with
no further checks. This is the fast path.

If the gso_size is greater than 9000, then the "headers [L2 + L3 + L4] +
gso_size" is checked. This is an out-of-line check done once per GSO skb.

If headers + gso_size is <= 9700, offload is enabled.

If headers + gso_size > 9700, offload is disabled.

Regards,
Daniel
>
> Thanks,
> Manish

Re: KASAN: slab-out-of-bounds Read in sctp_send_reset_streams

2018-01-31 Thread Eric Biggers

On Sat, Dec 09, 2017 at 02:40:00AM -0800, syzbot wrote:
> Hello,
> 
> syzkaller hit the following crash on
> 328b4ed93b69a6f2083d52f31a240a09e5de386a
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> C reproducer is attached
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers
> 
> 
> ==
> BUG: KASAN: slab-out-of-bounds in sctp_send_reset_streams+0xadf/0xc10
> net/sctp/stream.c:314
> Read of size 2 at addr 8801d8a6c048 by task syzkaller104411/3085
> 
> CPU: 0 PID: 3085 Comm: syzkaller104411 Not tainted 4.15.0-rc2+ #119
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x257 lib/dump_stack.c:53
>  print_address_description+0x73/0x250 mm/kasan/report.c:252
>  kasan_report_error mm/kasan/report.c:351 [inline]
>  kasan_report+0x25b/0x340 mm/kasan/report.c:409
>  __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:428
>  sctp_send_reset_streams+0xadf/0xc10 net/sctp/stream.c:314
>  sctp_setsockopt_reset_streams net/sctp/socket.c:3905 [inline]
>  sctp_setsockopt+0x70d/0x5d50 net/sctp/socket.c:4195
>  compat_sock_common_setsockopt+0x104/0x140 net/core/sock.c:2981
>  C_SYSC_setsockopt net/compat.c:403 [inline]
>  compat_SyS_setsockopt+0x17c/0x410 net/compat.c:386
>  do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
>  do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
>  entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125
> RIP: 0023:0xf7f1bc79
> RSP: 002b:ff9893bc EFLAGS: 0282 ORIG_RAX: 016e
> RAX: ffda RBX: 0005 RCX: 0084
> RDX: 0077 RSI: 2018b000 RDI: 0008
> RBP: 001c R08:  R09: 
> R10:  R11:  R12: 
> R13:  R14:  R15: 
> 
> Allocated by task 3085:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
>  __do_kmalloc mm/slab.c:3711 [inline]
>  __kmalloc_track_caller+0x15e/0x760 mm/slab.c:3726
>  memdup_user+0x2c/0x90 mm/util.c:164
>  sctp_setsockopt_reset_streams net/sctp/socket.c:3897 [inline]
>  sctp_setsockopt+0x6a6/0x5d50 net/sctp/socket.c:4195
>  compat_sock_common_setsockopt+0x104/0x140 net/core/sock.c:2981
>  C_SYSC_setsockopt net/compat.c:403 [inline]
>  compat_SyS_setsockopt+0x17c/0x410 net/compat.c:386
>  do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
>  do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
>  entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125
> 
> Freed by task 16:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
>  __cache_free mm/slab.c:3491 [inline]
>  kfree+0xca/0x250 mm/slab.c:3806
>  selinux_cred_free+0x48/0x70 security/selinux/hooks.c:3814
>  security_cred_free+0x48/0x80 security/security.c:995
>  put_cred_rcu+0x106/0x400 kernel/cred.c:117
>  __rcu_reclaim kernel/rcu/rcu.h:195 [inline]
>  rcu_do_batch kernel/rcu/tree.c:2758 [inline]
>  invoke_rcu_callbacks kernel/rcu/tree.c:3012 [inline]
>  __rcu_process_callbacks kernel/rcu/tree.c:2979 [inline]
>  rcu_process_callbacks+0xd74/0x17d0 kernel/rcu/tree.c:2996
>  __do_softirq+0x29d/0xbb2 kernel/softirq.c:285
> 
> The buggy address belongs to the object at 8801d8a6c040
>  which belongs to the cache kmalloc-32 of size 32
> The buggy address is located 8 bytes inside of
>  32-byte region [8801d8a6c040, 8801d8a6c060)
> The buggy address belongs to the page:
> page:6b05592a count:1 mapcount:0 mapping:1ca7267d
> index:0x8801d8a6cfc1
> flags: 0x2fffc000100(slab)
> raw: 02fffc000100 8801d8a6c000 8801d8a6cfc1 0001003f
> raw: ea000762f920 ea00076133e0 8801db0001c0 
> page dumped because: kasan: bad access detected
> 
> Memory state around the buggy address:
>  8801d8a6bf00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>  8801d8a6bf80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > 8801d8a6c000: fb fb fb fb fc fc fc fc 00 fc fc fc fc fc fc fc
>   ^
>  8801d8a6c080: fb fb fb fb fc fc fc fc 00 00 00 00 fc fc fc fc
>  8801d8a6c100: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
> ==
> 
> 
> ---
> This bug is generated by a dumb bot. It may contain errors.
> See https://goo.gl/tpsmEJ for details.
> Direct all questions to syzkal...@googlegroups.com.
> Please credit me with: Reported-by: syzbot

[PATCH net] tcp_bbr: fix pacing_gain to always be unity when using lt_bw

2018-01-31 Thread Neal Cardwell

This commit fixes the pacing_gain to remain at BBR_UNIT (1.0) when
using lt_bw and returning from the PROBE_RTT state to PROBE_BW.

Previously, when using lt_bw, upon exiting PROBE_RTT and entering
PROBE_BW the bbr_reset_probe_bw_mode() code could sometimes randomly
end up with a cycle_idx of 0 and hence have bbr_advance_cycle_phase()
set a pacing gain above 1.0. In such cases this would result in a
pacing rate that is 1.25x higher than intended, potentially resulting
in a high loss rate for a little while until we stop using the lt_bw a
bit later.

This commit is a stable candidate for kernels back as far as 4.9.

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Neal Cardwell 
Signed-off-by: Yuchung Cheng 
Signed-off-by: Soheil Hassas Yeganeh 
Reported-by: Beyers Cronje 
---
 net/ipv4/tcp_bbr.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 8322f26e770e..25c5a0b60cfc 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -481,7 +481,8 @@ static void bbr_advance_cycle_phase(struct sock *sk)
 
bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
bbr->cycle_mstamp = tp->delivered_mstamp;
-   bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
+   bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
+   bbr_pacing_gain[bbr->cycle_idx];
 }
 
 /* Gain cycling: cycle pacing gain to converge to fair share of available bw. 
*/
@@ -490,8 +491,7 @@ static void bbr_update_cycle_phase(struct sock *sk,
 {
struct bbr *bbr = inet_csk_ca(sk);
 
-   if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
-   bbr_is_next_cycle_phase(sk, rs))
+   if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
bbr_advance_cycle_phase(sk);
 }
 
-- 
2.16.0.rc1.238.g530d649a79-goog

Re: BUG: 4.14.11 unable to handle kernel NULL pointer dereference in xfrm_lookup

2018-01-31 Thread Markus Berner

> I'm running into a NULL pointer dereference after updating from Linux 
4.1.6 to

> 4.14.11 (see kernel log below).

We are running into the same problem on our production machine, running 
CoreOS 1576.5.0 Stable with the 4.14.11 kernel on a KVM Cloud VM. It is 
not as easy to reproduce though in our case – we observed a total of 5 
crashes in the last 2 weeks - all except one on the production machine.


> I still can't reproduce it with my tests. This is probably some race
> triggered due to your aggressive roadwarrior setup which I don't have.

We have a similar setup to Tobias
- 2 Network Interfaces (KVM/virtio): Public and local VLAN
- Strongswan VPN in Tunnel mode between local VLAN and on-premise 
network, running in a Docker container
- Quite a few iptables NAT and forwarding rules regarding other local 
Docker containers


Some Observations:
- The workaround of locking the IRQs of the Rx/Tx queues of all network 
interfaces to CPU0 Tobias described a while back did not prevent the 
crashes in our case
- The bug does not seem to correlate with load in our case, but load in 
general is quite low.


I am happy to help if I can, but unfortunately our possibilities are a 
bit limited; both due to lack of kernel dev know-how as well as trying 
out changes to configuration on the production machine. I subscribed to 
LKML only now to respond, so I hope the reply works (and to the correct 
message).


Markus

Example Stack Trace below:

[740051.374799] BUG: unable to handle kernel NULL pointer dereference at 
0020

[740051.379386] IP: xfrm_lookup+0x32/0x8a0
[740051.379941] PGD 8004648d6067 P4D 8004648d6067 PUD 461405067 
PMD 0

[740051.380697] Oops:  [#1] SMP PTI
[740051.381060] Modules linked in: iptable_mangle drbg authenc echainiv 
esp4 xfrm6_mode_tunnel xfrm4_mode_tunnel cbc binfmt_misc veth netconsole 
configfs softdog xt_nat nf_log_ipv4 nf_log_common xt_LOG xt_limit 
xt_policy xt_comment xt_multiport ipt_MASQUERADE nf_nat_masquerade_ipv4 
nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo iptable_nat 
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter 
xt_conntrack nf_nat nf_conntrack libcrc32c crc32c_generic br_netfilter 
bridge stp llc overlay sb_edac edac_core nls_ascii nls_cp437 kvm_intel 
vfat fat kvm mousedev psmouse i2c_piix4 irqbypass evdev virtio_balloon 
i2c_core pvpanic button sch_fq_codel hid_generic usbhid hid ext4 crc16 
mbcache jbd2 fscrypto dm_verity dm_bufio virtio_blk virtio_net uhci_hcd 
ehci_pci ata_piix ehci_hcd crc32c_intel
[740051.389167]  libata virtio_pci usbcore virtio_ring scsi_mod virtio 
usb_common dm_mirror dm_region_hash dm_log dm_mod dax

[740051.391444] CPU: 2 PID: 13516 Comm: java Not tainted 4.14.11-coreos #1
[740051.392792] Hardware name: QEMU CloudSigma, BIOS Bochs 01/01/2011
[740051.394120] task: 903022738000 task.stack: a791c768
[740051.395399] RIP: 0010:xfrm_lookup+0x32/0x8a0
[740051.396456] RSP: 0018:9030bfc838e8 EFLAGS: 00010246
[740051.397656] RAX:  RBX: 9030bfc83960 RCX: 

[740051.399526] RDX: 9030bfc83960 RSI:  RDI: 

[740051.401471] RBP:  R08: 0002 R09: 
46d7a6d9
[740051.403260] R10:  R11: 62f99322 R12: 
0002
[740051.405049] R13: 980e3080 R14: 903095c8a0a0 R15: 
9812dc20
[740051.406767] FS:  7f7d68cf6700() GS:9030bfc8() 
knlGS:

[740051.408900] CS:  0010 DS:  ES:  CR0: 80050033
[740051.410159] CR2: 0020 CR3: 0005fd444005 CR4: 
001606e0

[740051.411805] Call Trace:
[740051.412534]  
[740051.413190]  __xfrm_route_forward+0x61/0x100
[740051.414198]  ip_forward+0x39e/0x470
[740051.415148]  ? ip_rcv_finish+0xa5/0x3f0
[740051.416225]  br_netfilter_enable+0x10c/0x3e0 [br_netfilter]
[740051.417491]  nf_hook_slow+0x39/0xb0
[740051.418530]  ip_rcv+0x303/0x3a0
[740051.419647]  ? inet_del_offload+0x40/0x40
[740051.420303]  __netif_receive_skb_core+0x2c9/0xb60
[740051.420998]  ? x2apic_send_IPI+0x46/0x50
[740051.421648]  ? check_preempt_curr+0x56/0x90
[740051.422309]  ? ttwu_do_wakeup+0x19/0x150
[740051.422920]  ? netif_receive_skb_internal+0x42/0xf0
[740051.423710]  netif_receive_skb_internal+0x42/0xf0
[740051.424717]  br_port_flags_change+0x1d4/0x260 [bridge]
[740051.425884]  ? br_fdb_update+0xc3/0x2c0 [bridge]
[740051.427019]  br_handle_frame_finish+0x1e2/0x510 [bridge]
[740051.428339]  ? lock_timer_base+0x67/0x80
[740051.429036]  ? ipt_do_table+0x35f/0x610
[740051.429906]  ? br_port_flags_change+0x260/0x260 [bridge]
[740051.430683]  br_nf_hook_thresh+0xde/0x12a0 [br_netfilter]
[740051.431465]  ? br_port_flags_change+0x260/0x260 [bridge]
[740051.432217]  br_nf_hook_thresh+0xa8c/0x12a0 [br_netfilter]
[740051.433032]  ? br_port_flags_change+0x260/0x260 [bridge]
[740051.433966]  ? nf_nat_ipv4_in+0x28/0x80 [nf_nat_ipv4]
[740051.434848]  br_nf_hook_thresh+0xe2a/0x12a0

Re: [BUG iproute2] ip tuntap show

2018-01-31 Thread Serhey Popovych

David Ahern wrote:
> On 1/31/18 10:21 AM, Serhey Popovych wrote:
>> Eric Dumazet wrote:
>>> ip tuntap enumerates devices using /sys/class/net which is unusual.
>>>
>>> Should we replace this enumeration using /proc/net/dev like "ip tunnel" ?
>>>
>>> After "unshare -n" maybe mounting /sys should not be required for
>>> proper iproute2 behavior.
>>>
>>> At least ip command should adopt a common enumeration method.
>>>
>>> What do you think ?
>>
>> It seems main reason for using /sys/class/net is to get additional
>> information for netdev like "owner", "group" and "tun_flags".
>>
>> On the other hand at least iptunnel and ip6tunnel uses nearly identical
>> code to parse /proc/net/dev.
>>
>> Having single routine that reads /proc/net/dev and calls implementation
>> specific callback function with given network device name is good idea.
>>
>> I can try to prepare v1 for this, if no one objects this.
>>
> 
> pid_name function needs help too. comm is allocated via sprintf, freed,
> and used again and then returned to caller.
> 

Posted v1, but without pid_name patch, will prepare and send separately.

Thanks,
Serhii



signature.asc
Description: OpenPGP digital signature

[PATCH iproute2-next 4/4] tuntap: Use do_each_proc_net_dev()

2018-01-31 Thread Serhey Popovych

Now we have helper to iterate over entries in /proc/net/dev we can
simplify and cleanup do_tunnels_list() in ip/iptuntap.c.

While there replace printf("\n") with fputc('\n', stdout) and
printf() with fputs() where string does not contain format specifiers.

Signed-off-by: Serhey Popovych 
---
 ip/iptuntap.c |   59 -
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/ip/iptuntap.c b/ip/iptuntap.c
index 09f2be2..01b68ad 100644
--- a/ip/iptuntap.c
+++ b/ip/iptuntap.c
@@ -348,44 +348,35 @@ next:
globfree();
 }
 
-
-static int do_show(int argc, char **argv)
+static pnd_result_t do_tuntap_list(char *name, char *stats, void *arg)
 {
-   DIR *dir;
-   struct dirent *d;
long flags, owner = -1, group = -1;
 
-   dir = opendir("/sys/class/net");
-   if (!dir) {
-   perror("opendir");
-   return -1;
+   if (read_prop(name, "tun_flags", ))
+   return PND_NEXT;
+
+   read_prop(name, "owner", );
+   read_prop(name, "group", );
+
+   printf("%s:", name);
+   print_flags(flags);
+   if (owner != -1)
+   printf(" user %ld", owner);
+   if (group != -1)
+   printf(" group %ld", group);
+   fputc('\n', stdout);
+   if (show_details) {
+   fputs("\tAttached to processes:", stdout);
+   show_processes(name);
+   fputc('\n', stdout);
}
-   while ((d = readdir(dir))) {
-   if (d->d_name[0] == '.' &&
-   (d->d_name[1] == 0 || d->d_name[1] == '.'))
-   continue;
-
-   if (read_prop(d->d_name, "tun_flags", ))
-   continue;
-
-   read_prop(d->d_name, "owner", );
-   read_prop(d->d_name, "group", );
-
-   printf("%s:", d->d_name);
-   print_flags(flags);
-   if (owner != -1)
-   printf(" user %ld", owner);
-   if (group != -1)
-   printf(" group %ld", group);
-   printf("\n");
-   if (show_details) {
-   printf("\tAttached to processes:");
-   show_processes(d->d_name);
-   printf("\n");
-   }
-   }
-   closedir(dir);
-   return 0;
+
+   return PND_NEXT;
+}
+
+static int do_show(int argc, char **argv)
+{
+   return do_each_proc_net_dev(do_tuntap_list, NULL);
 }
 
 int do_iptuntap(int argc, char **argv)
-- 
1.7.10.4

[PATCH iproute2-next 2/4] iptunnel: Use do_each_proc_net_dev()

2018-01-31 Thread Serhey Popovych

Now we have helper to iterate over entries in /proc/net/dev we can
simplify and cleanup do_tunnels_list() in ip/iptunnel.c.

While there replace printf("\n") with fputc('\n', stdout) and printf()
with fputs() where string does not contain format specifiers.

Also move tunnel parameter matching to new static helper function to
match ip6tunnel.c.

Signed-off-by: Serhey Popovych 
---
 ip/iptunnel.c |  102 ++---
 1 file changed, 47 insertions(+), 55 deletions(-)

diff --git a/ip/iptunnel.c b/ip/iptunnel.c
index 0aa3b33..b60a7b7 100644
--- a/ip/iptunnel.c
+++ b/ip/iptunnel.c
@@ -373,66 +373,58 @@ static void print_tunnel(struct ip_tunnel_parm *p)
printf("%s  Checksum output packets.", _SL_);
 }
 
-static int do_tunnels_list(struct ip_tunnel_parm *p)
+/*
+ * @p1: user specified parameter
+ * @p2: database entry
+ */
+static int ip_tunnel_parm_match(const struct ip_tunnel_parm *p1,
+   const struct ip_tunnel_parm *p2)
 {
-   char buf[512];
-   int err = -1;
-   FILE *fp = fopen("/proc/net/dev", "r");
+   return ((!p1->link || p1->link == p2->link) &&
+   (!p1->name[0] || strcmp(p1->name, p2->name) == 0) &&
+   (!p1->iph.daddr || p1->iph.daddr == p2->iph.daddr) &&
+   (!p1->iph.saddr || p1->iph.saddr == p2->iph.saddr) &&
+   (!p1->i_key || p1->i_key == p2->i_key));
+}
 
-   if (fp == NULL) {
-   perror("fopen");
-   return -1;
-   }
+static pnd_result_t do_tunnels_list(char *name, char *stats, void *arg)
+{
+   struct ip_tunnel_parm p1, *p = arg;
+   int index, type;
+
+   if (p->name[0] && strcmp(p->name, name))
+   return PND_NEXT;
 
-   /* skip header lines */
-   if (!fgets(buf, sizeof(buf), fp) ||
-   !fgets(buf, sizeof(buf), fp)) {
-   fprintf(stderr, "/proc/net/dev read error\n");
-   goto end;
+   index = ll_name_to_index(name);
+   if (index <= 0)
+   return PND_NEXT;
+
+   type = ll_index_to_type(index);
+   if (type == -1) {
+   fprintf(stderr, "Failed to get type of \"%s\"\n", name);
+   return PND_NEXT;
}
 
-   while (fgets(buf, sizeof(buf), fp) != NULL) {
-   char name[IFNAMSIZ];
-   int index, type;
-   struct ip_tunnel_parm p1 = {};
-   char *ptr;
-
-   buf[sizeof(buf) - 1] = 0;
-   ptr = strchr(buf, ':');
-   if (ptr == NULL ||
-   (*ptr++ = 0, sscanf(buf, "%s", name) != 1)) {
-   fprintf(stderr, "Wrong format for /proc/net/dev. Giving 
up.\n");
-   goto end;
-   }
-   if (p->name[0] && strcmp(p->name, name))
-   continue;
-   index = ll_name_to_index(name);
-   if (index == 0)
-   continue;
-   type = ll_index_to_type(index);
-   if (type == -1) {
-   fprintf(stderr, "Failed to get type of \"%s\"\n", name);
-   continue;
-   }
-   if (type != ARPHRD_TUNNEL && type != ARPHRD_IPGRE && type != 
ARPHRD_SIT)
-   continue;
-   if (tnl_get_ioctl(name, ))
-   continue;
-   if ((p->link && p1.link != p->link) ||
-   (p->name[0] && strcmp(p1.name, p->name)) ||
-   (p->iph.daddr && p1.iph.daddr != p->iph.daddr) ||
-   (p->iph.saddr && p1.iph.saddr != p->iph.saddr) ||
-   (p->i_key && p1.i_key != p->i_key))
-   continue;
-   print_tunnel();
-   if (show_stats)
-   tnl_print_stats(ptr);
-   printf("\n");
+   switch (type) {
+   case ARPHRD_TUNNEL:
+   case ARPHRD_IPGRE:
+   case ARPHRD_SIT:
+   return PND_NEXT;
}
-   err = 0;
- end:
-   fclose(fp);
-   return err;
+
+   memset(, 0, sizeof(p1));
+   if (tnl_get_ioctl(name, ))
+   return PND_NEXT;
+
+   if (!ip_tunnel_parm_match(p, ))
+   return PND_NEXT;
+
+   print_tunnel();
+   if (show_stats)
+   tnl_print_stats(stats);
+   fputc('\n', stdout);
+
+   return PND_NEXT;
 }
 
 static int do_show(int argc, char **argv)
@@ -446,7 +438,7 @@ static int do_show(int argc, char **argv)
 
basedev = tnl_defname();
if (!basedev)
-   return do_tunnels_list();
+   return do_each_proc_net_dev(do_tunnels_list, );
 
if (tnl_get_ioctl(p.name[0] ? p.name : basedev, ))
return -1;
-- 
1.7.10.4

[PATCH iproute2-next 3/4] ip6tunnel: Use do_each_proc_net_dev()

2018-01-31 Thread Serhey Popovych

Now we have helper to iterate over entries in /proc/net/dev we can
simplify and cleanup do_tunnels_list() in ip/iptunnel.c.

While there replace printf("\n") with fputc('\n', stdout) and printf()
with fputs() where string does not contain format specifiers.

Signed-off-by: Serhey Popovych 
---
 ip/ip6tunnel.c |   94 ++--
 1 file changed, 37 insertions(+), 57 deletions(-)

diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c
index 783e28a..ccd5603 100644
--- a/ip/ip6tunnel.c
+++ b/ip/ip6tunnel.c
@@ -336,68 +336,48 @@ static int ip6_tnl_parm_match(const struct ip6_tnl_parm2 
*p1,
(!p1->flags || (p1->flags & p2->flags)));
 }
 
-static int do_tunnels_list(struct ip6_tnl_parm2 *p)
+static pnd_result_t do_tunnels_list(char *name, char *stats, void *arg)
 {
-   char buf[512];
-   int err = -1;
-   FILE *fp = fopen("/proc/net/dev", "r");
+   struct ip6_tnl_parm2 p1, *p = arg;
+   int index, type;
 
-   if (fp == NULL) {
-   perror("fopen");
-   return -1;
-   }
+   if (p->name[0] && strcmp(p->name, name))
+   return PND_NEXT;
+
+   index = ll_name_to_index(name);
+   if (index <= 0)
+   return PND_NEXT;
 
-   /* skip two lines at the begenning of the file */
-   if (!fgets(buf, sizeof(buf), fp) ||
-   !fgets(buf, sizeof(buf), fp)) {
-   fprintf(stderr, "/proc/net/dev read error\n");
-   goto end;
+   type = ll_index_to_type(index);
+   if (type == -1) {
+   fprintf(stderr, "Failed to get type of \"%s\"\n", name);
+   return PND_NEXT;
}
 
-   while (fgets(buf, sizeof(buf), fp) != NULL) {
-   char name[IFNAMSIZ];
-   int index, type;
-   struct ip6_tnl_parm2 p1 = {};
-   char *ptr;
-
-   buf[sizeof(buf) - 1] = '\0';
-   if ((ptr = strchr(buf, ':')) == NULL ||
-   (*ptr++ = 0, sscanf(buf, "%s", name) != 1)) {
-   fprintf(stderr, "Wrong format for /proc/net/dev. Giving 
up.\n");
-   goto end;
-   }
-   if (p->name[0] && strcmp(p->name, name))
-   continue;
-   index = ll_name_to_index(name);
-   if (index == 0)
-   continue;
-   type = ll_index_to_type(index);
-   if (type == -1) {
-   fprintf(stderr, "Failed to get type of \"%s\"\n", name);
-   continue;
-   }
-   if (type != ARPHRD_TUNNEL6 && type != ARPHRD_IP6GRE)
-   continue;
-   ip6_tnl_parm_init(, 0);
-   if (type == ARPHRD_IP6GRE)
-   p1.proto = IPPROTO_GRE;
-   strcpy(p1.name, name);
-   p1.link = ll_name_to_index(p1.name);
-   if (p1.link == 0)
-   continue;
-   if (tnl_get_ioctl(p1.name, ))
-   continue;
-   if (!ip6_tnl_parm_match(p, ))
-   continue;
-   print_tunnel();
-   if (show_stats)
-   tnl_print_stats(ptr);
-   printf("\n");
+   switch (type) {
+   case ARPHRD_TUNNEL6:
+   case ARPHRD_IP6GRE:
+   return PND_NEXT;
}
-   err = 0;
- end:
-   fclose(fp);
-   return err;
+
+   ip6_tnl_parm_init(, 0);
+   if (type == ARPHRD_IP6GRE)
+   p1.proto = IPPROTO_GRE;
+   p1.link = index;
+   name = strcpy(p1.name, name);
+
+   if (tnl_get_ioctl(name, ))
+   return PND_NEXT;
+
+   if (!ip6_tnl_parm_match(p, ))
+   return PND_NEXT;
+
+   print_tunnel();
+   if (show_stats)
+   tnl_print_stats(stats);
+   fputc('\n', stdout);
+
+   return PND_NEXT;
 }
 
 static int do_show(int argc, char **argv)
@@ -412,7 +392,7 @@ static int do_show(int argc, char **argv)
return -1;
 
if (!p.name[0] || show_stats)
-   do_tunnels_list();
+   do_each_proc_net_dev(do_tunnels_list, );
else {
if (tnl_get_ioctl(p.name, ))
return -1;
-- 
1.7.10.4

[PATCH iproute2-next 1/4] utils: Introduce do_each_proc_net_dev() helper

2018-01-31 Thread Serhey Popovych

It is natural for ip(8) tool to access /proc/net/dev for various
information and there at least two places implementing same iteration
over lines in this file: iptunnel.c and ip6tunnel.c.

To unify interface and avoid code duplication introduce helper that
reads line from /proc/net/dev, passes it to given callback function
that may return following three states to control do_each_proc_net_dev()
behaviour:

  o PND_ERROR - stop reading lines and return -1
  o PND_OK- stop reading lines and return 0
  o PND_NEXT  - continue by reading next line.

Signed-off-by: Serhey Popovych 
---
 include/utils.h |   10 ++
 lib/utils.c |   51 +++
 2 files changed, 61 insertions(+)

diff --git a/include/utils.h b/include/utils.h
index 0394268..819b0ca 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -285,6 +285,16 @@ int iplink_parse(int argc, char **argv, struct iplink_req 
*req,
 int do_each_netns(int (*func)(char *nsname, void *arg), void *arg,
bool show_label);
 
+typedef enum {
+   PND_ERROR = -1,
+   PND_OK= 0,
+   PND_NEXT  = 1
+} pnd_result_t;
+
+typedef pnd_result_t (pnd_func_t)(char *name, char *stats, void *arg);
+
+int do_each_proc_net_dev(pnd_func_t f, void *arg);
+
 char *int_to_str(int val, char *buf);
 int get_guid(__u64 *guid, const char *arg);
 int get_real_family(int rtm_type, int rtm_family);
diff --git a/lib/utils.c b/lib/utils.c
index 8e15625..4f142d8 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -1375,6 +1375,57 @@ int do_each_netns(int (*func)(char *nsname, void *arg), 
void *arg,
return netns_foreach(on_netns, );
 }
 
+int do_each_proc_net_dev(pnd_func_t f, void *arg)
+{
+   static const char pnd[] = "/proc/net/dev";
+   char buf[512];
+   int err = -1;
+   FILE *fp;
+
+   fp = fopen(pnd, "r");
+   if (!fp) {
+   perror("fopen");
+   return -1;
+   }
+
+   /* skip two lines at the begenning of the file */
+   if (!fgets(buf, sizeof(buf), fp) ||
+   !fgets(buf, sizeof(buf), fp)) {
+   perror("fgets");
+   goto end;
+   }
+
+   while (fgets(buf, sizeof(buf), fp)) {
+   char *stats, *name = NULL;
+   pnd_result_t result;
+
+   buf[sizeof(buf) - 1] = '\0';
+   stats = strchr(buf, ':');
+   if (stats) {
+   *stats++ = '\0';
+   if (!check_ifname(buf))
+   name = buf;
+   }
+   if (!name) {
+   fprintf(stderr,
+   "Wrong format for \"%s\". Giving up.\n", pnd);
+   goto end;
+   }
+
+   result = f(name, stats, arg);
+   if (result == PND_NEXT)
+   continue;
+   if (result == PND_OK)
+   break;
+   goto end;
+   }
+   if (feof(fp))
+   err = 0;
+end:
+   fclose(fp);
+   return err;
+}
+
 char *int_to_str(int val, char *buf)
 {
sprintf(buf, "%d", val);
-- 
1.7.10.4

[PATCH iproute2-next 0/4] ip: Introduce and use helper to read /proc/net/dev

2018-01-31 Thread Serhey Popovych

Currently there is two places in ip(8) where /proc/net/dev is read line
by line with nearly identical steps: iptunnel.c and ip6tunnel.c

On the other hand we have iptuntap.c that uses /sys/class/net that could
be problematic in case of unshare(1)d network namespace without sysfs
being mounted.

Introduce and use do_each_proc_net_dev() helper to read data from
/proc/net/dev line by line and pass this information to implementation
specific callback function.

See individual patch description message for more details.

Series is open for reviews and comments.

Tested only by compiling and executing ip [-46] [-s] [-d] tunnel in
various combinations: no problem so far. More can be done by request.

Thanks,
Serhii

Serhey Popovych (4):
  utils: Introduce do_each_proc_net_dev() helper
  iptunnel: Use do_each_proc_net_dev()
  ip6tunnel: Use do_each_proc_net_dev()
  tuntap: Use do_each_proc_net_dev()

 include/utils.h |   10 ++
 ip/ip6tunnel.c  |   94 --
 ip/iptunnel.c   |  102 +--
 ip/iptuntap.c   |   59 ++--
 lib/utils.c |   51 
 5 files changed, 170 insertions(+), 146 deletions(-)

-- 
1.7.10.4

Re: AF_PACKET V4/AF_XDP userspace API questions

2018-01-31 Thread Björn Töpel

2018-01-30 8:58 GMT+01:00 Ilias Apalodimas :
> We've noticed 3 different hardware approaches in receiving payloads
>
> 1. Host driver needs to pre-load descriptor ring with addresses of RAM
> buffers to write arriving data.
> The "standard" functionality for most NICs is (in little detail) fetch the
> descriptor, write the payload to host RAM and update the descriptor
> accordingly.
> So for these NICs, buffer addresses are provided in RX descriptors (RX
> descriptors are two-way communication entity).
> This translates to "1 ring + 1 buffer array" model, or the packet array
> model in short.
>
> 2. There's a category of NICs (Chelsio and Netcope are the ones we are aware
> of) that split that into two one-way entities:
> One to communicate buffer addresses from host to NIC and one to communicate
> packets/payloads from NIC to host.
> So the driver provides a set of unstructured, contiguous memory areas to the
> NIC, the NIC decides where to place the packets in memory and updates the
> descriptors accordingly (the descriptor ring is not pre-loaded with any data
> and the NIC is free to write the packet anywhere in the provided contiguous
> memory).
> This is a "1 ring + 1 set of areas" model, or the tape model in short.
>
> 3. The last hardware approach we are aware of is NICs that you provide
> multiple array buffers (128, 256, 1500, 9000 etc).
> The NIC then decides in which array slot to place the packet depending on
> it's size.
> This is "1 ring + X buffer arrays" model or the multi packet array in short.
>
> Is memory schemes 2 and 3 supported? If not do you plan on supporting them?
>

Note that AF_PACKET V4 has evolved into AF_XDP, and will have the same
buffer constraints as XDP has/will get.

Option 2 (zero copy) with the current interface will probably be a bit
tricky. It could be possible, but would require the kernel/driver to
track when enough buffers has been gifted to the kernel to form a a
contiguous memory.

As for option 3, supporting different buffer sizes on the same socket
could in theory be supported, but that would require too much
house-holding from the driver. Supporting multiple (different) buffer
sizes to the same interface using different sockets should be doable
from a driver perspective.

Thanks for raising the concern -- the kernel side of AF_XDP/zero copy
must be usable for multiple vendors to make sense. I'm open to any
suggestions to the API that would make it easier for other vendors.


Thanks,
Björn

> Regards
> Ilias
>
>
>
>

RE: [RFC crypto v3 8/9] chtls: Register the ULP

2018-01-31 Thread Atul Gupta



-Original Message-
From: Dave Watson [mailto:davejwat...@fb.com] 
Sent: Wednesday, January 31, 2018 10:14 PM
To: Atul Gupta 
Cc: s...@queasysnail.net; herb...@gondor.apana.org.au; 
linux-cry...@vger.kernel.org; ganes...@chelsio.co; netdev@vger.kernel.org; 
da...@davemloft.net; Boris Pismenny ; Ilya Lesokhin 

Subject: Re: [RFC crypto v3 8/9] chtls: Register the ULP

On 01/31/18 04:14 PM, Atul Gupta wrote:
> 
> 
> On Tuesday 30 January 2018 10:41 PM, Dave Watson wrote:
> > On 01/30/18 06:51 AM, Atul Gupta wrote:
> > 
> > > What I was referring is that passing "tls" ulp type in setsockopt 
> > > may be insufficient to make the decision when multi HW assist 
> > > Inline TLS solution exists.
> > Setting the ULP doesn't choose HW or SW implementation, I think that 
> > should be done later when setting up crypto with
> > 
> > setsockopt(SOL_TLS, TLS_TX, struct crypto_info).
> setsockpot [mentioned above] is quite late for driver to enable HW 
> implementation, we require something as early as tls_init 
> [setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"))], for driver 
> to set HW prot and offload connection beside Inline Tx/Rx.
> > 
> > Any reason we can't use ethtool to choose HW vs SW implementation, 
> > if available on the device?
> Thought about it,  the interface index is not available to fetch 
> netdev and caps check to set HW prot eg. bind [prot.hash] --> tls_hash to 
> program HW.

Perhaps this is the part I don't follow - why do you need to override hash and 
check for LISTEN?  I briefly looked through the patch named "CPL handler 
definition", this looks like it is a full TCP offload?

Yes, this is connection and record layer offload, and the reason I used 
different ulp type, need to see what additional info or check can help setup 
the required sk prot.

Re: sctp netns "unregister_netdevice: waiting for lo to become free. Usage count = 1"

2018-01-31 Thread Tommi Rantala


On 31.01.2018 14:31, Neil Horman wrote:

On Wed, Jan 31, 2018 at 11:42:24AM +0200, Tommi Rantala wrote:

I think there's a problem in the dst refcounting in sctp_v4_get_dst()

There's a dst_entry struct that has >0 refcnt after running the testcase,
which makes it impossible to delete the loopback device, as that dst is
never freed.

I'll try to make a patch.


Are you looking at the second for loop there, which uses ip_route_output_key,
but discards the result if dst is already set?  That does look a bit wonky, and
the same problem may exist in the ipv6 path.  Let me know what the result is.


Yes, that was it. Did you receive the email I sent with the patch?
(I'm not seeing that message e.g. at spinics.net linux-sctp archive, so 
just wondering if that email got lost somehow...)


I'll check the ipv6 case, did not try it yet.

Tommi

Re: [PATCHv2] tls: Add support for encryption using async offload accelerator

2018-01-31 Thread Gilad Ben-Yossef

On Wed, Jan 31, 2018 at 7:34 PM, Dave Watson  wrote:
> On 01/31/18 05:22 PM, Vakul Garg wrote:
>> > > On second though in stable we should probably just disable async tfm
>> > > allocations.
>> > > It's simpler. But this approach is still good for -next
>> > >
>> > >
>> > > Gilad
>> >
>> > I agree with Gilad, just disable async for now.
>> >
>>
>> How to do it? Can you help with the api name?
>
> *aead = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
>
> https://github.com/ktls/net_next_ktls/commit/f3b9b402e755e4b0623fa83f88137173fc249f2d

I said disabling async tfms is the right way to go for -stable since
it's the simplest and less risky way
of plugging this bug.

I don't think this is the way to go for -next (and it seems davem
agrees with me). Vakul's
patch looks good to me for now.

>
>> > If the flag MSG_DONTWAIT is set, we should be returning -EINPROGRESS and
>> > not wait for a response.  I had started working on a patch for that, but 
>> > it's
>> > pretty tricky to get right.
>>

That would be a great addition, but I don't think we need to wait for
that. It can come later.

Gilad
-- 
Gilad Ben-Yossef
Chief Coffee Drinker

"If you take a class in large-scale robotics, can you end up in a
situation where the homework eats your dog?"
 -- Jean-Baptiste Queru

Re: [BUG iproute2] ip tuntap show

2018-01-31 Thread Serhey Popovych

David Ahern wrote:
> On 1/31/18 10:21 AM, Serhey Popovych wrote:
>> Eric Dumazet wrote:
>>> ip tuntap enumerates devices using /sys/class/net which is unusual.
>>>
>>> Should we replace this enumeration using /proc/net/dev like "ip tunnel" ?
>>>
>>> After "unshare -n" maybe mounting /sys should not be required for
>>> proper iproute2 behavior.
>>>
>>> At least ip command should adopt a common enumeration method.
>>>
>>> What do you think ?
>>
>> It seems main reason for using /sys/class/net is to get additional
>> information for netdev like "owner", "group" and "tun_flags".
>>
>> On the other hand at least iptunnel and ip6tunnel uses nearly identical
>> code to parse /proc/net/dev.
>>
>> Having single routine that reads /proc/net/dev and calls implementation
>> specific callback function with given network device name is good idea.
>>
>> I can try to prepare v1 for this, if no one objects this.
>>
> 
> pid_name function needs help too. comm is allocated via sprintf, freed,
> and used again and then returned to caller.
> 

Nice :-) will fix that too. Thanks for pointing.



signature.asc
Description: OpenPGP digital signature

Re: [BUG iproute2] ip tuntap show

2018-01-31 Thread David Ahern

On 1/31/18 10:21 AM, Serhey Popovych wrote:
> Eric Dumazet wrote:
>> ip tuntap enumerates devices using /sys/class/net which is unusual.
>>
>> Should we replace this enumeration using /proc/net/dev like "ip tunnel" ?
>>
>> After "unshare -n" maybe mounting /sys should not be required for
>> proper iproute2 behavior.
>>
>> At least ip command should adopt a common enumeration method.
>>
>> What do you think ?
> 
> It seems main reason for using /sys/class/net is to get additional
> information for netdev like "owner", "group" and "tun_flags".
> 
> On the other hand at least iptunnel and ip6tunnel uses nearly identical
> code to parse /proc/net/dev.
> 
> Having single routine that reads /proc/net/dev and calls implementation
> specific callback function with given network device name is good idea.
> 
> I can try to prepare v1 for this, if no one objects this.
> 

pid_name function needs help too. comm is allocated via sprintf, freed,
and used again and then returned to caller.

Re: [PATCHv2] tls: Add support for encryption using async offload accelerator

2018-01-31 Thread Dave Watson

On 01/31/18 05:22 PM, Vakul Garg wrote:
> > > On second though in stable we should probably just disable async tfm
> > > allocations.
> > > It's simpler. But this approach is still good for -next
> > >
> > >
> > > Gilad
> > 
> > I agree with Gilad, just disable async for now.
> > 
> 
> How to do it? Can you help with the api name?

*aead = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);

https://github.com/ktls/net_next_ktls/commit/f3b9b402e755e4b0623fa83f88137173fc249f2d

> > If the flag MSG_DONTWAIT is set, we should be returning -EINPROGRESS and
> > not wait for a response.  I had started working on a patch for that, but 
> > it's
> > pretty tricky to get right.
> 
> Can you point me to your WIP code branch for this?

https://github.com/ktls/net_next_ktls/commit/9cc839aa551ed972d148ecebf353b25ee93543b9

> If MSG_DONTWAIT is not used, will it be sane if enqueue the crypto request to 
> accelerator and return to user space back so that user space can send more 
> plaintext data while 
> crypto accelerator is working in parallel?

Right, that's roughly what the above does.  I believe the tricky
unfinished part was getting poll() to work correctly if there is an
async crypto request outstanding.  Currently the tls poll() just
relies on backpressure from do_tcp_sendpages.

Re: [PATCH 01/10] net/sched: kconfig: Remove empty help texts

2018-01-31 Thread Ulf Magnusson

On Wed, Jan 31, 2018 at 4:43 PM, David Miller  wrote:
> From: Masahiro Yamada 
> Date: Thu, 1 Feb 2018 00:37:27 +0900
>
>> 2018-02-01 0:32 GMT+09:00 David Miller :
>>> From: Ulf Magnusson 
>>> Date: Tue, 30 Jan 2018 20:05:23 +0100
>>>
 In preparation for adding a warning ("kconfig: Warn if help text is
 blank"): https://lkml.org/lkml/2018/1/30/516

 Signed-off-by: Ulf Magnusson 
>>>
>>> Applied.
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kbuild" in
>>> the body of a message to majord...@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>> I had applied the whole series into linux-kbuild.
>>
>>
>> Do you want to apply it to your tree?
>
> I already did :-)
>
> It won't cause any problems for it to be in both of our trees so let's
> just leave it this way.

Yeah, sorry about that mess. I had assumed this would require
coordination from more people. :)

Masahiro: Thanks for merging it so quickly!

Cheers,
Ulf

RE: [PATCHv2] tls: Add support for encryption using async offload accelerator

2018-01-31 Thread Vakul Garg



> -Original Message-
> From: Dave Watson [mailto:davejwat...@fb.com]
> Sent: Wednesday, January 31, 2018 8:52 PM
> To: Vakul Garg 
> Cc: linux-cry...@vger.kernel.org; il...@mellanox.com;
> avia...@mellanox.com; da...@davemloft.net; netdev@vger.kernel.org;
> Gilad Ben-Yossef 
> Subject: Re: [PATCHv2] tls: Add support for encryption using async offload
> accelerator
> 
> On 01/31/18 09:34 PM, Vakul Garg wrote:
> > Async crypto accelerators (e.g. drivers/crypto/caam) support
> > offloading GCM operation. If they are enabled, crypto_aead_encrypt()
> > return error code -EINPROGRESS. In this case tls_do_encryption() needs
> > to wait on a completion till the time the response for crypto offload
> > request is received.
> 
> Comments from V1
> > On Wed, Jan 31, 2018 at 8:10 AM, Gilad Ben-Yossef
>  wrote:
> >> Hi Vakul,
> >>
> >> On Wed, Jan 31, 2018 at 12:36 PM, Vakul Garg 
> wrote:
> >>> Async crypto accelerators (e.g. drivers/crypto/caam) support
> >>> offloading GCM operation. If they are enabled, crypto_aead_encrypt()
> >>> return error code -EINPROGRESS. In this case tls_do_encryption()
> >>> needs to wait on a completion till the time the response for crypto
> >>> offload request is received.
> >>>
> >>
> >> Thank you for this patch. I think it is actually a bug fix and should
> >> probably go into stable
> >
> > On second though in stable we should probably just disable async tfm
> > allocations.
> > It's simpler. But this approach is still good for -next
> >
> >
> > Gilad
> 
> I agree with Gilad, just disable async for now.
> 

How to do it? Can you help with the api name?

> If the flag MSG_DONTWAIT is set, we should be returning -EINPROGRESS and
> not wait for a response.  I had started working on a patch for that, but it's
> pretty tricky to get right.

Can you point me to your WIP code branch for this?

If MSG_DONTWAIT is not used, will it be sane if enqueue the crypto request to 
accelerator and return to user space back so that user space can send more 
plaintext data while 
crypto accelerator is working in parallel?

Re: [BUG iproute2] ip tuntap show

2018-01-31 Thread Serhey Popovych

Eric Dumazet wrote:
> ip tuntap enumerates devices using /sys/class/net which is unusual.
> 
> Should we replace this enumeration using /proc/net/dev like "ip tunnel" ?
> 
> After "unshare -n" maybe mounting /sys should not be required for
> proper iproute2 behavior.
> 
> At least ip command should adopt a common enumeration method.
> 
> What do you think ?

It seems main reason for using /sys/class/net is to get additional
information for netdev like "owner", "group" and "tun_flags".

On the other hand at least iptunnel and ip6tunnel uses nearly identical
code to parse /proc/net/dev.

Having single routine that reads /proc/net/dev and calls implementation
specific callback function with given network device name is good idea.

I can try to prepare v1 for this, if no one objects this.

> 
> Thanks.
> 
> lpaa5:~# cat /proc/net/dev
> Inter-|   Receive|  Transmit
>  face |bytespackets errs drop fifo frame compressed multicast|bytes
> packets errs drop fifo colls carrier compressed
>   adp0:   0   0000 0  0 00
>0000 0   0  0
> uinput4:   0   0000 0  0 00   
> 0000 0   0  0
> dynencap4:   0   0000 0  0 00 
>   0000 0   0  0
> sixtofour0:  6968866733000 0  0 0   
> 4807434445000 0   0  0
> uinput6:   0   0000 0  0 00   
> 0000 0   0  0
> gretap0:   0   0000 0  0 00   
> 0000 0   0  0
> ip6gre0:   0   0000 0  0 00   
> 0000 0   0  0
>  tunl0:   0   0000 0  0 00
>0000 0   0  0
> ip6tnl0:   0   0000 0  0 00   
> 0000 0   0  0
>   gre0:   0   0000 0  0 00
>0000 0   0  0
>   sit0:   0   0000 0  0 00
>0000 0   0  0
> sixdecap0:   0   0000 0  0 0   559849 
>2371000 0   0  0
> lo: 80924524  222675000 0  0 0 80924524  
> 222675000 0   0  0
>   eth2: 8898473491 131730002000 0  089 
> 8606929352 128406009000 0   0  0
> dynencap6:   0   0000 0  0 00 
>   0000 0   0  0
>   eth1: 9219322658 137558705000 0  0  1592 
> 9438658576 140822798000 0   0  0
>   eth0: 18117796393 269288710000 0  0  1682 
> 18045588486 269228813030 0   0  0
> 
> lpaa5:~# unshare -n
> 
> lpaa5:~# cat /proc/net/dev
> Inter-|   Receive|  Transmit
>  face |bytespackets errs drop fifo frame compressed multicast|bytes
> packets errs drop fifo colls carrier compressed
> lo:   0   0000 0  0 00
>0000 0   0  0
> 
> lpaa5:~# ip tuntap
> adp0: tap vnet_hdr UNKNOWN_FLAGS:940
> 
> lpaa5:~# mount -t sysfs sysfs /sys
> 
> lpaa5:~# ip tuntap
> 
> lpaa5:~# ls -l /sys/class/net
> total 0
> -rw-r--r-- 1 root root 4096 Jan 31 08:51 bonding_masters
> lrwxrwxrwx 1 root root0 Jan 31 08:51 lo -> ../../devices/virtual/net/lo
> lpaa5:~# 
> 




signature.asc
Description: OpenPGP digital signature

[BUG iproute2] ip tuntap show

2018-01-31 Thread Eric Dumazet

ip tuntap enumerates devices using /sys/class/net which is unusual.

Should we replace this enumeration using /proc/net/dev like "ip tunnel" ?

After "unshare -n" maybe mounting /sys should not be required for
proper iproute2 behavior.

At least ip command should adopt a common enumeration method.

What do you think ?

Thanks.

lpaa5:~# cat /proc/net/dev
Inter-|   Receive|  Transmit
 face |bytespackets errs drop fifo frame compressed multicast|bytes
packets errs drop fifo colls carrier compressed
  adp0:   0   0000 0  0 00  
 0000 0   0  0
uinput4:   0   0000 0  0 00 
  0000 0   0  0
dynencap4:   0   0000 0  0 00   
0000 0   0  0
sixtofour0:  6968866733000 0  0 0   480743  
  4445000 0   0  0
uinput6:   0   0000 0  0 00 
  0000 0   0  0
gretap0:   0   0000 0  0 00 
  0000 0   0  0
ip6gre0:   0   0000 0  0 00 
  0000 0   0  0
 tunl0:   0   0000 0  0 00  
 0000 0   0  0
ip6tnl0:   0   0000 0  0 00 
  0000 0   0  0
  gre0:   0   0000 0  0 00  
 0000 0   0  0
  sit0:   0   0000 0  0 00  
 0000 0   0  0
sixdecap0:   0   0000 0  0 0   559849   
 2371000 0   0  0
lo: 80924524  222675000 0  0 0 80924524  
222675000 0   0  0
  eth2: 8898473491 131730002000 0  089 
8606929352 128406009000 0   0  0
dynencap6:   0   0000 0  0 00   
0000 0   0  0
  eth1: 9219322658 137558705000 0  0  1592 
9438658576 140822798000 0   0  0
  eth0: 18117796393 269288710000 0  0  1682 
18045588486 269228813030 0   0  0

lpaa5:~# unshare -n

lpaa5:~# cat /proc/net/dev
Inter-|   Receive|  Transmit
 face |bytespackets errs drop fifo frame compressed multicast|bytes
packets errs drop fifo colls carrier compressed
lo:   0   0000 0  0 00  
 0000 0   0  0

lpaa5:~# ip tuntap
adp0: tap vnet_hdr UNKNOWN_FLAGS:940

lpaa5:~# mount -t sysfs sysfs /sys

lpaa5:~# ip tuntap

lpaa5:~# ls -l /sys/class/net
total 0
-rw-r--r-- 1 root root 4096 Jan 31 08:51 bonding_masters
lrwxrwxrwx 1 root root0 Jan 31 08:51 lo -> ../../devices/virtual/net/lo
lpaa5:~#

Re: [PATCH bpf-next v8 0/5] libbpf: add XDP binding support

2018-01-31 Thread Daniel Borkmann

On 01/30/2018 09:50 PM, Eric Leblond wrote:
> Hello Daniel,
> 
> No problem with the delay in the answer. I'm doing far worse.
> 
> Here is an updated version:
> - add if_link.h in uapi and remove the definition
> - fix a commit message
> - remove uapi from a include

Fyi, this still needs to wait for a bit in the queue due to current
merge window where bpf-next is closed during that time [0]. Thanks!

  [0] https://www.spinics.net/lists/netdev/msg481490.html

Re: [RFC crypto v3 8/9] chtls: Register the ULP

2018-01-31 Thread Dave Watson

On 01/31/18 04:14 PM, Atul Gupta wrote:
> 
> 
> On Tuesday 30 January 2018 10:41 PM, Dave Watson wrote:
> > On 01/30/18 06:51 AM, Atul Gupta wrote:
> > 
> > > What I was referring is that passing "tls" ulp type in setsockopt
> > > may be insufficient to make the decision when multi HW assist Inline
> > > TLS solution exists.
> > Setting the ULP doesn't choose HW or SW implementation, I think that
> > should be done later when setting up crypto with
> > 
> > setsockopt(SOL_TLS, TLS_TX, struct crypto_info).
> setsockpot [mentioned above] is quite late for driver to enable HW
> implementation, we require something as early as tls_init [setsockopt(sock,
> SOL_TCP, TCP_ULP, "tls", sizeof("tls"))], for driver to set HW prot and
> offload connection beside Inline Tx/Rx.
> > 
> > Any reason we can't use ethtool to choose HW vs SW implementation, if
> > available on the device?
> Thought about it,  the interface index is not available to fetch netdev and
> caps check to set HW prot eg. bind [prot.hash] --> tls_hash to program HW.

Perhaps this is the part I don't follow - why do you need to override
hash and check for LISTEN?  I briefly looked through the patch named
"CPL handler definition", this looks like it is a full TCP offload?

Re: [PATCH net-next 1/1] rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK

2018-01-31 Thread Christian Brauner

On Wed, Jan 31, 2018 at 10:30:44AM -0500, David Miller wrote:
> From: Christian Brauner 
> Date: Mon, 29 Jan 2018 18:07:20 +0100
> 
> > - Backwards Compatibility:
> >   If userspace wants to determine whether RTM_NEWLINK supports the
> >   IFLA_IF_NETNSID property they should first send an RTM_GETLINK request
> >   with IFLA_IF_NETNSID on lo. If either EACCESS is returned or the reply
> >   does not include IFLA_IF_NETNSID userspace should assume that
> >   IFLA_IF_NETNSID is not supported on this kernel.
> >   If the reply does contain an IFLA_IF_NETNSID property userspace
> >   can send an RTM_NEWLINK with a IFLA_IF_NETNSID property. If they receive
> >   EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property
> >   with RTM_NEWLINK. Userpace should then fallback to other means.
> > 
> > - Security:
> >   Callers must have CAP_NET_ADMIN in the owning user namespace of the
> >   target network namespace.
> > 
> > Signed-off-by: Christian Brauner 
> 
> Applied.

Thanks!

[PATCH] be2net: remove redundant initialization of 'head' and pointer txq

2018-01-31 Thread Colin King

From: Colin Ian King 

Variable head is initialized to a value that is never read and is
being updated to a new value a few lines later, hence this
initialization is redundant and can be safely removed as well
as the now unused pointer txq.

Cleans up clang warning:
drivers/net/ethernet/emulex/benet/be_main.c:996:6: warning: Value
stored to 'head' during its initialization is never read

Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/emulex/benet/be_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index c36c81959198..d81e2d37bc3d 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -991,9 +991,8 @@ static u32 be_xmit_enqueue(struct be_adapter *adapter, 
struct be_tx_obj *txo,
 {
u32 i, copied = 0, wrb_cnt = skb_wrb_cnt(skb);
struct device *dev = >pdev->dev;
-   struct be_queue_info *txq = >q;
bool map_single = false;
-   u32 head = txq->head;
+   u32 head;
dma_addr_t busaddr;
int len;
 
-- 
2.15.1

[stable] dccp: CVE-2017-8824: use-after-free in DCCP code

2018-01-31 Thread Ben Hutchings

Please queue up this commit for stable:

69c64866ce07 dccp: CVE-2017-8824: use-after-free in DCCP code

Ben.

-- 
Ben Hutchings
Software Developer, Codethink Ltd.

Re: [PATCH 01/10] net/sched: kconfig: Remove empty help texts

2018-01-31 Thread David Miller

From: Masahiro Yamada 
Date: Thu, 1 Feb 2018 00:37:27 +0900

> 2018-02-01 0:32 GMT+09:00 David Miller :
>> From: Ulf Magnusson 
>> Date: Tue, 30 Jan 2018 20:05:23 +0100
>>
>>> In preparation for adding a warning ("kconfig: Warn if help text is
>>> blank"): https://lkml.org/lkml/2018/1/30/516
>>>
>>> Signed-off-by: Ulf Magnusson 
>>
>> Applied.
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kbuild" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> I had applied the whole series into linux-kbuild.
> 
> 
> Do you want to apply it to your tree?

I already did :-)

It won't cause any problems for it to be in both of our trees so let's
just leave it this way.

Re: [PATCH 01/10] net/sched: kconfig: Remove empty help texts

2018-01-31 Thread Masahiro Yamada

2018-02-01 0:32 GMT+09:00 David Miller :
> From: Ulf Magnusson 
> Date: Tue, 30 Jan 2018 20:05:23 +0100
>
>> In preparation for adding a warning ("kconfig: Warn if help text is
>> blank"): https://lkml.org/lkml/2018/1/30/516
>>
>> Signed-off-by: Ulf Magnusson 
>
> Applied.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kbuild" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


I had applied the whole series into linux-kbuild.


Do you want to apply it to your tree?


-- 
Best Regards
Masahiro Yamada

Re: [PATCH net] netfilter: on sockopt() acquire sock lock only in the required scope

2018-01-31 Thread Pablo Neira Ayuso

On Tue, Jan 30, 2018 at 07:01:40PM +0100, Paolo Abeni wrote:
> Syzbot reported several deadlocks in the netfilter area caused by
> rtnl lock and socket lock being acquired with a different order on
> different code paths, leading to backtraces like the following one:
[...]
> The problem, as Florian noted, is that nf_setsockopt() is always
> called with the socket held, even if the lock itself is required only
> for very tight scopes and only for some operation.

Applied, thanks Paolo.

Re: [PATCH net] r8169: fix RTL8168EP take too long to complete driver initialization.

2018-01-31 Thread David Miller

From: Chunhao Lin 
Date: Wed, 31 Jan 2018 01:32:36 +0800

> Driver check the wrong register bit in rtl_ocp_tx_cond() that keep driver
> waiting until timeout.
> 
> Fix this by waiting for the right register bit.
> 
> Signed-off-by: Chunhao Lin 

Applied and queued up for -stable.

Re: [PATCH net v2] ip6mr: fix stale iterator

2018-01-31 Thread David Miller

From: Nikolay Aleksandrov 
Date: Wed, 31 Jan 2018 16:29:30 +0200

> When we dump the ip6mr mfc entries via proc, we initialize an iterator
> with the table to dump but we don't clear the cache pointer which might
> be initialized from a prior read on the same descriptor that ended. This
> can result in lock imbalance (an unnecessary unlock) leading to other
> crashes and hangs. Clear the cache pointer like ipmr does to fix the issue.
> Thanks for the reliable reproducer.
> 
> Here's syzbot's trace:
 ...
> Reported-by: syzbot 
> 
> Signed-off-by: Nikolay Aleksandrov 
> ---
> v2: make sure the trace doesn't ruin the patch
> No fixes tag because it seems this has been there forever.

Applied and queued up for -stable.

Re: [PATCH] tcp_nv: fix potential integer overflow in tcpnv_acked

2018-01-31 Thread David Miller

From: "Gustavo A. R. Silva" 
Date: Tue, 30 Jan 2018 22:21:48 -0600

> Add suffix ULL to constant 8 in order to avoid a potential integer
> overflow and give the compiler complete information about the proper
> arithmetic to use. Notice that this constant is used in a context that
> expects an expression of type u64.
> 
> The current cast to u64 effectively applies to the whole expression
> as an argument of type u64 to be passed to div64_u64, but it does
> not prevent it from being evaluated using 32-bit arithmetic instead
> of 64-bit arithmetic.
> 
> Also, once the expression is properly evaluated using 64-bit arithmentic,
> there is no need for the parentheses and the external cast to u64.
> 
> Addresses-Coverity-ID: 1357588 ("Unintentional integer overflow")
> Signed-off-by: Gustavo A. R. Silva 

Applied.

Re: [PATCHv2] tls: Add support for encryption using async offload accelerator

2018-01-31 Thread David Miller

From: Vakul Garg 
Date: Wed, 31 Jan 2018 21:34:37 +0530

> Async crypto accelerators (e.g. drivers/crypto/caam) support offloading
> GCM operation. If they are enabled, crypto_aead_encrypt() return error
> code -EINPROGRESS. In this case tls_do_encryption() needs to wait on a
> completion till the time the response for crypto offload request is
> received.
> 
> Signed-off-by: Vakul Garg 
> ---
> v1-v2:
>  - Used crypto_wait_req() to wait for async operation completion
>  - Passed CRYPTO_TFM_REQ_MAY_BACKLOG to crypto_aead_encrypt

Applied.

Re: [PATCH] openvswitch: meter: Use 64-bit arithmetic instead of 32-bit

2018-01-31 Thread David Miller

From: "Gustavo A. R. Silva" 
Date: Tue, 30 Jan 2018 22:55:33 -0600

> Add suffix LL to constant 1000 in order to give the compiler
> complete information about the proper arithmetic to use. Notice
> that this constant is used in a context that expects an expression
> of type long long int (64 bits, signed).
> 
> The expression (band->burst_size + band->rate) * 1000 is currently
> being evaluated using 32-bit arithmetic.
> 
> Addresses-Coverity-ID: 1461563 ("Unintentional integer overflow")
> Signed-off-by: Gustavo A. R. Silva 

Applied.

Re: [PATCH 01/10] net/sched: kconfig: Remove empty help texts

2018-01-31 Thread David Miller

From: Ulf Magnusson 
Date: Tue, 30 Jan 2018 20:05:23 +0100

> In preparation for adding a warning ("kconfig: Warn if help text is
> blank"): https://lkml.org/lkml/2018/1/30/516
> 
> Signed-off-by: Ulf Magnusson 

Applied.

Re: [PATCH net,stable] qmi_wwan: Add support for Quectel EP06

2018-01-31 Thread David Miller

From: Kristian Evensen 
Date: Tue, 30 Jan 2018 14:12:55 +0100

> The Quectel EP06 is a Cat. 6 LTE modem. It uses the same interface as
> the EC20/EC25 for QMI, and requires the same "set DTR"-quirk to work.
> 
> Signed-off-by: Kristian Evensen 

Applied and queued up for -stable.

Re: [PATCH net-next 1/1] rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK

2018-01-31 Thread David Miller

From: Christian Brauner 
Date: Mon, 29 Jan 2018 18:07:20 +0100

> - Backwards Compatibility:
>   If userspace wants to determine whether RTM_NEWLINK supports the
>   IFLA_IF_NETNSID property they should first send an RTM_GETLINK request
>   with IFLA_IF_NETNSID on lo. If either EACCESS is returned or the reply
>   does not include IFLA_IF_NETNSID userspace should assume that
>   IFLA_IF_NETNSID is not supported on this kernel.
>   If the reply does contain an IFLA_IF_NETNSID property userspace
>   can send an RTM_NEWLINK with a IFLA_IF_NETNSID property. If they receive
>   EOPNOTSUPP then the kernel does not support the IFLA_IF_NETNSID property
>   with RTM_NEWLINK. Userpace should then fallback to other means.
> 
> - Security:
>   Callers must have CAP_NET_ADMIN in the owning user namespace of the
>   target network namespace.
> 
> Signed-off-by: Christian Brauner 

Applied.

Re: [PATCHv2] tls: Add support for encryption using async offload accelerator

2018-01-31 Thread Dave Watson

On 01/31/18 09:34 PM, Vakul Garg wrote:
> Async crypto accelerators (e.g. drivers/crypto/caam) support offloading
> GCM operation. If they are enabled, crypto_aead_encrypt() return error
> code -EINPROGRESS. In this case tls_do_encryption() needs to wait on a
> completion till the time the response for crypto offload request is
> received.

Comments from V1
> On Wed, Jan 31, 2018 at 8:10 AM, Gilad Ben-Yossef  wrote:
>> Hi Vakul,
>>
>> On Wed, Jan 31, 2018 at 12:36 PM, Vakul Garg  wrote:
>>> Async crypto accelerators (e.g. drivers/crypto/caam) support offloading
>>> GCM operation. If they are enabled, crypto_aead_encrypt() return error
>>> code -EINPROGRESS. In this case tls_do_encryption() needs to wait on a
>>> completion till the time the response for crypto offload request is
>>> received.
>>>
>>
>> Thank you for this patch. I think it is actually a bug fix and should
>> probably go into stable
>
> On second though in stable we should probably just disable async tfm
> allocations.
> It's simpler. But this approach is still good for -next
>
>
> Gilad

I agree with Gilad, just disable async for now. 

If the flag MSG_DONTWAIT is set, we should be returning -EINPROGRESS
and not wait for a response.  I had started working on a patch for
that, but it's pretty tricky to get right.

Re: [PATCH net v2] ip6mr: fix stale iterator

2018-01-31 Thread David Miller

From: Dmitry Vyukov 
Date: Wed, 31 Jan 2018 15:49:15 +0100

> Don't we need to Cc stable 2.6 in this case or something like this. We
> want it to be backported.

Networking changes do not CC: stable.

Please read the netdev FAQ, thank you.

Re: [PATCH] net: pxa168_eth: add netconsole support

2018-01-31 Thread David Miller

From: Alexander Monakov 
Date: Wed, 31 Jan 2018 17:05:47 +0300 (MSK)

> And is this a matter of efficiency (not calling napi_schedule when there's
> nothing to do and not keeping interrupts enabled for its duration), or also
> a matter of correctness?

If you don't mask interrupts properly around scheduling and finishing
NAPI poll, you can lose events so it is a matter of correctness.

Re: [PATCH net v2] ip6mr: fix stale iterator

2018-01-31 Thread Dmitry Vyukov

On Wed, Jan 31, 2018 at 3:52 PM, Nikolay Aleksandrov
 wrote:
> On 31/01/18 16:49, Dmitry Vyukov wrote:
>> On Wed, Jan 31, 2018 at 3:29 PM, Nikolay Aleksandrov
>>  wrote:
>>> When we dump the ip6mr mfc entries via proc, we initialize an iterator
>>> with the table to dump but we don't clear the cache pointer which might
>>> be initialized from a prior read on the same descriptor that ended. This
>>> can result in lock imbalance (an unnecessary unlock) leading to other
>>> crashes and hangs. Clear the cache pointer like ipmr does to fix the issue.
>>> Thanks for the reliable reproducer.
> [snip]
>>> Reported-by: syzbot 
>>> 
>>> Signed-off-by: Nikolay Aleksandrov 
>>> ---
>>> v2: make sure the trace doesn't ruin the patch
>>> No fixes tag because it seems this has been there forever.
>>
>> Don't we need to Cc stable 2.6 in this case or something like this. We
>> want it to be backported.
>
> AFAIK Dave takes care of queueing the patches for stable backports and
> maintainers get them from his stable queue.

Good to know. Thanks.

>>>  net/ipv6/ip6mr.c | 1 +
>>>  1 file changed, 1 insertion(+)
>>>
>>> diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
>>> index a2e1a864eb46..4fc566ec7e79 100644
>>> --- a/net/ipv6/ip6mr.c
>>> +++ b/net/ipv6/ip6mr.c
>>> @@ -495,6 +495,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, 
>>> loff_t *pos)
>>> return ERR_PTR(-ENOENT);
>>>
>>> it->mrt = mrt;
>>> +   it->cache = NULL;
>>> return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
>>> : SEQ_START_TOKEN;
>>>  }
>>> --
>>> 2.1.4
>>>
>>> --
>>> You received this message because you are subscribed to the Google Groups 
>>> "syzkaller-bugs" group.
>>> To unsubscribe from this group and stop receiving emails from it, send an 
>>> email to syzkaller-bugs+unsubscr...@googlegroups.com.
>>> To view this discussion on the web visit 
>>> https://groups.google.com/d/msgid/syzkaller-bugs/1517408970-14210-1-git-send-email-nikolay%40cumulusnetworks.com.
>>> For more options, visit https://groups.google.com/d/optout.
>

Re: [Xen-devel] [PATCHv2] xen-netfront: remove warning when unloading module

2018-01-31 Thread Oleksandr Andrushchenko


Hi, Eduardo!

I am working on a frontend driver (PV DRM) and also seeing some strange

things on driver unloading:

xt# rmmod -f drm_xen_front.ko
[ 3236.462497] [drm] Unregistering XEN PV vdispl
[ 3236.485745] [drm:xen_drv_remove [drm_xen_front]] *ERROR* Backend 
state is InitWait while removing driver

[ 3236.486950] vdispl vdispl-0: 22 freeing event channel 11
[ 3236.496123] vdispl vdispl-0: failed to write error node for 
device/vdispl/0 (22 freeing event channel 11)

[ 3236.496271] vdispl vdispl-0: 22 freeing event channel 12
[ 3236.501633] vdispl vdispl-0: failed to write error node for 
device/vdispl/0 (22 freeing event channel 12)


These are somewhat different from your use-case with grant references, 
but I have a question:


do you really see that XenbusStateClosed and XenbusStateClosing are

called? In my driver I can't see those and once I tried to dig deeper 
into the problem


I saw that on driver removal it is disconnected from XenBus, so no 
backend state


change events come in via .otherend_changed callback.

The only difference I see here is that the backend is a user-space 
application


Thank you,
Oleksandr

On 11/23/2017 04:18 PM, Eduardo Otubo wrote:

v2:
  * Replace busy wait with wait_event()/wake_up_all()
  * Cannot garantee that at the time xennet_remove is called, the
xen_netback state will not be XenbusStateClosed, so added a
condition for that
  * There's a small chance for the xen_netback state is
XenbusStateUnknown by the time the xen_netfront switches to Closed,
so added a condition for that.

When unloading module xen_netfront from guest, dmesg would output
warning messages like below:

   [  105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use!
   [  105.236839] deferring g.e. 0x903 (pfn 0x35805)

This problem relies on netfront and netback being out of sync. By the time
netfront revokes the g.e.'s netback didn't have enough time to free all of
them, hence displaying the warnings on dmesg.

The trick here is to make netfront to wait until netback frees all the g.e.'s
and only then continue to cleanup for the module removal, and this is done by
manipulating both device states.

Signed-off-by: Eduardo Otubo 
---
  drivers/net/xen-netfront.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 8b8689c6d887..391432e2725d 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -87,6 +87,8 @@ struct netfront_cb {
  /* IRQ name is queue name with "-tx" or "-rx" appended */
  #define IRQ_NAME_SIZE (QUEUE_NAME_SIZE + 3)
  
+static DECLARE_WAIT_QUEUE_HEAD(module_unload_q);

+
  struct netfront_stats {
u64 packets;
u64 bytes;
@@ -2021,10 +2023,12 @@ static void netback_changed(struct xenbus_device *dev,
break;
  
  	case XenbusStateClosed:

+   wake_up_all(_unload_q);
if (dev->state == XenbusStateClosed)
break;
/* Missed the backend's CLOSING state -- fallthrough */
case XenbusStateClosing:
+   wake_up_all(_unload_q);
xenbus_frontend_closed(dev);
break;
}
@@ -2130,6 +2134,20 @@ static int xennet_remove(struct xenbus_device *dev)
  
  	dev_dbg(>dev, "%s\n", dev->nodename);
  
+	if (xenbus_read_driver_state(dev->otherend) != XenbusStateClosed) {

+   xenbus_switch_state(dev, XenbusStateClosing);
+   wait_event(module_unload_q,
+  xenbus_read_driver_state(dev->otherend) ==
+  XenbusStateClosing);
+
+   xenbus_switch_state(dev, XenbusStateClosed);
+   wait_event(module_unload_q,
+  xenbus_read_driver_state(dev->otherend) ==
+  XenbusStateClosed ||
+  xenbus_read_driver_state(dev->otherend) ==
+  XenbusStateUnknown);
+   }
+
xennet_disconnect_backend(info);
  
  	unregister_netdev(info->netdev);

Re: [PATCH] wireless: zd1211rw: remove redundant assignment of pointer 'q'

2018-01-31 Thread Dan Carpenter

On Wed, Jan 31, 2018 at 02:58:57PM +0200, Andy Shevchenko wrote:
> On Tue, Jan 30, 2018 at 8:25 PM, Colin King  wrote:
> > From: Colin Ian King 
> >
> > Pointer q is initialized and then almost immediately afterwards being
> > re-assigned the same value. Remove the second redundant assignment.
> >
> 
> Don't you see strange that in the same context of the patch two users
> of q are present?
> 
> How did you test this?
> 

The patch is obviously correct, I don't understand the question.

regards,
dan carpenter

Re: [PATCH net v2] ip6mr: fix stale iterator

2018-01-31 Thread Nikolay Aleksandrov

On 31/01/18 16:49, Dmitry Vyukov wrote:
> On Wed, Jan 31, 2018 at 3:29 PM, Nikolay Aleksandrov
>  wrote:
>> When we dump the ip6mr mfc entries via proc, we initialize an iterator
>> with the table to dump but we don't clear the cache pointer which might
>> be initialized from a prior read on the same descriptor that ended. This
>> can result in lock imbalance (an unnecessary unlock) leading to other
>> crashes and hangs. Clear the cache pointer like ipmr does to fix the issue.
>> Thanks for the reliable reproducer.
[snip]
>> Reported-by: syzbot 
>> 
>> Signed-off-by: Nikolay Aleksandrov 
>> ---
>> v2: make sure the trace doesn't ruin the patch
>> No fixes tag because it seems this has been there forever.
> 
> Don't we need to Cc stable 2.6 in this case or something like this. We
> want it to be backported.

AFAIK Dave takes care of queueing the patches for stable backports and
maintainers get them from his stable queue.

> 
>>
>>  net/ipv6/ip6mr.c | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
>> index a2e1a864eb46..4fc566ec7e79 100644
>> --- a/net/ipv6/ip6mr.c
>> +++ b/net/ipv6/ip6mr.c
>> @@ -495,6 +495,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, 
>> loff_t *pos)
>> return ERR_PTR(-ENOENT);
>>
>> it->mrt = mrt;
>> +   it->cache = NULL;
>> return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
>> : SEQ_START_TOKEN;
>>  }
>> --
>> 2.1.4
>>
>> --
>> You received this message because you are subscribed to the Google Groups 
>> "syzkaller-bugs" group.
>> To unsubscribe from this group and stop receiving emails from it, send an 
>> email to syzkaller-bugs+unsubscr...@googlegroups.com.
>> To view this discussion on the web visit 
>> https://groups.google.com/d/msgid/syzkaller-bugs/1517408970-14210-1-git-send-email-nikolay%40cumulusnetworks.com.
>> For more options, visit https://groups.google.com/d/optout.

Re: [PATCH bpf v2] bpf: fix null pointer deref in bpf_prog_test_run_xdp

2018-01-31 Thread Jesper Dangaard Brouer

On Wed, 31 Jan 2018 12:58:56 +0100
Daniel Borkmann  wrote:

> syzkaller was able to generate the following XDP program ...
> 
>   (18) r0 = 0x0
>   (61) r5 = *(u32 *)(r1 +12)
>   (04) (u32) r0 += (u32) 0
>   (95) exit
> 
> ... and trigger a NULL pointer dereference in ___bpf_prog_run()
> via bpf_prog_test_run_xdp() where this was attempted to run.
> 
> Reason is that recent xdp_rxq_info addition to XDP programs
> updated all drivers, but not bpf_prog_test_run_xdp(), where
> xdp_buff is set up. Thus when context rewriter does the deref
> on the netdev it's NULL at runtime. Fix it by using xdp_rxq
> from loopback dev. __netif_get_rx_queue() helper can also be
> reused in various other locations later on.
> 
> Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs")
> Reported-by: syzbot+1eb094057b338eb1f...@syzkaller.appspotmail.com
> Signed-off-by: Daniel Borkmann 
> Cc: Jesper Dangaard Brouer 

Acked-by: Jesper Dangaard Brouer 

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCH net v2] ip6mr: fix stale iterator

2018-01-31 Thread Dmitry Vyukov

On Wed, Jan 31, 2018 at 3:29 PM, Nikolay Aleksandrov
 wrote:
> When we dump the ip6mr mfc entries via proc, we initialize an iterator
> with the table to dump but we don't clear the cache pointer which might
> be initialized from a prior read on the same descriptor that ended. This
> can result in lock imbalance (an unnecessary unlock) leading to other
> crashes and hangs. Clear the cache pointer like ipmr does to fix the issue.
> Thanks for the reliable reproducer.
>
> Here's syzbot's trace:
>  WARNING: bad unlock balance detected!
>  4.15.0-rc3+ #128 Not tainted
>  syzkaller971460/3195 is trying to release lock (mrt_lock) at:
>  [<6898068d>] ipmr_mfc_seq_stop+0xe1/0x130 net/ipv6/ip6mr.c:553
>  but there are no more locks to release!
>
>  other info that might help us debug this:
>  1 lock held by syzkaller971460/3195:
>   #0:  (>lock){+.+.}, at: [<744a6565>] seq_read+0xd5/0x13d0
>  fs/seq_file.c:165
>
>  stack backtrace:
>  CPU: 1 PID: 3195 Comm: syzkaller971460 Not tainted 4.15.0-rc3+ #128
>  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>  Google 01/01/2011
>  Call Trace:
>   __dump_stack lib/dump_stack.c:17 [inline]
>   dump_stack+0x194/0x257 lib/dump_stack.c:53
>   print_unlock_imbalance_bug+0x12f/0x140 kernel/locking/lockdep.c:3561
>   __lock_release kernel/locking/lockdep.c:3775 [inline]
>   lock_release+0x5f9/0xda0 kernel/locking/lockdep.c:4023
>   __raw_read_unlock include/linux/rwlock_api_smp.h:225 [inline]
>   _raw_read_unlock+0x1a/0x30 kernel/locking/spinlock.c:255
>   ipmr_mfc_seq_stop+0xe1/0x130 net/ipv6/ip6mr.c:553
>   traverse+0x3bc/0xa00 fs/seq_file.c:135
>   seq_read+0x96a/0x13d0 fs/seq_file.c:189
>   proc_reg_read+0xef/0x170 fs/proc/inode.c:217
>   do_loop_readv_writev fs/read_write.c:673 [inline]
>   do_iter_read+0x3db/0x5b0 fs/read_write.c:897
>   compat_readv+0x1bf/0x270 fs/read_write.c:1140
>   do_compat_preadv64+0xdc/0x100 fs/read_write.c:1189
>   C_SYSC_preadv fs/read_write.c:1209 [inline]
>   compat_SyS_preadv+0x3b/0x50 fs/read_write.c:1203
>   do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
>   do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
>   entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125
>  RIP: 0023:0xf7f73c79
>  RSP: 002b:e574a15c EFLAGS: 0292 ORIG_RAX: 014d
>  RAX: ffda RBX: 000f RCX: 20a3afb0
>  RDX: 0001 RSI: 0067 RDI: 
>  RBP:  R08:  R09: 
>  R10:  R11:  R12: 
>  R13:  R14:  R15: 
>  BUG: sleeping function called from invalid context at lib/usercopy.c:25
>  in_atomic(): 1, irqs_disabled(): 0, pid: 3195, name: syzkaller971460
>  INFO: lockdep is turned off.
>  CPU: 1 PID: 3195 Comm: syzkaller971460 Not tainted 4.15.0-rc3+ #128
>  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>  Google 01/01/2011
>  Call Trace:
>   __dump_stack lib/dump_stack.c:17 [inline]
>   dump_stack+0x194/0x257 lib/dump_stack.c:53
>   ___might_sleep+0x2b2/0x470 kernel/sched/core.c:6060
>   __might_sleep+0x95/0x190 kernel/sched/core.c:6013
>   __might_fault+0xab/0x1d0 mm/memory.c:4525
>   _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
>   copy_to_user include/linux/uaccess.h:155 [inline]
>   seq_read+0xcb4/0x13d0 fs/seq_file.c:279
>   proc_reg_read+0xef/0x170 fs/proc/inode.c:217
>   do_loop_readv_writev fs/read_write.c:673 [inline]
>   do_iter_read+0x3db/0x5b0 fs/read_write.c:897
>   compat_readv+0x1bf/0x270 fs/read_write.c:1140
>   do_compat_preadv64+0xdc/0x100 fs/read_write.c:1189
>   C_SYSC_preadv fs/read_write.c:1209 [inline]
>   compat_SyS_preadv+0x3b/0x50 fs/read_write.c:1203
>   do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
>   do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
>   entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125
>  RIP: 0023:0xf7f73c79
>  RSP: 002b:e574a15c EFLAGS: 0292 ORIG_RAX: 014d
>  RAX: ffda RBX: 000f RCX: 20a3afb0
>  RDX: 0001 RSI: 0067 RDI: 
>  RBP:  R08:  R09: 
>  R10:  R11:  R12: 
>  R13:  R14:  R15: 
>  WARNING: CPU: 1 PID: 3195 at lib/usercopy.c:26 _copy_to_user+0xb5/0xc0
>  lib/usercopy.c:26
>
> Reported-by: syzbot 
> 
> Signed-off-by: Nikolay Aleksandrov 
> ---
> v2: make sure the trace doesn't ruin the patch
> No fixes tag because it seems this has been there forever.

Don't we need to Cc stable 2.6 in this case or something like this. We
want it to be backported.

>
>  net/ipv6/ip6mr.c | 1 +
>  1 file changed, 1

[PATCH net v2] ip6mr: fix stale iterator

2018-01-31 Thread Nikolay Aleksandrov

When we dump the ip6mr mfc entries via proc, we initialize an iterator
with the table to dump but we don't clear the cache pointer which might
be initialized from a prior read on the same descriptor that ended. This
can result in lock imbalance (an unnecessary unlock) leading to other
crashes and hangs. Clear the cache pointer like ipmr does to fix the issue.
Thanks for the reliable reproducer.

Here's syzbot's trace:
 WARNING: bad unlock balance detected!
 4.15.0-rc3+ #128 Not tainted
 syzkaller971460/3195 is trying to release lock (mrt_lock) at:
 [<6898068d>] ipmr_mfc_seq_stop+0xe1/0x130 net/ipv6/ip6mr.c:553
 but there are no more locks to release!

 other info that might help us debug this:
 1 lock held by syzkaller971460/3195:
  #0:  (>lock){+.+.}, at: [<744a6565>] seq_read+0xd5/0x13d0
 fs/seq_file.c:165

 stack backtrace:
 CPU: 1 PID: 3195 Comm: syzkaller971460 Not tainted 4.15.0-rc3+ #128
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x194/0x257 lib/dump_stack.c:53
  print_unlock_imbalance_bug+0x12f/0x140 kernel/locking/lockdep.c:3561
  __lock_release kernel/locking/lockdep.c:3775 [inline]
  lock_release+0x5f9/0xda0 kernel/locking/lockdep.c:4023
  __raw_read_unlock include/linux/rwlock_api_smp.h:225 [inline]
  _raw_read_unlock+0x1a/0x30 kernel/locking/spinlock.c:255
  ipmr_mfc_seq_stop+0xe1/0x130 net/ipv6/ip6mr.c:553
  traverse+0x3bc/0xa00 fs/seq_file.c:135
  seq_read+0x96a/0x13d0 fs/seq_file.c:189
  proc_reg_read+0xef/0x170 fs/proc/inode.c:217
  do_loop_readv_writev fs/read_write.c:673 [inline]
  do_iter_read+0x3db/0x5b0 fs/read_write.c:897
  compat_readv+0x1bf/0x270 fs/read_write.c:1140
  do_compat_preadv64+0xdc/0x100 fs/read_write.c:1189
  C_SYSC_preadv fs/read_write.c:1209 [inline]
  compat_SyS_preadv+0x3b/0x50 fs/read_write.c:1203
  do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
  do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
  entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125
 RIP: 0023:0xf7f73c79
 RSP: 002b:e574a15c EFLAGS: 0292 ORIG_RAX: 014d
 RAX: ffda RBX: 000f RCX: 20a3afb0
 RDX: 0001 RSI: 0067 RDI: 
 RBP:  R08:  R09: 
 R10:  R11:  R12: 
 R13:  R14:  R15: 
 BUG: sleeping function called from invalid context at lib/usercopy.c:25
 in_atomic(): 1, irqs_disabled(): 0, pid: 3195, name: syzkaller971460
 INFO: lockdep is turned off.
 CPU: 1 PID: 3195 Comm: syzkaller971460 Not tainted 4.15.0-rc3+ #128
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x194/0x257 lib/dump_stack.c:53
  ___might_sleep+0x2b2/0x470 kernel/sched/core.c:6060
  __might_sleep+0x95/0x190 kernel/sched/core.c:6013
  __might_fault+0xab/0x1d0 mm/memory.c:4525
  _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
  copy_to_user include/linux/uaccess.h:155 [inline]
  seq_read+0xcb4/0x13d0 fs/seq_file.c:279
  proc_reg_read+0xef/0x170 fs/proc/inode.c:217
  do_loop_readv_writev fs/read_write.c:673 [inline]
  do_iter_read+0x3db/0x5b0 fs/read_write.c:897
  compat_readv+0x1bf/0x270 fs/read_write.c:1140
  do_compat_preadv64+0xdc/0x100 fs/read_write.c:1189
  C_SYSC_preadv fs/read_write.c:1209 [inline]
  compat_SyS_preadv+0x3b/0x50 fs/read_write.c:1203
  do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
  do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
  entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125
 RIP: 0023:0xf7f73c79
 RSP: 002b:e574a15c EFLAGS: 0292 ORIG_RAX: 014d
 RAX: ffda RBX: 000f RCX: 20a3afb0
 RDX: 0001 RSI: 0067 RDI: 
 RBP:  R08:  R09: 
 R10:  R11:  R12: 
 R13:  R14:  R15: 
 WARNING: CPU: 1 PID: 3195 at lib/usercopy.c:26 _copy_to_user+0xb5/0xc0
 lib/usercopy.c:26

Reported-by: syzbot 

Signed-off-by: Nikolay Aleksandrov 
---
v2: make sure the trace doesn't ruin the patch
No fixes tag because it seems this has been there forever.

 net/ipv6/ip6mr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a2e1a864eb46..4fc566ec7e79 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -495,6 +495,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, 
loff_t *pos)
return ERR_PTR(-ENOENT);
 
it->mrt = mrt;
+   it->cache = NULL;
return *pos ? ipmr_mfc_seq_idx(net,

Re: [PATCH] net: pxa168_eth: add netconsole support

2018-01-31 Thread Alexander Monakov

> > +#ifdef CONFIG_NET_POLL_CONTROLLER
> > +static void pxa168_eth_netpoll(struct net_device *dev)
> > +{
> > +   struct pxa168_eth_private *pep = netdev_priv(dev);
> > +   napi_schedule(>napi);
> > +}
> > +#endif
> 
> This definitely is not sufficient.
> 
> Look at what other drivers do.

Sorry, I did that, and got confused because some drivers bracket the interrupt
handler with disable_irq/enable_irq, some other drivers use local_irq_save/
local_irq_restore (which seems like a no-op because netconsole already uses
spin_lock_irqsave and netpoll_send_udp checks irqs_disabled), and a few drivers
call bare napi_schedule.

> You have to invoke the interrupt handler with the device's interrupts 
> disabled,

Is this required for correctness? I have to admit I'm unsure why.

> collect the events, and most importantly mask chip interrupts before 
> scheduling
> the NAPI instance.

And is this a matter of efficiency (not calling napi_schedule when there's
nothing to do and not keeping interrupts enabled for its duration), or also
a matter of correctness?

Sorry I'm not sending a revised patch yet, but without a firm understanding
of why changes are needed that would be a bit of a sin.

Thanks.
Alexander

Re: [PATCH] netfilter: fix out-of-bounds accesses in clusterip_tg_check()

2018-01-31 Thread Pablo Neira Ayuso

On Tue, Jan 30, 2018 at 03:21:34PM +0100, Dmitry Vyukov wrote:
> Commit 136e92bbec0a switched local_nodes from an array to a bitmask
> but did not add proper bounds checks. As the result
> clusterip_config_init_nodelist() can both over-read
> ipt_clusterip_tgt_info.local_nodes and over-write
> clusterip_config.local_nodes.
> 
> Add bounds checks for both.

Applied, thanks Dmitry.

Re: [PATCH] netfilter: fix pointer leaks to userspace

2018-01-31 Thread Pablo Neira Ayuso

On Mon, Jan 29, 2018 at 01:21:20PM +0100, Dmitry Vyukov wrote:
> Several netfilter matches and targets put kernel pointers into
> info objects, but don't set usersize in descriptors.
> This leads to kernel pointer leaks if a match/target is set
> and then read back to userspace.
> 
> Properly set usersize for these matches/targets.
> 
> Found with manual code inspection.

Applied, thanks!

I think this fixes:

ec2318904965 xtables: extend matches and targets with .usersize

So I'm going to add the Fixes: tag here, no problem.

[RFC PATCH 01/24] xsk: AF_XDP sockets buildable skeleton

2018-01-31 Thread Björn Töpel

From: Björn Töpel 

Buildable skeleton. Move on, nothing to see.

Signed-off-by: Björn Töpel 
---
 include/linux/socket.h  |   5 +-
 include/uapi/linux/if_xdp.h |  32 +
 net/Kconfig |   1 +
 net/Makefile|   1 +
 net/core/sock.c |  12 ++--
 net/xdp/Kconfig |   7 ++
 net/xdp/Makefile|   1 +
 net/xdp/xsk.c   | 133 
 net/xdp/xsk.h   |  18 +
 security/selinux/hooks.c|   4 +-
 security/selinux/include/classmap.h |   4 +-
 11 files changed, 211 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/if_xdp.h
 create mode 100644 net/xdp/Kconfig
 create mode 100644 net/xdp/Makefile
 create mode 100644 net/xdp/xsk.c
 create mode 100644 net/xdp/xsk.h

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 9286a5a8c60c..ada0102ff8db 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -207,8 +207,9 @@ struct ucred {
 * PF_SMC protocol family that
 * reuses AF_INET address family
 */
+#define AF_XDP 44  /* XDP sockets  */
 
-#define AF_MAX 44  /* For now.. */
+#define AF_MAX 45  /* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC  AF_UNSPEC
@@ -257,6 +258,7 @@ struct ucred {
 #define PF_KCM AF_KCM
 #define PF_QIPCRTR AF_QIPCRTR
 #define PF_SMC AF_SMC
+#define PF_XDP AF_XDP
 #define PF_MAX AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -337,6 +339,7 @@ struct ucred {
 #define SOL_NFC280
 #define SOL_KCM281
 #define SOL_TLS282
+#define SOL_XDP283
 
 /* IPX options */
 #define IPX_TYPE   1
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
new file mode 100644
index ..cd09232e16c1
--- /dev/null
+++ b/include/uapi/linux/if_xdp.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * if_xdp: XDP socket user-space interface
+ *
+ * Copyright(c) 2017 Intel Corporation.
+ *
+ * Author(s): Björn Töpel 
+ *   Magnus Karlsson 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifndef _LINUX_IF_XDP_H
+#define _LINUX_IF_XDP_H
+
+#include 
+
+struct sockaddr_xdp {
+   __u16   sxdp_family;
+   __u32   sxdp_ifindex;
+   __u32   sxdp_queue_id;
+};
+
+/* XDP socket options */
+#define XDP_MEM_REG1
+#define XDP_RX_RING2
+#define XDP_TX_RING3
+
+#endif /* _LINUX_IF_XDP_H */
diff --git a/net/Kconfig b/net/Kconfig
index 37ec8e67af57..03e5c64b411d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -59,6 +59,7 @@ source "net/tls/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
 source "net/smc/Kconfig"
+source "net/xdp/Kconfig"
 
 config INET
bool "TCP/IP networking"
diff --git a/net/Makefile b/net/Makefile
index 14fede520840..9df8e6f827f8 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -86,3 +86,4 @@ obj-y += l3mdev/
 endif
 obj-$(CONFIG_QRTR) += qrtr/
 obj-$(CONFIG_NET_NCSI) += ncsi/
+obj-$(CONFIG_XDP_SOCKETS)  += xdp/
diff --git a/net/core/sock.c b/net/core/sock.c
index abf4cbff99b2..4d29430f4671 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,7 +226,8 @@ static struct lock_class_key 
af_family_kern_slock_keys[AF_MAX];
   x "AF_RXRPC" ,   x "AF_ISDN" ,   x "AF_PHONET"   , \
   x "AF_IEEE802154",   x "AF_CAIF" ,   x "AF_ALG"  , \
   x "AF_NFC"   ,   x "AF_VSOCK",   x "AF_KCM"  , \
-  x "AF_QIPCRTR",  x "AF_SMC"  ,   x "AF_MAX"
+  x "AF_QIPCRTR",  x "AF_SMC"  ,   x "AF_XDP"  , \
+  x "AF_MAX"
 
 static const char *const af_family_key_strings[AF_MAX+1] = {
_sock_locks("sk_lock-")
@@ -262,7 +263,8 @@ static const char *const 
af_family_rlock_key_strings[AF_MAX+1] = {
   "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET"   ,
   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"  ,
   "rlock-AF_NFC"   , "rlock-AF_VSOCK", "rlock-AF_KCM"  ,
-  "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX"
+  "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP"  ,
+  "rlock-AF_MAX"
 };
 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
   "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
@@ -279,7 +281,8 @@ static const char *const 
af_family_wlock_key_strings[AF_MAX+1] = {
   "wlock-AF_RXRPC" , "wlock-AF_ISDN" ,

[RFC PATCH 02/24] xsk: add user memory registration sockopt

2018-01-31 Thread Björn Töpel

From: Björn Töpel 

The XDP_MEM_REG socket option allows a process to register a window of
user space memory to the kernel. This memory will later be used as
frame data buffer.

Signed-off-by: Björn Töpel 
---
 include/uapi/linux/if_xdp.h |   7 ++
 net/xdp/xsk.c   | 294 +++-
 net/xdp/xsk.h   |  19 ++-
 3 files changed, 316 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index cd09232e16c1..3f8c90c708b4 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -29,4 +29,11 @@ struct sockaddr_xdp {
 #define XDP_RX_RING2
 #define XDP_TX_RING3
 
+struct xdp_mr_req {
+   __u64   addr;   /* Start of packet data area */
+   __u64   len;/* Length of packet data area */
+   __u32   frame_size; /* Frame size */
+   __u32   data_headroom;  /* Frame head room */
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2d7c08a50c60..333ce1450cc7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -19,18 +19,235 @@
 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 
 #include "xsk.h"
 
+#define XSK_UMEM_MIN_FRAME_SIZE 2048
+
 struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
+   struct xsk_umem *umem;
 };
 
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+   return (struct xdp_sock *)sk;
+}
+
+static void xsk_umem_unpin_pages(struct xsk_umem *umem)
+{
+   unsigned int i;
+
+   if (umem->pgs) {
+   for (i = 0; i < umem->npgs; i++) {
+   struct page *page = umem->pgs[i];
+
+   set_page_dirty_lock(page);
+   put_page(page);
+   }
+
+   kfree(umem->pgs);
+   umem->pgs = NULL;
+   }
+}
+
+static void xsk_umem_destroy(struct xsk_umem *umem)
+{
+   struct mm_struct *mm;
+   struct task_struct *task;
+   unsigned long diff;
+
+   if (!umem)
+   return;
+
+   xsk_umem_unpin_pages(umem);
+
+   task = get_pid_task(umem->pid, PIDTYPE_PID);
+   put_pid(umem->pid);
+   if (!task)
+   goto out;
+   mm = get_task_mm(task);
+   put_task_struct(task);
+   if (!mm)
+   goto out;
+
+   diff = umem->size >> PAGE_SHIFT;
+
+   down_write(>mmap_sem);
+   mm->pinned_vm -= diff;
+   up_write(>mmap_sem);
+   mmput(mm);
+out:
+   kfree(umem);
+}
+
+static struct xsk_umem *xsk_umem_create(u64 addr, u64 size, u32 frame_size,
+   u32 data_headroom)
+{
+   struct xsk_umem *umem;
+   unsigned int nframes;
+   int size_chk;
+
+   if (frame_size < XSK_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+   /* Strictly speaking we could support this, if:
+* - huge pages, or*
+* - using an IOMMU, or
+* - making sure the memory area is consecutive
+* but for now, we simply say "computer says no".
+*/
+   return ERR_PTR(-EINVAL);
+   }
+
+   if (!is_power_of_2(frame_size))
+   return ERR_PTR(-EINVAL);
+
+   if (!PAGE_ALIGNED(addr)) {
+   /* Memory area has to be page size aligned. For
+* simplicity, this might change.
+*/
+   return ERR_PTR(-EINVAL);
+   }
+
+   if ((addr + size) < addr)
+   return ERR_PTR(-EINVAL);
+
+   nframes = size / frame_size;
+   if (nframes == 0)
+   return ERR_PTR(-EINVAL);
+
+   data_headroom = ALIGN(data_headroom, 64);
+
+   size_chk = frame_size - data_headroom - XSK_KERNEL_HEADROOM;
+   if (size_chk < 0)
+   return ERR_PTR(-EINVAL);
+
+   umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+   if (!umem)
+   return ERR_PTR(-ENOMEM);
+
+   umem->pid = get_task_pid(current, PIDTYPE_PID);
+   umem->size = (size_t)size;
+   umem->address = (unsigned long)addr;
+   umem->frame_size = frame_size;
+   umem->nframes = nframes;
+   umem->data_headroom = data_headroom;
+   umem->pgs = NULL;
+
+   return umem;
+}
+
+static int xsk_umem_pin_pages(struct xsk_umem *umem)
+{
+   unsigned int gup_flags = FOLL_WRITE;
+   long npgs;
+   int err;
+
+   /* XXX Fix so that we don't always pin.
+* "copy to user" from interrupt context, but how?
+*/
+   umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_ATOMIC);
+   if (!umem->pgs)
+   return -ENOMEM;
+
+   npgs = get_user_pages(umem->address, umem->npgs,
+ gup_flags, >pgs[0], NULL);
+   if (npgs != umem->npgs) {
+   if (npgs >= 0) {
+   umem->npgs = npgs;
+

[RFC PATCH 14/24] i40e: implemented page recycling buff_pool

2018-01-31 Thread Björn Töpel

From: Björn Töpel 

Added a buff_poll implementation that do page recycling.

Signed-off-by: Björn Töpel 
---
 drivers/net/ethernet/intel/i40e/buff_pool.c | 385 
 drivers/net/ethernet/intel/i40e/buff_pool.h |   6 +
 2 files changed, 391 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/buff_pool.c 
b/drivers/net/ethernet/intel/i40e/buff_pool.c
index 8c51f61ca71d..42b6cf5042e9 100644
--- a/drivers/net/ethernet/intel/i40e/buff_pool.c
+++ b/drivers/net/ethernet/intel/i40e/buff_pool.c
@@ -283,3 +283,388 @@ void i40e_buff_pool_destroy(struct buff_pool *pool)
kfree(pool);
 }
 
+/* Recycling allocator */
+
+struct i40e_bpr_header {
+   dma_addr_t dma;
+#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
+   __u32 page_offset;
+#else
+   __u16 page_offset;
+#endif
+   __u16 pagecnt_bias;
+};
+
+struct i40e_bpr_pool {
+   unsigned int buff_tot_len;
+   unsigned int buff_len;
+   unsigned int headroom;
+   unsigned int pg_order;
+   unsigned int pg_size;
+   struct device *dev;
+   unsigned int head;
+   unsigned int tail;
+   unsigned int buffs_size_mask;
+   struct i40e_bpr_header *buffs[0];
+};
+
+#define I40E_BPRHDR_ALIGNED_SIZE ALIGN(sizeof(struct i40e_bpr_header), \
+  SMP_CACHE_BYTES)
+
+static int i40e_bpr_alloc(void *pool, unsigned long *handle)
+{
+   struct i40e_bpr_pool *impl = (struct i40e_bpr_pool *)pool;
+   struct i40e_bpr_header *hdr;
+   struct page *pg;
+   dma_addr_t dma;
+
+   if (impl->head != impl->tail) {
+   *handle = (unsigned long)impl->buffs[impl->head];
+   impl->head = (impl->head + 1) & impl->buffs_size_mask;
+
+   return 0;
+   }
+
+   pg = dev_alloc_pages(impl->pg_order);
+   if (unlikely(!pg))
+   return -ENOMEM;
+
+   dma = dma_map_page_attrs(impl->dev, pg, 0, impl->pg_size,
+DMA_FROM_DEVICE, I40E_RX_DMA_ATTR);
+
+   if (dma_mapping_error(impl->dev, dma)) {
+   __free_pages(pg, impl->pg_order);
+   return -ENOMEM;
+   }
+
+   hdr = (struct i40e_bpr_header *)page_address(pg);
+   hdr->dma = dma;
+   hdr->page_offset = I40E_BPRHDR_ALIGNED_SIZE;
+   hdr->pagecnt_bias = 1;
+
+   *handle = (unsigned long)hdr;
+
+   return 0;
+}
+
+static void i40e_bpr_free(void *pool, unsigned long handle)
+{
+   struct i40e_bpr_pool *impl = (struct i40e_bpr_pool *)pool;
+   struct i40e_bpr_header *hdr;
+   unsigned int tail;
+
+   hdr = (struct i40e_bpr_header *)handle;
+   tail = (impl->tail + 1) & impl->buffs_size_mask;
+   /* Is full? */
+   if (tail == impl->head) {
+   dma_unmap_page_attrs(impl->dev, hdr->dma, impl->pg_size,
+DMA_FROM_DEVICE, I40E_RX_DMA_ATTR);
+   __page_frag_cache_drain(virt_to_head_page(hdr),
+   hdr->pagecnt_bias);
+   }
+
+   impl->buffs[impl->tail] = hdr;
+   impl->tail = tail;
+}
+
+static unsigned int i40e_bpr_buff_size(void *pool)
+{
+   struct i40e_bpr_pool *impl = (struct i40e_bpr_pool *)pool;
+
+   return impl->buff_len;
+}
+
+static unsigned int i40e_bpr_total_buff_size(void *pool)
+{
+   struct i40e_bpr_pool *impl = (struct i40e_bpr_pool *)pool;
+
+   return impl->buff_tot_len;
+}
+
+static unsigned int i40e_bpr_buff_headroom(void *pool)
+{
+   struct i40e_bpr_pool *impl = (struct i40e_bpr_pool *)pool;
+
+   return impl->headroom;
+}
+
+static unsigned int i40e_bpr_buff_truesize(void *pool)
+{
+   struct i40e_bpr_pool *impl = (struct i40e_bpr_pool *)pool;
+
+   return impl->buff_tot_len;
+}
+
+static void *i40e_bpr_buff_ptr(void *pool, unsigned long handle)
+{
+   struct i40e_bpr_header *hdr;
+
+   hdr = (struct i40e_bpr_header *)handle;
+
+   return ((void *)hdr) + hdr->page_offset;
+}
+
+static bool i40e_page_is_reusable(struct page *page)
+{
+   return (page_to_nid(page) == numa_mem_id()) &&
+   !page_is_pfmemalloc(page);
+}
+
+static bool i40e_can_reuse_page(struct i40e_bpr_header *hdr)
+{
+   unsigned int pagecnt_bias = hdr->pagecnt_bias;
+   struct page *page = virt_to_head_page(hdr);
+
+   if (unlikely(!i40e_page_is_reusable(page)))
+   return false;
+
+#if (PAGE_SIZE < 8192)
+   if (unlikely((page_count(page) - pagecnt_bias) > 1))
+   return false;
+#else
+#define I40E_LAST_OFFSET \
+   (PAGE_SIZE - I40E_RXBUFFER_3072 - I40E_BPRHDR_ALIGNED_SIZE)
+   if (hdr->page_offset > I40E_LAST_OFFSET)
+   return false;
+#endif
+
+   if (unlikely(!pagecnt_bias)) {
+   page_ref_add(page, USHRT_MAX);
+   hdr->pagecnt_bias = USHRT_MAX;
+   }
+
+   return true;
+}
+
+static int i40e_bpr_buff_convert_to_page(void *pool, unsigned long handle,
+

[RFC PATCH 19/24] xsk: add support for zero copy Rx

2018-01-31 Thread Björn Töpel

From: Björn Töpel 

In this commit we start making use of the new ndo_bpf sub-commands,
and try to enable zero copy, if available.

Signed-off-by: Björn Töpel 
---
 net/xdp/xsk.c | 185 +-
 1 file changed, 145 insertions(+), 40 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f372c3288301..f05ab825d157 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -29,15 +29,21 @@
 #include 
 #include 
 
+#include 
+#include 
+
 #include "xsk.h"
 #include "xsk_buff.h"
 #include "xsk_ring.h"
+#include "xsk_buff_pool.h"
+#include "xsk_packet_array.h"
 
 #define XSK_UMEM_MIN_FRAME_SIZE 2048
 #define XSK_ARRAY_SIZE 512
 
 struct xsk_info {
struct xsk_packet_array *pa;
+   struct buff_pool *bp;
spinlock_t pa_lock;
struct xsk_queue *q;
struct xsk_umem *umem;
@@ -56,8 +62,24 @@ struct xdp_sock {
struct mutex tx_mutex;
u32 ifindex;
u16 queue_id;
+   bool zc_mode;
 };
 
+static inline bool xsk_is_zc_cap(struct xdp_sock *xs)
+{
+   return xs->zc_mode;
+}
+
+static void xsk_set_zc_cap(struct xdp_sock *xs)
+{
+   xs->zc_mode = true;
+}
+
+static void xsk_clear_zc_cap(struct xdp_sock *xs)
+{
+   xs->zc_mode = false;
+}
+
 static struct xdp_sock *xdp_sk(struct sock *sk)
 {
return (struct xdp_sock *)sk;
@@ -323,6 +345,22 @@ static int xsk_init_tx_ring(struct sock *sk, int mr_fd, 
u32 desc_nr)
return xsk_init_ring(sk, mr_fd, desc_nr, >tx);
 }
 
+static void xsk_disable_zc(struct xdp_sock *xs)
+{
+   struct netdev_bpf bpf = {};
+
+   if (!xsk_is_zc_cap(xs))
+   return;
+
+   bpf.command = XDP_UNREGISTER_XSK;
+   bpf.xsk.queue_id = xs->queue_id;
+
+   rtnl_lock();
+   (void)xs->dev->netdev_ops->ndo_bpf(xs->dev, );
+   rtnl_unlock();
+   xsk_clear_zc_cap(xs);
+}
+
 static int xsk_release(struct socket *sock)
 {
struct sock *sk = sock->sk;
@@ -344,14 +382,22 @@ static int xsk_release(struct socket *sock)
xs_prev = xs->dev->_rx[xs->queue_id].xs;
rcu_assign_pointer(xs->dev->_rx[xs->queue_id].xs, NULL);
 
+   xsk_disable_zc(xs);
+
/* Wait for driver to stop using the xdp socket. */
synchronize_net();
 
xskpa_destroy(xs->rx.pa);
-   xskpa_destroy(xs->tx.pa);
-   xsk_umem_destroy(xs_prev->umem);
+   bpool_destroy(xs->rx.bp);
xskq_destroy(xs_prev->rx.q);
+   xsk_buff_info_destroy(xs->rx.buff_info);
+
+   xskpa_destroy(xs->tx.pa);
xskq_destroy(xs_prev->tx.q);
+   xsk_buff_info_destroy(xs->tx.buff_info);
+
+   xsk_umem_destroy(xs_prev->umem);
+
kobject_put(_prev->dev->_rx[xs->queue_id].kobj);
dev_put(xs_prev->dev);
}
@@ -365,6 +411,45 @@ static int xsk_release(struct socket *sock)
return 0;
 }
 
+static int xsk_dma_map_pool_cb(struct buff_pool *pool, struct device *dev,
+  enum dma_data_direction dir,
+  unsigned long attrs)
+{
+   struct xsk_buff_pool *bp = (struct xsk_buff_pool *)pool->pool;
+
+   return xsk_buff_dma_map(bp->bi, dev, dir, attrs);
+}
+
+static void xsk_error_report(void *ctx, int err)
+{
+   struct xsk_sock *xs = (struct xsk_sock *)ctx;
+}
+
+static void xsk_try_enable_zc(struct xdp_sock *xs)
+{
+   struct xsk_rx_parms rx_parms = {};
+   struct netdev_bpf bpf = {};
+   int err;
+
+   if (!xs->dev->netdev_ops->ndo_bpf)
+   return;
+
+   rx_parms.buff_pool = xs->rx.bp;
+   rx_parms.dma_map = xsk_dma_map_pool_cb;
+   rx_parms.error_report_ctx = xs;
+   rx_parms.error_report = xsk_error_report;
+
+   bpf.command = XDP_REGISTER_XSK;
+   bpf.xsk.rx_parms = _parms;
+   bpf.xsk.queue_id = xs->queue_id;
+
+   rtnl_lock();
+   err = xs->dev->netdev_ops->ndo_bpf(xs->dev, );
+   rtnl_unlock();
+   if (!err)
+   xsk_set_zc_cap(xs);
+}
+
 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -429,6 +514,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr 
*addr, int addr_len)
goto out_rx_pa;
}
 
+   /* ...and Rx buffer pool is used for zerocopy. */
+   xs->rx.bp = xsk_buff_pool_create(xs->rx.buff_info, xs->rx.q);
+   if (!xs->rx.bp) {
+   err = -ENOMEM;
+   goto out_rx_bp;
+   }
+
/* Tx */
xs->tx.buff_info = xsk_buff_info_create(xs->tx.umem);
if (!xs->tx.buff_info) {
@@ -446,12 +538,17 @@ static int xsk_bind(struct socket *sock, struct sockaddr 
*addr, int addr_len)
 
rcu_assign_pointer(dev->_rx[sxdp->sxdp_queue_id].xs, xs);
 
+   xsk_try_enable_zc(xs);
+
goto out_unlock;

[RFC PATCH 12/24] xsk: add iterator functions to xsk_ring

2018-01-31 Thread Björn Töpel

From: Björn Töpel 

Add packet array like functionality that acts directly on the
user/kernel shared ring. We'll use this in the zerocopy Rx scenario.

TODO Better naming...

Signed-off-by: Björn Töpel 
---
 net/xdp/xsk_ring.c |   3 +-
 net/xdp/xsk_ring.h | 136 -
 2 files changed, 126 insertions(+), 13 deletions(-)

diff --git a/net/xdp/xsk_ring.c b/net/xdp/xsk_ring.c
index 11b590506ddf..f154ddfabcfc 100644
--- a/net/xdp/xsk_ring.c
+++ b/net/xdp/xsk_ring.c
@@ -41,7 +41,8 @@ struct xsk_queue *xskq_create(u32 nentries)
q->queue_ops.enqueue = xskq_enqueue_from_array;
q->queue_ops.enqueue_completed = xskq_enqueue_completed_from_array;
q->queue_ops.dequeue = xskq_dequeue_to_array;
-   q->used_idx = 0;
+   q->used_idx_head = 0;
+   q->used_idx_tail = 0;
q->last_avail_idx = 0;
q->ring_mask = nentries - 1;
q->num_free = 0;
diff --git a/net/xdp/xsk_ring.h b/net/xdp/xsk_ring.h
index c9d61195ab2d..43c841d55093 100644
--- a/net/xdp/xsk_ring.h
+++ b/net/xdp/xsk_ring.h
@@ -27,7 +27,8 @@ struct xsk_queue {
struct xsk_user_queue queue_ops;
struct xdp_desc *ring;
 
-   u32 used_idx;
+   u32 used_idx_head;
+   u32 used_idx_tail;
u32 last_avail_idx;
u32 ring_mask;
u32 num_free;
@@ -51,8 +52,7 @@ static inline unsigned int xsk_get_data_headroom(struct 
xsk_umem *umem)
  *
  * Returns true if the entry is a valid, otherwise false
  **/
-static inline bool xskq_is_valid_entry(struct xsk_queue *q,
-  struct xdp_desc *d)
+static inline bool xskq_is_valid_entry(struct xsk_queue *q, struct xdp_desc *d)
 {
unsigned int buff_len;
 
@@ -115,7 +115,7 @@ static inline int xskq_nb_avail(struct xsk_queue *q, int 
dcnt)
 static inline int xskq_enqueue(struct xsk_queue *q,
   const struct xdp_desc *d, int dcnt)
 {
-   unsigned int used_idx = q->used_idx;
+   unsigned int used_idx = q->used_idx_tail;
int i;
 
if (q->num_free < dcnt)
@@ -136,11 +136,12 @@ static inline int xskq_enqueue(struct xsk_queue *q,
smp_wmb();
 
for (i = dcnt - 1; i >= 0; i--) {
-   unsigned int idx = (q->used_idx + i) & q->ring_mask;
+   unsigned int idx = (q->used_idx_tail + i) & q->ring_mask;
 
q->ring[idx].flags = d[i].flags & ~XDP_DESC_KERNEL;
}
-   q->used_idx += dcnt;
+   q->used_idx_head += dcnt;
+   q->used_idx_tail += dcnt;
 
return 0;
 }
@@ -157,7 +158,7 @@ static inline int xskq_enqueue_from_array(struct 
xsk_packet_array *a,
  u32 dcnt)
 {
struct xsk_queue *q = (struct xsk_queue *)a->q_ops;
-   unsigned int used_idx = q->used_idx;
+   unsigned int used_idx = q->used_idx_tail;
struct xdp_desc *d = a->items;
int i;
 
@@ -180,12 +181,13 @@ static inline int xskq_enqueue_from_array(struct 
xsk_packet_array *a,
smp_wmb();
 
for (i = dcnt - 1; i >= 0; i--) {
-   unsigned int idx = (q->used_idx + i) & q->ring_mask;
+   unsigned int idx = (q->used_idx_tail + i) & q->ring_mask;
unsigned int didx = (a->start + i) & a->mask;
 
q->ring[idx].flags = d[didx].flags & ~XDP_DESC_KERNEL;
}
-   q->used_idx += dcnt;
+   q->used_idx_tail += dcnt;
+   q->used_idx_head += dcnt;
 
return 0;
 }
@@ -204,7 +206,7 @@ static inline int xskq_enqueue_completed_from_array(struct 
xsk_packet_array *a,
u32 dcnt)
 {
struct xsk_queue *q = (struct xsk_queue *)a->q_ops;
-   unsigned int used_idx = q->used_idx;
+   unsigned int used_idx = q->used_idx_tail;
struct xdp_desc *d = a->items;
int i, j;
 
@@ -233,13 +235,14 @@ static inline int 
xskq_enqueue_completed_from_array(struct xsk_packet_array *a,
smp_wmb();
 
for (j = i - 1; j >= 0; j--) {
-   unsigned int idx = (q->used_idx + j) & q->ring_mask;
+   unsigned int idx = (q->used_idx_tail + j) & q->ring_mask;
unsigned int didx = (a->start + j) & a->mask;
 
q->ring[idx].flags = d[didx].flags & ~XDP_DESC_KERNEL;
}
q->num_free -= i;
-   q->used_idx += i;
+   q->used_idx_tail += i;
+   q->used_idx_head += i;
 
return i;
 }
@@ -301,6 +304,115 @@ static inline void xskq_set_buff_info(struct xsk_queue *q,
q->validation = validation;
 }
 
+/* --- */
+
+struct xskq_iter {
+   unsigned int head;
+   unsigned int tail;
+};
+
+static inline bool xskq_iter_end(struct xskq_iter *it)
+{
+   return it->tail == it->head;
+}
+
+static inline void xskq_iter_validate(struct xsk_queue *q, struct xskq_iter 
*it)
+{
+   while (it->head != it->tail) {
+   unsigned int idx = it->head &

1 2 >

1 - 100 of 162 matches

Mail list logo