date:20161108

[PATCH 2/3] vhost: better detection of available buffers

2016-11-08 Thread Jason Wang

We should use vq->last_avail_idx instead of vq->avail_idx in the
checking of vhost_vq_avail_empty() since latter is the cached avail
index from guest but we want to know if there's pending available
buffers in the virtqueue.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c6f2d89..fdf4cdf 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
if (r)
return false;
 
-   return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
+   return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
-- 
2.7.4

[PATCH 3/3] vhost_net: tx support batching

2016-11-08 Thread Jason Wang

This patch tries to utilize tuntap rx batching by peeking the tx
virtqueue during transmission, if there's more available buffers in
the virtqueue, set MSG_MORE flag for a hint for tuntap to batch the
packets. The maximum number of batched tx packets is specified
through a module parameter: tx_batched.

When using 16 as tx_batched:

Pktgen test shows 16% on tx pps in guest.
Netperf test does not show obvious regression.

For safety, 1 is used as the default value for tx_batched.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 15 ++-
 drivers/vhost/vhost.c |  1 +
 drivers/vhost/vhost.h |  1 +
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 5dc128a..51c378e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -35,6 +35,10 @@ module_param(experimental_zcopytx, int, 0444);
 MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
   " 1 -Enable; 0 - Disable");
 
+static int tx_batched = 1;
+module_param(tx_batched, int, 0444);
+MODULE_PARM_DESC(tx_batched, "Number of patches batched in TX");
+
 /* Max number of bytes transferred before requeueing the job.
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x8
@@ -454,6 +458,16 @@ static void handle_tx(struct vhost_net *net)
msg.msg_control = NULL;
ubufs = NULL;
}
+   total_len += len;
+   if (vq->delayed < tx_batched &&
+   total_len < VHOST_NET_WEIGHT &&
+   !vhost_vq_avail_empty(>dev, vq)) {
+   vq->delayed++;
+   msg.msg_flags |= MSG_MORE;
+   } else {
+   vq->delayed = 0;
+   msg.msg_flags &= ~MSG_MORE;
+   }
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(sock, , len);
if (unlikely(err < 0)) {
@@ -472,7 +486,6 @@ static void handle_tx(struct vhost_net *net)
vhost_add_used_and_signal(>dev, vq, head, 0);
else
vhost_zerocopy_signal_used(net, vq);
-   total_len += len;
vhost_net_tx_packet(net);
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(>poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index fdf4cdf..bc362c7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -311,6 +311,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
+   vq->delayed = 0;
 }
 
 static int vhost_worker(void *data)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 78f3c5f..9f81a94 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -141,6 +141,7 @@ struct vhost_virtqueue {
bool user_be;
 #endif
u32 busyloop_timeout;
+   int delayed;
 };
 
 struct vhost_msg_node {
-- 
2.7.4

[PATCH 1/3] tuntap: rx batching

2016-11-08 Thread Jason Wang

Backlog was used for tuntap rx, but it can only process 1 packet at
a time since it is scheduled during sendmsg() synchronously in
process context. This leads to bad cache utilization, so this patch tries
to do some batching before calling rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
  batch the packet temporarily in a linked list and submit them all
  once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
  possible batching. (This could be done by extending backlog to
  support skb like, but using a tun specific one looks cleaner and
  easier for future extension).

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 71 ++-
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1588469..d40583b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -74,6 +74,7 @@
 #include 
 
 #include 
+#include 
 
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
@@ -169,6 +170,8 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
+   struct napi_struct napi;
+   struct sk_buff_head process_queue;
 };
 
 struct tun_flow_entry {
@@ -522,6 +525,8 @@ static void tun_queue_purge(struct tun_file *tfile)
while ((skb = skb_array_consume(>tx_array)) != NULL)
kfree_skb(skb);
 
+   skb_queue_purge(>sk.sk_write_queue);
+   skb_queue_purge(>process_queue);
skb_queue_purge(>sk.sk_error_queue);
 }
 
@@ -532,6 +537,11 @@ static void __tun_detach(struct tun_file *tfile, bool 
clean)
 
tun = rtnl_dereference(tfile->tun);
 
+   if (tun && clean) {
+   napi_disable(>napi);
+   netif_napi_del(>napi);
+   }
+
if (tun && !tfile->detached) {
u16 index = tfile->queue_index;
BUG_ON(index >= tun->numqueues);
@@ -587,6 +597,7 @@ static void tun_detach_all(struct net_device *dev)
 
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
+   napi_disable(>napi);
BUG_ON(!tfile);
tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
@@ -603,6 +614,7 @@ static void tun_detach_all(struct net_device *dev)
synchronize_net();
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
+   netif_napi_del(>napi);
/* Drop read queue */
tun_queue_purge(tfile);
sock_put(>sk);
@@ -618,6 +630,41 @@ static void tun_detach_all(struct net_device *dev)
module_put(THIS_MODULE);
 }
 
+static int tun_poll(struct napi_struct *napi, int budget)
+{
+   struct tun_file *tfile = container_of(napi, struct tun_file, napi);
+   struct sk_buff_head *input_queue =
+  >socket.sk->sk_write_queue;
+   struct sk_buff *skb;
+   unsigned int received = 0;
+
+   while (1) {
+   while ((skb = __skb_dequeue(>process_queue))) {
+   netif_receive_skb(skb);
+   if (++received >= budget)
+   return received;
+   }
+
+   spin_lock(_queue->lock);
+   if (skb_queue_empty(input_queue)) {
+   spin_unlock(_queue->lock);
+   break;
+   }
+   skb_queue_splice_tail_init(input_queue, >process_queue);
+   spin_unlock(_queue->lock);
+   }
+
+   if (received < budget) {
+   napi_complete(napi);
+   if (skb_peek(>socket.sk->sk_write_queue) &&
+   unlikely(napi_schedule_prep(napi))) {
+   __napi_schedule(napi);
+   }
+   }
+
+   return received;
+}
+
 static int tun_attach(struct tun_struct *tun, struct file *file, bool 
skip_filter)
 {
struct tun_file *tfile = file->private_data;
@@ -666,9 +713,11 @@ static int tun_attach(struct tun_struct *tun, struct file 
*file, bool skip_filte
 
if (tfile->detached)
tun_enable_queue(tfile);
-   else
+   else {
sock_hold(>sk);
-
+   netif_napi_add(tun->dev, >napi, tun_poll, 64);
+   napi_enable(>napi);
+   }
tun_set_real_num_queues(tun);
 
/* device is allowed to go away first, so no need to hold extra
@@ -1150,7 +1199,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file 
*tfile,
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
-   int noblock)
+   int noblock, bool more)
 {
struct tun_pi

[PATCH] net/mlx4_en: Fix bpf_prog_add ref_cnt in mlx4

2016-11-08 Thread Zhiyi Sun

There are rx_ring_num queues. Each queue will load xdp prog. So
bpf_prog_add() should add rx_ring_num to ref_cnt.

Signed-off-by: Zhiyi Sun 
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 12c99a2..d25e150 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2650,7 +2650,7 @@ static int mlx4_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
 */
if (priv->xdp_ring_num == xdp_ring_num) {
if (prog) {
-   prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
+   prog = bpf_prog_add(prog, priv->rx_ring_num);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
@@ -2680,7 +2680,7 @@ static int mlx4_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
}
 
if (prog) {
-   prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
+   prog = bpf_prog_add(prog, priv->rx_ring_num);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
-- 
2.9.3

Re: [PATCH for-next 09/11] IB/hns: Change qpn allocation to round-robin mode.

2016-11-08 Thread Leon Romanovsky

On Fri, Nov 04, 2016 at 04:36:31PM +, Salil Mehta wrote:
> From: "Wei Hu (Xavier)" 
>
> When using CM to establish connections, qp number that was freed
> just now will be rejected by ib core. To fix these problem, We
> change qpn allocation to round-robin mode. We added the round-robin
> mode for allocating resources using bitmap. We use round-robin mode
> for qp number and non round-robing mode for other resources like
> cq number, pd number etc.
>
> Signed-off-by: Wei Hu (Xavier) 
> Signed-off-by: Salil Mehta  

Reviewed-by: Leon Romanovsky 


signature.asc
Description: PGP signature

Re: [PATCH for-next 03/11] IB/hns: Optimize the logic of allocating memory using APIs

2016-11-08 Thread Leon Romanovsky

On Fri, Nov 04, 2016 at 04:36:25PM +, Salil Mehta wrote:
> From: "Wei Hu (Xavier)" 
>
> This patch modified the logic of allocating memory using APIs in
> hns RoCE driver. We used kcalloc instead of kmalloc_array and
> bitmap_zero. And When kcalloc failed, call vzalloc to alloc
> memory.
>
> Signed-off-by: Wei Hu (Xavier) 
> Signed-off-by: Ping Zhang 
> Signed-off-by: Salil Mehta  
> ---
>  drivers/infiniband/hw/hns/hns_roce_mr.c |   15 ---
>  1 file changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c 
> b/drivers/infiniband/hw/hns/hns_roce_mr.c
> index fb87883..d3dfb5f 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_mr.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
> @@ -137,11 +137,12 @@ static int hns_roce_buddy_init(struct hns_roce_buddy 
> *buddy, int max_order)
>
>   for (i = 0; i <= buddy->max_order; ++i) {
>   s = BITS_TO_LONGS(1 << (buddy->max_order - i));
> - buddy->bits[i] = kmalloc_array(s, sizeof(long), GFP_KERNEL);
> - if (!buddy->bits[i])
> - goto err_out_free;
> -
> - bitmap_zero(buddy->bits[i], 1 << (buddy->max_order - i));
> + buddy->bits[i] = kcalloc(s, sizeof(long), GFP_KERNEL);
> + if (!buddy->bits[i]) {
> + buddy->bits[i] = vzalloc(s * sizeof(long));

I wonder, why don't you use directly vzalloc instead of kcalloc fallback?

> + if (!buddy->bits[i])
> + goto err_out_free;
> + }
>   }


signature.asc
Description: PGP signature

Re: Virtio_net support vxlan encapsulation package TSO offload discuss

2016-11-08 Thread Jason Wang




On 2016年11月08日 19:58, Zhangming (James, Euler) wrote:

On 2016年11月08日 19:17, Jason Wang wrote:


On 2016年11月08日 19:13, Jason Wang wrote:

Cc Michael

On 2016年11月08日 16:34, Zhangming (James, Euler) wrote:

In container scenario, OVS is installed in the Virtual machine, and
all the containers connected to the OVS will communicated through
VXLAN encapsulation.

By now, virtio_net does not support TSO offload for VXLAN
encapsulated TSO package. In this condition, the performance is not
good, sender is bottleneck

I googled this scenario, but I didn’t find any information. Will
virtio_net support VXLAN encapsulation package TSO offload later?


Yes and for both sender and receiver.


My idea is virtio_net open encapsulated TSO offload, and transport
encapsulation info to TUN, TUN will parse the info and build skb with
encapsulation info.

OVS or kernel on the host should be modified to support this. Using
this method, the TCP performance is more than 2x as before.

Any advice and suggestions for this idea or new idea will be greatly
appreciated!

Best regards,

James zhang


Sounds very good. And we may also need features bits
(VIRTIO_NET_F_GUEST|HOST_GSO_X) for this.

This is in fact one of items in networking todo list. (See
http://www.linux-kvm.org/page/NetworkingTodo). While at it, we'd
better support not only VXLAN but also other tunnels.

Cc Vlad who is working on extending virtio-net headers.


We can start with the spec work, or if you've already had some bits
you can post them as RFC for early review.

Thanks

Below is my demo code
Virtio_net.c
static int virtnet_probe(struct virtio_device *vdev), add belows codes:
 if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
// avoid gso segment, it should be negotiation later, because 
in the demo I reuse num_buffers.
 virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
 dev->hw_enc_features |= NETIF_F_TSO;
 dev->hw_enc_features |= NETIF_F_ALL_CSUM;
 dev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL;
 dev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM;
 dev->hw_enc_features |= NETIF_F_GSO_TUNNEL_REMCSUM;

 dev->features |= NETIF_F_GSO_UDP_TUNNEL;
 dev->features |= NETIF_F_GSO_UDP_TUNNEL_CSUM;
 dev->features |= NETIF_F_GSO_TUNNEL_REMCSUM;
 }

static int xmit_skb(struct send_queue *sq, struct sk_buff *skb), add below to 
pieces of codes

 if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
 hdr->hdr.gso_type |= VIRTIO_NET_HDR_GSO_TUNNEL;
 if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)
 hdr->hdr.gso_type |= VIRTIO_NET_HDR_GSO_TUNNEL_CSUM;
 if (skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM)
 hdr->hdr.gso_type |= VIRTIO_NET_HDR_GSO_TUNNEL_REMCSUM;

 if (skb->encapsulation && skb_is_gso(skb)) {
 inner_mac_len = skb_inner_network_header(skb) - 
skb_inner_mac_header(skb);
 tnl_len = skb_inner_mac_header(skb) - skb_mac_header(skb);
 if ( !(inner_mac_len >> DATA_LEN_SHIFT) && !(tnl_len >> 
DATA_LEN_SHIFT) ) {
 hdr->hdr.flags |= VIRTIO_NET_HDR_F_ENCAPSULATION;
 hdr->num_buffers = (__virtio16)((inner_mac_len << 
DATA_LEN_SHIFT) | tnl_len); //we reuse num_buffers for simple , we should add 
extend member for later.
 }  else
 hdr->num_buffers = 0;
 }

Tun.c
 if (memcpy_fromiovecend((void *), iv, offset, 
tun->vnet_hdr_sz))//read header with negotiation length
 return -EFAULT;

 if (hdr.gso_type & VIRTIO_NET_HDR_GSO_TUNNEL)  
//set tunnel gso info
 skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
 if (hdr.gso_type & VIRTIO_NET_HDR_GSO_TUNNEL_CSUM)
 skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
 if (hdr.gso_type & VIRTIO_NET_HDR_GSO_TUNNEL_REMCSUM)
 skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;

 if (hdr.flags & VIRTIO_NET_HDR_F_ENCAPSULATION) {  
//read tunnel info from header and set to built skb.
 tnl_len = tun16_to_cpu(tun, hdr.num_buffers) & 
TUN_TNL_LEN_MASK;
 payload_mac_len = tun16_to_cpu(tun, hdr.num_buffers) >> 
TUN_DATA_LEN_SHIFT;
 mac_len = skb_network_header(skb) - skb_mac_header(skb);
 skb_set_inner_mac_header(skb, tnl_len - mac_len);
 skb_set_inner_network_header(skb, tnl_len + payload_mac_len - 
mac_len);
 skb->encapsulation = 1;
 }




Something like this, and you probably need do something more:

- use

Re: [PATCH] net: ipv4: ip_send_unicast_reply should set oif only if it is L3 master

2016-11-08 Thread Lorenzo Colitti

On Wed, Nov 9, 2016 at 7:50 AM, David Ahern  wrote:
> @@ -1577,7 +1577,8 @@ void ip_send_unicast_reply(struct sock *sk, struct 
> sk_buff *skb,

Tested-by: Lorenzo Colitti 

This fixes the IPv4 test, thanks. I notice that 4.8 didn't have
e0d56fdd73, so if this patch can get into 4.9 then there will be no
release that had the behaviour change. Not sure if that's possible any
more though.

Can you also fix tcp_v6_send_response, which suffers from the same
problem? Perhaps revert this hunk of e0d56fdd73 ("net: l3mdev: remove
redundant calls"):

@@ -818,12 +818,8 @@ static void tcp_v6_send_response(const struct
sock *sk, struct sk_buff *skb, u32
fl6.flowi6_proto = IPPROTO_TCP;
if (rt6_need_strict() && !oif)
fl6.flowi6_oif = tcp_v6_iif(skb);
-   else {
-   if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
-   oif = skb->skb_iif;
-
-   fl6.flowi6_oif = oif;
-   }
+   else
+   fl6.flowi6_oif = oif ? : skb->skb_iif;

fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
fl6.fl6_dport = t1->dest;

[v16, 7/7] mmc: sdhci-of-esdhc: fix host version for T4240-R1.0-R2.0

2016-11-08 Thread Yangbo Lu

The eSDHC of T4240-R1.0-R2.0 has incorrect vendor version and spec version.
Actually the right version numbers should be VVN=0x13 and SVN = 0x1.
This patch adds the GUTS driver support for eSDHC driver to match SoC.
And fix host version to avoid that incorrect version numbers break down
the ADMA data transfer.

Signed-off-by: Yangbo Lu 
Acked-by: Ulf Hansson 
Acked-by: Scott Wood 
Acked-by: Arnd Bergmann 
---
Changes for v2:
- Got SVR through iomap instead of dts
Changes for v3:
- Managed GUTS through syscon instead of iomap in eSDHC driver
Changes for v4:
- Got SVR by GUTS driver instead of SYSCON
Changes for v5:
- Changed to get SVR through API fsl_guts_get_svr()
- Combined patch 4, patch 5 and patch 6 into one
Changes for v6:
- Added 'Acked-by: Ulf Hansson'
Changes for v7:
- None
Changes for v8:
- Added 'Acked-by: Scott Wood'
Changes for v9:
- None
Changes for v10:
- None
Changes for v11:
- Changed to use soc_device_match
Changes for v12:
- Matched soc through .family field instead of .soc_id
Changes for v13:
- None
Changes for v14:
- None
Changes for v15:
- None
Changes for v16:
- Added 'Acked-by: Arnd'
---
 drivers/mmc/host/Kconfig  |  1 +
 drivers/mmc/host/sdhci-of-esdhc.c | 20 
 2 files changed, 21 insertions(+)

diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 5cf7eba..4128a3c 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -144,6 +144,7 @@ config MMC_SDHCI_OF_ESDHC
depends on MMC_SDHCI_PLTFM
depends on PPC || ARCH_MXC || ARCH_LAYERSCAPE
select MMC_SDHCI_IO_ACCESSORS
+   select FSL_GUTS
help
  This selects the Freescale eSDHC controller support.
 
diff --git a/drivers/mmc/host/sdhci-of-esdhc.c 
b/drivers/mmc/host/sdhci-of-esdhc.c
index fb71c86..57bdb9e 100644
--- a/drivers/mmc/host/sdhci-of-esdhc.c
+++ b/drivers/mmc/host/sdhci-of-esdhc.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "sdhci-pltfm.h"
 #include "sdhci-esdhc.h"
@@ -28,6 +29,7 @@
 struct sdhci_esdhc {
u8 vendor_ver;
u8 spec_ver;
+   bool quirk_incorrect_hostver;
 };
 
 /**
@@ -73,6 +75,8 @@ static u32 esdhc_readl_fixup(struct sdhci_host *host,
 static u16 esdhc_readw_fixup(struct sdhci_host *host,
 int spec_reg, u32 value)
 {
+   struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+   struct sdhci_esdhc *esdhc = sdhci_pltfm_priv(pltfm_host);
u16 ret;
int shift = (spec_reg & 0x2) * 8;
 
@@ -80,6 +84,12 @@ static u16 esdhc_readw_fixup(struct sdhci_host *host,
ret = value & 0x;
else
ret = (value >> shift) & 0x;
+   /* Workaround for T4240-R1.0-R2.0 eSDHC which has incorrect
+* vendor version and spec version information.
+*/
+   if ((spec_reg == SDHCI_HOST_VERSION) &&
+   (esdhc->quirk_incorrect_hostver))
+   ret = (VENDOR_V_23 << SDHCI_VENDOR_VER_SHIFT) | SDHCI_SPEC_200;
return ret;
 }
 
@@ -558,6 +568,12 @@ static const struct sdhci_pltfm_data sdhci_esdhc_le_pdata 
= {
.ops = _esdhc_le_ops,
 };
 
+static struct soc_device_attribute soc_incorrect_hostver[] = {
+   { .family = "QorIQ T4240", .revision = "1.0", },
+   { .family = "QorIQ T4240", .revision = "2.0", },
+   { },
+};
+
 static void esdhc_init(struct platform_device *pdev, struct sdhci_host *host)
 {
struct sdhci_pltfm_host *pltfm_host;
@@ -571,6 +587,10 @@ static void esdhc_init(struct platform_device *pdev, 
struct sdhci_host *host)
esdhc->vendor_ver = (host_ver & SDHCI_VENDOR_VER_MASK) >>
 SDHCI_VENDOR_VER_SHIFT;
esdhc->spec_ver = host_ver & SDHCI_SPEC_VER_MASK;
+   if (soc_device_match(soc_incorrect_hostver))
+   esdhc->quirk_incorrect_hostver = true;
+   else
+   esdhc->quirk_incorrect_hostver = false;
 }
 
 static int sdhci_esdhc_probe(struct platform_device *pdev)
-- 
2.1.0.27.g96db324

[v16, 6/7] base: soc: Check for NULL SoC device attributes

2016-11-08 Thread Yangbo Lu

From: Geert Uytterhoeven 

If soc_device_match() is used to check the value of a specific
attribute that is not present for the current SoC, the kernel crashes
with a NULL pointer dereference.

Fix this by explicitly checking for the absence of a needed property,
and considering this a non-match.

Signed-off-by: Geert Uytterhoeven 
Acked-by: Arnd Bergmann 
---
Changes for v16:
- Added this patch
---
 drivers/base/soc.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/base/soc.c b/drivers/base/soc.c
index 0c5cf87..0e701e2 100644
--- a/drivers/base/soc.c
+++ b/drivers/base/soc.c
@@ -167,19 +167,23 @@ static int soc_device_match_one(struct device *dev, void 
*arg)
const struct soc_device_attribute *match = arg;
 
if (match->machine &&
-   !glob_match(match->machine, soc_dev->attr->machine))
+   (!soc_dev->attr->machine ||
+!glob_match(match->machine, soc_dev->attr->machine)))
return 0;
 
if (match->family &&
-   !glob_match(match->family, soc_dev->attr->family))
+   (!soc_dev->attr->family ||
+!glob_match(match->family, soc_dev->attr->family)))
return 0;
 
if (match->revision &&
-   !glob_match(match->revision, soc_dev->attr->revision))
+   (!soc_dev->attr->revision ||
+!glob_match(match->revision, soc_dev->attr->revision)))
return 0;
 
if (match->soc_id &&
-   !glob_match(match->soc_id, soc_dev->attr->soc_id))
+   (!soc_dev->attr->soc_id ||
+!glob_match(match->soc_id, soc_dev->attr->soc_id)))
return 0;
 
return 1;
-- 
2.1.0.27.g96db324

[v16, 2/7] dt: bindings: move guts devicetree doc out of powerpc directory

2016-11-08 Thread Yangbo Lu

Move guts devicetree doc to Documentation/devicetree/bindings/soc/fsl/
since it's used by not only PowerPC but also ARM. And add a specification
for 'little-endian' property.

Signed-off-by: Yangbo Lu 
Acked-by: Rob Herring 
Acked-by: Scott Wood 
Acked-by: Arnd Bergmann 
---
Changes for v4:
- Added this patch
Changes for v5:
- Modified the description for little-endian property
Changes for v6:
- None
Changes for v7:
- None
Changes for v8:
- Added 'Acked-by: Scott Wood'
- Added 'Acked-by: Rob Herring'
Changes for v9:
- None
Changes for v10:
- None
Changes for v11:
- None
Changes for v12:
- None
Changes for v13:
- None
Changes for v14:
- None
Changes for v15:
- None
Changes for v16:
- Added 'Acked-by: Arnd'
---
 Documentation/devicetree/bindings/{powerpc => soc}/fsl/guts.txt | 3 +++
 1 file changed, 3 insertions(+)
 rename Documentation/devicetree/bindings/{powerpc => soc}/fsl/guts.txt (91%)

diff --git a/Documentation/devicetree/bindings/powerpc/fsl/guts.txt 
b/Documentation/devicetree/bindings/soc/fsl/guts.txt
similarity index 91%
rename from Documentation/devicetree/bindings/powerpc/fsl/guts.txt
rename to Documentation/devicetree/bindings/soc/fsl/guts.txt
index b71b203..07adca9 100644
--- a/Documentation/devicetree/bindings/powerpc/fsl/guts.txt
+++ b/Documentation/devicetree/bindings/soc/fsl/guts.txt
@@ -25,6 +25,9 @@ Recommended properties:
  - fsl,liodn-bits : Indicates the number of defined bits in the LIODN
registers, for those SOCs that have a PAMU device.
 
+ - little-endian : Indicates that the global utilities block is little
+   endian. The default is big endian.
+
 Examples:
global-utilities@e {/* global utilities block */
compatible = "fsl,mpc8548-guts";
-- 
2.1.0.27.g96db324

[PATCH] mwifiex: fix memory leak in mwifiex_save_hidden_ssid_channels()

2016-11-08 Thread Ricky Liang

kmemleak reports memory leak in mwifiex_save_hidden_ssid_channels():

unreferenced object 0xffc0a2914780 (size 192):
  comm "ksdioirqd/mmc2", pid 2004, jiffies 4307182506 (age 820.684s)
  hex dump (first 32 bytes):
00 06 47 49 4e 2d 32 67 01 03 c8 60 6c 03 01 40  ..GIN-2g...`l..@
07 10 54 57 20 34 04 1e 64 05 24 84 03 24 95 04  ..TW 4..d.$..$..
  backtrace:
[] create_object+0x164/0x2b4
[] kmemleak_alloc+0x50/0x88
[] __kmalloc_track_caller+0x1bc/0x264
[] kmemdup+0x38/0x64
[] mwifiex_fill_new_bss_desc+0x3c/0x130 [mwifiex]
[] mwifiex_save_curr_bcn+0x4ec/0x640 [mwifiex]
[] mwifiex_handle_event_ext_scan_report+0x1d4/0x268 
[mwifiex]
[] mwifiex_process_sta_event+0x378/0x898 [mwifiex]
[] mwifiex_process_event+0x1a8/0x1e8 [mwifiex]
[] mwifiex_main_process+0x258/0x534 [mwifiex]
[] 0xffbffc258858
[] process_sdio_pending_irqs+0xf8/0x160
[] sdio_irq_thread+0x9c/0x1a4
[] kthread+0xf4/0x100
[] ret_from_fork+0xc/0x50
[] 0x

Signed-off-by: Ricky Liang 
---
 drivers/net/wireless/marvell/mwifiex/scan.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/scan.c 
b/drivers/net/wireless/marvell/mwifiex/scan.c
index 97c9765..98ce072 100644
--- a/drivers/net/wireless/marvell/mwifiex/scan.c
+++ b/drivers/net/wireless/marvell/mwifiex/scan.c
@@ -1671,6 +1671,10 @@ static int mwifiex_save_hidden_ssid_channels(struct 
mwifiex_private *priv,
}
 
 done:
+   /* beacon_ie buffer was allocated in function
+* mwifiex_fill_new_bss_desc(). Free it now.
+*/
+   kfree(bss_desc->beacon_buf);
kfree(bss_desc);
return 0;
 }
-- 
2.6.6

Re: net/sctp: null-ptr-deref in sctp_inet_listen

2016-11-08 Thread Xin Long

On Wed, Nov 9, 2016 at 2:46 AM, Andrey Konovalov  wrote:
> Hi Xin,
>
> Your patch seems to be fixing the issue.
>
> Tested-by: Andrey Konovalov 
>
> Thanks!
>
> On Tue, Nov 8, 2016 at 11:06 AM, Xin Long  wrote:
>> On Tue, Nov 8, 2016 at 5:44 AM, Andrey Konovalov  
>> wrote:
>>> Hi,
>>>
>>> I've got the following error report while running the syzkaller fuzzer:
>>>
>>> kasan: CONFIG_KASAN_INLINE enabled
>>> kasan: GPF could be caused by NULL-ptr deref or user memory access
>>> general protection fault:  [#1] SMP KASAN
>>> Modules linked in:
>>> CPU: 1 PID: 3851 Comm: a.out Not tainted 4.9.0-rc4+ #354
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>> task: 880065f1d800 task.stack: 88006384
>>> RIP: 0010:[]  []
>>> sctp_inet_listen+0x29b/0x790 net/sctp/socket.c:6870
>>> RSP: 0018:880063847dd0  EFLAGS: 00010202
>>> RAX: dc00 RBX: 11000c708fbd RCX: 
>>> RDX:  RSI:  RDI: 0002
>>> RBP: 880063847e70 R08: dc00 R09: dc00
>>> R10: 0002 R11: 0002 R12: 88006b350800
>>> R13:  R14: 11000d66a1a5 R15: 
>>> FS:  7fd1f0f3d7c0() GS:88006cd0() knlGS:
>>> CS:  0010 DS:  ES:  CR0: 80050033
>>> CR2: 2000 CR3: 64af9000 CR4: 06e0
>>> Stack:
>>>  880063847de0 880066165900 88006b350d20 41b58ab3
>>>  847ff589 83941280 dc00 
>>>  880069b9f740  880063847e38 819f04ef
>>> Call Trace:
>>>  [< inline >] SYSC_listen net/socket.c:1396
>>>  [] SyS_listen+0x206/0x250 net/socket.c:1382
>>>  [] entry_SYSCALL_64_fastpath+0x1f/0xc2
>>> arch/x86/entry/entry_64.S:209
>>> Code: 00 0f 85 f4 04 00 00 4d 8b ac 24 28 05 00 00 49 b8 00 00 00 00
>>> 00 fc ff df 49 8d 7d 02 48 89 fe 49 89 fa 48 c1 ee 03 41 83 e2 07 <46>
>>> 0f b6 0c 06 41 83 c2 01 45 38 ca 7c 09 45 84 c9 0f 85 87 04
>>> RIP  [] sctp_inet_listen+0x29b/0x790 
>>> net/sctp/socket.c:6870
>>>  RSP 
>>> ---[ end trace f2b501fc22999b37 ]---
>>>
>>> A reproducer is attached.
>>>
>>> On commit bc33b0ca11e3df46a4fa7639ba488c9d4911 (Nov 5).
>>>
>> This is a shutdown injection issue.
>> sctp_shutdown need a sk->state check, just like tcp_shutdown:
>>
>> --- a/net/sctp/socket.c
>> +++ b/net/sctp/socket.c
>> @@ -4287,7 +4287,8 @@ static void sctp_shutdown(struct sock *sk, int how)
>> if (!sctp_style(sk, TCP))
>> return;
>>
>> -   if (how & SEND_SHUTDOWN) {
>> +   if (how & SEND_SHUTDOWN &&
>> +   (1 << sk->sk_state) & (SCTP_SS_ESTABLISHED | SCTP_SS_CLOSING)) {
>> sk->sk_state = SCTP_SS_CLOSING;
>> ep = sctp_sk(sk)->ep;
>> if (!list_empty(>asocs)) {
this fix may break TYPE_SCTP_PRIMITIVE_SHUTDOWN statetable,
could you give the following one a try ? thanks.

--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4288,9 +4288,9 @@ static void sctp_shutdown(struct sock *sk, int how)
return;

if (how & SEND_SHUTDOWN) {
-   sk->sk_state = SCTP_SS_CLOSING;
ep = sctp_sk(sk)->ep;
if (!list_empty(>asocs)) {
+   sk->sk_state = SCTP_SS_CLOSING;
asoc = list_entry(ep->asocs.next,
  struct sctp_association, asocs);
sctp_primitive_SHUTDOWN(net, asoc, NULL);

[v16, 4/7] MAINTAINERS: add entry for Freescale SoC drivers

2016-11-08 Thread Yangbo Lu

Add maintainer entry for Freescale SoC drivers including
the QE library and the GUTS driver now. Also add maintainer
for QE library.

Signed-off-by: Yangbo Lu 
Acked-by: Scott Wood 
Acked-by: Qiang Zhao 
Acked-by: Arnd Bergmann 
---
Changes for v8:
- Added this patch
Changes for v9:
- Added linux-arm mail list
- Removed GUTS driver entry
Changes for v10:
- Changed 'DRIVER' to 'DRIVERS'
- Added 'Acked-by' of Scott and Qiang
Changes for v11:
- None
Changes for v12:
- None
Changes for v13:
- None
Changes for v14:
- None
Changes for v15:
- None
Changes for v16:
- Added 'Acked-by: Arnd'
---
 MAINTAINERS | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9be761f..e1a8835 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5045,9 +5045,18 @@ S:   Maintained
 F: drivers/net/ethernet/freescale/fman
 F: Documentation/devicetree/bindings/powerpc/fsl/fman.txt
 
+FREESCALE SOC DRIVERS
+M: Scott Wood 
+L: linuxppc-...@lists.ozlabs.org
+L: linux-arm-ker...@lists.infradead.org
+S: Maintained
+F: drivers/soc/fsl/
+F: include/linux/fsl/
+
 FREESCALE QUICC ENGINE LIBRARY
+M: Qiang Zhao 
 L: linuxppc-...@lists.ozlabs.org
-S: Orphan
+S: Maintained
 F: drivers/soc/fsl/qe/
 F: include/soc/fsl/*qe*.h
 F: include/soc/fsl/*ucc*.h
-- 
2.1.0.27.g96db324

[v16, 5/7] base: soc: introduce soc_device_match() interface

2016-11-08 Thread Yangbo Lu

From: Arnd Bergmann 

We keep running into cases where device drivers want to know the exact
version of the SoC they are currently running on. In the past, this has
usually been done through a vendor specific API that can be called by a
driver, or by directly accessing some kind of version register that is
not part of the device itself but that belongs to a global register area
of the chip.

Common reasons for doing this include:

- A machine is not using devicetree or similar for passing data about
  on-chip devices, but just announces their presence using boot-time
  platform devices, and the machine code itself does not care about the
  revision.

- There is existing firmware or boot loaders with existing DT binaries
  with generic compatible strings that do not identify the particular
  revision of each device, but the driver knows which SoC revisions
  include which part.

- A prerelease version of a chip has some quirks and we are using the same
  version of the bootloader and the DT blob on both the prerelease and the
  final version. An update of the DT binding seems inappropriate because
  that would involve maintaining multiple copies of the dts and/or
  bootloader.

This patch introduces the soc_device_match() interface that is meant to
work like of_match_node() but instead of identifying the version of a
device, it identifies the SoC itself using a vendor-agnostic interface.

Unlike of_match_node(), we do not do an exact string compare but instead
use glob_match() to allow wildcards in strings.

Signed-off-by: Arnd Bergmann 
Signed-off-by: Yangbo Lu 
Acked-by: Greg Kroah-Hartman 
---
Changes for v11:
- Added this patch for soc match
Changes for v12:
- Corrected the author
- Rewrote soc_device_match with while loop
Changes for v13:
- Added ack from Greg
Changes for v14:
- None
Changes for v15:
- None
Changes for v16:
- None
---
 drivers/base/Kconfig|  1 +
 drivers/base/soc.c  | 66 +
 include/linux/sys_soc.h |  3 +++
 3 files changed, 70 insertions(+)

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index d02e7c0..2abea87 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -237,6 +237,7 @@ config GENERIC_CPU_AUTOPROBE
 
 config SOC_BUS
bool
+   select GLOB
 
 source "drivers/base/regmap/Kconfig"
 
diff --git a/drivers/base/soc.c b/drivers/base/soc.c
index b63f23e..0c5cf87 100644
--- a/drivers/base/soc.c
+++ b/drivers/base/soc.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static DEFINE_IDA(soc_ida);
 
@@ -159,3 +160,68 @@ static int __init soc_bus_register(void)
return bus_register(_bus_type);
 }
 core_initcall(soc_bus_register);
+
+static int soc_device_match_one(struct device *dev, void *arg)
+{
+   struct soc_device *soc_dev = container_of(dev, struct soc_device, dev);
+   const struct soc_device_attribute *match = arg;
+
+   if (match->machine &&
+   !glob_match(match->machine, soc_dev->attr->machine))
+   return 0;
+
+   if (match->family &&
+   !glob_match(match->family, soc_dev->attr->family))
+   return 0;
+
+   if (match->revision &&
+   !glob_match(match->revision, soc_dev->attr->revision))
+   return 0;
+
+   if (match->soc_id &&
+   !glob_match(match->soc_id, soc_dev->attr->soc_id))
+   return 0;
+
+   return 1;
+}
+
+/*
+ * soc_device_match - identify the SoC in the machine
+ * @matches: zero-terminated array of possible matches
+ *
+ * returns the first matching entry of the argument array, or NULL
+ * if none of them match.
+ *
+ * This function is meant as a helper in place of of_match_node()
+ * in cases where either no device tree is available or the information
+ * in a device node is insufficient to identify a particular variant
+ * by its compatible strings or other properties. For new devices,
+ * the DT binding should always provide unique compatible strings
+ * that allow the use of of_match_node() instead.
+ *
+ * The calling function can use the .data entry of the
+ * soc_device_attribute to pass a structure or function pointer for
+ * each entry.
+ */
+const struct soc_device_attribute *soc_device_match(
+   const struct soc_device_attribute *matches)
+{
+   int ret = 0;
+
+   if (!matches)
+   return NULL;
+
+   while (!ret) {
+   if (!(matches->machine || matches->family ||
+ matches->revision || matches->soc_id))
+   break;
+   ret = bus_for_each_dev(_bus_type, NULL, (void *)matches,
+  soc_device_match_one);
+   if (!ret)
+   matches++;
+   else
+   return matches;
+   }
+   return NULL;
+}

[v16, 1/7] ARM64: dts: ls2080a: add device configuration node

2016-11-08 Thread Yangbo Lu

Add the dts node for device configuration unit that provides
general purpose configuration and status for the device.

Signed-off-by: Yangbo Lu 
Acked-by: Scott Wood 
Acked-by: Arnd Bergmann 
---
Changes for v5:
- Added this patch
Changes for v6:
- None
Changes for v7:
- None
Changes for v8:
- Added 'Acked-by: Scott Wood'
Changes for v9:
- None
Changes for v10:
- None
Changes for v11:
- None
Changes for v12:
- None
Changes for v13:
- None
Changes for v14:
- None
Changes for v15:
- None
Changes for v16:
- Added 'Acked-by: Arnd'
---
 arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi 
b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi
index 7f0dc13..d058e56 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi
@@ -216,6 +216,12 @@
clocks = <>;
};
 
+   dcfg: dcfg@1e0 {
+   compatible = "fsl,ls2080a-dcfg", "syscon";
+   reg = <0x0 0x1e0 0x0 0x1>;
+   little-endian;
+   };
+
serial0: serial@21c0500 {
compatible = "fsl,ns16550", "ns16550a";
reg = <0x0 0x21c0500 0x0 0x100>;
-- 
2.1.0.27.g96db324

[v16, 3/7] soc: fsl: add GUTS driver for QorIQ platforms

2016-11-08 Thread Yangbo Lu

The global utilities block controls power management, I/O device
enabling, power-on reset (POR) configuration monitoring, alternate
function selection for multiplexed signals, and clock control.

This patch adds a driver to manage and access global utilities block.
Initially only reading SVR and registering soc device are supported.
Other guts accesses, such as reading RCW, should eventually be moved
into this driver as well.

Signed-off-by: Yangbo Lu 
Acked-by: Arnd Bergmann 
---
Changes for v4:
- Added this patch
Changes for v5:
- Modified copyright info
- Changed MODULE_LICENSE to GPL
- Changed EXPORT_SYMBOL_GPL to EXPORT_SYMBOL
- Made FSL_GUTS user-invisible
- Added a complete compatible list for GUTS
- Stored guts info in file-scope variable
- Added mfspr() getting SVR
- Redefined GUTS APIs
- Called fsl_guts_init rather than using platform driver
- Removed useless parentheses
- Removed useless 'extern' key words
Changes for v6:
- Made guts thread safe in fsl_guts_init
Changes for v7:
- Removed 'ifdef' for function declaration in guts.h
Changes for v8:
- Fixes lines longer than 80 characters checkpatch issue
- Added 'Acked-by: Scott Wood'
Changes for v9:
- None
Changes for v10:
- None
Changes for v11:
- Changed to platform driver
Changes for v12:
- Removed "signed-off-by: Scott"
- Defined fsl_soc_die_attr struct array instead of
  soc_device_attribute
- Re-designed soc_device_attribute for QorIQ SoC
- Other minor fixes
Changes for v13:
- Rebased
- Removed text after 'bool' in Kconfig
- Removed ARCH ifdefs
- Added more bits for ls1021a mask
- Used devm
Changes for v14:
- Used devm_ioremap_resource
Changes for v15:
- Fixed error code for devm_ioremap_resource
Changes for v16:
- Removed header file svr.h and calculated REV_MAJ/MIN in this driver
- Added 'Acked-by: Arnd'
---
 drivers/soc/Kconfig  |   3 +-
 drivers/soc/fsl/Kconfig  |  18 
 drivers/soc/fsl/Makefile |   1 +
 drivers/soc/fsl/guts.c   | 236 +++
 include/linux/fsl/guts.h | 125 +++--
 5 files changed, 333 insertions(+), 50 deletions(-)
 create mode 100644 drivers/soc/fsl/Kconfig
 create mode 100644 drivers/soc/fsl/guts.c

diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig
index e6e90e8..f31bceb 100644
--- a/drivers/soc/Kconfig
+++ b/drivers/soc/Kconfig
@@ -1,8 +1,7 @@
 menu "SOC (System On Chip) specific Drivers"
 
 source "drivers/soc/bcm/Kconfig"
-source "drivers/soc/fsl/qbman/Kconfig"
-source "drivers/soc/fsl/qe/Kconfig"
+source "drivers/soc/fsl/Kconfig"
 source "drivers/soc/mediatek/Kconfig"
 source "drivers/soc/qcom/Kconfig"
 source "drivers/soc/rockchip/Kconfig"
diff --git a/drivers/soc/fsl/Kconfig b/drivers/soc/fsl/Kconfig
new file mode 100644
index 000..7a9fb9b
--- /dev/null
+++ b/drivers/soc/fsl/Kconfig
@@ -0,0 +1,18 @@
+#
+# Freescale SOC drivers
+#
+
+source "drivers/soc/fsl/qbman/Kconfig"
+source "drivers/soc/fsl/qe/Kconfig"
+
+config FSL_GUTS
+   bool
+   select SOC_BUS
+   help
+ The global utilities block controls power management, I/O device
+ enabling, power-onreset(POR) configuration monitoring, alternate
+ function selection for multiplexed signals,and clock control.
+ This driver is to manage and access global utilities block.
+ Initially only reading SVR and registering soc device are supported.
+ Other guts accesses, such as reading RCW, should eventually be moved
+ into this driver as well.
diff --git a/drivers/soc/fsl/Makefile b/drivers/soc/fsl/Makefile
index 75e1f53..44b3beb 100644
--- a/drivers/soc/fsl/Makefile
+++ b/drivers/soc/fsl/Makefile
@@ -5,3 +5,4 @@
 obj-$(CONFIG_FSL_DPAA) += qbman/
 obj-$(CONFIG_QUICC_ENGINE) += qe/
 obj-$(CONFIG_CPM)  += qe/
+obj-$(CONFIG_FSL_GUTS) += guts.o
diff --git a/drivers/soc/fsl/guts.c b/drivers/soc/fsl/guts.c
new file mode 100644
index 000..0ac8826
--- /dev/null
+++ b/drivers/soc/fsl/guts.c
@@ -0,0 +1,236 @@
+/*
+ * Freescale QorIQ Platforms GUTS Driver
+ *
+ * Copyright (C) 2016 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct guts {
+   struct ccsr_guts __iomem *regs;
+   bool little_endian;
+};
+
+struct fsl_soc_die_attr {
+   char*die;
+   u32 svr;
+   u32 mask;
+};
+
+static struct guts *guts;
+static struct

[v16, 0/7] Fix eSDHC host version register bug

2016-11-08 Thread Yangbo Lu

This patchset is used to fix a host version register bug in the T4240-R1.0-R2.0
eSDHC controller. To match the SoC version and revision, the 15 previous
versions of this patchset tried many methods, but all of them were rejected
by reviewers. The rejected methods include:
- dts compatible method
- syscon method
- ifdef PPC method
- GUTS driver getting SVR method
Arnd suggested a soc_device_match method in v10, and this is the only available
method left now. This v11 patchset introduces the soc_device_match interface in
soc driver.

The first four patches of Yangbo are to add the GUTS driver. This is used to
register a soc device which contains soc version and revision information.
The other three patches introduce the soc_device_match method in soc driver
and apply it on esdhc driver to fix this bug.

---
Changes for v15:
- Dropped patch 'dt: bindings: update Freescale DCFG compatible'
  since the work had been done by below patch on ShawnGuo's linux tree.
  'dt-bindings: fsl: add LS1043A/LS1046A/LS2080A compatible for SCFG
   and DCFG'
- Fixed error code issue in guts driver
Changes for v16:
- Dropped patch 'powerpc/fsl: move mpc85xx.h to include/linux/fsl'
- Added a bug-fix patch from Geert
---

Arnd Bergmann (1):
  base: soc: introduce soc_device_match() interface

Geert Uytterhoeven (1):
  base: soc: Check for NULL SoC device attributes

Yangbo Lu (5):
  ARM64: dts: ls2080a: add device configuration node
  dt: bindings: move guts devicetree doc out of powerpc directory
  soc: fsl: add GUTS driver for QorIQ platforms
  MAINTAINERS: add entry for Freescale SoC drivers
  mmc: sdhci-of-esdhc: fix host version for T4240-R1.0-R2.0

 .../bindings/{powerpc => soc}/fsl/guts.txt |   3 +
 MAINTAINERS|  11 +-
 arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi |   6 +
 drivers/base/Kconfig   |   1 +
 drivers/base/soc.c |  70 ++
 drivers/mmc/host/Kconfig   |   1 +
 drivers/mmc/host/sdhci-of-esdhc.c  |  20 ++
 drivers/soc/Kconfig|   3 +-
 drivers/soc/fsl/Kconfig|  18 ++
 drivers/soc/fsl/Makefile   |   1 +
 drivers/soc/fsl/guts.c | 236 +
 include/linux/fsl/guts.h   | 125 ++-
 include/linux/sys_soc.h|   3 +
 13 files changed, 447 insertions(+), 51 deletions(-)
 rename Documentation/devicetree/bindings/{powerpc => soc}/fsl/guts.txt (91%)
 create mode 100644 drivers/soc/fsl/Kconfig
 create mode 100644 drivers/soc/fsl/guts.c

-- 
2.1.0.27.g96db324

[PATCHv2 net] igmp: do not remove igmp souce list info when set link down

2016-11-08 Thread Hangbin Liu

In commit 24cf3af(igmp: call ip_mc_clear_src...), we forgot to remove
igmpv3_clear_delrec() in ip_mc_down(), which also called ip_mc_clear_src().
This makes us clear all IGMPv3 source filter info after NETDEV_DOWN.
Move igmpv3_clear_delrec() to ip_mc_destroy_dev(); ip_mc_clear_src() is then
no longer needed in ip_mc_destroy_dev().

On the other hand, we should restore, rather than free, all source filter
info in igmpv3_del_delrec(). Otherwise we will not be able to restore IGMPv3
source filter info after NETDEV_UP and NETDEV_POST_TYPE_CHANGE.

Signed-off-by: Hangbin Liu 
---
 net/ipv4/igmp.c | 50 --
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 606cc3e..15db786 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -162,7 +162,7 @@ static int unsolicited_report_interval(struct in_device 
*in_dev)
 }
 
 static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
 static void igmpv3_clear_delrec(struct in_device *in_dev);
 static int sf_setstate(struct ip_mc_list *pmc);
 static void sf_markstate(struct ip_mc_list *pmc);
@@ -1130,10 +1130,15 @@ static void igmpv3_add_delrec(struct in_device *in_dev, 
struct ip_mc_list *im)
spin_unlock_bh(_dev->mc_tomb_lock);
 }
 
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
+/*
+ * restore ip_mc_list deleted records
+ */
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 {
struct ip_mc_list *pmc, *pmc_prev;
-   struct ip_sf_list *psf, *psf_next;
+   struct ip_sf_list *psf;
+   struct net *net = dev_net(in_dev->dev);
+   __be32 multiaddr = im->multiaddr;
 
spin_lock_bh(_dev->mc_tomb_lock);
pmc_prev = NULL;
@@ -1149,16 +1154,26 @@ static void igmpv3_del_delrec(struct in_device *in_dev, 
__be32 multiaddr)
in_dev->mc_tomb = pmc->next;
}
spin_unlock_bh(_dev->mc_tomb_lock);
+
+   spin_lock_bh(>lock);
if (pmc) {
-   for (psf = pmc->tomb; psf; psf = psf_next) {
-   psf_next = psf->sf_next;
-   kfree(psf);
+   im->interface = pmc->interface;
+   im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+   im->sfmode = pmc->sfmode;
+   if (pmc->sfmode == MCAST_INCLUDE) {
+   im->tomb = pmc->tomb;
+   im->sources = pmc->sources;
+   for (psf = im->sources; psf; psf = psf->sf_next)
+   psf->sf_crcount = im->crcount;
}
in_dev_put(pmc->interface);
-   kfree(pmc);
}
+   spin_unlock_bh(>lock);
 }
 
+/*
+ * flush ip_mc_list deleted records
+ */
 static void igmpv3_clear_delrec(struct in_device *in_dev)
 {
struct ip_mc_list *pmc, *nextpmc;
@@ -1366,7 +1381,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 
addr)
ip_mc_hash_add(in_dev, im);
 
 #ifdef CONFIG_IP_MULTICAST
-   igmpv3_del_delrec(in_dev, im->multiaddr);
+   igmpv3_del_delrec(in_dev, im);
 #endif
igmp_group_added(im);
if (!in_dev->dead)
@@ -1626,8 +1641,12 @@ void ip_mc_remap(struct in_device *in_dev)
 
ASSERT_RTNL();
 
-   for_each_pmc_rtnl(in_dev, pmc)
+   for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+   igmpv3_del_delrec(in_dev, pmc);
+#endif
igmp_group_added(pmc);
+   }
 }
 
 /* Device going down */
@@ -1648,7 +1667,6 @@ void ip_mc_down(struct in_device *in_dev)
in_dev->mr_gq_running = 0;
if (del_timer(_dev->mr_gq_timer))
__in_dev_put(in_dev);
-   igmpv3_clear_delrec(in_dev);
 #endif
 
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
@@ -1688,8 +1706,12 @@ void ip_mc_up(struct in_device *in_dev)
 #endif
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
 
-   for_each_pmc_rtnl(in_dev, pmc)
+   for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+   igmpv3_del_delrec(in_dev, pmc);
+#endif
igmp_group_added(pmc);
+   }
 }
 
 /*
@@ -1704,13 +1726,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
 
/* Deactivate timers */
ip_mc_down(in_dev);
+#ifdef CONFIG_IP_MULTICAST
+   igmpv3_clear_delrec(in_dev);
+#endif
 
while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
in_dev->mc_list = i->next_rcu;
in_dev->mc_count--;
-
-   /* We've dropped the groups in ip_mc_down already */
-   ip_mc_clear_src(i);
ip_ma_put(i);
}
 }
-- 
2.5.5

Re: 484611357c19 introduces arbitrary kernel write bug (root-only)

2016-11-08 Thread Josef Bacik


On 11/08/2016 07:23 PM, Jann Horn wrote:

In 484611357c19 (not in any stable kernel yet), functionality is
introduced that allows root (and afaics nobody else, since nobody else
is allowed to perform pointer arithmetic) to basically write to (and
read from) arbitrary kernel memory. There are multiple bugs in the
validation logic:

 - A bitwise AND of values in the ranges [a,b] and [c,d] is assumed to
always result in a value
   >= a However, for the combination of ranges [1,1] and [1,2],
this calculates a minimum of 1
   while actually, 1&2 is zero. This is the bug that my crasher
(below) triggers.


Ugh crap.  I had this logic right before, but changed it to deal with the case 
of -value & -value which would make the min_value -value.  Instead if min and 
max are both positive then the min should be 0.  I'll fix this up and add a 
testcase, nice catch.



 - a%b is assumed to always be smaller than b-1. However, for b==0,
this will calculate an upper
   limit of -1 while the values will actually always be zero.


Yup you're right.


 - I'm not sure about this, but I think that, when only one end of the
range is bounded, the logic will
   incorrectly also treat the other end as a bounded, and because of
the usage of bound
   placeholders that are smaller than the actual maximum values, this
could be used to perform
   out-of-bounds accesses.


Yeah I think you're right: if we have register A min bounded at, say,
REGISTER_MAX_VALUE, and then have register B not min bounded at all, so we
default to the REGISTER_MIN_VALUE, and we did an add, we could end up thinking
the minimum was 0 when it could be anything.  I'll fix this as well.


Thanks for looking at all this, I'll get this fixed up in the morning with test 
cases and send it out,


Josef

Re: [v2] cw1200: Don't leak memory if krealloc failes

2016-11-08 Thread Kalle Valo

Johannes Thumshirn  wrote:
> The call to krealloc() in wsm_buf_reserve() directly assigns the newly
> returned memory to buf->begin. This is all fine except when krealloc()
> fails we lose the ability to free the old memory pointed to by
> buf->begin. If we just create a temporary variable to assign memory to
> and assign the memory to it we can mitigate the memory leak.
> 
> Signed-off-by: Johannes Thumshirn 
> Cc: Johannes Berg 

Patch applied to wireless-drivers-next.git, thanks.

9afdd6128c39 cw1200: Don't leak memory if krealloc failes

-- 
https://patchwork.kernel.org/patch/9358185/

Documentation about submitting wireless patches and checking status
from patchwork:

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

Re: brcmfmac: proto: add callback for queuing TX data

2016-11-08 Thread Kalle Valo

Rafał Miłecki wrote:
> From: Rafał Miłecki 
> 
> So far our core code was calling brcmf_fws_process_skb which wasn't
> a proper thing to do. In case of devices using msgbuf protocol, fwsignal
> shouldn't be used. It was an unnecessary extra layer simply calling
> a protocol specific txdata function.
> 
> Please note we already have txdata callback, but it's used for calls
> between bcdc and fwsignal so it couldn't be simply used there.
> 
> This makes core code more generic (instead of bcdc/fwsignal specific).
> 
> Signed-off-by: Rafał Miłecki 

Patch applied to wireless-drivers-next.git, thanks.

b073ac1fcf42 brcmfmac: proto: add callback for queuing TX data

-- 
https://patchwork.kernel.org/patch/9351305/

Documentation about submitting wireless patches and checking status
from patchwork:

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

Re: [PATCH] wireless: fix bogus maybe-uninitialized warning

2016-11-08 Thread Kalle Valo

Arnd Bergmann  writes:

> The hostap_80211_rx() function is supposed to set up the mac addresses
> for four possible cases, based on two bits of input data. For
> some reason, gcc decides that it's possible that none of these
> four cases apply and the addresses remain uninitialized:
>
> drivers/net/wireless/intersil/hostap/hostap_80211_rx.c: In function 
> ‘hostap_80211_rx’:
> arch/x86/include/asm/string_32.h:77:14: warning: ‘src’ may be used 
> uninitialized in this function [-Wmaybe-uninitialized]
> drivers/net/wireless/intel/ipw2x00/libipw_rx.c: In function ‘libipw_rx’:
> arch/x86/include/asm/string_32.h:77:14: error: ‘dst’ may be used 
> uninitialized in this function [-Werror=maybe-uninitialized]
> arch/x86/include/asm/string_32.h:78:22: error: ‘*((void *)+4)’ may be 
> used uninitialized in this function [-Werror=maybe-uninitialized]
>
> This warning is clearly nonsense, but changing the last case into
> 'default' makes it obvious to the compiler too, which avoids the
> warning and probably leads to better object code too.
>
> The same code is duplicated several times in the kernel, so this
> patch uses the same workaround for all copies. The exact configuration
> was hit only very rarely in randconfig builds and I only saw it
> in three drivers, but I assume that all of them are potentially
> affected, and it's better to keep the code consistent.
>
> Signed-off-by: Arnd Bergmann 
> ---
>  drivers/net/wireless/ath/ath6kl/wmi.c  | 8 
>  drivers/net/wireless/intel/ipw2x00/libipw_rx.c | 2 +-
>  drivers/net/wireless/intersil/hostap/hostap_80211_rx.c | 2 +-
>  net/wireless/lib80211_crypt_tkip.c | 2 +-
>  4 files changed, 7 insertions(+), 7 deletions(-)

[...]

> --- a/net/wireless/lib80211_crypt_tkip.c
> +++ b/net/wireless/lib80211_crypt_tkip.c
> @@ -556,7 +556,7 @@ static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
>   memcpy(hdr, hdr11->addr3, ETH_ALEN);/* DA */
>   memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
>   break;
> - case 0:
> + default:
>   memcpy(hdr, hdr11->addr1, ETH_ALEN);/* DA */
>   memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
>   break;

Ideally we prefer that drivers/net/wireless and net/wireless changes are
split into different patches as they get applied to different trees.
Johannes, is it ok if I take this change through my tree this time?

-- 
Kalle Valo

Re: 484611357c19 introduces arbitrary kernel write bug (root-only)

2016-11-08 Thread Andy Lutomirski

On Tue, Nov 8, 2016 at 4:23 PM, Jann Horn  wrote:
> In 484611357c19 (not in any stable kernel yet), functionality is
> introduced that allows root (and afaics nobody else, since nobody else
> is allowed to perform pointer arithmetic) to basically write to (and
> read from) arbitrary kernel memory. There are multiple bugs in the
> validation logic:
>

I was curious, so I gave the code a quick read.  I also see:

+   /* PTR_TO_MAP_VALUE_ADJ is used for doing pointer math inside of a map
+* elem value.  We only allow this if we can statically verify that
+* access from this register are going to fall within the size of the
+* map element.
+*/
+   PTR_TO_MAP_VALUE_ADJ,

shouldn't this document what logical type this is?  Is it a pointer?
Is it an offset?  (It seems to be checked as though it's a pointer
with a max offset of "max_value", which makes very little sense to
me.)

regs[i].min_value = BPF_REGISTER_MIN_RANGE;
where min_value is a u64 and BPF_REGISTER_MIN_RANGE is negative.
Shouldn't those be s64?

init_reg_state() duplicates reset_reg_range_values().

That's all I've read so far.

484611357c19 introduces arbitrary kernel write bug (root-only)

2016-11-08 Thread Jann Horn

In 484611357c19 (not in any stable kernel yet), functionality is
introduced that allows root (and afaics nobody else, since nobody else
is allowed to perform pointer arithmetic) to basically write to (and
read from) arbitrary kernel memory. There are multiple bugs in the
validation logic:

 - A bitwise AND of values in the ranges [a,b] and [c,d] is assumed to
always result in a value
   >= a However, for the combination of ranges [1,1] and [1,2],
this calculates a minimum of 1
   while actually, 1&2 is zero. This is the bug that my crasher
(below) triggers.
 - a%b is assumed to always be smaller than b-1. However, for b==0,
this will calculate an upper
   limit of -1 while the values will actually always be zero.
 - I'm not sure about this, but I think that, when only one end of the
range is bounded, the logic will
   incorrectly also treat the other end as a bounded, and because of
the usage of bound
   placeholders that are smaller than the actual maximum values, this
could be used to perform
   out-of-bounds accesses.

The fun part here is that, as soon as the validation is just
off-by-one, arithmetic transformations can be used to turn that into
out-of-bounds accesses at arbitrary offsets. The crasher turns the
off-by-one into a memory write at offset 0x1000.

Here's the crasher program:
=
#define _GNU_SOURCE
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/* start from kernel */
#define BPF_EMIT_CALL(FUNC) \
((struct bpf_insn) {\
.code  = BPF_JMP | BPF_CALL,\
.dst_reg = 0,   \
.src_reg = 0,   \
.off   = 0, \
.imm   = (FUNC) }) /* ??? */
#define BPF_MOV32_IMM(DST, IMM) \
((struct bpf_insn) {\
.code  = BPF_ALU | BPF_MOV | BPF_K, \
.dst_reg = DST, \
.src_reg = 0,   \
.off   = 0, \
.imm   = IMM })
#define BPF_REG_ARG1BPF_REG_1
#define BPF_REG_ARG2BPF_REG_2
#define BPF_REG_ARG3BPF_REG_3
#define BPF_REG_ARG4BPF_REG_4
#define BPF_REG_ARG5BPF_REG_5
#define BPF_PSEUDO_MAP_FD   1
#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
((struct bpf_insn) {\
.code  = BPF_LD | BPF_DW | BPF_IMM, \
.dst_reg = DST, \
.src_reg = SRC, \
.off   = 0, \
.imm   = (__u32) (IMM) }),  \
((struct bpf_insn) {\
.code  = 0, /* zero is reserved opcode */   \
.dst_reg = 0,   \
.src_reg = 0,   \
.off   = 0, \
.imm   = ((__u64) (IMM)) >> 32 })
#define BPF_ALU32_IMM(OP, DST, IMM) \
((struct bpf_insn) {\
.code  = BPF_ALU | BPF_OP(OP) | BPF_K,  \
.dst_reg = DST, \
.src_reg = 0,   \
.off   = 0, \
.imm   = IMM })
#define BPF_LD_MAP_FD(DST, MAP_FD)  \
BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
#define BPF_ALU32_REG(OP, DST, SRC) \
((struct bpf_insn) {\
.code  = BPF_ALU | BPF_OP(OP) | BPF_X,  \
.dst_reg = DST, \
.src_reg = SRC, \
.off   = 0, \
.imm   = 0 })
#define BPF_EXIT_INSN() \
((struct bpf_insn) {\
.code  = BPF_JMP | BPF_EXIT,\
.dst_reg = 0,   \
.src_reg = 0,   \
.off   = 0, \
.imm   = 0 })
/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
#define BPF_STX_MEM(SIZE, DST, SRC, OFF)\
((struct bpf_insn) {\
.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,\
.dst_reg = DST, \
.src_reg = SRC, \
.off   = OFF,   \
.imm   = 0 })
#define BPF_REG_FP  BPF_REG_10
#define BPF_MOV64_REG(DST, SRC) \
((struct bpf_insn) {\
.code  = BPF_ALU64 | BPF_MOV | BPF_X,   \
.dst_reg = DST, \
.src_reg = SRC, \
.off   = 0, \
.imm   = 0 })
#define BPF_ALU64_IMM(OP, DST, IMM) \
((struct bpf_insn) {\
.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,\
.dst_reg = DST, \
.src_reg = 0,   \
.off   = 0, \
.imm   = IMM })
#define BPF_MOV64_REG(DST, SRC) \
((struct bpf_insn) {\
.code  = BPF_ALU64 | BPF_MOV | BPF_X,   \
.dst_reg = DST,

Re: [RESEND][PATCH v4] cgroup: Use CAP_SYS_RESOURCE to allow a process to migrate other tasks between cgroups

2016-11-08 Thread Andy Lutomirski

On Tue, Nov 8, 2016 at 4:03 PM, Alexei Starovoitov
 wrote:
> On Tue, Nov 08, 2016 at 03:51:40PM -0800, Andy Lutomirski wrote:
>> On Tue, Nov 8, 2016 at 3:28 PM, John Stultz  wrote:
>> > This patch adds logic to allow a process to migrate other tasks
>> > between cgroups if they have CAP_SYS_RESOURCE.
>> >
>> > In Android (where this feature originated), the ActivityManager tracks
>> > various application states (TOP_APP, FOREGROUND, BACKGROUND, SYSTEM,
>> > etc), and then as applications change states, the SchedPolicy logic
>> > will migrate the application tasks between different cgroups used
>> > to control the different application states (for example, there is a
>> > background cpuset cgroup which can limit background tasks to stay
>> > on one low-power cpu, and the bg_non_interactive cpuctrl cgroup can
>> > then further limit those background tasks to a small percentage of
>> > that one cpu's cpu time).
>> >
>> > However, for security reasons, Android doesn't want to make the
>> > system_server (the process that runs the ActivityManager and
>> > SchedPolicy logic), run as root. So in the Android common.git
>> > kernel, they have some logic to allow cgroups to loosen their
>> > permissions so CAP_SYS_NICE tasks can migrate other tasks between
>> > cgroups.
>> >
>> > I feel the approach taken there overloads CAP_SYS_NICE a bit much
>> > for non-android environments.
>> >
>> > So this patch, as suggested by Michael Kerrisk, simply adds a
>> > check for CAP_SYS_RESOURCE.
>> >
>> > I've tested this with AOSP master, and this seems to work well
>> > as Zygote and system_server already use CAP_SYS_RESOURCE. I've
>> > also submitted patches against the android-4.4 kernel to change
>> > it to use CAP_SYS_RESOURCE, and the Android developers just merged
>> > it.
>> >
>>
>> I hate to say it, but I think I may see a problem.  Current
>> developments are afoot to make cgroups do more than resource control.
>> For example, there's Landlock and there's Daniel's ingress/egress
>> filter thing.  Current cgroup controllers can mostly just DoS their
>> controlled processes.  These new controllers (or controller-like
>> things) can exfiltrate data and change semantics.
>>
>> Does anyone have a security model in mind for these controllers and
>> the cgroups that they're attached to?  I'm reasonably confident that
>> CAP_SYS_RESOURCE is not the answer...
>
> and specifically the answer is... ?
> Also would be great if you start with specifying the question first
> and the problem you're trying to solve.
>

I don't have a good answer right now.  Here are some constraints, though:

1. An insufficiently privileged process should not be able to move a
victim into a dangerous cgroup.

2. An insufficiently privileged process should not be able to move
itself into a dangerous cgroup and then use execve to gain privilege
such that the execve'd program can be compromised.

3. An insufficiently privileged process should not be able to make an
existing cgroup dangerous in a way that could compromise a victim in
that cgroup.

4. An insufficiently privileged process should not be able to make a
cgroup dangerous in a way that bypasses protections that would
otherwise protect execve() as used by itself or some other process in
that cgroup.

Keep in mind that "dangerous" may apply to a cgroup's descendents in
addition to the cgroup being controlled.

Re: [PATCH v2 3/6] qedi: Add QLogic FastLinQ offload iSCSI driver framework.

2016-11-08 Thread Martin K. Petersen

> "Arun" == Arun Easi  writes:

>> It's fine to post the patches split up to ease the review
>> process. But whatever we commit must obviously be bisectable.

Arun> If it is alright with you, we would like to have all of our
Arun> initial patches for the driver (qedi) squashed as a single commit
Arun> to the tree. We will ensure that this single combined commit
Arun> compiles clean.

That's fine with me.

-- 
Martin K. Petersen  Oracle Linux Engineering

Re: [RESEND][PATCH v4] cgroup: Use CAP_SYS_RESOURCE to allow a process to migrate other tasks between cgroups

2016-11-08 Thread Alexei Starovoitov

On Tue, Nov 08, 2016 at 03:51:40PM -0800, Andy Lutomirski wrote:
> On Tue, Nov 8, 2016 at 3:28 PM, John Stultz  wrote:
> > This patch adds logic to allow a process to migrate other tasks
> > between cgroups if they have CAP_SYS_RESOURCE.
> >
> > In Android (where this feature originated), the ActivityManager tracks
> > various application states (TOP_APP, FOREGROUND, BACKGROUND, SYSTEM,
> > etc), and then as applications change states, the SchedPolicy logic
> > will migrate the application tasks between different cgroups used
> > to control the different application states (for example, there is a
> > background cpuset cgroup which can limit background tasks to stay
> > on one low-power cpu, and the bg_non_interactive cpuctrl cgroup can
> > then further limit those background tasks to a small percentage of
> > that one cpu's cpu time).
> >
> > However, for security reasons, Android doesn't want to make the
> > system_server (the process that runs the ActivityManager and
> > SchedPolicy logic), run as root. So in the Android common.git
> > kernel, they have some logic to allow cgroups to loosen their
> > permissions so CAP_SYS_NICE tasks can migrate other tasks between
> > cgroups.
> >
> > I feel the approach taken there overloads CAP_SYS_NICE a bit much
> > for non-android environments.
> >
> > So this patch, as suggested by Michael Kerrisk, simply adds a
> > check for CAP_SYS_RESOURCE.
> >
> > I've tested this with AOSP master, and this seems to work well
> > as Zygote and system_server already use CAP_SYS_RESOURCE. I've
> > also submitted patches against the android-4.4 kernel to change
> > it to use CAP_SYS_RESOURCE, and the Android developers just merged
> > it.
> >
> 
> I hate to say it, but I think I may see a problem.  Current
> developments are afoot to make cgroups do more than resource control.
> For example, there's Landlock and there's Daniel's ingress/egress
> filter thing.  Current cgroup controllers can mostly just DoS their
> controlled processes.  These new controllers (or controller-like
> things) can exfiltrate data and change semantics.
> 
> Does anyone have a security model in mind for these controllers and
> the cgroups that they're attached to?  I'm reasonably confident that
> CAP_SYS_RESOURCE is not the answer...

and specifically the answer is... ?
Also would be great if you start with specifying the question first
and the problem you're trying to solve.

Re: [PATCH v2 3/6] qedi: Add QLogic FastLinQ offload iSCSI driver framework.

2016-11-08 Thread Arun Easi

Martin,

On Tue, 8 Nov 2016, 3:49pm -, Martin K. Petersen wrote:

> > "Arun" == Arun Easi  writes:
> 
> Arun,
> 
> Arun> qedi is the new iSCSI driver, which we are trying to submit, for
> Arun> our 41000 series CNA. This patch series were broken up into
> Arun> logical blocks for review purpose, but were not made to compile
> Arun> individually. It is our impression that this is acceptable for
> Arun> SCSI and all the initial "qedi" patches will be squashed and
> Arun> committed as a single commit. Please let us know if we are
> Arun> mistaken, and if so, we will post another series with this taken
> Arun> care of.
> 
> It's fine to post the patches split up to ease the review process. But
> whatever we commit must obviously be bisectable.
> 

If it is alright with you, we would like to have all of our initial 
patches for the driver (qedi) squashed as a single commit to the tree. We 
will ensure that this single combined commit compiles clean.

Regards,
-Arun

Re: [RESEND][PATCH v4] cgroup: Use CAP_SYS_RESOURCE to allow a process to migrate other tasks between cgroups

2016-11-08 Thread Andy Lutomirski

On Tue, Nov 8, 2016 at 3:28 PM, John Stultz  wrote:
> This patch adds logic to allow a process to migrate other tasks
> between cgroups if they have CAP_SYS_RESOURCE.
>
> In Android (where this feature originated), the ActivityManager tracks
> various application states (TOP_APP, FOREGROUND, BACKGROUND, SYSTEM,
> etc), and then as applications change states, the SchedPolicy logic
> will migrate the application tasks between different cgroups used
> to control the different application states (for example, there is a
> background cpuset cgroup which can limit background tasks to stay
> on one low-power cpu, and the bg_non_interactive cpuctrl cgroup can
> then further limit those background tasks to a small percentage of
> that one cpu's cpu time).
>
> However, for security reasons, Android doesn't want to make the
> system_server (the process that runs the ActivityManager and
> SchedPolicy logic), run as root. So in the Android common.git
> kernel, they have some logic to allow cgroups to loosen their
> permissions so CAP_SYS_NICE tasks can migrate other tasks between
> cgroups.
>
> I feel the approach taken there overloads CAP_SYS_NICE a bit much
> for non-android environments.
>
> So this patch, as suggested by Michael Kerrisk, simply adds a
> check for CAP_SYS_RESOURCE.
>
> I've tested this with AOSP master, and this seems to work well
> as Zygote and system_server already use CAP_SYS_RESOURCE. I've
> also submitted patches against the android-4.4 kernel to change
> it to use CAP_SYS_RESOURCE, and the Android developers just merged
> it.
>

I hate to say it, but I think I may see a problem.  Current
developments are afoot to make cgroups do more than resource control.
For example, there's Landlock and there's Daniel's ingress/egress
filter thing.  Current cgroup controllers can mostly just DoS their
controlled processes.  These new controllers (or controller-like
things) can exfiltrate data and change semantics.

Does anyone have a security model in mind for these controllers and
the cgroups that they're attached to?  I'm reasonably confident that
CAP_SYS_RESOURCE is not the answer...

Re: [PATCH v2 3/6] qedi: Add QLogic FastLinQ offload iSCSI driver framework.

2016-11-08 Thread Martin K. Petersen

> "Arun" == Arun Easi  writes:

Arun,

Arun> qedi is the new iSCSI driver, which we are trying to submit, for
Arun> our 41000 series CNA. This patch series were broken up into
Arun> logical blocks for review purpose, but were not made to compile
Arun> individually. It is our impression that this is acceptable for
Arun> SCSI and all the initial "qedi" patches will be squashed and
Arun> committed as a single commit. Please let us know if we are
Arun> mistaken, and if so, we will post another series with this taken
Arun> care of.

It's fine to post the patches split up to ease the review process. But
whatever we commit must obviously be bisectable.

-- 
Martin K. Petersen  Oracle Linux Engineering

Re: [PATCH v2 3/6] qedi: Add QLogic FastLinQ offload iSCSI driver framework.

2016-11-08 Thread Arun Easi

[ Sending on behalf of Manish to cover for the time difference. ]

Hi Martin, James,

I would like to request your input on this kbuild test error on the 
series, where the patches compile fine together, but the series is not bisectable.

qedi is the new iSCSI driver, which we are trying to submit, for our 41000 
series CNA. This patch series were broken up into logical blocks for 
review purpose, but were not made to compile individually. It is our 
impression that this is acceptable for SCSI and all the initial "qedi" 
patches will be squashed and committed as a single commit. Please let us 
know if we are mistaken, and if so, we will post another series 
with this taken care of.

FYI, this series accompanies additions to the common core module, "qed", 
that goes under drivers/net/. The patches for the qed module compile fine 
individually and so are bisectable.

In regards to the additional warnings brought out by kbuild test on 
"PATCH v2 6/6" and "PATCH v2 3/6", we will post a v3 with the fixes.

Regards,
-Arun

On Tue, 8 Nov 2016, 2:52am -, kbuild test robot wrote:

> Hi Manish,
> 
> [auto build test ERROR on net-next/master]
> [also build test ERROR on v4.9-rc4]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improve the system]
> 
> url:
> https://github.com/0day-ci/linux/commits/Manish-Rangankar/qed-Add-support-for-hardware-offloaded-iSCSI/20161108-180027
> config: ia64-allmodconfig (attached as .config)
> compiler: ia64-linux-gcc (GCC) 6.2.0
> reproduce:
> wget 
> https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
>  -O ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # save the attached .config to linux build tree
> make.cross ARCH=ia64 
> 
> Note: the 
> linux-review/Manish-Rangankar/qed-Add-support-for-hardware-offloaded-iSCSI/20161108-180027
>  HEAD dd4d1d0e0785d20cdcfdf9b2c792c564a79b2de2 builds fine.
>   It only hurts bisectibility.
>

Re: [PATCH] net: ethernet: ti: davinci_cpdma: free memory while channel destroy

2016-11-08 Thread Grygorii Strashko




On 11/08/2016 07:16 AM, Ivan Khoronzhuk wrote:

While create/destroy channel operation memory is not freed. It was
supposed that memory is freed while driver remove. But a channel
can be created and destroyed many times while changing number of
channels with ethtool.

Based on net-next/master


^?



Signed-off-by: Ivan Khoronzhuk 


Reviewed-by: Grygorii Strashko 


---
 drivers/net/ethernet/ti/davinci_cpdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c 
b/drivers/net/ethernet/ti/davinci_cpdma.c
index 05afc05..07fc92d 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -586,7 +586,7 @@ int cpdma_chan_destroy(struct cpdma_chan *chan)
cpdma_chan_stop(chan);
ctlr->channels[chan->chan_num] = NULL;
ctlr->chan_num--;
-
+   devm_kfree(ctlr->dev, chan);
cpdma_chan_split_pool(ctlr);

spin_unlock_irqrestore(&ctlr->lock, flags);



--
regards,
-grygorii

Re: [PATCH net v2 2/4] net: ethernet: ti: cpsw: fix device and of_node leaks

2016-11-08 Thread Grygorii Strashko




On 11/03/2016 12:40 PM, Johan Hovold wrote:

Make sure to drop the references taken by of_get_child_by_name() and
bus_find_device() before returning from cpsw_phy_sel().

Note that holding a reference to the cpsw-phy-sel device does not
prevent the devres-managed private data from going away.

Fixes: 5892cd135e16 ("drivers: net: cpsw-phy-sel: Add new driver...")
Cc: Mugunthan V N 
Cc: Grygorii Strashko 
Cc: linux-o...@vger.kernel.org
Signed-off-by: Johan Hovold 
---


Reviewed-by: Grygorii Strashko 


 drivers/net/ethernet/ti/cpsw-phy-sel.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw-phy-sel.c 
b/drivers/net/ethernet/ti/cpsw-phy-sel.c
index 054a8dd23dae..ba1e45ff6aae 100644
--- a/drivers/net/ethernet/ti/cpsw-phy-sel.c
+++ b/drivers/net/ethernet/ti/cpsw-phy-sel.c
@@ -176,9 +176,12 @@ void cpsw_phy_sel(struct device *dev, phy_interface_t 
phy_mode, int slave)
}

dev = bus_find_device(&platform_bus_type, NULL, node, match);
+   of_node_put(node);
priv = dev_get_drvdata(dev);

priv->cpsw_phy_sel(priv, phy_mode, slave);
+
+   put_device(dev);
 }
 EXPORT_SYMBOL_GPL(cpsw_phy_sel);




--
regards,
-grygorii

Re: [PATCH] net/netfilter: Fix use uninitialized warn in nft_range_eval()

2016-11-08 Thread Pablo Neira Ayuso

On Mon, Nov 07, 2016 at 08:41:14AM -0700, Shuah Khan wrote:
> Fix the following warn:
> 
>CC [M]  net/netfilter/nft_range.o
> 8601,8605c9105
>  net/netfilter/nft_range.c: In function ‘nft_range_eval’:
>  net/netfilter/nft_range.c:45:5: warning: ‘mismatch’ may be used 
> uninitialized in this function [-Wmaybe-uninitialized]
>if (mismatch)
>   ^

You are probably using an old tree snapshot? This was already fixed by:

commit d2e4d593516e877f1f6fb40031eb495f36606e16
Author: Arnd Bergmann 
Date:   Tue Oct 18 00:05:30 2016 +0200

netfilter: nf_tables: avoid uninitialized variable warning

The newly added nft_range_eval() function handles the two possible
nft range operations, but as the compiler warning points out,
any unexpected value would lead to the 'mismatch' variable being
used without being initialized:

Re: [iproute PATCH 1/2] ipaddress: Simplify vf_info parsing

2016-11-08 Thread Greg

On Tue, 2016-11-08 at 22:29 +0100, Phil Sutter wrote:
> Commit 7b8179c780a1a ("iproute2: Add new command to ip link to
> enable/disable VF spoof check") tried to add support for
> IFLA_VF_SPOOFCHK in a backwards-compatible manner, but apparently overdid
> it: parse_rtattr_nested() handles missing attributes perfectly fine in
> that it will leave the relevant field unassigned so calling code can
> just compare against NULL. There is no need to layback from the previous
> (IFLA_VF_TX_RATE) attribute to the next to check if IFLA_VF_SPOOFCHK is
> present or not. To the contrary, it establishes a potentially incorrect
> assumption of these two attributes directly following each other which
> may not be the case (although up to now, kernel aligns them this way).
> 
> This patch cleans up the code to adhere to the common way of checking
> for attribute existence. It has been tested to return correct results
> regardless of whether the kernel exports IFLA_VF_SPOOFCHK or not.
> 
> Signed-off-by: Phil Sutter 
> ---
>  ip/ipaddress.c | 44 ++--
>  1 file changed, 10 insertions(+), 34 deletions(-)
> 
> diff --git a/ip/ipaddress.c b/ip/ipaddress.c
> index 7f05258f43453..df0f1b9c94c58 100644
> --- a/ip/ipaddress.c
> +++ b/ip/ipaddress.c
> @@ -322,10 +322,7 @@ static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
>  {
>   struct ifla_vf_mac *vf_mac;
>   struct ifla_vf_tx_rate *vf_tx_rate;
> - struct ifla_vf_spoofchk *vf_spoofchk;
> - struct ifla_vf_link_state *vf_linkstate;
>   struct rtattr *vf[IFLA_VF_MAX + 1] = {};
> - struct rtattr *tmp;
>  
>   SPRINT_BUF(b1);
>  
> @@ -339,31 +336,6 @@ static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
>   vf_mac = RTA_DATA(vf[IFLA_VF_MAC]);
>   vf_tx_rate = RTA_DATA(vf[IFLA_VF_TX_RATE]);
>  
> - /* Check if the spoof checking vf info type is supported by
> -  * this kernel.
> -  */
> - tmp = (struct rtattr *)((char *)vf[IFLA_VF_TX_RATE] +
> - vf[IFLA_VF_TX_RATE]->rta_len);
> -
> - if (tmp->rta_type != IFLA_VF_SPOOFCHK)
> - vf_spoofchk = NULL;
> - else
> - vf_spoofchk = RTA_DATA(vf[IFLA_VF_SPOOFCHK]);
> -
> - if (vf_spoofchk) {
> - /* Check if the link state vf info type is supported by
> -  * this kernel.
> -  */
> - tmp = (struct rtattr *)((char *)vf[IFLA_VF_SPOOFCHK] +
> - vf[IFLA_VF_SPOOFCHK]->rta_len);
> -
> - if (tmp->rta_type != IFLA_VF_LINK_STATE)
> - vf_linkstate = NULL;
> - else
> - vf_linkstate = RTA_DATA(vf[IFLA_VF_LINK_STATE]);
> - } else
> - vf_linkstate = NULL;
> -
>   fprintf(fp, "%svf %d MAC %s", _SL_, vf_mac->vf,
>   ll_addr_n2a((unsigned char *)&vf_mac->mac,
>   ETH_ALEN, 0, b1, sizeof(b1)));
> @@ -407,14 +379,18 @@ static void print_vfinfo(FILE *fp, struct rtattr 
> *vfinfo)
>   if (vf_rate->min_tx_rate)
>   fprintf(fp, ", min_tx_rate %dMbps", 
> vf_rate->min_tx_rate);
>   }
> + if (vf[IFLA_VF_SPOOFCHK]) {
> + struct ifla_vf_spoofchk *vf_spoofchk =
> + RTA_DATA(vf[IFLA_VF_SPOOFCHK]);
>  
> - if (vf_spoofchk && vf_spoofchk->setting != -1) {
> - if (vf_spoofchk->setting)
> - fprintf(fp, ", spoof checking on");
> - else
> - fprintf(fp, ", spoof checking off");
> + if (vf_spoofchk->setting != -1)
> + fprintf(fp, ", spoof checking %s",
> + vf_spoofchk->setting ? "on" : "off");

I wrote some of this code at a time when I was pretty new to Linux
kernel net programming and I really just didn't understand it.  It
appears you're doing it more correctly than I.

Thanks for cleaning it up.

Reviewed-by: Greg Rose 

- Greg

>   }
> - if (vf_linkstate) {
> + if (vf[IFLA_VF_LINK_STATE]) {
> + struct ifla_vf_link_state *vf_linkstate =
> + RTA_DATA(vf[IFLA_VF_LINK_STATE]);
> +
>   if (vf_linkstate->link_state == IFLA_VF_LINK_STATE_AUTO)
>   fprintf(fp, ", link-state auto");
>   else if (vf_linkstate->link_state == IFLA_VF_LINK_STATE_ENABLE)

Re: [Patch net] ipvs: use IPVS_CMD_ATTR_MAX for family.maxattr

2016-11-08 Thread Pablo Neira Ayuso

On Fri, Nov 04, 2016 at 11:58:44AM +0100, Simon Horman wrote:
> On Thu, Nov 03, 2016 at 05:14:03PM -0700, Cong Wang wrote:
> > family.maxattr is the max index for policy[], the size of
> > ops[] is determined with ARRAY_SIZE().
> > 
> > Reported-by: Andrey Konovalov 
> > Tested-by: Andrey Konovalov 
> > Cc: Pablo Neira Ayuso 
> > Signed-off-by: Cong Wang 
> 
> 
> Signed-off-by: Simon Horman 
> 
> Pablo, can you take this one into nf?

Applied, thanks!

[PATCH] net: ipv4: ip_send_unicast_reply should set oif only if it is L3 master

2016-11-08 Thread David Ahern

Lorenzo noted an Android unit test failed due to commit e0d56fdd7342:
  "The expectation in the test was that the RST replying to a SYN sent to a
  closed port should be generated with oif=0. In other words it should not
  prefer the interface where the SYN came in on, but instead should follow
  whatever the routing table says it should do."

Since this is a change in behavior, revert the change to
ip_send_unicast_reply such that the oif in the flow is set to the skb_iif
only if skb_iif is an L3 master.

Fixes: e0d56fdd7342 ("net: l3mdev: remove redundant calls")
Reported-by: Lorenzo Colitti 
Signed-off-by: David Ahern 
---
 net/ipv4/ip_output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 49714010ac2e..9403fa3850be 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1577,7 +1577,8 @@ void ip_send_unicast_reply(struct sock *sk, struct 
sk_buff *skb,
}
 
oif = arg->bound_dev_if;
-   oif = oif ? : skb->skb_iif;
+   if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+   oif = skb->skb_iif;
 
flowi4_init_output(&fl4, oif,
   IP4_REPLY_MARK(net, skb->mark),
-- 
2.1.4

Re: [PATCH 7/8] tools lib bpf: fix maps resolution

2016-11-08 Thread Wangnan (F)


Hi Eric,

While testing this patch I found a segfault; please see the inline comment.

In addition, since both the BPF map array and map names should be done
after symbol table is collected, merging bpf_object__init_maps and
bpf_object__init_maps_name would be a good practice, making code
simpler.

So I prepare a new patch. Please have a look at:

http://lkml.kernel.org/g/20161108215734.28905-1-wangn...@huawei.com

The new version ensures there is no crash in any case where the user provides
a corrupted maps section, including an array of bpf maps, maps with different
definition structures, and a very short map definition.

Thank you.

On 2016/10/16 14:18, Eric Leblond wrote:

It is not correct to assimilate the elf data of the maps section
to an array of map definition. In fact the sizes differ. The
offset provided in the symbol section has to be used instead.

This patch fixes a bug causing a elf with two maps not to load
correctly.

Signed-off-by: Eric Leblond 
---
  tools/lib/bpf/libbpf.c | 50 +++---
  1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1fe4532..f72628b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -186,6 +186,7 @@ struct bpf_program {
  struct bpf_map {
int fd;
char *name;
+   size_t offset;
struct bpf_map_def def;
void *priv;
bpf_map_clear_priv_t clear_priv;
@@ -529,13 +530,6 @@ bpf_object__init_maps(struct bpf_object *obj, void *data,
  
  	pr_debug("maps in %s: %zd bytes\n", obj->path, size);
  
-	obj->maps = calloc(nr_maps, sizeof(obj->maps[0]));

-   if (!obj->maps) {
-   pr_warning("alloc maps for object failed\n");
-   return -ENOMEM;
-   }
-   obj->nr_maps = nr_maps;
-
for (i = 0; i < nr_maps; i++) {
struct bpf_map_def *def = &obj->maps[i].def;
  
@@ -547,23 +541,42 @@ bpf_object__init_maps(struct bpf_object *obj, void *data,

obj->maps[i].fd = -1;
  
  		/* Save map definition into obj->maps */

-   *def = ((struct bpf_map_def *)data)[i];
+   *def = *(struct bpf_map_def *)(data + obj->maps[i].offset);
}


Here, nr_maps is still size / sizeof(struct bpf_map_def), so obj->maps[i]
can be invalid.


return 0;
  }
  
  static int

-bpf_object__init_maps_name(struct bpf_object *obj)
+bpf_object__init_maps_symbol(struct bpf_object *obj)
  {
int i;
+   int nr_maps = 0;
Elf_Data *symbols = obj->efile.symbols;
+   size_t map_idx = 0;
  
  	if (!symbols || obj->efile.maps_shndx < 0)

return -EINVAL;
  
+	/* get the number of maps */

+   for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
+   GElf_Sym sym;
+
+   if (!gelf_getsym(symbols, i, &sym))
+   continue;
+   if (sym.st_shndx != obj->efile.maps_shndx)
+   continue;
+   nr_maps++;
+   }
+
+   obj->maps = calloc(nr_maps, sizeof(obj->maps[0]));
+   if (!obj->maps) {
+   pr_warning("alloc maps for object failed\n");
+   return -ENOMEM;
+   }
+   obj->nr_maps = nr_maps;
+
for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
GElf_Sym sym;
-   size_t map_idx;
const char *map_name;
  
  		if (!gelf_getsym(symbols, i, &sym))

@@ -574,12 +587,12 @@ bpf_object__init_maps_name(struct bpf_object *obj)
map_name = elf_strptr(obj->efile.elf,
  obj->efile.strtabidx,
  sym.st_name);
-   map_idx = sym.st_value / sizeof(struct bpf_map_def);
if (map_idx >= obj->nr_maps) {
pr_warning("index of map \"%s\" is buggy: %zu > %zu\n",
   map_name, map_idx, obj->nr_maps);
continue;
}
+   obj->maps[map_idx].offset = sym.st_value;
obj->maps[map_idx].name = strdup(map_name);
if (!obj->maps[map_idx].name) {
pr_warning("failed to alloc map name\n");
@@ -587,6 +600,7 @@ bpf_object__init_maps_name(struct bpf_object *obj)
}
pr_debug("map %zu is \"%s\"\n", map_idx,
 obj->maps[map_idx].name);
+   map_idx++;
}
return 0;
  }
@@ -647,8 +661,6 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
data->d_buf,
data->d_size);
else if (strcmp(name, "maps") == 0) {
-   err = bpf_object__init_maps(obj, data->d_buf,
-   data->d_size);
obj->efile.maps_shndx = idx;
} else if (sh.sh_type

[PATCH v5] Net Driver: Add Cypress GX3 VID=04b4 PID=3610.

2016-11-08 Thread chris.roth

From: Allan Chou 

Add support for Cypress GX3 SuperSpeed to Gigabit Ethernet
Bridge Controller (Vendor=04b4 ProdID=3610).

Patch verified on x64 linux kernel 4.7.4, 4.8.6, 4.9-rc4 systems
with the Kensington SD4600P USB-C Universal Dock with Power,
which uses the Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge
Controller.

A similar patch was signed-off and tested-by Allan Chou
 on 2015-12-01.

Allan verified his similar patch on x86 Linux kernel 4.1.6 system
with Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller.

Tested-by: Allan Chou 
Tested-by: Chris Roth 
Tested-by: Artjom Simon 

Signed-off-by: Allan Chou 
Signed-off-by: Chris Roth 
---
Changes in v4, v5:
 - Add verification of patch on 4.8.6, 4.9-rc4 (v4)
 - Add tester Artjom Simon  (v4)
 - Reformat spaces to tabs (v5)

 drivers/net/usb/ax88179_178a.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index e6338c1..8a6675d 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -1656,6 +1656,19 @@ static const struct driver_info ax88178a_info = {
.tx_fixup = ax88179_tx_fixup,
 };
 
+static const struct driver_info cypress_GX3_info = {
+   .description = "Cypress GX3 SuperSpeed to Gigabit Ethernet Controller",
+   .bind = ax88179_bind,
+   .unbind = ax88179_unbind,
+   .status = ax88179_status,
+   .link_reset = ax88179_link_reset,
+   .reset = ax88179_reset,
+   .stop = ax88179_stop,
+   .flags = FLAG_ETHER | FLAG_FRAMING_AX,
+   .rx_fixup = ax88179_rx_fixup,
+   .tx_fixup = ax88179_tx_fixup,
+};
+
 static const struct driver_info dlink_dub1312_info = {
.description = "D-Link DUB-1312 USB 3.0 to Gigabit Ethernet Adapter",
.bind = ax88179_bind,
@@ -1718,6 +1731,10 @@ static const struct usb_device_id products[] = {
USB_DEVICE(0x0b95, 0x178a),
.driver_info = (unsigned long)&ax88178a_info,
 }, {
+   /* Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller */
+   USB_DEVICE(0x04b4, 0x3610),
+   .driver_info = (unsigned long)&cypress_GX3_info,
+}, {
/* D-Link DUB-1312 USB 3.0 to Gigabit Ethernet Adapter */
USB_DEVICE(0x2001, 0x4a00),
.driver_info = (unsigned long)&dlink_dub1312_info,
-- 
2.7.4

Re: [PATCH net-next 2/3] ptp: igb: Use the high resolution frequency method.

2016-11-08 Thread Keller, Jacob E

On Tue, 2016-11-08 at 22:49 +0100, Richard Cochran wrote:
> The 82580 and related devices offer a frequency resolution of about
> 0.029 ppb.  This patch lets users of the device benefit from the
> increased frequency resolution when tuning the clock.
> 
> Signed-off-by: Richard Cochran 
> ---

Additionally, what about min/max frequency check? Wouldn't this need to
be updated for the new adjfine operation?

Thanks,
Jake

Re: [PATCH net-next 2/3] ptp: igb: Use the high resolution frequency method.

2016-11-08 Thread Keller, Jacob E

On Tue, 2016-11-08 at 22:49 +0100, Richard Cochran wrote:
> The 82580 and related devices offer a frequency resolution of about
> 0.029 ppb.  This patch lets users of the device benefit from the
> increased frequency resolution when tuning the clock.
> 
> Signed-off-by: Richard Cochran 
> ---
>  drivers/net/ethernet/intel/igb/igb_ptp.c | 16 
>  1 file changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c
> b/drivers/net/ethernet/intel/igb/igb_ptp.c
> index a7895c4..c30eea8 100644
> --- a/drivers/net/ethernet/intel/igb/igb_ptp.c
> +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
> @@ -226,7 +226,7 @@ static int igb_ptp_adjfreq_82576(struct
> ptp_clock_info *ptp, s32 ppb)
>   return 0;
>  }
>  
> -static int igb_ptp_adjfreq_82580(struct ptp_clock_info *ptp, s32
> ppb)
> +static int igb_ptp_adjfine_82580(struct ptp_clock_info *ptp, long
> scaled_ppm)
>  {
>   struct igb_adapter *igb = container_of(ptp, struct
> igb_adapter,
>      ptp_caps);
> @@ -235,13 +235,13 @@ static int igb_ptp_adjfreq_82580(struct
> ptp_clock_info *ptp, s32 ppb)
>   u64 rate;
>   u32 inca;
>  
> - if (ppb < 0) {
> + if (scaled_ppm < 0) {
>   neg_adj = 1;
> - ppb = -ppb;
> + scaled_ppm = -scaled_ppm;
>   }
> - rate = ppb;
> - rate <<= 26;
> - rate = div_u64(rate, 1953125);
> + rate = scaled_ppm;
> + rate <<= 13;
> + rate = div_u64(rate, 15625);
>  

I'm curious how you generate the new math here, since this can be
tricky, and I could use more examples in order to port to some of the
other drivers' implementations. I'm not quite sure how to handle the
value when the lower 16 bits are fractional.

Thanks,
Jake

>   inca = rate & INCVALUE_MASK;
>   if (neg_adj)
> @@ -1103,7 +1103,7 @@ void igb_ptp_init(struct igb_adapter *adapter)
>   adapter->ptp_caps.max_adj = 6249;
>   adapter->ptp_caps.n_ext_ts = 0;
>   adapter->ptp_caps.pps = 0;
> - adapter->ptp_caps.adjfreq = igb_ptp_adjfreq_82580;
> + adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580;
>   adapter->ptp_caps.adjtime = igb_ptp_adjtime_82576;
>   adapter->ptp_caps.gettime64 = igb_ptp_gettime_82576;
>   adapter->ptp_caps.settime64 = igb_ptp_settime_82576;
> @@ -1131,7 +1131,7 @@ void igb_ptp_init(struct igb_adapter *adapter)
>   adapter->ptp_caps.n_pins = IGB_N_SDP;
>   adapter->ptp_caps.pps = 1;
>   adapter->ptp_caps.pin_config = adapter->sdp_config;
> - adapter->ptp_caps.adjfreq = igb_ptp_adjfreq_82580;
> + adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580;
>   adapter->ptp_caps.adjtime = igb_ptp_adjtime_i210;
>   adapter->ptp_caps.gettime64 = igb_ptp_gettime_i210;
>   adapter->ptp_caps.settime64 = igb_ptp_settime_i210;

[PATCH] tools lib bpf: fix maps resolution

2016-11-08 Thread Wang Nan

From: Eric Leblond 

It is not correct to assimilate the elf data of the maps section
to an array of map definition. In fact the sizes differ. The
offset provided in the symbol section has to be used instead.

This patch fixes a bug causing a elf with two maps not to load
correctly.

Wang Nan added:

This patch requires a name for each BPF map, so array of BPF maps is
not allowed. This restriction is reasonable, because kernel verifier
forbid indexing BPF map from such array unless the index is a fixed
value, but if the index is fixed why not merging it into name?

For example:

Program like this:
  ...
  unsigned long cpu = get_smp_processor_id();
  int *pval = map_lookup_elem(&map_array[cpu], &key);
  ...

Generates bytecode like this:

0: (b7) r1 = 0
1: (63) *(u32 *)(r10 -4) = r1
2: (b7) r1 = 680997
3: (63) *(u32 *)(r10 -8) = r1
4: (85) call 8
5: (67) r0 <<= 4
6: (18) r1 = 0x112dd000
8: (0f) r0 += r1
9: (bf) r2 = r10
10: (07) r2 += -4
11: (bf) r1 = r0
12: (85) call 1

Where instruction 8 is the computation, 8 and 11 render r1 to an invalid
value for function map_lookup_elem, causing the verifier to report an error.

Signed-off-by: Eric Leblond 
Signed-off-by: Wang Nan 
[Merge bpf_object__init_maps_name into bpf_object__init_maps
 Fix segfault for buggy BPF script
 Validate obj->maps
]
Cc: Alexei Starovoitov 
Cc: Arnaldo Carvalho de Melo 
Cc: Li Zefan 
---
 tools/lib/bpf/libbpf.c | 142 ++---
 1 file changed, 98 insertions(+), 44 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b699aea..96a2b2f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -185,6 +185,7 @@ struct bpf_program {
 struct bpf_map {
int fd;
char *name;
+   size_t offset;
struct bpf_map_def def;
void *priv;
bpf_map_clear_priv_t clear_priv;
@@ -513,57 +514,106 @@ bpf_object__init_kversion(struct bpf_object *obj,
 }
 
 static int
-bpf_object__init_maps(struct bpf_object *obj, void *data,
- size_t size)
+bpf_object__validate_maps(struct bpf_object *obj)
 {
-   size_t nr_maps;
int i;
 
-   nr_maps = size / sizeof(struct bpf_map_def);
-   if (!data || !nr_maps) {
-   pr_debug("%s doesn't need map definition\n",
-obj->path);
+   /*
+* If there's only 1 map, the only error case should have been
+* catched in bpf_object__init_maps().
+*/
+   if (!obj->maps || !obj->nr_maps || (obj->nr_maps == 1))
return 0;
-   }
 
-   pr_debug("maps in %s: %zd bytes\n", obj->path, size);
+   for (i = 1; i < obj->nr_maps; i++) {
+   const struct bpf_map *a = >maps[i - 1];
+   const struct bpf_map *b = >maps[i];
 
-   obj->maps = calloc(nr_maps, sizeof(obj->maps[0]));
-   if (!obj->maps) {
-   pr_warning("alloc maps for object failed\n");
-   return -ENOMEM;
+   if (b->offset - a->offset < sizeof(struct bpf_map_def)) {
+   pr_warning("corrupted map section in %s: map \"%s\" too 
small\n",
+  obj->path, a->name);
+   return -EINVAL;
+   }
}
-   obj->nr_maps = nr_maps;
-
-   for (i = 0; i < nr_maps; i++) {
-   struct bpf_map_def *def = >maps[i].def;
+   return 0;
+}
 
-   /*
-* fill all fd with -1 so won't close incorrect
-* fd (fd=0 is stdin) when failure (zclose won't close
-* negative fd)).
-*/
-   obj->maps[i].fd = -1;
+static int compare_bpf_map(const void *_a, const void *_b)
+{
+   const struct bpf_map *a = _a;
+   const struct bpf_map *b = _b;
 
-   /* Save map definition into obj->maps */
-   *def = ((struct bpf_map_def *)data)[i];
-   }
-   return 0;
+   return a->offset - b->offset;
 }
 
 static int
-bpf_object__init_maps_name(struct bpf_object *obj)
+bpf_object__init_maps(struct bpf_object *obj)
 {
-   int i;
+   int i, map_idx, nr_maps = 0;
+   Elf_Scn *scn;
+   Elf_Data *data;
Elf_Data *symbols = obj->efile.symbols;
 
-   if (!symbols || obj->efile.maps_shndx < 0)
+   if (obj->efile.maps_shndx < 0)
+   return -EINVAL;
+   if (!symbols)
+   return -EINVAL;
+
+   scn = elf_getscn(obj->efile.elf, obj->efile.maps_shndx);
+   if (scn)
+   data = elf_getdata(scn, NULL);
+   if (!scn || !data) {
+   pr_warning("failed to get Elf_Data from map section %d\n",
+  obj->efile.maps_shndx);
return -EINVAL;
+   }
 
+   /*
+* Count number of maps. Each map has a name.
+* Array of maps is not supported: only the first element is
+* considered.
+

Re: [PATCH net-next 0/3] PHC frequency fine tuning

2016-11-08 Thread Keller, Jacob E

On Tue, 2016-11-08 at 22:49 +0100, Richard Cochran wrote:
> This series expands the PTP Hardware Clock subsystem by adding a
> method that passes the frequency tuning word to the the drivers
> without dropping the low order bits.  Keeping those bits is useful
> for
> drivers whose frequency resolution is higher than 1 ppb.
> 

Makes sense.

> The appended script (below) runs a simple demonstration of the
> improvement.  This test needs two Intel i210 PCIe cards installed in
> the same PC, with their SDP0 pins connected by copper
> wire.  Measuring
> the estimated offset (from the ptp4l servo) and the true offset (from
> the PPS) over one hour yields the following statistics.
> 
> > 
> >    |   Est. Before |Est. After |   True Before |True
> > After |
> > +---+---+---+
> > ---|
> > min| -5.20e+01 | -1.60e+01 | -3.10e+01 |
> > -1.00e+00 |
> > max| +5.70e+01 | +2.50e+01 | +8.50e+01 |
> > +4.00e+01 |
> > pk-pk: | +1.09e+02 | +4.10e+01 | +1.16e+02 |
> > +4.10e+01 |
> > mean   | +6.47e-02 | +1.28e-02 | +2.422083e+01 |
> > +1.826083e+01 |
> > stddev | +1.158006e+01 | +4.581982e+00 | +1.207708e+01 |
> > +4.981435e+00 |
> 
> Here the numbers in units of nanoseconds, and the ~20 nanosecond PPS
> offset is due to input/output delays on the i210's external interface
> logic.
> 
> With the series applied, both the peak to peak error and the standard
> deviation improve by a factor of more than two.  These two graphs
> show
> the improvement nicely.
> 
>   http://linuxptp.sourceforge.net/fine-tuning/fine-est.png
> 
>   http://linuxptp.sourceforge.net/fine-tuning/fine-tru.png
> 

Wow, nice! I'll take a look at the actual patches in a few minutes, but
this is a really nice improvement!

Thanks,
Jake

> 
> Thanks,
> Richard
> 
> Richard Cochran (3):
>   ptp: Introduce a high resolution frequency adjustment method.
>   ptp: igb: Use the high resolution frequency method.
>   ptp: dp83640: Use the high resolution frequency method.
> 
>  drivers/net/ethernet/intel/igb/igb_ptp.c | 16 
>  drivers/net/phy/dp83640.c| 14 +++---
>  drivers/ptp/ptp_clock.c  |  5 -
>  include/linux/ptp_clock_kernel.h |  8 
>  4 files changed, 27 insertions(+), 16 deletions(-)
>

[PATCH net-next 3/3] ptp: dp83640: Use the high resolution frequency method.

2016-11-08 Thread Richard Cochran

The dp83640 has a frequency resolution of about 0.029 ppb.
This patch lets users of the device benefit from the
increased frequency resolution when tuning the clock.

Signed-off-by: Richard Cochran 
---
 drivers/net/phy/dp83640.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 7a240fc..e2460a5 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -375,7 +375,7 @@ static int periodic_output(struct dp83640_clock *clock,
 
 /* ptp clock methods */
 
-static int ptp_dp83640_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
+static int ptp_dp83640_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 {
struct dp83640_clock *clock =
container_of(ptp, struct dp83640_clock, caps);
@@ -384,13 +384,13 @@ static int ptp_dp83640_adjfreq(struct ptp_clock_info 
*ptp, s32 ppb)
int neg_adj = 0;
u16 hi, lo;
 
-   if (ppb < 0) {
+   if (scaled_ppm < 0) {
neg_adj = 1;
-   ppb = -ppb;
+   scaled_ppm = -scaled_ppm;
}
-   rate = ppb;
-   rate <<= 26;
-   rate = div_u64(rate, 1953125);
+   rate = scaled_ppm;
+   rate <<= 13;
+   rate = div_u64(rate, 15625);
 
hi = (rate >> 16) & PTP_RATE_HI_MASK;
if (neg_adj)
@@ -1035,7 +1035,7 @@ static void dp83640_clock_init(struct dp83640_clock 
*clock, struct mii_bus *bus)
clock->caps.n_per_out   = N_PER_OUT;
clock->caps.n_pins  = DP83640_N_PINS;
clock->caps.pps = 0;
-   clock->caps.adjfreq = ptp_dp83640_adjfreq;
+   clock->caps.adjfine = ptp_dp83640_adjfine;
clock->caps.adjtime = ptp_dp83640_adjtime;
clock->caps.gettime64   = ptp_dp83640_gettime;
clock->caps.settime64   = ptp_dp83640_settime;
-- 
2.1.4

[PATCH net-next 2/3] ptp: igb: Use the high resolution frequency method.

2016-11-08 Thread Richard Cochran

The 82580 and related devices offer a frequency resolution of about
0.029 ppb.  This patch lets users of the device benefit from the
increased frequency resolution when tuning the clock.

Signed-off-by: Richard Cochran 
---
 drivers/net/ethernet/intel/igb/igb_ptp.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c 
b/drivers/net/ethernet/intel/igb/igb_ptp.c
index a7895c4..c30eea8 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -226,7 +226,7 @@ static int igb_ptp_adjfreq_82576(struct ptp_clock_info 
*ptp, s32 ppb)
return 0;
 }
 
-static int igb_ptp_adjfreq_82580(struct ptp_clock_info *ptp, s32 ppb)
+static int igb_ptp_adjfine_82580(struct ptp_clock_info *ptp, long scaled_ppm)
 {
struct igb_adapter *igb = container_of(ptp, struct igb_adapter,
   ptp_caps);
@@ -235,13 +235,13 @@ static int igb_ptp_adjfreq_82580(struct ptp_clock_info 
*ptp, s32 ppb)
u64 rate;
u32 inca;
 
-   if (ppb < 0) {
+   if (scaled_ppm < 0) {
neg_adj = 1;
-   ppb = -ppb;
+   scaled_ppm = -scaled_ppm;
}
-   rate = ppb;
-   rate <<= 26;
-   rate = div_u64(rate, 1953125);
+   rate = scaled_ppm;
+   rate <<= 13;
+   rate = div_u64(rate, 15625);
 
inca = rate & INCVALUE_MASK;
if (neg_adj)
@@ -1103,7 +1103,7 @@ void igb_ptp_init(struct igb_adapter *adapter)
adapter->ptp_caps.max_adj = 6249;
adapter->ptp_caps.n_ext_ts = 0;
adapter->ptp_caps.pps = 0;
-   adapter->ptp_caps.adjfreq = igb_ptp_adjfreq_82580;
+   adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580;
adapter->ptp_caps.adjtime = igb_ptp_adjtime_82576;
adapter->ptp_caps.gettime64 = igb_ptp_gettime_82576;
adapter->ptp_caps.settime64 = igb_ptp_settime_82576;
@@ -1131,7 +1131,7 @@ void igb_ptp_init(struct igb_adapter *adapter)
adapter->ptp_caps.n_pins = IGB_N_SDP;
adapter->ptp_caps.pps = 1;
adapter->ptp_caps.pin_config = adapter->sdp_config;
-   adapter->ptp_caps.adjfreq = igb_ptp_adjfreq_82580;
+   adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580;
adapter->ptp_caps.adjtime = igb_ptp_adjtime_i210;
adapter->ptp_caps.gettime64 = igb_ptp_gettime_i210;
adapter->ptp_caps.settime64 = igb_ptp_settime_i210;
-- 
2.1.4

[PATCH net-next 0/3] PHC frequency fine tuning

2016-11-08 Thread Richard Cochran

This series expands the PTP Hardware Clock subsystem by adding a
method that passes the frequency tuning word to the drivers
without dropping the low order bits.  Keeping those bits is useful for
drivers whose frequency resolution is higher than 1 ppb.

The appended script (below) runs a simple demonstration of the
improvement.  This test needs two Intel i210 PCIe cards installed in
the same PC, with their SDP0 pins connected by copper wire.  Measuring
the estimated offset (from the ptp4l servo) and the true offset (from
the PPS) over one hour yields the following statistics.

||   Est. Before |Est. After |   True Before |True After |
|+---+---+---+---|
| min| -5.20e+01 | -1.60e+01 | -3.10e+01 | -1.00e+00 |
| max| +5.70e+01 | +2.50e+01 | +8.50e+01 | +4.00e+01 |
| pk-pk: | +1.09e+02 | +4.10e+01 | +1.16e+02 | +4.10e+01 |
| mean   | +6.47e-02 | +1.28e-02 | +2.422083e+01 | +1.826083e+01 |
| stddev | +1.158006e+01 | +4.581982e+00 | +1.207708e+01 | +4.981435e+00 |

Here the numbers are in units of nanoseconds, and the ~20 nanosecond PPS
offset is due to input/output delays on the i210's external interface
logic.

With the series applied, both the peak to peak error and the standard
deviation improve by a factor of more than two.  These two graphs show
the improvement nicely.

  http://linuxptp.sourceforge.net/fine-tuning/fine-est.png

  http://linuxptp.sourceforge.net/fine-tuning/fine-tru.png


Thanks,
Richard

Richard Cochran (3):
  ptp: Introduce a high resolution frequency adjustment method.
  ptp: igb: Use the high resolution frequency method.
  ptp: dp83640: Use the high resolution frequency method.

 drivers/net/ethernet/intel/igb/igb_ptp.c | 16 
 drivers/net/phy/dp83640.c| 14 +++---
 drivers/ptp/ptp_clock.c  |  5 -
 include/linux/ptp_clock_kernel.h |  8 
 4 files changed, 27 insertions(+), 16 deletions(-)

-- 
2.1.4

---
#!/bin/sh

set -e
set -x

killall ptp4l || true

DUR=3600
ETHA=eth6
ETHB=eth3
DEVA=/dev/ptp`ethtool -T $ETHA | awk '/PTP/ {print $4}'`
DEVB=/dev/ptp`ethtool -T $ETHB | awk '/PTP/ {print $4}'`

testptp -d $DEVA -p 0

for x in $DEVA $DEVB; do
testptp -d $x -f 0
testptp -d $x -s
done

testptp -d $DEVA -L 0,2  # periodic output
testptp -d $DEVB -L 0,1  # external time stamp
testptp -d $DEVA -p 20

ptp4l -m -q -2 -i $ETHA > log.master &
ptp4l -m -q -2 -i $ETHB -s > log.slave &

sleep 60
testptp -d $DEVB -e $DUR > log.pps
tail -n $DUR log.slave > log.est

killall ptp4l

[PATCH net-next 1/3] ptp: Introduce a high resolution frequency adjustment method.

2016-11-08 Thread Richard Cochran

The internal PTP Hardware Clock (PHC) interface limits the resolution for
frequency adjustments to one part per billion.  However, some hardware
devices allow finer adjustment, and making use of the increased resolution
improves synchronization measurably on such devices.

This patch adds an alternative method that allows finer frequency tuning
by passing the scaled ppm value to PHC drivers.  This value comes from
user space, and it has a resolution of about 0.015 ppb.  We also deprecate
the older method, anticipating its removal once existing drivers have been
converted over.

Signed-off-by: Richard Cochran 
Suggested-by: Ulrik De Bie 
---
 drivers/ptp/ptp_clock.c  | 5 -
 include/linux/ptp_clock_kernel.h | 8 
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 86280b7..9c13381 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -153,7 +153,10 @@ static int ptp_clock_adjtime(struct posix_clock *pc, 
struct timex *tx)
s32 ppb = scaled_ppm_to_ppb(tx->freq);
if (ppb > ops->max_adj || ppb < -ops->max_adj)
return -ERANGE;
-   err = ops->adjfreq(ops, ppb);
+   if (ops->adjfine)
+   err = ops->adjfine(ops, tx->freq);
+   else
+   err = ops->adjfreq(ops, ppb);
ptp->dialed_frequency = tx->freq;
} else if (tx->modes == 0) {
tx->freq = ptp->dialed_frequency;
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 5ad54fc6..b76d47a 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -58,7 +58,14 @@ struct system_device_crosststamp;
  *
  * clock operations
  *
+ * @adjfine:  Adjusts the frequency of the hardware clock.
+ *parameter scaled_ppm: Desired frequency offset from
+ *nominal frequency in parts per million, but with a
+ *16 bit binary fractional field.
+ *
  * @adjfreq:  Adjusts the frequency of the hardware clock.
+ *This method is deprecated.  New drivers should implement
+ *the @adjfine method instead.
  *parameter delta: Desired frequency offset from nominal frequency
  *in parts per billion
  *
@@ -108,6 +115,7 @@ struct ptp_clock_info {
int n_pins;
int pps;
struct ptp_pin_desc *pin_config;
+   int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm);
int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta);
int (*adjtime)(struct ptp_clock_info *ptp, s64 delta);
int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts);
-- 
2.1.4

Re: [PATCH] bpf: Remove unused but set variables

2016-11-08 Thread Alexei Starovoitov

On Tue, Nov 08, 2016 at 04:40:28PM +0100, Tobias Klauser wrote:
> Remove the unused but set variables min_set and max_set in
> adjust_reg_min_max_vals to fix the following warning when building with
> 'W=1':
> 
>   kernel/bpf/verifier.c:1483:7: warning: variable ‘min_set’ set but not used 
> [-Wunused-but-set-variable]
> 
> There is no warning about max_set being unused, but since it is only
> used in the assignment of min_set it can be removed as well.
> 
> They were introduced in commit 484611357c19 ("bpf: allow access into map
> value arrays") but seem to have never been used.
> 
> Cc: Josef Bacik 
> Signed-off-by: Tobias Klauser 
> ---

Acked-by: Alexei Starovoitov

Re: [RFC PATCH net-next] net: ethtool: add support for forward error correction modes

2016-11-08 Thread Vidya Sagar Ravipati

On Thu, Nov 3, 2016 at 6:24 AM, Gal Pressman  wrote:
>
>
> On 25/10/2016 05:50, Vidya Sagar Ravipati wrote:
>> SET FEC option:
>> root@tor: ethtool --set-fec  swp1 encoding [off | RS | BaseR | auto] autoneg 
>> [off | on]
>>
>> Encoding: Types of encoding
>> Off:  Turning off any encoding
>> RS :  enforcing RS-FEC encoding on supported speeds
>> BaseR  :  enforcing Base R encoding on supported speeds
>> Auto   :  Default FEC settings  for  divers , and would represent
>
> divers? :)
Drivers :)
>
>>   asking the hardware to essentially go into a best effort mode.
>>
>> Here are a few examples of what we would expect if encoding=auto:
>> - if autoneg is on, we are  expecting FEC to be negotiated as on or off
>>   as long as protocol supports it
>> - if the hardware is capable of detecting the FEC encoding on it's
>>   receiver it will reconfigure its encoder to match
>> - in absence of the above, the configuration would be set to IEEE
>>   defaults.
>
> Not sure I follow, why do we need an autoneg option if encoding type can be 
> set to auto?
Auto is one of the FEC configuration modes  which indicates  the
drivers to set the IEEE defaults based on speed/duplex combination of
the port. i.e. RS FEC mode will be set in case of 100G/full

Auto negotiation is the configuration for the link  to negotiate the
FEC capabilities and different FEC modes with other endpoint  using
encoded bits D44:47 in base link code word.

[iproute PATCH 1/2] ipaddress: Simplify vf_info parsing

2016-11-08 Thread Phil Sutter

Commit 7b8179c780a1a ("iproute2: Add new command to ip link to
enable/disable VF spoof check") tried to add support for
IFLA_VF_SPOOFCHK in a backwards-compatible manner, but apparently overdid
it: parse_rtattr_nested() handles missing attributes perfectly fine in
that it will leave the relevant field unassigned so calling code can
just compare against NULL. There is no need to layback from the previous
(IFLA_VF_TX_RATE) attribute to the next to check if IFLA_VF_SPOOFCHK is
present or not. To the contrary, it establishes a potentially incorrect
assumption of these two attributes directly following each other which
may not be the case (although up to now, kernel aligns them this way).

This patch cleans up the code to adhere to the common way of checking
for attribute existence. It has been tested to return correct results
regardless of whether the kernel exports IFLA_VF_SPOOFCHK or not.

Signed-off-by: Phil Sutter 
---
 ip/ipaddress.c | 44 ++--
 1 file changed, 10 insertions(+), 34 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 7f05258f43453..df0f1b9c94c58 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -322,10 +322,7 @@ static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
 {
struct ifla_vf_mac *vf_mac;
struct ifla_vf_tx_rate *vf_tx_rate;
-   struct ifla_vf_spoofchk *vf_spoofchk;
-   struct ifla_vf_link_state *vf_linkstate;
struct rtattr *vf[IFLA_VF_MAX + 1] = {};
-   struct rtattr *tmp;
 
SPRINT_BUF(b1);
 
@@ -339,31 +336,6 @@ static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
vf_mac = RTA_DATA(vf[IFLA_VF_MAC]);
vf_tx_rate = RTA_DATA(vf[IFLA_VF_TX_RATE]);
 
-   /* Check if the spoof checking vf info type is supported by
-* this kernel.
-*/
-   tmp = (struct rtattr *)((char *)vf[IFLA_VF_TX_RATE] +
-   vf[IFLA_VF_TX_RATE]->rta_len);
-
-   if (tmp->rta_type != IFLA_VF_SPOOFCHK)
-   vf_spoofchk = NULL;
-   else
-   vf_spoofchk = RTA_DATA(vf[IFLA_VF_SPOOFCHK]);
-
-   if (vf_spoofchk) {
-   /* Check if the link state vf info type is supported by
-* this kernel.
-*/
-   tmp = (struct rtattr *)((char *)vf[IFLA_VF_SPOOFCHK] +
-   vf[IFLA_VF_SPOOFCHK]->rta_len);
-
-   if (tmp->rta_type != IFLA_VF_LINK_STATE)
-   vf_linkstate = NULL;
-   else
-   vf_linkstate = RTA_DATA(vf[IFLA_VF_LINK_STATE]);
-   } else
-   vf_linkstate = NULL;
-
fprintf(fp, "%svf %d MAC %s", _SL_, vf_mac->vf,
ll_addr_n2a((unsigned char *)_mac->mac,
ETH_ALEN, 0, b1, sizeof(b1)));
@@ -407,14 +379,18 @@ static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
if (vf_rate->min_tx_rate)
fprintf(fp, ", min_tx_rate %dMbps", 
vf_rate->min_tx_rate);
}
+   if (vf[IFLA_VF_SPOOFCHK]) {
+   struct ifla_vf_spoofchk *vf_spoofchk =
+   RTA_DATA(vf[IFLA_VF_SPOOFCHK]);
 
-   if (vf_spoofchk && vf_spoofchk->setting != -1) {
-   if (vf_spoofchk->setting)
-   fprintf(fp, ", spoof checking on");
-   else
-   fprintf(fp, ", spoof checking off");
+   if (vf_spoofchk->setting != -1)
+   fprintf(fp, ", spoof checking %s",
+   vf_spoofchk->setting ? "on" : "off");
}
-   if (vf_linkstate) {
+   if (vf[IFLA_VF_LINK_STATE]) {
+   struct ifla_vf_link_state *vf_linkstate =
+   RTA_DATA(vf[IFLA_VF_LINK_STATE]);
+
if (vf_linkstate->link_state == IFLA_VF_LINK_STATE_AUTO)
fprintf(fp, ", link-state auto");
else if (vf_linkstate->link_state == IFLA_VF_LINK_STATE_ENABLE)
-- 
2.10.0

[iproute PATCH 2/2] ipaddress: Print IFLA_VF_QUERY_RSS_EN setting

2016-11-08 Thread Phil Sutter

Signed-off-by: Phil Sutter 
---
 ip/ipaddress.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index df0f1b9c94c58..c9f769fb748e4 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -405,6 +405,14 @@ static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
fprintf(fp, ", trust %s",
vf_trust->setting ? "on" : "off");
}
+   if (vf[IFLA_VF_RSS_QUERY_EN]) {
+   struct ifla_vf_rss_query_en *rss_query =
+   RTA_DATA(vf[IFLA_VF_RSS_QUERY_EN]);
+
+   if (rss_query->setting != -1)
+   fprintf(fp, ", query_rss %s",
+   rss_query->setting ? "on" : "off");
+   }
if (vf[IFLA_VF_STATS] && show_stats)
print_vf_stats64(fp, vf[IFLA_VF_STATS]);
 }
-- 
2.10.0

[iproute PATCH 0/2] Resend: Simplify and enhance vf_info parsing

2016-11-08 Thread Phil Sutter

This patch series got lost in a discussion about whether the code the
first patch removes is necessary or not - static analysis as well as my
tests showed it is not. Therefore resending this with updated
description of patch 1 to contain the discussion's gist.

Phil Sutter (2):
  ipaddress: Simplify vf_info parsing
  ipaddress: Print IFLA_VF_QUERY_RSS_EN setting

 ip/ipaddress.c | 52 ++--
 1 file changed, 18 insertions(+), 34 deletions(-)

-- 
2.10.0

Re: [PATCH net-next 1/2] bnxt_en: do not call napi_hash_add()

2016-11-08 Thread Michael Chan

On Tue, Nov 8, 2016 at 11:06 AM, Eric Dumazet  wrote:
>
> From: Eric Dumazet 
>
> This is automatically done from netif_napi_add(), and we want to not
> export napi_hash_add() anymore in the following patch.
>
> Signed-off-by: Eric Dumazet 
> Cc: Michael Chan 

Acked-by: Michael Chan

Re: [PATCH] [RFC] net: phy: phy drivers should not set SUPPORTED_Pause or SUPPORTED_Asym_Pause

2016-11-08 Thread Timur Tabi


On 11/07/2016 10:30 AM, Timur Tabi wrote:


I still don't understand 100% how these flags really work, because I
just can't shake the feeling that they should not be set for every phy.
  If these flags are supposed to be turned on universally, then why are
they even an option?


So I've been giving this more thought.  Can you tell me if the following 
is correct:


1) PHY drivers and/or phylib sets the SUPPORTED_Pause | 
SUPPORTED_AsymPause bits in phydev->supported.  This indicates that the 
PHY supports pause frames.


2) The MAC driver checks phydev->supported before it calls phy_start(). 
 If (SUPPORTED_Pause | SUPPORTED_AsymPause) is set, then it sets those 
bits in phydev->advertising if it wants to enable pause frame support.


3) When the link state changes, the MAC driver checks 
phydev->advertising, and if the bits are set, then it enables those 
features in the MAC.


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.

Re: [PATCH v2] irqchip/renesas-irqc: Postpone driver initialization

2016-11-08 Thread Geert Uytterhoeven

Hi Florian,

On Tue, Nov 8, 2016 at 8:42 PM, Florian Fainelli  wrote:
> On 11/08/2016 11:35 AM, Geert Uytterhoeven wrote:
>> Currently the renesas-irqc driver uses postcore_initcall().
>>
>> However, the new CPG/MSSR driver uses subsys_initcall(). Hence the
>> IRQC's probe will be deferred, which causes the Micrel Ethernet PHY to
>> not find its interrupt on R-Car Gen2 and RZ/G, as the of_mdio subsystem
>> does not support deferred probe yet.
>
> Is not that the more correct fix to implement though?

Sure it is. But nothing has happened since this was reported ca. 1 year ago.
Cfr. "of_mdiobus_register_phy() and deferred probe"
https://lkml.org/lkml/2015/10/22/377

My MDIO foo is not that strong...

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

Re: net/l2tp: use-after-free write in l2tp_ip6_close

2016-11-08 Thread Andrey Konovalov

Hi Cong,

Tried with your patch, still seeing the reports.

Thanks!

On Tue, Nov 8, 2016 at 12:02 AM, Cong Wang  wrote:
> On Mon, Nov 7, 2016 at 2:35 PM, Andrey Konovalov  
> wrote:
>> Hi,
>>
>> I've got the following error report while running the syzkaller fuzzer:
>>
>> ==
>> BUG: KASAN: use-after-free in l2tp_ip6_close+0x239/0x2a0 at addr
>> 8800677276d8
>> Write of size 8 by task a.out/8668
>> CPU: 0 PID: 8668 Comm: a.out Not tainted 4.9.0-rc4+ #354
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>  8800694d7b00 81b46a64 88006adb5780 8800677276c0
>>  880067727c68 8800677276c0 8800694d7b28 8150a86c
>>  8800694d7bb8 88006adb5780 8800e77276d8 8800694d7ba8
>> Call Trace:
>>  [< inline >] __dump_stack lib/dump_stack.c:15
>>  [] dump_stack+0xb3/0x10f lib/dump_stack.c:51
>>  [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:156
>>  [< inline >] print_address_description mm/kasan/report.c:194
>>  [] kasan_report_error+0x1f7/0x4d0 mm/kasan/report.c:283
>>  [< inline >] kasan_report mm/kasan/report.c:303
>>  [] __asan_report_store8_noabort+0x3e/0x40
>> mm/kasan/report.c:329
>>  [< inline >] __write_once_size ./include/linux/compiler.h:272
>>  [< inline >] __hlist_del ./include/linux/list.h:622
>>  [< inline >] hlist_del_init ./include/linux/list.h:637
>>  [] l2tp_ip6_close+0x239/0x2a0 net/l2tp/l2tp_ip6.c:239
>>  [] inet_release+0xef/0x1c0 net/ipv4/af_inet.c:415
>>  [] inet6_release+0x50/0x70 net/ipv6/af_inet6.c:422
>>  [] sock_release+0x8e/0x1d0 net/socket.c:570
>>  [] sock_close+0x16/0x20 net/socket.c:1017
>>  [] __fput+0x29d/0x720 fs/file_table.c:208
>>  [] fput+0x15/0x20 fs/file_table.c:244
>>  [] task_work_run+0xf8/0x170 kernel/task_work.c:116
>>  [< inline >] exit_task_work ./include/linux/task_work.h:21
>>  [] do_exit+0x883/0x2ac0 kernel/exit.c:828
>>  [] do_group_exit+0x10e/0x340 kernel/exit.c:931
>>  [< inline >] SYSC_exit_group kernel/exit.c:942
>>  [] SyS_exit_group+0x1d/0x20 kernel/exit.c:940
>>  [] entry_SYSCALL_64_fastpath+0x1f/0xc2
>> arch/x86/entry/entry_64.S:209
>
> I guess we need to lock the sock for l2tp_ip6_disconnect() too.
>
> diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
> index ad3468c..ea2ae66 100644
> --- a/net/l2tp/l2tp_ip6.c
> +++ b/net/l2tp/l2tp_ip6.c
> @@ -410,7 +410,7 @@ static int l2tp_ip6_disconnect(struct sock *sk, int flags)
> if (sock_flag(sk, SOCK_ZAPPED))
> return 0;
>
> -   return __udp_disconnect(sk, flags);
> +   return udp_disconnect(sk, flags);
>  }
>
>  static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
>
> --
> You received this message because you are subscribed to the Google Groups 
> "syzkaller" group.
> To unsubscribe from this group and stop receiving emails from it, send an 
> email to syzkaller+unsubscr...@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.

Re: [PATCH v2] irqchip/renesas-irqc: Postpone driver initialization

2016-11-08 Thread Florian Fainelli

On 11/08/2016 11:35 AM, Geert Uytterhoeven wrote:
> Currently the renesas-irqc driver uses postcore_initcall().
> 
> However, the new CPG/MSSR driver uses subsys_initcall(). Hence the
> IRQC's probe will be deferred, which causes the Micrel Ethernet PHY to
> not find its interrupt on R-Car Gen2 and RZ/G, as the of_mdio subsystem
> does not support deferred probe yet.

Is not that the more correct fix to implement though?
-- 
Florian

[PATCH v2] irqchip/renesas-irqc: Postpone driver initialization

2016-11-08 Thread Geert Uytterhoeven

Currently the renesas-irqc driver uses postcore_initcall().

However, the new CPG/MSSR driver uses subsys_initcall(). Hence the
IRQC's probe will be deferred, which causes the Micrel Ethernet PHY to
not find its interrupt on R-Car Gen2 and RZ/G, as the of_mdio subsystem
does not support deferred probe yet.

Replace postcore_initcall() by device_initcall() to work around this.

Note that on R-Mobile APE6, where the PFC/GPIO combo uses the IRQC as
its parent interrupt controller, this does cause a few additional probe
deferrals (for SCIFA0, SD0, SD1, and MMC). But the affected drivers
handle that fine.

Signed-off-by: Geert Uytterhoeven 
Tested-by: Sergei Shtylyov 
---
v2:
  - Drop RFC state,
  - Add Tested-by,
  - Improved description.
---
 drivers/irqchip/irq-renesas-irqc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-renesas-irqc.c 
b/drivers/irqchip/irq-renesas-irqc.c
index 52304b139aa46a60..992849e54d00ea77 100644
--- a/drivers/irqchip/irq-renesas-irqc.c
+++ b/drivers/irqchip/irq-renesas-irqc.c
@@ -295,7 +295,7 @@ static int __init irqc_init(void)
 {
return platform_driver_register(_device_driver);
 }
-postcore_initcall(irqc_init);
+device_initcall(irqc_init);
 
 static void __exit irqc_exit(void)
 {
-- 
1.9.1

Re: [Intel-wired-lan] [PATCH] igb: use igb_adapter->io_addr instead of e1000_hw->hw_addr

2016-11-08 Thread Alexander Duyck

On Tue, Nov 8, 2016 at 10:37 AM, Corinna Vinschen  wrote:
> On Nov  8 09:16, Hisashi T Fujinaka wrote:
>> On Tue, 8 Nov 2016, Corinna Vinschen wrote:
>> > On Nov  8 15:06, Cao jin wrote:
>> > > When running as guest, under certain condition, it will oops as 
>> > > following.
>> > > writel() in igb_configure_tx_ring() results in oops, because hw->hw_addr
>> > > is NULL. While other register access won't oops kernel because they use
>> > > wr32/rd32 which have a defense against NULL pointer.
>> > > [...]
>> >
>> > Incidentally we're just looking for a solution to that problem too.
>> > Do three patches to fix the same problem at rougly the same time already
>> > qualify as freak accident?
>> >
>> > FTR, I attached my current patch, which I was planning to submit after
>> > some external testing.
>> >
>> > However, all three patches have one thing in common:  They workaround
>> > a somewhat dubious resetting of the hardware address to NULL in case
>> > reading from a register failed.
>> >
>> > That makes me wonder if setting the hardware address to NULL in
>> > rd32/igb_rd32 is really such a good idea.  It's performed in a function
>> > which return value is *never* tested for validity in the calling
>> > functions and leads to subsequent crashes since no tests for hw_addr ==
>> > NULL are performed.
>> >
>> > Maybe commit 22a8b2915 should be reconsidered?  Isn't there some more
>> > graceful way to handle the "surprise removal"?
>>
>> Answering this from my home account because, well, work is Outlook.
>>
>> "Reconsidering" would be great. In fact, revert if if you'd like. I'm
>> uncertain that the surprise removal code actually works the way I
>> thought previously and I think I took a lot of it out of my local code.
>>
>> Unfortuantely I don't have any equipment that I can use to reproduce
>> surprise removal any longer so that means I wouldn't be able to test
>> anything. I have to defer to you or Cao Jin.
>
> I'm not too keen to rip out a PCIe NIC under power from my locale
> desktop machine, but I think an actual surprise removal is not the
> problem.
>
> As described in my git log entry, the error condition in igb_rd32 can be
> triggered during a suspend.  The HW has been put into a sleep state but
> some register read requests are apparently not guarded against that
> situation.  Reading a register in this state returns -1, thus a suspend
> is erroneously triggering the "surprise removal" sequence.

The question I would have is what is reading the device when it is in
this state.  The watchdog and any other functions that would read the
device should be disabled.

One possibility could be a race between a call to igb_close and the
igb_suspend function.  We have seen some of those pop up recently on
ixgbe and it looks like igb has the same bug.  We should probably be
using the rtnl_lock to guarantee that netif_device_detach and the call
to __igb_close are completed before igb_close could possibly be called
by the network stack.

> Here's a raw idea:
>
> - Note that device is suspended in e1000_hw struct.  Don't trigger
>   error sequence in igb_rd32 if so (...and return a 0 value???)

The thing is that a suspended device should not be accessed at all.
If we are accessing it while it is suspended then that is a bug.  If
you could throw a WARN_ON call in igb_rd32 to capture where this is
being triggered that might be useful.

> - Otherwise assume it's actually a surprise removal.  In theory that
>   should somehow trigger a device removal sequence, kind of like
>   calling igb_remove, no?

Well a read of the MMIO region while suspended is more of a surprise
read since there shouldn't be anything going on.  We need to isolate
where that read is coming from and fix it.

Thanks.

- Alex

Re: [PATCH 13/17] batman-adv: Consume skb in receive handlers

2016-11-08 Thread Sven Eckelmann

On Dienstag, 8. November 2016 09:43:01 CET Eric Dumazet wrote:
[...]
> Sure, but your patch 13/17 should address this right away.
[...]

Fair enough. I've asked Simon to resubmit the patches with the
"consume_skb -> conditional kfree_skb/consume_skb" patch squashed
into patch 13.

Kind regards,
Sven


signature.asc
Description: This is a digitally signed message part.

Re: [PATCH net-next v4] cadence: Add LSO support.

2016-11-08 Thread Florian Fainelli

On 11/08/2016 05:41 AM, Rafal Ozieblo wrote:
> New Cadence GEM hardware support Large Segment Offload (LSO):
> TCP segmentation offload (TSO) as well as UDP fragmentation
> offload (UFO). Support for those features was added to the driver.
> 
> Signed-off-by: Rafal Ozieblo 
> ---

> -#define MACB_MAX_TX_LEN  ((unsigned int)((1 << 
> MACB_TX_FRMLEN_SIZE) - 1))
> -#define GEM_MAX_TX_LEN   ((unsigned int)((1 << 
> GEM_TX_FRMLEN_SIZE) - 1))
> +/* Max length of transmit frame must be a multiple of 8 bytes */
> +#define MACB_TX_LEN_ALIGN 8
> +#define MACB_MAX_TX_LEN  ((unsigned int)((1 << 
> MACB_TX_FRMLEN_SIZE) - 1) & ~((unsigned int)(MACB_TX_LEN_ALIGN - 1)))
> +#define GEM_MAX_TX_LEN   ((unsigned int)((1 << 
> GEM_TX_FRMLEN_SIZE) - 1) & ~((unsigned int)(MACB_TX_LEN_ALIGN - 1)))
>  
>  #define GEM_MTU_MIN_SIZE ETH_MIN_MTU
> +#define MACB_NETIF_LSO   (NETIF_F_TSO | NETIF_F_UFO)

Not a huge fan of this definition, since it is always used in conjunction
with netdev_features_t, having it expanded all the time is kind of nicer
for the reader, but this is just personal preference here.

>  
>  #define MACB_WOL_HAS_MAGIC_PACKET (0x1 << 0)
>  #define MACB_WOL_ENABLED (0x1 << 1)
> @@ -1223,7 +1228,8 @@ static void macb_poll_controller(struct net_device *dev)
>  
>  static unsigned int macb_tx_map(struct macb *bp,
>   struct macb_queue *queue,
> - struct sk_buff *skb)
> + struct sk_buff *skb,
> + unsigned int hdrlen)
>  {
>   dma_addr_t mapping;
>   unsigned int len, entry, i, tx_head = queue->tx_head;
> @@ -1231,14 +1237,27 @@ static unsigned int macb_tx_map(struct macb *bp,
>   struct macb_dma_desc *desc;
>   unsigned int offset, size, count = 0;
>   unsigned int f, nr_frags = skb_shinfo(skb)->nr_frags;
> - unsigned int eof = 1;
> - u32 ctrl;
> + unsigned int eof = 1, mss_mfs = 0;
> + u32 ctrl, lso_ctrl = 0, seq_ctrl = 0;
> +
> + /* LSO */
> + if (skb_shinfo(skb)->gso_size != 0) {
> + if (IPPROTO_UDP == (ip_hdr(skb)->protocol))

Most checks are usually done the other way with the left and right
member swapped.

> + /* UDP - UFO */
> + lso_ctrl = MACB_LSO_UFO_ENABLE;
> + else
> + /* TCP - TSO */
> + lso_ctrl = MACB_LSO_TSO_ENABLE;
> + }

>  
>   /* Then, map paged data from fragments */
> @@ -1311,6 +1332,20 @@ static unsigned int macb_tx_map(struct macb *bp,
>   desc = &queue->tx_ring[entry];
>   desc->ctrl = ctrl;
>  
> + if (lso_ctrl) {
> + if (lso_ctrl == MACB_LSO_UFO_ENABLE)
> + /* include header and FCS in value given to h/w */
> + mss_mfs = skb_shinfo(skb)->gso_size +
> + skb_transport_offset(skb) + 4;

ETH_FCS_LEN instead of 4?


> +static netdev_features_t macb_features_check(struct sk_buff *skb,
> +  struct net_device *dev,
> +  netdev_features_t features)
> +{
> + unsigned int nr_frags, f;
> + unsigned int hdrlen;
> +
> + /* Validate LSO compatibility */
> +
> + /* there is only one buffer */
> + if (!skb_is_nonlinear(skb))
> + return features;
> +
> + /* length of header */
> + hdrlen = skb_transport_offset(skb);
> + if (IPPROTO_TCP == (ip_hdr(skb)->protocol))
> + hdrlen += tcp_hdrlen(skb);

Same here, please reverse the left and right members, no need for
parentheses around ip_hdr(skb)->protocol.

> +
> + /* For LSO:
> +  * When software supplies two or more payload buffers all payload 
> buffers
> +  * apart from the last must be a multiple of 8 bytes in size.
> +  */
> + if (!IS_ALIGNED(skb_headlen(skb) - hdrlen, MACB_TX_LEN_ALIGN))
> + return features & ~MACB_NETIF_LSO;
> +
> + nr_frags = skb_shinfo(skb)->nr_frags;
> + /* No need to check last fragment */
> + nr_frags--;
> + for (f = 0; f < nr_frags; f++) {
> + const skb_frag_t *frag = &skb_shinfo(skb)->frags[f];
> +
> + if (!IS_ALIGNED(skb_frag_size(frag), MACB_TX_LEN_ALIGN))
> + return features & ~MACB_NETIF_LSO;
> + }
> + return features;
> +}
> +
>  static inline int macb_clear_csum(struct sk_buff *skb)
>  {
>   /* no change for packets without checksum offloading */
> @@ -1374,7 +1456,27 @@ static int macb_start_xmit(struct sk_buff *skb, struct 
> net_device *dev)
>   struct macb *bp = netdev_priv(dev);
>   struct macb_queue *queue = &bp->queues[queue_index];
>   unsigned long flags;
> - unsigned int count, nr_frags, frag_size, f;
> + unsigned int desc_cnt, nr_frags, frag_size, f;
> + unsigned int is_lso = 0, is_udp, hdrlen;
> +
> + is_lso

[PATCH net-next 2/2] net: napi_hash_add() is no longer exported

2016-11-08 Thread Eric Dumazet

From: Eric Dumazet 

There are no more users except from net/core/dev.c
napi_hash_add() can now be static.

Signed-off-by: Eric Dumazet 
Cc: Michael Chan 
---
 include/linux/netdevice.h |   11 ---
 net/core/dev.c|3 +--
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 
66fd61c681d90d4a7ecc3bf7bae44c2b3b1fe10c..d64135a0ab718b9e119646b74f92901e8fe4b356
 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -468,17 +468,6 @@ static inline void napi_complete(struct napi_struct *n)
 }
 
 /**
- * napi_hash_add - add a NAPI to global hashtable
- * @napi: NAPI context
- *
- * Generate a new napi_id and store a @napi under it in napi_hash.
- * Used for busy polling (CONFIG_NET_RX_BUSY_POLL).
- * Note: This is normally automatically done from netif_napi_add(),
- * so might disappear in a future Linux version.
- */
-void napi_hash_add(struct napi_struct *napi);
-
-/**
  * napi_hash_del - remove a NAPI from global table
  * @napi: NAPI context
  *
diff --git a/net/core/dev.c b/net/core/dev.c
index 
0260ad314506c621215a0e3449392bc01aad55ca..e2148d9845868d7ca5d2c6b853cb0fbe18b32163
 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5017,7 +5017,7 @@ EXPORT_SYMBOL(sk_busy_loop);
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
-void napi_hash_add(struct napi_struct *napi)
+static void napi_hash_add(struct napi_struct *napi)
 {
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
@@ -5037,7 +5037,6 @@ void napi_hash_add(struct napi_struct *napi)
 
spin_unlock(&napi_hash_lock);
 }
-EXPORT_SYMBOL_GPL(napi_hash_add);
 
 /* Warning : caller is responsible to make sure rcu grace period
  * is respected before freeing memory containing @napi

[PATCH net-next 1/2] bnxt_en: do not call napi_hash_add()

2016-11-08 Thread Eric Dumazet

From: Eric Dumazet 

This is automatically done from netif_napi_add(), and we want to not
export napi_hash_add() anymore in the following patch.

Signed-off-by: Eric Dumazet 
Cc: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 
a042da1ff4b90e9aae4f76db71c99c2c4da321d3..d313b02485a10b2b7995076578ce3632865b475f
 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4954,7 +4954,6 @@ static void bnxt_init_napi(struct bnxt *bp)
bnapi = bp->bnapi[cp_nr_rings];
netif_napi_add(bp->dev, &bnapi->napi,
   bnxt_poll_nitroa0, 64);
-   napi_hash_add(&bnapi->napi);
}
} else {
bnapi = bp->bnapi[0];

Re: [PATCH v3 4/4] posix-timers: make it configurable

2016-11-08 Thread John Stultz

On Tue, Nov 8, 2016 at 10:19 AM, Nicolas Pitre  wrote:
> On Tue, 8 Nov 2016, John Stultz wrote:
>
>> One spot of concern is that the
>> tools/testing/selftests/timers/posix_timers.c test hangs testing
>> virtual itimers. Looking through the code I'm not seeing where an
>> error case is missed.
>>
>> The strace looks like:
>> ...
>> write(1, "Testing posix timers. False nega"..., 66Testing posix
>> timers. False negative may happen on CPU execution
>> ) = 66
>> write(1, "based timers if other threads ru"..., 48based timers if
>> other threads run on the CPU...
>> ) = 48
>> write(1, "Check itimer virtual... ", 24Check itimer virtual... ) = 24
>> rt_sigaction(SIGVTALRM, {0x400a80, [VTALRM], SA_RESTORER|SA_RESTART,
>> 0x7fb73306ccb0}, {SIG_DFL, [], 0}, 8) = 0
>> gettimeofday({1478710402, 937476}, NULL) = 0
>> setitimer(ITIMER_VIRTUAL, {it_interval={0, 0}, it_value={2, 0}}, NULL) = 0
>> 
>>
>>
>> Whereas with posix timers enabled:
>> ...
>> write(1, "Testing posix timers. False nega"..., 138Testing posix
>> timers. False negative may happen on CPU execution
>> based timers if other threads run on the CPU...
>> Check itimer virtual... ) = 138
>> rt_sigaction(SIGVTALRM, {0x400a80, [VTALRM], SA_RESTORER|SA_RESTART,
>> 0x7f231ba8ccb0}, {SIG_DFL, [], 0}, 8) = 0
>> gettimeofday({1478626751, 904856}, NULL) = 0
>> setitimer(ITIMER_VIRTUAL, {it_interval={0, 0}, it_value={2, 0}}, NULL) = 0
>> --- SIGVTALRM {si_signo=SIGVTALRM, si_code=SI_KERNEL} ---
>> rt_sigreturn()  = 0
>
> I'll have a look.
>
>> So I suspect you were a little too aggressive with the #ifdefs around
>> the itimers/signal code, or we need to make sure we return an error on
>> the setitimer ITIMER_VIRTUAL case as well.
>
> Well, it seemed to me that with POSIX_TIMERS=n, all the code that would
> set up that signal is gone, so there was no point keeping the code to
> deliver it.
>
> Now... would it make more sense to remove itimer support as well when
> POSIX_TIMERS=n?  The same reasoning would apply.

Yes, returning an error with itimers seems needed if the signal bits
are missing.

Though I do worry that since getitimer/setitimer are older obsolete
interfaces which the posix timers api is supposed to replace, folks
might be surprised to see it removed when setting POSIX_TIMERS=n. So
some additional notes in the kconfig description may be needed.

thanks
-john

Re: net/sctp: null-ptr-deref in sctp_inet_listen

2016-11-08 Thread Andrey Konovalov

Hi Xin,

Your patch seems to be fixing the issue.

Tested-by: Andrey Konovalov 

Thanks!

On Tue, Nov 8, 2016 at 11:06 AM, Xin Long  wrote:
> On Tue, Nov 8, 2016 at 5:44 AM, Andrey Konovalov  
> wrote:
>> Hi,
>>
>> I've got the following error report while running the syzkaller fuzzer:
>>
>> kasan: CONFIG_KASAN_INLINE enabled
>> kasan: GPF could be caused by NULL-ptr deref or user memory access
>> general protection fault:  [#1] SMP KASAN
>> Modules linked in:
>> CPU: 1 PID: 3851 Comm: a.out Not tainted 4.9.0-rc4+ #354
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> task: 880065f1d800 task.stack: 88006384
>> RIP: 0010:[]  []
>> sctp_inet_listen+0x29b/0x790 net/sctp/socket.c:6870
>> RSP: 0018:880063847dd0  EFLAGS: 00010202
>> RAX: dc00 RBX: 11000c708fbd RCX: 
>> RDX:  RSI:  RDI: 0002
>> RBP: 880063847e70 R08: dc00 R09: dc00
>> R10: 0002 R11: 0002 R12: 88006b350800
>> R13:  R14: 11000d66a1a5 R15: 
>> FS:  7fd1f0f3d7c0() GS:88006cd0() knlGS:
>> CS:  0010 DS:  ES:  CR0: 80050033
>> CR2: 2000 CR3: 64af9000 CR4: 06e0
>> Stack:
>>  880063847de0 880066165900 88006b350d20 41b58ab3
>>  847ff589 83941280 dc00 
>>  880069b9f740  880063847e38 819f04ef
>> Call Trace:
>>  [< inline >] SYSC_listen net/socket.c:1396
>>  [] SyS_listen+0x206/0x250 net/socket.c:1382
>>  [] entry_SYSCALL_64_fastpath+0x1f/0xc2
>> arch/x86/entry/entry_64.S:209
>> Code: 00 0f 85 f4 04 00 00 4d 8b ac 24 28 05 00 00 49 b8 00 00 00 00
>> 00 fc ff df 49 8d 7d 02 48 89 fe 49 89 fa 48 c1 ee 03 41 83 e2 07 <46>
>> 0f b6 0c 06 41 83 c2 01 45 38 ca 7c 09 45 84 c9 0f 85 87 04
>> RIP  [] sctp_inet_listen+0x29b/0x790 net/sctp/socket.c:6870
>>  RSP 
>> ---[ end trace f2b501fc22999b37 ]---
>>
>> A reproducer is attached.
>>
>> On commit bc33b0ca11e3df46a4fa7639ba488c9d4911 (Nov 5).
>>
> This is a shutdown injection issue.
> sctp_shutdown need a sk->state check, just like tcp_shutdown:
>
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -4287,7 +4287,8 @@ static void sctp_shutdown(struct sock *sk, int how)
> if (!sctp_style(sk, TCP))
> return;
>
> -   if (how & SEND_SHUTDOWN) {
> +   if (how & SEND_SHUTDOWN &&
> +   (1 << sk->sk_state) & (SCTP_SS_ESTABLISHED | SCTP_SS_CLOSING)) {
> sk->sk_state = SCTP_SS_CLOSING;
> ep = sctp_sk(sk)->ep;
> if (!list_empty(&ep->asocs)) {

Re: [PATCH] usbnet: prevent device rpm suspend in usbnet_probe function

2016-11-08 Thread Alan Stern

On Tue, 8 Nov 2016, Bjørn Mork wrote:

> Alan Stern  writes:
> 
> > On Tue, 8 Nov 2016, Kai-Heng Feng wrote:
> >
> >> Hi,
> >> 
> >> On Mon, Nov 7, 2016 at 7:02 PM, Oliver Neukum  wrote:
> >> > On Fri, 2016-11-04 at 17:57 +0800, Kai-Heng Feng wrote:
> >> >> Sometimes cdc_mbim failed to probe if runtime pm is enabled:
> >> >> [9.305626] cdc_mbim: probe of 2-2:1.12 failed with error -22
> >> >>
> >> >> This can be solved by increase its pm usage counter.
> >> >>
> >> >> Signed-off-by: Kai-Heng Feng 
> >> >
> >> > For the record:
> >> >
> >> > NAK. This fixes a symptom. If this patch helps something is broken in
> >> > device core. We need to find that.
> >> >
> >> 
> >> Please check attached dmesg with usbcore.dyndbg="+p".
> >
> > The log shows that the device went into suspend _before_ the cdc_mbim 
> > driver was probed, not during the probe.  Then just before the probe 
> > was started, the USB core tried to resume the device and the resume 
> > failed.
> >
> > The log shows a bunch of other problems with this device:
> >
> > [3.862253] usb 2-2: config 1 has an invalid interface number: 12 but 
> > max is 1
> > [3.862254] usb 2-2: config 1 has an invalid interface number: 13 but 
> > max is 1
> > [3.862254] usb 2-2: config 1 has an invalid interface number: 13 but 
> > max is 1
> > [3.862255] usb 2-2: config 1 has no interface number 0
> > [3.862256] usb 2-2: config 1 has no interface number 1
> 
> These messages are completely harmless and normal for Sierra Wireless
> devices.  They use the interface number to identify the type of
> function, causing this mismatch between the number of interfaces and the
> interface numbers. Boy, that looks weird in writing :)
> 
> Ref this discussion we had a few years ago:
> http://www.spinics.net/lists/linux-usb/msg77499.html
> 
> No, I didn't expect you to remember that :)

You're right; I didn't remember it.  But seeing those messages again in
the mailing list archives, they do look a little familiar.

> > [8.295180] usb 2-2: Disable of device-initiated U1 failed.
> > [8.295322] usb 2-2: Disable of device-initiated U2 failed.
> >
> > I get the impression that the device won't work properly with runtime 
> > PM at all.
> 
> I suspect the device is an EM7455?  If so, then it does work fine with
> runtime PM, as long as we're talking USB2.  Not sure about USB3 runtime
> PM though.  Cannot test it. The Lenovo laptop I got with one of these
> modems has disabled the USB3 link on the m.2 modem slot for some reason.

These problems could very well be caused by running at SuperSpeed
(USB-3) instead of high speed (USB-2).

Is there any way to test what happens when the device is attached to 
the computer by a USB-2 cable?  That would prevent it from operating at 
SuperSpeed.

The main point, however, is that the proposed patch doesn't seem to
address the true problem, which is that the device gets suspended
between probes.  The patch only tries to prevent it from being
suspended during a probe -- which is already prevented by the USB core.

Alan Stern

Re: linux-next: Tree for Nov 8 (netdev, netfilter)

2016-11-08 Thread Randy Dunlap

On 11/07/16 23:38, Stephen Rothwell wrote:
> Hi all,
> 
> Changes since 20161028:


on i386 or x86_64:

net/built-in.o: In function `nf_sk_lookup_slow_v4':
(.text+0x97414): undefined reference to `udp4_lib_lookup'

when these are not enabled:
#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)

and
CONFIG_NF_SOCKET_IPV4=y

See net/ipv4/netfilter/nf_socket_ipv4.c.


Reported-by: Randy Dunlap 
-- 
~Randy

Re: [Intel-wired-lan] [PATCH] igb: use igb_adapter->io_addr instead of e1000_hw->hw_addr

2016-11-08 Thread Corinna Vinschen

On Nov  8 09:16, Hisashi T Fujinaka wrote:
> On Tue, 8 Nov 2016, Corinna Vinschen wrote:
> > On Nov  8 15:06, Cao jin wrote:
> > > When running as guest, under certain condition, it will oops as following.
> > > writel() in igb_configure_tx_ring() results in oops, because hw->hw_addr
> > > is NULL. While other register access won't oops kernel because they use
> > > wr32/rd32 which have a defense against NULL pointer.
> > > [...]
> > 
> > Incidentally we're just looking for a solution to that problem too.
> > Do three patches to fix the same problem at roughly the same time already
> > qualify as freak accident?
> > 
> > FTR, I attached my current patch, which I was planning to submit after
> > some external testing.
> > 
> > However, all three patches have one thing in common:  They workaround
> > a somewhat dubious resetting of the hardware address to NULL in case
> > reading from a register failed.
> > 
> > That makes me wonder if setting the hardware address to NULL in
> > rd32/igb_rd32 is really such a good idea.  It's performed in a function
> > which return value is *never* tested for validity in the calling
> > functions and leads to subsequent crashes since no tests for hw_addr ==
> > NULL are performed.
> > 
> > Maybe commit 22a8b2915 should be reconsidered?  Isn't there some more
> > graceful way to handle the "surprise removal"?
> 
> Answering this from my home account because, well, work is Outlook.
> 
> "Reconsidering" would be great. In fact, revert it if you'd like. I'm
> uncertain that the surprise removal code actually works the way I
> thought previously and I think I took a lot of it out of my local code.
> 
> Unfortunately I don't have any equipment that I can use to reproduce
> surprise removal any longer so that means I wouldn't be able to test
> anything. I have to defer to you or Cao Jin.

I'm not too keen to rip out a PCIe NIC under power from my local
desktop machine, but I think an actual surprise removal is not the
problem.

As described in my git log entry, the error condition in igb_rd32 can be
triggered during a suspend.  The HW has been put into a sleep state but
some register read requests are apparently not guarded against that
situation.  Reading a register in this state returns -1, thus a suspend
is erroneously triggering the "surprise removal" sequence.

Here's a raw idea:

- Note that device is suspended in e1000_hw struct.  Don't trigger
  error sequence in igb_rd32 if so (...and return a 0 value???)

- Otherwise assume it's actually a surprise removal.  In theory that
  should somehow trigger a device removal sequence, kind of like
  calling igb_remove, no?


Thanks,
Corinna


signature.asc
Description: PGP signature

Re: [Intel-wired-lan] [PATCH] igb: use igb_adapter->io_addr instead of e1000_hw->hw_addr

2016-11-08 Thread Hisashi T Fujinaka


On Tue, 8 Nov 2016, Hisashi T Fujinaka wrote:


Incidentally we're just looking for a solution to that problem too.
Do three patches to fix the same problem at roughly the same time already
qualify as freak accident?

FTR, I attached my current patch, which I was planning to submit after
some external testing.

However, all three patches have one thing in common:  They workaround
a somewhat dubious resetting of the hardware address to NULL in case
reading from a register failed.

That makes me wonder if setting the hardware address to NULL in
rd32/igb_rd32 is really such a good idea.  It's performed in a function
which return value is *never* tested for validity in the calling
functions and leads to subsequent crashes since no tests for hw_addr ==
NULL are performed.

Maybe commit 22a8b2915 should be reconsidered?  Isn't there some more
graceful way to handle the "surprise removal"?


Answering this from my home account because, well, work is Outlook.

"Reconsidering" would be great. In fact, revert it if you'd like. I'm
uncertain that the surprise removal code actually works the way I
thought previously and I think I took a lot of it out of my local code.

Unfortunately I don't have any equipment that I can use to reproduce
surprise removal any longer so that means I wouldn't be able to test
anything. I have to defer to you or Cao Jin.


Whoops. Never mind. I was just told that I had a bug that Alex Duyck and
Cao Jin just fixed. I'd stick to listening to Alex.

--
Hisashi T Fujinaka - ht...@twofifty.com

Re: [PATCH v3 4/4] posix-timers: make it configurable

2016-11-08 Thread Nicolas Pitre

On Tue, 8 Nov 2016, John Stultz wrote:

> One spot of concern is that the
> tools/testing/selftests/timers/posix_timers.c test hangs testing
> virtual itimers. Looking through the code I'm not seeing where an
> error case is missed.
> 
> The strace looks like:
> ...
> write(1, "Testing posix timers. False nega"..., 66Testing posix
> timers. False negative may happen on CPU execution
> ) = 66
> write(1, "based timers if other threads ru"..., 48based timers if
> other threads run on the CPU...
> ) = 48
> write(1, "Check itimer virtual... ", 24Check itimer virtual... ) = 24
> rt_sigaction(SIGVTALRM, {0x400a80, [VTALRM], SA_RESTORER|SA_RESTART,
> 0x7fb73306ccb0}, {SIG_DFL, [], 0}, 8) = 0
> gettimeofday({1478710402, 937476}, NULL) = 0
> setitimer(ITIMER_VIRTUAL, {it_interval={0, 0}, it_value={2, 0}}, NULL) = 0
> 
> 
> 
> Whereas with posix timers enabled:
> ...
> write(1, "Testing posix timers. False nega"..., 138Testing posix
> timers. False negative may happen on CPU execution
> based timers if other threads run on the CPU...
> Check itimer virtual... ) = 138
> rt_sigaction(SIGVTALRM, {0x400a80, [VTALRM], SA_RESTORER|SA_RESTART,
> 0x7f231ba8ccb0}, {SIG_DFL, [], 0}, 8) = 0
> gettimeofday({1478626751, 904856}, NULL) = 0
> setitimer(ITIMER_VIRTUAL, {it_interval={0, 0}, it_value={2, 0}}, NULL) = 0
> --- SIGVTALRM {si_signo=SIGVTALRM, si_code=SI_KERNEL} ---
> rt_sigreturn()  = 0

I'll have a look.

> So I suspect you were a little too aggressive with the #ifdefs around
> the itimers/signal code, or we need to make sure we return an error on
> the setitimer ITIMER_VIRTUAL case as well.

Well, it seemed to me that with POSIX_TIMERS=n, all the code that would 
set up that signal is gone, so there was no point keeping the code to 
deliver it.

Now... would it make more sense to remove itimer support as well when 
POSIX_TIMERS=n?  The same reasoning would apply.


Nicolas

RE: Is there a maximum bytes in flight limitation in the tcp stack? -->limit in scp

2016-11-08 Thread De Schepper, Koen (Nokia - BE)

Seems to be a limitation in the application. We used scp, and it (still) seems 
to limit the bytes in flight. Using our own application, we didn't see a limit 
indeed. Thanks for your response, and sorry for the noise...

Koen.

> -Original Message-
> From: Yuchung Cheng [mailto:ych...@google.com]
> Sent: dinsdag 8 november 2016 5:51
> To: De Schepper, Koen (Nokia - BE)  labs.com>
> Cc: netdev@vger.kernel.org
> Subject: Re: Is there a maximum bytes in flight limitation in the tcp stack?
> 
> On Thu, Nov 3, 2016 at 9:37 AM, De Schepper, Koen (Nokia - BE)
>  wrote:
> >
> > Hi,
> >
> > We experience some limit on the maximum packets in flight which seem
> not to be related with the receive or write buffers. Does somebody know if
> there is an issue with a maximum of around 1MByte (or sometimes 2Mbyte)
> of data in flight per TCP flow?
> 
> does not ring a bell. I've definitely seen cubic reaching >2MB cwnd (inflight)
> some packet trace will help.
> 
> btw, tcp_rmem is the maximum receive buffer including all header and
> control overhead. the receive window announced is (very roughly) half
> of your rcvbuf.
> 
> >
> > It seems to be a strict and stable limit independent from the CC (tested
> with Cubic, Reno and DCTCP). On a link of 200Mbps and 200ms RTT our link is
> only 20% (sometimes 40%, see conditions below) utilized for a single TCP
> flow with no drop experienced at all (no bottleneck in the AQM or RTT
> emulation, as it supports more throughput if multiple flows are active).
> >
> > Some configuration changes we already tried on both client and server
> (kernel 3.18.9):
> >
> > net.ipv4.tcp_no_metrics_save = 1
> > net.ipv4.tcp_rmem = 4096 87380 6291456
> > net.ipv4.tcp_wmem = 4096 16384 4194304
> >
> > SERVER# ss -i
> > tcpESTAB  0  1049728  10.187.255.211:46642 10.187.16.194:ssh
> >  dctcp wscale:7,7 rto:408 rtt:204.333/0.741 ato:40 mss:1448 
> > cwnd:1466
> send 83.1Mbps unacked:728 rcv_rtt:212 rcv_space:29200
> > CLIENT# ss -i
> > tcpESTAB  0  288  10.187.16.194:ssh  
> > 10.187.255.211:46642
> >  dctcp wscale:7,7 rto:404 rtt:203.389/0.213 ato:40 mss:1448 cwnd:78
> send 4.4Mbps unacked:8 rcv_rtt:204 rcv_space:1074844
> >
> > When increasing the write and receive mem further (they were already
> way above 1 or 2 MB) it steps to double (40%; 2Mbytes in flight):
> > net.ipv4.tcp_no_metrics_save = 1
> > net.ipv4.tcp_rmem = 4096 800 16291456
> > net.ipv4.tcp_wmem = 4096 800 16291456
> >
> > SERVER # ss -i
> > tcpESTAB  0  2068976  10.187.255.212:54637 10.187.16.112:ssh
> >  cubic wscale:8,8 rto:404 rtt:202.622/0.061 ato:40 mss:1448 
> > cwnd:1849
> ssthresh:1140 send 105.7Mbps unacked:1457 rcv_rtt:217.5 rcv_space:29200
> > CLIENT# ss -i
> > tcpESTAB  0  648  10.187.16.112:ssh  
> > 10.187.255.212:54637
> >  cubic wscale:8,8 rto:404 rtt:201.956/0.038 ato:40 mss:1448 cwnd:132
> send 7.6Mbps unacked:18 rcv_rtt:204 rcv_space:2093044
> >
> > Further increasing (x10) does not help anymore...
> > net.ipv4.tcp_no_metrics_save = 1
> > net.ipv4.tcp_rmem = 4096 8000 162914560
> > net.ipv4.tcp_wmem = 4096 8000 162914560
> >
> > As all these parameters autotune, it is hard to find out which one is
> limiting... In the examples, above unacked does not want to go higher, while
> congestion window in the server is big enough... rcv_space could be limiting,
> but it tunes up if I change the server with the higher buffers (switching to
> 2MByte in flight).
> >
> > We also tried tcp_limit_output_bytes, setting it bigger (x10) and
> smaller(/10), without effect. We've put it in /etc/sysctl.conf and rebooted, 
> to
> make sure that it is effective.
> >
> > Some more detailed tests that had an effect on the 1 or 2MByte:
> > - It seems that with TSO off, if we configure a bigger wmem buffer, an
> ongoing flow suddenly is able to immediately double its bytes in flight limit.
> We configured further up to more than 10x the buffer, but no further
> increase helps, and the limits we saw are only 1MByte and 2Mbyte (no
> intermediate values depending on any parameter). When setting tcp_wmem
> smaller again, the 2MByte limit stays on the ongoing flow. We have to restart
> the flow to make the buffer reduction to 1MByte effective.
> > - With TSO on, only the 2MByte limit is effective, independent from the
> wmem buffer. We have to restart the flow to make a tso change effective.
> >
> > Koen.
> >

Re: net/sunrpc/clnt.c:2773 suspicious rcu_dereference_check() usage!

2016-11-08 Thread Anna Schumaker

On 11/08/2016 12:43 PM, Ross Zwisler wrote:
> On Tue, Nov 08, 2016 at 07:42:59AM -0500, Anna Schumaker wrote:
>> On 11/08/2016 07:09 AM, Jeff Layton wrote:
>>> On Tue, 2016-11-08 at 06:53 -0500, Jeff Layton wrote:
 On Mon, 2016-11-07 at 22:42 -0700, Ross Zwisler wrote:
>
> I've got a virtual machine that has some NFS mounts, and with a newly 
> compiled
> kernel based on v4.9-rc3 I see the following warning/info message:
>
> [   42.750181] ===
> [   42.750192] [ INFO: suspicious RCU usage. ]
> [   42.750203] 4.9.0-rc3-2-g7b6e7de #3 Not tainted
> [   42.750213] ---
> [   42.750225] net/sunrpc/clnt.c:2773 suspicious rcu_dereference_check() 
> usage!
> [   42.750235] 
> [   42.750235] other info that might help us debug this:
> [   42.750235] 
> [   42.750246] 
> [   42.750246] rcu_scheduler_active = 1, debug_locks = 0
> [   42.750257] 1 lock held by mount.nfs4/6440:
> [   42.750278]  #0: 
> [   42.750299]  (
> [   42.750319] &(>nfs_client_lock)->rlock
> [   42.750340] ){+.+...}
> [   42.750362] , at: 
> [   42.750372] [] nfs_get_client+0x105/0x5e0
> [   42.750383] 
> [   42.750383] stack backtrace:
> [   42.750394] CPU: 0 PID: 6440 Comm: mount.nfs4 Not tainted 
> 4.9.0-rc3-2-g7b6e7de #3
> [   42.750406] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS 
> PLYDCRB1.MBH.0096.D23.1608240105 08/24/2016
> [   42.750429]  c992fa68 8150730f 88014ec8da40 
> 0001
> [   42.750452]  c992fa98 810bc3f7 880150b0b228 
> 88015068dbb0
> [   42.750475]  c992fb38 88014fc99180 c992fac0 
> 81b243e5
> [   42.750486] Call Trace:
> [   42.750498]  [] dump_stack+0x67/0x98
> [   42.750511]  [] lockdep_rcu_suspicious+0xe7/0x120
> [   42.750524]  [] 
> rpc_clnt_xprt_switch_has_addr+0x115/0x150
> [   42.750536]  [] nfs_get_client+0x244/0x5e0
> [   42.750549]  [] ? nfs_get_client+0xfc/0x5e0
> [   42.750561]  [] nfs4_set_client+0x98/0x130
> [   42.750574]  [] nfs4_create_server+0x13e/0x390
> [   42.750588]  [] nfs4_remote_mount+0x2e/0x60
> [   42.750600]  [] mount_fs+0x39/0x170
> [   42.750614]  [] vfs_kern_mount+0x6b/0x150
> [   42.750626]  [] ? nfs_do_root_mount+0x3c/0xc0
> [   42.750639]  [] nfs_do_root_mount+0x86/0xc0
> [   42.750652]  [] nfs4_try_mount+0x44/0xc0
> [   42.750664]  [] ? get_nfs_version+0x27/0x90
> [   42.750677]  [] nfs_fs_mount+0x4ac/0xd80
> [   42.750689]  [] ? lockdep_init_map+0x88/0x1f0
> [   42.750701]  [] ? nfs_clone_super+0x130/0x130
> [   42.750713]  [] ? param_set_portnr+0x70/0x70
> [   42.750726]  [] mount_fs+0x39/0x170
> [   42.750740]  [] vfs_kern_mount+0x6b/0x150
> [   42.750752]  [] do_mount+0x1f1/0xd10
> [   42.750765]  [] ? copy_mount_options+0xa1/0x140
> [   42.750777]  [] SyS_mount+0x83/0xd0
> [   42.750790]  [] do_syscall_64+0x5c/0x130
> [   42.750802]  [] entry_SYSCALL64_slow_path+0x25/0x25
>
> This rcu_dereference_check() was introduced by the following commit:
>
> commit 39e5d2df959dd4aea81fa33d765d2a5cc67a0512
> Author: Andy Adamson 
> Date:   Fri Sep 9 09:22:25 2016 -0400
>
> SUNRPC search xprt switch for sockaddr
> 
> Signed-off-by: Andy Adamson 
> Signed-off-by: Anna Schumaker 
>
> Thanks,
> - Ross

 Thanks Ross,
>>
>> Hi Ross,
>>
>> Can you try this patch and let me know if it helps:
>>
>> http://git.linux-nfs.org/?p=anna/linux-nfs.git;a=commitdiff;h=bb29dd84333a96f309c6d0f88b285b5b78927058
>>
>> I'm planning on sending it to Linus soon, so it should be in rc5.
> 
> Hi Anna,
> 
> Yep, this patch makes the warning go away in my setup.

Great!  Thanks for testing!

Anna

> 
> Thanks,
> - Ross
>

Re: net/sunrpc/clnt.c:2773 suspicious rcu_dereference_check() usage!

2016-11-08 Thread Ross Zwisler

On Tue, Nov 08, 2016 at 07:42:59AM -0500, Anna Schumaker wrote:
> On 11/08/2016 07:09 AM, Jeff Layton wrote:
> > On Tue, 2016-11-08 at 06:53 -0500, Jeff Layton wrote:
> >> On Mon, 2016-11-07 at 22:42 -0700, Ross Zwisler wrote:
> >>>
> >>> I've got a virtual machine that has some NFS mounts, and with a newly 
> >>> compiled
> >>> kernel based on v4.9-rc3 I see the following warning/info message:
> >>>
> >>> [   42.750181] ===
> >>> [   42.750192] [ INFO: suspicious RCU usage. ]
> >>> [   42.750203] 4.9.0-rc3-2-g7b6e7de #3 Not tainted
> >>> [   42.750213] ---
> >>> [   42.750225] net/sunrpc/clnt.c:2773 suspicious rcu_dereference_check() 
> >>> usage!
> >>> [   42.750235] 
> >>> [   42.750235] other info that might help us debug this:
> >>> [   42.750235] 
> >>> [   42.750246] 
> >>> [   42.750246] rcu_scheduler_active = 1, debug_locks = 0
> >>> [   42.750257] 1 lock held by mount.nfs4/6440:
> >>> [   42.750278]  #0: 
> >>> [   42.750299]  (
> >>> [   42.750319] &(>nfs_client_lock)->rlock
> >>> [   42.750340] ){+.+...}
> >>> [   42.750362] , at: 
> >>> [   42.750372] [] nfs_get_client+0x105/0x5e0
> >>> [   42.750383] 
> >>> [   42.750383] stack backtrace:
> >>> [   42.750394] CPU: 0 PID: 6440 Comm: mount.nfs4 Not tainted 
> >>> 4.9.0-rc3-2-g7b6e7de #3
> >>> [   42.750406] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS 
> >>> PLYDCRB1.MBH.0096.D23.1608240105 08/24/2016
> >>> [   42.750429]  c992fa68 8150730f 88014ec8da40 
> >>> 0001
> >>> [   42.750452]  c992fa98 810bc3f7 880150b0b228 
> >>> 88015068dbb0
> >>> [   42.750475]  c992fb38 88014fc99180 c992fac0 
> >>> 81b243e5
> >>> [   42.750486] Call Trace:
> >>> [   42.750498]  [] dump_stack+0x67/0x98
> >>> [   42.750511]  [] lockdep_rcu_suspicious+0xe7/0x120
> >>> [   42.750524]  [] 
> >>> rpc_clnt_xprt_switch_has_addr+0x115/0x150
> >>> [   42.750536]  [] nfs_get_client+0x244/0x5e0
> >>> [   42.750549]  [] ? nfs_get_client+0xfc/0x5e0
> >>> [   42.750561]  [] nfs4_set_client+0x98/0x130
> >>> [   42.750574]  [] nfs4_create_server+0x13e/0x390
> >>> [   42.750588]  [] nfs4_remote_mount+0x2e/0x60
> >>> [   42.750600]  [] mount_fs+0x39/0x170
> >>> [   42.750614]  [] vfs_kern_mount+0x6b/0x150
> >>> [   42.750626]  [] ? nfs_do_root_mount+0x3c/0xc0
> >>> [   42.750639]  [] nfs_do_root_mount+0x86/0xc0
> >>> [   42.750652]  [] nfs4_try_mount+0x44/0xc0
> >>> [   42.750664]  [] ? get_nfs_version+0x27/0x90
> >>> [   42.750677]  [] nfs_fs_mount+0x4ac/0xd80
> >>> [   42.750689]  [] ? lockdep_init_map+0x88/0x1f0
> >>> [   42.750701]  [] ? nfs_clone_super+0x130/0x130
> >>> [   42.750713]  [] ? param_set_portnr+0x70/0x70
> >>> [   42.750726]  [] mount_fs+0x39/0x170
> >>> [   42.750740]  [] vfs_kern_mount+0x6b/0x150
> >>> [   42.750752]  [] do_mount+0x1f1/0xd10
> >>> [   42.750765]  [] ? copy_mount_options+0xa1/0x140
> >>> [   42.750777]  [] SyS_mount+0x83/0xd0
> >>> [   42.750790]  [] do_syscall_64+0x5c/0x130
> >>> [   42.750802]  [] entry_SYSCALL64_slow_path+0x25/0x25
> >>>
> >>> This rcu_dereference_check() was introduced by the following commit:
> >>>
> >>> commit 39e5d2df959dd4aea81fa33d765d2a5cc67a0512
> >>> Author: Andy Adamson 
> >>> Date:   Fri Sep 9 09:22:25 2016 -0400
> >>>
> >>> SUNRPC search xprt switch for sockaddr
> >>> 
> >>> Signed-off-by: Andy Adamson 
> >>> Signed-off-by: Anna Schumaker 
> >>>
> >>> Thanks,
> >>> - Ross
> >>
> >> Thanks Ross,
> 
> Hi Ross,
> 
> Can you try this patch and let me know if it helps:
> 
> http://git.linux-nfs.org/?p=anna/linux-nfs.git;a=commitdiff;h=bb29dd84333a96f309c6d0f88b285b5b78927058
> 
> I'm planning on sending it to Linus soon, so it should be in rc5.

Hi Anna,

Yep, this patch makes the warning go away in my setup.

Thanks,
- Ross

Re: [PATCH v3 4/4] posix-timers: make it configurable

2016-11-08 Thread John Stultz

On Mon, Nov 7, 2016 at 2:14 PM, Nicolas Pitre  wrote:
> Some embedded systems have no use for them.  This removes about
> 22KB from the kernel binary size when configured out.
>
> Corresponding syscalls are routed to a stub logging the attempt to
> use those syscalls which should be enough of a clue if they were
> disabled without proper consideration. They are: timer_create,
> timer_gettime, timer_getoverrun, timer_settime, timer_delete,
> clock_adjtime.
>
> The clock_settime, clock_gettime, clock_getres and clock_nanosleep
> syscalls are replaced by simple wrappers compatible with CLOCK_REALTIME,
> CLOCK_MONOTONIC and CLOCK_BOOTTIME only which should cover the vast
> majority of use cases with very little code.
>
> Signed-off-by: Nicolas Pitre 
> Reviewed-by: Josh Triplett 
> Acked-by: Richard Cochran 

So I have no design objections to the patch overall.

I ran this through my timekeeping tests last night and it passed a
fair number of the tests, considering.

I of course see a lot of failures around timer_creates failing
(set-timer-lat), and cases where clockids aren't supported.
So I'll need to see about updating the tests to fail more gracefully
with this change.

One spot of concern is that the
tools/testing/selftests/timers/posix_timers.c test hangs testing
virtual itimers. Looking through the code I'm not seeing where an
error case is missed.

The strace looks like:
...
write(1, "Testing posix timers. False nega"..., 66Testing posix
timers. False negative may happen on CPU execution
) = 66
write(1, "based timers if other threads ru"..., 48based timers if
other threads run on the CPU...
) = 48
write(1, "Check itimer virtual... ", 24Check itimer virtual... ) = 24
rt_sigaction(SIGVTALRM, {0x400a80, [VTALRM], SA_RESTORER|SA_RESTART,
0x7fb73306ccb0}, {SIG_DFL, [], 0}, 8) = 0
gettimeofday({1478710402, 937476}, NULL) = 0
setitimer(ITIMER_VIRTUAL, {it_interval={0, 0}, it_value={2, 0}}, NULL) = 0



Whereas with posix timers enabled:
...
write(1, "Testing posix timers. False nega"..., 138Testing posix
timers. False negative may happen on CPU execution
based timers if other threads run on the CPU...
Check itimer virtual... ) = 138
rt_sigaction(SIGVTALRM, {0x400a80, [VTALRM], SA_RESTORER|SA_RESTART,
0x7f231ba8ccb0}, {SIG_DFL, [], 0}, 8) = 0
gettimeofday({1478626751, 904856}, NULL) = 0
setitimer(ITIMER_VIRTUAL, {it_interval={0, 0}, it_value={2, 0}}, NULL) = 0
--- SIGVTALRM {si_signo=SIGVTALRM, si_code=SI_KERNEL} ---
rt_sigreturn()  = 0
gettimeofday({1478626753, 906137}, NULL) = 0
write(1, "[OK]\nCheck itimer prof... ", 26[OK]
...

So I suspect you were a little too aggressive with the #ifdefs around
the itimers/signal code, or we need to make sure we return an error on
the setitimer ITIMER_VIRTUAL case as well.

thanks
-john

Re: [PATCH 13/17] batman-adv: Consume skb in receive handlers

2016-11-08 Thread Eric Dumazet

On Tue, 2016-11-08 at 18:28 +0100, Sven Eckelmann wrote:
> On Dienstag, 8. November 2016 08:59:49 CET Eric Dumazet wrote:
> [...]
> > > +free_skb:
> > >   consume_skb(skb);
> > > - return NET_RX_SUCCESS;
> > > +
> > > + return ret;
> > >  }
> > 
> > 
> > Okay, but we do have kfree_skb() and consume_skb() and they should be
> > used appropriately.
> 
> Yes, this patch is one part of reaching this goal. Some other parts are also
> in this patchset. But other changes like the one you've mentioned here (change
> some consume_skb partially back to kfree_skb) have still to be done. But
> first we have to clean up the main portion of the mess :)

Sure, but your patch 13/17 should address this right away.

You must not call consume_skb() if you are dropping a packet.

Prior to this patch, kfree_skb() was properly called, and after this
patch, consume_skb() is called instead.


-   ret = (*batadv_rx_handler[idx])(skb, hard_iface);
-
-   if (ret == NET_RX_DROP)
-   kfree_skb(skb);
+   (*batadv_rx_handler[idx])(skb, hard_iface);
 
You cannot claim to be working on these issues and at the same time add them
back.

Re: [PATCH 13/17] batman-adv: Consume skb in receive handlers

2016-11-08 Thread Sven Eckelmann

On Dienstag, 8. November 2016 08:59:49 CET Eric Dumazet wrote:
[...]
> > +free_skb:
> > consume_skb(skb);
> > -   return NET_RX_SUCCESS;
> > +
> > +   return ret;
> >  }
> 
> 
> Okay, but we do have kfree_skb() and consume_skb() and they should be
> used appropriately.

Yes, this patch is one part of reaching this goal. Some other parts are also
in this patchset. But other changes like the one you've mentioned here (change
some consume_skb partially back to kfree_skb) have still to be done. But
first we have to clean up the main portion of the mess :)

Kind regards,
Sven

signature.asc
Description: This is a digitally signed message part.

Re: [Intel-wired-lan] [PATCH] igb: use igb_adapter->io_addr instead of e1000_hw->hw_addr

2016-11-08 Thread Hisashi T Fujinaka


On Tue, 8 Nov 2016, Corinna Vinschen wrote:


On Nov  8 15:06, Cao jin wrote:

When running as guest, under certain conditions, it will oops as follows.
writel() in igb_configure_tx_ring() results in oops, because hw->hw_addr
is NULL. While other register access won't oops kernel because they use
wr32/rd32 which have a defense against NULL pointer.

[  141.225449] pcieport :00:1c.0: AER: Multiple Uncorrected (Fatal)
error received: id=0101
[  141.225523] igb :01:00.1: PCIe Bus Error:
severity=Uncorrected (Fatal), type=Unaccessible,
id=0101(Unregistered Agent ID)
[  141.299442] igb :01:00.1: broadcast error_detected message
[  141.300539] igb :01:00.0 enp1s0f0: PCIe link lost, device now
detached
[  141.351019] igb :01:00.1 enp1s0f1: PCIe link lost, device now
detached
[  143.465904] pcieport :00:1c.0: Root Port link has been reset
[  143.465994] igb :01:00.1: broadcast slot_reset message
[  143.466039] igb :01:00.0: enabling device ( -> 0002)
[  144.389078] igb :01:00.1: enabling device ( -> 0002)
[  145.312078] igb :01:00.1: broadcast resume message
[  145.322211] BUG: unable to handle kernel paging request at
3818
[  145.361275] IP: []
igb_configure_tx_ring+0x14d/0x280 [igb]
[  145.400048] PGD 0
[  145.438007] Oops: 0002 [#1] SMP

A similar issue & solution could be found at:
http://patchwork.ozlabs.org/patch/689592/

Signed-off-by: Cao jin 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index edc9a6a..3f240ac 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3390,7 +3390,7 @@ void igb_configure_tx_ring(struct igb_adapter *adapter,
 tdba & 0xULL);
wr32(E1000_TDBAH(reg_idx), tdba >> 32);

-   ring->tail = hw->hw_addr + E1000_TDT(reg_idx);
+   ring->tail = adapter->io_addr + E1000_TDT(reg_idx);
wr32(E1000_TDH(reg_idx), 0);
writel(0, ring->tail);

@@ -3729,7 +3729,7 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
 ring->count * sizeof(union e1000_adv_rx_desc));

/* initialize head and tail */
-   ring->tail = hw->hw_addr + E1000_RDT(reg_idx);
+   ring->tail = adapter->io_addr + E1000_RDT(reg_idx);
wr32(E1000_RDH(reg_idx), 0);
writel(0, ring->tail);

--
2.1.0


Incidentally we're just looking for a solution to that problem too.
Do three patches to fix the same problem at roughly the same time already
qualify as freak accident?

FTR, I attached my current patch, which I was planning to submit after
some external testing.

However, all three patches have one thing in common:  They workaround
a somewhat dubious resetting of the hardware address to NULL in case
reading from a register failed.

That makes me wonder if setting the hardware address to NULL in
rd32/igb_rd32 is really such a good idea.  It's performed in a function
whose return value is *never* tested for validity in the calling
functions and leads to subsequent crashes since no tests for hw_addr ==
NULL are performed.

Maybe commit 22a8b2915 should be reconsidered?  Isn't there some more
graceful way to handle the "surprise removal"?


Answering this from my home account because, well, work is Outlook.

"Reconsidering" would be great. In fact, revert it if you'd like. I'm
uncertain that the surprise removal code actually works the way I
thought previously and I think I took a lot of it out of my local code.

Unfortunately I don't have any equipment that I can use to reproduce
surprise removal any longer so that means I wouldn't be able to test
anything. I have to defer to you or Cao Jin.

--
Hisashi T Fujinaka - ht...@twofifty.com (todd.fujin...@intel.com)

Re: Why are IPv6 host and anycast routes referencing lo device?

2016-11-08 Thread Hannes Frederic Sowa

On 08.11.2016 02:08, David Ahern wrote:
> 
> Can anyone explain why host routes and anycast routes for IPv6 are added with 
> the device set to loopback versus the device with the address:
> 
> local ::1 dev lo  proto none  metric 0  pref medium
> local 2000:1:: dev lo  proto none  metric 0  pref medium
> local 2000:1::3 dev lo  proto none  metric 0  pref medium
> local 2100:2:: dev lo  proto none  metric 0  pref medium
> local 2100:2::3 dev lo  proto none  metric 0  pref medium

Does it really matter? For global valid unicast addresses we still
implement the weak model. Thus the interface does not matter at all.

> This behavior differs from IPv4 where host routes use the device with the 
> address:
> 
> broadcast 10.1.1.0 dev eth0  proto kernel  scope link  src 10.1.1.3
> local 10.1.1.3 dev eth0  proto kernel  scope host  src 10.1.1.3
> broadcast 10.1.1.255 dev eth0  proto kernel  scope link  src 10.1.1.3
> broadcast 10.100.2.0 dev eth2  proto kernel  scope link  src 10.100.2.3
> local 10.100.2.3 dev eth2  proto kernel  scope host  src 10.100.2.3
> broadcast 10.100.2.255 dev eth2  proto kernel  scope link  src 10.100.2.3
> 
> The use of loopback pre-dates the git history, so wondering if someone 
> recalls the reason why. We would like to change that to make it consistent 
> with IPv4 - with a sysctl to maintain backwards compatibility.

A sysctl for that sounds like a really bad idea.

Internally the sysctl will change the reference counting of interfaces
and routes towards each other, have small but difficult to find
semantic changes inside the kernel, just for switching the
interface in iproute/netlink dumps?

If there are good reasons (which can very well be) to switch to have the
interface with the address in the routes, we should switch without
providing the backwards compatibility sysctl.

Bye,
Hannes

Re: [PATCH 13/17] batman-adv: Consume skb in receive handlers

2016-11-08 Thread Eric Dumazet

On Tue, 2016-11-08 at 17:45 +0100, Simon Wunderlich wrote:
> From: Sven Eckelmann 
> 
> Receiving functions in Linux consume the supplied skbuff. Doing the same in
> the batadv_rx_handler functions makes the behavior more similar to the rest
> of the Linux network code.
> 
> Signed-off-by: Sven Eckelmann 
> Signed-off-by: Simon Wunderlich 
> ---
>  net/batman-adv/bat_iv_ogm.c |  17 +++--
>  net/batman-adv/bat_v_elp.c  |  25 ---
>  net/batman-adv/bat_v_ogm.c  |  10 +--
>  net/batman-adv/main.c   |  11 +--
>  net/batman-adv/network-coding.c |  11 +--
>  net/batman-adv/routing.c| 149 
> +++-
>  6 files changed, 141 insertions(+), 82 deletions(-)
> 
> diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
> index 310f391..b9941bf 100644
> --- a/net/batman-adv/bat_iv_ogm.c
> +++ b/net/batman-adv/bat_iv_ogm.c
> @@ -1823,17 +1823,18 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
>   struct batadv_ogm_packet *ogm_packet;
>   u8 *packet_pos;
>   int ogm_offset;
> - bool ret;
> + bool res;
> + int ret = NET_RX_DROP;
>  
> - ret = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
> - if (!ret)
> - return NET_RX_DROP;
> + res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
> + if (!res)
> + goto free_skb;
>  
>   /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface
>* that does not have B.A.T.M.A.N. IV enabled ?
>*/
>   if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable)
> - return NET_RX_DROP;
> + goto free_skb;
>  
>   batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
>   batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
> @@ -1854,8 +1855,12 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
>   ogm_packet = (struct batadv_ogm_packet *)packet_pos;
>   }
>  
> + ret = NET_RX_SUCCESS;
> +
> +free_skb:
>   consume_skb(skb);
> - return NET_RX_SUCCESS;
> +
> + return ret;
>  }


Okay, but we do have kfree_skb() and consume_skb() and they should be
used appropriately.

[PATCH 08/17] batman-adv: Simple (re)broadcast avoidance

2016-11-08 Thread Simon Wunderlich

From: Linus Lüssing 

With this patch, (re)broadcasting on specific interfaces is avoided:

* No neighbor: There is no need to broadcast on an interface if there
  is no node behind it.

* Single neighbor is source: If there is just one neighbor on an
  interface and if this neighbor is the one we actually got this
  broadcast packet from, then we do not need to echo it back.

* Single neighbor is originator: If there is just one neighbor on
  an interface and if this neighbor is the originator of this
  broadcast packet, then we do not need to echo it back.

Goodies for BATMAN V:

("Upgrade your BATMAN IV network to V now to get these for free!")

Thanks to the split of OGMv1 into two packet types, OGMv2 and ELP
that is, we can now apply the same optimizations stated above to OGMv2
packets, too.

Furthermore, with BATMAN V, rebroadcasts can be reduced in certain
multi interface cases, too, where BATMAN IV cannot. This is thanks to
the removal of the "secondary interface originator" concept in BATMAN V.

Signed-off-by: Linus Lüssing 
Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/bat_v_ogm.c  | 56 +
 net/batman-adv/hard-interface.c | 52 ++
 net/batman-adv/hard-interface.h | 16 
 net/batman-adv/originator.c | 13 +++---
 net/batman-adv/routing.c|  2 +-
 net/batman-adv/send.c   | 55 +++-
 net/batman-adv/send.h   |  3 ++-
 net/batman-adv/soft-interface.c |  2 +-
 net/batman-adv/types.h  |  2 ++
 9 files changed, 193 insertions(+), 8 deletions(-)

diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 61ff5f8..9922ccd 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -140,6 +140,7 @@ static void batadv_v_ogm_send(struct work_struct *work)
unsigned char *ogm_buff, *pkt_buff;
int ogm_buff_len;
u16 tvlv_len = 0;
+   int ret;
 
bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
@@ -182,6 +183,31 @@ static void batadv_v_ogm_send(struct work_struct *work)
if (!kref_get_unless_zero(_iface->refcount))
continue;
 
+   ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL);
+   if (ret) {
+   char *type;
+
+   switch (ret) {
+   case BATADV_HARDIF_BCAST_NORECIPIENT:
+   type = "no neighbor";
+   break;
+   case BATADV_HARDIF_BCAST_DUPFWD:
+   type = "single neighbor is source";
+   break;
+   case BATADV_HARDIF_BCAST_DUPORIG:
+   type = "single neighbor is originator";
+   break;
+   default:
+   type = "unknown";
+   }
+
+   batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from 
ourselve on %s surpressed: %s\n",
+  hard_iface->net_dev->name, type);
+
+   batadv_hardif_put(hard_iface);
+   continue;
+   }
+
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
   "Sending own OGM2 packet (originator %pM, seqno %u, 
throughput %u, TTL %d) on interface %s [%pM]\n",
   ogm_packet->orig, ntohl(ogm_packet->seqno),
@@ -651,6 +677,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, 
int ogm_offset,
struct batadv_hard_iface *hard_iface;
struct batadv_ogm2_packet *ogm_packet;
u32 ogm_throughput, link_throughput, path_throughput;
+   int ret;
 
ethhdr = eth_hdr(skb);
ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset);
@@ -716,6 +743,35 @@ static void batadv_v_ogm_process(const struct sk_buff 
*skb, int ogm_offset,
if (!kref_get_unless_zero(_iface->refcount))
continue;
 
+   ret = batadv_hardif_no_broadcast(hard_iface,
+ogm_packet->orig,
+hardif_neigh->orig);
+
+   if (ret) {
+   char *type;
+
+   switch (ret) {
+   case BATADV_HARDIF_BCAST_NORECIPIENT:
+   type = "no neighbor";
+   break;
+   case BATADV_HARDIF_BCAST_DUPFWD:
+   type = "single neighbor is source";
+   break;
+   case

[PATCH 01/17] batman-adv: Introduce missing headers for genetlink restructure

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

Fixes: 56989f6d8568 ("genetlink: mark families as __ro_after_init")
Fixes: 2ae0f17df1cd ("genetlink: use idr to track families")
Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/netlink.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 005012b..2171281 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -20,11 +20,14 @@
 
 #include 
 #include 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
2.10.1

Re: [PATCH] usbnet: prevent device rpm suspend in usbnet_probe function

2016-11-08 Thread Bjørn Mork

Alan Stern  writes:

> On Tue, 8 Nov 2016, Kai-Heng Feng wrote:
>
>> Hi,
>> 
>> On Mon, Nov 7, 2016 at 7:02 PM, Oliver Neukum  wrote:
>> > On Fri, 2016-11-04 at 17:57 +0800, Kai-Heng Feng wrote:
>> >> Sometimes cdc_mbim failed to probe if runtime pm is enabled:
>> >> [9.305626] cdc_mbim: probe of 2-2:1.12 failed with error -22
>> >>
>> >> This can be solved by increasing its pm usage counter.
>> >>
>> >> Signed-off-by: Kai-Heng Feng 
>> >
>> > For the record:
>> >
>> > NAK. This fixes a symptom. If this patch helps something is broken in
>> > device core. We need to find that.
>> >
>> 
>> Please check attached dmesg with usbcore.dyndbg="+p".
>
> The log shows that the device went into suspend _before_ the cdc_mbim 
> driver was probed, not during the probe.  Then just before the probe 
> was started, the USB core tried to resume the device and the resume 
> failed.
>
> The log shows a bunch of other problems with this device:
>
> [3.862253] usb 2-2: config 1 has an invalid interface number: 12 but max 
> is 1
> [3.862254] usb 2-2: config 1 has an invalid interface number: 13 but max 
> is 1
> [3.862254] usb 2-2: config 1 has an invalid interface number: 13 but max 
> is 1
> [3.862255] usb 2-2: config 1 has no interface number 0
> [3.862256] usb 2-2: config 1 has no interface number 1

These messages are completely harmless and normal for Sierra Wireless
devices.  They use the interface number to identify the type of
function, causing this mismatch between the number of interfaces and the
interface numbers. Boy, that looks weird in writing :)

Ref this discussion we had a few years ago:
http://www.spinics.net/lists/linux-usb/msg77499.html

No, I didn't expect you to remember that :)


> [8.295180] usb 2-2: Disable of device-initiated U1 failed.
> [8.295322] usb 2-2: Disable of device-initiated U2 failed.
>
> I get the impression that the device won't work properly with runtime 
> PM at all.

I suspect the device is an EM7455?  If so, then it does work fine with
runtime PM, as long as we're talking USB2.  Not sure about USB3 runtime
PM though.  Cannot test it. The Lenovo laptop I got with one of these
modems has disabled the USB3 link on the m.2 modem slot for some reason.


Bjørn

Re: [PATCH v3 0/4] make POSIX timers optional with some Kconfig help

2016-11-08 Thread Thomas Gleixner

On Mon, 7 Nov 2016, Nicolas Pitre wrote:

> Many embedded systems don't need the full POSIX timer support.
> Configuring them out provides a nice kernel image size reduction.
> 
> When POSIX timers are configured out, the PTP clock subsystem should be
> left out as well. However a bunch of ethernet drivers currently *select*
> the latter in their Kconfig entries. Therefore some more work was needed
> to break that hard dependency from those drivers without preventing their
> usage altogether.
> 
> Therefore this series also includes kconfig changes to implement a new
> keyword to express some reverse dependencies like "select" does, named
> "imply", and still allowing for the target config symbol to be disabled
> if the user or a direct dependency says so.
> 
> At this point I'd like to gather ACKs especially from people in the "To"
> field. Ideally this would need to go upstream as a single series to avoid
> cross subsystem dependency issues.  So far it was suggested that this should 
> go
> via the kbuild tree.

For the whole series:

Acked-by: Thomas Gleixner

[PATCH 10/17] batman-adv: Count all non-success TX packets as dropped

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

A failure during the submission also causes dropped packets.
batadv_interface_tx should therefore also increase the DROPPED counter for
these returns.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/soft-interface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 2f0304e..7b3494a 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -386,7 +386,7 @@ static int batadv_interface_tx(struct sk_buff *skb,
ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint,
 vid);
}
-   if (ret == NET_XMIT_DROP)
+   if (ret != NET_XMIT_SUCCESS)
goto dropped_freed;
}
 
-- 
2.10.1

[PATCH 05/17] batman-adv: Remove unnecessary lockdep in batadv_mcast_mla_list_free

2016-11-08 Thread Simon Wunderlich

From: Linus Lüssing 

batadv_mcast_mla_list_free() just frees some leftovers of a local feast
in batadv_mcast_mla_update(). No lockdep needed as it has nothing to do
with bat_priv->mcast.mla_list.

Signed-off-by: Linus Lüssing 
Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/multicast.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 13661f4..45757fa 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -231,19 +231,15 @@ static int batadv_mcast_mla_bridge_get(struct net_device 
*dev,
 
 /**
  * batadv_mcast_mla_list_free - free a list of multicast addresses
- * @bat_priv: the bat priv with all the soft interface information
  * @mcast_list: the list to free
  *
  * Removes and frees all items in the given mcast_list.
  */
-static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv,
-  struct hlist_head *mcast_list)
+static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
 {
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
 
-   lockdep_assert_held(_priv->tt.commit_lock);
-
hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
hlist_del(_entry->list);
kfree(mcast_entry);
@@ -560,7 +556,7 @@ void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
batadv_mcast_mla_tt_add(bat_priv, _list);
 
 out:
-   batadv_mcast_mla_list_free(bat_priv, _list);
+   batadv_mcast_mla_list_free(_list);
 }
 
 /**
-- 
2.10.1

[PATCH 04/17] batman-adv: Add wrapper for ARP reply creation

2016-11-08 Thread Simon Wunderlich

From: Linus Lüssing 

Removing duplicate code.

Signed-off-by: Linus Lüssing 
Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/distributed-arp-table.c | 67 --
 1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/net/batman-adv/distributed-arp-table.c 
b/net/batman-adv/distributed-arp-table.c
index cbb4f32..49576c5 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -949,6 +949,41 @@ static unsigned short batadv_dat_get_vid(struct sk_buff 
*skb, int *hdr_size)
 }
 
 /**
+ * batadv_dat_arp_create_reply - create an ARP Reply
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ip_src: ARP sender IP
+ * @ip_dst: ARP target IP
+ * @hw_src: Ethernet source and ARP sender MAC
+ * @hw_dst: Ethernet destination and ARP target MAC
+ * @vid: VLAN identifier (optional, set to zero otherwise)
+ *
+ * Creates an ARP Reply from the given values, optionally encapsulated in a
+ * VLAN header.
+ *
+ * Return: An skb containing an ARP Reply.
+ */
+static struct sk_buff *
+batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
+   __be32 ip_dst, u8 *hw_src, u8 *hw_dst,
+   unsigned short vid)
+{
+   struct sk_buff *skb;
+
+   skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->soft_iface,
+ip_src, hw_dst, hw_src, hw_dst);
+   if (!skb)
+   return NULL;
+
+   skb_reset_mac_header(skb);
+
+   if (vid & BATADV_VLAN_HAS_TAG)
+   skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
+ vid & VLAN_VID_MASK);
+
+   return skb;
+}
+
+/**
  * batadv_dat_snoop_outgoing_arp_request - snoop the ARP request and try to
  * answer using DAT
  * @bat_priv: the bat priv with all the soft interface information
@@ -1005,20 +1040,12 @@ bool batadv_dat_snoop_outgoing_arp_request(struct 
batadv_priv *bat_priv,
goto out;
}
 
-   skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
-bat_priv->soft_iface, ip_dst, hw_src,
-dat_entry->mac_addr, hw_src);
+   skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
+ dat_entry->mac_addr,
+ hw_src, vid);
if (!skb_new)
goto out;
 
-   if (vid & BATADV_VLAN_HAS_TAG) {
-   skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
- vid & VLAN_VID_MASK);
-   if (!skb_new)
-   goto out;
-   }
-
-   skb_reset_mac_header(skb_new);
skb_new->protocol = eth_type_trans(skb_new,
   bat_priv->soft_iface);
bat_priv->stats.rx_packets++;
@@ -1081,25 +1108,11 @@ bool batadv_dat_snoop_incoming_arp_request(struct 
batadv_priv *bat_priv,
if (!dat_entry)
goto out;
 
-   skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
-bat_priv->soft_iface, ip_dst, hw_src,
-dat_entry->mac_addr, hw_src);
-
+   skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
+ dat_entry->mac_addr, hw_src, vid);
if (!skb_new)
goto out;
 
-   /* the rest of the TX path assumes that the mac_header offset pointing
-* to the inner Ethernet header has been set, therefore reset it now.
-*/
-   skb_reset_mac_header(skb_new);
-
-   if (vid & BATADV_VLAN_HAS_TAG) {
-   skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
- vid & VLAN_VID_MASK);
-   if (!skb_new)
-   goto out;
-   }
-
/* To preserve backwards compatibility, the node has choose the outgoing
 * format based on the incoming request packet type. The assumption is
 * that a node not using the 4addr packet format doesn't support it.
-- 
2.10.1

[PATCH 13/17] batman-adv: Consume skb in receive handlers

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

Receiving functions in Linux consume the supplied skbuff. Doing the same in
the batadv_rx_handler functions makes the behavior more similar to the rest
of the Linux network code.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/bat_iv_ogm.c |  17 +++--
 net/batman-adv/bat_v_elp.c  |  25 ---
 net/batman-adv/bat_v_ogm.c  |  10 +--
 net/batman-adv/main.c   |  11 +--
 net/batman-adv/network-coding.c |  11 +--
 net/batman-adv/routing.c| 149 +++-
 6 files changed, 141 insertions(+), 82 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 310f391..b9941bf 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1823,17 +1823,18 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
struct batadv_ogm_packet *ogm_packet;
u8 *packet_pos;
int ogm_offset;
-   bool ret;
+   bool res;
+   int ret = NET_RX_DROP;
 
-   ret = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
-   if (!ret)
-   return NET_RX_DROP;
+   res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
+   if (!res)
+   goto free_skb;
 
/* did we receive a B.A.T.M.A.N. IV OGM packet on an interface
 * that does not have B.A.T.M.A.N. IV enabled ?
 */
if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable)
-   return NET_RX_DROP;
+   goto free_skb;
 
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
@@ -1854,8 +1855,12 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
ogm_packet = (struct batadv_ogm_packet *)packet_pos;
}
 
+   ret = NET_RX_SUCCESS;
+
+free_skb:
consume_skb(skb);
-   return NET_RX_SUCCESS;
+
+   return ret;
 }
 
 #ifdef CONFIG_BATMAN_ADV_DEBUGFS
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index ee08540..81a0501 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -492,20 +492,21 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_elp_packet *elp_packet;
struct batadv_hard_iface *primary_if;
struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
-   bool ret;
+   bool res;
+   int ret;
 
-   ret = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
-   if (!ret)
-   return NET_RX_DROP;
+   res = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
+   if (!res)
+   goto free_skb;
 
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
-   return NET_RX_DROP;
+   goto free_skb;
 
/* did we receive a B.A.T.M.A.N. V ELP packet on an interface
 * that does not have B.A.T.M.A.N. V ELP enabled ?
 */
if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
-   return NET_RX_DROP;
+   goto free_skb;
 
elp_packet = (struct batadv_elp_packet *)skb->data;
 
@@ -516,14 +517,16 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
 
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
-   goto out;
+   goto free_skb;
 
batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming,
  elp_packet);
 
-out:
-   if (primary_if)
-   batadv_hardif_put(primary_if);
+   ret = NET_RX_SUCCESS;
+   batadv_hardif_put(primary_if);
+
+free_skb:
consume_skb(skb);
-   return NET_RX_SUCCESS;
+
+   return ret;
 }
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 9922ccd..ef3a88d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -810,18 +810,18 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
 * B.A.T.M.A.N. V enabled ?
 */
if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0)
-   return NET_RX_DROP;
+   goto free_skb;
 
if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
-   return NET_RX_DROP;
+   goto free_skb;
 
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
-   return NET_RX_DROP;
+   goto free_skb;
 
ogm_packet = (struct batadv_ogm2_packet *)skb->data;
 
if (batadv_is_my_mac(bat_priv, ogm_packet->orig))
-   return NET_RX_DROP;
+   goto free_skb;
 
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
@@ -842,6 +842,8 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
}
 
ret = NET_RX_SUCCESS;
+
+free_skb:

[PATCH 11/17] batman-adv: Consume skb in batadv_frag_send_packet

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

Sending functions in Linux consume the supplied skbuff. Doing the same in
batadv_frag_send_packet avoids the hack of returning -1 (-EPERM) to signal
the caller that he is responsible for cleaning up the skb.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/fragmentation.c | 50 --
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index a2e28a1..9c561e6 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -20,6 +20,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -441,8 +442,7 @@ static struct sk_buff *batadv_frag_create(struct sk_buff 
*skb,
  * @orig_node: final destination of the created fragments
  * @neigh_node: next-hop of the created fragments
  *
- * Return: the netdev tx status or -1 in case of error.
- * When -1 is returned the skb is not consumed.
+ * Return: the netdev tx status or a negative errno code on a failure
  */
 int batadv_frag_send_packet(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -455,7 +455,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
unsigned int header_size = sizeof(frag_header);
unsigned int max_fragment_size, max_packet_size;
-   int ret = -1;
+   int ret;
 
/* To avoid merge and refragmentation at next-hops we never send
 * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE
@@ -465,13 +465,17 @@ int batadv_frag_send_packet(struct sk_buff *skb,
max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS;
 
/* Don't even try to fragment, if we need more than 16 fragments */
-   if (skb->len > max_packet_size)
-   goto out;
+   if (skb->len > max_packet_size) {
+   ret = -EAGAIN;
+   goto free_skb;
+   }
 
bat_priv = orig_node->bat_priv;
primary_if = batadv_primary_if_get_selected(bat_priv);
-   if (!primary_if)
-   goto out;
+   if (!primary_if) {
+   ret = -EINVAL;
+   goto put_primary_if;
+   }
 
/* Create one header to be copied to all fragments */
frag_header.packet_type = BATADV_UNICAST_FRAG;
@@ -496,34 +500,35 @@ int batadv_frag_send_packet(struct sk_buff *skb,
/* Eat and send fragments from the tail of skb */
while (skb->len > max_fragment_size) {
skb_fragment = batadv_frag_create(skb, &frag_header, mtu);
-   if (!skb_fragment)
-   goto out;
+   if (!skb_fragment) {
+   ret = -ENOMEM;
+   goto free_skb;
+   }
 
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
   skb_fragment->len + ETH_HLEN);
ret = batadv_send_unicast_skb(skb_fragment, neigh_node);
if (ret != NET_XMIT_SUCCESS) {
-   /* return -1 so that the caller can free the original
-* skb
-*/
-   ret = -1;
-   goto out;
+   ret = NET_XMIT_DROP;
+   goto free_skb;
}
 
frag_header.no++;
 
/* The initial check in this function should cover this case */
if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) {
-   ret = -1;
-   goto out;
+   ret = -EINVAL;
+   goto free_skb;
}
}
 
/* Make room for the fragment header. */
if (batadv_skb_head_push(skb, header_size) < 0 ||
-   pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0)
-   goto out;
+   pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) {
+   ret = -ENOMEM;
+   goto free_skb;
+   }
 
memcpy(skb->data, &frag_header, header_size);
 
@@ -532,10 +537,13 @@ int batadv_frag_send_packet(struct sk_buff *skb,
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
   skb->len + ETH_HLEN);
ret = batadv_send_unicast_skb(skb, neigh_node);
+   /* skb was consumed */
+   skb = NULL;
 
-out:
-   if (primary_if)
-   batadv_hardif_put(primary_if);
+put_primary_if:
+   batadv_hardif_put(primary_if);
+free_skb:
+   kfree_skb(skb);
 
return ret;
 }
-- 
2.10.1

[PATCH 09/17] batman-adv: use consume_skb for non-dropped packets

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

kfree_skb assumes that an skb is dropped after a failure and notes that.
consume_skb should be used in non-failure situations. Such information is
important for dropmonitor netlink which tells how many packets were dropped
and where this drop happened.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/bat_iv_ogm.c | 13 -
 net/batman-adv/fragmentation.c  | 20 ++--
 net/batman-adv/network-coding.c | 24 +++-
 net/batman-adv/send.c   | 27 +++
 net/batman-adv/send.h   |  3 ++-
 net/batman-adv/soft-interface.c |  2 +-
 6 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 0b9be62..310f391 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -698,7 +698,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char 
*packet_buff,
 
forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size);
if (!forw_packet_aggr->skb) {
-   batadv_forw_packet_free(forw_packet_aggr);
+   batadv_forw_packet_free(forw_packet_aggr, true);
return;
}
 
@@ -1611,7 +1611,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff 
*skb, int ogm_offset,
if (hardif_neigh)
batadv_hardif_neigh_put(hardif_neigh);
 
-   kfree_skb(skb_priv);
+   consume_skb(skb_priv);
 }
 
 /**
@@ -1783,6 +1783,7 @@ static void 
batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
struct delayed_work *delayed_work;
struct batadv_forw_packet *forw_packet;
struct batadv_priv *bat_priv;
+   bool dropped = false;
 
delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
@@ -1792,8 +1793,10 @@ static void 
batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
hlist_del(&forw_packet->list);
spin_unlock_bh(&bat_priv->forw_bat_list_lock);
 
-   if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+   if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
+   dropped = true;
goto out;
+   }
 
batadv_iv_ogm_emit(forw_packet);
 
@@ -1810,7 +1813,7 @@ static void 
batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
batadv_iv_ogm_schedule(forw_packet->if_incoming);
 
 out:
-   batadv_forw_packet_free(forw_packet);
+   batadv_forw_packet_free(forw_packet, dropped);
 }
 
 static int batadv_iv_ogm_receive(struct sk_buff *skb,
@@ -1851,7 +1854,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
ogm_packet = (struct batadv_ogm_packet *)packet_pos;
}
 
-   kfree_skb(skb);
+   consume_skb(skb);
return NET_RX_SUCCESS;
 }
 
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 2b967a3..a2e28a1 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -42,17 +42,23 @@
 /**
  * batadv_frag_clear_chain - delete entries in the fragment buffer chain
  * @head: head of chain with entries.
+ * @dropped: whether the chain is cleared because all fragments are dropped
  *
  * Free fragments in the passed hlist. Should be called with appropriate lock.
  */
-static void batadv_frag_clear_chain(struct hlist_head *head)
+static void batadv_frag_clear_chain(struct hlist_head *head, bool dropped)
 {
struct batadv_frag_list_entry *entry;
struct hlist_node *node;
 
hlist_for_each_entry_safe(entry, node, head, list) {
hlist_del(&entry->list);
-   kfree_skb(entry->skb);
+
+   if (dropped)
+   kfree_skb(entry->skb);
+   else
+   consume_skb(entry->skb);
+
kfree(entry);
}
 }
@@ -73,7 +79,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node 
*orig_node,
spin_lock_bh(&chain->lock);
 
if (!check_cb || check_cb(chain)) {
-   batadv_frag_clear_chain(&chain->fragment_list);
+   batadv_frag_clear_chain(&chain->fragment_list, true);
chain->size = 0;
}
 
@@ -118,7 +124,7 @@ static bool batadv_frag_init_chain(struct 
batadv_frag_table_entry *chain,
return false;
 
if (!hlist_empty(&chain->fragment_list))
-   batadv_frag_clear_chain(&chain->fragment_list);
+   batadv_frag_clear_chain(&chain->fragment_list, true);
 
chain->size = 0;
chain->seqno = seqno;
@@ -220,7 +226,7 @@ static bool batadv_frag_insert_packet(struct 
batadv_orig_node *orig_node,
 * exceeds the maximum size of one merged packet. Don't allow
 * packets to have different total_size.
 */
-

[PATCH 14/17] batman-adv: Remove dev_queue_xmit return code exception

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

No caller of batadv_send_skb_to_orig is expecting the results to be -1
(-EPERM) anymore when the skbuff was not consumed. They will instead expect
that the skbuff is always consumed. Having such a return code filter is
therefore not needed anymore.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/send.c | 17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index b00aac7..9ea272e 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -64,8 +64,11 @@ static void batadv_send_outstanding_bcast_packet(struct 
work_struct *work);
  * If neigh_node is NULL, then the packet is broadcasted using hard_iface,
  * otherwise it is sent as unicast to the given neighbor.
  *
- * Return: NET_TX_DROP in case of error or the result of dev_queue_xmit(skb)
- * otherwise
+ * Regardless of the return value, the skb is consumed.
+ *
+ * Return: A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
  */
 int batadv_send_skb_packet(struct sk_buff *skb,
   struct batadv_hard_iface *hard_iface,
@@ -73,7 +76,6 @@ int batadv_send_skb_packet(struct sk_buff *skb,
 {
struct batadv_priv *bat_priv;
struct ethhdr *ethhdr;
-   int ret;
 
bat_priv = netdev_priv(hard_iface->soft_iface);
 
@@ -111,15 +113,8 @@ int batadv_send_skb_packet(struct sk_buff *skb,
/* dev_queue_xmit() returns a negative result on error.  However on
 * congestion and traffic shaping, it drops and returns NET_XMIT_DROP
 * (which is > 0). This will not be treated as an error.
-*
-* a negative value cannot be returned because it could be interepreted
-* as not consumed skb by callers of batadv_send_skb_to_orig.
 */
-   ret = dev_queue_xmit(skb);
-   if (ret < 0)
-   ret = NET_XMIT_DROP;
-
-   return ret;
+   return dev_queue_xmit(skb);
 send_skb_err:
kfree_skb(skb);
return NET_XMIT_DROP;
-- 
2.10.1

[PATCH 02/17] batman-adv: Mark batadv_netlink_ops as const

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

The genl_ops don't need to be written by anyone and thus can be moved into a
read-only memory range.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 2171281..0627381 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -530,7 +530,7 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct 
netlink_callback *cb)
return msg->len;
 }
 
-static struct genl_ops batadv_netlink_ops[] = {
+static const struct genl_ops batadv_netlink_ops[] = {
{
.cmd = BATADV_CMD_GET_MESH_INFO,
.flags = GENL_ADMIN_PERM,
-- 
2.10.1

[PATCH 15/17] batman-adv: Disallow mcast src address for data frames

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

The routing checks are validating the source mac address of the outer
ethernet header. They reject every source mac address which is a broadcast
address. But they also have to reject any multicast mac addresses.

Signed-off-by: Sven Eckelmann 
[s...@simonwunderlich.de: fix commit message typo]
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/routing.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index caf1866..9646623 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -368,8 +368,8 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
if (is_broadcast_ether_addr(ethhdr->h_dest))
goto free_skb;
 
-   /* packet with broadcast sender address */
-   if (is_broadcast_ether_addr(ethhdr->h_source))
+   /* packet with broadcast/multicast sender address */
+   if (is_multicast_ether_addr(ethhdr->h_source))
goto free_skb;
 
/* not for me */
@@ -466,8 +466,8 @@ static int batadv_check_unicast_packet(struct batadv_priv 
*bat_priv,
if (is_broadcast_ether_addr(ethhdr->h_dest))
return -EBADR;
 
-   /* packet with broadcast sender address */
-   if (is_broadcast_ether_addr(ethhdr->h_source))
+   /* packet with broadcast/multicast sender address */
+   if (is_multicast_ether_addr(ethhdr->h_source))
return -EBADR;
 
/* not for me */
@@ -1159,8 +1159,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
if (!is_broadcast_ether_addr(ethhdr->h_dest))
goto free_skb;
 
-   /* packet with broadcast sender address */
-   if (is_broadcast_ether_addr(ethhdr->h_source))
+   /* packet with broadcast/multicast sender address */
+   if (is_multicast_ether_addr(ethhdr->h_source))
goto free_skb;
 
/* ignore broadcasts sent by myself */
-- 
2.10.1

[PATCH 06/17] batman-adv: Remove unused skb_reset_mac_header()

2016-11-08 Thread Simon Wunderlich

From: Linus Lüssing 

During broadcast queueing, the skb_reset_mac_header() sets the skb
to a place invalid for a MAC header, pointing right into the
batman-adv broadcast packet. Luckily, no one seems to actually use
eth_hdr(skb) afterwards until batadv_send_skb_packet() resets the
header to a valid position again.

Therefore remove this unnecessary, weird skb_reset_mac_header()
call.

Signed-off-by: Linus Lüssing 
Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/send.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index e1e9136..be3f6d7 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -586,8 +586,6 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv 
*bat_priv,
bcast_packet = (struct batadv_bcast_packet *)newskb->data;
bcast_packet->ttl--;
 
-   skb_reset_mac_header(newskb);
-
forw_packet->skb = newskb;
 
INIT_DELAYED_WORK(&forw_packet->delayed_work,
-- 
2.10.1

[PATCH 16/17] batman-adv: Disallow zero and mcast src address for mgmt frames

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

The routing check for management frames is validating the source mac
address in the outer ethernet header. It rejects every source mac address
which is a broadcast address. But it also has to reject the zero-mac
address and multicast mac addresses.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/routing.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 9646623..381f040 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -196,8 +196,8 @@ bool batadv_check_management_packet(struct sk_buff *skb,
if (!is_broadcast_ether_addr(ethhdr->h_dest))
return false;
 
-   /* packet with broadcast sender address */
-   if (is_broadcast_ether_addr(ethhdr->h_source))
+   /* packet with invalid sender address */
+   if (!is_valid_ether_addr(ethhdr->h_source))
return false;
 
/* create a copy of the skb, if needed, to modify it. */
-- 
2.10.1

[PATCH 03/17] batman-adv: Close two alignment holes in batadv_hard_iface

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 673a22e..c9db184 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -123,8 +123,8 @@ struct batadv_hard_iface_bat_v {
  * @list: list node for batadv_hardif_list
  * @if_num: identificator of the interface
  * @if_status: status of the interface for batman-adv
- * @net_dev: pointer to the net_device
  * @num_bcasts: number of payload re-broadcasts on this interface (ARQ)
+ * @net_dev: pointer to the net_device
  * @hardif_obj: kobject of the per interface sysfs "mesh" directory
  * @refcount: number of contexts the object is used
  * @batman_adv_ptype: packet type describing packets that should be processed 
by
@@ -141,8 +141,8 @@ struct batadv_hard_iface {
struct list_head list;
s16 if_num;
char if_status;
-   struct net_device *net_dev;
u8 num_bcasts;
+   struct net_device *net_dev;
struct kobject *hardif_obj;
struct kref refcount;
struct packet_type batman_adv_ptype;
-- 
2.10.1

[PATCH 17/17] batman-adv: Reject unicast packet with zero/mcast dst address

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

A unicast batman-adv packet cannot be transmitted to a multicast or zero
mac address. So reject incoming packets which still have these classes of
addresses as destination mac address in the outer ethernet header.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/routing.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 381f040..7d9ae4b 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -364,8 +364,8 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
 
ethhdr = eth_hdr(skb);
 
-   /* packet with unicast indication but broadcast recipient */
-   if (is_broadcast_ether_addr(ethhdr->h_dest))
+   /* packet with unicast indication but non-unicast recipient */
+   if (!is_valid_ether_addr(ethhdr->h_dest))
goto free_skb;
 
/* packet with broadcast/multicast sender address */
@@ -462,8 +462,8 @@ static int batadv_check_unicast_packet(struct batadv_priv 
*bat_priv,
 
ethhdr = eth_hdr(skb);
 
-   /* packet with unicast indication but broadcast recipient */
-   if (is_broadcast_ether_addr(ethhdr->h_dest))
+   /* packet with unicast indication but non-unicast recipient */
+   if (!is_valid_ether_addr(ethhdr->h_dest))
return -EBADR;
 
/* packet with broadcast/multicast sender address */
-- 
2.10.1

[PATCH 07/17] batman-adv: Use own timer for multicast TT and TVLV updates

2016-11-08 Thread Simon Wunderlich

From: Linus Lüssing 

Instead of latching onto the OGM period, this patch introduces a worker
dedicated to multicast TT and TVLV updates.

The reasoning is that, especially upon roaming, the translation table
should be updated in a timely manner to minimize connectivity issues.

With BATMAN V, the idea is to greatly increase the OGM interval to
reduce overhead. Unfortunately, right now this could lead to
a bad user experience if multicast traffic is involved.

Therefore this patch introduces a fixed 500ms update interval for
multicast TT entries and the multicast TVLV.

Signed-off-by: Linus Lüssing 
Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/main.h  |  1 +
 net/batman-adv/multicast.c | 62 ++
 net/batman-adv/multicast.h |  6 
 net/batman-adv/translation-table.c |  4 ---
 net/batman-adv/types.h |  4 ++-
 5 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index daddca9..a6cc804 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -48,6 +48,7 @@
 #define BATADV_TT_CLIENT_TEMP_TIMEOUT 60 /* in milliseconds */
 #define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */
 #define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */
+#define BATADV_MCAST_WORK_PERIOD 500 /* 0.5 seconds */
 #define BATADV_DAT_ENTRY_TIMEOUT (5 * 6) /* 5 mins in milliseconds */
 /* sliding packet range of received originator messages in sequence numbers
  * (should be a multiple of our word size)
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 45757fa..090a69f 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -48,6 +49,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -60,6 +62,18 @@
 #include "translation-table.h"
 #include "tvlv.h"
 
+static void batadv_mcast_mla_update(struct work_struct *work);
+
+/**
+ * batadv_mcast_start_timer - schedule the multicast periodic worker
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
+{
+   queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work,
+  msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD));
+}
+
 /**
  * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists
  * @soft_iface: netdev struct of the mesh interface
@@ -255,6 +269,8 @@ static void batadv_mcast_mla_list_free(struct hlist_head 
*mcast_list)
  * translation table except the ones listed in the given mcast_list.
  *
  * If mcast_list is NULL then all are retracted.
+ *
+ * Do not call outside of the mcast worker! (or cancel mcast worker first)
  */
 static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
struct hlist_head *mcast_list)
@@ -262,7 +278,7 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv 
*bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
 
-   lockdep_assert_held(&bat_priv->tt.commit_lock);
+   WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
 
hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list,
  list) {
@@ -287,6 +303,8 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv 
*bat_priv,
  *
  * Adds multicast listener announcements from the given mcast_list to the
  * translation table if they have not been added yet.
+ *
+ * Do not call outside of the mcast worker! (or cancel mcast worker first)
  */
 static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
struct hlist_head *mcast_list)
@@ -294,7 +312,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv 
*bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
 
-   lockdep_assert_held(&bat_priv->tt.commit_lock);
+   WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
 
if (!mcast_list)
return;
@@ -528,13 +546,18 @@ static bool batadv_mcast_mla_tvlv_update(struct 
batadv_priv *bat_priv)
 }
 
 /**
- * batadv_mcast_mla_update - update the own MLAs
+ * __batadv_mcast_mla_update - update the own MLAs
  * @bat_priv: the bat priv with all the soft interface information
  *
  * Updates the own multicast listener announcements in the translation
  * table as well as the own, announced multicast tvlv container.
+ *
+ * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list
+ * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are
+ * ensured by the non-parallel execution of the worker this function
+ * belongs to.
  */
-void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
+static void

[PATCH 12/17] batman-adv: Consume skb in batadv_send_skb_to_orig

2016-11-08 Thread Simon Wunderlich

From: Sven Eckelmann 

Sending functions in Linux consume the supplied skbuff. Doing the same in
batadv_send_skb_to_orig avoids the hack of returning -1 (-EPERM) to signal
the caller that he is responsible for cleaning up the skb.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Simon Wunderlich 
---
 net/batman-adv/routing.c  | 11 ++-
 net/batman-adv/send.c | 39 ++-
 net/batman-adv/tp_meter.c |  6 --
 net/batman-adv/tvlv.c |  5 +
 4 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index a4cb157..4d2679a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -262,9 +262,6 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv 
*bat_priv,
icmph->ttl = BATADV_TTL;
 
res = batadv_send_skb_to_orig(skb, orig_node, NULL);
-   if (res == -1)
-   goto out;
-
ret = NET_RX_SUCCESS;
 
break;
@@ -325,8 +322,7 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv 
*bat_priv,
icmp_packet->ttl = BATADV_TTL;
 
res = batadv_send_skb_to_orig(skb, orig_node, NULL);
-   if (res != -1)
-   ret = NET_RX_SUCCESS;
+   ret = NET_RX_SUCCESS;
 
 out:
if (primary_if)
@@ -413,8 +409,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
 
/* route it */
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
-   if (res != -1)
-   ret = NET_RX_SUCCESS;
+   ret = NET_RX_SUCCESS;
 
 out:
if (orig_node)
@@ -702,8 +697,6 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 
len = skb->len;
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
-   if (res == -1)
-   goto out;
 
/* translate transmit result into receive result */
if (res == NET_XMIT_SUCCESS) {
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 0f86293..b00aac7 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -165,11 +165,9 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
  * host, NULL can be passed as recv_if and no interface alternating is
  * attempted.
  *
- * Return: -1 on failure (and the skb is not consumed), -EINPROGRESS if the
- * skb is buffered for later transmit or the NET_XMIT status returned by the
+ * Return: negative errno code on a failure, -EINPROGRESS if the skb is
+ * buffered for later transmit or the NET_XMIT status returned by the
  * lower routine if the packet has been passed down.
- *
- * If the returning value is not -1 the skb has been consumed.
  */
 int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -177,12 +175,14 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
 {
struct batadv_priv *bat_priv = orig_node->bat_priv;
struct batadv_neigh_node *neigh_node;
-   int ret = -1;
+   int ret;
 
/* batadv_find_router() increases neigh_nodes refcount if found. */
neigh_node = batadv_find_router(bat_priv, orig_node, recv_if);
-   if (!neigh_node)
-   goto out;
+   if (!neigh_node) {
+   ret = -EINVAL;
+   goto free_skb;
+   }
 
/* Check if the skb is too large to send in one piece and fragment
 * it if needed.
@@ -191,8 +191,10 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
skb->len > neigh_node->if_incoming->net_dev->mtu) {
/* Fragment and send packet. */
ret = batadv_frag_send_packet(skb, orig_node, neigh_node);
+   /* skb was consumed */
+   skb = NULL;
 
-   goto out;
+   goto put_neigh_node;
}
 
/* try to network code the packet, if it is received on an interface
@@ -204,9 +206,13 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
else
ret = batadv_send_unicast_skb(skb, neigh_node);
 
-out:
-   if (neigh_node)
-   batadv_neigh_node_put(neigh_node);
+   /* skb was consumed */
+   skb = NULL;
+
+put_neigh_node:
+   batadv_neigh_node_put(neigh_node);
+free_skb:
+   kfree_skb(skb);
 
return ret;
 }
@@ -327,7 +333,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
 {
struct batadv_unicast_packet *unicast_packet;
struct ethhdr *ethhdr;
-   int res, ret = NET_XMIT_DROP;
+   int ret = NET_XMIT_DROP;
 
if (!orig_node)
goto out;
@@ -364,13 +370,12 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid))
unicast_packet->ttvn = unicast_packet->ttvn - 1;
 
-   res = batadv_send_skb_to_orig(skb, orig_node, NULL);
-   if (res != -1)
-   ret =

[PATCH 00/17] pull request for net-next: batman-adv 2016-11-08

2016-11-08 Thread Simon Wunderlich

Hi David,

this is our first feature pull request for batman-adv. There is one more
set pending after this one.

Please pull, or let me know of any problems!

Thank you,
  Simon

The following changes since commit a283ad5066cd63f595224c7476001cfc367fdf2e:

  Merge tag 'batadv-next-for-davem-20161027' of git://git.open-mesh.org/linux-merge (2016-10-29 16:26:50 -0400)

are available in the git repository at:

  git://git.open-mesh.org/linux-merge.git tags/batadv-next-for-davem-20161108

for you to fetch changes up to 33581cefe4d182d99e9f8a66156507b06e7c9265:

  batman-adv: Reject unicast packet with zero/mcast dst address (2016-10-30 11:11:40 +0100)


This feature and cleanup patchset includes the following changes:

 - netlink and code cleanups by Sven Eckelmann (3 patches)

 - Cleanup and minor fixes by Linus Luessing (3 patches)

 - Speed up multicast update intervals, by Linus Luessing

 - Avoid (re)broadcast in meshes for some easy cases,
   by Linus Luessing

 - Clean up tx return state handling, by Sven Eckelmann (6 patches)

 - Fix some special mac address handling cases, by Sven Eckelmann
   (3 patches)


Linus Lüssing (5):
  batman-adv: Add wrapper for ARP reply creation
  batman-adv: Remove unnecessary lockdep in batadv_mcast_mla_list_free
  batman-adv: Remove unused skb_reset_mac_header()
  batman-adv: Use own timer for multicast TT and TVLV updates
  batman-adv: Simple (re)broadcast avoidance

Sven Eckelmann (12):
  batman-adv: Introduce missing headers for genetlink restructure
  batman-adv: Mark batadv_netlink_ops as const
  batman-adv: Close two alignment holes in batadv_hard_iface
  batman-adv: use consume_skb for non-dropped packets
  batman-adv: Count all non-success TX packets as dropped
  batman-adv: Consume skb in batadv_frag_send_packet
  batman-adv: Consume skb in batadv_send_skb_to_orig
  batman-adv: Consume skb in receive handlers
  batman-adv: Remove dev_queue_xmit return code exception
  batman-adv: Disallow mcast src address for data frames
  batman-adv: Disallow zero and mcast src address for mgmt frames
  batman-adv: Reject unicast packet with zero/mcast dst address

 net/batman-adv/bat_iv_ogm.c            |  30 --
 net/batman-adv/bat_v_elp.c             |  25 +++--
 net/batman-adv/bat_v_ogm.c             |  66 +++-
 net/batman-adv/distributed-arp-table.c |  67 +++-
 net/batman-adv/fragmentation.c         |  70 -
 net/batman-adv/hard-interface.c        |  52 ++
 net/batman-adv/hard-interface.h        |  16 +++
 net/batman-adv/main.c                  |  11 +-
 net/batman-adv/main.h                  |   1 +
 net/batman-adv/multicast.c             |  70 ++---
 net/batman-adv/multicast.h             |   6 --
 net/batman-adv/netlink.c               |   5 +-
 net/batman-adv/network-coding.c        |  35 ---
 net/batman-adv/originator.c            |  13 ++-
 net/batman-adv/routing.c               | 180 -
 net/batman-adv/send.c                  | 140 ++---
 net/batman-adv/send.h                  |   6 +-
 net/batman-adv/soft-interface.c        |   6 +-
 net/batman-adv/tp_meter.c              |   6 --
 net/batman-adv/translation-table.c     |   4 -
 net/batman-adv/tvlv.c                  |   5 +-
 net/batman-adv/types.h                 |  10 +-
 22 files changed, 571 insertions(+), 253 deletions(-)

1 2 >

1 - 100 of 151 matches

Mail list logo