date:20170307

Re: [PATCH 24/29] drivers: convert iblock_req.pending from atomic_t to refcount_t

2017-03-07 Thread Nicholas A. Bellinger

Hi Elena,

On Mon, 2017-03-06 at 16:21 +0200, Elena Reshetova wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
> 
> Signed-off-by: Elena Reshetova 
> Signed-off-by: Hans Liljestrand 
> Signed-off-by: Kees Cook 
> Signed-off-by: David Windsor 
> ---
>  drivers/target/target_core_iblock.c | 12 ++--
>  drivers/target/target_core_iblock.h |  3 ++-
>  2 files changed, 8 insertions(+), 7 deletions(-)

For the target_core_iblock part:

Acked-by: Nicholas Bellinger

Re: [PATCH v3] {net,IB}/{rxe,usnic}: Utilize generic mac to eui32 function

2017-03-07 Thread Leon Romanovsky

On Tue, Mar 07, 2017 at 09:31:58PM +0200, Yuval Shaia wrote:
> This logic seems to be duplicated in (at least) three separate files.
> Move it to one place so the code can be reused.
>
> Signed-off-by: Yuval Shaia 
> ---
> v0 -> v1:
>   * Add missing #include
>   * Rename to genaddrconf_ifid_eui48
> v1 -> v2:
>   * Reset eui[0] to default if dev_id is used
> v2 -> v3:
>   * Add helper function to avoid re-setting eui[0] to default if
> dev_id is used
> ---
>  drivers/infiniband/hw/usnic/usnic_common_util.h | 11 +++
>  drivers/infiniband/sw/rxe/rxe_net.c | 11 ++-
>  include/net/addrconf.h  | 25 
> +++--
>  3 files changed, 24 insertions(+), 23 deletions(-)

Not promising statistics :)

>
> diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h 
> b/drivers/infiniband/hw/usnic/usnic_common_util.h
> index b54986d..d91b035 100644
> --- a/drivers/infiniband/hw/usnic/usnic_common_util.h
> +++ b/drivers/infiniband/hw/usnic/usnic_common_util.h
> @@ -34,6 +34,8 @@
>  #ifndef USNIC_CMN_UTIL_H
>  #define USNIC_CMN_UTIL_H
>
> +#include 
> +
>  static inline void
>  usnic_mac_to_gid(const char *const mac, char *raw_gid)
>  {
> @@ -57,14 +59,7 @@ usnic_mac_ip_to_gid(const char *const mac, const __be32 
> inaddr, char *raw_gid)
>   raw_gid[1] = 0x80;
>   memset(&raw_gid[2], 0, 2);
>   memcpy(&raw_gid[4], &inaddr, 4);
> - raw_gid[8] = mac[0]^2;
> - raw_gid[9] = mac[1];
> - raw_gid[10] = mac[2];
> - raw_gid[11] = 0xff;
> - raw_gid[12] = 0xfe;
> - raw_gid[13] = mac[3];
> - raw_gid[14] = mac[4];
> - raw_gid[15] = mac[5];
> + addrconf_addr_eui48(&raw_gid[8], mac);
>  }
>
>  static inline void
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c 
> b/drivers/infiniband/sw/rxe/rxe_net.c
> index d8610960..ab8ea23 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -38,6 +38,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>
> @@ -86,18 +87,10 @@ struct rxe_recv_sockets recv_sockets;
>
>  static __be64 rxe_mac_to_eui64(struct net_device *ndev)
>  {

It is worth dropping this wrapper completely. The rxe_mac_to_eui64 is
called twice in the same file.

> - unsigned char *mac_addr = ndev->dev_addr;
>   __be64 eui64;
>   unsigned char *dst = (unsigned char *)&eui64;
>
> - dst[0] = mac_addr[0] ^ 2;
> - dst[1] = mac_addr[1];
> - dst[2] = mac_addr[2];
> - dst[3] = 0xff;
> - dst[4] = 0xfe;
> - dst[5] = mac_addr[3];
> - dst[6] = mac_addr[4];
> - dst[7] = mac_addr[5];
> + addrconf_addr_eui48(dst, ndev->dev_addr);
>
>   return eui64;
>  }
> diff --git a/include/net/addrconf.h b/include/net/addrconf.h
> index 17c6fd8..28274ed 100644
> --- a/include/net/addrconf.h
> +++ b/include/net/addrconf.h
> @@ -103,12 +103,25 @@ int addrconf_prefix_rcv_add_addr(struct net *net, 
> struct net_device *dev,
>u32 addr_flags, bool sllao, bool tokenized,
>__u32 valid_lft, u32 prefered_lft);
>
> +static inline void addrconf_addr_eui48_xor(u8 *eui, const char *const addr, 
> bool xor)
> +{
> + memcpy(eui, addr, 3);
> + if (xor)
> + eui[0] ^= 2;
> + eui[3] = 0xFF;
> + eui[4] = 0xFE;
> + memcpy(eui + 5, addr + 3, 3);
> +}
> +
> +static inline void addrconf_addr_eui48(u8 *eui, const char *const addr)
> +{
> + addrconf_addr_eui48_xor(eui, addr, true);

Just put your "eui[0] ^= 2" here and remove redundant "if (xor)".
> +}
> +
>  static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
>  {
>   if (dev->addr_len != ETH_ALEN)
>   return -1;
> - memcpy(eui, dev->dev_addr, 3);
> - memcpy(eui + 5, dev->dev_addr + 3, 3);
>
>   /*
>* The zSeries OSA network cards can be shared among various
> @@ -123,14 +136,14 @@ static inline int addrconf_ifid_eui48(u8 *eui, struct 
> net_device *dev)
>* case.  Hence the resulting interface identifier has local
>* scope according to RFC2373.
>*/
> +
> + addrconf_addr_eui48_xor(eui, dev->dev_addr, !dev->dev_id);
> +
>   if (dev->dev_id) {
>   eui[3] = (dev->dev_id >> 8) & 0xFF;
>   eui[4] = dev->dev_id & 0xFF;
> - } else {
> - eui[3] = 0xFF;
> - eui[4] = 0xFE;
> - eui[0] ^= 2;

Leave this line.
Thanks

>   }
> +
>   return 0;
>  }
>
> --
> 2.7.4
>


signature.asc
Description: PGP signature

Re: [PATCH net 0/3] rds: tcp: fix various rds-tcp issues during netns create/delete sequences

2017-03-07 Thread David Miller

From: Sowmini Varadhan 
Date: Sat,  4 Mar 2017 08:57:32 -0800

> Dmitry Vyukov reported some syzkaller panics during netns deletion.
 ...

Series applied, thanks!

Re: linux-next: build failure after merge of the rcu tree

2017-03-07 Thread Paul E. McKenney

On Wed, Mar 08, 2017 at 12:16:05PM +1100, Stephen Rothwell wrote:
> Hi Paul,
> 
> On Mon, 13 Feb 2017 17:43:24 +1100 Stephen Rothwell  
> wrote:
> >
> > On Sun, 12 Feb 2017 20:37:48 -0800 "Paul E. McKenney" 
> >  wrote:
> > >
> > > I chickened out on that commit for this merge window, so it will come
> > > back at -rc1.  But I will cover that when I rebase to -rc1.  
> > 
> > OK, thanks.
> 
> [PATCH] smc: merge fix for "mm: Rename SLAB_DESTROY_BY_RCU to 
> SLAB_TYPESAFE_BY_RCU"
> is needed again ... maybe time to rebase?

On my list for tomorrow morning.  ;-)

Thanx, Paul

[PATCH v4 net-next 5/6] drivers: net: xgene-v2: Add transmit and receive

2017-03-07 Thread Iyappan Subramanian

This patch adds,
- Transmit
- Transmit completion poll
- Receive poll
- NAPI handler

and enables the driver.

Signed-off-by: Iyappan Subramanian 
Signed-off-by: Keyur Chudgar 
---
 drivers/net/ethernet/apm/Kconfig   |   1 +
 drivers/net/ethernet/apm/Makefile  |   1 +
 drivers/net/ethernet/apm/xgene-v2/Kconfig  |  11 ++
 drivers/net/ethernet/apm/xgene-v2/Makefile |   6 +
 drivers/net/ethernet/apm/xgene-v2/main.c   | 248 -
 drivers/net/ethernet/apm/xgene-v2/main.h   |   1 +
 6 files changed, 267 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/Kconfig
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/Makefile

diff --git a/drivers/net/ethernet/apm/Kconfig b/drivers/net/ethernet/apm/Kconfig
index ec63d70..59efe5b 100644
--- a/drivers/net/ethernet/apm/Kconfig
+++ b/drivers/net/ethernet/apm/Kconfig
@@ -1 +1,2 @@
 source "drivers/net/ethernet/apm/xgene/Kconfig"
+source "drivers/net/ethernet/apm/xgene-v2/Kconfig"
diff --git a/drivers/net/ethernet/apm/Makefile 
b/drivers/net/ethernet/apm/Makefile
index 65ce32a..946b2a4 100644
--- a/drivers/net/ethernet/apm/Makefile
+++ b/drivers/net/ethernet/apm/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_NET_XGENE) += xgene/
+obj-$(CONFIG_NET_XGENE_V2) += xgene-v2/
diff --git a/drivers/net/ethernet/apm/xgene-v2/Kconfig 
b/drivers/net/ethernet/apm/xgene-v2/Kconfig
new file mode 100644
index 000..1205861
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/Kconfig
@@ -0,0 +1,11 @@
+config NET_XGENE_V2
+   tristate "APM X-Gene SoC Ethernet-v2 Driver"
+   depends on HAS_DMA
+   depends on ARCH_XGENE || COMPILE_TEST
+   help
+ This is the Ethernet driver for the on-chip ethernet interface
+ which uses a linked list of DMA descriptor architecture (v2) for
+ APM X-Gene SoCs.
+
+ To compile this driver as a module, choose M here. This module will
+ be called xgene-enet-v2.
diff --git a/drivers/net/ethernet/apm/xgene-v2/Makefile 
b/drivers/net/ethernet/apm/xgene-v2/Makefile
new file mode 100644
index 000..735309c
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for APM X-Gene Ethernet v2 driver
+#
+
+xgene-enet-v2-objs := main.o mac.o enet.o ring.o
+obj-$(CONFIG_NET_XGENE_V2) += xgene-enet-v2.o
diff --git a/drivers/net/ethernet/apm/xgene-v2/main.c 
b/drivers/net/ethernet/apm/xgene-v2/main.c
index c96b4cc..b16ef43 100644
--- a/drivers/net/ethernet/apm/xgene-v2/main.c
+++ b/drivers/net/ethernet/apm/xgene-v2/main.c
@@ -113,7 +113,7 @@ static int xge_refill_buffers(struct net_device *ndev, u32 
nbuf)
raw_desc->m1 = cpu_to_le64(SET_BITS(NEXT_DESC_ADDRL, addr_lo) |
   SET_BITS(NEXT_DESC_ADDRH, addr_hi) |
   SET_BITS(PKT_ADDRH,
-   dma_addr >> PKT_ADDRL_LEN));
+   upper_32_bits(dma_addr)));
 
dma_wmb();
raw_desc->m0 = cpu_to_le64(SET_BITS(PKT_ADDRL, dma_addr) |
@@ -177,6 +177,194 @@ static void xge_free_irq(struct net_device *ndev)
devm_free_irq(dev, pdata->resources.irq, pdata);
 }
 
+static bool is_tx_slot_available(struct xge_raw_desc *raw_desc)
+{
+   if (GET_BITS(E, le64_to_cpu(raw_desc->m0)) &&
+   (GET_BITS(PKT_SIZE, le64_to_cpu(raw_desc->m0)) == SLOT_EMPTY))
+   return true;
+
+   return false;
+}
+
+static netdev_tx_t xge_start_xmit(struct sk_buff *skb, struct net_device *ndev)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+   struct device *dev = &pdata->pdev->dev;
+   static dma_addr_t dma_addr;
+   struct xge_desc_ring *tx_ring;
+   struct xge_raw_desc *raw_desc;
+   u64 addr_lo, addr_hi;
+   void *pkt_buf;
+   u8 tail;
+   u16 len;
+
+   tx_ring = pdata->tx_ring;
+   tail = tx_ring->tail;
+   len = skb_headlen(skb);
+   raw_desc = &tx_ring->raw_desc[tail];
+
+   if (!is_tx_slot_available(raw_desc)) {
+   netif_stop_queue(ndev);
+   return NETDEV_TX_BUSY;
+   }
+
+   /* Packet buffers should be 64B aligned */
+   pkt_buf = dma_zalloc_coherent(dev, XGENE_ENET_STD_MTU, &dma_addr,
+ GFP_ATOMIC);
+   if (unlikely(!pkt_buf)) {
+   dev_kfree_skb_any(skb);
+   return NETDEV_TX_OK;
+   }
+   memcpy(pkt_buf, skb->data, len);
+
+   addr_hi = GET_BITS(NEXT_DESC_ADDRH, le64_to_cpu(raw_desc->m1));
+   addr_lo = GET_BITS(NEXT_DESC_ADDRL, le64_to_cpu(raw_desc->m1));
+   raw_desc->m1 = cpu_to_le64(SET_BITS(NEXT_DESC_ADDRL, addr_lo) |
+  SET_BITS(NEXT_DESC_ADDRH, addr_hi) |
+  SET_BITS(PKT_ADDRH,
+   upper_32_bits(dma_addr)));
+
+

[PATCH net 0/2] bpf: htab fixes

2017-03-07 Thread Alexei Starovoitov

Two bpf hashtable fixes. See individual patches for details.

Alexei Starovoitov (2):
  bpf: fix struct htab_elem layout
  bpf: convert htab map to hlist_nulls

 include/linux/list_nulls.h|   5 ++
 include/linux/rculist_nulls.h |  14 +
 kernel/bpf/hashtab.c  | 119 --
 3 files changed, 99 insertions(+), 39 deletions(-)

-- 
2.8.0

[PATCH net 1/2] bpf: fix struct htab_elem layout

2017-03-07 Thread Alexei Starovoitov

when htab_elem is removed from the bucket list the htab_elem.hash_node.next
field should not be overridden too early otherwise we have a tiny race window
between lookup and delete.
The bug was discovered by manual code analysis and reproducible
only with explicit udelay() in lookup_elem_raw().

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Reported-by: Jonathan Perry 
Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/hashtab.c | 25 -
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ea87fb19a94..63c86a7be2a1 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -45,8 +45,13 @@ enum extra_elem_state {
 struct htab_elem {
union {
struct hlist_node hash_node;
-   struct bpf_htab *htab;
-   struct pcpu_freelist_node fnode;
+   struct {
+   void *padding;
+   union {
+   struct bpf_htab *htab;
+   struct pcpu_freelist_node fnode;
+   };
+   };
};
union {
struct rcu_head rcu;
@@ -162,7 +167,8 @@ static int prealloc_init(struct bpf_htab *htab)
 offsetof(struct htab_elem, lru_node),
 htab->elem_size, htab->map.max_entries);
else
-   pcpu_freelist_populate(>freelist, htab->elems,
+   pcpu_freelist_populate(>freelist,
+  htab->elems + offsetof(struct htab_elem, 
fnode),
   htab->elem_size, htab->map.max_entries);
 
return 0;
@@ -217,6 +223,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
int err, i;
u64 cost;
 
+   BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
+offsetof(struct htab_elem, hash_node.pprev));
+   BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
+offsetof(struct htab_elem, hash_node.pprev));
+
if (lru && !capable(CAP_SYS_ADMIN))
/* LRU implementation is much complicated than other
 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
@@ -582,9 +593,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab 
*htab, void *key,
int err = 0;
 
if (prealloc) {
-   l_new = (struct htab_elem *)pcpu_freelist_pop(>freelist);
-   if (!l_new)
+   struct pcpu_freelist_node *l;
+
+   l = pcpu_freelist_pop(>freelist);
+   if (!l)
err = -E2BIG;
+   else
+   l_new = container_of(l, struct htab_elem, fnode);
} else {
if (atomic_inc_return(>count) > htab->map.max_entries) {
atomic_dec(>count);
-- 
2.8.0

[PATCH net 2/2] bpf: convert htab map to hlist_nulls

2017-03-07 Thread Alexei Starovoitov

when all map elements are pre-allocated one cpu can delete and reuse htab_elem
while another cpu is still walking the hlist. In such case the lookup may
miss the element. Convert hlist to hlist_nulls to avoid such scenario.
When bucket lock is taken there is no need to take such precautions,
so only convert map_lookup and map_get_next to nulls.
The race window is extremely small and only reproducible with explicit
udelay() inside lookup_nulls_elem_raw()

Similar to hlist add hlist_nulls_for_each_entry_safe() and
hlist_nulls_entry_safe() helpers.

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Reported-by: Jonathan Perry 
Signed-off-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 include/linux/list_nulls.h|  5 +++
 include/linux/rculist_nulls.h | 14 +++
 kernel/bpf/hashtab.c  | 94 +++
 3 files changed, 79 insertions(+), 34 deletions(-)

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index b01fe1009084..87ff4f58a2f0 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -29,6 +29,11 @@ struct hlist_nulls_node {
((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
 
 #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+   ({ typeof(ptr) ptr = (ptr); \
+  !is_a_nulls(ptr) ? hlist_nulls_entry(ptr, type, member) : 
NULL; \
+   })
 /**
  * ptr_is_a_nulls - Test if a ptr is a nulls
  * @ptr: ptr to be tested
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 4ae95f7e8597..a23a33153180 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -156,5 +156,19 @@ static inline void hlist_nulls_add_tail_rcu(struct 
hlist_nulls_node *n,
({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); 
\
pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
 
+/**
+ * hlist_nulls_for_each_entry_safe -
+ *   iterate over list of given type safe against removal of list entry
+ * @tpos:  the type * to use as a loop cursor.
+ * @pos:   the  hlist_nulls_node to use as a loop cursor.
+ * @head:  the head for your list.
+ * @member:the name of the hlist_nulls_node within the struct.
+ */
+#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member)   
\
+   for (({barrier();}),
\
+pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));
\
+   (!is_a_nulls(pos)) &&   
\
+   ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member);
\
+  pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
 #endif
 #endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 63c86a7be2a1..afe5bab376c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,11 +13,12 @@
 #include 
 #include 
 #include 
+#include 
 #include "percpu_freelist.h"
 #include "bpf_lru_list.h"
 
 struct bucket {
-   struct hlist_head head;
+   struct hlist_nulls_head head;
raw_spinlock_t lock;
 };
 
@@ -44,7 +45,7 @@ enum extra_elem_state {
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
union {
-   struct hlist_node hash_node;
+   struct hlist_nulls_node hash_node;
struct {
void *padding;
union {
@@ -337,7 +338,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_htab;
 
for (i = 0; i < htab->n_buckets; i++) {
-   INIT_HLIST_HEAD(>buckets[i].head);
+   INIT_HLIST_NULLS_HEAD(>buckets[i].head, i);
raw_spin_lock_init(>buckets[i].lock);
}
 
@@ -377,28 +378,52 @@ static inline struct bucket *__select_bucket(struct 
bpf_htab *htab, u32 hash)
return >buckets[hash & (htab->n_buckets - 1)];
 }
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, 
u32 hash)
 {
return &__select_bucket(htab, hash)->head;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 
hash,
 void *key, u32 key_size)
 {
+   struct hlist_nulls_node *n;
struct htab_elem *l;
 
-   hlist_for_each_entry_rcu(l, head, hash_node)
+   hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l->hash == hash && !memcmp(>key, key, key_size))
return l;
 
return NULL;
 }
 
+/* can be

Re: [PATCH v4 net-next 4/6] drivers: net: xgene-v2: Add base driver

2017-03-07 Thread Rami Rosen

Hi,
One minor comment:

The return type of xge_init_hw() should be changed to be void, as
the method xge_port_reset() always returns 0;  and also the return type
of xge_port_reset() should be changed to be void, it never fails; see
in [PATCH v4 net-next 3/6] drivers: net: xgene-v2: Add ethernet
hardware configuration.


+static int xge_init_hw(struct net_device *ndev)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+   int ret;
+
+   ret = xge_port_reset(ndev);
+   if (ret)
+   return ret;
+
+   xge_port_init(ndev);
+   pdata->nbufs = NUM_BUFS;
+
+   return 0;
+}

Regards,
Rami Rosen

Re: [PATCH net-next RFC 3/4] vhost: interrupt coalescing support

2017-03-07 Thread Jason Wang




On 2017年03月07日 01:31, Willem de Bruijn wrote:

On Mon, Mar 6, 2017 at 4:28 AM, Jason Wang  wrote:


On 2017年03月03日 22:39, Willem de Bruijn wrote:

+void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq);
+static enum hrtimer_restart vhost_coalesce_timer(struct hrtimer *timer)
+{
+   struct vhost_virtqueue *vq =
+   container_of(timer, struct vhost_virtqueue, ctimer);
+
+   if (mutex_trylock(&vq->mutex)) {
+   vq->coalesce_frames = vq->max_coalesce_frames;
+   vhost_signal(vq->dev, vq);
+   mutex_unlock(&vq->mutex);
+   }
+
+   /* TODO: restart if lock failed and not held by handle_tx */
+   return HRTIMER_NORESTART;
+}
+


Then we may lose an interrupt forever if no new tx request? I believe we
need e.g vhost_poll_queue() here.

Absolutely, I need to fix this. The common case for failing to grab
the lock is competition with handle_tx. With careful coding we can
probably avoid scheduling another run with vhost_poll_queue in
the common case.


Yes, probably add some checking after releasing the mutex_lock in 
handle_tx().


Thanks



Your patch v7 cancels the pending hrtimer at the start of handle_tx.
I need to reintroduce that, and also only schedule a timer at the end
of handle_tx, not immediately when vq->coalesce_frames becomes
non-zero.

Re: [PATCH net] tcp: fix various issues for sockets morphing to listen state

2017-03-07 Thread David Miller

From: Eric Dumazet 
Date: Fri, 03 Mar 2017 14:08:21 -0800

> From: Eric Dumazet 
> 
> Dmitry Vyukov reported a divide by 0 triggered by syzkaller, exploiting
> tcp_disconnect() path that was never really considered and/or used
> before syzkaller ;)
> 
> I was not able to reproduce the bug, but it seems issues here are the
> three possible actions that assumed they would never trigger on a
> listener.
> 
> 1) tcp_write_timer_handler
> 2) tcp_delack_timer_handler
> 3) MTU reduction
> 
> Only IPv6 MTU reduction was properly testing TCP_CLOSE and TCP_LISTEN
>  states from tcp_v6_mtu_reduced()
> 
> 
> Signed-off-by: Eric Dumazet 
> Reported-by: Dmitry Vyukov 

Applied and queued up for -stable.

Re: [PATCH net] rxrpc: Call state should be read with READ_ONCE() under some circumstances

2017-03-07 Thread David Miller

From: David Howells 
Date: Sat, 04 Mar 2017 00:01:41 +

> The call state may be changed at any time by the data-ready routine in
> response to received packets, so if the call state is to be read and acted
> upon several times in a function, READ_ONCE() must be used unless the call
> state lock is held.
> 
> Signed-off-by: David Howells 

Applied.

Re: [PATCH] net: initialize msg.msg_flags in recvfrom

2017-03-07 Thread Alexander Potapenko

On Tue, Mar 7, 2017 at 3:26 PM, Eric Dumazet  wrote:
> On Tue, 2017-03-07 at 14:58 +0100, Alexander Potapenko wrote:
>> KMSAN (KernelMemorySanitizer, a new error detection tool) reports use
>> of uninitialized memory in put_cmsg()):
>
> I would prefer that you do not put the stack trace in the changelog,
> same for the reproducer since this has little value in understanding the
> impact.
Understood.
Should be ok to put the report/reproducer below the triple dash, right?

> It looks like a false positive, but you do not say.
Ah, now I see.
Irrespective of the value of (MSG_CMSG_COMPAT & msg->msg_flags) the
code will return 0 either directly from put_cmsg(), or from
put_cmsg_compat().
I wouldn't call this a false positive, as KMSAN can't possibly know
that both branches taken depending on the uninitialized condition are
safe.
But I can imagine this to be less of a problem to the code owners who
do know that :)

> recvmsg() does not care about msg.msg_flags, only KMSAN.
>
> (The important part is that msg.msg_control and msg.msg_controllen are
> 0)
>
> Fine to avoid the false positive, but better be explicit in the
> changelog and say there is no visible effect for this bug.
Ok, I'll change the description.
> If there is a visible effect, please state so instead of technical
> details.
>
> We try to reduce S/N in the changelogs ;)
>
> Thanks a lot !
>
>



-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Matthew Scott Sucherman, Paul Terence Manicle
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

Re: please add some examples to the ip man page

2017-03-07 Thread David Ahern

On 3/7/17 6:25 PM, Stephen Hemminger wrote:
> On Wed, 08 Mar 2017 08:46:01 +0800
> 積丹尼 Dan Jacobson  wrote:
> 
>>> "SH" == Stephen Hemminger  writes:  
>>
>> SH> Sure. Submit a patch.  
>>
>> If only I could figure out how to use the command.
> 
> $ ip link show
> 
> $ ip addr show
> 
> $ ip route
> 

a lot of examples here:
https://www.kernel.org/doc/Documentation/networking/vrf.txt

Re: [PATCH v4 net-next 4/6] drivers: net: xgene-v2: Add base driver

2017-03-07 Thread Florian Fainelli

On 03/07/2017 05:08 PM, Iyappan Subramanian wrote:
> This patch adds,
> 
>  - probe, remove, shutdown
>  - open, close and stats
>  - create and delete ring
>  - request and delete irq
> 
> Signed-off-by: Iyappan Subramanian 
> Signed-off-by: Keyur Chudgar 
> ---

> + pdata->resources.phy_mode = phy_mode;
> +
> + if (pdata->resources.phy_mode != PHY_INTERFACE_MODE_RGMII) {
> + dev_err(dev, "Incorrect phy-connection-type specified\n");
> + return -ENODEV;
> + }

This does not take into account all other PHY_INTERFACE_MODE_RGMII
variants, is that really intentional here?

> +
> + ret = platform_get_irq(pdev, 0);
> + if (ret <= 0) {

0 can be a valid interrupt on some platforms AFAIR, so you may want to
just check < 0.

> + dev_err(dev, "Unable to get ENET IRQ\n");
> + ret = ret ? : -ENXIO;
> + return ret;
> + }
> + pdata->resources.irq = ret;
> +

> +static int xge_request_irq(struct net_device *ndev)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + struct device *dev = >pdev->dev;
> + int ret;
> +
> + snprintf(pdata->irq_name, IRQ_ID_SIZE, "%s", ndev->name);
> +
> + ret = devm_request_irq(dev, pdata->resources.irq, xge_irq,
> +0, pdata->irq_name, pdata);
> + if (ret)
> + netdev_err(ndev, "Failed to request irq %s\n", pdata->irq_name);

The preference for network drivers is to manage the request_irq() in the
ndo_open() callback and free_irq() in the ndo_close() which kind of
defeats the purpose of using devm_* functions for that purpose.


> +static int xge_open(struct net_device *ndev)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + int ret;
> +
> + ret = xge_create_desc_rings(ndev);
> + if (ret)
> + return ret;
> +
> + napi_enable(>napi);
> + ret = xge_request_irq(ndev);
> + if (ret)
> + return ret;
> +
> + xge_intr_enable(pdata);
> + xge_wr_csr(pdata, DMARXCTRL, 1);
> + xge_mac_enable(pdata);
> + netif_start_queue(ndev);
> + netif_carrier_on(ndev);

Can't you use PHYLIB to get the link indication and not manage the link
state manually here? Setting netif_carrier_on() without checking the
actual physical medium is just plain wrong.


> +static void xge_timeout(struct net_device *ndev)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> +
> + rtnl_lock();
> +
> + if (netif_running(ndev)) {

Reduce indentation here.

> + netif_carrier_off(ndev);
> + netif_stop_queue(ndev);
> + xge_intr_disable(pdata);
> + napi_disable(>napi);
> +

> +static int xge_probe(struct platform_device *pdev)
> +{
> + struct device *dev = >dev;
> + struct net_device *ndev;
> + struct xge_pdata *pdata;
> + int ret;
> +
> + ndev = alloc_etherdev(sizeof(struct xge_pdata));

sizeof(*pdata).
-- 
Florian

[PATCH v4 net-next 0/6] drivers: net: xgene-v2: Add RGMII based 1G driver

2017-03-07 Thread Iyappan Subramanian

This patch set adds support for RGMII based 1GbE hardware which uses a linked
list of DMA descriptor architecture (v2) for APM X-Gene SoCs.

Signed-off-by: Iyappan Subramanian 
---
v4: Address review comments from v3
- fixed local variable declarations to reverse christmas tree order

v3: Address review comments from v2
- fixed kbuild warnings (this 'if' clause does not guard)

v2: Address review comments from v1
- moved create_desc_ring and delete_desc_ring to open() and close()
  respectively
- changed to use dma_zalloc APIs
- fixed tx_timeout()
- removed tx completion polling upper bound
- added error checking on rx packets
- added netif_stop_queue() and netif_wake_queue()

v1:
- Initial version
---

Iyappan Subramanian (6):
  drivers: net: xgene-v2: Add DMA descriptor
  drivers: net: xgene-v2: Add mac configuration
  drivers: net: xgene-v2: Add ethernet hardware configuration
  drivers: net: xgene-v2: Add base driver
  drivers: net: xgene-v2: Add transmit and receive
  MAINTAINERS: Add entry for APM X-Gene SoC Ethernet (v2) driver

 MAINTAINERS|   6 +
 drivers/net/ethernet/apm/Kconfig   |   1 +
 drivers/net/ethernet/apm/Makefile  |   1 +
 drivers/net/ethernet/apm/xgene-v2/Kconfig  |  11 +
 drivers/net/ethernet/apm/xgene-v2/Makefile |   6 +
 drivers/net/ethernet/apm/xgene-v2/enet.c   |  71 +++
 drivers/net/ethernet/apm/xgene-v2/enet.h   |  43 ++
 drivers/net/ethernet/apm/xgene-v2/mac.c| 116 +
 drivers/net/ethernet/apm/xgene-v2/mac.h| 110 +
 drivers/net/ethernet/apm/xgene-v2/main.c   | 756 +
 drivers/net/ethernet/apm/xgene-v2/main.h   |  75 +++
 drivers/net/ethernet/apm/xgene-v2/ring.c   |  81 
 drivers/net/ethernet/apm/xgene-v2/ring.h   | 119 +
 13 files changed, 1396 insertions(+)
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/Kconfig
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/Makefile
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/enet.c
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/enet.h
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/mac.c
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/mac.h
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/main.c
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/main.h
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/ring.c
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/ring.h

-- 
1.9.1

[PATCH v4 net-next 4/6] drivers: net: xgene-v2: Add base driver

2017-03-07 Thread Iyappan Subramanian

This patch adds,

 - probe, remove, shutdown
 - open, close and stats
 - create and delete ring
 - request and delete irq

Signed-off-by: Iyappan Subramanian 
Signed-off-by: Keyur Chudgar 
---
 drivers/net/ethernet/apm/xgene-v2/main.c | 510 +++
 1 file changed, 510 insertions(+)
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/main.c

diff --git a/drivers/net/ethernet/apm/xgene-v2/main.c 
b/drivers/net/ethernet/apm/xgene-v2/main.c
new file mode 100644
index 000..c96b4cc
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/main.c
@@ -0,0 +1,510 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan Subramanian 
+ *   Keyur Chudgar 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+static const struct acpi_device_id xge_acpi_match[];
+
+static int xge_get_resources(struct xge_pdata *pdata)
+{
+   struct platform_device *pdev;
+   struct net_device *ndev;
+   struct device *dev;
+   struct resource *res;
+   int phy_mode, ret = 0;
+
+   pdev = pdata->pdev;
+   dev = >dev;
+   ndev = pdata->ndev;
+
+   res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+   if (!res) {
+   dev_err(dev, "Resource enet_csr not defined\n");
+   return -ENODEV;
+   }
+
+   pdata->resources.base_addr = devm_ioremap(dev, res->start,
+ resource_size(res));
+   if (!pdata->resources.base_addr) {
+   dev_err(dev, "Unable to retrieve ENET Port CSR region\n");
+   return -ENOMEM;
+   }
+
+   if (!device_get_mac_address(dev, ndev->dev_addr, ETH_ALEN))
+   eth_hw_addr_random(ndev);
+
+   memcpy(ndev->perm_addr, ndev->dev_addr, ndev->addr_len);
+
+   phy_mode = device_get_phy_mode(dev);
+   if (phy_mode < 0) {
+   dev_err(dev, "Unable to get phy-connection-type\n");
+   return phy_mode;
+   }
+   pdata->resources.phy_mode = phy_mode;
+
+   if (pdata->resources.phy_mode != PHY_INTERFACE_MODE_RGMII) {
+   dev_err(dev, "Incorrect phy-connection-type specified\n");
+   return -ENODEV;
+   }
+
+   ret = platform_get_irq(pdev, 0);
+   if (ret <= 0) {
+   dev_err(dev, "Unable to get ENET IRQ\n");
+   ret = ret ? : -ENXIO;
+   return ret;
+   }
+   pdata->resources.irq = ret;
+
+   return 0;
+}
+
+static int xge_refill_buffers(struct net_device *ndev, u32 nbuf)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+   struct xge_desc_ring *ring = pdata->rx_ring;
+   const u8 slots = XGENE_ENET_NUM_DESC - 1;
+   struct device *dev = >pdev->dev;
+   struct xge_raw_desc *raw_desc;
+   u64 addr_lo, addr_hi;
+   u8 tail = ring->tail;
+   struct sk_buff *skb;
+   dma_addr_t dma_addr;
+   u16 len;
+   int i;
+
+   for (i = 0; i < nbuf; i++) {
+   raw_desc = >raw_desc[tail];
+
+   len = XGENE_ENET_STD_MTU;
+   skb = netdev_alloc_skb(ndev, len);
+   if (unlikely(!skb))
+   return -ENOMEM;
+
+   dma_addr = dma_map_single(dev, skb->data, len, DMA_FROM_DEVICE);
+   if (dma_mapping_error(dev, dma_addr)) {
+   netdev_err(ndev, "DMA mapping error\n");
+   dev_kfree_skb_any(skb);
+   return -EINVAL;
+   }
+
+   ring->pkt_info[tail].skb = skb;
+   ring->pkt_info[tail].dma_addr = dma_addr;
+
+   addr_hi = GET_BITS(NEXT_DESC_ADDRH, le64_to_cpu(raw_desc->m1));
+   addr_lo = GET_BITS(NEXT_DESC_ADDRL, le64_to_cpu(raw_desc->m1));
+   raw_desc->m1 = cpu_to_le64(SET_BITS(NEXT_DESC_ADDRL, addr_lo) |
+  SET_BITS(NEXT_DESC_ADDRH, addr_hi) |
+  SET_BITS(PKT_ADDRH,
+   dma_addr >> PKT_ADDRL_LEN));
+
+   dma_wmb();
+   raw_desc->m0 = cpu_to_le64(SET_BITS(PKT_ADDRL, dma_addr) |
+

[PATCH v4 net-next 6/6] MAINTAINERS: Add entry for APM X-Gene SoC Ethernet (v2) driver

2017-03-07 Thread Iyappan Subramanian

This patch adds a MAINTAINERS entry for the ethernet driver for
the on-chip ethernet interface which uses a linked list of DMA
descriptor architecture (v2) for APM X-Gene SoCs.

Signed-off-by: Iyappan Subramanian 
Signed-off-by: Keyur Chudgar 
---
 MAINTAINERS  | 6 ++
 drivers/net/ethernet/apm/xgene-v2/mac.c  | 2 +-
 drivers/net/ethernet/apm/xgene-v2/main.c | 8 
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index c265a5f..e04d3a6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -902,6 +902,12 @@ F: drivers/net/phy/mdio-xgene.c
 F: Documentation/devicetree/bindings/net/apm-xgene-enet.txt
 F: Documentation/devicetree/bindings/net/apm-xgene-mdio.txt
 
+APPLIED MICRO (APM) X-GENE SOC ETHERNET (V2) DRIVER
+M: Iyappan Subramanian 
+M: Keyur Chudgar 
+S: Supported
+F: drivers/net/ethernet/apm/xgene-v2/
+
 APPLIED MICRO (APM) X-GENE SOC PMU
 M: Tai Nguyen 
 S: Supported
diff --git a/drivers/net/ethernet/apm/xgene-v2/mac.c 
b/drivers/net/ethernet/apm/xgene-v2/mac.c
index 9c3d32d..c3189de 100644
--- a/drivers/net/ethernet/apm/xgene-v2/mac.c
+++ b/drivers/net/ethernet/apm/xgene-v2/mac.c
@@ -77,8 +77,8 @@ static void xge_mac_set_speed(struct xge_pdata *pdata)
 
 void xge_mac_set_station_addr(struct xge_pdata *pdata)
 {
-   u32 addr0, addr1;
u8 *dev_addr = pdata->ndev->dev_addr;
+   u32 addr0, addr1;
 
addr0 = (dev_addr[3] << 24) | (dev_addr[2] << 16) |
(dev_addr[1] << 8) | dev_addr[0];
diff --git a/drivers/net/ethernet/apm/xgene-v2/main.c 
b/drivers/net/ethernet/apm/xgene-v2/main.c
index b16ef43..ae76977 100644
--- a/drivers/net/ethernet/apm/xgene-v2/main.c
+++ b/drivers/net/ethernet/apm/xgene-v2/main.c
@@ -27,9 +27,9 @@ static int xge_get_resources(struct xge_pdata *pdata)
 {
struct platform_device *pdev;
struct net_device *ndev;
-   struct device *dev;
-   struct resource *res;
int phy_mode, ret = 0;
+   struct resource *res;
+   struct device *dev;
 
pdev = pdata->pdev;
dev = >dev;
@@ -190,9 +190,9 @@ static netdev_tx_t xge_start_xmit(struct sk_buff *skb, 
struct net_device *ndev)
 {
struct xge_pdata *pdata = netdev_priv(ndev);
struct device *dev = >pdev->dev;
-   static dma_addr_t dma_addr;
struct xge_desc_ring *tx_ring;
struct xge_raw_desc *raw_desc;
+   static dma_addr_t dma_addr;
u64 addr_lo, addr_hi;
void *pkt_buf;
u8 tail;
@@ -526,7 +526,7 @@ static int xge_close(struct net_device *ndev)
 static int xge_napi(struct napi_struct *napi, const int budget)
 {
struct net_device *ndev = napi->dev;
-   struct xge_pdata *pdata = netdev_priv(ndev);
+   struct xge_pdata *pdata;
int processed;
 
pdata = netdev_priv(ndev);
-- 
1.9.1

Re: linux-next: build failure after merge of the rcu tree

2017-03-07 Thread Stephen Rothwell

Hi Paul,

On Mon, 13 Feb 2017 17:43:24 +1100 Stephen Rothwell  
wrote:
>
> On Sun, 12 Feb 2017 20:37:48 -0800 "Paul E. McKenney" 
>  wrote:
> >
> > I chickened out on that commit for this merge window, so it will come
> > back at -rc1.  But I will cover that when I rebase to -rc1.  
> 
> OK, thanks.

[PATCH] smc: merge fix for "mm: Rename SLAB_DESTROY_BY_RCU to 
SLAB_TYPESAFE_BY_RCU"
is needed again ... maybe time to rebase?

-- 
Cheers,
Stephen Rothwell

[PATCH v4 net-next 3/6] drivers: net: xgene-v2: Add ethernet hardware configuration

2017-03-07 Thread Iyappan Subramanian

This patch adds functions to configure ethernet hardware.

Signed-off-by: Iyappan Subramanian 
Signed-off-by: Keyur Chudgar 
---
 drivers/net/ethernet/apm/xgene-v2/enet.c | 71 
 drivers/net/ethernet/apm/xgene-v2/enet.h | 43 +++
 2 files changed, 114 insertions(+)
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/enet.c
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/enet.h

diff --git a/drivers/net/ethernet/apm/xgene-v2/enet.c 
b/drivers/net/ethernet/apm/xgene-v2/enet.c
new file mode 100644
index 000..b49edee
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/enet.c
@@ -0,0 +1,71 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan Subramanian 
+ *   Keyur Chudgar 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+void xge_wr_csr(struct xge_pdata *pdata, u32 offset, u32 val)
+{
+   void __iomem *addr = pdata->resources.base_addr + offset;
+
+   iowrite32(val, addr);
+}
+
+u32 xge_rd_csr(struct xge_pdata *pdata, u32 offset)
+{
+   void __iomem *addr = pdata->resources.base_addr + offset;
+
+   return ioread32(addr);
+}
+
+int xge_port_reset(struct net_device *ndev)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+
+   xge_wr_csr(pdata, ENET_SRST, 0x3);
+   xge_wr_csr(pdata, ENET_SRST, 0x2);
+   xge_wr_csr(pdata, ENET_SRST, 0x0);
+
+   xge_wr_csr(pdata, ENET_SHIM, DEVM_ARAUX_COH | DEVM_AWAUX_COH);
+
+   return 0;
+}
+
+static void xge_traffic_resume(struct net_device *ndev)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+
+   xge_wr_csr(pdata, CFG_FORCE_LINK_STATUS_EN, 1);
+   xge_wr_csr(pdata, FORCE_LINK_STATUS, 1);
+
+   xge_wr_csr(pdata, CFG_LINK_AGGR_RESUME, 1);
+   xge_wr_csr(pdata, RX_DV_GATE_REG, 1);
+}
+
+int xge_port_init(struct net_device *ndev)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+
+   pdata->phy_speed = SPEED_1000;
+   xge_mac_init(pdata);
+   xge_traffic_resume(ndev);
+
+   return 0;
+}
diff --git a/drivers/net/ethernet/apm/xgene-v2/enet.h 
b/drivers/net/ethernet/apm/xgene-v2/enet.h
new file mode 100644
index 000..40371cf
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/enet.h
@@ -0,0 +1,43 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan Subramanian 
+ *   Keyur Chudgar 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __XGENE_ENET_V2_ENET_H__
+#define __XGENE_ENET_V2_ENET_H__
+
+#define ENET_CLKEN 0xc008
+#define ENET_SRST  0xc000
+#define ENET_SHIM  0xc010
+#define CFG_MEM_RAM_SHUTDOWN   0xd070
+#define BLOCK_MEM_RDY  0xd074
+
+#define DEVM_ARAUX_COH BIT(19)
+#define DEVM_AWAUX_COH BIT(3)
+
+#define CFG_FORCE_LINK_STATUS_EN   0x229c
+#define FORCE_LINK_STATUS  0x22a0
+#define CFG_LINK_AGGR_RESUME   0x27c8
+#define RX_DV_GATE_REG 0x2dfc
+
+void xge_wr_csr(struct xge_pdata *pdata, u32 offset, u32 val);
+u32 xge_rd_csr(struct xge_pdata *pdata, u32 offset);
+int xge_port_reset(struct net_device *ndev);
+
+#endif  /* __XGENE_ENET_V2_ENET_H__ */
-- 
1.9.1

Re: please add some examples to the ip man page

2017-03-07 Thread Stephen Hemminger

On Wed, 08 Mar 2017 08:46:01 +0800
積丹尼 Dan Jacobson  wrote:

> > "SH" == Stephen Hemminger  writes:  
> 
> SH> Sure. Submit a patch.  
> 
> If only I could figure out how to use the command.

$ ip link show

$ ip addr show

$ ip route

Question on ixgbe flow director

2017-03-07 Thread tndave


Hi,

I have a few questions regarding the ixgbe flow director.
As per my understanding, the flow director in ixgbe can work in 2 mutually exclusive ways:
a. Using ATR filters - where the flow director is set up in HW by the driver
identifying transmit traffic. Based on that, receive traffic of the
same flow gets assigned/directed to the same queue.

b. Perfect filters, where the user can manually program the flow director using
ethtool so that received packets get directed to a specified rx queue
(depending on the ethtool flow-type, action, etc.). But with
perfect filters there is no intelligence involved like ATR has in
identifying transmit traffic, right?

A few questions regarding ixgbe ATR:
1. Does ATR work if the protocol is UDP? (Based on the current
ixgbe_atr(), it only supports TCP.)
2. Can the ATR flow director be programmed using ethtool?
(As per my understanding, only perfect filters can be programmed from
ethtool; is that so?)

Thanks in advance,
-Tushar

[PATCH v4 net-next 2/6] drivers: net: xgene-v2: Add mac configuration

2017-03-07 Thread Iyappan Subramanian

This patch adds functions to configure and control mac.  This
patch also adds helper functions to get/set registers.

Signed-off-by: Iyappan Subramanian 
Signed-off-by: Keyur Chudgar 
---
 drivers/net/ethernet/apm/xgene-v2/mac.c | 116 
 drivers/net/ethernet/apm/xgene-v2/mac.h | 110 ++
 2 files changed, 226 insertions(+)
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/mac.c
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/mac.h

diff --git a/drivers/net/ethernet/apm/xgene-v2/mac.c 
b/drivers/net/ethernet/apm/xgene-v2/mac.c
new file mode 100644
index 000..9c3d32d
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/mac.c
@@ -0,0 +1,116 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan Subramanian 
+ *   Keyur Chudgar 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+void xge_mac_reset(struct xge_pdata *pdata)
+{
+   xge_wr_csr(pdata, MAC_CONFIG_1, SOFT_RESET);
+   xge_wr_csr(pdata, MAC_CONFIG_1, 0);
+}
+
+static void xge_mac_set_speed(struct xge_pdata *pdata)
+{
+   u32 icm0, icm2, ecm0, mc2;
+   u32 intf_ctrl, rgmii;
+
+   icm0 = xge_rd_csr(pdata, ICM_CONFIG0_REG_0);
+   icm2 = xge_rd_csr(pdata, ICM_CONFIG2_REG_0);
+   ecm0 = xge_rd_csr(pdata, ECM_CONFIG0_REG_0);
+   rgmii = xge_rd_csr(pdata, RGMII_REG_0);
+   mc2 = xge_rd_csr(pdata, MAC_CONFIG_2);
+   intf_ctrl = xge_rd_csr(pdata, INTERFACE_CONTROL);
+   icm2 |= CFG_WAITASYNCRD_EN;
+
+   switch (pdata->phy_speed) {
+   case SPEED_10:
+   SET_REG_BITS(, INTF_MODE, 1);
+   SET_REG_BITS(_ctrl, HD_MODE, 0);
+   SET_REG_BITS(, CFG_MACMODE, 0);
+   SET_REG_BITS(, CFG_WAITASYNCRD, 500);
+   SET_REG_BIT(, CFG_SPEED_125, 0);
+   break;
+   case SPEED_100:
+   SET_REG_BITS(, INTF_MODE, 1);
+   SET_REG_BITS(_ctrl, HD_MODE, 1);
+   SET_REG_BITS(, CFG_MACMODE, 1);
+   SET_REG_BITS(, CFG_WAITASYNCRD, 80);
+   SET_REG_BIT(, CFG_SPEED_125, 0);
+   break;
+   default:
+   SET_REG_BITS(, INTF_MODE, 2);
+   SET_REG_BITS(_ctrl, HD_MODE, 2);
+   SET_REG_BITS(, CFG_MACMODE, 2);
+   SET_REG_BITS(, CFG_WAITASYNCRD, 16);
+   SET_REG_BIT(, CFG_SPEED_125, 1);
+   break;
+   }
+
+   mc2 |= FULL_DUPLEX | CRC_EN | PAD_CRC;
+   SET_REG_BITS(, CFG_WFIFOFULLTHR, 0x32);
+
+   xge_wr_csr(pdata, MAC_CONFIG_2, mc2);
+   xge_wr_csr(pdata, INTERFACE_CONTROL, intf_ctrl);
+   xge_wr_csr(pdata, RGMII_REG_0, rgmii);
+   xge_wr_csr(pdata, ICM_CONFIG0_REG_0, icm0);
+   xge_wr_csr(pdata, ICM_CONFIG2_REG_0, icm2);
+   xge_wr_csr(pdata, ECM_CONFIG0_REG_0, ecm0);
+}
+
+void xge_mac_set_station_addr(struct xge_pdata *pdata)
+{
+   u32 addr0, addr1;
+   u8 *dev_addr = pdata->ndev->dev_addr;
+
+   addr0 = (dev_addr[3] << 24) | (dev_addr[2] << 16) |
+   (dev_addr[1] << 8) | dev_addr[0];
+   addr1 = (dev_addr[5] << 24) | (dev_addr[4] << 16);
+
+   xge_wr_csr(pdata, STATION_ADDR0, addr0);
+   xge_wr_csr(pdata, STATION_ADDR1, addr1);
+}
+
+void xge_mac_init(struct xge_pdata *pdata)
+{
+   xge_mac_reset(pdata);
+   xge_mac_set_speed(pdata);
+   xge_mac_set_station_addr(pdata);
+}
+
+void xge_mac_enable(struct xge_pdata *pdata)
+{
+   u32 data;
+
+   data = xge_rd_csr(pdata, MAC_CONFIG_1);
+   data |= TX_EN | RX_EN;
+   xge_wr_csr(pdata, MAC_CONFIG_1, data);
+
+   data = xge_rd_csr(pdata, MAC_CONFIG_1);
+}
+
+void xge_mac_disable(struct xge_pdata *pdata)
+{
+   u32 data;
+
+   data = xge_rd_csr(pdata, MAC_CONFIG_1);
+   data &= ~(TX_EN | RX_EN);
+   xge_wr_csr(pdata, MAC_CONFIG_1, data);
+}
diff --git a/drivers/net/ethernet/apm/xgene-v2/mac.h 
b/drivers/net/ethernet/apm/xgene-v2/mac.h
new file mode 100644
index 000..0fce6ae
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/mac.h
@@ -0,0 +1,110 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan

[PATCH v4 net-next 1/6] drivers: net: xgene-v2: Add DMA descriptor

2017-03-07 Thread Iyappan Subramanian

This patch adds DMA descriptor setup and interrupt enable/disable
functions.

Signed-off-by: Iyappan Subramanian 
Signed-off-by: Keyur Chudgar 
---
 drivers/net/ethernet/apm/xgene-v2/main.h |  74 +++
 drivers/net/ethernet/apm/xgene-v2/ring.c |  81 +
 drivers/net/ethernet/apm/xgene-v2/ring.h | 119 +++
 3 files changed, 274 insertions(+)
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/main.h
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/ring.c
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/ring.h

diff --git a/drivers/net/ethernet/apm/xgene-v2/main.h 
b/drivers/net/ethernet/apm/xgene-v2/main.h
new file mode 100644
index 000..a2f8712
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/main.h
@@ -0,0 +1,74 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan Subramanian 
+ *   Keyur Chudgar 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __XGENE_ENET_V2_MAIN_H__
+#define __XGENE_ENET_V2_MAIN_H__
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "mac.h"
+#include "enet.h"
+#include "ring.h"
+
+#define XGENE_ENET_V2_VERSION  "v1.0"
+#define XGENE_ENET_STD_MTU 1536
+#define XGENE_ENET_MIN_FRAME   60
+#define IRQ_ID_SIZE 16
+
+struct xge_resource {
+   void __iomem *base_addr;
+   int phy_mode;
+   u32 irq;
+};
+
+struct xge_stats {
+   u64 tx_packets;
+   u64 tx_bytes;
+   u64 rx_packets;
+   u64 rx_bytes;
+};
+
+/* ethernet private data */
+struct xge_pdata {
+   struct xge_resource resources;
+   struct xge_desc_ring *tx_ring;
+   struct xge_desc_ring *rx_ring;
+   struct platform_device *pdev;
+   char irq_name[IRQ_ID_SIZE];
+   struct net_device *ndev;
+   struct napi_struct napi;
+   struct xge_stats stats;
+   int phy_speed;
+   u8 nbufs;
+};
+
+#endif /* __XGENE_ENET_V2_MAIN_H__ */
diff --git a/drivers/net/ethernet/apm/xgene-v2/ring.c 
b/drivers/net/ethernet/apm/xgene-v2/ring.c
new file mode 100644
index 000..3881082
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/ring.c
@@ -0,0 +1,81 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan Subramanian 
+ *   Keyur Chudgar 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+/* create circular linked list of descriptors */
+void xge_setup_desc(struct xge_desc_ring *ring)
+{
+   struct xge_raw_desc *raw_desc;
+   dma_addr_t dma_h, next_dma;
+   u16 offset;
+   int i;
+
+   for (i = 0; i < XGENE_ENET_NUM_DESC; i++) {
+   raw_desc = >raw_desc[i];
+
+   offset = (i + 1) & (XGENE_ENET_NUM_DESC - 1);
+   next_dma = ring->dma_addr + (offset * XGENE_ENET_DESC_SIZE);
+
+   raw_desc->m0 = cpu_to_le64(SET_BITS(E, 1) |
+  SET_BITS(PKT_SIZE, SLOT_EMPTY));
+   dma_h = upper_32_bits(next_dma);
+   raw_desc->m1 = cpu_to_le64(SET_BITS(NEXT_DESC_ADDRL, next_dma) |
+  SET_BITS(NEXT_DESC_ADDRH, dma_h));
+   }
+}
+
+void xge_update_tx_desc_addr(struct xge_pdata *pdata)
+{
+   struct xge_desc_ring *ring = pdata->tx_ring;
+   dma_addr_t dma_addr = ring->dma_addr;
+
+   xge_wr_csr(pdata, DMATXDESCL, dma_addr);
+   xge_wr_csr(pdata, DMATXDESCH,

[PATCH net-next v2 2/2] mpls: allow TTL propagation from IP packets to be configured

2017-03-07 Thread Robert Shearman

Allow TTL propagation from IP packets to MPLS packets to be
configured. Add a new optional LWT attribute, MPLS_IPTUNNEL_TTL, which
allows the TTL to be set in the resulting MPLS packet, with the value
of 0 having the semantics of enabling propagation of the TTL from the
IP header (i.e. non-zero values disable propagation).

Also allow the configuration to be overridden globally by reusing the
same sysctl to control whether the TTL is propagated from IP packets
into the MPLS header. If the per-LWT attribute is set then it
overrides the global configuration. If the TTL isn't propagated then a
default TTL value is used which can be configured via a new sysctl,
"net.mpls.default_ttl". This is kept separate from the configuration
of whether IP TTL propagation is enabled as it can be used in the
future when non-IP payloads are supported (i.e. where there is no
payload TTL that can be propagated).

Signed-off-by: Robert Shearman 
---
 Documentation/networking/mpls-sysctl.txt |  8 
 include/net/mpls_iptunnel.h  |  2 +
 include/net/netns/mpls.h |  1 +
 include/uapi/linux/mpls_iptunnel.h   |  2 +
 net/mpls/af_mpls.c   | 11 ++
 net/mpls/mpls_iptunnel.c | 64 +---
 6 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/Documentation/networking/mpls-sysctl.txt 
b/Documentation/networking/mpls-sysctl.txt
index 9badd1d6685f..2f24a1912a48 100644
--- a/Documentation/networking/mpls-sysctl.txt
+++ b/Documentation/networking/mpls-sysctl.txt
@@ -30,6 +30,14 @@ ip_ttl_propagate - BOOL
0 - disabled / RFC 3443 [Short] Pipe Model
1 - enabled / RFC 3443 Uniform Model (default)
 
+default_ttl - INTEGER
+   Default TTL value to use for MPLS packets where it cannot be
+   propagated from an IP header, either because one isn't present
+   or ip_ttl_propagate has been disabled.
+
+   Possible values: 1 - 255
+   Default: 255
+
 conf//input - BOOL
Control whether packets can be input on this interface.
 
diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
index 179253f9dcfd..a18af6a16eb5 100644
--- a/include/net/mpls_iptunnel.h
+++ b/include/net/mpls_iptunnel.h
@@ -19,6 +19,8 @@
 struct mpls_iptunnel_encap {
u32 label[MAX_NEW_LABELS];
u8  labels;
+   u8  ttl_propagate;
+   u8  default_ttl;
 };
 
 static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct 
lwtunnel_state *lwtstate)
diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h
index 58e0e46c4a5c..1b68aed6e1b9 100644
--- a/include/net/netns/mpls.h
+++ b/include/net/netns/mpls.h
@@ -11,6 +11,7 @@ struct ctl_table_header;
 struct netns_mpls {
size_t platform_labels;
int ip_ttl_propagate;
+   int default_ttl;
struct mpls_route __rcu * __rcu *platform_label;
 
struct ctl_table_header *ctl;
diff --git a/include/uapi/linux/mpls_iptunnel.h 
b/include/uapi/linux/mpls_iptunnel.h
index d80a0498f77e..f5e45095b0bb 100644
--- a/include/uapi/linux/mpls_iptunnel.h
+++ b/include/uapi/linux/mpls_iptunnel.h
@@ -16,11 +16,13 @@
 /* MPLS tunnel attributes
  * [RTA_ENCAP] = {
  * [MPLS_IPTUNNEL_DST]
+ * [MPLS_IPTUNNEL_TTL]
  * }
  */
 enum {
MPLS_IPTUNNEL_UNSPEC,
MPLS_IPTUNNEL_DST,
+   MPLS_IPTUNNEL_TTL,
__MPLS_IPTUNNEL_MAX,
 };
 #define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1)
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index d4a51da8a0ce..a8710d334a60 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -34,6 +34,7 @@
 static int zero = 0;
 static int one = 1;
 static int label_limit = (1 << 20) - 1;
+static int ttl_max = 255;
 
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
   struct nlmsghdr *nlh, struct net *net, u32 portid,
@@ -2027,6 +2028,15 @@ static const struct ctl_table mpls_table[] = {
.extra1 = ,
.extra2 = ,
},
+   {
+   .procname   = "default_ttl",
+   .data   = MPLS_NS_SYSCTL_OFFSET(mpls.default_ttl),
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = ,
+   .extra2 = _max,
+   },
{ }
 };
 
@@ -2038,6 +2048,7 @@ static int mpls_net_init(struct net *net)
net->mpls.platform_labels = 0;
net->mpls.platform_label = NULL;
net->mpls.ip_ttl_propagate = 1;
+   net->mpls.default_ttl = 255;
 
table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL);
if (table == NULL)
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index e4e4424f9eb1..da2fb02e0f27 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -29,6 +29,7 @@
 
 static const struct nla_policy

[PATCH net-next v2 1/2] mpls: allow TTL propagation to IP packets to be configured

2017-03-07 Thread Robert Shearman

Provide the ability to control on a per-route basis whether the TTL
value from an MPLS packet is propagated to an IPv4/IPv6 packet when
the last label is popped as per the theoretical model in RFC 3443
through a new route attribute, RTA_TTL_PROPAGATE which can be 0 to
mean disable propagation and 1 to mean enable propagation.

In order to provide the ability to change the behaviour for packets
arriving with IPv4/IPv6 Explicit Null labels and to provide an easy
way for a user to change the behaviour for all existing routes without
having to reprogram them, a global knob is provided. This is done
through the addition of a new per-namespace sysctl,
"net.mpls.ip_ttl_propagate", which defaults to enabled. If the
per-route attribute is set (either enabled or disabled) then it
overrides the global configuration.

Signed-off-by: Robert Shearman 
---
 Documentation/networking/mpls-sysctl.txt | 11 
 include/net/netns/mpls.h |  2 +
 include/uapi/linux/rtnetlink.h   |  1 +
 net/mpls/af_mpls.c   | 88 ++--
 net/mpls/internal.h  |  7 +++
 5 files changed, 93 insertions(+), 16 deletions(-)

diff --git a/Documentation/networking/mpls-sysctl.txt 
b/Documentation/networking/mpls-sysctl.txt
index 15d8d16934fd..9badd1d6685f 100644
--- a/Documentation/networking/mpls-sysctl.txt
+++ b/Documentation/networking/mpls-sysctl.txt
@@ -19,6 +19,17 @@ platform_labels - INTEGER
Possible values: 0 - 1048575
Default: 0
 
+ip_ttl_propagate - BOOL
+   Control whether TTL is propagated from the IPv4/IPv6 header to
+   the MPLS header on imposing labels and propagated from the
+   MPLS header to the IPv4/IPv6 header on popping the last label.
+
+   If disabled, the MPLS transport network will appear as a
+   single hop to transit traffic.
+
+   0 - disabled / RFC 3443 [Short] Pipe Model
+   1 - enabled / RFC 3443 Uniform Model (default)
+
 conf//input - BOOL
Control whether packets can be input on this interface.
 
diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h
index d29203651c01..58e0e46c4a5c 100644
--- a/include/net/netns/mpls.h
+++ b/include/net/netns/mpls.h
@@ -10,7 +10,9 @@ struct ctl_table_header;
 
 struct netns_mpls {
size_t platform_labels;
+   int ip_ttl_propagate;
struct mpls_route __rcu * __rcu *platform_label;
+
struct ctl_table_header *ctl;
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 6546917d605a..30fb25e851db 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -319,6 +319,7 @@ enum rtattr_type_t {
RTA_EXPIRES,
RTA_PAD,
RTA_UID,
+   RTA_TTL_PROPAGATE,
__RTA_MAX
 };
 
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 3818686182b2..d4a51da8a0ce 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -32,6 +32,7 @@
 #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1)
 
 static int zero = 0;
+static int one = 1;
 static int label_limit = (1 << 20) - 1;
 
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
@@ -220,8 +221,8 @@ static struct mpls_nh *mpls_select_multipath(struct 
mpls_route *rt,
return >rt_nh[nh_index];
 }
 
-static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
-   struct mpls_entry_decoded dec)
+static bool mpls_egress(struct net *net, struct mpls_route *rt,
+   struct sk_buff *skb, struct mpls_entry_decoded dec)
 {
enum mpls_payload_type payload_type;
bool success = false;
@@ -244,24 +245,33 @@ static bool mpls_egress(struct mpls_route *rt, struct 
sk_buff *skb,
payload_type = ip_hdr(skb)->version;
 
switch (payload_type) {
-   case MPT_IPV4: {
-   struct iphdr *hdr4 = ip_hdr(skb);
+   case MPT_IPV4:
+   if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED ||
+   (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT &&
+net->mpls.ip_ttl_propagate)) {
+   struct iphdr *hdr4 = ip_hdr(skb);
+
+   csum_replace2(>check,
+ htons(hdr4->ttl << 8),
+ htons(dec.ttl << 8));
+   hdr4->ttl = dec.ttl;
+   }
skb->protocol = htons(ETH_P_IP);
-   csum_replace2(>check,
- htons(hdr4->ttl << 8),
- htons(dec.ttl << 8));
-   hdr4->ttl = dec.ttl;
success = true;
break;
-   }
-   case MPT_IPV6: {
-   struct ipv6hdr *hdr6 = ipv6_hdr(skb);
+   case MPT_IPV6:
+   if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED ||
+   (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT &&
+

[PATCH net-next v2 0/2] mpls: allow TTL propagation to/from IP packets to be configured

2017-03-07 Thread Robert Shearman

It is sometimes desirable to present an MPLS transport network as a
single hop to traffic transiting it because it prevents confusion when
diagnosing failures. An example of where confusion can be generated is
when addresses used in the provider network overlap with addresses in
the overlay network and the addresses get exposed through ICMP errors
generated as packets transit the provider network.

In addition, RFC 3443 defines two methods of deriving TTL for an
outgoing packet: Uniform Model where the TTL is propagated to/from the
MPLS header and both Pipe Models and Short Pipe Models (with and
without PHP) where the TTL is not propagated to/from the MPLS header.

Changes in v2:
 - add references to RFC 3443 as suggested by David Ahern
 - fix setting of skb->protocol as noticed by David Ahern
 - implement per-route/per-LWT configurability as suggested by Eric
   Biederman
 - split into two patches for ease of review

Robert Shearman (2):
  mpls: allow TTL propagation to IP packets to be configured
  mpls: allow TTL propagation from IP packets to be configured

 Documentation/networking/mpls-sysctl.txt | 19 ++
 include/net/mpls_iptunnel.h  |  2 +
 include/net/netns/mpls.h |  3 +
 include/uapi/linux/mpls_iptunnel.h   |  2 +
 include/uapi/linux/rtnetlink.h   |  1 +
 net/mpls/af_mpls.c   | 99 ++--
 net/mpls/internal.h  |  7 +++
 net/mpls/mpls_iptunnel.c | 64 -
 8 files changed, 168 insertions(+), 29 deletions(-)

-- 
2.1.4

[RFC net-next sample action optimization 3/3] openvswitch: Optimize sample action for the clone use cases

2017-03-07 Thread Andy Zhou

With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.

While the sample action in the datapath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitations: First, there is a 3-level
nesting restriction, enforced at flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoids recursive calls only if the sample
action list has a single userspace action.

The optimization implemented in this series removes the static
nesting limit check and instead implements a run-time recursion limit
check and recursion avoidance similar to that of the 'recirc' action.
This optimization solves both issues #1 and #2 above.

Another optimization implemented is to avoid copying the flow key as
long as the enclosed actions do not change the flow key. The
detection is performed only once, at flow downloading time.

The third optimization implemented is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.

Signed-off-by: Andy Zhou 
---
 net/openvswitch/actions.c  | 106 ++--
 net/openvswitch/datapath.h |   7 +++
 net/openvswitch/flow_netlink.c | 118 -
 3 files changed, 140 insertions(+), 91 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 259aea9..2e8c372 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -930,71 +930,52 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
 }
 
 static int sample(struct datapath *dp, struct sk_buff *skb,
- struct sw_flow_key *key, const struct nlattr *attr,
- const struct nlattr *actions, int actions_len)
+ struct sw_flow_key *key, const struct nlattr *attr)
 {
-   const struct nlattr *acts_list = NULL;
-   const struct nlattr *a;
-   int rem;
-   u32 cutlen = 0;
-
-   for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
-a = nla_next(a, )) {
-   u32 probability;
-
-   switch (nla_type(a)) {
-   case OVS_SAMPLE_ATTR_PROBABILITY:
-   probability = nla_get_u32(a);
-   if (!probability || prandom_u32() > probability)
-   return 0;
-   break;
-
-   case OVS_SAMPLE_ATTR_ACTIONS:
-   acts_list = a;
-   break;
-   }
-   }
+   struct nlattr *actions;
+   struct nlattr *sample_arg;
+   struct sw_flow_key *orig = key;
+   int rem = nla_len(attr);
+   int err = 0;
+   const struct sample_arg *arg;
 
-   rem = nla_len(acts_list);
-   a = nla_data(acts_list);
+   /* The first action is always 'OVS_SAMPLE_ATTR_AUX'. */
+   sample_arg = nla_data(attr);
+   arg = nla_data(sample_arg);
+   actions = nla_next(sample_arg, );
 
-   /* Actions list is empty, do nothing */
-   if (unlikely(!rem))
-   return 0;
+   if (arg->probability != U32_MAX)
+   if (!arg->probability || prandom_u32() > arg->probability)
+   return 0;
 
-   /* The only known usage of sample action is having a single user-space
-* action, or having a truncate action followed by a single user-space
-* action. Treat this usage as a special case.
-* The output_userspace() should clone the skb to be sent to the
-* user space. This skb will be consumed by its caller.
+   /* In case the sample actions won't change 'key',
+* we can use key for the clone execution.
+* Otherwise, try to allocate a key from the
+* next recursion level of 'flow_keys'. If
+* successful, we can still execute the clone
+* actions  without deferring.
+*
+* Defer the clone action if the action recursion
+* limit has been reached.
 */
-   if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) {
-   struct ovs_action_trunc *trunc = nla_data(a);
-
-   if (skb->len > trunc->max_len)
-   cutlen = skb->len - trunc->max_len;
-
-   a = nla_next(a, );
+   if (!arg->exec) {
+   __this_cpu_inc(exec_actions_level);
+   key = clone_key(key);
}
 
-   if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
-  nla_is_last(a, rem)))
-   return output_userspace(dp, skb, key, a, actions,
-

Re: please add some examples to the ip man page

2017-03-07 Thread 積丹尼 Dan Jacobson

> "SH" == Stephen Hemminger  writes:

SH> Sure. Submit a patch.

If only I could figure out how to use the command.

[RFC net-next sample action optimization 2/3] openvswitch: Refactor recirc key allocation.

2017-03-07 Thread Andy Zhou

The logic of allocating and copying a key for each 'exec_actions_level'
was specific to execute_recirc(). However, future patches will reuse
it as well.  Refactor the logic into its own function, clone_key().

Signed-off-by: Andy Zhou 
---
 net/openvswitch/actions.c | 72 +--
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 75182e9..259aea9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -75,7 +75,7 @@ struct ovs_frag_data {
 
 #define DEFERRED_ACTION_FIFO_SIZE 10
 #define OVS_RECURSION_LIMIT 5
-#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
+#define OVS_ACTION_RECURSION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
 struct action_fifo {
int head;
int tail;
@@ -83,14 +83,32 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
-struct recirc_keys {
-   struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+struct action_flow_keys {
+   struct sw_flow_key key[OVS_ACTION_RECURSION_THRESHOLD];
 };
 
 static struct action_fifo __percpu *action_fifos;
-static struct recirc_keys __percpu *recirc_keys;
+static struct action_flow_keys __percpu *flow_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
+/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
+ * space. Returns NULL if the current 'exec_actions_level' exceeds
+ * OVS_ACTION_RECURSION_THRESHOLD, so the caller must check the result.
+ */
+static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
+{
+   struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
+   int level = this_cpu_read(exec_actions_level);
+   struct sw_flow_key *key = NULL;
+
+   if (level <= OVS_ACTION_RECURSION_THRESHOLD) {
+   key = >key[level - 1];
+   *key = *key_;
+   }
+
+   return key;
+}
+
 static void action_fifo_init(struct action_fifo *fifo)
 {
fifo->head = 0;
@@ -1090,8 +1108,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *a, int rem)
 {
+   struct sw_flow_key *recirc_key;
struct deferred_action *da;
-   int level;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1115,29 +1133,27 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   level = this_cpu_read(exec_actions_level);
-   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
-   struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
-   struct sw_flow_key *recirc_key = >key[level - 1];
-
-   *recirc_key = *key;
+   /* If we are within the limit of 'OVS_ACTION_RECURSION_THRESHOLD',
+* recirc immediately, otherwise, defer it for later execution.
+*/
+   recirc_key = clone_key(key);
+   if (recirc_key) {
recirc_key->recirc_id = nla_get_u32(a);
ovs_dp_process_packet(skb, recirc_key);
-
-   return 0;
-   }
-
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   da->pkt_key.recirc_id = nla_get_u32(a);
} else {
-   kfree_skb(skb);
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop recirc 
action\n",
-   ovs_dp_name(dp));
+   da = add_deferred_actions(skb, key, NULL, 0);
+   if (da) {
+   recirc_key = >pkt_key;
+   recirc_key->recirc_id = nla_get_u32(a);
+   } else {
+   /* Log an error in case action fifo is full.
+*/
+   kfree_skb(skb);
+   if (net_ratelimit())
+   pr_warn("%s: deferred action limit reached, 
drop recirc action\n",
+   ovs_dp_name(dp));
+   }
}
-
return 0;
 }
 
@@ -1327,8 +1343,8 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
 
-   recirc_keys = alloc_percpu(struct recirc_keys);
-   if (!recirc_keys) {
+   flow_keys = alloc_percpu(struct action_flow_keys);
+   if (!flow_keys) {
free_percpu(action_fifos);
return -ENOMEM;
}
@@ -1339,5 +1355,5 @@ int action_fifos_init(void)
 void action_fifos_exit(void)
 {
free_percpu(action_fifos);
-   free_percpu(recirc_keys);
+   free_percpu(flow_keys);
 }
-- 
1.8.3.1

[RFC net-next sample action optimization 0/3]

2017-03-07 Thread Andy Zhou

The sample action can be used for translating Openflow 'clone' action.
However its implementation has not been sufficiently optimized for this
use case. This series attempts to close the gap.

Patch 3 commit message has more details on the specific optimizations
implemented.


Andy Zhou (3):
  openvswitch: deferred fifo api change
  openvswitch: Refactor recirc key allocation.
  openvswitch: Optimize sample action for the clone use cases

 net/openvswitch/actions.c  | 190 ++---
 net/openvswitch/datapath.h |   7 ++
 net/openvswitch/flow_netlink.c | 118 +
 3 files changed, 192 insertions(+), 123 deletions(-)

-- 
1.8.3.1

[RFC net-next sample action optimization 1/3] openvswitch: deferred fifo api change

2017-03-07 Thread Andy Zhou

add_deferred_actions() API currently requires actions to be passed in
as a fully encoded netlink message. So far both 'sample' and 'recirc'
actions happen to carry actions as fully encoded netlink messages.
However, this requirement is more restrictive than necessary; a future
patch will need to pass in action lists that are not fully encoded
by themselves.

Signed-off-by: Andy Zhou 
---
 net/openvswitch/actions.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c82301c..75182e9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -51,6 +51,7 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
+   int actions_len;
 
/* Store pkt_key clone when creating deferred action. */
struct sw_flow_key pkt_key;
@@ -119,8 +120,9 @@ static struct deferred_action *action_fifo_put(struct 
action_fifo *fifo)
 
 /* Return true if fifo is not full */
 static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
-   const struct sw_flow_key 
*key,
-   const struct nlattr *attr)
+   const struct sw_flow_key *key,
+   const struct nlattr *actions,
+   const int actions_len)
 {
struct action_fifo *fifo;
struct deferred_action *da;
@@ -129,7 +131,8 @@ static struct deferred_action *add_deferred_actions(struct 
sk_buff *skb,
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
-   da->actions = attr;
+   da->actions = actions;
+   da->actions_len = actions_len;
da->pkt_key = *key;
}
 
@@ -966,7 +969,8 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
/* Skip the sample action when out of memory. */
return 0;
 
-   if (!add_deferred_actions(skb, key, a)) {
+   if (!add_deferred_actions(skb, key, nla_data(acts_list),
+ nla_len(acts_list))) {
if (net_ratelimit())
pr_warn("%s: deferred actions limit reached, dropping 
sample action\n",
ovs_dp_name(dp));
@@ -1123,7 +1127,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   da = add_deferred_actions(skb, key, NULL);
+   da = add_deferred_actions(skb, key, NULL, 0);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
} else {
@@ -1278,10 +1282,10 @@ static void process_deferred_actions(struct datapath 
*dp)
struct sk_buff *skb = da->skb;
struct sw_flow_key *key = >pkt_key;
const struct nlattr *actions = da->actions;
+   int actions_len = da->actions_len;
 
if (actions)
-   do_execute_actions(dp, skb, key, actions,
-  nla_len(actions));
+   do_execute_actions(dp, skb, key, actions, actions_len);
else
ovs_dp_process_packet(skb, key);
} while (!action_fifo_is_empty(fifo));
-- 
1.8.3.1

[PATCH net-next] net: stmicro: replace kzalloc with devm_kzalloc

2017-03-07 Thread Joao Pinto

The axi variable was not being freed upon device removal.
With devm_kzalloc it ensures that it is properly freed.

Signed-off-by: Joao Pinto 
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 433a842..0ba1caf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -108,7 +108,7 @@ static struct stmmac_axi *stmmac_axi_setup(struct 
platform_device *pdev)
if (!np)
return NULL;
 
-   axi = kzalloc(sizeof(*axi), GFP_KERNEL);
+   axi = devm_kzalloc(>dev, sizeof(*axi), GFP_KERNEL);
if (!axi) {
of_node_put(np);
return ERR_PTR(-ENOMEM);
-- 
2.9.3

[PATCH] net: via: via-rhine: use new api ethtool_{get|set}_link_ksettings

2017-03-07 Thread Philippe Reynes

The ethtool api {get|set}_settings is deprecated.
We move this driver to new api {get|set}_link_ksettings.

As I don't have the hardware, I'd be very pleased if
someone could test this patch.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/via/via-rhine.c |   14 --
 1 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/via/via-rhine.c 
b/drivers/net/ethernet/via/via-rhine.c
index c068c58..4cf41f7 100644
--- a/drivers/net/ethernet/via/via-rhine.c
+++ b/drivers/net/ethernet/via/via-rhine.c
@@ -2303,25 +2303,27 @@ static void netdev_get_drvinfo(struct net_device *dev, 
struct ethtool_drvinfo *i
strlcpy(info->bus_info, dev_name(hwdev), sizeof(info->bus_info));
 }
 
-static int netdev_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+static int netdev_get_link_ksettings(struct net_device *dev,
+struct ethtool_link_ksettings *cmd)
 {
struct rhine_private *rp = netdev_priv(dev);
int rc;
 
mutex_lock(>task_lock);
-   rc = mii_ethtool_gset(>mii_if, cmd);
+   rc = mii_ethtool_get_link_ksettings(>mii_if, cmd);
mutex_unlock(>task_lock);
 
return rc;
 }
 
-static int netdev_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+static int netdev_set_link_ksettings(struct net_device *dev,
+const struct ethtool_link_ksettings *cmd)
 {
struct rhine_private *rp = netdev_priv(dev);
int rc;
 
mutex_lock(>task_lock);
-   rc = mii_ethtool_sset(>mii_if, cmd);
+   rc = mii_ethtool_set_link_ksettings(>mii_if, cmd);
rhine_set_carrier(>mii_if);
mutex_unlock(>task_lock);
 
@@ -2391,14 +2393,14 @@ static int rhine_set_wol(struct net_device *dev, struct 
ethtool_wolinfo *wol)
 
 static const struct ethtool_ops netdev_ethtool_ops = {
.get_drvinfo= netdev_get_drvinfo,
-   .get_settings   = netdev_get_settings,
-   .set_settings   = netdev_set_settings,
.nway_reset = netdev_nway_reset,
.get_link   = netdev_get_link,
.get_msglevel   = netdev_get_msglevel,
.set_msglevel   = netdev_set_msglevel,
.get_wol= rhine_get_wol,
.set_wol= rhine_set_wol,
+   .get_link_ksettings = netdev_get_link_ksettings,
+   .set_link_ksettings = netdev_set_link_ksettings,
 };
 
 static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
-- 
1.7.4.4

Re: [PATCH net 0/7] bnx2x: PTP crash, VF VLAN fixes

2017-03-07 Thread David Miller

From: Michal Schmidt 
Date: Fri,  3 Mar 2017 17:08:27 +0100

> here are fixes for a crash with PTP, a crash in setting of VF multicast
> addresses, and non-working VLAN filters configuration from the VF side.

Series applied, thanks.

Re: [RFC PATCH] uapi: fix linux/packet_diag.h userspace compilation error

2017-03-07 Thread David Miller

From: "Dmitry V. Levin" 
Date: Tue, 7 Mar 2017 23:28:02 +0300

> On Tue, Mar 07, 2017 at 12:16:49PM -0800, David Miller wrote:
>> From: "Dmitry V. Levin" 
>> Date: Tue, 28 Feb 2017 04:39:30 +0300
>> 
>> > Replace MAX_ADDR_LEN with its numeric value to fix the following
>> > linux/packet_diag.h userspace compilation error:
>> > 
>> > /usr/include/linux/packet_diag.h:67:17: error: 'MAX_ADDR_LEN' undeclared 
>> > here (not in a function)
>> >   __u8 pdmc_addr[MAX_ADDR_LEN];
>> > 
>> > This is not the first case in the UAPI where the numeric value
>> > of MAX_ADDR_LEN is used, uapi/linux/if_link.h already does the same,
>> > and there are no UAPI headers besides these two that use MAX_ADDR_LEN.
>> > 
>> > The alternative fix would be to include  which
>> > pulls in other headers and a lot of definitions with them.
>> > 
>> > Signed-off-by: Dmitry V. Levin 
>> 
>> If if_link.h includes netdevice.h properly, let's try to do the same here as 
>> well.
> 
> Sorry if my words weren't clear enough: no, if_link.h doesn't include
> netdevice.h, it uses the _numeric_ value instead:
> 
> $ grep MAX_ADDR_LEN include/uapi/linux/if_link.h 
>   __u8 mac[32]; /* MAX_ADDR_LEN */

Ok then we might as well do the same thing here, please resubmit this
patch formally.

Thanks.

Re: pull request (net): ipsec 2017-03-06

2017-03-07 Thread David Miller

From: Steffen Klassert 
Date: Mon, 6 Mar 2017 07:57:25 +0100

> 1) Fix lockdep splat on xfrm policy subsystem initialization.
>From Florian Westphal.
> 
> 2) When using socket policies on IPv4-mapped IPv6 addresses,
>we access the flow information of the wrong address family,
>which leads to an out of bounds access. Fix this by using
>the family we get with the dst_entry, like we do it for the
>standard policy lookup.
> 
> 3) vti6 can report a PMTU below IPV6_MIN_MTU. Fix this by
>adding a check for that before sending a ICMPV6_PKT_TOOBIG
>message.
> 
> Please pull or let me know if there are problems.

Pulled, thanks!

Re: [Patch net] ipv6: reorder icmpv6_init() and ip6_mr_init()

2017-03-07 Thread David Miller

From: Cong Wang 
Date: Sun,  5 Mar 2017 12:34:53 -0800

> Andrey reported the following kernel crash:
...
> This is because net->ipv6.mr6_tables is not initialized at that point,
> ip6mr_rules_init() is not called yet, therefore on the error path when
> we iterate over the list, we trigger this oops. Fix this by reordering
> ip6mr_rules_init() before icmpv6_sk_init().
> 
> Reported-by: Andrey Konovalov 
> Signed-off-by: Cong Wang 

Applied, thanks.

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread David Ahern

On 3/7/17 1:43 AM, Dmitry Vyukov wrote:
> This is on c1ae3cfa0e89fa1a7ecc4c99031f5e9ae99d9201. No other kernel
> output from your patch (pr_err).

Is the below supposed to be from the same qemu instance at the time of
the crash? cpu1 and cpu2 are both supposedly doing a route insert?


> 
> [ cut here ]
> WARNING: CPU: 1 PID: 30179 at net/ipv6/ip6_fib.c:158
> rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
> Kernel panic - not syncing: panic_on_warn set ...
> 
> CPU: 1 PID: 30179 Comm: syz-executor3 Not tainted 4.11.0-rc1+ #310
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:16 [inline]
>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52
>  panic+0x20f/0x426 kernel/panic.c:180
>  __warn+0x1c4/0x1e0 kernel/panic.c:541
>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584
>  rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
>  rt6_release+0x1ee/0x290 net/ipv6/ip6_fib.c:189
>  fib6_add_rt2node net/ipv6/ip6_fib.c:922 [inline]
>  fib6_add+0x1d51/0x3290 net/ipv6/ip6_fib.c:1081
>  __ip6_ins_rt+0x60/0x80 net/ipv6/route.c:948
>  ip6_route_add+0x1a7/0x310 net/ipv6/route.c:2130
>  inet6_rtm_newroute+0x191/0x1b0 net/ipv6/route.c:3294
>  rtnetlink_rcv_msg+0x609/0x860 net/core/rtnetlink.c:4104
>  netlink_rcv_skb+0x2ab/0x390 net/netlink/af_netlink.c:2298
>  rtnetlink_rcv+0x2a/0x40 net/core/rtnetlink.c:4110
>  netlink_unicast_kernel net/netlink/af_netlink.c:1231 [inline]
>  netlink_unicast+0x525/0x730 net/netlink/af_netlink.c:1257
>  netlink_sendmsg+0xab3/0xe70 net/netlink/af_netlink.c:1803
>  sock_sendmsg_nosec net/socket.c:633 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:643
>  sock_write_iter+0x326/0x600 net/socket.c:846
>  call_write_iter include/linux/fs.h:1733 [inline]
>  do_iter_readv_writev fs/read_write.c:696 [inline]
>  __do_readv_writev+0xbbc/0x10a0 fs/read_write.c:862
>  do_readv_writev+0x13f/0x200 fs/read_write.c:894
>  vfs_writev+0x87/0xc0 fs/read_write.c:921
>  do_writev+0x110/0x2c0 fs/read_write.c:954
>  SYSC_writev fs/read_write.c:1027 [inline]
>  SyS_writev+0x27/0x30 fs/read_write.c:1024
>  entry_SYSCALL_64_fastpath+0x1f/0xc2
> RIP: 0033:0x4458d9
> RSP: 002b:7f31fcf33b58 EFLAGS: 0292 ORIG_RAX: 0014
> RAX: ffda RBX: 0005 RCX: 004458d9
> RDX: 0001 RSI: 207cd000 RDI: 0005
> RBP: 006e30c0 R08:  R09: 
> R10:  R11: 0292 R12: 00708000
> R13: 20fad000 R14: 1000 R15: 0003
> 
> 
> 
> [ cut here ]
> WARNING: CPU: 2 PID: 31175 at net/ipv6/ip6_fib.c:158
> rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
> Kernel panic - not syncing: panic_on_warn set ...
> 
> CPU: 2 PID: 31175 Comm: syz-executor1 Not tainted 4.11.0-rc1+ #310
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:16 [inline]
>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52
>  panic+0x20f/0x426 kernel/panic.c:180
>  __warn+0x1c4/0x1e0 kernel/panic.c:541
>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584
>  rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
>  rt6_release+0x1ee/0x290 net/ipv6/ip6_fib.c:189
>  fib6_add_rt2node net/ipv6/ip6_fib.c:922 [inline]
>  fib6_add+0x1d51/0x3290 net/ipv6/ip6_fib.c:1081
> kvm_vm_ioctl_deassign_device: device hasn't been assigned before, so
> cannot be deassigned
>  __ip6_ins_rt+0x60/0x80 net/ipv6/route.c:948
>  ip6_route_add+0x1a7/0x310 net/ipv6/route.c:2130
>  inet6_rtm_newroute+0x191/0x1b0 net/ipv6/route.c:3294
>  rtnetlink_rcv_msg+0x609/0x860 net/core/rtnetlink.c:4104
>  netlink_rcv_skb+0x2ab/0x390 net/netlink/af_netlink.c:2298
>  rtnetlink_rcv+0x2a/0x40 net/core/rtnetlink.c:4110
>  netlink_unicast_kernel net/netlink/af_netlink.c:1231 [inline]
>  netlink_unicast+0x525/0x730 net/netlink/af_netlink.c:1257
>  netlink_sendmsg+0xab3/0xe70 net/netlink/af_netlink.c:1803
>  sock_sendmsg_nosec net/socket.c:633 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:643
>  sock_write_iter+0x326/0x600 net/socket.c:846
>  call_write_iter include/linux/fs.h:1733 [inline]
>  do_iter_readv_writev fs/read_write.c:696 [inline]
>  __do_readv_writev+0xbbc/0x10a0 fs/read_write.c:862
>  do_readv_writev+0x13f/0x200 fs/read_write.c:894
>  vfs_writev+0x87/0xc0 fs/read_write.c:921
>  do_writev+0x110/0x2c0 fs/read_write.c:954
>  SYSC_writev fs/read_write.c:1027 [inline]
>  SyS_writev+0x27/0x30 fs/read_write.c:1024
>  entry_SYSCALL_64_fastpath+0x1f/0xc2
> RIP: 0033:0x4458d9
> RSP: 002b:7f1639006b58 EFLAGS: 0292 ORIG_RAX: 0014
> RAX: ffda RBX: 0019 RCX: 004458d9
> RDX: 0001 RSI: 207cd000 RDI: 0019
> RBP: 006e30c0 R08:  R09: 
> R10:  R11: 0292 R12: 00708000
> R13: 0010 R14: 0003 R15: 
>

Re: [PATCH] fjes: Do not load fjes driver if system does not have extended socket device.

2017-03-07 Thread David Miller

From: Yasuaki Ishimatsu 
Date: Thu, 2 Mar 2017 11:56:44 -0500

> +
>  /* fjes_init_module - Driver Registration Routine */
>  static int __init fjes_init_module(void)
>  {
>   int result;
> + bool found = false;
> +

Please order local variable declarations from longest to shortest
line.

Thanks.

Re: [PATCH v3 net-next 2/6] drivers: net: xgene-v2: Add mac configuration

2017-03-07 Thread David Miller

From: Iyappan Subramanian 
Date: Fri,  3 Mar 2017 17:09:28 -0800

> +void xge_mac_set_station_addr(struct xge_pdata *pdata)
> +{
> + u32 addr0, addr1;
> + u8 *dev_addr = pdata->ndev->dev_addr;

Please order local variable declarations from longest to shortest line.

Re: [PATCH net] dccp: fix use-after-free in dccp_feat_activate_values

2017-03-07 Thread David Miller

From: Eric Dumazet 
Date: Sun, 05 Mar 2017 10:52:16 -0800

> From: Eric Dumazet 
> 
> Dmitry reported crashes in DCCP stack [1]
> 
> Problem here is that when I got rid of listener spinlock, I missed the
> fact that DCCP stores a complex state in struct dccp_request_sock,
> while TCP does not.
> 
> Since multiple cpus could access it at the same time, we need to add
> protection.
 ...
> Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
> Signed-off-by: Eric Dumazet 
> Reported-by: Dmitry Vyukov 
> Tested-by: Dmitry Vyukov 

Applied and queued up for -stable.

Re: [PATCH] udp: avoid ufo handling on IP payload compression packets

2017-03-07 Thread David Miller

From: Alexey Kodanev 
Date: Fri,  3 Mar 2017 15:37:32 +0300

> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index b67719f..18383ef 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -960,7 +960,10 @@ static int __ip_append_data(struct sock *sk,
>   cork->length += length;
>   if length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
>   (sk->sk_protocol == IPPROTO_UDP) &&
> - (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
> + (rt->dst.dev->features & NETIF_F_UFO) &&
> +#ifdef CONFIG_XFRM
> + !rt->dst.xfrm &&
> +#endif

As Steffen has suggested, please use dst_xfrm().

Re: [PATCH] arp: Race condition in gratuitous ARP reception handling.

2017-03-07 Thread David Miller

From: 
Date: Thu, 2 Mar 2017 14:59:47 +0100

> @@ -836,19 +843,30 @@ static int arp_process(struct net *net, struct sock 
> *sk, struct sk_buff *skb)
>   n = __neigh_lookup(_tbl, , dev, 0);
>  
>   if (IN_DEV_ARP_ACCEPT(in_dev)) {
> - unsigned int addr_type = inet_addr_type_dev_table(net, dev, 
> sip);
> + unsigned int addr_type = inet_addr_type_dev_table(net,
> +   dev, sip);

Please don't mix coding style changes with real modifications.

> + } else if (n) {
> + unsigned int addr_type = inet_addr_type_dev_table(net,
> +   dev, sip);

Thanks.

Re: net: BUG in unix_notinflight

2017-03-07 Thread Cong Wang

On Tue, Mar 7, 2017 at 12:37 AM, Dmitry Vyukov  wrote:
> On Mon, Mar 6, 2017 at 11:34 PM, Cong Wang  wrote:
>> The problem here is there is no lock protecting concurrent unix_detach_fds()
>> even though unix_notinflight() is already serialized, if we call
>> unix_notinflight()
>> twice on the same file pointer, we trigger this bug...
>>
>> I don't know what is the right lock here to serialize it.
>
>
> What exactly here needs to be protected?
>
> 1484 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
> 1485 {
> 1486 int i;
> 1487
> 1488 scm->fp = UNIXCB(skb).fp;
> 1489 UNIXCB(skb).fp = NULL;
> 1490
> 1491 for (i = scm->fp->count-1; i >= 0; i--)
> 1492 unix_notinflight(scm->fp->user, scm->fp->fp[i]);
> 1493 }
>
> Whole unix_notinflight happens under global unix_gc_lock.
>
> Is it that 2 threads call unix_detach_fds for the same skb, and then
> call unix_notinflight for the same fd twice?

Not the same skb, but their UNIXCB(skb).fp points to the same place,
therefore we call unix_notinflight() twice on the same fp->user and
fp->fp[i]; although we have refcounting, we are still able to trigger this
warning.

[PATCH v2] net: intel: ixgbe: use new api ethtool_{get|set}_link_ksettings

2017-03-07 Thread Philippe Reynes

The ethtool api {get|set}_settings is deprecated.
We move this driver to new api {get|set}_link_ksettings.

As I don't have the hardware, I'd be very pleased if
someone could test this patch.

Signed-off-by: Philippe Reynes 
---
Changelog:
v2:
- fix compilation (thanks andrewx bowers)

 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |  168 --
 1 files changed, 91 insertions(+), 77 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 90fa5bf..0da0752f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -186,60 +186,62 @@ static u32 ixgbe_get_supported_10gtypes(struct ixgbe_hw 
*hw)
}
 }
 
-static int ixgbe_get_settings(struct net_device *netdev,
- struct ethtool_cmd *ecmd)
+static int ixgbe_get_link_ksettings(struct net_device *netdev,
+   struct ethtool_link_ksettings *cmd)
 {
struct ixgbe_adapter *adapter = netdev_priv(netdev);
struct ixgbe_hw *hw = >hw;
ixgbe_link_speed supported_link;
bool autoneg = false;
+   u32 supported, advertising;
+
+   ethtool_convert_link_mode_to_legacy_u32(,
+   cmd->link_modes.supported);
 
hw->mac.ops.get_link_capabilities(hw, _link, );
 
/* set the supported link speeds */
if (supported_link & IXGBE_LINK_SPEED_10GB_FULL)
-   ecmd->supported |= ixgbe_get_supported_10gtypes(hw);
+   supported |= ixgbe_get_supported_10gtypes(hw);
if (supported_link & IXGBE_LINK_SPEED_1GB_FULL)
-   ecmd->supported |= (ixgbe_isbackplane(hw->phy.media_type)) ?
+   supported |= (ixgbe_isbackplane(hw->phy.media_type)) ?
   SUPPORTED_1000baseKX_Full :
   SUPPORTED_1000baseT_Full;
if (supported_link & IXGBE_LINK_SPEED_100_FULL)
-   ecmd->supported |= SUPPORTED_100baseT_Full;
+   supported |= SUPPORTED_100baseT_Full;
if (supported_link & IXGBE_LINK_SPEED_10_FULL)
-   ecmd->supported |= SUPPORTED_10baseT_Full;
+   supported |= SUPPORTED_10baseT_Full;
 
/* default advertised speed if phy.autoneg_advertised isn't set */
-   ecmd->advertising = ecmd->supported;
+   advertising = supported;
/* set the advertised speeds */
if (hw->phy.autoneg_advertised) {
-   ecmd->advertising = 0;
+   advertising = 0;
if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_10_FULL)
-   ecmd->advertising |= ADVERTISED_10baseT_Full;
+   advertising |= ADVERTISED_10baseT_Full;
if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_100_FULL)
-   ecmd->advertising |= ADVERTISED_100baseT_Full;
+   advertising |= ADVERTISED_100baseT_Full;
if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_10GB_FULL)
-   ecmd->advertising |= ecmd->supported & ADVRTSD_MSK_10G;
+   advertising |= supported & ADVRTSD_MSK_10G;
if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_1GB_FULL) {
-   if (ecmd->supported & SUPPORTED_1000baseKX_Full)
-   ecmd->advertising |= ADVERTISED_1000baseKX_Full;
+   if (supported & SUPPORTED_1000baseKX_Full)
+   advertising |= ADVERTISED_1000baseKX_Full;
else
-   ecmd->advertising |= ADVERTISED_1000baseT_Full;
+   advertising |= ADVERTISED_1000baseT_Full;
}
} else {
if (hw->phy.multispeed_fiber && !autoneg) {
if (supported_link & IXGBE_LINK_SPEED_10GB_FULL)
-   ecmd->advertising = ADVERTISED_1baseT_Full;
+   advertising = ADVERTISED_1baseT_Full;
}
}
 
if (autoneg) {
-   ecmd->supported |= SUPPORTED_Autoneg;
-   ecmd->advertising |= ADVERTISED_Autoneg;
-   ecmd->autoneg = AUTONEG_ENABLE;
+   supported |= SUPPORTED_Autoneg;
+   advertising |= ADVERTISED_Autoneg;
+   cmd->base.autoneg = AUTONEG_ENABLE;
} else
-   ecmd->autoneg = AUTONEG_DISABLE;
-
-   ecmd->transceiver = XCVR_EXTERNAL;
+   cmd->base.autoneg = AUTONEG_DISABLE;
 
/* Determine the remaining settings based on the PHY type. */
switch (adapter->hw.phy.type) {
@@ -248,14 +250,14 @@ static int ixgbe_get_settings(struct net_device *netdev,
case ixgbe_phy_x550em_ext_t:
case ixgbe_phy_fw:
case ixgbe_phy_cu_unknown:
-

Re: net: BUG in unix_notinflight

2017-03-07 Thread Willy Tarreau

On Wed, Mar 08, 2017 at 12:23:56AM +0200, Nikolay Borisov wrote:
> 
> >>
> >>
> >> New report from linux-next/c0b7b2b33bd17f7155956d0338ce92615da686c9
> >>
> >> [ cut here ]
> >> kernel BUG at net/unix/garbage.c:149!
> >> invalid opcode:  [#1] SMP KASAN
> >> Dumping ftrace buffer:
> >>(ftrace buffer empty)
> >> Modules linked in:
> >> CPU: 0 PID: 1806 Comm: syz-executor7 Not tainted 4.10.0-next-20170303+ #6
> >> Hardware name: Google Google Compute Engine/Google Compute Engine,
> >> BIOS Google 01/01/2011
> >> task: 880121c64740 task.stack: 88012c9e8000
> >> RIP: 0010:unix_notinflight+0x417/0x5d0 net/unix/garbage.c:149
> >> RSP: 0018:88012c9ef0f8 EFLAGS: 00010297
> >> RAX: 880121c64740 RBX: 11002593de23 RCX: 8801c490c628
> >> RDX:  RSI: 11002593de27 RDI: 8557e504
> >> RBP: 88012c9ef220 R08: 0001 R09: 
> >> R10: dc00 R11: ed002593de55 R12: 8801c490c0c0
> >> R13: 88012c9ef1f8 R14: 85101620 R15: dc00
> >> FS:  013d3940() GS:8801dbe0() 
> >> knlGS:
> >> CS:  0010 DS:  ES:  CR0: 80050033
> >> CR2: 01fd8cd8 CR3: 0001cce69000 CR4: 001426f0
> >> Call Trace:
> >>  unix_detach_fds.isra.23+0xfa/0x170 net/unix/af_unix.c:1490
> >>  unix_destruct_scm+0xf4/0x200 net/unix/af_unix.c:1499
> > 
> > The problem here is there is no lock protecting concurrent unix_detach_fds()
> > even though unix_notinflight() is already serialized, if we call
> > unix_notinflight()
> > twice on the same file pointer, we trigger this bug...
> > 
> > I don't know what is the right lock here to serialize it.
> > 
> 
> 
> I reported something similar a while ago
> https://lists.gt.net/linux/kernel/2534612
> 
> And Miklos Szeredi then produced the following patch :
> 
> https://patchwork.kernel.org/patch/9305121/
> 
> However, this was never applied. I wonder if the patch makes sense?

I don't know but there's a hint at the bottom of the thread with
Davem's response to which there was no followup :

  "Why would I apply a patch that's an RFC, doesn't have a proper commit
   message, lacks a proper signoff, and also lacks ACK's and feedback
   from other knowledgable developers?"

So at least this point makes sense, maybe the patch is fine but was
not sufficiently reviewed or acked ? Maybe it was proposed as an RFC
to start a discussion and never went to the final status of a patch
waiting for being applied ?

Willy

Re: net: BUG in unix_notinflight

2017-03-07 Thread Nikolay Borisov


>>
>>
>> New report from linux-next/c0b7b2b33bd17f7155956d0338ce92615da686c9
>>
>> [ cut here ]
>> kernel BUG at net/unix/garbage.c:149!
>> invalid opcode:  [#1] SMP KASAN
>> Dumping ftrace buffer:
>>(ftrace buffer empty)
>> Modules linked in:
>> CPU: 0 PID: 1806 Comm: syz-executor7 Not tainted 4.10.0-next-20170303+ #6
>> Hardware name: Google Google Compute Engine/Google Compute Engine,
>> BIOS Google 01/01/2011
>> task: 880121c64740 task.stack: 88012c9e8000
>> RIP: 0010:unix_notinflight+0x417/0x5d0 net/unix/garbage.c:149
>> RSP: 0018:88012c9ef0f8 EFLAGS: 00010297
>> RAX: 880121c64740 RBX: 11002593de23 RCX: 8801c490c628
>> RDX:  RSI: 11002593de27 RDI: 8557e504
>> RBP: 88012c9ef220 R08: 0001 R09: 
>> R10: dc00 R11: ed002593de55 R12: 8801c490c0c0
>> R13: 88012c9ef1f8 R14: 85101620 R15: dc00
>> FS:  013d3940() GS:8801dbe0() knlGS:
>> CS:  0010 DS:  ES:  CR0: 80050033
>> CR2: 01fd8cd8 CR3: 0001cce69000 CR4: 001426f0
>> Call Trace:
>>  unix_detach_fds.isra.23+0xfa/0x170 net/unix/af_unix.c:1490
>>  unix_destruct_scm+0xf4/0x200 net/unix/af_unix.c:1499
> 
> The problem here is there is no lock protecting concurrent unix_detach_fds()
> even though unix_notinflight() is already serialized, if we call
> unix_notinflight()
> twice on the same file pointer, we trigger this bug...
> 
> I don't know what is the right lock here to serialize it.
> 


I reported something similar a while ago
https://lists.gt.net/linux/kernel/2534612

And Miklos Szeredi then produced the following patch :

https://patchwork.kernel.org/patch/9305121/

However, this was never applied. I wonder if the patch makes sense?

Re: [PATCH net 0/2] net: fix possible sock_hold() misuses

2017-03-07 Thread David Miller

From: Eric Dumazet 
Date: Fri,  3 Mar 2017 21:01:01 -0800

> skb_complete_wifi_ack() and skb_complete_tx_timestamp() currently
> call sock_hold() on sockets that might have transitioned their sk_refcnt
> to zero already.

Series applied and queued up for -stable, thanks.

Re: [PATCH 1/2 net] ibmvnic: Fix overflowing firmware/hardware TX queue

2017-03-07 Thread David Miller

From: Thomas Falcon 
Date: Sun,  5 Mar 2017 12:18:41 -0600

> Use a counter to track the number of outstanding transmissions sent
> that have not received completions. If the counter reaches the maximum
> number of queue entries, stop transmissions on that queue. As we receive 
> more completions from firmware, wake the queue once the counter reaches 
> an acceptable level.
> 
> This patch prevents hardware/firmware TX queue from filling up and
> generating errors.  Since incorporating this fix, internal testing
> has reported that these firmware errors have stopped.
> 
> Signed-off-by: Thomas Falcon 

Applied.

Re: [PATCH 2/2 net] ibmvnic: Allocate number of rx/tx buffers agreed on by firmware

2017-03-07 Thread David Miller

From: Thomas Falcon 
Date: Sun,  5 Mar 2017 12:18:42 -0600

> The amount of TX/RX buffers that the vNIC driver currently allocates
> is different from the amount agreed upon in negotiation with firmware.
> Correct that by allocating the requested number of buffers confirmed
> by firmware.
> 
> Signed-off-by: Thomas Falcon 

Applied.

Re: please add some examples to the ip man page

2017-03-07 Thread Stephen Hemminger

On Wed, 08 Mar 2017 03:10:32 +0800
積丹尼 Dan Jacobson  wrote:

> $ man ip
> should have some usage examples. Even one would be fine.

Sure. Submit a patch.

Re: [PATCH] net/sched: act_skbmod: remove unneeded rcu_read_unlock in tcf_skbmod_dump

2017-03-07 Thread David Miller

From: Alexey Khoroshilov 
Date: Sun,  5 Mar 2017 03:01:55 +0300

> Found by Linux Driver Verification project (linuxtesting.org).
> 
> Signed-off-by: Alexey Khoroshilov 

Applied and queued up for -stable, thanks.

Re: [PATCH net-next 1/1] decnet: Use TCP nagle macro instead of literal number in decnet

2017-03-07 Thread David Miller

From: f...@ikuai8.com
Date: Sat,  4 Mar 2017 22:10:28 +0800

> From: Gao Feng 
> 
> Use existing TCP nagle macro TCP_NAGLE_OFF and TCP_NAGLE_CORK instead
> of the literal number 1 and 2 in the current decnet codes.
> 
> Signed-off-by: Gao Feng 

Applied.

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread Dmitry Vyukov

On Tue, Mar 7, 2017 at 8:02 PM, Dmitry Vyukov  wrote:
> On Tue, Mar 7, 2017 at 7:43 PM, David Ahern  wrote:
>> On 3/7/17 11:13 AM, Dmitry Vyukov wrote:
 on this warning:

 /* dst.next really should not be set at this point */
 if (rt->dst.next && rt->dst.next->ops->family != AF_INET6) {
 pr_warn("fib6_add: adding rt with bad next -- family %d dst
 flags %x\n",
 rt->dst.next->ops->family, rt->dst.next->flags);

 WARN_ON(1);
 }

 You should have seen the pr_warn in the log preceding the WARN_ON dump.
>>>
>>> Right. They all have the same "IPv6: fib6_add: adding rt with bad next
>>> -- family 2 dst flags 6"
>>
>> remove the previous changes and try the attached.
>
>
> Doing this now.
> FWIW I've also applied your last patch with missing "iter->dst.flags
> &= ~DST_IN_FIB;" and restored the warning in rt6_rcu_free and it did
> not fire (in a limited run). I only saw the "WARNING in fib6_add" that
> I already reported.


So far I've hit only:
[ 1103.840031] BUG: KASAN: slab-out-of-bounds in fib6_age+0x3fd/0x480
at addr 8800799d2254
without any preceding warnings.
But note that since the kernel is heavily stressed I can only reliably get
pr_err output if it happens right before BUG/WARNING. Anything
that happens minutes before will be lost because there is tons of
output.

Re: [PATCH v2 2/2] can: spi: hi311x: Add Holt HI-311x CAN driver

2017-03-07 Thread Akshay Bhat



On 01/17/2017 02:22 PM, Akshay Bhat wrote:
> This patch adds support for the Holt HI-311x CAN controller. The HI311x
> CAN controller is capable of transmitting and receiving standard data
> frames, extended data frames and remote frames. The HI311x interfaces
> with the host over SPI.
> 
> Datasheet: www.holtic.com/documents/371-hi-3110_v-rev-jpdf.do
> 
> Signed-off-by: Akshay Bhat 
> ---
> 


Hi Marc,

Wanted to check if this patch can be included in the next kernel release
(4.12).

Thanks,
Akshay

[PATCH] uapi: fix linux/packet_diag.h userspace compilation error

2017-03-07 Thread Dmitry V. Levin

Replace MAX_ADDR_LEN with its numeric value to fix the following
linux/packet_diag.h userspace compilation error:

/usr/include/linux/packet_diag.h:67:17: error: 'MAX_ADDR_LEN' undeclared here 
(not in a function)
  __u8 pdmc_addr[MAX_ADDR_LEN];

This is not the first case in the UAPI where the numeric value
of MAX_ADDR_LEN is used instead of symbolic one, uapi/linux/if_link.h
already does the same:

$ grep MAX_ADDR_LEN include/uapi/linux/if_link.h 
__u8 mac[32]; /* MAX_ADDR_LEN */

There are no UAPI headers besides these two that use MAX_ADDR_LEN.

Signed-off-by: Dmitry V. Levin 
Acked-by: Pavel Emelyanov 
---
 include/uapi/linux/packet_diag.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/packet_diag.h b/include/uapi/linux/packet_diag.h
index d08c63f..0c5d5dd 100644
--- a/include/uapi/linux/packet_diag.h
+++ b/include/uapi/linux/packet_diag.h
@@ -64,7 +64,7 @@ struct packet_diag_mclist {
__u32   pdmc_count;
__u16   pdmc_type;
__u16   pdmc_alen;
-   __u8pdmc_addr[MAX_ADDR_LEN];
+   __u8pdmc_addr[32]; /* MAX_ADDR_LEN */
 };
 
 struct packet_diag_ring {
-- 
ldv

Re: [PATCH] net: toshiba: ps3_genic_net: use new api ethtool_{get|set}_link_ksettings

2017-03-07 Thread Geoff Levand


On 03/05/2017 02:21 PM, Philippe Reynes wrote:

The ethtool api {get|set}_settings is deprecated.
We move this driver to new api {get|set}_link_ksettings.

As I don't have the hardware, I'd be very pleased if
someone may test this patch.


I tested this applied to v4.11-rc1 and it seems to work OK.
Thanks for your contribution.

Tested-by: Geoff Levand

Re: [RFC PATCH] uapi: fix linux/packet_diag.h userspace compilation error

2017-03-07 Thread Dmitry V. Levin

On Tue, Mar 07, 2017 at 12:16:49PM -0800, David Miller wrote:
> From: "Dmitry V. Levin" 
> Date: Tue, 28 Feb 2017 04:39:30 +0300
> 
> > Replace MAX_ADDR_LEN with its numeric value to fix the following
> > linux/packet_diag.h userspace compilation error:
> > 
> > /usr/include/linux/packet_diag.h:67:17: error: 'MAX_ADDR_LEN' undeclared 
> > here (not in a function)
> >   __u8 pdmc_addr[MAX_ADDR_LEN];
> > 
> > This is not the first case in the UAPI where the numeric value
> > of MAX_ADDR_LEN is used, uapi/linux/if_link.h already does the same,
> > and there are no UAPI headers besides these two that use MAX_ADDR_LEN.
> > 
> > The alternative fix would be to include <linux/netdevice.h> which
> > pulls in other headers and a lot of definitions with them.
> > 
> > Signed-off-by: Dmitry V. Levin 
> 
> If if_link.h includes netdevice.h properly, let's try to do the same here as 
> well.

Sorry if my words weren't clear enough: no, if_link.h doesn't include
netdevice.h, it uses the _numeric_ value instead:

$ grep MAX_ADDR_LEN include/uapi/linux/if_link.h 
__u8 mac[32]; /* MAX_ADDR_LEN */


-- 
ldv

Re: [PATCH v1 net-next] net: dwc-xlgmac: Initial driver for DesignWare Enterprise Ethernet

2017-03-07 Thread David Miller

From: Jie Deng 
Date: Wed, 1 Mar 2017 12:00:25 +0800

> +static int xlgmac_init(struct xlgmac_pdata *pdata)
> +{
> + struct net_device *netdev = pdata->netdev;
> + struct xlgmac_hw_ops *hw_ops = &pdata->hw_ops;

Please order local variable declarations from longest to shortest line.

Please fix this in your entire submission.

> +
> +err_netdev_register:
> +err_init:

Avoid multiple error labels in the same exact location, by having only
one which describes what the error path unwinds.  For example,  here you
could say "err_free_netdev".

Please fix this up in your entire submission.

Re: [PATCH] net: qcom/emac: optimize QDF2400 SGMII RX/TX impedence values

2017-03-07 Thread David Miller

From: Timur Tabi 
Date: Tue, 28 Feb 2017 17:16:02 -0600

> Adjust the impedance values of the RX and TX lanes in the SGMII block
> so that they are closer to optimal values.
> 
> Signed-off-by: Timur Tabi 

Applied to net-next, thanks.

Re: [PATCH] net/mlx5e: add IPV6 dependency

2017-03-07 Thread David Miller

From: Arnd Bergmann 
Date: Tue, 28 Feb 2017 22:12:04 +0100

> The ethernet support now calls directly into the ipv6 core code, which
> fails if IPV6 is a loadable module but mlx5 is built-in:
> 
> drivers/net/ethernet/mellanox/mlx5/core/en_tc.o: In function 
> `mlx5e_create_encap_header_ipv6':
> en_tc.c:(.text.mlx5e_create_encap_header_ipv6+0x110): undefined reference to 
> `ip6_route_output_flags'
> 
> This adds a dependency to ensure that MLX5_CORE_EN can only be built
> if we are able to link the kernel successfully. The downside is that the
> ethernet option can be hidden. Alternatively we could make MLX5_CORE
> depend on "IPV6 || !IPV6", which would force MLX5_CORE to be a module
> when IPV6 is, including in configurations where we don't use the ethernet
> support at all.
> 
> Signed-off-by: Arnd Bergmann 

Applied.

Re: [PATCH net-next 1/1] net: rmnet_data: Initial implementation

2017-03-07 Thread David Miller

From: Subash Abhinov Kasiviswanathan 
Date: Thu, 23 Feb 2017 20:30:54 -0700

> +struct rmnet_nl_msg_s {
> + uint16_t reserved;
> + uint16_t message_type;
> + uint16_t reserved2:14;
> + uint16_t crd:2;

Inside the kernel you should use "u32", "u16", "u8", etc. for purely
internal things, and for datastructures exported to userspace you
should use "__u32", "__u16", "__u8".

Furthermore, if the members have specific endianness you should
properly use the "__le32", "__le16", "__be32", "__be16" etc. types.

> +/**
> + * rmnet_vnd_ioctl() - IOCTL NDO callback

Please use netlink for device instantiation and configuration
rather than ioctls.

Re: [PATCH v2] dt: emac: document device-tree based phy discovery and setup

2017-03-07 Thread David Miller

From: Christian Lamparter 
Date: Mon, 27 Feb 2017 21:54:50 +0100

> This patch adds documentation for a new "phy-handle" property,
> "fixed-link" and "mdio" sub-node. These allow the enumeration
> of PHYs which are supported by the phy library under drivers/net/phy.
> 
> The EMAC ethernet controller in IBM and AMCC 4xx chips is
> currently stuck with a few privately defined phy
> implementations. It has no support for PHYs which
> are supported by the generic phylib.
> 
> Acked-by: Rob Herring 
> Reviewed-by: Florian Fainelli 
> Signed-off-by: Christian Lamparter 

Applied, thanks.

Re: [RFC PATCH] uapi: fix linux/packet_diag.h userspace compilation error

2017-03-07 Thread David Miller

From: "Dmitry V. Levin" 
Date: Tue, 28 Feb 2017 04:39:30 +0300

> Replace MAX_ADDR_LEN with its numeric value to fix the following
> linux/packet_diag.h userspace compilation error:
> 
> /usr/include/linux/packet_diag.h:67:17: error: 'MAX_ADDR_LEN' undeclared here 
> (not in a function)
>   __u8 pdmc_addr[MAX_ADDR_LEN];
> 
> This is not the first case in the UAPI where the numeric value
> of MAX_ADDR_LEN is used, uapi/linux/if_link.h already does the same,
> and there are no UAPI headers besides these two that use MAX_ADDR_LEN.
> 
> The alternative fix would be to include <linux/netdevice.h> which
> pulls in other headers and a lot of definitions with them.
> 
> Signed-off-by: Dmitry V. Levin 

If if_link.h includes netdevice.h properly, let's try to do the same here as 
well.

Thanks.

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread Dmitry Vyukov

On Tue, Mar 7, 2017 at 8:30 PM, Dmitry Vyukov  wrote:
>>> On 3/7/17 11:13 AM, Dmitry Vyukov wrote:
> on this warning:
>
> /* dst.next really should not be set at this point */
> if (rt->dst.next && rt->dst.next->ops->family != AF_INET6) {
> pr_warn("fib6_add: adding rt with bad next -- family %d dst
> flags %x\n",
> rt->dst.next->ops->family, rt->dst.next->flags);
>
> WARN_ON(1);
> }
>
> You should have seen the pr_warn in the log preceding the WARN_ON dump.

 Right. They all have the same "IPv6: fib6_add: adding rt with bad next
 -- family 2 dst flags 6"
>>>
>>> remove the previous changes and try the attached.
>>
>>
>> Doing this now.
>> FWIW I've also applied your last patch with missing "iter->dst.flags
>> &= ~DST_IN_FIB;" and restored the warning in rt6_rcu_free and it did
>> not fire (in a limited run). I only saw the "WARNING in fib6_add" that
>> I already reported.
>
>
> So far I've hit only:
> [ 1103.840031] BUG: KASAN: slab-out-of-bounds in fib6_age+0x3fd/0x480
> at addr 8800799d2254
> without any preceding warnings.
> But note that since the kernel is heavily stressed I can reliably get
> any pr_err output if it happens right before BUG/WARNING. Anything
> that happens minutes before will be lost because there are tons of
> output.



So far 6 "KASAN: slab-out-of-bounds Read in fib6_age" but no other warnings.

Re: [PATCH net] net/tunnel: set inner protocol in network gro hooks

2017-03-07 Thread Alexander Duyck

On Tue, Mar 7, 2017 at 9:33 AM, Paolo Abeni  wrote:
> The gso code of several tunnel types (gre and udp tunnels)
> takes for granted that the skb->inner_protocol is properly
> initialized and drops the packet otherwise.
>
> On the forwarding path no one is initializing such field,
> so gro encapsulated packets are dropped on forward.
>
> Since commit 38720352412a ("gre: Use inner_proto to obtain
> inner header protocol"), this can be reproduced when the
> encapsulated packets use gre as the tunneling protocol.
>
> The issue happens also with vxlan and geneve tunnels since
> commit 8bce6d7d0d1e ("udp: Generalize skb_udp_segment"), if the
> forwarding host's ingress nic has h/w offload for such tunnel
> and a vxlan/geneve device is configured on top of it, regardless
> of the configured peer address and vni.
>
> To address the issue, this change initializes the inner_protocol
> field for encapsulated packets in both ipv4 and ipv6 gro complete
> callbacks.
>
> Fixes: 38720352412a ("gre: Use inner_proto to obtain inner header protocol")
> Fixes: 8bce6d7d0d1e ("udp: Generalize skb_udp_segment")
> Signed-off-by: Paolo Abeni 

Looks good to me.

Acked-by: Alexander Duyck 

> ---
>  net/ipv4/af_inet.c | 4 +++-
>  net/ipv6/ip6_offload.c | 4 +++-
>  2 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 602d40f..5091f46 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1487,8 +1487,10 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
> int proto = iph->protocol;
> int err = -ENOSYS;
>
> -   if (skb->encapsulation)
> +   if (skb->encapsulation) {
> +   skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
> skb_set_inner_network_header(skb, nhoff);
> +   }
>
> csum_replace2(&iph->check, iph->tot_len, newlen);
> iph->tot_len = newlen;
> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
> index 0838e6d..93e58a5 100644
> --- a/net/ipv6/ip6_offload.c
> +++ b/net/ipv6/ip6_offload.c
> @@ -294,8 +294,10 @@ static int ipv6_gro_complete(struct sk_buff *skb, int 
> nhoff)
> struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
> int err = -ENOSYS;
>
> -   if (skb->encapsulation)
> +   if (skb->encapsulation) {
> +   skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
> skb_set_inner_network_header(skb, nhoff);
> +   }
>
> iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
>
> --
> 2.9.3
>

[PATCH net-next] liquidio: add support for XPS

2017-03-07 Thread Felix Manlunas

From: Rick Farrington 

Add support for XPS.

Signed-off-by: Rick Farrington 
Signed-off-by: Felix Manlunas 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index be9c0e3..dffed43 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2553,6 +2553,15 @@ static inline int setup_io_queues(struct octeon_device 
*octeon_dev,
__func__);
return 1;
}
+
+   if (octeon_dev->ioq_vector) {
+   struct octeon_ioq_vector *ioq_vector;
+
+   ioq_vector = &octeon_dev->ioq_vector[q];
+   netif_set_xps_queue(netdev,
+   &ioq_vector->affinity_mask,
+   ioq_vector->iq_index);
+   }
}
 
return 0;

Re: [PATCH v2 0/6] Updates for Marvell Switch SoCs

2017-03-07 Thread Chris Packham

Hi Gregory,

On 08/03/17 06:10, Gregory CLEMENT wrote:
> Hi Chris,
>
>  On jeu., févr. 16 2017, Chris Packham  
> wrote:
>
>> Shortly after I posted my last series I got access to a more recent
>> Marvell SDK which had some device tree support for the switch SoCs I'd
>> been wanting. It was still based on an older kernel but it was a huge
>> improvement over what came before.
>>
>> Patch 1/6 is a typo I noticed after my initial series was applied.
>>
>> Patch 2/6 is a bit of a cleanup. I did initially struggle with how to
>> access individual parts of the DFX block as well as retaining a handle on
>> the entire thing for the switch driver to use.
>>
>> Patch 3/6 is a re-jig of the dtsi files which is needed by 5/6. This is
>> required because I need to use the coreclk label on a different node. It
>> also means I don't have to disable nodes for blocks that only exist on
>> the Armada-XP.
>>
>> Patch 4/6, 5/6 are split from the previous versions.
>>
>> Patch 6/6 is the device tree portion of a change already in clk-next.
>
>
> I applied patches 2, 3 and 6 on mvebu/dt with the acked-by from Rob when
> he gave it.
>
> Patch 1 is already part of 4.11-rc1.
>
> For patch 4 and 5 I understand that either we don't need it or it should
> be a updated version so I skipped them.
>

That's correct, thanks.

Re: [PATCH 1/1] rds: ib: add error handle

2017-03-07 Thread Santosh Shilimkar


On 3/6/2017 11:48 PM, Zhu Yanjun wrote:

In the function rds_ib_setup_qp, the error handle is missing. When some
error occurs, it is possible that memory leak occurs. As such, error
handle is added.

Cc: Joe Jin 
Reviewed-by: Junxiao Bi 
Reviewed-by: Guanglei Li 
Signed-off-by: Zhu Yanjun 
---

Looks good.
Acked-by: Santosh Shilimkar

RE: [PATCH net 3/7 v2] bnx2x: fix possible overrun of VFPF multicast addresses array

2017-03-07 Thread Mintz, Yuval

> It is too late to check for the limit of the number of VF multicast addresses
> after they have already been copied to the req->multicast[] array, possibly
> overflowing it.
> 
> Do the check before copying.
> 
> Checking early also avoids having to (and forgetting to) unlock vf2pf_mutex.
> 
> While we're looking at the error paths in the function, also return an error
> code from it when the PF responds with an error. Even though the caller
> ignores it.
> 
> v2: Move the check before bnx2x_vfpf_prep() as suggested by Yuval.
> 
> Signed-off-by: Michal Schmidt 

Acked-by: Yuval Mintz

[PATCH v3] {net,IB}/{rxe,usnic}: Utilize generic mac to eui32 function

2017-03-07 Thread Yuval Shaia

This logic seems to be duplicated in (at least) three separate files.
Move it to one place so code can be re-used.

Signed-off-by: Yuval Shaia 
---
v0 -> v1:
* Add missing #include
* Rename to genaddrconf_ifid_eui48
v1 -> v2:
* Reset eui[0] to default if dev_id is used
v2 -> v3:
* Add helper function to avoid re-setting eui[0] to default if
  dev_id is used
---
 drivers/infiniband/hw/usnic/usnic_common_util.h | 11 +++
 drivers/infiniband/sw/rxe/rxe_net.c | 11 ++-
 include/net/addrconf.h  | 25 +++--
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h 
b/drivers/infiniband/hw/usnic/usnic_common_util.h
index b54986d..d91b035 100644
--- a/drivers/infiniband/hw/usnic/usnic_common_util.h
+++ b/drivers/infiniband/hw/usnic/usnic_common_util.h
@@ -34,6 +34,8 @@
 #ifndef USNIC_CMN_UTIL_H
 #define USNIC_CMN_UTIL_H
 
+#include <net/addrconf.h>
+
 static inline void
 usnic_mac_to_gid(const char *const mac, char *raw_gid)
 {
@@ -57,14 +59,7 @@ usnic_mac_ip_to_gid(const char *const mac, const __be32 
inaddr, char *raw_gid)
raw_gid[1] = 0x80;
memset(&raw_gid[2], 0, 2);
memcpy(&raw_gid[4], &inaddr, 4);
-   raw_gid[8] = mac[0]^2;
-   raw_gid[9] = mac[1];
-   raw_gid[10] = mac[2];
-   raw_gid[11] = 0xff;
-   raw_gid[12] = 0xfe;
-   raw_gid[13] = mac[3];
-   raw_gid[14] = mac[4];
-   raw_gid[15] = mac[5];
+   addrconf_addr_eui48(&raw_gid[8], mac);
 }
 
 static inline void
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c 
b/drivers/infiniband/sw/rxe/rxe_net.c
index d8610960..ab8ea23 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include 
+#include <net/addrconf.h>
 #include 
 #include 
 
@@ -86,18 +87,10 @@ struct rxe_recv_sockets recv_sockets;
 
 static __be64 rxe_mac_to_eui64(struct net_device *ndev)
 {
-   unsigned char *mac_addr = ndev->dev_addr;
__be64 eui64;
unsigned char *dst = (unsigned char *)&eui64;
 
-   dst[0] = mac_addr[0] ^ 2;
-   dst[1] = mac_addr[1];
-   dst[2] = mac_addr[2];
-   dst[3] = 0xff;
-   dst[4] = 0xfe;
-   dst[5] = mac_addr[3];
-   dst[6] = mac_addr[4];
-   dst[7] = mac_addr[5];
+   addrconf_addr_eui48(dst, ndev->dev_addr);
 
return eui64;
 }
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 17c6fd8..28274ed 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -103,12 +103,25 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct 
net_device *dev,
 u32 addr_flags, bool sllao, bool tokenized,
 __u32 valid_lft, u32 prefered_lft);
 
+static inline void addrconf_addr_eui48_xor(u8 *eui, const char *const addr, 
bool xor)
+{
+   memcpy(eui, addr, 3);
+   if (xor)
+   eui[0] ^= 2;
+   eui[3] = 0xFF;
+   eui[4] = 0xFE;
+   memcpy(eui + 5, addr + 3, 3);
+}
+
+static inline void addrconf_addr_eui48(u8 *eui, const char *const addr)
+{
+   addrconf_addr_eui48_xor(eui, addr, true);
+}
+
 static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
 {
if (dev->addr_len != ETH_ALEN)
return -1;
-   memcpy(eui, dev->dev_addr, 3);
-   memcpy(eui + 5, dev->dev_addr + 3, 3);
 
/*
 * The zSeries OSA network cards can be shared among various
@@ -123,14 +136,14 @@ static inline int addrconf_ifid_eui48(u8 *eui, struct 
net_device *dev)
 * case.  Hence the resulting interface identifier has local
 * scope according to RFC2373.
 */
+
+   addrconf_addr_eui48_xor(eui, dev->dev_addr, !dev->dev_id);
+
if (dev->dev_id) {
eui[3] = (dev->dev_id >> 8) & 0xFF;
eui[4] = dev->dev_id & 0xFF;
-   } else {
-   eui[3] = 0xFF;
-   eui[4] = 0xFE;
-   eui[0] ^= 2;
}
+
return 0;
 }
 
-- 
2.7.4

Re: [PATCH v2] {net,IB}/{rxe,usnic}: Utilize generic mac to eui32 function

2017-03-07 Thread Yuval Shaia

On Tue, Mar 07, 2017 at 04:27:11PM +0200, Leon Romanovsky wrote:
> On Mon, Mar 06, 2017 at 08:54:06PM +0200, Yuval Shaia wrote:
> > This logic seems to be duplicated in (at least) three separate files.
> > Move it to one place so code can be re-use.
> >
> > Signed-off-by: Yuval Shaia 
> > ---
> > v0 -> v1:
> > * Add missing #include
> > * Rename to genaddrconf_ifid_eui48
> > v1 -> v2:
> > * Reset eui[0] to default if dev_id is used
> > ---
> >  drivers/infiniband/hw/usnic/usnic_common_util.h | 11 +++
> >  drivers/infiniband/sw/rxe/rxe_net.c | 11 ++-
> >  include/net/addrconf.h  | 19 +--
> >  3 files changed, 18 insertions(+), 23 deletions(-)
> >
> >  * scope according to RFC2373.
> >  */
> > if (dev->dev_id) {
> > +   eui[0] = dev->dev_addr[0];
> > eui[3] = (dev->dev_id >> 8) & 0xFF;
> > eui[4] = dev->dev_id & 0xFF;
> > -   } else {
> > -   eui[3] = 0xFF;
> > -   eui[4] = 0xFE;
> > -   eui[0] ^= 2;
> > }
> > +
> > return 0;
> >  }
> 
> Technically, the code is correct now, but it doesn't look right
> to set the value and restore it right after that.

Agree.
I will soon send v3 that offers an alternative by adding new helper
function so this re-assignment can be avoided.

> 
> Thanks
> 
> >
> > --
> > 2.7.4
> >

Re: netlink: GPF in netlink_unicast

2017-03-07 Thread Paul Moore

On Tue, Mar 7, 2017 at 1:44 PM, Paul Moore  wrote:
> On Tue, Mar 7, 2017 at 10:55 AM, Richard Guy Briggs  wrote:
>> On 2017-03-07 09:29, Paul Moore wrote:
>>> On Mon, Mar 6, 2017 at 11:03 PM, Richard Guy Briggs  wrote:
>>> > On 2017-03-06 10:10, Cong Wang wrote:
>>> >> On Mon, Mar 6, 2017 at 2:54 AM, Dmitry Vyukov  wrote:
>>> >> > Hello,
>>> >> >
>>> >> > I've got the following crash while running syzkaller fuzzer on
>>> >> > net-next/8d70eeb84ab277377c017af6a21d0a337025dede:
>>> >> >
>>> >> > kasan: GPF could be caused by NULL-ptr deref or user memory access
>>> >> > general protection fault:  [#1] SMP KASAN
>>> >> > Dumping ftrace buffer:
>>> >> >(ftrace buffer empty)
>>> >> > Modules linked in:
>>> >> > CPU: 0 PID: 883 Comm: kauditd Not tainted 4.10.0+ #6
>>> >> > Hardware name: Google Google Compute Engine/Google Compute Engine,
>>> >> > BIOS Google 01/01/2011
>>> >> > task: 8801d79f0240 task.stack: 8801d7a2
>>> >> > RIP: 0010:sock_sndtimeo include/net/sock.h:2162 [inline]
>>> >> > RIP: 0010:netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249
>>> >> > RSP: 0018:8801d7a27c38 EFLAGS: 00010206
>>> >> > RAX: 0056 RBX: 8801d7a27cd0 RCX: 
>>> >> > RDX:  RSI:  RDI: 02b0
>>> >> > RBP: 8801d7a27cf8 R08: ed00385cf286 R09: ed00385cf286
>>> >> > R10: 0006 R11: ed00385cf285 R12: 
>>> >> > R13: dc00 R14: 8801c2fc3c80 R15: 014000c0
>>> >> > FS:  () GS:8801dbe0() 
>>> >> > knlGS:
>>> >> > CS:  0010 DS:  ES:  CR0: 80050033
>>> >> > CR2: 20cfd000 CR3: 0001c758f000 CR4: 001406f0
>>> >> > Call Trace:
>>> >> >  kauditd_send_unicast_skb+0x3c/0x70 kernel/audit.c:482
>>> >> >  kauditd_thread+0x174/0xb00 kernel/audit.c:599
>>> >> >  kthread+0x326/0x3f0 kernel/kthread.c:229
>>> >> >  ret_from_fork+0x31/0x40 arch/x86/entry/entry_64.S:430
>>> >> > Code: 44 89 fe e8 56 15 ff ff 8b 8d 70 ff ff ff 49 89 c6 31 c0 85 c9
>>> >> > 75 27 e8 b2 b2 f4 fd 49 8d bc 24 b0 02 00 00 48 89 f8 48 c1 e8 03 <42>
>>> >> > 80 3c 28 00 0f 85 37 06 00 00 49 8b 84 24 b0 02 00 00 4c 8d
>>> >> > RIP: sock_sndtimeo include/net/sock.h:2162 [inline] RSP: 
>>> >> > 8801d7a27c38
>>> >> > RIP: netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249 RSP:
>>> >> > 8801d7a27c38
>>> >> > ---[ end trace ad1bba9d457430b6 ]---
>>> >> > Kernel panic - not syncing: Fatal exception
>>> >> >
>>> >> >
>>> >> > This is not reproducible and seems to be caused by an elusive race.
>>> >> > However, looking at the code I don't see any proper protection of
>>> >> > audit_sock (other than the if (!audit_pid) which is obviously not
>>> >> > enough to protect against races).
>>> >>
>>> >> audit_cmd_mutex is supposed to protect it, I think.
>>> >> But kauditd_send_unicast_skb() seems not holding this mutex.
>>> >
>>> > H, I wonder if it makes sense to wrap most of the contents of the
>>> > outer while loop in kauditd_thread in the audit_cmd_mutex, or around the
>>> > first two inner while loops and the "if (auditd)" condition after the
>>> > "quick_loop:" label.  The condition on auditd is supposed to catch that
>>> > case.  We don't want it locked while playing with the scheduler at the
>>> > bottom of that function.
>>>
>>> Let me look into this and play around with a few things.  I suspected
>>> there might be a problem here, so I've got thoughts on how we might
>>> resolve it; I just need to code them up and see what option sucks
>>> the least.
>>>
>>> FWIW Richard, yes wrapping most of kauditd_thread *should* resolve
>>> this but it's pretty heavy handed and not my first choice.
>>
>> That's why the inner loops made a bit more sense since it wasn't really
>> necessary and ran afoul of the scheduler anyways.
>
> One of my preferred options was to get us away from protecting
> everything with the audit_cmd_mutex by creating a new locking approach
> for the auditd connection state (using RCU/spinlocks since it rarely
> changes in practice) and leaving the audit_cmd_mutex for its
> traditional role.  This should minimize the performance impact of the
> lock and clean things up a bit.  I'm also moving all the auditd
> connection state into a single struct (instead of several variables
> associated only by convention) which moves us oh so slightly closer to
> allowing multiple auditd connections (hey, it's something).
>
> It's taking a bit longer than expected as I'm dealing with a bit of a
> head cold (or something) and my mind is far less than 100% at the
> moment ...

Ooof.  I just noticed something, and maybe this is the fever talking,
but why do we ever NULL out audit_sock and why are we bothering with
those holds/puts?  We create the audit netlink socket in
audit_net_init() and it should remain valid until we kill it in

[PATCH ethtool] ethtool: print hash function with ethtool -x|--show-rxfh-indir

2017-03-07 Thread Jakub Kicinski

Make ethtool -x|--show-rxfh-indir print the RSS hash function name.

Signed-off-by: Jakub Kicinski 
---
 ethtool.8.in |  3 ++-
 ethtool.c| 31 +++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/ethtool.8.in b/ethtool.8.in
index 5c36c06385f6..5f5a141c1482 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -854,7 +854,8 @@ Show the device's time stamping capabilities and associated 
PTP
 hardware clock.
 .TP
 .B \-x \-\-show\-rxfh\-indir \-\-show\-rxfh
-Retrieves the receive flow hash indirection table and/or RSS hash key.
+Retrieves the receive flow hash indirection table, hash function and RSS hash
+key.
 .TP
 .B \-X \-\-set\-rxfh\-indir \-\-rxfh
 Configures the receive flow hash indirection table and/or RSS hash key.
diff --git a/ethtool.c b/ethtool.c
index 7af039e26b50..2d8dd54752c4 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -3638,6 +3638,35 @@ static int do_grxfhindir(struct cmd_context *ctx,
return 0;
 }
 
+static void print_grxfh_hfunc(struct cmd_context *ctx, __u8 hfunc)
+{
+   unsigned int func_id = ffs(hfunc) - 1;
+   struct ethtool_gstrings *strings;
+
+   printf("RSS hash function: ");
+   if (!hfunc) {
+   printf("unknown\n");
+   return;
+   }
+
+   strings = get_stringset(ctx, ETH_SS_RSS_HASH_FUNCS, 0, 1);
+   if (!strings) {
+   if (errno != EOPNOTSUPP)
+   printf("%s\n", strerror(errno));
+   else
+   printf("unnamed (%d)\n", func_id);
+   return;
+   }
+
+   if (func_id < strings->len)
+   printf("%s\n",
+  (char *)strings->data + func_id * ETH_GSTRING_LEN);
+   else
+   printf("unnamed (%d)\n", func_id);
+
+   free(strings);
+}
+
 static int do_grxfh(struct cmd_context *ctx)
 {
struct ethtool_rxfh rss_head = {0};
@@ -3683,6 +3712,8 @@ static int do_grxfh(struct cmd_context *ctx)
 
print_indir_table(ctx, &ring_count, rss->indir_size, rss->rss_config);
 
+   print_grxfh_hfunc(ctx, rss->hfunc);
+
indir_bytes = rss->indir_size * sizeof(rss->rss_config[0]);
hkey = ((char *)rss->rss_config + indir_bytes);
 
-- 
2.11.0

Re: [PATCH 10/29] drivers, md: convert stripe_head.count from atomic_t to refcount_t

2017-03-07 Thread Shaohua Li

On Mon, Mar 06, 2017 at 04:20:57PM +0200, Elena Reshetova wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
> 
> Signed-off-by: Elena Reshetova 
> Signed-off-by: Hans Liljestrand 
> Signed-off-by: Kees Cook 
> Signed-off-by: David Windsor 
> ---
>  drivers/md/raid5-cache.c |  8 +++---
>  drivers/md/raid5.c   | 66 
> 
>  drivers/md/raid5.h   |  3 ++-
>  3 files changed, 39 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 3f307be..6c05e12 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c

snip
>  sh->check_state, sh->reconstruct_state);
>  
>   analyse_stripe(sh, &s);
> @@ -4924,7 +4924,7 @@ static void activate_bit_delay(struct r5conf *conf,
>   struct stripe_head *sh = list_entry(head.next, struct 
> stripe_head, lru);
>   int hash;
>   list_del_init(&sh->lru);
> - atomic_inc(&sh->count);
> + refcount_inc(&sh->count);
>   hash = sh->hash_lock_index;
>   __release_stripe(conf, sh, &temp_inactive_list[hash]);
>   }
> @@ -5240,7 +5240,7 @@ static struct stripe_head *__get_priority_stripe(struct 
> r5conf *conf, int group)
>   sh->group = NULL;
>   }
>   list_del_init(&sh->lru);
> - BUG_ON(atomic_inc_return(&sh->count) != 1);
> + BUG_ON(refcount_inc_not_zero(&sh->count));

This changes the behavior. refcount_inc_not_zero doesn't inc if original value 
is 0

Thanks,
Shaohua

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread Dmitry Vyukov

On Tue, Mar 7, 2017 at 7:43 PM, David Ahern  wrote:
> On 3/7/17 11:13 AM, Dmitry Vyukov wrote:
>>> on this warning:
>>>
>>> /* dst.next really should not be set at this point */
>>> if (rt->dst.next && rt->dst.next->ops->family != AF_INET6) {
>>> pr_warn("fib6_add: adding rt with bad next -- family %d dst
>>> flags %x\n",
>>> rt->dst.next->ops->family, rt->dst.next->flags);
>>>
>>> WARN_ON(1);
>>> }
>>>
>>> You should have seen the pr_warn in the log preceding the WARN_ON dump.
>>
>> Right. They all have the same "IPv6: fib6_add: adding rt with bad next
>> -- family 2 dst flags 6"
>
> remove the previous changes and try the attached.


Doing this now.
FWIW I've also applied your last patch with missing "iter->dst.flags
&= ~DST_IN_FIB;" and restored the warning in rt6_rcu_free and it did
not fire (in a limited run). I only saw the "WARNING in fib6_add" that
I already reported.

please add some examples to the ip man page

2017-03-07 Thread 積丹尼 Dan Jacobson

$ man ip
should have some usage examples. Even one would be fine.

Re: [PATCH 08/29] drivers, md: convert mddev.active from atomic_t to refcount_t

2017-03-07 Thread Shaohua Li

On Mon, Mar 06, 2017 at 04:20:55PM +0200, Elena Reshetova wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.

Looks good. Let me know how you want to route the patch to upstream.
 
> Signed-off-by: Elena Reshetova 
> Signed-off-by: Hans Liljestrand 
> Signed-off-by: Kees Cook 
> Signed-off-by: David Windsor 
> ---
>  drivers/md/md.c | 6 +++---
>  drivers/md/md.h | 3 ++-
>  2 files changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 985374f..94c8ebf 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -449,7 +449,7 @@ EXPORT_SYMBOL(md_unplug);
>  
>  static inline struct mddev *mddev_get(struct mddev *mddev)
>  {
> - atomic_inc(&mddev->active);
> + refcount_inc(&mddev->active);
>   return mddev;
>  }
>  
> @@ -459,7 +459,7 @@ static void mddev_put(struct mddev *mddev)
>  {
>   struct bio_set *bs = NULL;
>  
> - if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
> + if (!refcount_dec_and_lock(&mddev->active, &all_mddevs_lock))
>   return;
>   if (!mddev->raid_disks && list_empty(&mddev->disks) &&
>   mddev->ctime == 0 && !mddev->hold_active) {
> @@ -495,7 +495,7 @@ void mddev_init(struct mddev *mddev)
>   INIT_LIST_HEAD(&mddev->all_mddevs);
>   setup_timer(&mddev->safemode_timer, md_safemode_timeout,
>   (unsigned long) mddev);
> - atomic_set(&mddev->active, 1);
> + refcount_set(&mddev->active, 1);
>   atomic_set(&mddev->openers, 0);
>   atomic_set(&mddev->active_io, 0);
>   spin_lock_init(&mddev->lock);
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index b8859cb..4811663 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/refcount.h>
>  #include 
>  #include 
>  #include 
> @@ -360,7 +361,7 @@ struct mddev {
>*/
>   struct mutexopen_mutex;
>   struct mutexreconfig_mutex;
> - atomic_tactive; /* general refcount */
> + refcount_t  active; /* general refcount */
>   atomic_topeners;/* number of active 
> opens */
>  
>   int changed;/* True if we might 
> need to
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC net-next v2 1/4] skbuff: add stub to help computing crc32c on SCTP packets

2017-03-07 Thread Alexander Duyck

On Mon, Mar 6, 2017 at 1:51 PM, Davide Caratti  wrote:
> On Tue, 2017-02-28 at 14:46 -0800, Alexander Duyck wrote:
>> On Tue, Feb 28, 2017 at 2:32 AM, Davide Caratti  wrote:
>> >
>> > sctp_compute_checksum requires crc32c symbol (provided by libcrc32c), so
>> > it can't be used in net core. Like it has been done previously with other
>> > symbols (e.g. ipv6_dst_lookup), introduce a stub struct skb_checksum_ops
>> > to allow computation of SCTP checksum in net core after sctp.ko (and thus
>> > libcrc32c) has been loaded.
>>
>> At a minimum the name really needs to change.  SCTP does not do
>> checksums.  It does a CRC, and a CRC is a very different thing.  The
>> fact that somebody decided that offloading a CRC could use the same
>> framework is very unfortunate, and your patch descriptions in this
>> whole set are calling out a CRC as checksums which it is not.
>
> hello Alexander,
>
> thank you for contributing to this topic. I see there has been a similar
> discussion some months ago
> (https://www.mail-archive.com/netdev@vger.kernel.org/msg94955.html).
>
>> I don't want to see anything "checksum" or "csum" related in the
>> naming when it comes to dealing with SCTP unless we absolutely have
>> to have it.  So any function names or structures with sctp in the name
>> should call out "crc32" or "crc", please don't use checksum.
>
> On Wed, 2017-03-01 at 10:53 +, David Laight wrote:
>> Then also change all the places that refer the IP 1's compliment
>> checksum to ipchecksum.
>
> (but crc32 uses a different polynomial than crc32c! :-) ) I understand
> your concerns, nevertheless we are writing to a member of struct sctphdr
> whose name is 'checksum' since the earliest introduction of SCTP; moreover,
> similar terminology ('crc32c checksum') is used throughout all RFC4960.
> That's why I don't think anybody will be confused by usage of 'csum' or
> 'checksum' words.
>
> On Tue, 2017-02-28 at 19:17 -0800, Tom Herbert wrote:
>> I agree that internal functions to sctp should not refer to checksum,
>> but I think we need to take care to be consistent with any external
>> API (even if somebody made a mistake defining it this way :-) ). As
>> you know the checksum interface must be very precisely defined, there
>> is no leeway for ambiguity.
>
> We can make the new symbols more generic removing 'sctp' from the
> symbol name, and writing explicitly that skb needs crc32c (rather than
> skb does not need internet checksum).
>
> Proposal:
> we use crc32c, possibly combined with 'csum' or 'checksum', just like
> it has been done in RFC4960.  So, symbol names can be replaced as follows:
>
> RFC v2 name  | RFC v3 name
> -+-
> warn_sctp_csum_update| warn_crc32c_csum_update
> warn_sctp_csum_combine   | warn_crc32c_csum_combine
> sctp_csum_stub   | crc32c_csum_stub
> sctp_csum_ops| crc32c_csum_ops
> skb_sctp_csum_help   | skb_crc32c_csum_help
> skb->csum_not_inet   | skb->crc32c_csum
>
> please let me know if the proposal can be acceptable from your point of view.

I do like this approach better.  You might even take this one step
further.  You could convert crc32_csum into a 1 bit enum for now.
Basically you would use 0 for 1's complement csum, and 1 to represent
a crc32c csum.  Then if we end up having to add another bit for
something like FCoE in the future it would give us 4 possible checksum
types instead of just giving us 1 with a bit mask.

> On Tue, 2017-02-28 at 11:50 -0800, Tom Herbert wrote:
>> Unfortunately this potentially pushes the skbuf flags over 32 bits if
>> I count correctly. I suggest that you rename csum_bad to
>> csum_not_inet. Looks like csum_bad is only set by a grand total of one
>> driver and I don't believe that is enough to justify its existence.
>> It's probably a good time to remove it.
>
> you are right: find below the current layout obtained with 'allyesconfig':
>
> short unsigned int queue_mapping;   /*   140 2 */
> unsigned char  __cloned_offset[0];  /*   142 0 */
> unsigned char  cloned:1;/*   142: 7  1 */
> unsigned char  nohdr:1; /*   142: 6  1 */
> unsigned char  fclone:2;/*   142: 4  1 */
> unsigned char  peeked:1;/*   142: 3  1 */
> unsigned char  head_frag:1; /*   142: 2  1 */
> unsigned char  xmit_more:1; /*   142: 1  1 */
> unsigned char  __unused:1;  /*   142: 0  1 */
>
> /* XXX 1 byte hole, try to pack */
> unsigned int   headers_start[0];/*   144 0 */
> unsigned char  __pkt_type_offset[0];/*   144 0 */
> unsigned char  pkt_type:3;  /*   144: 5  1 */
>
>

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread David Ahern

On 3/7/17 11:13 AM, Dmitry Vyukov wrote:
>> on this warning:
>>
>> /* dst.next really should not be set at this point */
>> if (rt->dst.next && rt->dst.next->ops->family != AF_INET6) {
>> pr_warn("fib6_add: adding rt with bad next -- family %d dst
>> flags %x\n",
>> rt->dst.next->ops->family, rt->dst.next->flags);
>>
>> WARN_ON(1);
>> }
>>
>> You should have seen the pr_warn in the log preceding the WARN_ON dump.
> 
> Right. They all have the same "IPv6: fib6_add: adding rt with bad next
> -- family 2 dst flags 6"

remove the previous changes and try the attached.
diff --git a/include/net/dst.h b/include/net/dst.h
index 049af33da3b6..d164eb8ceab8 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -58,6 +58,7 @@ struct dst_entry {
 #define DST_XFRM_TUNNEL0x0080
 #define DST_XFRM_QUEUE 0x0100
 #define DST_METADATA   0x0200
+#define DST_IN_FIB 0x0400
 
short   error;
 
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index c84b3287e38b..cd0df8f76420 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -15,6 +15,7 @@ struct dst_ops {
unsigned short  family;
unsigned intgc_thresh;
 
+   void(*dump)(struct dst_entry *);
int (*gc)(struct dst_ops *ops);
struct dst_entry *  (*check)(struct dst_entry *, __u32 cookie);
unsigned int(*default_advmss)(const struct dst_entry *);
diff --git a/net/core/dst.c b/net/core/dst.c
index 960e503b5a52..c98447fe8510 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -232,6 +232,9 @@ void __dst_free(struct dst_entry *dst)
 {
spin_lock_bh(&dst_garbage.lock);
___dst_free(dst);
+if (dst->flags & DST_IN_FIB)
+   pr_warn("dst %p is marked as in fib\n", dst);
+//WARN_ON(dst->flags & DST_IN_FIB);
dst->next = dst_garbage.list;
dst_garbage.list = dst;
if (dst_garbage.timer_inc > DST_GC_INC) {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e4266746e4a2..d4539d9a463e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -155,6 +155,7 @@ static void node_free(struct fib6_node *fn)
 
 static void rt6_rcu_free(struct rt6_info *rt)
 {
+WARN_ON(rt->dst.flags & DST_IN_FIB);
call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 }
 
@@ -878,6 +879,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
return err;
 
rt->dst.rt6_next = iter;
+   rt->dst.flags |= DST_IN_FIB;
*ins = rt;
rt->rt6i_node = fn;
atomic_inc(&rt->rt6i_ref);
@@ -907,6 +909,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
*ins = rt;
rt->rt6i_node = fn;
rt->dst.rt6_next = iter->dst.rt6_next;
+   rt->dst.flags |= DST_IN_FIB;
atomic_inc(&rt->rt6i_ref);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
@@ -916,6 +919,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
}
nsiblings = iter->rt6i_nsiblings;
fib6_purge_rt(iter, fn, info->nl_net);
+   iter->dst.flags &= ~DST_IN_FIB;
rt6_release(iter);
 
if (nsiblings) {
@@ -926,6 +930,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->dst.rt6_next;
fib6_purge_rt(iter, fn, info->nl_net);
+   iter->dst.flags &= ~DST_IN_FIB;
rt6_release(iter);
nsiblings--;
} else {
@@ -974,6 +979,21 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 !atomic_read(&rt->dst.__refcnt)))
return -EINVAL;
 
+if (rt->dst.ops->family != AF_INET6) {
+   pr_warn("fib6_add: adding rt with family is %d dst flags %x\n",
+   rt->dst.ops->family, rt->dst.flags);
+
+   WARN_ON(1);
+}
+/* dst.next really should not be set at this point */
+if (rt->dst.next && rt->dst.next->ops->family != AF_INET6) {
+   pr_warn("fib6_add: adding rt with bad next -- family %d dst flags %x\n",
+   rt->dst.next->ops->family, rt->dst.next->flags);
+
+   if (rt->dst.ops->dump)
+   rt->dst.ops->dump(&rt->dst);
+}
+
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
allow_create = 0;
@@ -1444,6 +1464,7 @@ static void fib6_del_route(struct fib6_node *fn, struct 
rt6_info **rtp,
read_unlock(&net->ipv6.fib6_walker_lock);
 
rt->dst.rt6_next = NULL;
+   rt->dst.flags

Re: netlink: GPF in netlink_unicast

2017-03-07 Thread Paul Moore

On Tue, Mar 7, 2017 at 10:55 AM, Richard Guy Briggs  wrote:
> On 2017-03-07 09:29, Paul Moore wrote:
>> On Mon, Mar 6, 2017 at 11:03 PM, Richard Guy Briggs  wrote:
>> > On 2017-03-06 10:10, Cong Wang wrote:
>> >> On Mon, Mar 6, 2017 at 2:54 AM, Dmitry Vyukov  wrote:
>> >> > Hello,
>> >> >
>> >> > I've got the following crash while running syzkaller fuzzer on
>> >> > net-next/8d70eeb84ab277377c017af6a21d0a337025dede:
>> >> >
>> >> > kasan: GPF could be caused by NULL-ptr deref or user memory access
>> >> > general protection fault:  [#1] SMP KASAN
>> >> > Dumping ftrace buffer:
>> >> >(ftrace buffer empty)
>> >> > Modules linked in:
>> >> > CPU: 0 PID: 883 Comm: kauditd Not tainted 4.10.0+ #6
>> >> > Hardware name: Google Google Compute Engine/Google Compute Engine,
>> >> > BIOS Google 01/01/2011
>> >> > task: 8801d79f0240 task.stack: 8801d7a2
>> >> > RIP: 0010:sock_sndtimeo include/net/sock.h:2162 [inline]
>> >> > RIP: 0010:netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249
>> >> > RSP: 0018:8801d7a27c38 EFLAGS: 00010206
>> >> > RAX: 0056 RBX: 8801d7a27cd0 RCX: 
>> >> > RDX:  RSI:  RDI: 02b0
>> >> > RBP: 8801d7a27cf8 R08: ed00385cf286 R09: ed00385cf286
>> >> > R10: 0006 R11: ed00385cf285 R12: 
>> >> > R13: dc00 R14: 8801c2fc3c80 R15: 014000c0
>> >> > FS:  () GS:8801dbe0() 
>> >> > knlGS:
>> >> > CS:  0010 DS:  ES:  CR0: 80050033
>> >> > CR2: 20cfd000 CR3: 0001c758f000 CR4: 001406f0
>> >> > Call Trace:
>> >> >  kauditd_send_unicast_skb+0x3c/0x70 kernel/audit.c:482
>> >> >  kauditd_thread+0x174/0xb00 kernel/audit.c:599
>> >> >  kthread+0x326/0x3f0 kernel/kthread.c:229
>> >> >  ret_from_fork+0x31/0x40 arch/x86/entry/entry_64.S:430
>> >> > Code: 44 89 fe e8 56 15 ff ff 8b 8d 70 ff ff ff 49 89 c6 31 c0 85 c9
>> >> > 75 27 e8 b2 b2 f4 fd 49 8d bc 24 b0 02 00 00 48 89 f8 48 c1 e8 03 <42>
>> >> > 80 3c 28 00 0f 85 37 06 00 00 49 8b 84 24 b0 02 00 00 4c 8d
>> >> > RIP: sock_sndtimeo include/net/sock.h:2162 [inline] RSP: 
>> >> > 8801d7a27c38
>> >> > RIP: netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249 RSP:
>> >> > 8801d7a27c38
>> >> > ---[ end trace ad1bba9d457430b6 ]---
>> >> > Kernel panic - not syncing: Fatal exception
>> >> >
>> >> >
>> >> > This is not reproducible and seems to be caused by an elusive race.
>> >> > However, looking at the code I don't see any proper protection of
>> >> > audit_sock (other than the if (!audit_pid) which is obviously not
>> >> > enough to protect against races).
>> >>
>> >> audit_cmd_mutex is supposed to protect it, I think.
>> >> But kauditd_send_unicast_skb() seems not holding this mutex.
>> >
>> > H, I wonder if it makes sense to wrap most of the contents of the
>> > outer while loop in kauditd_thread in the audit_cmd_mutex, or around the
>> > first two inner while loops and the "if (auditd)" condition after the
>> > "quick_loop:" label.  The condition on auditd is supposed to catch that
>> > case.  We don't want it locked while playing with the scheduler at the
>> > bottom of that function.
>>
>> Let me look into this and play around with a few things.  I suspected
>> there might be a problem here, so I've got thoughts on how we might
>> resolve it; I just need to code them up and see what option sucks
>> the least.
>>
>> FWIW Richard, yes wrapping most of kauditd_thread *should* resolve
>> this but it's pretty heavy handed and not my first choice.
>
> That's why the inner loops made a bit more sense since it wasn't really
> necessary and ran afoul of the scheduler anyways.

One of my preferred options was to get us away from protecting
everything with the audit_cmd_mutex by creating a new locking approach
for the auditd connection state (using RCU/spinlocks since it rarely
changes in practice) and leaving the audit_cmd_mutex for its
traditional role.  This should minimize the performance impact of the
lock and clean things up a bit.  I'm also moving all the auditd
connection state into a single struct (instead of several variables
associated only by convention) which moves us oh so slightly closer to
allowing multiple auditd connections (hey, it's something).

It's taking a bit longer than expected as I'm dealing with a bit of a
head cold (or something) and my mind is far less than 100% at the
moment ...

-- 
paul moore
www.paul-moore.com

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread David Ahern

On 3/7/17 1:43 AM, Dmitry Vyukov wrote:
> This is on c1ae3cfa0e89fa1a7ecc4c99031f5e9ae99d9201. No other kernel
> output from your patch (pr_err).
> 
> [ cut here ]
> WARNING: CPU: 1 PID: 30179 at net/ipv6/ip6_fib.c:158
> rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
> Kernel panic - not syncing: panic_on_warn set ...

you have panic_on_warn set ...

> 
> CPU: 1 PID: 30179 Comm: syz-executor3 Not tainted 4.11.0-rc1+ #310
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:16 [inline]
>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52
>  panic+0x20f/0x426 kernel/panic.c:180
>  __warn+0x1c4/0x1e0 kernel/panic.c:541
>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584
>  rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158

and this is my WARN_ON in rt6_rcu_free which is showing an additional
change is needed

>  rt6_release+0x1ee/0x290 net/ipv6/ip6_fib.c:189
>  fib6_add_rt2node net/ipv6/ip6_fib.c:922 [inline]

in fib6_add_rt2node for the route replace path (whitespace damaged on
the copy-paste):

@@ -916,6 +919,7 @@ static int fib6_add_rt2node(struct fib6_node *fn,
struct rt6_info *rt,
}
nsiblings = iter->rt6i_nsiblings;
fib6_purge_rt(iter, fn, info->nl_net);
+   iter->dst.flags &= ~DST_IN_FIB;
rt6_release(iter);

if (nsiblings) {
@@ -926,6 +930,7 @@ static int fib6_add_rt2node(struct fib6_node *fn,
struct rt6_info *rt,
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->dst.rt6_next;
fib6_purge_rt(iter, fn,
info->nl_net);
+   iter->dst.flags &= ~DST_IN_FIB;
rt6_release(iter);
nsiblings--;
} else {

Re: [PATCHv3 net-next 21/22] net: mvpp2: set dma mask and coherent dma mask on PPv2.2

2017-03-07 Thread Thomas Petazzoni

Hello,

On Tue, 7 Mar 2017 17:24:21 +, David Laight wrote:

> Are the coherent mappings just used for ring structures?
> If so, it might be reasonable to allocate them as a single entity,
> thus guaranteeing they all reside in a single 4G region.

Do we have the guarantee that a DMA coherent allocation will not span a
4G boundary?

Thanks,

Thomas
-- 
Thomas Petazzoni, CTO, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread Dmitry Vyukov

On Tue, Mar 7, 2017 at 7:03 PM, David Ahern  wrote:
> On 3/7/17 2:21 AM, Dmitry Vyukov wrote:
>> I've commented that warning just to see I can obtain more information.
>> Then I also got this:
>>
>> [ cut here ]
>> WARNING: CPU: 2 PID: 3990 at net/ipv6/ip6_fib.c:991
>> fib6_add+0x2e12/0x3290 net/ipv6/ip6_fib.c:991 net/ipv6/ip6_fib.c:991
>> Kernel panic - not syncing: panic_on_warn set ...
>
> again panic_on_warn is triggering ...
>
>>
>> CPU: 2 PID: 3990 Comm: kworker/2:4 Not tainted 4.11.0-rc1+ #311
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Workqueue: ipv6_addrconf addrconf_dad_work
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:16 [inline]
>>  __dump_stack lib/dump_stack.c:16 [inline] lib/dump_stack.c:52
>>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52 lib/dump_stack.c:52
>>  panic+0x20f/0x426 kernel/panic.c:180 kernel/panic.c:180
>>  __warn+0x1c4/0x1e0 kernel/panic.c:541 kernel/panic.c:541
>>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584 kernel/panic.c:584
>>  fib6_add+0x2e12/0x3290 net/ipv6/ip6_fib.c:991 net/ipv6/ip6_fib.c:991
>
> on this warning:
>
> /* dst.next really should not be set at this point */
> if (rt->dst.next && rt->dst.next->ops->family != AF_INET6) {
> pr_warn("fib6_add: adding rt with bad next -- family %d dst
> flags %x\n",
> rt->dst.next->ops->family, rt->dst.next->flags);
>
> WARN_ON(1);
> }
>
> You should have seen the pr_warn in the log preceding the WARN_ON dump.


Right. They all have the same "IPv6: fib6_add: adding rt with bad next
-- family 2 dst flags 6"

[  171.222795] IPv6: fib6_add: adding rt with bad next -- family 2 dst flags 6
[  171.223809] [ cut here ]
[  171.224407] WARNING: CPU: 3 PID: 27 at net/ipv6/ip6_fib.c:991
fib6_add+0x2e12/0x3290
[  171.225327] Kernel panic - not syncing: panic_on_warn set ...
[  171.225327]
[  171.226066] CPU: 3 PID: 27 Comm: kworker/3:0 Not tainted 4.11.0-rc1+ #311
[  171.226304] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS Bochs 01/01/2011
[  171.226304] Workqueue: ipv6_addrconf addrconf_dad_work
[  171.226304] Call Trace:
[  171.226304]  dump_stack+0x2fb/0x3fd
[  171.226304]  ? arch_local_irq_restore+0x53/0x53
[  171.226304]  ? vprintk_emit+0x566/0x770
[  171.226304]  ? console_unlock+0xf50/0xf50
[  171.226304]  ? vprintk_emit+0x566/0x770
[  171.226304]  ? console_unlock+0xf50/0xf50
[  171.226304]  ? vprintk_emit+0x566/0x770
[  171.226304]  ? console_unlock+0xf50/0xf50
[  171.226304]  ? check_noncircular+0x20/0x20
[  171.226304]  ? trace_hardirqs_on+0xd/0x10
[  171.226304]  ? perf_trace_lock_acquire+0x141/0xa00
[  171.226304]  ? trace_hardirqs_off+0xd/0x10
[  171.226304]  ? quarantine_put+0xea/0x190
[  171.226304]  ? check_noncircular+0x20/0x20
[  171.236060]  ? vprintk_default+0x28/0x30
[  171.236662]  ? vprintk_func+0x47/0x90
[  171.236662]  ? printk+0xc8/0xf9
[  171.236662]  ? load_image_and_restore+0x134/0x134
[  171.236662]  ? pointer+0xac0/0xac0
[  171.236662]  panic+0x20f/0x426
[  171.236662]  ? copy_mm+0x1219/0x1219
[  171.236662]  ? vprintk_func+0x47/0x90
[  171.236662]  ? printk+0xc8/0xf9
[  171.236662]  ? fib6_add+0x2e12/0x3290
[  171.236662]  __warn+0x1c4/0x1e0
[  171.236662]  warn_slowpath_null+0x2c/0x40
[  171.236662]  fib6_add+0x2e12/0x3290
[  171.236662]  ? kasan_check_write+0x14/0x20
[  171.236662]  ? netlink_broadcast_filtered+0x734/0x1380
[  171.236662]  ? fib6_force_start_gc+0xf0/0xf0
[  171.236662]  ? netlink_has_listeners+0x450/0x450
[  171.236662]  ? memcpy+0x45/0x50
[  171.236662]  ? __nla_put+0x37/0x40
[  171.236662]  ? nla_put+0xf9/0x130
[  171.236662]  ? skb_put+0x149/0x1c0
[  171.236662]  ? kasan_check_write+0x14/0x20
[  171.236662]  ? do_raw_write_lock+0xbd/0x1e0
[  171.236662]  __ip6_ins_rt+0x60/0x80
[  171.236662]  ip6_ins_rt+0x19b/0x220
[  171.236662]  ? ip6_route_info_create+0x2380/0x2380
[  171.236662]  ? nlmsg_notify+0xaf/0x160
[  171.236662]  ? rtnl_notify+0xbb/0xe0
[  171.236662]  __ipv6_ifa_notify+0x62e/0x7a0
[  171.251057]  ipv6_ifa_notify+0xdf/0x1d0
[  171.251057]  ? __ipv6_ifa_notify+0x7a0/0x7a0
[  171.251057]  addrconf_dad_completed+0xe6/0x950
[  171.251057]  ? addrconf_verify_work+0x20/0x20
[  171.251057]  ? kasan_check_write+0x14/0x20
[  171.251057]  addrconf_dad_work+0x32a/0xea0
[  171.251057]  ? addrconf_ifdown+0x1ad0/0x1ad0
[  171.251057]  ? rcu_pm_notify+0xc0/0xc0
[  171.251057]  ? wq_update_unbound_numa+0x8d0/0x8d0
[  171.251057]  ? kasan_check_write+0x14/0x20
[  171.251057]  process_one_work+0xc06/0x1c40
[  171.251057]  ? process_one_work+0xb3d/0x1c40
[  171.251057]  ? pwq_dec_nr_in_flight+0x470/0x470
[  171.251057]  ? preempt_notifier_register+0x1f0/0x1f0
[  171.259856]  ? __schedule+0x893/0x22d0
[  171.259856]  ? kasan_check_write+0x14/0x20
[  171.259856]  ? worker_thread+0x47d/0x19f0
[  171.259856]  ? lock_set_class+0xc00/0xc00
[  171.259856]  ? worker_thread+0x467/0x19f0
[  171.259856]  ? lock_acquire+0x630/0x630
[

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread David Ahern

On 3/7/17 2:21 AM, Dmitry Vyukov wrote:
> I've commented that warning just to see I can obtain more information.
> Then I also got this:
> 
> [ cut here ]
> WARNING: CPU: 2 PID: 3990 at net/ipv6/ip6_fib.c:991
> fib6_add+0x2e12/0x3290 net/ipv6/ip6_fib.c:991 net/ipv6/ip6_fib.c:991
> Kernel panic - not syncing: panic_on_warn set ...

again panic_on_warn is triggering ...

> 
> CPU: 2 PID: 3990 Comm: kworker/2:4 Not tainted 4.11.0-rc1+ #311
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Workqueue: ipv6_addrconf addrconf_dad_work
> Call Trace:
>  __dump_stack lib/dump_stack.c:16 [inline]
>  __dump_stack lib/dump_stack.c:16 [inline] lib/dump_stack.c:52
>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52 lib/dump_stack.c:52
>  panic+0x20f/0x426 kernel/panic.c:180 kernel/panic.c:180
>  __warn+0x1c4/0x1e0 kernel/panic.c:541 kernel/panic.c:541
>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584 kernel/panic.c:584
>  fib6_add+0x2e12/0x3290 net/ipv6/ip6_fib.c:991 net/ipv6/ip6_fib.c:991

on this warning:

/* dst.next really should not be set at this point */
if (rt->dst.next && rt->dst.next->ops->family != AF_INET6) {
pr_warn("fib6_add: adding rt with bad next -- family %d dst
flags %x\n",
rt->dst.next->ops->family, rt->dst.next->flags);

WARN_ON(1);
}

You should have seen the pr_warn in the log preceding the WARN_ON dump.


>  __ip6_ins_rt+0x60/0x80 net/ipv6/route.c:948 net/ipv6/route.c:948
>  ip6_ins_rt+0x19b/0x220 net/ipv6/route.c:959 net/ipv6/route.c:959
>  __ipv6_ifa_notify+0x62e/0x7a0 net/ipv6/addrconf.c:5485 
> net/ipv6/addrconf.c:5485
>  ipv6_ifa_notify+0xdf/0x1d0 net/ipv6/addrconf.c:5518 net/ipv6/addrconf.c:5518
>  addrconf_dad_completed+0xe6/0x950 net/ipv6/addrconf.c:3983
> net/ipv6/addrconf.c:3983
>  addrconf_dad_begin net/ipv6/addrconf.c:3797 [inline]
>  addrconf_dad_begin net/ipv6/addrconf.c:3797 [inline] net/ipv6/addrconf.c:3897
>  addrconf_dad_work+0x32a/0xea0 net/ipv6/addrconf.c:3897 
> net/ipv6/addrconf.c:3897
>  process_one_work+0xc06/0x1c40 kernel/workqueue.c:2096 kernel/workqueue.c:2096
>  worker_thread+0x223/0x19f0 kernel/workqueue.c:2230 kernel/workqueue.c:2230
>  kthread+0x334/0x400 kernel/kthread.c:229 kernel/kthread.c:229
>  ret_from_fork+0x31/0x40 arch/x86/entry/entry_64.S:430
> arch/x86/entry/entry_64.S:430
> 
> 
> 
> And this without any preceding warnings:
> 
> ==
> BUG: KASAN: slab-out-of-bounds in fib6_age+0x3fd/0x480
> net/ipv6/ip6_fib.c:1787 at addr 88004d4fbe54

another ipv4 route in ipv6 fib walk

Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone

2017-03-07 Thread Dmitry Vyukov

On Tue, Mar 7, 2017 at 6:17 PM, 'David Ahern' via syzkaller
 wrote:
> On 3/7/17 1:43 AM, Dmitry Vyukov wrote:
>> This is on c1ae3cfa0e89fa1a7ecc4c99031f5e9ae99d9201. No other kernel
>> output from your patch (pr_err).
>
> Is the below supposed to be from the same qemu instance at the time of
> the crash? cpu1 and cpu2 are both supposedly doing a route insert?


No, it's all from different instances.

>> [ cut here ]
>> WARNING: CPU: 1 PID: 30179 at net/ipv6/ip6_fib.c:158
>> rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
>> Kernel panic - not syncing: panic_on_warn set ...
>>
>> CPU: 1 PID: 30179 Comm: syz-executor3 Not tainted 4.11.0-rc1+ #310
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:16 [inline]
>>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52
>>  panic+0x20f/0x426 kernel/panic.c:180
>>  __warn+0x1c4/0x1e0 kernel/panic.c:541
>>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584
>>  rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
>>  rt6_release+0x1ee/0x290 net/ipv6/ip6_fib.c:189
>>  fib6_add_rt2node net/ipv6/ip6_fib.c:922 [inline]
>>  fib6_add+0x1d51/0x3290 net/ipv6/ip6_fib.c:1081
>>  __ip6_ins_rt+0x60/0x80 net/ipv6/route.c:948
>>  ip6_route_add+0x1a7/0x310 net/ipv6/route.c:2130
>>  inet6_rtm_newroute+0x191/0x1b0 net/ipv6/route.c:3294
>>  rtnetlink_rcv_msg+0x609/0x860 net/core/rtnetlink.c:4104
>>  netlink_rcv_skb+0x2ab/0x390 net/netlink/af_netlink.c:2298
>>  rtnetlink_rcv+0x2a/0x40 net/core/rtnetlink.c:4110
>>  netlink_unicast_kernel net/netlink/af_netlink.c:1231 [inline]
>>  netlink_unicast+0x525/0x730 net/netlink/af_netlink.c:1257
>>  netlink_sendmsg+0xab3/0xe70 net/netlink/af_netlink.c:1803
>>  sock_sendmsg_nosec net/socket.c:633 [inline]
>>  sock_sendmsg+0xca/0x110 net/socket.c:643
>>  sock_write_iter+0x326/0x600 net/socket.c:846
>>  call_write_iter include/linux/fs.h:1733 [inline]
>>  do_iter_readv_writev fs/read_write.c:696 [inline]
>>  __do_readv_writev+0xbbc/0x10a0 fs/read_write.c:862
>>  do_readv_writev+0x13f/0x200 fs/read_write.c:894
>>  vfs_writev+0x87/0xc0 fs/read_write.c:921
>>  do_writev+0x110/0x2c0 fs/read_write.c:954
>>  SYSC_writev fs/read_write.c:1027 [inline]
>>  SyS_writev+0x27/0x30 fs/read_write.c:1024
>>  entry_SYSCALL_64_fastpath+0x1f/0xc2
>> RIP: 0033:0x4458d9
>> RSP: 002b:7f31fcf33b58 EFLAGS: 0292 ORIG_RAX: 0014
>> RAX: ffda RBX: 0005 RCX: 004458d9
>> RDX: 0001 RSI: 207cd000 RDI: 0005
>> RBP: 006e30c0 R08:  R09: 
>> R10:  R11: 0292 R12: 00708000
>> R13: 20fad000 R14: 1000 R15: 0003
>>
>>
>>
>> [ cut here ]
>> WARNING: CPU: 2 PID: 31175 at net/ipv6/ip6_fib.c:158
>> rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
>> Kernel panic - not syncing: panic_on_warn set ...
>>
>> CPU: 2 PID: 31175 Comm: syz-executor1 Not tainted 4.11.0-rc1+ #310
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:16 [inline]
>>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52
>>  panic+0x20f/0x426 kernel/panic.c:180
>>  __warn+0x1c4/0x1e0 kernel/panic.c:541
>>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584
>>  rt6_rcu_free+0x61/0x70 net/ipv6/ip6_fib.c:158
>>  rt6_release+0x1ee/0x290 net/ipv6/ip6_fib.c:189
>>  fib6_add_rt2node net/ipv6/ip6_fib.c:922 [inline]
>>  fib6_add+0x1d51/0x3290 net/ipv6/ip6_fib.c:1081
>> kvm_vm_ioctl_deassign_device: device hasn't been assigned before, so
>> cannot be deassigned
>>  __ip6_ins_rt+0x60/0x80 net/ipv6/route.c:948
>>  ip6_route_add+0x1a7/0x310 net/ipv6/route.c:2130
>>  inet6_rtm_newroute+0x191/0x1b0 net/ipv6/route.c:3294
>>  rtnetlink_rcv_msg+0x609/0x860 net/core/rtnetlink.c:4104
>>  netlink_rcv_skb+0x2ab/0x390 net/netlink/af_netlink.c:2298
>>  rtnetlink_rcv+0x2a/0x40 net/core/rtnetlink.c:4110
>>  netlink_unicast_kernel net/netlink/af_netlink.c:1231 [inline]
>>  netlink_unicast+0x525/0x730 net/netlink/af_netlink.c:1257
>>  netlink_sendmsg+0xab3/0xe70 net/netlink/af_netlink.c:1803
>>  sock_sendmsg_nosec net/socket.c:633 [inline]
>>  sock_sendmsg+0xca/0x110 net/socket.c:643
>>  sock_write_iter+0x326/0x600 net/socket.c:846
>>  call_write_iter include/linux/fs.h:1733 [inline]
>>  do_iter_readv_writev fs/read_write.c:696 [inline]
>>  __do_readv_writev+0xbbc/0x10a0 fs/read_write.c:862
>>  do_readv_writev+0x13f/0x200 fs/read_write.c:894
>>  vfs_writev+0x87/0xc0 fs/read_write.c:921
>>  do_writev+0x110/0x2c0 fs/read_write.c:954
>>  SYSC_writev fs/read_write.c:1027 [inline]
>>  SyS_writev+0x27/0x30 fs/read_write.c:1024
>>  entry_SYSCALL_64_fastpath+0x1f/0xc2
>> RIP: 0033:0x4458d9
>> RSP: 002b:7f1639006b58 EFLAGS: 0292 ORIG_RAX: 0014
>> RAX: ffda RBX: 0019 RCX: 004458d9
>> RDX: 0001 RSI: 207cd000 RDI:

[PATCH net] net/tunnel: set inner protocol in network gro hooks

2017-03-07 Thread Paolo Abeni

The gso code of several tunnel types (gre and udp tunnels)
takes for granted that the skb->inner_protocol is properly
initialized and drops the packet otherwise.

On the forwarding path no one is initializing such field,
so gro encapsulated packets are dropped on forward.

Since commit 38720352412a ("gre: Use inner_proto to obtain
inner header protocol"), this can be reproduced when the
encapsulated packets use gre as the tunneling protocol.

The issue happens also with vxlan and geneve tunnels since
commit 8bce6d7d0d1e ("udp: Generalize skb_udp_segment"), if the
forwarding host's ingress nic has h/w offload for such tunnel
and a vxlan/geneve device is configured on top of it, regardless
of the configured peer address and vni.

To address the issue, this change initializes the inner_protocol
field for encapsulated packets in both ipv4 and ipv6 gro complete
callbacks.

Fixes: 38720352412a ("gre: Use inner_proto to obtain inner header protocol")
Fixes: 8bce6d7d0d1e ("udp: Generalize skb_udp_segment")
Signed-off-by: Paolo Abeni 
---
 net/ipv4/af_inet.c | 4 +++-
 net/ipv6/ip6_offload.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 602d40f..5091f46 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1487,8 +1487,10 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
int proto = iph->protocol;
int err = -ENOSYS;
 
-   if (skb->encapsulation)
+   if (skb->encapsulation) {
+   skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
skb_set_inner_network_header(skb, nhoff);
+   }
 
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 0838e6d..93e58a5 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -294,8 +294,10 @@ static int ipv6_gro_complete(struct sk_buff *skb, int 
nhoff)
struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
int err = -ENOSYS;
 
-   if (skb->encapsulation)
+   if (skb->encapsulation) {
+   skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
skb_set_inner_network_header(skb, nhoff);
+   }
 
iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
 
-- 
2.9.3

RE: [PATCHv3 net-next 21/22] net: mvpp2: set dma mask and coherent dma mask on PPv2.2

2017-03-07 Thread David Laight

From: Thomas Petazzoni
> Sent: 07 March 2017 15:53
> On PPv2.2, the streaming mappings can be anywhere in the first 40 bits
> of the physical address space. However, for the coherent mappings, we
> still need them to be in the first 32 bits of the address space,
> because all BM pools share a single register to store the high 32 bits
> of the BM pool address, which means all BM pools must be allocated in
> the same 4GB memory area.

Are the coherent mappings just used for ring structures?
If so, it might be reasonable to allocate them as a single entity,
thus guaranteeing they all reside in a single 4G region.

David

hello

2017-03-07 Thread duncan

hi

Re: [PATCH v2 0/6] Updates for Marvell Switch SoCs

2017-03-07 Thread Gregory CLEMENT

Hi Chris,
 
 On jeu., févr. 16 2017, Chris Packham  
wrote:

> Shortly after I posted my last series I got access to a more recent
> Marvell SDK which had some device tree support for the switch SoCs I'd
> been wanting. It was still based on an older kernel but it was a huge
> improvement over what came before.
>
> Patch 1/6 is a typo I noticed after my initial series was applied.
>
> Patch 2/6 is a bit of a cleanup. I did initially struggle with how to
> access individual parts of the DFX block as well as retaining a handle on
> the entire thing for the switch driver to use.
>
> Patch 3/6 is a re-jig of the dtsi files which is needed by 5/6. This is
> required because I need to use the coreclk label on a different node. It
> also means I don't have to disable nodes for blocks that only exist on
> the Armada-XP.
>
> Patch 4/6, 5/6 are split from the previous versions.
>
> Patch 6/6 is the device tree portion of a change already in clk-next.


I applied patches 2, 3 and 6 on mvebu/dt with the acked-by from Rob when
he gave it.

Patch 1 is already part of 4.11-rc1.

For patches 4 and 5 I understand that either we don't need them or there
should be an updated version, so I skipped them.

Thanks,

Gregory

>
> Chris Packham (6):
>   ARM: dts: Fix typo in armada-xp-98dx4251
> Changes in v2
> - new
> Changes in v3:
> - none
>   ARM: dts: armada-xp-98dx3236: combine dfx server nodes
> Changes in v2:
> - none
> Changes in v3:
> - none
>   ARM: dts: Use armada-370-xp as a base for armada-xp-98dx3236
> Changes in v2:
> - Update root compatible strings in armada-xp-98dx3336.dtsi,
>   armada-xp-98dx4251.dtsi, armada-xp-db-dxbc2.dts and
>   armada-xp-db-xc3-24g4xg.dts
> Changes in v3:
> - none
>   ARM: dts: mvebu: Add binding for mv98dx3236-soc-id
> Changes in v3:
> - new, split from driver
>   ARM: mvebu: Add driver for mv98dx3236-soc-id
> Changes in v2:
> - none
> Changes in v3:
> - split from dts changes
>   ARM: dts: mvebu: Move mv98dx3236 clock bindings
> Changes in v2:
> - New. Split out from "clk: mvebu: Expand mv98dx3236-core-clock support"
> Changes in v3:
> - Clarify why the old location was wrong (but worked).
>
>
>  .../bindings/arm/marvell/mv98dx3236-soc-id.txt |  14 ++
>  .../devicetree/bindings/clock/mvebu-core-clock.txt |   7 +
>  .../bindings/clock/mvebu-gated-clock.txt   |  11 ++
>  .../devicetree/bindings/net/marvell,prestera.txt   |  13 +-
>  arch/arm/boot/dts/armada-xp-98dx3236.dtsi  | 213 
> -
>  arch/arm/boot/dts/armada-xp-98dx3336.dtsi  |   2 +-
>  arch/arm/boot/dts/armada-xp-98dx4251.dtsi  |   2 +-
>  arch/arm/boot/dts/armada-xp-db-dxbc2.dts   |   2 +-
>  arch/arm/boot/dts/armada-xp-db-xc3-24g4xg.dts  |   2 +-
>  arch/arm/mach-mvebu/mvebu-soc-id.c |  43 -
>  10 files changed, 242 insertions(+), 67 deletions(-)
>  create mode 100644 
> Documentation/devicetree/bindings/arm/marvell/mv98dx3236-soc-id.txt
>
> -- 
> 2.11.0.24.ge6920cf

-- 
Gregory Clement, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

Re: [4.9.13] use after free in ipv4_mtu

2017-03-07 Thread Eric Dumazet

On Tue, 2017-03-07 at 08:29 -0800, Stephen Hemminger wrote:

> +   WARN_ONCE(strcmp(default_qdisc_ops->id, "fq"),
> + "TCP BBR should only be used with FQ qdisc\n");
> +
>  

Why would that be needed, especially for people that properly setup
their qdisc ? Maybe they do not want to force fq on all devices like tun
or taps ;)

Also we intend to provide a fallback in case FQ is not in the qdisc
chain : Some routers admins really want fq_codel (or whatever qdisc)

TCP will handle the pacing itself, at a small cost for the ones that
hate fq.

[PATCH RESEND v1] qed: Fix copy of uninitialized memory

2017-03-07 Thread Robert Foss

In qed_ll2_start_ooo() the ll2_info variable is uninitialized and then
passed to qed_ll2_acquire_connection() where it is copied into a new
memory space.

This shouldn't cause any issue as long as none of the copied memory is
ever read.
But the potential for a bug being introduced by reading this memory
is real.

Detected by CoverityScan, CID#1399632 ("Uninitialized scalar variable")

Signed-off-by: Robert Foss 
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c 
b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 9a0b9af10a57..5fb34db377c8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -968,7 +968,7 @@ static int qed_ll2_start_ooo(struct qed_dev *cdev,
 {
struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
u8 *handle = &hwfn->pf_params.iscsi_pf_params.ll2_ooo_queue_id;
-   struct qed_ll2_conn ll2_info;
+   struct qed_ll2_conn ll2_info = { 0 };
int rc;
 
ll2_info.conn_type = QED_LL2_TYPE_ISCSI_OOO;
-- 
2.11.0.453.g787f75f05

Re: [4.9.13] use after free in ipv4_mtu

2017-03-07 Thread Stephen Hemminger

On Mon, 06 Mar 2017 08:20:03 -0800
Eric Dumazet  wrote:

> On Mon, 2017-03-06 at 05:45 -0800, Eric Dumazet wrote:
> > On Mon, 2017-03-06 at 14:33 +0800, Daniel J Blueman wrote:  
> 
> > > I do change the network queueing discipline and related at runtime [1]
> > > which may be triggering this, though I did think I saw the KASAN
> > > report only after resuming from suspend. rf(un)kill and other tweaking
> > > may have been involved too.
> > > 
> > > Thanks,
> > >   Dan
> > > 
> > > [1] /etc/sysctl.d/90-tcp.conf
> > > 
> > > net.core.default_qdisc = fq_codel
> > > net.ipv4.tcp_congestion_control = bbr
> > > net.ipv4.tcp_slow_start_after_idle = 0
> > > net.ipv4.tcp_ecn = 1  
> 
> BTW, fq_codel is not suitable for BBR.
> 
> Only fq contains the needed pacing for BBR.

What about something like this???

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index b89bce4c721e..647be997a9c5 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -801,6 +801,9 @@ static void bbr_init(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
u64 bw;
 
+   WARN_ONCE(strcmp(default_qdisc_ops->id, "fq"),
+ "TCP BBR should only be used with FQ qdisc\n");
+
bbr->prior_cwnd = 0;
bbr->tso_segs_goal = 0;  /* default segs per skb until first ACK */
bbr->rtt_cnt = 0;

Re: [PATCH net 0/3] rds: tcp: fix various rds-tcp issues during netns create/delete sequences

2017-03-07 Thread Santosh Shilimkar


On 3/7/2017 12:28 AM, Dmitry Vyukov wrote:

On Tue, Mar 7, 2017 at 2:04 AM, santosh.shilim...@oracle.com
 wrote:

On 3/4/17 8:57 AM, Sowmini Varadhan wrote:


Dmitry Vyukov reported some syzkaller panics during netns deletion.

While I have not been able to reproduce those exact panics, my attempts
to do so uncovered a few other problems, which are fixed by patch 2 and
patch 3 of this patch series. In addition, as mentioned in,
 https://www.spinics.net/lists/netdev/msg422997.html
code-inspection points that the rds_connection needs to take an explicit
refcnt on the struct net so that it is held down until all cleanup is
completed for netns removal, and this is fixed by patch1.


Hopefully Dmitry can try the series and see if it fixes the issue(s).
The fixes look good to me.

FWIW, Acked-by: Santosh Shilimkar 



I've applied the patches for testing. I've seen the reported crashes
only a few times, so it won't provide good testing. But at least it
can detect any regressions.


Thanks Dmitry !!

Re: [PATCH] PCI: Add pci reset quirk for Cavium VNIC

2017-03-07 Thread Robin Murphy

On 07/03/17 15:04, Radoslaw Biernacki wrote:
> From: Radoslaw Biernacki 
> 
> PCI reset quirk is needed for Cavium Function NIC since it does not
> handle a function level reset.
> This cause problems when VNIC is used from userspace by vfio.
> If application (or VM) does not stop the VNIC queues, HW may cause
> overwrite of memory locations when next application that use it will
> establish new SMMU mappings. More likely HW it will cause SMMU exception,
> when network packet will came before new SMMU mappings will be made.
> 
> Signed-off-by: Radoslaw Biernacki 
> Reviewed-by: Sunil Goutham 
> ---
>  drivers/pci/quirks.c | 90 
> 
>  1 file changed, 90 insertions(+)
> 
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index f754453..f7cdbe5 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -3783,10 +3783,98 @@ static int reset_chelsio_generic_dev(struct pci_dev 
> *dev, int probe)
>   return 0;
>  }
>  
> +#define CAVIUM_VNIC_QSET_RQ_0_7_CFG  (0x010600)
> +#define CAVIUM_VNIC_QSET_CQ_0_7_CFG  (0x010400)
> +#define CAVIUM_VNIC_QSET_CQ_0_7_CFG2 (0x010408)
> +#define CAVIUM_VNIC_QSET_SQ_0_7_CFG  (0x010800)
> +#define CAVIUM_VNIC_QSET_SQ_0_7_STATUS   (0x010840)
> +#define CAVIUM_VNIC_QSET_RBDR_0_1_CFG(0x010C00)
> +#define CAVIUM_VNIC_QSET_RBDR_0_1_STATUS0(0x010C40)
> +
> +#define CAVIUM_VNIC_Q_SHIFT  (18)
> +#define CAVIUM_VNIC_CQ_RESET (1ULL << 41)
> +#define CAVIUM_VNIC_SQ_RESET (1ULL << 17)
> +#define CAVIUM_VNIC_RBDR_RESET   (1ULL << 43)
> +#define CAVIUM_VNIC_RBDR_FIFO_STATE_SHIFT (62)
> +
> +/* Poll a register for a specific value */
> +static int cavium_vnic_poll(struct pci_dev *pdev,
> + void __iomem *addr, size_t bit_pos,
> + size_t bits, u64 val)
> +{
> + u64 bit_mask;
> + u64 reg_val;
> + size_t timeout = 10;
> +
> + bit_mask = (1ULL << bits) - 1;
> + bit_mask = (bit_mask << bit_pos);

GENMASK_ULL(bit_pos, bit_pos + bits - 1)? Admittedly it's not an awful
lot tidier :/

> +
> + while (timeout) {
> + reg_val = readq(addr);
> + if (((reg_val & bit_mask) >> bit_pos) == val)
> + return 0;
> + usleep_range(1000, 2000);
> + timeout--;
> + }

This is essentially an open-coded version of readq_poll_timeout() from
<linux/iopoll.h>, which could be used instead. Doing "val <<= bitpos"
first might help simplify the condition, although since it's only called
with val == 0 it seems that "(reg_val & bit_mask)" would suffice.

Robin.

> + dev_err(>dev, "Poll on addr %p failed\n", addr);
> + return -1;
> +}
> +
> +static int cavium_vnic_reset(struct pci_dev *pdev, int probe)
> +{
> + size_t qidx;
> + void __iomem *bar_base;
> + void __iomem *qset_base;
> +
> + bar_base = pci_iomap(pdev, 0, 0);
> + if (!bar_base)
> + return -ENOMEM;
> +
> + /* For each of 8 RQ/CQ/SQ (queues) in VF */
> + for (qidx = 0; qidx < 8; qidx++) {
> + /* Disable receive queue */
> + qset_base = bar_base + (qidx << CAVIUM_VNIC_Q_SHIFT);
> + writeq(0, qset_base + CAVIUM_VNIC_QSET_RQ_0_7_CFG);
> +
> + /* Disable timer threshold (doesn't get reset upon CQ reset */
> + writeq(0, qset_base + CAVIUM_VNIC_QSET_CQ_0_7_CFG2);
> + /* Disable completion queue */
> + writeq(0, qset_base + CAVIUM_VNIC_QSET_CQ_0_7_CFG);
> + /* Reset completion queue */
> + writeq(CAVIUM_VNIC_CQ_RESET,
> +   qset_base + CAVIUM_VNIC_QSET_CQ_0_7_CFG);
> +
> + /* Disable send queue */
> + writeq(0, qset_base + CAVIUM_VNIC_QSET_SQ_0_7_CFG);
> + /* Reset send queue */
> + writeq(CAVIUM_VNIC_SQ_RESET,
> +qset_base + CAVIUM_VNIC_QSET_SQ_0_7_CFG);
> + }
> +
> + /* Reset and disable both RBDR's */
> + for (qidx = 0; qidx < 2; qidx++) {
> + qset_base = bar_base +
> + (qidx << CAVIUM_VNIC_Q_SHIFT);
> + writeq(CAVIUM_VNIC_RBDR_RESET,
> +qset_base + CAVIUM_VNIC_QSET_RBDR_0_1_CFG);
> + writeq(0, qset_base + CAVIUM_VNIC_QSET_RBDR_0_1_CFG);
> + if (cavium_vnic_poll(pdev, qset_base +
> +  CAVIUM_VNIC_QSET_RBDR_0_1_STATUS0,
> +  CAVIUM_VNIC_RBDR_FIFO_STATE_SHIFT,
> +  2, 0x00))
> + dev_err(>dev, "Timeout on RBDR reset sequence");
> + }
> +
> + pci_iounmap(pdev, bar_base);
> + return 0;
> +}
> +
>  #define PCI_DEVICE_ID_INTEL_82599_SFP_VF   0x10ed
>  #define PCI_DEVICE_ID_INTEL_IVB_M_VGA  0x0156
>  #define

Re: netlink: GPF in netlink_unicast

2017-03-07 Thread Richard Guy Briggs

On 2017-03-07 09:29, Paul Moore wrote:
> On Mon, Mar 6, 2017 at 11:03 PM, Richard Guy Briggs  wrote:
> > On 2017-03-06 10:10, Cong Wang wrote:
> >> On Mon, Mar 6, 2017 at 2:54 AM, Dmitry Vyukov  wrote:
> >> > Hello,
> >> >
> >> > I've got the following crash while running syzkaller fuzzer on
> >> > net-next/8d70eeb84ab277377c017af6a21d0a337025dede:
> >> >
> >> > kasan: GPF could be caused by NULL-ptr deref or user memory access
> >> > general protection fault:  [#1] SMP KASAN
> >> > Dumping ftrace buffer:
> >> >(ftrace buffer empty)
> >> > Modules linked in:
> >> > CPU: 0 PID: 883 Comm: kauditd Not tainted 4.10.0+ #6
> >> > Hardware name: Google Google Compute Engine/Google Compute Engine,
> >> > BIOS Google 01/01/2011
> >> > task: 8801d79f0240 task.stack: 8801d7a2
> >> > RIP: 0010:sock_sndtimeo include/net/sock.h:2162 [inline]
> >> > RIP: 0010:netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249
> >> > RSP: 0018:8801d7a27c38 EFLAGS: 00010206
> >> > RAX: 0056 RBX: 8801d7a27cd0 RCX: 
> >> > RDX:  RSI:  RDI: 02b0
> >> > RBP: 8801d7a27cf8 R08: ed00385cf286 R09: ed00385cf286
> >> > R10: 0006 R11: ed00385cf285 R12: 
> >> > R13: dc00 R14: 8801c2fc3c80 R15: 014000c0
> >> > FS:  () GS:8801dbe0() 
> >> > knlGS:
> >> > CS:  0010 DS:  ES:  CR0: 80050033
> >> > CR2: 20cfd000 CR3: 0001c758f000 CR4: 001406f0
> >> > Call Trace:
> >> >  kauditd_send_unicast_skb+0x3c/0x70 kernel/audit.c:482
> >> >  kauditd_thread+0x174/0xb00 kernel/audit.c:599
> >> >  kthread+0x326/0x3f0 kernel/kthread.c:229
> >> >  ret_from_fork+0x31/0x40 arch/x86/entry/entry_64.S:430
> >> > Code: 44 89 fe e8 56 15 ff ff 8b 8d 70 ff ff ff 49 89 c6 31 c0 85 c9
> >> > 75 27 e8 b2 b2 f4 fd 49 8d bc 24 b0 02 00 00 48 89 f8 48 c1 e8 03 <42>
> >> > 80 3c 28 00 0f 85 37 06 00 00 49 8b 84 24 b0 02 00 00 4c 8d
> >> > RIP: sock_sndtimeo include/net/sock.h:2162 [inline] RSP: 8801d7a27c38
> >> > RIP: netlink_unicast+0xdd/0x730 net/netlink/af_netlink.c:1249 RSP:
> >> > 8801d7a27c38
> >> > ---[ end trace ad1bba9d457430b6 ]---
> >> > Kernel panic - not syncing: Fatal exception
> >> >
> >> >
> >> > This is not reproducible and seems to be caused by an elusive race.
> >> > However, looking at the code I don't see any proper protection of
> >> > audit_sock (other than the if (!audit_pid) which is obviously not
> >> > enough to protect against races).
> >>
> >> audit_cmd_mutex is supposed to protect it, I think.
> >> But kauditd_send_unicast_skb() seems not holding this mutex.
> >
> > H, I wonder if it makes sense to wrap most of the contents of the
> > outer while loop in kauditd_thread in the audit_cmd_mutex, or around the
> > first two inner while loops and the "if (auditd)" condition after the
> > "quick_loop:" label.  The condition on auditd is supposed to catch that
> > case.  We don't want it locked while playing with the scheduler at the
> > bottom of that function.
> 
> Let me look into this and play around with a few things.  I suspected
> there might be a problem here, so I've got thoughts on how we might
> resolve it; I just need to see code them up and see what option sucks
> the least.
> 
> FWIW Richard, yes wrapping most of kauditd_thread *should* resolve
> this but it's pretty heavy handed and not my first choice.

That's why the inner loops made a bit more sense since it wasn't really
necessary and ran afoul of the scheduler anyways.

> paul moore

- RGB

--
Richard Guy Briggs 
Kernel Security Engineering, Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635

[PATCHv3 net-next 22/22] net: mvpp2: finally add the PPv2.2 compatible string

2017-03-07 Thread Thomas Petazzoni

Now that the mvpp2 driver has been modified to accommodate the support
for PPv2.2, we can finally advertise this support by adding the
appropriate compatible string.

At the same time, we update the Kconfig description of the MVPP2 driver.

Signed-off-by: Thomas Petazzoni 
---
 drivers/net/ethernet/marvell/Kconfig | 4 ++--
 drivers/net/ethernet/marvell/mvpp2.c | 4 
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/marvell/Kconfig 
b/drivers/net/ethernet/marvell/Kconfig
index d2555e8b..da6fb82 100644
--- a/drivers/net/ethernet/marvell/Kconfig
+++ b/drivers/net/ethernet/marvell/Kconfig
@@ -82,13 +82,13 @@ config MVNETA_BM
  that all dependencies are met.
 
 config MVPP2
-   tristate "Marvell Armada 375 network interface support"
+   tristate "Marvell Armada 375/7K/8K network interface support"
depends on ARCH_MVEBU || COMPILE_TEST
depends on HAS_DMA
select MVMDIO
---help---
  This driver supports the network interface units in the
- Marvell ARMADA 375 SoC.
+ Marvell ARMADA 375, 7K and 8K SoCs.
 
 config PXA168_ETH
tristate "Marvell pxa168 ethernet support"
diff --git a/drivers/net/ethernet/marvell/mvpp2.c 
b/drivers/net/ethernet/marvell/mvpp2.c
index 92c47f3..af5bfa1 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -7037,6 +7037,10 @@ static const struct of_device_id mvpp2_match[] = {
.compatible = "marvell,armada-375-pp2",
.data = (void *)MVPP21,
},
+   {
+   .compatible = "marvell,armada-7k-pp22",
+   .data = (void *)MVPP22,
+   },
{ }
 };
 MODULE_DEVICE_TABLE(of, mvpp2_match);
-- 
2.7.4

[PATCHv3 net-next 15/22] net: mvpp2: handle register mapping and access for PPv2.2

2017-03-07 Thread Thomas Petazzoni

This commit adjusts the mvpp2 driver register mapping and access logic
to support PPv2.2, to handle a number of differences.

Due to how the registers are laid out in memory, the Device Tree binding
for the "reg" property is different:

 - On PPv2.1, we had a first area for the packet processor
   registers (common to all ports), and then one area per port.

 - On PPv2.2, we have a first area for the packet processor
   registers (common to all ports), and a second area for numerous other
   registers, including a large number of per-port registers

In addition, on PPv2.2, the area for the common registers is split into
so-called "address spaces" of 64 KB each. They allow to access per-CPU
registers, where each CPU has its own copy of some registers. A few
other registers, which have a single copy, also need to be accessed from
those per-CPU windows if they are related to a per-CPU register. For
example:

  - Writing to MVPP2_TXQ_NUM_REG selects a TX queue. This register is a
per-CPU register, it must be accessed from the current CPU register
window.

  - Then a write to MVPP2_TXQ_PENDING_REG, MVPP2_TXQ_DESC_ADDR_REG (and
a few others) will affect the TX queue that was selected by the
write to MVPP2_TXQ_NUM_REG. It must be accessed from the same CPU
window as the write to the TXQ_NUM_REG.

Therefore, the ->base member of 'struct mvpp2' is replaced with a
->cpu_base[] array, each entry pointing to a mapping of the per-CPU
area. Since PPv2.1 doesn't have this concept of per-CPU windows, all
entries in ->cpu_base[] point to the same io-remapped area.

The existing mvpp2_read() and mvpp2_write() accessors use cpu_base[0],
they are used for registers for which the CPU window doesn't matter.

mvpp2_percpu_read() and mvpp2_percpu_write() are new accessors added to
access the registers for which the CPU window does matter, which is why
they take a "cpu" as argument.

The driver is then changed to use mvpp2_percpu_read() and
mvpp2_percpu_write() where it matters.

Signed-off-by: Thomas Petazzoni 
---
 drivers/net/ethernet/marvell/mvpp2.c | 257 +--
 1 file changed, 188 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c 
b/drivers/net/ethernet/marvell/mvpp2.c
index 2eec380..2b4b4f0 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -295,6 +295,8 @@
 #define  MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(v) (((v) << 6) & \
MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK)
 
+#define MVPP22_GMAC_BASE(port) (0x7000 + (port) * 0x1000 + 0xe00)
+
 #define MVPP2_CAUSE_TXQ_SENT_DESC_ALL_MASK 0xff
 
 /* Descriptor ring Macros */
@@ -622,6 +624,11 @@ enum mvpp2_prs_l3_cast {
  */
 #define MVPP2_BM_SHORT_PKT_SIZEMVPP2_RX_MAX_PKT_SIZE(512)
 
+#define MVPP21_ADDR_SPACE_SZ   0
+#define MVPP22_ADDR_SPACE_SZ   SZ_64K
+
+#define MVPP2_MAX_CPUS 4
+
 enum mvpp2_bm_type {
MVPP2_BM_FREE,
MVPP2_BM_SWF_LONG,
@@ -633,8 +640,14 @@ enum mvpp2_bm_type {
 /* Shared Packet Processor resources */
 struct mvpp2 {
/* Shared registers' base addresses */
-   void __iomem *base;
void __iomem *lms_base;
+   void __iomem *iface_base;
+
+   /* On PPv2.2, each CPU can access the base register through a
+* separate address space, each 64 KB apart from each
+* other.
+*/
+   void __iomem *cpu_base[MVPP2_MAX_CPUS];
 
/* Common clocks */
struct clk *pp_clk;
@@ -680,6 +693,11 @@ struct mvpp2_port_pcpu {
 struct mvpp2_port {
u8 id;
 
+   /* Index of the port from the "group of ports" complex point
+* of view
+*/
+   int gop_id;
+
int irq;
 
struct mvpp2 *priv;
@@ -996,12 +1014,60 @@ static int txq_number = MVPP2_MAX_TXQ;
 
 static void mvpp2_write(struct mvpp2 *priv, u32 offset, u32 data)
 {
-   writel(data, priv->base + offset);
+   writel(data, priv->cpu_base[0] + offset);
 }
 
 static u32 mvpp2_read(struct mvpp2 *priv, u32 offset)
 {
-   return readl(priv->base + offset);
+   return readl(priv->cpu_base[0] + offset);
+}
+
+/* These accessors should be used to access:
+ *
+ * - per-CPU registers, where each CPU has its own copy of the
+ *   register.
+ *
+ *   MVPP2_BM_VIRT_ALLOC_REG
+ *   MVPP2_BM_ADDR_HIGH_ALLOC
+ *   MVPP22_BM_ADDR_HIGH_RLS_REG
+ *   MVPP2_BM_VIRT_RLS_REG
+ *   MVPP2_ISR_RX_TX_CAUSE_REG
+ *   MVPP2_ISR_RX_TX_MASK_REG
+ *   MVPP2_TXQ_NUM_REG
+ *   MVPP2_AGGR_TXQ_UPDATE_REG
+ *   MVPP2_TXQ_RSVD_REQ_REG
+ *   MVPP2_TXQ_RSVD_RSLT_REG
+ *   MVPP2_TXQ_SENT_REG
+ *   MVPP2_RXQ_NUM_REG
+ *
+ * - global registers that must be accessed through a specific CPU
+ *   window, because they are related to an access to a per-CPU
+ *   register
+ *
+ *   MVPP2_BM_PHY_ALLOC_REG(related to MVPP2_BM_VIRT_ALLOC_REG)
+ *   MVPP2_BM_PHY_RLS_REG  (related to

[PATCHv3 net-next 17/22] net: mvpp2: add AXI bridge initialization for PPv2.2

2017-03-07 Thread Thomas Petazzoni

The PPv2.2 unit is connected to an AXI bus on Armada 7K/8K, so this
commit adds the necessary initialization of the AXI bridge.

Signed-off-by: Thomas Petazzoni 
---
 drivers/net/ethernet/marvell/mvpp2.c | 85 
 1 file changed, 85 insertions(+)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c 
b/drivers/net/ethernet/marvell/mvpp2.c
index bd7dc4b6..0e10303 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -154,6 +154,34 @@
 #define MVPP2_WIN_REMAP(w) (0x4040 + ((w) << 2))
 #define MVPP2_BASE_ADDR_ENABLE 0x4060
 
+/* AXI Bridge Registers */
+#define MVPP22_AXI_BM_WR_ATTR_REG  0x4100
+#define MVPP22_AXI_BM_RD_ATTR_REG  0x4104
+#define MVPP22_AXI_AGGRQ_DESCR_RD_ATTR_REG 0x4110
+#define MVPP22_AXI_TXQ_DESCR_WR_ATTR_REG   0x4114
+#define MVPP22_AXI_TXQ_DESCR_RD_ATTR_REG   0x4118
+#define MVPP22_AXI_RXQ_DESCR_WR_ATTR_REG   0x411c
+#define MVPP22_AXI_RX_DATA_WR_ATTR_REG 0x4120
+#define MVPP22_AXI_TX_DATA_RD_ATTR_REG 0x4130
+#define MVPP22_AXI_RD_NORMAL_CODE_REG  0x4150
+#define MVPP22_AXI_RD_SNOOP_CODE_REG   0x4154
+#define MVPP22_AXI_WR_NORMAL_CODE_REG  0x4160
+#define MVPP22_AXI_WR_SNOOP_CODE_REG   0x4164
+
+/* Values for AXI Bridge registers */
+#define MVPP22_AXI_ATTR_CACHE_OFFS 0
+#define MVPP22_AXI_ATTR_DOMAIN_OFFS12
+
+#define MVPP22_AXI_CODE_CACHE_OFFS 0
+#define MVPP22_AXI_CODE_DOMAIN_OFFS4
+
+#define MVPP22_AXI_CODE_CACHE_NON_CACHE0x3
+#define MVPP22_AXI_CODE_CACHE_WR_CACHE 0x7
+#define MVPP22_AXI_CODE_CACHE_RD_CACHE 0xb
+
+#define MVPP22_AXI_CODE_DOMAIN_OUTER_DOM   2
+#define MVPP22_AXI_CODE_DOMAIN_SYSTEM  3
+
 /* Interrupt Cause and Mask registers */
 #define MVPP2_ISR_RX_THRESHOLD_REG(rxq)(0x5200 + 4 * (rxq))
 #define MVPP2_MAX_ISR_RX_THRESHOLD 0xf0
@@ -6664,6 +6692,60 @@ static void mvpp2_rx_fifo_init(struct mvpp2 *priv)
mvpp2_write(priv, MVPP2_RX_FIFO_INIT_REG, 0x1);
 }
 
+static void mvpp2_axi_init(struct mvpp2 *priv)
+{
+   u32 val, rdval, wrval;
+
+   mvpp2_write(priv, MVPP22_BM_ADDR_HIGH_RLS_REG, 0x0);
+
+   /* AXI Bridge Configuration */
+
+   rdval = MVPP22_AXI_CODE_CACHE_RD_CACHE
+   << MVPP22_AXI_ATTR_CACHE_OFFS;
+   rdval |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
+   << MVPP22_AXI_ATTR_DOMAIN_OFFS;
+
+   wrval = MVPP22_AXI_CODE_CACHE_WR_CACHE
+   << MVPP22_AXI_ATTR_CACHE_OFFS;
+   wrval |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
+   << MVPP22_AXI_ATTR_DOMAIN_OFFS;
+
+   /* BM */
+   mvpp2_write(priv, MVPP22_AXI_BM_WR_ATTR_REG, wrval);
+   mvpp2_write(priv, MVPP22_AXI_BM_RD_ATTR_REG, rdval);
+
+   /* Descriptors */
+   mvpp2_write(priv, MVPP22_AXI_AGGRQ_DESCR_RD_ATTR_REG, rdval);
+   mvpp2_write(priv, MVPP22_AXI_TXQ_DESCR_WR_ATTR_REG, wrval);
+   mvpp2_write(priv, MVPP22_AXI_TXQ_DESCR_RD_ATTR_REG, rdval);
+   mvpp2_write(priv, MVPP22_AXI_RXQ_DESCR_WR_ATTR_REG, wrval);
+
+   /* Buffer Data */
+   mvpp2_write(priv, MVPP22_AXI_TX_DATA_RD_ATTR_REG, rdval);
+   mvpp2_write(priv, MVPP22_AXI_RX_DATA_WR_ATTR_REG, wrval);
+
+   val = MVPP22_AXI_CODE_CACHE_NON_CACHE
+   << MVPP22_AXI_CODE_CACHE_OFFS;
+   val |= MVPP22_AXI_CODE_DOMAIN_SYSTEM
+   << MVPP22_AXI_CODE_DOMAIN_OFFS;
+   mvpp2_write(priv, MVPP22_AXI_RD_NORMAL_CODE_REG, val);
+   mvpp2_write(priv, MVPP22_AXI_WR_NORMAL_CODE_REG, val);
+
+   val = MVPP22_AXI_CODE_CACHE_RD_CACHE
+   << MVPP22_AXI_CODE_CACHE_OFFS;
+   val |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
+   << MVPP22_AXI_CODE_DOMAIN_OFFS;
+
+   mvpp2_write(priv, MVPP22_AXI_RD_SNOOP_CODE_REG, val);
+
+   val = MVPP22_AXI_CODE_CACHE_WR_CACHE
+   << MVPP22_AXI_CODE_CACHE_OFFS;
+   val |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
+   << MVPP22_AXI_CODE_DOMAIN_OFFS;
+
+   mvpp2_write(priv, MVPP22_AXI_WR_SNOOP_CODE_REG, val);
+}
+
 /* Initialize network controller common part HW */
 static int mvpp2_init(struct platform_device *pdev, struct mvpp2 *priv)
 {
@@ -6683,6 +6765,9 @@ static int mvpp2_init(struct platform_device *pdev, 
struct mvpp2 *priv)
if (dram_target_info)
mvpp2_conf_mbus_windows(dram_target_info, priv);
 
+   if (priv->hw_version == MVPP22)
+   mvpp2_axi_init(priv);
+
/* Disable HW PHY polling */
if (priv->hw_version == MVPP21) {
val = readl(priv->lms_base + MVPP2_PHY_AN_CFG0_REG);
-- 
2.7.4

1 2 >

1 - 100 of 163 matches

Mail list logo