date:20170131

Re: [PATCHv3 net-next 5/7] net: add confirm_neigh method to dst_ops

2017-01-31 Thread Steffen Klassert

On Tue, Jan 31, 2017 at 11:57:05PM +0200, Julian Anastasov wrote:
>  
>  static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
> diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
> index 177e208..c010ee0 100644
> --- a/net/xfrm/xfrm_policy.c
> +++ b/net/xfrm/xfrm_policy.c
> @@ -2856,6 +2856,20 @@ static struct neighbour *xfrm_neigh_lookup(const 
> struct dst_entry *dst,
>   return dst->path->ops->neigh_lookup(dst, skb, daddr);
>  }
>  
> +static void xfrm_confirm_neigh(const struct dst_entry *dst, const void 
> *daddr)
> +{
> + const struct dst_entry *path = dst->path;
> +
> + if (path == dst) {

I think path can not be equal to dst here, otherwise we would
have an infinite recursion.

> + dst->ops->confirm_neigh(dst, daddr);
> + } else {
> + /* daddr can be from different family and we need the
> +  * tunnel address. How to get it?
> +  */

This is only called on a xfrm_dst, so you should have dst->xfrm set.
You can get the daddr of this transform with:

xfrm_address_t *daddr = &dst->xfrm->id.daddr;


> + path->ops->confirm_neigh(path, NULL);

I think here it is better to go through the whole chain
of transformations with

child->ops->confirm_neigh(path, daddr);

> + }
> +}
> +
>  int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
>  {
>   int err = 0;
> @@ -2882,6 +2896,8 @@ int xfrm_policy_register_afinfo(struct 
> xfrm_policy_afinfo *afinfo)
>   dst_ops->link_failure = xfrm_link_failure;
>   if (likely(dst_ops->neigh_lookup == NULL))
>   dst_ops->neigh_lookup = xfrm_neigh_lookup;
> + if (likely(!dst_ops->confirm_neigh))
> + dst_ops->confirm_neigh = xfrm_confirm_neigh;

We also have address family dependent dst_ops, look for
xfrm4_dst_ops_template/xfrm6_dst_ops_template.

[PATCH net-next v2 5/5] bridge: vlan dst_metadata hooks in ingress and egress paths

2017-01-31 Thread Roopa Prabhu

From: Roopa Prabhu 

- ingress hook:
- if port is a tunnel port, use tunnel info in
  attached dst_metadata to map it to a local vlan
- egress hook:
- if port is a tunnel port, use tunnel info attached to
  vlan to set dst_metadata on the skb

CC: Nikolay Aleksandrov 
Signed-off-by: Roopa Prabhu 
---
 net/bridge/br_forward.c|2 +-
 net/bridge/br_input.c  |8 +-
 net/bridge/br_private.h|2 ++
 net/bridge/br_private_tunnel.h |   11 
 net/bridge/br_vlan.c   |7 ++
 net/bridge/br_vlan_tunnel.c|   54 
 6 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index a0f9d00..e5d4821 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -80,7 +80,7 @@ static void __br_forward(const struct net_bridge_port *to,
int br_hook;
 
vg = nbp_vlan_group_rcu(to);
-   skb = br_handle_vlan(to->br, vg, skb);
+   skb = br_handle_vlan(to->br, to, vg, skb);
if (!skb)
return;
 
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 855b72f..fba38d8 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include "br_private.h"
+#include "br_private_tunnel.h"
 
 /* Hook for brouter */
 br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
@@ -57,7 +58,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
 
indev = skb->dev;
skb->dev = brdev;
-   skb = br_handle_vlan(br, vg, skb);
+   skb = br_handle_vlan(br, NULL, vg, skb);
if (!skb)
return NET_RX_DROP;
/* update the multicast stats if the packet is IGMP/MLD */
@@ -261,6 +262,11 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
return RX_HANDLER_CONSUMED;
 
p = br_port_get_rcu(skb->dev);
+   if (p->flags & BR_VLAN_TUNNEL) {
+   if (br_handle_ingress_vlan_tunnel(skb, p,
+ nbp_vlan_group_rcu(p)))
+   goto drop;
+   }
 
if (unlikely(is_link_local_ether_addr(dest))) {
u16 fwd_mask = p->br->group_fwd_mask_required;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 61de90f..40177df 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -775,6 +775,7 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg,
   const struct sk_buff *skb);
 bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
 struct sk_buff *br_handle_vlan(struct net_bridge *br,
+  const struct net_bridge_port *port,
   struct net_bridge_vlan_group *vg,
   struct sk_buff *skb);
 int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags);
@@ -874,6 +875,7 @@ static inline bool br_should_learn(struct net_bridge_port 
*p,
 }
 
 static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
+const struct net_bridge_port *port,
 struct net_bridge_vlan_group *vg,
 struct sk_buff *skb)
 {
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
index 1c8d0d5..4a447a3 100644
--- a/net/bridge/br_private_tunnel.h
+++ b/net/bridge/br_private_tunnel.h
@@ -40,6 +40,11 @@ int br_fill_vlan_tunnel_info(struct sk_buff *skb,
 void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
 void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
  struct net_bridge_vlan *vlan);
+int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg);
+int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+struct net_bridge_vlan *vlan);
 #else
 static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
 {
@@ -67,6 +72,12 @@ static inline void vlan_tunnel_info_del(struct 
net_bridge_vlan_group *vg,
 {
 }
 
+static inline int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+   struct net_bridge_port *p,
+   struct net_bridge_vlan_group 
*vg)
+{
+   return 0;
+}
 #endif
 
 #endif
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 64002e3..62e68c0 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -341,6 +341,7 @@ static void __vlan_flush(struct net_bridge_vlan_group *vg)
 }
 
 struct sk_buff *br_handle_vlan(struct net_bridge *br,
+  const struct net_bridge_port *p,
   struct

[PATCH net-next v2 2/5] vxlan: support fdb and learning in COLLECT_METADATA mode

2017-01-31 Thread Roopa Prabhu

From: Roopa Prabhu 

Vxlan COLLECT_METADATA mode today solves the per-vni netdev
scalability problem in l3 networks. It expects all forwarding
information to be present in dst_metadata. This patch series
enhances collect metadata mode to include the case where only
vni is present in dst_metadata, and the vxlan driver can then use
the rest of the forwarding information database to make forwarding
decisions. There is no change to default COLLECT_METADATA
behaviour. These changes only apply to COLLECT_METADATA when
used with the bridging use-case with a special dst_metadata
tunnel info flag (eg: where vxlan device is part of a bridge).
For all this to work, the vxlan driver will need to now support a
single fdb table hashed by mac + vni. This series essentially makes
this happen.

use-case and workflow:
vxlan collect metadata device participates in bridging vlan
to vn-segments. Bridge driver above the vxlan device,
sends the vni corresponding to the vlan in the dst_metadata.
vxlan driver will lookup forwarding database with (mac + vni)
for the required remote destination information to forward the
packet.

Changes introduced by this patch:
- allow learning and forwarding database state in vxlan netdev in
  COLLECT_METADATA mode. Current behaviour is not changed
  by default. tunnel info flag IP_TUNNEL_INFO_BRIDGE is used
  to support the new bridge friendly mode.
- A single fdb table hashed by (mac, vni) to allow fdb entries with
  multiple vnis in the same fdb table
- rx path already has the vni
- tx path expects a vni in the packet with dst_metadata
- prior to this series, fdb remote_dsts carried remote vni and
  the vxlan device carrying the fdb table represented the
  source vni. With the vxlan device now representing multiple vnis,
  this patch adds a src vni attribute to the fdb entry. The remote
  vni already uses NDA_VNI attribute. This patch introduces
  NDA_SRC_VNI netlink attribute to represent the src vni in a multi
  vni fdb table.

iproute2 example (patched and pruned iproute2 output to just show
relevant fdb entries):
example shows same host mac learnt on two vni's.

before (netdev per vni):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self

after this patch with collect metadata in bridged mode (single netdev):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self

Signed-off-by: Roopa Prabhu 
---
 drivers/net/vxlan.c|  196 +---
 include/uapi/linux/neighbour.h |1 +
 2 files changed, 126 insertions(+), 71 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 19b1653..6f16882 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -75,6 +75,7 @@ struct vxlan_fdb {
struct list_head  remotes;
u8eth_addr[ETH_ALEN];
u16   state;/* see ndm_state */
+   __be32vni;
u8flags;/* see ndm_flags */
 };
 
@@ -302,6 +303,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct 
vxlan_dev *vxlan,
if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
goto nla_put_failure;
+   if ((vxlan->flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
+   nla_put_u32(skb, NDA_SRC_VNI,
+   be32_to_cpu(fdb->vni)))
+   goto nla_put_failure;
if (rdst->remote_ifindex &&
nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
goto nla_put_failure;
@@ -400,34 +405,51 @@ static u32 eth_hash(const unsigned char *addr)
return hash_64(value, FDB_HASH_BITS);
 }
 
+static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
+{
+   /* use 1 byte of OUI and 3 bytes of NIC */
+   u32 key = get_unaligned((u32 *)(addr + 2));
+
+   return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
+}
+
 /* Hash chain to use given mac address */
 static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
-   const u8 *mac)
+   const u8 *mac, __be32 vni)
 {
-   return &vxlan->fdb_head[eth_hash(mac)];
+   if (vxlan->flags & VXLAN_F_COLLECT_METADATA)
+   return &vxlan->fdb_head[eth_vni_hash(mac, vni)];
+   else
+   return &vxlan->fdb_head[eth_hash(mac)];
 }
 
 /* Look up Ethernet address in forwarding table */
 static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
-   const u8 *mac)
+ const u8 *mac, __be32 vni)
 {
-   struct hlist_head *head =

[PATCH net-next v2 0/5] bridge: per vlan dst_metadata support

2017-01-31 Thread Roopa Prabhu

From: Roopa Prabhu 

High level summary:
lwt and dst_metadata have enabled vxlan l3 deployments
to use a single vxlan netdev for multiple vnis eliminating the scalability
problem with using a single vxlan netdev per vni. This series tries to
do the same for vxlan netdevs in pure l2 bridged networks.
Use-case/deployment and details are below.

Deployment scenario details:
As we know VXLAN is used to build layer 2 virtual networks across the
underlay layer3 infrastructure. A VXLAN tunnel endpoint (VTEP)
originates and terminates VXLAN tunnels. And a VTEP can be a TOR switch
or a vswitch in the hypervisor. This patch series mainly
focuses on the TOR switch configured as a Vtep. Vxlan segment ID (vni)
along with vlan id is used to identify layer 2 segments in a vxlan
overlay network. Vxlan bridging is the function provided by Vteps to terminate
vxlan tunnels and map the vxlan vni to traditional end host vlan. This is
covered in the "VXLAN Deployment Scenarios" in sections 6 and 6.1 in RFC 7348.
To provide vxlan bridging function, a vtep has to map vlan to a vni. The rfc
says that the ingress VTEP device shall remove the IEEE 802.1Q VLAN tag in
the original Layer 2 packet if there is one before encapsulating the packet
into the VXLAN format to transmit it through the underlay network. The remote
VTEP devices have information about the VLAN in which the packet will be
placed based on their own VLAN-to-VXLAN VNI mapping configurations.

Existing solution:
Without this patch series one can deploy such a vtep configuration by
adding the local ports and vxlan netdevs into a vlan filtering bridge.
The local ports are configured as trunk ports carrying all vlans.
A vxlan netdev per vni is added to the bridge. Vlan mapping to vni is
achieved by configuring the vlan as pvid on the corresponding vxlan netdev.
The vxlan netdev only receives traffic corresponding to the vlan it is mapped
to. This configuration maps traffic belonging to a vlan to the corresponding
vxlan segment.

  ---
 |  bridge   |
 |   |
  ---
|100,200   |100 (pvid)|200 (pvid)
|  |  |
   swp1  vxlan1000  vxlan2000

This provides the required vxlan bridging function but poses a
scalability problem with using a separate vxlan netdev for each vni.

Solution in this patch series:
The Goal is to use a single vxlan device to carry all vnis similar
to the vxlan collect metadata mode but additionally allowing the bridge
and vxlan driver to carry all the forwarding information and also learn.
This implementation uses the existing dst_metadata infrastructure to map
vlan to a tunnel id.
- vxlan driver changes:
- enable collect metadata mode to be used with learning,
  replication and fdb
- A single fdb table hashed by (mac, vni)
- rx path already has the vni
- tx path expects a vni in the packet with dst_metadata and relies
  on learnt or static forwarding information table to forward the packet

- Bridge driver changes: per vlan dst_metadata support:
- Our use case is vxlan and 1-1 mapping between vlan and vni, but I have
  kept the api generic for any tunnel info
- Uapi to configure/unconfigure/dump per vlan tunnel data
- new bridge port flag to turn this feature on/off. off by default
- ingress hook:
- if port is a tunnel port, use tunnel info in
  attached dst_metadata to map it to a local vlan
- egress hook:
- if port is a tunnel port, use tunnel info attached to vlan
  to set dst_metadata on the skb

Other approaches tried and vetoed:
- tc vlan push/pop and tunnel metadata dst:
- though tc can be used to do part of this, these patches address a 
deployment
  case where bridge driver vlan filtering and forwarding information
  database along with vxlan driver forwarding information table and learning
  are required.
- making vxlan driver understand vlan-vni mapping:
- I had a series almost ready with this one but soon realized
  it duplicated a lot of vlan handling code in the vxlan driver

Roopa Prabhu (5):
  ip_tunnels: new IP_TUNNEL_INFO_BRIDGE flag for ip_tunnel_info mode
  vxlan: support fdb and learning in COLLECT_METADATA mode
  bridge: uapi: add per vlan tunnel info
  bridge: per vlan dst_metadata netlink support
  bridge: vlan dst_metadata hooks in ingress and egress paths

 drivers/net/vxlan.c|  209 +---
 include/linux/if_bridge.h  |1 +
 include/net/ip_tunnels.h   |1 +
 include/uapi/linux/if_bridge.h |   11 ++
 include/uapi/linux/if_link.h   |1 +
 include/uapi/linux/neighbour.h |1 +
 net/bridge/Makefile|5 +-
 net/bridge/br_forward.c|2 +-
 net/bridge/br_input.c  |8 +-

[PATCH net-next v2 3/5] bridge: uapi: add per vlan tunnel info

2017-01-31 Thread Roopa Prabhu

From: Roopa Prabhu 

New nested netlink attribute to associate tunnel info per vlan.
This is used by bridge driver to send tunnel metadata to
bridge ports in vlan tunnel mode. This patch also adds new per
port flag IFLA_BRPORT_VLAN_TUNNEL to enable vlan tunnel mode.
off by default.

One example use for this is a vxlan bridging gateway or vtep
which maps vlans to vn-segments (or vnis). User can configure
per-vlan tunnel information which the bridge driver can use
to bridge vlan into the corresponding vn-segment.

Signed-off-by: Roopa Prabhu 
---
 include/linux/if_bridge.h  |1 +
 include/uapi/linux/if_bridge.h |   11 +++
 include/uapi/linux/if_link.h   |1 +
 3 files changed, 13 insertions(+)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index debc9d5..c5847dc 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -47,6 +47,7 @@ struct br_ip_list {
 #define BR_PROXYARP_WIFI   BIT(10)
 #define BR_MCAST_FLOOD BIT(11)
 #define BR_MULTICAST_TO_UNICASTBIT(12)
+#define BR_VLAN_TUNNEL BIT(13)
 
 #define BR_DEFAULT_AGEING_TIME (300 * HZ)
 
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index ab92bca..a9e6244 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -118,6 +118,7 @@ enum {
IFLA_BRIDGE_FLAGS,
IFLA_BRIDGE_MODE,
IFLA_BRIDGE_VLAN_INFO,
+   IFLA_BRIDGE_VLAN_TUNNEL_INFO,
__IFLA_BRIDGE_MAX,
 };
 #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1)
@@ -134,6 +135,16 @@ struct bridge_vlan_info {
__u16 vid;
 };
 
+enum {
+   IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC,
+   IFLA_BRIDGE_VLAN_TUNNEL_ID,
+   IFLA_BRIDGE_VLAN_TUNNEL_VID,
+   IFLA_BRIDGE_VLAN_TUNNEL_FLAGS,
+   __IFLA_BRIDGE_VLAN_TUNNEL_MAX,
+};
+
+#define IFLA_BRIDGE_VLAN_TUNNEL_MAX (__IFLA_BRIDGE_VLAN_TUNNEL_MAX - 1)
+
 struct bridge_vlan_xstats {
__u64 rx_bytes;
__u64 rx_packets;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b9aa564..320fc1e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -322,6 +322,7 @@ enum {
IFLA_BRPORT_PAD,
IFLA_BRPORT_MCAST_FLOOD,
IFLA_BRPORT_MCAST_TO_UCAST,
+   IFLA_BRPORT_VLAN_TUNNEL,
__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
-- 
1.7.10.4

[PATCH net-next v2 4/5] bridge: per vlan dst_metadata netlink support

2017-01-31 Thread Roopa Prabhu

From: Roopa Prabhu 

This patch adds support to attach per vlan tunnel info dst
metadata. This enables bridge driver to map vlan to tunnel_info
at ingress and egress. It uses the kernel dst_metadata infrastructure.

The initial use case is vlan to vni bridging, but the api is generic
to extend to any tunnel_info in the future:
- Uapi to configure/unconfigure/dump per vlan tunnel data
- netlink functions to configure vlan and tunnel_info mapping
- Introduces bridge port flag BR_VLAN_TUNNEL to enable attach/detach
dst_metadata to bridged packets on ports. off by default.
- changes to existing code is mainly refactor some existing vlan
handling netlink code + hooks for new vlan tunnel code
- I have kept the vlan tunnel code isolated in separate files.
- most of the netlink vlan tunnel code is handling of vlan-tunid
ranges (follows the vlan range handling code). To conserve space
vlan-tunid by default are always dumped in ranges if applicable.

Use case:
example use for this is a vxlan bridging gateway or vtep
which maps vlans to vn-segments (or vnis).

iproute2 example (patched and pruned iproute2 output to just show
relevant fdb entries):
example shows same host mac learnt on two vni's and
vlan 100 maps to vni 1000, vlan 101 maps to vni 1001

before (netdev per vni):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan1001 vlan 101 master bridge
00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan1000 vlan 100 master bridge
00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self

after this patch with collect metadata in bridged mode (single netdev):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan0 vlan 101 master bridge
00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan0 vlan 100 master bridge
00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self

CC: Nikolay Aleksandrov 
Signed-off-by: Roopa Prabhu 
---
 net/bridge/Makefile|5 +-
 net/bridge/br_netlink.c|  140 +--
 net/bridge/br_netlink_tunnel.c |  296 
 net/bridge/br_private.h|   10 ++
 net/bridge/br_private_tunnel.h |   72 ++
 net/bridge/br_vlan.c   |   17 ++-
 net/bridge/br_vlan_tunnel.c|  149 
 7 files changed, 641 insertions(+), 48 deletions(-)
 create mode 100644 net/bridge/br_netlink_tunnel.c
 create mode 100644 net/bridge/br_private_tunnel.h
 create mode 100644 net/bridge/br_vlan_tunnel.c

diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index 0aefc01..40b1ede 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_BRIDGE) += bridge.o
 
 bridge-y   := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
br_ioctl.o br_stp.o br_stp_bpdu.o \
-   br_stp_if.o br_stp_timer.o br_netlink.o
+   br_stp_if.o br_stp_timer.o br_netlink.o \
+   br_netlink_tunnel.o
 
 bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
@@ -18,7 +19,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
 
 bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
 
-bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o
+bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o
 
 bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 6c087cd..6dbfc2f 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -20,6 +20,7 @@
 
 #include "br_private.h"
 #include "br_private_stp.h"
+#include "br_private_tunnel.h"
 
 static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
u32 filter_mask)
@@ -95,9 +96,10 @@ static size_t br_get_link_af_size_filtered(const struct 
net_device *dev,
   u32 filter_mask)
 {
struct net_bridge_vlan_group *vg = NULL;
-   struct net_bridge_port *p;
+   struct net_bridge_port *p = NULL;
struct net_bridge *br;
int num_vlan_infos;
+   size_t vinfo_sz = 0;
 
rcu_read_lock();
if (br_port_exists(dev)) {
@@ -110,8 +112,13 @@ static size_t br_get_link_af_size_filtered(const struct 
net_device *dev,
num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
rcu_read_unlock();
 
+   if (p && (p->flags & BR_VLAN_TUNNEL))
+   vinfo_sz += br_get_vlan_tunnel_info_size(vg);
+
/* Each VLAN is returned in bridge_vlan_info along with flags */
-   return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+   vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct 
bridge_vlan_info));
+
+   return vinfo_sz;
 }
 
 static inline size_t br_port_info_size(void)
@@ -128,6 +135,7 @@ static inline size_t

[PATCH net-next v2 1/5] ip_tunnels: new IP_TUNNEL_INFO_BRIDGE flag for ip_tunnel_info mode

2017-01-31 Thread Roopa Prabhu

From: Roopa Prabhu 

New ip_tunnel_info flag to represent bridged tunnel metadata.
Used by bridge driver later in the series to pass per vlan dst
metadata to bridge ports.

Signed-off-by: Roopa Prabhu 
---
 include/net/ip_tunnels.h |1 +
 1 file changed, 1 insertion(+)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 3d4ca4d..9505679 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -58,6 +58,7 @@ struct ip_tunnel_key {
 /* Flags for ip_tunnel_info mode. */
 #define IP_TUNNEL_INFO_TX  0x01/* represents tx tunnel parameters */
 #define IP_TUNNEL_INFO_IPV60x02/* key contains IPv6 addresses */
+#define IP_TUNNEL_INFO_BRIDGE  0x04/* represents a bridged tunnel id */
 
 /* Maximum tunnel options length. */
 #define IP_TUNNEL_OPTS_MAX \
-- 
1.7.10.4

[PATCH net v2 1/2] mlx4: Fix memory leak after mlx4_en_update_priv()

2017-01-31 Thread Martin KaFai Lau

In mlx4_en_update_priv(), dst->tx_ring[t] and dst->tx_cq[t]
are over-written by src->tx_ring[t] and src->tx_cq[t] without
first calling kfree.

One of the reproducible code paths is by doing 'ethtool -L'.

The fix is to do the kfree in mlx4_en_free_resources().

Here is the kmemleak report:
unreferenced object 0x880841211800 (size 2048):
  comm "ethtool", pid 3096, jiffies 4294716940 (age 528.353s)
  hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
  backtrace:
[] kmemleak_alloc+0x28/0x50
[] kmem_cache_alloc_trace+0x103/0x260
[] mlx4_en_try_alloc_resources+0x118/0x1a0
[] mlx4_en_set_ringparam+0x169/0x210
[] dev_ethtool+0xae5/0x2190
[] dev_ioctl+0x168/0x6f0
[] sock_do_ioctl+0x42/0x50
[] sock_ioctl+0x21b/0x2d0
[] do_vfs_ioctl+0x93/0x6a0
[] SyS_ioctl+0x79/0x90
[] entry_SYSCALL_64_fastpath+0x18/0xad
[] 0x
unreferenced object 0x880841213000 (size 2048):
  comm "ethtool", pid 3096, jiffies 4294716940 (age 528.353s)
  hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
  backtrace:
[] kmemleak_alloc+0x28/0x50
[] kmem_cache_alloc_trace+0x103/0x260
[] mlx4_en_try_alloc_resources+0x13b/0x1a0
[] mlx4_en_set_ringparam+0x169/0x210
[] dev_ethtool+0xae5/0x2190
[] dev_ioctl+0x168/0x6f0
[] sock_do_ioctl+0x42/0x50
[] sock_ioctl+0x21b/0x2d0
[] do_vfs_ioctl+0x93/0x6a0
[] SyS_ioctl+0x79/0x90
[] entry_SYSCALL_64_fastpath+0x18/0xad
[] 0x

(gdb) list *mlx4_en_try_alloc_resources+0x118
0x8170e0a8 is in mlx4_en_try_alloc_resources 
(drivers/net/ethernet/mellanox/mlx4/en_netdev.c:2145).
2140if (!dst->tx_ring_num[t])
2141continue;
2142
2143dst->tx_ring[t] = kzalloc(sizeof(struct mlx4_en_tx_ring 
*) *
2144  MAX_TX_RINGS, GFP_KERNEL);
2145if (!dst->tx_ring[t])
2146goto err_free_tx;
2147
2148dst->tx_cq[t] = kzalloc(sizeof(struct mlx4_en_cq *) *
2149MAX_TX_RINGS, GFP_KERNEL);
(gdb) list *mlx4_en_try_alloc_resources+0x13b
0x8170e0cb is in mlx4_en_try_alloc_resources 
(drivers/net/ethernet/mellanox/mlx4/en_netdev.c:2150).
2145if (!dst->tx_ring[t])
2146goto err_free_tx;
2147
2148dst->tx_cq[t] = kzalloc(sizeof(struct mlx4_en_cq *) *
2149MAX_TX_RINGS, GFP_KERNEL);
2150if (!dst->tx_cq[t]) {
2151kfree(dst->tx_ring[t]);
2152goto err_free_tx;
2153}
2154}

Fixes: ec25bc04ed8e ("net/mlx4_en: Add resilience in low memory systems")
Cc: Eugenia Emantayev 
Cc: Saeed Mahameed 
Cc: Tariq Toukan 
Signed-off-by: Martin KaFai Lau 
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 761f8b12399c..3abcead208d2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2042,6 +2042,8 @@ static void mlx4_en_free_resources(struct mlx4_en_priv 
*priv)
if (priv->tx_cq[t] && priv->tx_cq[t][i])
mlx4_en_destroy_cq(priv, &priv->tx_cq[t][i]);
}
+   kfree(priv->tx_ring[t]);
+   kfree(priv->tx_cq[t]);
}
 
for (i = 0; i < priv->rx_ring_num; i++) {
@@ -2214,7 +2216,6 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
 {
struct mlx4_en_priv *priv = netdev_priv(dev);
struct mlx4_en_dev *mdev = priv->mdev;
-   int t;
 
en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
 
@@ -2248,11 +2249,6 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
mlx4_en_free_resources(priv);
mutex_unlock(&mdev->state_lock);
 
-   for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
-   kfree(priv->tx_ring[t]);
-   kfree(priv->tx_cq[t]);
-   }
-
free_netdev(dev);
 }
 
-- 
2.5.1

[PATCH net v2 2/2] mlx4: xdp_prog becomes inactive after ethtool '-L' or '-G'

2017-01-31 Thread Martin KaFai Lau

After calling mlx4_en_try_alloc_resources (e.g. by changing the
number of rx-queues with ethtool -L), the existing xdp_prog becomes
inactive.

The bug is that the xdp_prog ptr has not been carried over from
the old rx-queues to the new rx-queues

Fixes: 47a38e155037 ("net/mlx4_en: add support for fast rx drop bpf program")
Cc: Brenden Blanco 
Cc: Saeed Mahameed 
Cc: Tariq Toukan 
Signed-off-by: Martin KaFai Lau 
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  | 27 +
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h|  3 ++-
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index d5a9372ed84d..9aa422691954 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -1099,7 +1099,7 @@ static int mlx4_en_set_ringparam(struct net_device *dev,
memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
new_prof.tx_ring_size = tx_size;
new_prof.rx_ring_size = rx_size;
-   err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
+   err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true);
if (err)
goto out;
 
@@ -1774,7 +1774,7 @@ static int mlx4_en_set_channels(struct net_device *dev,
new_prof.tx_ring_num[TX_XDP] = xdp_count;
new_prof.rx_ring_num = channel->rx_count;
 
-   err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
+   err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true);
if (err)
goto out;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 3abcead208d2..3b4961a8e8e4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2186,9 +2186,11 @@ static void mlx4_en_update_priv(struct mlx4_en_priv *dst,
 
 int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv,
struct mlx4_en_priv *tmp,
-   struct mlx4_en_port_profile *prof)
+   struct mlx4_en_port_profile *prof,
+   bool carry_xdp_prog)
 {
-   int t;
+   struct bpf_prog *xdp_prog;
+   int i, t;
 
mlx4_en_copy_priv(tmp, priv, prof);
 
@@ -2202,6 +2204,23 @@ int mlx4_en_try_alloc_resources(struct mlx4_en_priv 
*priv,
}
return -ENOMEM;
}
+
+   /* All rx_rings has the same xdp_prog.  Pick the first one. */
+   xdp_prog = rcu_dereference_protected(
+   priv->rx_ring[0]->xdp_prog,
+   lockdep_is_held(&priv->mdev->state_lock));
+
+   if (xdp_prog && carry_xdp_prog) {
+   xdp_prog = bpf_prog_add(xdp_prog, tmp->rx_ring_num);
+   if (IS_ERR(xdp_prog)) {
+   mlx4_en_free_resources(tmp);
+   return PTR_ERR(xdp_prog);
+   }
+   for (i = 0; i < tmp->rx_ring_num; i++)
+   rcu_assign_pointer(tmp->rx_ring[i]->xdp_prog,
+  xdp_prog);
+   }
+
return 0;
 }
 
@@ -2751,7 +2770,7 @@ static int mlx4_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
en_warn(priv, "Reducing the number of TX rings, to not exceed 
the max total rings number.\n");
}
 
-   err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
+   err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, false);
if (err) {
if (prog)
bpf_prog_sub(prog, priv->rx_ring_num - 1);
@@ -3495,7 +3514,7 @@ int mlx4_en_reset_config(struct net_device *dev,
memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
memcpy(&new_prof.hwtstamp_config, &ts_config, sizeof(ts_config));
 
-   err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
+   err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true);
if (err)
goto out;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index ba1c6cd0cc79..cec59bc264c9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -679,7 +679,8 @@ void mlx4_en_set_stats_bitmap(struct mlx4_dev *dev,
 
 int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv,
struct mlx4_en_priv *tmp,
-   struct mlx4_en_port_profile *prof);
+   struct mlx4_en_port_profile *prof,
+   bool carry_xdp_prog);
 void mlx4_en_safe_replace_resources(struct mlx4_en_priv *priv,
struct mlx4_en_priv *tmp);
 
-- 
2.5.1

[PATCH net v2 0/2] mlx4: Misc bug fixes after reinitializing queues

2017-01-31 Thread Martin KaFai Lau

This patchset fixes misc bugs after reinitializing
queues (e.g. by ethtool -L).

v2:
* Add another fix to mem leak in tx_ring[t] and tx_cq[t]
* In mlx4_en_try_alloc_resources(),
  move all xdp_prog logic after calling mlx4_en_alloc_resources()

Martin KaFai Lau (2):
  mlx4: Fix memory leak after mlx4_en_update_priv()
  mlx4: xdp_prog becomes inactive after mlx4_en_try_alloc_resources()

 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |  4 +--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  | 35 ++---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h|  3 ++-
 3 files changed, 29 insertions(+), 13 deletions(-)

-- 
2.5.1

Re: [PATCH net-next 00/10] bnxt_en: Add XDP support.

2017-01-31 Thread Alexei Starovoitov

On Tue, Jan 31, 2017 at 9:33 PM, Andy Gospodarek  wrote:
> On Tue, Jan 31, 2017 at 10:36 AM, Andy Gospodarek  wrote:
>> On Mon, Jan 30, 2017 at 08:47:47PM -0800, Alexei Starovoitov wrote:
>>> On Mon, Jan 30, 2017 at 08:49:25PM -0500, Michael Chan wrote:
>>> > The first 8 patches refactor the code (rx/tx code paths and ring logic)
>>> > and add the basic infrastructure to support XDP.  The 9th patch adds
>>> > basic ndo_xdp to support XDP_DROP and XDP_PASS only.  The 10th patch
>>> > completes the series with XDP_TX.
>>>
>>> Looks great.
>>> Could you please share performance numbers ?
>>
>> I'll post some later today.
>
> I finally got my system moved around to what I'd hoped would be the
> right spot in my lab, but the system used for generating the traffic
> was only able to send 6Mpps with pktgen, so it was not a great test.
>
> My receiving system with i7-6700 CPU @ 3.40GHz seemed to have no issue
> handling this 6Mpps load -- mpstat showed only one core was ~25%
> utilized with all of that servicing softirqs.  The rest of the cores
> were 100% idle.
>
> I'm going to search for other traffic generation tools/systems to make
> sure I can get at least line-rate for the 10GbE cards I was using.

hmm. last time I tried pktgen on bnx2x it was easily doing 14Mpps with burst on.
Have you been using samples/pktgen/pktgen_sample03_burst_single_flow.sh ?
Waiting for this set to land to start benchmarking on bnxt.
So having a baseline will certainly help :)

Thanks!

RE: [PATCH net-next 1/2] net: Introduce ife encapsulation module

2017-01-31 Thread Yotam Gigi

>-Original Message-
>From: kbuild test robot [mailto:l...@intel.com]
>Sent: Wednesday, February 01, 2017 1:58 AM
>To: Yotam Gigi <yot...@mellanox.com>
>Cc: kbuild-...@01.org; j...@mojatatu.com; da...@davemloft.net;
>netdev@vger.kernel.org; Jiri Pirko <j...@mellanox.com>; Elad Raz
><el...@mellanox.com>; Ido Schimmel <ido...@mellanox.com>; Yotam Gigi
><yot...@mellanox.com>
>Subject: Re: [PATCH net-next 1/2] net: Introduce ife encapsulation module
>
>Hi Yotam,
>
>[auto build test ERROR on net-next/master]
>
>url:
>https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.co
>m%2F0day-ci%2Flinux%2Fcommits%2FYotam-Gigi%2FExtract-IFE-logic-to-
>module%2F20170131-
>222757=02%7C01%7Cyotamg%40mellanox.com%7C78ded484c30746e7ecba08
>d44a350e8e%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636215039177
>989111=ZjfwyvJCJWthy4VSx2Yyteu%2BU3VkVdSIqAPiaCUprqo%3D
>ed=0
>config: alpha-allyesconfig (attached as .config)
>compiler: alpha-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
>reproduce:
>wget
>https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.kernel
>.org%2Fcgit%2Flinux%2Fkernel%2Fgit%2Fwfg%2Flkp-
>tests.git%2Fplain%2Fsbin%2Fmake.cross=02%7C01%7Cyotamg%40mellanox.
>com%7C78ded484c30746e7ecba08d44a350e8e%7Ca652971c7d2e4d9ba6a4d149256f
>461b%7C0%7C0%7C636215039177999119=PFLoGWvLTSWuY2e%2F%2F5C%2
>F%2BGlpNGHQgeX3UPXYxrgYqzc%3D=0 -O ~/bin/make.cross
>chmod +x ~/bin/make.cross
># save the attached .config to linux build tree
>make.cross ARCH=alpha
>
>Note: the linux-review/Yotam-Gigi/Extract-IFE-logic-to-module/20170131-222757
>HEAD 236a17de4759e7d4d2927d4ac50329ec788ec655 builds fine.
>  It only hurts bisectability.
>
>All errors (new ones prefixed by >>):
>
>   net/ife/built-in.o: In function `ife_tlv_meta_encode':
>>> (.text+0x420): multiple definition of `ife_tlv_meta_encode'
>   net/sched/built-in.o:(.text+0x18800): first defined here
>
>---
>0-DAY kernel test infrastructureOpen Source Technology Center
>https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.01.or
>g%2Fpipermail%2Fkbuild-
>all=02%7C01%7Cyotamg%40mellanox.com%7C78ded484c30746e7ecba08d44a
>350e8e%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C63621503917799911
>9=m2yy4YGzOaxrLF%2FjqgxikQ8GELNBj%2F9IlSOlS%2F831tY%3D
>=0   Intel Corporation

Sorry for that. I will send v2 soon.

Re: [PATCH net] mlx4: xdp_prog becomes inactive after ethtool '-L' or '-G'

2017-01-31 Thread Martin KaFai Lau

On Mon, Jan 30, 2017 at 07:18:28PM +0200, Tariq Toukan wrote:
> Hi Martin,
>
> Thanks for your patch.
>
> It looks good to me, in general.
> I just have one small comment below.
Thanks for your feedback and sorry for the delay.

>
> On 28/01/2017 9:40 AM, Martin KaFai Lau wrote:
> > If the rx-queues ever get re-initialized (e.g. by changing the
> > number of rx-queues with ethtool -L), the existing xdp_prog becomes
> > inactive.
> >
> > The bug is that the xdp_prog ptr has not been carried over from
> > the old rx-queues to the new rx-queues
> >
> > Fixes: 47a38e155037 ("net/mlx4_en: add support for fast rx drop bpf 
> > program")
> > Signed-off-by: Martin KaFai Lau 
> > ---
> ...
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
> > b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> > index 761f8b12399c..f4179086b3c6 100644
> > --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> > @@ -2184,23 +2184,57 @@ static void mlx4_en_update_priv(struct mlx4_en_priv 
> > *dst,
> >   int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv,
> > struct mlx4_en_priv *tmp,
> > -   struct mlx4_en_port_profile *prof)
> > +   struct mlx4_en_port_profile *prof,
> > +   bool carry_xdp_prog)
> >   {
> > -   int t;
> > +   struct bpf_prog *xdp_prog = NULL;
> > +   int err;
> > +   int i;
> > mlx4_en_copy_priv(tmp, priv, prof);
> > +   if (carry_xdp_prog) {
> > +   /* All rx_rings has the same xdp_prog.  Pick the first one */
> > +   xdp_prog = rcu_dereference_protected(
> > +   priv->rx_ring[0]->xdp_prog,
> > +   lockdep_is_held(&priv->mdev->state_lock));
> > +
> > +   if (xdp_prog) {
> > +   xdp_prog = bpf_prog_add(xdp_prog, tmp->rx_ring_num);
> > +   if (IS_ERR(xdp_prog)) {
> > +   err = PTR_ERR(xdp_prog);
> > +   xdp_prog = NULL;
> > +   goto err_free;
> > +   }
> > +   }
> > +   }
> Why do you prefer dealing with xdp_prog in two stages? You can handle it all
> at once, after "mlx4_en_alloc_resources()" succeeds.
If bpf_prog_add() did fail, resources allocated for tmp had to
be freed.  I was thinking it is not safe to call mlx4_en_free_resources()
at this point.  Since your feedback, I took another look and re-read the
'goto err' path in mlx4_en_alloc_resources(), I realized we can use
mlx4_en_free_resources() here for the bpf_prog_add() error case.  Hence,
agree with your suggestion.

A side note though:
after taking another look at mlx4_en_free_resources(), I might have found
another issue.  I need to run some tests to confirm first to avoid any false
alarm.

I will post v2.

Thanks,
--Martin


> > +
> > if (mlx4_en_alloc_resources(tmp)) {
> > en_warn(priv,
> > "%s: Resource allocation failed, using previous 
> > configuration\n",
> > __func__);
> > -   for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
> > -   kfree(tmp->tx_ring[t]);
> > -   kfree(tmp->tx_cq[t]);
> > -   }
> > -   return -ENOMEM;
> > +   err = -ENOMEM;
> > +   goto err_free;
> > +   }
> > +
> > +   if (xdp_prog) {
> > +   for (i = 0; i < tmp->rx_ring_num; i++)
> > +   rcu_assign_pointer(tmp->rx_ring[i]->xdp_prog,
> > +  xdp_prog);
> > }
> > +
> > return 0;
> > +
> > +err_free:
> > +   if (xdp_prog)
> > +   bpf_prog_sub(xdp_prog, tmp->rx_ring_num);
> > +
> > +   for (i = 0; i < MLX4_EN_NUM_TX_TYPES; i++) {
> > +   kfree(tmp->tx_ring[i]);
> > +   kfree(tmp->tx_cq[i]);
> > +   }
> > +
> > +   return err;
> >   }
> >   void mlx4_en_safe_replace_resources(struct mlx4_en_priv *priv,
> >
> Regards,
> Tariq Toukan.

Re: [PATCH net v2] be2net: fix initial MAC setting

2017-01-31 Thread Sriharsha Basavapatna

On Wed, Feb 1, 2017 at 12:31 AM, Ivan Vecera  wrote:
> Recent commit 34393529163a ("be2net: fix MAC addr setting on privileged
> BE3 VFs") allows privileged BE3 VFs to set its MAC address during
> initialization. Although the initial MAC for such VFs is already
> programmed by parent PF the subsequent setting performed by VF is OK,
> but in certain cases (after fresh boot) this command in VF can fail.
>
> The MAC should be initialized only when:
> 1) no MAC is programmed (always except BE3 VFs during first init)
> 2) programmed MAC is different from requested (e.g. MAC is set when
>interface is down). In this case the initial MAC programmed by PF
>needs to be deleted.
>
> The adapter->dev_mac contains MAC address currently programmed in HW so
> it should be zeroed when the MAC is deleted from HW and should not be
> filled when MAC is set when interface is down in be_mac_addr_set() as
> no programming is performed in this case.
>
> Example of failure without the fix (immediately after fresh boot):
>
> # ip link set eth0 up  <- eth0 is BE3 PF
> be2net :01:00.0 eth0: Link is Up
>
> # echo 1 > /sys/class/net/eth0/device/sriov_numvfs  <- Create 1 VF
> ...
> be2net :01:04.0: Emulex OneConnect(be3): VF  port 0
>
> # ip link set eth8 up  <- eth8 is created privileged VF
> be2net :01:04.0: opcode 59-1 failed:status 1-76
> RTNETLINK answers: Input/output error
>
> # echo 0 > /sys/class/net/eth0/device/sriov_numvfs  <- Delete VF
> iommu: Removing device :01:04.0 from group 33
> ...
>
> # echo 1 > /sys/class/net/eth0/device/sriov_numvfs  <- Create it again
> iommu: Removing device :01:04.0 from group 33
> ...
>
> # ip link set eth8 up
> be2net :01:04.0 eth8: Link is Up
>
> Initialization is now OK.
>
> v2 - Corrected the comment and condition check suggested by Suresh & Harsha
>
> Fixes: 34393529163a ("be2net: fix MAC addr setting on privileged BE3 VFs")
> Cc: Sathya Perla 
> Cc: Ajit Khaparde 
> Cc: Sriharsha Basavapatna 
> Cc: Somnath Kotur 
> Signed-off-by: Ivan Vecera 
> ---
>  drivers/net/ethernet/emulex/benet/be_main.c | 33 
> -
>  1 file changed, 28 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
> b/drivers/net/ethernet/emulex/benet/be_main.c
> index 1a7f8ad7b9c6..cd49a54c538d 100644
> --- a/drivers/net/ethernet/emulex/benet/be_main.c
> +++ b/drivers/net/ethernet/emulex/benet/be_main.c
> @@ -362,8 +362,10 @@ static int be_mac_addr_set(struct net_device *netdev, 
> void *p)
> status = -EPERM;
> goto err;
> }
> -done:
> +
> +   /* Remember currently programmed MAC */
> ether_addr_copy(adapter->dev_mac, addr->sa_data);
> +done:
> ether_addr_copy(netdev->dev_addr, addr->sa_data);
> dev_info(dev, "MAC address changed to %pM\n", addr->sa_data);
> return 0;
> @@ -3618,8 +3620,10 @@ static void be_disable_if_filters(struct be_adapter 
> *adapter)
>  {
> /* Don't delete MAC on BE3 VFs without FILTMGMT privilege  */
> if (!BEx_chip(adapter) || !be_virtfn(adapter) ||
> -   check_privilege(adapter, BE_PRIV_FILTMGMT))
> +   check_privilege(adapter, BE_PRIV_FILTMGMT)) {
> be_dev_mac_del(adapter, adapter->pmac_id[0]);
> +   eth_zero_addr(adapter->dev_mac);
> +   }
>
> be_clear_uc_list(adapter);
> be_clear_mc_list(adapter);
> @@ -3773,12 +3777,27 @@ static int be_enable_if_filters(struct be_adapter 
> *adapter)
> if (status)
> return status;
>
> -   /* Don't add MAC on BE3 VFs without FILTMGMT privilege */
> -   if (!BEx_chip(adapter) || !be_virtfn(adapter) ||
> -   check_privilege(adapter, BE_PRIV_FILTMGMT)) {
> +   /* Normally this condition usually true as the ->dev_mac is zeroed.
> +* But on BE3 VFs the initial MAC is pre-programmed by PF and
> +* subsequent be_dev_mac_add() can fail (after fresh boot)
> +*/
> +   if (!ether_addr_equal(adapter->dev_mac, adapter->netdev->dev_addr)) {
> +   int old_pmac_id = -1;
> +
> +   /* Remember old programmed MAC if any - can happen on BE3 VF 
> */
> +   if (!is_zero_ether_addr(adapter->dev_mac))
> +   old_pmac_id = adapter->pmac_id[0];
> +
> status = be_dev_mac_add(adapter, adapter->netdev->dev_addr);
> if (status)
> return status;
> +
> +   /* Delete the old programmed MAC as we successfully programmed
> +* a new MAC
> +*/
> +   if (old_pmac_id >= 0 && old_pmac_id != adapter->pmac_id[0])
> +   be_dev_mac_del(adapter, old_pmac_id);
> +
> ether_addr_copy(adapter->dev_mac, adapter->netdev->dev_addr);
> }
>

Re: [PATCH 4.10-rc3 12/13] net: ath5k: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Kalle Valo

Russell King  writes:

> Fix these errors reported by the 0-day builder by replacing the
> linux/export.h include with linux/module.h.
>
> In file included from include/linux/platform_device.h:14:0,
>  from drivers/net/wireless/ath/ath5k/ahb.c:20:
> include/linux/device.h:1463:1: warning: data definition has no type or 
> storage class
>  module_init(__driver##_init); \
>  ^
> include/linux/platform_device.h:228:2: note: in expansion of macro 
> 'module_driver'
>   module_driver(__platform_driver, platform_driver_register, \
>   ^
> drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
> 'module_platform_driver'
>  module_platform_driver(ath_ahb_driver);
>  ^~
> include/linux/device.h:1463:1: error: type defaults to 'int' in declaration 
> of 'module_init' [-Werror=implicit-int]
>  module_init(__driver##_init); \
>  ^
> include/linux/platform_device.h:228:2: note: in expansion of macro 
> 'module_driver'
>   module_driver(__platform_driver, platform_driver_register, \
>   ^
> drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
> 'module_platform_driver'
>  module_platform_driver(ath_ahb_driver);
>  ^~
> drivers/net/wireless/ath/ath5k/ahb.c:233:1: warning: parameter names (without 
> types) in function declaration
> In file included from include/linux/platform_device.h:14:0,
>  from drivers/net/wireless/ath/ath5k/ahb.c:20:
> include/linux/device.h:1468:1: warning: data definition has no type or 
> storage class
>  module_exit(__driver##_exit);
>  ^
> include/linux/platform_device.h:228:2: note: in expansion of macro 
> 'module_driver'
>   module_driver(__platform_driver, platform_driver_register, \
>   ^
> drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
> 'module_platform_driver'
>  module_platform_driver(ath_ahb_driver);
>  ^~
> include/linux/device.h:1468:1: error: type defaults to 'int' in declaration 
> of 'module_exit' [-Werror=implicit-int]
>  module_exit(__driver##_exit);
>  ^
> include/linux/platform_device.h:228:2: note: in expansion of macro 
> 'module_driver'
>   module_driver(__platform_driver, platform_driver_register, \
>   ^
> drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
> 'module_platform_driver'
>  module_platform_driver(ath_ahb_driver);
>  ^~
> drivers/net/wireless/ath/ath5k/ahb.c:233:1: warning: parameter names (without 
> types) in function declaration
> In file included from include/linux/platform_device.h:14:0,
>  from drivers/net/wireless/ath/ath5k/ahb.c:20:
> drivers/net/wireless/ath/ath5k/ahb.c:233:24: warning: 'ath_ahb_driver_exit' 
> defined but not used [-Wunused-function]
>  module_platform_driver(ath_ahb_driver);
> ^
> include/linux/device.h:1464:20: note: in definition of macro 'module_driver'
>  static void __exit __driver##_exit(void) \
> ^~~~
> drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
> 'module_platform_driver'
>  module_platform_driver(ath_ahb_driver);
>  ^~
> drivers/net/wireless/ath/ath5k/ahb.c:233:24: warning: 'ath_ahb_driver_init' 
> defined but not used [-Wunused-function]
>  module_platform_driver(ath_ahb_driver);
> ^
> include/linux/device.h:1459:19: note: in definition of macro 'module_driver'
>  static int __init __driver##_init(void) \
>^~~~
> drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
> 'module_platform_driver'
>  module_platform_driver(ath_ahb_driver);
>  ^~
>
> Signed-off-by: Russell King 

Looks good to me:

Acked-by: Kalle Valo 

I assume Dave will take this so I'll drop the patch from my queue.

-- 
Kalle Valo

Re: [PATCH net-next 00/10] bnxt_en: Add XDP support.

2017-01-31 Thread Andy Gospodarek

On Tue, Jan 31, 2017 at 10:36 AM, Andy Gospodarek  wrote:
> On Mon, Jan 30, 2017 at 08:47:47PM -0800, Alexei Starovoitov wrote:
>> On Mon, Jan 30, 2017 at 08:49:25PM -0500, Michael Chan wrote:
>> > The first 8 patches refactor the code (rx/tx code paths and ring logic)
>> > and add the basic infrastructure to support XDP.  The 9th patch adds
>> > basic ndo_xdp to support XDP_DROP and XDP_PASS only.  The 10th patch
>> > completes the series with XDP_TX.
>>
>> Looks great.
>> Could you please share performance numbers ?
>
> I'll post some later today.

I finally got my system moved around to what I'd hoped would be the
right spot in my lab, but the system used for generating the traffic
was only able to send 6Mpps with pktgen, so it was not a great test.

My receiving system with i7-6700 CPU @ 3.40GHz seemed to have no issue
handling this 6Mpps load -- mpstat showed only one core was ~25%
utilized with all of that servicing softirqs.  The rest of the cores
were 100% idle.

I'm going to search for other traffic generation tools/systems to make
sure I can get at least line-rate for the 10GbE cards I was using.

>
>>
>> Also please add something like:
>>   if (prog && prog->xdp_adjust_head) {
>>   netdev_warn(dev, "Does not support bpf_xdp_adjust_head()\n");
>>   return -EOPNOTSUPP;
>>   }
>> unless you plan to add adjust_head support before net-next closes.
>> Note, it's a must-have for load balancer functionality.

net: macb: lack of statistics - tx_dropped

2017-01-31 Thread Hyung Jin Jung

macb doesn't maintain statistics of packets dropped on transmit,
so it's not possible to detect packet loss from user space (sysfs, procfs...).

macb_start_xmit should increase tx_dropped in the error cases - buffer 
full, DMA mapping failure.

Davy

Re: [RFC PATCH 1/2] af_packet: direct dma for packet ineterface

2017-01-31 Thread John Fastabend

On 17-01-30 05:31 PM, Willem de Bruijn wrote:
>>>> V3 header formats added bulk polling via socket calls and timers
>>>> used in the polling interface to return every n milliseconds. Currently,
>>>> I don't see any way to support this in hardware because we can't
>>>> know if the hardware is in the middle of a DMA operation or not
>>>> on a slot. So when a timer fires I don't know how to advance the
>>>> descriptor ring leaving empty descriptors similar to how the software
>>>> ring works. The easiest (best?) route is to simply not support this.
>>>
>>> From a performance pov bulking is essential. Systems like netmap that
>>> also depend on transferring control between kernel and userspace,
>>> report[1] that they need at least bulking size 8, to amortize the overhead.
> 
> To introduce interrupt moderation, ixgbe_do_ddma only has to elide the
> sk_data_ready, and schedule an hrtimer if one is not scheduled yet.
> 
> If I understand correctly, the difficulty lies in v3 requiring that the
> timer "close" the block when the timer expires. That may not be worth
> implementing, indeed.
> 

Yep that is where I just gave up and decided it wasn't worth it.

> Hardware interrupt moderation and napi may already give some
> moderation, even with a sock_def_readable call for each packet. If
> considering a v4 format, I'll again suggest virtio virtqueues. Those
> have interrupt suppression built in with EVENT_IDX.


Agreed. On paper now I'm considering moving to something like this after
getting some feedback here. Of course I'll need to play with the code a
bit to see what it looks like. I'll need a couple weeks probably to get
this sorted out.

> 
>>> Likely, but I would like that we do a measurement based approach.  Lets
>>> benchmark with this V2 header format, and see how far we are from
>>> target, and see what lights-up in perf report and if it is something we
>>> can address.
>>
>> Yep I'm hoping to get to this sometime this week.
> 
> Perhaps also without filling in the optional metadata data fields
> in tpacket and sockaddr_ll.
> 
>>> E.g. how will you support XDP_TX?  AFAIK you cannot remove/detach a
>>> packet with this solution (and place it on a TX queue and wait for DMA
>>> TX completion).
>>>
>>
>> This is something worth exploring. tpacket_v2 uses a fixed ring with
>> slots so all the pages are allocated and assigned to the ring at init
>> time. To xmit a packet in this case the user space application would
>> be required to leave the packet descriptor on the rx side pinned
>> until the tx side DMA has completed. Then it can unpin the rx side
>> and return it to the driver. This works if the TX/RX processing is
>> fast enough to keep up. For many things this is good enough.
>>
>> For some work loads though this may not be sufficient. In which
>> case a tpacket_v4 would be useful that can push down a new set
>> of "slots" every n packets. Where n is sufficiently large to keep
>> the workload running.
> 
> Here, too, virtio rings may help.
> 
> The extra level of indirection allows out of order completions,
> reducing the chance of running out of rx descriptors when redirecting
> a subset of packets to a tx ring, as that does not block the entire ring.
> 
> And passing explicit descriptors from userspace enables pointing to
> new memory regions. On the flipside, they now have to be checked for
> safety against region bounds.
> 
>> This is similar in many ways to virtio/vhost interaction.
> 
> Ah, I only saw this after writing the above :)
> 

yep but glad to get some validation on this idea.

Re: [RFC PATCH 0/2] rx zero copy interface for af_packet

2017-01-31 Thread John Fastabend

On 17-01-31 11:39 AM, tndave wrote:
> 
> 
> On 01/27/2017 01:33 PM, John Fastabend wrote:
>> This is an experimental implementation of rx zero copy for af_packet.
>> It's a bit rough and likely has errors but the plan is to clean it up
>> over the next few months.
>>
>> And seeing I said I would post it in another thread a few days back
>> here it is.
> 
> This sounds good (believe me, I have been thinking along these lines :)
> From driver Rx side, we always premap RX buffers so best to map them to
> shmem for PF_PACKET sockets.
> Also, I like the idea that user can put selected queue (may be queues in
> future?) to PF_PACKET mode keeping rest of the queues as it is.
> Zero copy and removing skb setup & processing overhead on RX certainly
> makes things faster and helps latency. Zero copy is good on Tx; however,
> without an skb we should figure out how to use the segmentation and checksum 
> offloading
> features of HW. Can this be considered in the tpacket V4 hdr?
> 

Yes, I'll try to create another RFC in a week or two. Thanks.

> -Tushar

Re: [RFC PATCH 1/2] af_packet: direct dma for packet ineterface

2017-01-31 Thread John Fastabend

[...]

>>>   
>>>> The ndo operations and new socket option PACKET_RX_DIRECT work by
>>>> giving a queue_index to run the direct dma operations over. Once
>>>> setsockopt returns successfully the indicated queue is mapped
>>>> directly to the requesting application and can not be used for
>>>> other purposes. Also any kernel layers such as tc will be bypassed
>>>> and need to be implemented in the hardware via some other mechanism
>>>> such as tc offload or other offload interfaces.  
>>>
>>> Will this also need to bypass XDP too?  
>>
>> There is nothing stopping this from working with XDP but why would
>> you want to run XDP here?
>>
>> Dropping a packet for example is not really useful because it's
>> already in memory that user space can read. Modifying the packet seems
>> pointless since user space can modify it. 
>>
>> Maybe pushing a copy of the packet
>> to kernel stack is useful in some case? But I can't see where I would
>> want this.
> 
> Wouldn't it be useful to pass ARP packets to kernel stack?
> (E.g. if your HW filter is based on MAC addr match)
> 

Problem is we already zero-copied the packet into user space. I really
don't want to have a packet in user space and kernel space at the same
time. That seems like a big mess to me around security and isolation.

Much better to have the hardware push arp packet onto the correct hardware
queue so that arp packets get sent into the stack. With the ARP example
it's easy enough to put a high priority rule in hardware to match the
protocol.

> 
>>> E.g. how will you support XDP_TX?  AFAIK you cannot remove/detach a
>>> packet with this solution (and place it on a TX queue and wait for DMA
>>> TX completion).
>>>   
>>
>> This is something worth exploring. tpacket_v2 uses a fixed ring with
>> slots so all the pages are allocated and assigned to the ring at init
>> time. To xmit a packet in this case the user space application would
>> be required to leave the packet descriptor on the rx side pinned
>> until the tx side DMA has completed. Then it can unpin the rx side
>> and return it to the driver. This works if the TX/RX processing is
>> fast enough to keep up. For many things this is good enough.
> 
> Sounds tricky.
>  
>> For some work loads though this may not be sufficient. In which
>> case a tpacket_v4 would be useful that can push down a new set
>> of "slots" every n packets. Where n is sufficiently large to keep
>> the workload running. This is similar in many ways to virtio/vhost
>> interaction.
> 
> This starts to sound like we need a page-pool-like facility with
> pages premapped DMA and premapped to userspace...
> 

I'm not sure what premapped to userspace means in this case. Here the
application uses mmap or some other mechanism to get a set of pages and
then pushes them down to the device. I think a mechanism such as that
used in virtio would solve this problem nicely. I'll take a look at it
and send another RFC out.

>>>   
> [...]
>>>
>>> Guess, I don't understand the details of the af_packet versions well
>>> enough, but can you explain to me, how userspace knows what slots it
>>> can read/fetch, and how it marks when it is complete/finished so the
>>> kernel knows it can reuse this slot?
>>>   
>>
>> At init time user space allocates a ring of buffers. Each buffer has
>> space to hold the packet descriptor + packet payload. The API gives this
>> to the driver to initialize DMA engine and assign addresses. At init
>> time all buffers are "owned" by the driver which is indicated by a status bit
>> in the descriptor header.
>>
>> Userspace can spin on the status bit to know when the driver has handed
>> it to userspace. The driver will check the status bit before returning
>> the buffer to the hardware. Then a series of kicks are used to wake up
>> userspace (if it's not spinning) and to wake up the driver if it is overrun
>> and needs to return buffers into its pool (not implemented yet). The
>> kick to wake up the driver could in a future v4 be used to push new
>> buffers to the driver if needed.
> 
> As I wrote above, this status bit spinning approach is good and actually
> achieving a bulking effect indirectly.
> 

Yep.

Thanks,
John

Re: [RFC PATCH 2/2] ixgbe: add af_packet direct copy support

2017-01-31 Thread John Fastabend

On 17-01-30 06:53 PM, Alexei Starovoitov wrote:
> On 1/27/17 1:34 PM, John Fastabend wrote:
>> +h2 = page_address(rx_buffer->page) + rx_buffer->page_offset - hdrlen;
>> +eth = page_address(rx_buffer->page) + rx_buffer->page_offset,
> 
> I don't think it compiles ;)

Well that is what I get for doing some last minute checkpatch fixes
and not doing a build test before sending it out. Oh well just an RFC
to get some general feedback.

> 
>> +/* This indicates a bug in ixgbe leaving for testing purposes */
>> +WARN_ON(TP_STATUS_USER & h2->tp_status);
>> +len = le16_to_cpu(rx_desc->wb.upper.length);
>> +h2->tp_len = len;
>> +h2->tp_snaplen = len;
>> +h2->tp_mac = ALIGN(TPACKET_ALIGN(TPACKET2_HDRLEN), L1_CACHE_BYTES);
>> +h2->tp_net = h2->tp_mac + ETH_HLEN;
>> +h2->tp_sec = div_s64_rem(ns, NSEC_PER_SEC, &rem);
>> +h2->tp_nsec = rem;
>> +
>> +sll = (void *)h2 + TPACKET_ALIGN(sizeof(struct tpacket2_hdr));
>> +sll->sll_halen = ETH_HLEN;
>> +memcpy(sll->sll_addr, eth->h_source, ETH_ALEN);
>> +sll->sll_family = AF_PACKET;
>> +sll->sll_hatype = rx_ring->netdev->type;
>> +sll->sll_protocol = eth->h_proto;
>> +sll->sll_pkttype = PACKET_HOST;
>> +sll->sll_ifindex = rx_ring->netdev->ifindex;
> 
> performance wise it looks very expensive to do all these header copies
> and integer divide for every packet.
> I think unless we move to new dumb and simple header format
> performance of this approach is not going to be satisfactory.
> 

Sure I'm not opposed to moving to a v4 in fact I think it would help
in a lot of ways. I'll try to fire off some benchmarks and then move
to a v4 to see how that works out.

.John

Re: [RFC PATCH 0/2] rx zero copy interface for af_packet

2017-01-31 Thread John Fastabend

On 17-01-31 08:30 AM, Sowmini Varadhan wrote:
> On (01/27/17 13:33), John Fastabend wrote:
>>
>> This is an experimental implementation of rx zero copy for af_packet.
>> It's a bit rough and likely has errors but the plan is to clean it up
>> over the next few months.
>>
>> And seeing I said I would post it in another thread a few days back
>> here it is.
> 
> One question/comment about this: sure, this saves us an skb copy
> on the rx side, but at least for the Tx side, I think there may
> be a trade-off between the overhead from the skb setup and the
> ease of offloading checksum (and UFO where it is available) to
> consider, even for PF_PACKET.
> 

Yes although as Willem suggested and I pushed a quick comment
at the end of the patch, virtio descriptors might be a better
option for a v4 descriptor type because they have mechanisms
to handle checksum and others in place already.

> Using PF_PACKET works well for stateless datagram protocols like 
> UDP, and for UDP sockets, we find that just switching to Jumbo
> (to simulate a poor-man's-UFO) gives us significant improvement
> in both throughput and latency for our RDBMS workloads - and 
> having the sk_buff facilitates using existing driver-kernel interfaces
> for offload easily, so while we may gain some perf improvement by shaving
> off the sk_buff overhead, the trade-off needs to be considered.

Of course but many workloads/environments can not use jumbo
frames nor would it be helpful if your average pkt size is
128B or something around there.


> 
> --Sowmini
> 
> 
>

Re: [PATCH 13/17] net: stmmac: Implement NAPI for TX

2017-01-31 Thread David Miller

From: Corentin Labbe 
Date: Tue, 31 Jan 2017 10:11:48 +0100

> The stmmac driver runs TX completion under NAPI but without checking
> the work done by the TX completion function.

The current behavior is correct and completely intentional.

A driver should _never_ account TX work to the NAPI poll budget.

This is because TX liberation is orders of magnitude cheaper than
receiving a packet, and such SKB freeing makes more SKBs available
for RX processing.

Therefore, TX work should never count against the NAPI budget.

Please do not fix something which is not broken.

Re: [PATCH net-next 2/5] vxlan: support fdb and learning in COLLECT_METADATA mode

2017-01-31 Thread Roopa Prabhu

On 1/31/17, 3:37 PM, Jonathan Toppins wrote:
> On 01/31/2017 12:57 AM, Roopa Prabhu wrote:
[snip]
>>
>> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
>> index 19b1653..b80c405 100644
>> --- a/drivers/net/vxlan.c
>> +++ b/drivers/net/vxlan.c
>> @@ -57,6 +57,8 @@
>>  
>>  static const u8 all_zeros_mac[ETH_ALEN + 2];
>>  
>> +static u32 fdb_salt __read_mostly;
>> +
>>  static int vxlan_sock_add(struct vxlan_dev *vxlan);
>>  
>>  /* per-network namespace private data for this module */
>> @@ -75,6 +77,7 @@ struct vxlan_fdb {
>>  struct list_head  remotes;
>>  u8eth_addr[ETH_ALEN];
>>  u16   state;/* see ndm_state */
>> +__be32vni;
>>  u8flags;/* see ndm_flags */
>>  };
>>  
>> @@ -302,6 +305,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct 
>> vxlan_dev *vxlan,
>>  if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
>>  nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
>>  goto nla_put_failure;
>> +if ((vxlan->flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
>> +nla_put_u32(skb, NDA_SRC_VNI,
>> +be32_to_cpu(fdb->vni)))
>> +goto nla_put_failure;
>>  if (rdst->remote_ifindex &&
>>  nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
>>  goto nla_put_failure;
>> @@ -400,34 +407,51 @@ static u32 eth_hash(const unsigned char *addr)
>>  return hash_64(value, FDB_HASH_BITS);
>>  }
>>  
>> +static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
>> +{
>> +/* use 1 byte of OUI and 3 bytes of NIC */
>> +u32 key = get_unaligned((u32 *)(addr + 2));
>> +
>> +return jhash_2words(key, vni, fdb_salt) & (FDB_HASH_SIZE - 1);
> Not seeing where fdb_salt gets set to anything, why not just use a
> constant zero here?
>
>
oversight. intended to use vxlan_salt which is already initialized but not used 
in the vxlan driver.
 fixed in next queued up version (just following bridge fdb code here which 
uses salt).

thanks.

Re: [PATCHv2 net-next 1/2] net: dsa: mv88e6xxx: Workaround missing PHY ID on mv88e6390

2017-01-31 Thread Florian Fainelli

On 01/31/2017 06:40 PM, Andrew Lunn wrote:
> The internal PHYs of the mv88e6390 do not have a model ID. Trap any
> calls to the ID register, and if it is zero, return the ID for the
> mv88e6390. The Marvell PHY driver can then bind to this ID.
> 
> Signed-off-by: Andrew Lunn 

Reviewed-by: Florian Fainelli 
-- 
Florian

Re: [PATCHv2 net-next 1/2] net: dsa: mv88e6xxx: Workaround missing PHY ID on mv88e6390

2017-01-31 Thread Vivien Didelot

Hi Andrew,

Andrew Lunn  writes:

> The internal PHYs of the mv88e6390 do not have a model ID. Trap any
> calls to the ID register, and if it is zero, return the ID for the
> mv88e6390. The Marvell PHY driver can then bind to this ID.
>
> Signed-off-by: Andrew Lunn 

Reviewed-by: Vivien Didelot 

Thanks!

Vivien

[PATCH net] net: phy: Fix lack of reference count on PHY driver

2017-01-31 Thread Florian Fainelli

From: Mao Wenan 

There is currently no reference count being held on the PHY driver,
which makes it possible to remove the PHY driver module while the PHY
state machine is running and polling the PHY. This could cause crashes
similar to this one to show up:

[   43.361162] BUG: unable to handle kernel NULL pointer dereference at 
0140
[   43.361162] IP: phy_state_machine+0x32/0x490
[   43.361162] PGD 59dc067
[   43.361162] PUD 0
[   43.361162]
[   43.361162] Oops:  [#1] SMP
[   43.361162] Modules linked in: dsa_loop [last unloaded: broadcom]
[   43.361162] CPU: 0 PID: 1299 Comm: kworker/0:3 Not tainted 4.10.0-rc5+ #415
[   43.361162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS Ubuntu-1.8.2-1ubuntu2 04/01/2014
[   43.361162] Workqueue: events_power_efficient phy_state_machine
[   43.361162] task: 880006782b80 task.stack: c9184000
[   43.361162] RIP: 0010:phy_state_machine+0x32/0x490
[   43.361162] RSP: 0018:c9187e18 EFLAGS: 0246
[   43.361162] RAX:  RBX: 8800059e53c0 RCX:
880006a15c60
[   43.361162] RDX: 880006782b80 RSI:  RDI:
8800059e5428
[   43.361162] RBP: c9187e48 R08: 880006a15c40 R09:

[   43.361162] R10:  R11:  R12:
8800059e5428
[   43.361162] R13: 8800059e5000 R14:  R15:
880006a15c40
[   43.361162] FS:  () GS:880006a0()
knlGS:
[   43.361162] CS:  0010 DS:  ES:  CR0: 80050033
[   43.361162] CR2: 0140 CR3: 05979000 CR4:
06f0
[   43.361162] Call Trace:
[   43.361162]  process_one_work+0x1b4/0x3e0
[   43.361162]  worker_thread+0x43/0x4d0
[   43.361162]  ? __schedule+0x17f/0x4e0
[   43.361162]  kthread+0xf7/0x130
[   43.361162]  ? process_one_work+0x3e0/0x3e0
[   43.361162]  ? kthread_create_on_node+0x40/0x40
[   43.361162]  ret_from_fork+0x29/0x40
[   43.361162] Code: 56 41 55 41 54 4c 8d 67 68 53 4c 8d af 40 fc ff ff
48 89 fb 4c 89 e7 48 83 ec 08 e8 c9 9d 27 00 48 8b 83 60 ff ff ff 44 8b
73 98 <48> 8b 90 40 01 00 00 44 89 f0 48 85 d2 74 08 4c 89 ef ff d2 8b

Keep references on the PHY driver module right before we are going to
utilize it in phy_attach_direct(), and conversely when we don't use it
anymore in phy_detach().

Signed-off-by: Mao Wenan 
[florian: rebase, rework commit message]
Signed-off-by: Florian Fainelli 
---
 drivers/net/phy/phy_device.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 92b08383cafa..0d8f4d3847f6 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -920,6 +920,11 @@ int phy_attach_direct(struct net_device *dev, struct 
phy_device *phydev,
return -EIO;
}
 
+   if (!try_module_get(d->driver->owner)) {
+   dev_err(&dev->dev, "failed to get the device driver module\n");
+   return -EIO;
+   }
+
get_device(d);
 
/* Assume that if there is no driver, that it doesn't
@@ -977,6 +982,7 @@ int phy_attach_direct(struct net_device *dev, struct 
phy_device *phydev,
 error:
phy_detach(phydev);
put_device(d);
+   module_put(d->driver->owner);
if (ndev_owner != bus->owner)
module_put(bus->owner);
return err;
@@ -1059,6 +1065,7 @@ void phy_detach(struct phy_device *phydev)
bus = phydev->mdio.bus;
 
put_device(&phydev->mdio.dev);
+   module_put(phydev->mdio.dev.driver->owner);
if (ndev_owner != bus->owner)
module_put(bus->owner);
 }
-- 
2.9.3

Re: [PATCH 4.10-rc3 00/13] net: dsa: remove unnecessary phy.h include

2017-01-31 Thread Florian Fainelli

On 01/31/2017 11:17 AM, Russell King - ARM Linux wrote:
> Including phy.h and phy_fixed.h into net/dsa.h causes phy*.h to be an
> unnecessary dependency for quite a large amount of the kernel.  There's
> very little which actually requires definitions from phy.h in net/dsa.h
> - the include itself only wants the declaration of a couple of
> structures and IFNAMSIZ.
> 
> Add linux/if.h for IFNAMSIZ, declarations for the structures, phy.h to
> mv88e6xxx.h as it needs it for phy_interface_t, and remove both phy.h
> and phy_fixed.h from net/dsa.h.
> 
> This patch reduces from around 800 files rebuilt to around 40 - even
> with ccache, the time difference is noticeable.
> 
> In order to make this change, several drivers need to be updated to
> include necessary headers that they were picking up through this
> include.  This has resulted in a much larger patch series.
> 
> I'm assuming the 0-day builder has had 24 hours with this series, and
> hasn't reported any further issues with it - the last issue was two
> weeks ago (before I became ill) which I fixed over the last weekend.
> 
> I'm hoping this doesn't conflict with what's already in net-next...

For the entire series:

Acked-by: Florian Fainelli 

Thanks a lot for doing that.

> 
>  arch/mips/cavium-octeon/octeon-platform.c | 4 
>  drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 1 +
>  drivers/net/ethernet/broadcom/bgmac.c | 2 ++
>  drivers/net/ethernet/cadence/macb.h   | 2 ++
>  drivers/net/ethernet/cavium/liquidio/lio_main.c   | 1 +
>  drivers/net/ethernet/cavium/liquidio/lio_vf_main.c| 1 +
>  drivers/net/ethernet/cavium/liquidio/octeon_console.c | 1 +
>  drivers/net/ethernet/freescale/fman/fman_memac.c  | 1 +
>  drivers/net/ethernet/marvell/mvneta.c | 1 +
>  drivers/net/ethernet/qualcomm/emac/emac-sgmii.c   | 1 +
>  drivers/net/usb/lan78xx.c | 1 +
>  drivers/net/wireless/ath/ath5k/ahb.c  | 2 +-
>  drivers/target/iscsi/iscsi_target_login.c | 1 +
>  include/net/dsa.h | 6 --
>  net/core/netprio_cgroup.c | 1 +
>  net/sunrpc/xprtrdma/svc_rdma_backchannel.c| 1 +
>  16 files changed, 20 insertions(+), 7 deletions(-)
> 


-- 
Florian

[PATCHv2 net-next 0/2] Work around missing PHY product ID in mv88e6390

2017-01-31 Thread Andrew Lunn

The internal PHYs of the MV88E6390 have a Marvell OUI, but the product
ID is zero. Work around this by trapping reads to the ID, and if it is
zero, return the MV88E6390 family ID.

v2: Move the workaround into the central mdio read function.
Enable the temperature sensor, even though it does not work on the 6390,
but it does on the 6341, which has the same ID problem.

Andrew Lunn (2):
  net: dsa: mv88e6xxx: Workaround missing PHY ID on mv88e6390
  net: phy: Marvell: Add mv88e6390 internal PHY

 drivers/net/dsa/mv88e6xxx/chip.c |  8 
 drivers/net/phy/marvell.c| 20 
 include/linux/marvell_phy.h  |  6 ++
 3 files changed, 34 insertions(+)

-- 
2.11.0

[PATCHv2 net-next 2/2] net: phy: Marvell: Add mv88e6390 internal PHY

2017-01-31 Thread Andrew Lunn

The mv88e6390 Ethernet switch has internal PHYs. These PHYs don't have
a model ID in the ID2 register. So the MDIO driver in the switch
intercepts reads to this register, and returns the switch family ID.
Extend the Marvell PHY driver by including this ID, and treat the PHY
as a 88E1540.

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 drivers/net/phy/marvell.c   | 20 
 include/linux/marvell_phy.h |  6 ++
 2 files changed, 26 insertions(+)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index a3e3733813a7..1a0ac48cbc50 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2142,6 +2142,25 @@ static struct phy_driver marvell_drivers[] = {
.get_strings = marvell_get_strings,
.get_stats = marvell_get_stats,
},
+   {
+   .phy_id = MARVELL_PHY_ID_88E6390,
+   .phy_id_mask = MARVELL_PHY_ID_MASK,
+   .name = "Marvell 88E6390",
+   .features = PHY_GBIT_FEATURES,
+   .flags = PHY_HAS_INTERRUPT,
+   .probe = m88e1510_probe,
+   .config_init = _config_init,
+   .config_aneg = _config_aneg,
+   .read_status = _read_status,
+   .ack_interrupt = _ack_interrupt,
+   .config_intr = _config_intr,
+   .did_interrupt = _did_interrupt,
+   .resume = _resume,
+   .suspend = _suspend,
+   .get_sset_count = marvell_get_sset_count,
+   .get_strings = marvell_get_strings,
+   .get_stats = marvell_get_stats,
+   },
 };
 
 module_phy_driver(marvell_drivers);
@@ -2160,6 +2179,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] 
= {
{ MARVELL_PHY_ID_88E1510, MARVELL_PHY_ID_MASK },
{ MARVELL_PHY_ID_88E1540, MARVELL_PHY_ID_MASK },
{ MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK },
+   { MARVELL_PHY_ID_88E6390, MARVELL_PHY_ID_MASK },
{ }
 };
 
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index a57f0dfb6db7..3d616d7f65bf 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -19,6 +19,12 @@
 #define MARVELL_PHY_ID_88E1540 0x01410eb0
 #define MARVELL_PHY_ID_88E3016 0x01410e60
 
+/* The MV88e6390 Ethernet switch contains embedded PHYs. These PHYs do
+ * not have a model ID. So the switch driver traps reads to the ID2
+ * register and returns the switch family ID
+ */
+#define MARVELL_PHY_ID_88E6390 0x01410f90
+
 /* struct phy_device dev_flags definitions */
 #define MARVELL_PHY_M1145_FLAGS_RESISTANCE 0x0001
 #define MARVELL_PHY_M1118_DNS323_LEDS  0x0002
-- 
2.11.0

[PATCHv2 net-next 1/2] net: dsa: mv88e6xxx: Workaround missing PHY ID on mv88e6390

2017-01-31 Thread Andrew Lunn

The internal PHYs of the mv88e6390 do not have a model ID. Trap any
calls to the ID register, and if it is zero, return the ID for the
mv88e6390. The Marvell PHY driver can then bind to this ID.

Signed-off-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index bf385377a461..29190303ace0 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2930,6 +2930,14 @@ static int mv88e6xxx_mdio_read(struct mii_bus *bus, int 
phy, int reg)
err = chip->info->ops->phy_read(chip, bus, phy, reg, &val);
mutex_unlock(&chip->reg_lock);
 
+   if (reg == MII_PHYSID2) {
+   /* Some internal PHYS don't have a model number.  Use
+* the mv88e6390 family model number instead.
+*/
+   if (!(val & 0x3f0))
+   val |= PORT_SWITCH_ID_PROD_NUM_6390;
+   }
+
return err ? err : val;
 }
 
-- 
2.11.0

linux-next: manual merge of the net-next tree with the char-misc.current tree

2017-01-31 Thread Stephen Rothwell

Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  drivers/net/hyperv/netvsc.c

between commit:

  433e19cf33d3 ("Drivers: hv: vmbus: finally fix hv_need_to_signal_on_read()")

from the char-misc.current tree and commits:

  0b307ebd6834 ("netvsc: remove no longer needed receive staging buffers")
  46b4f7f5d1f7 ("netvsc: eliminate per-device outstanding send counter")

from the net-next tree.

I fixed it up (I think, but I have no idea what this code actually
does :-) - see below) and can carry the fix as necessary. This is now
fixed as far as linux-next is concerned, but any non trivial conflicts
should be mentioned to your upstream maintainer when your tree is
submitted for merging.  You may also want to consider cooperating with
the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/net/hyperv/netvsc.c
index 86e5749226ef,5cfdb1a1b4c1..
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@@ -1289,74 -1245,25 +1245,29 @@@ void netvsc_channel_cb(void *context
else
device = channel->device_obj;
  
-   net_device = get_inbound_net_device(device);
-   if (!net_device)
-   return;
ndev = hv_get_drvdata(device);
-   buffer = get_per_channel_state(channel);
+   if (unlikely(!ndev))
+   return;
+ 
+   net_device = net_device_to_netvsc_device(ndev);
+   if (unlikely(net_device->destroy) &&
+   netvsc_channel_idle(net_device, q_idx))
+   return;
  
 +  /* commit_rd_index() -> hv_signal_on_read() needs this. */
 +  init_cached_read_index(channel);
 +
-   do {
-   desc = get_next_pkt_raw(channel);
-   if (desc != NULL) {
-   netvsc_process_raw_pkt(device,
-  channel,
-  net_device,
-  ndev,
-  desc->trans_id,
-  desc);
- 
-   put_pkt_raw(channel, desc);
-   need_to_commit = true;
-   continue;
-   }
-   if (need_to_commit) {
-   need_to_commit = false;
-   commit_rd_index(channel);
-   }
- 
-   ret = vmbus_recvpacket_raw(channel, buffer, bufferlen,
-  _recvd, _id);
-   if (ret == 0) {
-   if (bytes_recvd > 0) {
-   desc = (struct vmpacket_descriptor *)buffer;
-   netvsc_process_raw_pkt(device,
-  channel,
-  net_device,
-  ndev,
-  request_id,
-  desc);
-   } else {
-   /*
-* We are done for this pass.
-*/
-   break;
-   }
- 
-   } else if (ret == -ENOBUFS) {
-   if (bufferlen > NETVSC_PACKET_SIZE)
-   kfree(buffer);
-   /* Handle large packet */
-   buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
-   if (buffer == NULL) {
-   /* Try again next time around */
-   netdev_err(ndev,
-  "unable to allocate buffer of size "
-  "(%d)!!\n", bytes_recvd);
-   break;
-   }
- 
-   bufferlen = bytes_recvd;
-   }
+   while ((desc = get_next_pkt_raw(channel)) != NULL) {
+   netvsc_process_raw_pkt(device, channel, net_device,
+  ndev, desc->trans_id, desc);
  
+   put_pkt_raw(channel, desc);
+   need_to_commit = true;
 +  init_cached_read_index(channel);
+   }
  
-   } while (1);
- 
-   if (bufferlen > NETVSC_PACKET_SIZE)
-   kfree(buffer);
+   if (need_to_commit)
+   commit_rd_index(channel);
  
netvsc_chk_recv_comp(net_device, channel, q_idx);
  }

[PATCH net-next] net: ipv6: add NLM_F_APPEND in notifications when applicable

2017-01-31 Thread David Ahern

IPv6 does not set the NLM_F_APPEND flag in notifications to signal that
a NEWROUTE is an append versus a new route or a replaced one. Add the
flag if the request has it.

Signed-off-by: David Ahern 
---
sending this one outside of the multipath patch set since it has
nothing to do with multipath.

 net/ipv6/ip6_fib.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef5485204522..bcaf247232d7 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -746,6 +746,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
u16 nlflags = NLM_F_EXCL;
int err;
 
+   if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+   nlflags |= NLM_F_APPEND;
+
ins = &fn->leaf;
 
for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
-- 
2.1.4

Re: [PATCH] net: fix ndo_features_check/ndo_fix_features comment ordering

2017-01-31 Thread Eric Dumazet

On Tue, 2017-01-31 at 16:03 -0800, Dimitris Michailidis wrote:
> Commit cdba756f5803a2 ("net: move ndo_features_check() close to
> ndo_start_xmit()") inadvertently moved the doc comment for
> .ndo_fix_features instead of .ndo_features_check. Fix the comment
> ordering.
> 
> Fixes: cdba756f5803a2 ("net: move ndo_features_check() close to 
> ndo_start_xmit()")
> Signed-off-by: Dimitris Michailidis 
> ---


LGTM

Acked-by: Eric Dumazet 

I am currently doing a reorg of net_device fields to improve locality,
I am not sure I will touch comments. We might do this later when things
are stabilized.

[PATCH] net: fix ndo_features_check/ndo_fix_features comment ordering

2017-01-31 Thread Dimitris Michailidis

Commit cdba756f5803a2 ("net: move ndo_features_check() close to
ndo_start_xmit()") inadvertently moved the doc comment for
.ndo_fix_features instead of .ndo_features_check. Fix the comment
ordering.

Fixes: cdba756f5803a2 ("net: move ndo_features_check() close to 
ndo_start_xmit()")
Signed-off-by: Dimitris Michailidis 
---
 include/linux/netdevice.h | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9bde9558b596..70ad0291d517 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -866,11 +866,15 @@ struct netdev_xdp {
  * of useless work if you return NETDEV_TX_BUSY.
  * Required; cannot be NULL.
  *
- * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
- * netdev_features_t features);
- * Adjusts the requested feature flags according to device-specific
- * constraints, and returns the resulting flags. Must not modify
- * the device state.
+ * netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
+ *struct net_device *dev
+ *netdev_features_t features);
+ * Called by core transmit path to determine if device is capable of
+ * performing offload operations on a given packet. This is to give
+ * the device an opportunity to implement any restrictions that cannot
+ * be otherwise expressed by feature flags. The check is called with
+ * the set of features that the stack has calculated and it returns
+ * those the driver believes to be appropriate.
  *
  * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
  * void *accel_priv, select_queue_fallback_t fallback);
@@ -1028,6 +1032,12 @@ struct netdev_xdp {
  * Called to release previously enslaved netdev.
  *
  *  Feature/offload setting functions.
+ * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
+ * netdev_features_t features);
+ * Adjusts the requested feature flags according to device-specific
+ * constraints, and returns the resulting flags. Must not modify
+ * the device state.
+ *
  * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
  * Called to update device configuration to new features. Passed
  * feature set might be less than what was returned by ndo_fix_features()).
@@ -1100,15 +1110,6 @@ struct netdev_xdp {
  * Callback to use for xmit over the accelerated station. This
  * is used in place of ndo_start_xmit on accelerated net
  * devices.
- * netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
- *struct net_device *dev
- *netdev_features_t features);
- * Called by core transmit path to determine if device is capable of
- * performing offload operations on a given packet. This is to give
- * the device an opportunity to implement any restrictions that cannot
- * be otherwise expressed by feature flags. The check is called with
- * the set of features that the stack has calculated and it returns
- * those the driver believes to be appropriate.
  * int (*ndo_set_tx_maxrate)(struct net_device *dev,
  *  int queue_index, u32 maxrate);
  * Called when a user wants to set a max-rate limitation of specific
-- 
2.11.0.483.g087da7b7c-goog

Re: [PATCH net-next 1/2] net: Introduce ife encapsulation module

2017-01-31 Thread kbuild test robot

Hi Yotam,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Yotam-Gigi/Extract-IFE-logic-to-module/20170131-222757
config: alpha-allyesconfig (attached as .config)
compiler: alpha-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=alpha 

Note: the linux-review/Yotam-Gigi/Extract-IFE-logic-to-module/20170131-222757 
HEAD 236a17de4759e7d4d2927d4ac50329ec788ec655 builds fine.
  It only hurts bisectability.

All errors (new ones prefixed by >>):

   net/ife/built-in.o: In function `ife_tlv_meta_encode':
>> (.text+0x420): multiple definition of `ife_tlv_meta_encode'
   net/sched/built-in.o:(.text+0x18800): first defined here

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip

RE: Outlook Web Access

2017-01-31 Thread Anita Patrick



From: Anita Patrick
Sent: Tuesday, January 31, 2017 12:50 PM
Subject: Outlook Web Access

Dear E-mail User

Please Log on to OutlookWeb Access 
to update your outlook account to the new secured version 2017.

Thank You

Re: [PATCH net-next 2/5] vxlan: support fdb and learning in COLLECT_METADATA mode

2017-01-31 Thread Jonathan Toppins

On 01/31/2017 12:57 AM, Roopa Prabhu wrote:
> From: Roopa Prabhu 
> 
> Vxlan COLLECT_METADATA mode today solves the per-vni netdev
> scalability problem in l3 networks. It expects all forwarding
> information to be present in dst_metadata. This patch series
> enhances collect metadata mode to include the case where only
> vni is present in dst_metadata, and the vxlan driver can then use
> the rest of the forwarding information database to make forwarding
> decisions. There is no change to default COLLECT_METADATA
> behaviour. These changes only apply to COLLECT_METADATA when
> used with the bridging use-case with a special dst_metadata
> tunnel info flag (eg: where vxlan device is part of a bridge).
> For all this to work, the vxlan driver will need to now support a
> single fdb table hashed by mac + vni. This series essentially makes
> this happen.
> 
> use-case and workflow:
> vxlan collect metadata device participates in bridging vlan
> to vn-segments. Bridge driver above the vxlan device,
> sends the vni corresponding to the vlan in the dst_metadata.
> vxlan driver will lookup forwarding database with (mac + vni)
> for the required remote destination information to forward the
> packet.
> 
> Changes introduced by this patch:
> - allow learning and forwarding database state in vxlan netdev in
>   COLLECT_METADATA mode. Current behaviour is not changed
>   by default. tunnel info flag IP_TUNNEL_INFO_BRIDGE is used
>   to support the new bridge friendly mode.
> - A single fdb table hashed by (mac, vni) to allow fdb entries with
>   multiple vnis in the same fdb table
> - rx path already has the vni
> - tx path expects a vni in the packet with dst_metadata
> - prior to this series, fdb remote_dsts carried remote vni and
>   the vxlan device carrying the fdb table represented the
>   source vni. With the vxlan device now representing multiple vnis,
>   this patch adds a src vni attribute to the fdb entry. The remote
>   vni already uses NDA_VNI attribute. This patch introduces
>   NDA_SRC_VNI netlink attribute to represent the src vni in a multi
>   vni fdb table.
> 
> iproute2 example (patched and pruned iproute2 output to just show
> relevant fdb entries):
> example shows same host mac learnt on two vni's.
> 
> before (netdev per vni):
> $bridge fdb show | grep "00:02:00:00:00:03"
> 00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self
> 00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self
> 
> after this patch with collect metadata in bridged mode (single netdev):
> $bridge fdb show | grep "00:02:00:00:00:03"
> 00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self
> 00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self
> 
> Signed-off-by: Roopa Prabhu 
> ---
>  drivers/net/vxlan.c|  211 
> +---
>  include/uapi/linux/neighbour.h |1 +
>  2 files changed, 136 insertions(+), 76 deletions(-)
> 
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 19b1653..b80c405 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -57,6 +57,8 @@
>  
>  static const u8 all_zeros_mac[ETH_ALEN + 2];
>  
> +static u32 fdb_salt __read_mostly;
> +
>  static int vxlan_sock_add(struct vxlan_dev *vxlan);
>  
>  /* per-network namespace private data for this module */
> @@ -75,6 +77,7 @@ struct vxlan_fdb {
>   struct list_head  remotes;
>   u8eth_addr[ETH_ALEN];
>   u16   state;/* see ndm_state */
> + __be32vni;
>   u8flags;/* see ndm_flags */
>  };
>  
> @@ -302,6 +305,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct 
> vxlan_dev *vxlan,
>   if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
>   nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
>   goto nla_put_failure;
> + if ((vxlan->flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
> + nla_put_u32(skb, NDA_SRC_VNI,
> + be32_to_cpu(fdb->vni)))
> + goto nla_put_failure;
>   if (rdst->remote_ifindex &&
>   nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
>   goto nla_put_failure;
> @@ -400,34 +407,51 @@ static u32 eth_hash(const unsigned char *addr)
>   return hash_64(value, FDB_HASH_BITS);
>  }
>  
> +static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
> +{
> + /* use 1 byte of OUI and 3 bytes of NIC */
> + u32 key = get_unaligned((u32 *)(addr + 2));
> +
> + return jhash_2words(key, vni, fdb_salt) & (FDB_HASH_SIZE - 1);

Not seeing where fdb_salt gets set to anything, why not just use a
constant zero here?

> +}
> +
>  /* Hash chain to use given mac address */
>  static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
> - const u8 *mac)
> + const u8

Re: [PATCH V3 net-next 02/14] net/ena: fix error handling when probe fails

2017-01-31 Thread Netanel Belgazal


Hi,

You are right. I'll remove this patch.

Regards,

Netanel

On 01/28/2017 01:33 AM, Lino Sanfilippo wrote:

Hi,

On 26.01.2017 23:18, Netanel Belgazal wrote:

When driver fails in probe, it will release all resources,
including adapter.
In case of probe failure, ena_remove should not try to
free the adapter resources.

Signed-off-by: Netanel Belgazal 
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c 
b/drivers/net/ethernet/amazon/ena/ena_netdev.c

index 7493ea3..cb60567 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -3046,6 +3046,7 @@ static int ena_probe(struct pci_dev *pdev, 
const struct pci_device_id *ent)

 err_free_region:
 ena_release_bars(ena_dev, pdev);
 err_free_ena_dev:
+pci_set_drvdata(pdev, NULL);
 vfree(ena_dev);
 err_disable_device:
 pci_disable_device(pdev);



Is this change really a "fix"? remove() should only be called if
probe() has been successful before, otherwise not. Did you experience
something different?

Regards,
Lino

Re: [PATCH] net: phy: broadcom: rehook BCM54612E specific init

2017-01-31 Thread Florian Fainelli

On 01/31/2017 01:54 PM, Rafał Miłecki wrote:
> From: Rafał Miłecki 
> 
> This extra BCM54612E code in PHY driver isn't really aneg specific. Even
> without it aneg works OK but the problem is no packets pass through PHY.
> 
> Moreover putting this code inside config_aneg callback didn't allow
> resuming PHY correctly. When driver called phy_stop and phy_start it was
> putting PHY machine into RESUMING state. After that machine was
> switching into AN and NOLINK without ever calling phy_start_aneg. This
> prevented this extra setup from being called and PHY didn't work.
> 
> This change has been verified to fix network on BCM47186B0 SoC device
> with BCM54612E.
> 
> Signed-off-by: Rafał Miłecki 

Reviewed-by: Florian Fainelli 
-- 
Florian

Re: [PATCH] r8152: Allocate interrupt buffer as part of struct r8152

2017-01-31 Thread Guenter Roeck

On Tue, Jan 31, 2017 at 02:53:47PM -0500, Alan Stern wrote:
> On Tue, 31 Jan 2017, Guenter Roeck wrote:
> 
> > When unloading the r8152 driver using the 'unbind' sysfs attribute
> > in a system with KASAN enabled, the following error message is seen
> > on a regular basis.
> 
> ...
> 
> > The two-byte allocation in conjunction with code analysis suggests that
> > the interrupt buffer has been overwritten. Added instrumentation in the
> > driver shows that the interrupt handler is called after RTL8152_UNPLUG
> > was set, and that this event is associated with the error message above.
> > This suggests that there are situations where the interrupt buffer is used
> > after it has been freed.
> > 
> > To avoid the problem, allocate the interrupt buffer as part of struct
> > r8152.
> > 
> > Cc: Hayes Wang 
> > Signed-off-by: Guenter Roeck 
> > ---
> > The problem is seen in chromeos-4.4, but there is no reason to believe
> > that it does not occur with the upstream kernel. It is still seen in
> > chromeos-4.4 after all patches from upstream and linux-next have been
> > applied to the driver.
> > 
> > While relatively simple, I am not really convinced that this is the best
> > (or even an acceptable) solution for this problem. I am open to suggestions
> > for a better fix.
> 
> The proper approach is to keep the allocation as it is, but _before_
> deallocating the buffer, make sure that the interrupt buffer won't be
> accessed any more.  This may involve calling usb_kill_urb(), or

usb_kill_urb() is called. I added some more logging. The sequence is
interrupt_handler()
...
usb_kill_urb(intr_urb);
...
kfree(intr_buff);
...
BUG kmalloc-128

which leaves me a bit puzzled; the interrupt handler is called well before
the memory is freed, and it turns out that it is not always called.

I'll do some digging in the usb core.

Guenter

Re: [PATCH] r8152: Allocate interrupt buffer as part of struct r8152

2017-01-31 Thread Guenter Roeck

On Tue, Jan 31, 2017 at 11:53:31AM -0800, Eric Dumazet wrote:
> On Tue, 2017-01-31 at 11:06 -0800, Guenter Roeck wrote:
> > When unloading the r8152 driver using the 'unbind' sysfs attribute
> > in a system with KASAN enabled, the following error message is seen
> > on a regular basis.
> 
> >  
> >  static int alloc_all_mem(struct r8152 *tp)
> > @@ -1423,10 +1420,6 @@ static int alloc_all_mem(struct r8152 *tp)
> > if (!tp->intr_urb)
> > goto err1;
> >  
> > -   tp->intr_buff = kmalloc(INTBUFSIZE, GFP_KERNEL);
> > -   if (!tp->intr_buff)
> > -   goto err1;
> > -
> > tp->intr_interval = (int)ep_intr->desc.bInterval;
> > usb_fill_int_urb(tp->intr_urb, tp->udev, usb_rcvintpipe(tp->udev, 3),
> >  tp->intr_buff, INTBUFSIZE, intr_callback,
> 
> This might lead to intr_buff being backed by vzalloc() instead of
> kzalloc() (check alloc_netdev_mqs())
> 
> It looks like it could cause a bug.
> 
I also strongly suspect that it just fixes the symptom, but not the root cause
of the problem.

Thanks,
Guenter

Re: [PATCH net] bpf: expose netns inode to bpf programs

2017-01-31 Thread David Ahern

On 1/25/17 8:27 PM, Alexei Starovoitov wrote:
> in cases where bpf programs are looking at sockets and packets
> that belong to different netns, it could be useful to read netns inode,
> so that programs can make intelligent decisions.
> For example to disallow raw sockets in all non-init netns the program can do:
> if (sk->type == SOCK_RAW && sk->netns_inum != 0xf075)
>   return 0;
> where 0xf075 inode comes from /proc/pid/ns/net
> 
> Similarly TC cls_bpf/act_bpf and socket filters can do
> if (skb->netns_inum == expected_inode)
> 
> The lack of netns awareness was a concern even for socket filters,
> since the application can attach the same bpf program to sockets
> in a different netns. Just like tc cls_bpf program can work in
> different netns as well, so it has to be addressed uniformly
> across all types of bpf programs.
> 
> Signed-off-by: Alexei Starovoitov 
> ---
> with corresponding change in 'ip vrf' that David Ahern is working on,
> this will address 'malfunction' concern that Andy discovered in 'ip vrf',
> hence this fix is needed for 'net'.

FWIW, the iproute2 patch (along with a few other namespace related fixups) can 
be found here:

https://github.com/dsahern/iproute2

vrf/ip-vrf branch.

Re: [PATCH net] mlx4: xdp_prog becomes inactive after ethtool '-L' or '-G'

2017-01-31 Thread Martin KaFai Lau

On Tue, Jan 31, 2017 at 01:11:40PM -0500, David Miller wrote:
> From: Tariq Toukan 
> Date: Mon, 30 Jan 2017 19:18:28 +0200
>
> > It looks good to me, in general.
> > I just have one small comment below.
>
> Martin, please address Tariq's feedback.
Sorry for the delay.  I am on PTO for this week and
have been travelling.

I will take a closer look when I get
a more stable internet connection tonight.

Thanks,
--Martin

Re: [Intel-wired-lan] [PATCH] net: intel: e1000e: use new api ethtool_{get|set}_link_ksettings

2017-01-31 Thread Philippe Reynes

Hi Sasha,

On 1/31/17, Neftin, Sasha  wrote:
>
> Philippe,
>
> We will look into this and try to test this patch. I would like to ask a
> question. I see that this thread has been started from implementation for
> e1000 code. Why did you decide to shift the implementation to e1000e?

I've just sent two patches for two drivers (e1000 and e1000e).


> Sasha
>
>
Philippe

[PATCHv3 net-next 6/7] net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP

2017-01-31 Thread Julian Anastasov

When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.

The datagram protocols can use MSG_CONFIRM to confirm the
neighbour. When used with MSG_PROBE we do not reach the
code where neighbour is confirmed, so we have to do the
same slow lookup by using the dst_confirm_neigh() helper.
When MSG_PROBE is not used, ip_append_data/ip6_append_data
will set the skb flag dst_pending_confirm.

Reported-by: YueHaibing 
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov 
Acked-by: Eric Dumazet 
---
 net/ipv4/ip_output.c  |  6 ++
 net/ipv4/ping.c   |  3 ++-
 net/ipv4/raw.c|  6 +-
 net/ipv4/udp.c|  3 ++-
 net/ipv6/ip6_output.c |  6 ++
 net/ipv6/raw.c|  6 +-
 net/ipv6/route.c  | 27 ++-
 net/ipv6/udp.c|  3 ++-
 net/l2tp/l2tp_ip6.c   |  3 ++-
 9 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 27f1db7..ff0fcaa 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -889,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
 
skb->csum = 0;
 
+   if (flags & MSG_CONFIRM)
+   skb_set_dst_pending_confirm(skb, 1);
+
__skb_queue_tail(queue, skb);
} else if (skb_is_gso(skb)) {
goto append;
@@ -1089,6 +1092,9 @@ static int __ip_append_data(struct sock *sk,
exthdrlen = 0;
csummode = CHECKSUM_NONE;
 
+   if ((flags & MSG_CONFIRM) && !skb_prev)
+   skb_set_dst_pending_confirm(skb, 1);
+
/*
 * Put the packet on the pending queue.
 */
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 592db6a..6ee792d 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -848,7 +848,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t len)
return err;
 
 do_confirm:
-   dst_confirm(&rt->dst);
+   if (msg->msg_flags & MSG_PROBE)
+   dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4e49e5c..8119e1f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 
*fl4,
 
sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
 
+   if (flags & MSG_CONFIRM)
+   skb_set_dst_pending_confirm(skb, 1);
+
skb->transport_header = skb->network_header;
err = -EFAULT;
if (memcpy_from_msg(iph, msg, length))
@@ -666,7 +669,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
return len;
 
 do_confirm:
-   dst_confirm(&rt->dst);
+   if (msg->msg_flags & MSG_PROBE)
+   dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d6dddcf..4bdb358 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1088,7 +1088,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
return err;
 
 do_confirm:
-   dst_confirm(&rt->dst);
+   if (msg->msg_flags & MSG_PROBE)
+   dst_confirm_neigh(&rt->dst, &fl4->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7d90cab..5d944c1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1145,6 +1145,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
skb->protocol = htons(ETH_P_IPV6);
skb->csum = 0;
 
+   if (flags & MSG_CONFIRM)
+   skb_set_dst_pending_confirm(skb, 1);
+
__skb_queue_tail(queue, skb);
} else if (skb_is_gso(skb)) {
goto append;
@@ -1517,6 +1520,9 @@ static int __ip6_append_data(struct sock *sk,
exthdrlen = 0;
dst_exthdrlen = 0;
 
+   if ((flags & MSG_CONFIRM) && !skb_prev)
+   skb_set_dst_pending_confirm(skb, 1);
+
/*
 * Put the packet on the pending queue
 */
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ea89073..f174e76 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -654,6 +654,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr 
*msg, int length,
 
skb->ip_summed = CHECKSUM_NONE;
 
+   if (flags & MSG_CONFIRM)
+

[PATCHv3 net-next 4/7] tcp: replace dst_confirm with sk_dst_confirm

2017-01-31 Thread Julian Anastasov

When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
Use the new sk_dst_confirm() helper to propagate the
indication from received packets to sock_confirm_neigh().

Reported-by: YueHaibing 
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Tested-by: YueHaibing 
Signed-off-by: Julian Anastasov 
Acked-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c   | 12 +++-
 net/ipv4/tcp_metrics.c |  7 ++-
 net/ipv4/tcp_output.c  |  2 ++
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3de6eba..b3e88bb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3644,11 +3644,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff 
*skb, int flag)
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
 
-   if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
-   struct dst_entry *dst = __sk_dst_get(sk);
-   if (dst)
-   dst_confirm(dst);
-   }
+   if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+   sk_dst_confirm(sk);
 
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
@@ -5995,7 +5992,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff 
*skb)
break;
 
case TCP_FIN_WAIT1: {
-   struct dst_entry *dst;
int tmo;
 
/* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6022,9 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff 
*skb)
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
 
-   dst = __sk_dst_get(sk);
-   if (dst)
-   dst_confirm(dst);
+   sk_dst_confirm(sk);
 
if (!sock_flag(sk, SOCK_DEAD)) {
/* Wake up lingering close() */
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index b9ed0d5..0f46e5f 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
u32 val;
int m;
 
+   sk_dst_confirm(sk);
if (sysctl_tcp_nometrics_save || !dst)
return;
 
-   if (dst->flags & DST_HOST)
-   dst_confirm(dst);
-
rcu_read_lock();
if (icsk->icsk_backoff || !tp->srtt_us) {
/* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
struct tcp_metrics_block *tm;
u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
+   sk_dst_confirm(sk);
if (!dst)
goto reset;
 
-   dst_confirm(dst);
-
rcu_read_lock();
tm = tcp_get_metrics(sk, dst, true);
if (!tm) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 671c695..c1f8a59 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -973,6 +973,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff 
*skb, int clone_it,
skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
+   skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source  = inet->inet_sport;
-- 
1.9.3

[PATCHv3 net-next 3/7] sctp: add dst_pending_confirm flag

2017-01-31 Thread Julian Anastasov

Add new transport flag to allow sockets to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
The flag is propagated from transport to every packet.
It is reset when cached dst is reset.

Reported-by: YueHaibing 
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov 
Acked-by: Eric Dumazet 
Acked-by: Neil Horman 
---
 include/net/sctp/sctp.h|  6 ++
 include/net/sctp/structs.h |  4 
 net/sctp/associola.c   |  3 +--
 net/sctp/output.c  | 10 +-
 net/sctp/outqueue.c|  2 +-
 net/sctp/sm_make_chunk.c   |  6 ++
 net/sctp/sm_sideeffect.c   |  2 +-
 net/sctp/socket.c  |  4 ++--
 net/sctp/transport.c   | 17 -
 9 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 3cfd365b..480b65a 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -593,10 +593,8 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr)
  */
 static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport 
*t)
 {
-   if (t->dst && !dst_check(t->dst, t->dst_cookie)) {
-   dst_release(t->dst);
-   t->dst = NULL;
-   }
+   if (t->dst && !dst_check(t->dst, t->dst_cookie))
+   sctp_transport_dst_release(t);
 
return t->dst;
 }
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 231fa9ac..6a68504 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -804,6 +804,8 @@ struct sctp_transport {
 
__u32 burst_limited;/* Holds old cwnd when max.burst is applied */
 
+   __u32 dst_pending_confirm;  /* need to confirm neighbour */
+
/* Destination */
struct dst_entry *dst;
/* Source address. */
@@ -950,6 +952,8 @@ void sctp_transport_route(struct sctp_transport *, union 
sctp_addr *,
 void sctp_transport_reset(struct sctp_transport *);
 void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32);
 void sctp_transport_immediate_rtx(struct sctp_transport *);
+void sctp_transport_dst_release(struct sctp_transport *t);
+void sctp_transport_dst_confirm(struct sctp_transport *t);
 
 
 /* This is the structure we use to queue packets as they come into
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index e50dc6d..2a6835b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -832,8 +832,7 @@ void sctp_assoc_control_transport(struct sctp_association 
*asoc,
if (transport->state != SCTP_UNCONFIRMED)
transport->state = SCTP_INACTIVE;
else {
-   dst_release(transport->dst);
-   transport->dst = NULL;
+   sctp_transport_dst_release(transport);
ulp_notify = false;
}
 
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 07ab506..814eac0 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -546,6 +546,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t 
gfp)
struct sctp_association *asoc = tp->asoc;
struct sctp_chunk *chunk, *tmp;
int pkt_count, gso = 0;
+   int confirm;
struct dst_entry *dst;
struct sk_buff *head;
struct sctphdr *sh;
@@ -624,7 +625,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t 
gfp)
asoc->peer.last_sent_to = tp;
}
head->ignore_df = packet->ipfragok;
-   tp->af_specific->sctp_xmit(head, tp);
+   confirm = tp->dst_pending_confirm;
+   if (confirm)
+   skb_set_dst_pending_confirm(head, 1);
+   /* neighbour should be confirmed on successful transmission or
+* positive error
+*/
+   if (tp->af_specific->sctp_xmit(head, tp) >= 0 && confirm)
+   tp->dst_pending_confirm = 0;
 
 out:
list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 65abe22..db352e5 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1654,7 +1654,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 
if (forward_progress) {
if (transport->dst)
-   dst_confirm(transport->dst);
+   sctp_transport_dst_confirm(transport);
}
}
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index ad3445b..c7d3249 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -,8 +,7 @@ static void sctp_asconf_param_success(struct 
sctp_association *asoc,
local_bh_enable();

[PATCHv3 net-next 5/7] net: add confirm_neigh method to dst_ops

2017-01-31 Thread Julian Anastasov

Add confirm_neigh method to dst_ops and use it from IPv4 and IPv6
to lookup and confirm the neighbour. Its usage via the new helper
dst_confirm_neigh() should be restricted to MSG_PROBE users for
performance reasons.

Signed-off-by: Julian Anastasov 
Acked-by: Eric Dumazet 
---
 include/net/arp.h  | 16 
 include/net/dst.h  |  7 +++
 include/net/dst_ops.h  |  2 ++
 include/net/ndisc.h| 17 +
 net/ipv4/route.c   | 19 +++
 net/ipv6/route.c   | 16 
 net/xfrm/xfrm_policy.c | 16 
 7 files changed, 93 insertions(+)

diff --git a/include/net/arp.h b/include/net/arp.h
index 5e0f891..65619a2 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -35,6 +35,22 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct 
net_device *dev, u32
return n;
 }
 
+static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
+{
+   struct neighbour *n;
+
+   rcu_read_lock_bh();
+   n = __ipv4_neigh_lookup_noref(dev, key);
+   if (n) {
+   unsigned long now = jiffies;
+
+   /* avoid dirtying neighbour */
+   if (n->confirmed != now)
+   n->confirmed = now;
+   }
+   rcu_read_unlock_bh();
+}
+
 void arp_init(void);
 int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
 void arp_send(int type, int ptype, __be32 dest_ip,
diff --git a/include/net/dst.h b/include/net/dst.h
index 6835d22..3a3b34b 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -477,6 +477,13 @@ static inline struct neighbour *dst_neigh_lookup_skb(const 
struct dst_entry *dst
return IS_ERR(n) ? NULL : n;
 }
 
+static inline void dst_confirm_neigh(const struct dst_entry *dst,
+const void *daddr)
+{
+   if (dst->ops->confirm_neigh)
+   dst->ops->confirm_neigh(dst, daddr);
+}
+
 static inline void dst_link_failure(struct sk_buff *skb)
 {
struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 8a2b66d..c84b328 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -33,6 +33,8 @@ struct dst_ops {
struct neighbour *  (*neigh_lookup)(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
+   void(*confirm_neigh)(const struct dst_entry *dst,
+const void *daddr);
 
struct kmem_cache   *kmem_cachep;
 
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index d562a2f..8a02146 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -391,6 +391,23 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct 
net_device *dev, cons
return n;
 }
 
+static inline void __ipv6_confirm_neigh(struct net_device *dev,
+   const void *pkey)
+{
+   struct neighbour *n;
+
+   rcu_read_lock_bh();
+   n = __ipv6_neigh_lookup_noref(dev, pkey);
+   if (n) {
+   unsigned long now = jiffies;
+
+   /* avoid dirtying neighbour */
+   if (n->confirmed != now)
+   n->confirmed = now;
+   }
+   rcu_read_unlock_bh();
+}
+
 int ndisc_init(void);
 int ndisc_late_init(void);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4b7c231..cb494a5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, 
unsigned long old)
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
   struct sk_buff *skb,
   const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 
 static struct dst_ops ipv4_dst_ops = {
.family =   AF_INET,
@@ -168,6 +169,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct 
dst_entry *dst,
.redirect = ip_do_redirect,
.local_out =__ip_local_out,
.neigh_lookup = ipv4_neigh_lookup,
+   .confirm_neigh =ipv4_confirm_neigh,
 };
 
 #define ECN_OR_COST(class) TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct 
dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
 }
 
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+   struct net_device *dev = dst->dev;
+   const __be32 *pkey = daddr;
+   const struct rtable *rt;
+
+   rt = (const struct rtable *)dst;
+   if (rt->rt_gateway)
+   pkey = (const __be32 *)&rt->rt_gateway;
+   else if (!daddr ||
+(rt->rt_flags &
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+   return;

[PATCHv3 net-next 1/7] sock: add sk_dst_pending_confirm flag

2017-01-31 Thread Julian Anastasov

Add new sock flag to allow sockets to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
As not all call paths lock the socket use full word for
the flag.

Add sk_dst_confirm as replacement for dst_confirm when
called for received packets.

Signed-off-by: Julian Anastasov 
Acked-by: Eric Dumazet 
---
 include/net/sock.h | 12 
 net/core/sock.c|  2 ++
 2 files changed, 14 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 7144750..e113786 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -240,6 +240,7 @@ struct sock_common {
   *@sk_wq: sock wait queue and async head
   *@sk_rx_dst: receive input route used by early demux
   *@sk_dst_cache: destination cache
+  *@sk_dst_pending_confirm: need to confirm neighbour
   *@sk_policy: flow policy
   *@sk_receive_queue: incoming packets
   *@sk_wmem_alloc: transmit queue bytes committed
@@ -393,6 +394,8 @@ struct sock {
struct sk_buff_head sk_write_queue;
__s32   sk_peek_off;
int sk_write_pending;
+   __u32   sk_dst_pending_confirm;
+   /* Note: 32bit hole on 64bit arches */
longsk_sndtimeo;
struct timer_list   sk_timer;
__u32   sk_priority;
@@ -1764,6 +1767,7 @@ static inline void dst_negative_advice(struct sock *sk)
if (ndst != dst) {
rcu_assign_pointer(sk->sk_dst_cache, ndst);
sk_tx_queue_clear(sk);
+   sk->sk_dst_pending_confirm = 0;
}
}
 }
@@ -1774,6 +1778,7 @@ static inline void dst_negative_advice(struct sock *sk)
struct dst_entry *old_dst;
 
sk_tx_queue_clear(sk);
+   sk->sk_dst_pending_confirm = 0;
/*
 * This can be called while sk is owned by the caller only,
 * with no state that can be checked in a rcu_dereference_check() cond
@@ -1789,6 +1794,7 @@ static inline void dst_negative_advice(struct sock *sk)
struct dst_entry *old_dst;
 
sk_tx_queue_clear(sk);
+   sk->sk_dst_pending_confirm = 0;
old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
dst_release(old_dst);
 }
@@ -1809,6 +1815,12 @@ static inline void dst_negative_advice(struct sock *sk)
 
 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
+static inline void sk_dst_confirm(struct sock *sk)
+{
+   if (!sk->sk_dst_pending_confirm)
+   sk->sk_dst_pending_confirm = 1;
+}
+
 bool sk_mc_loop(struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
diff --git a/net/core/sock.c b/net/core/sock.c
index 8b35debf..b743565 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -502,6 +502,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 
cookie)
 
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
+   sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -1519,6 +1520,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
af_family_clock_key_strings[newsk->sk_family]);
 
newsk->sk_dst_cache = NULL;
+   newsk->sk_dst_pending_confirm = 0;
newsk->sk_wmem_queued   = 0;
newsk->sk_forward_alloc = 0;
atomic_set(&newsk->sk_drops, 0);
-- 
1.9.3

[PATCHv3 net-next 0/7] net: dst_confirm replacement

2017-01-31 Thread Julian Anastasov

This patchset addresses the problem of neighbour
confirmation where received replies from one nexthop
can cause confirmation of different nexthop when using
the same dst. Thanks to YueHaibing 
for tracking the dst->pending_confirm problem.

Sockets can obtain cached output route. Such
routes can be to known nexthop (rt_gateway=IP) or to be
used simultaneously for different nexthop IPs by different
subnet prefixes (nh->nh_scope = RT_SCOPE_HOST, rt_gateway=0).

At first look, there are more problems:

- dst_confirm() sets flag on dst and not on dst->path,
as result, indication is lost when XFRM is used

- DNAT can change the nexthop, so the really used nexthop is
not confirmed

So, the following solution is to avoid using
dst->pending_confirm.

The current dst_confirm() usage is as follows:

Protocols confirming dst on received packets:
- TCP (1 dst per socket)
- SCTP (1 dst per transport)
- CXGB*

Protocols supporting sendmsg with MSG_CONFIRM [ | MSG_PROBE ] to
confirm neighbour:
- UDP IPv4/IPv6
- ICMPv4 PING
- RAW IPv4/IPv6
- L2TP/IPv6

MSG_CONFIRM for other purposes (fix not needed):
- CAN

Sending without locking the socket:
- UDP (when no cork)
- RAW (when hdrincl=1)

Redirects from old to new GW:
- rt6_do_redirect


The patchset includes the following changes:

1. sock: add sk_dst_pending_confirm flag

- used only by TCP with patch 4 to remember the received
indication in sk->sk_dst_pending_confirm

2. net: add dst_pending_confirm flag to skbuff

- skb->dst_pending_confirm will be used by all protocols
in following patches, via skb_{set,get}_dst_pending_confirm

3. sctp: add dst_pending_confirm flag

- SCTP uses per-transport dsts and can not use
sk->sk_dst_pending_confirm like TCP

4. tcp: replace dst_confirm with sk_dst_confirm

5. net: add confirm_neigh method to dst_ops

- IPv4 and IPv6 provision for slow neigh lookups for MSG_PROBE users.
I decided to use neigh lookup only for this case because on
MSG_PROBE the skb may pass MTU checks but it does not reach
the neigh confirmation code. This patch will be used from patch 6.

- xfrm_confirm_neigh: support is incomplete here, only routes with
known nexthops (gateway) are supported because the tunnel address
is slow to obtain. Or is there a solution to this problem?

6. net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP

- dst_confirm conversion for UDP, RAW, ICMP and L2TP/IPv6

- these protocols use MSG_CONFIRM propagated by ip*_append_data
to skb->dst_pending_confirm. sk->sk_dst_pending_confirm is not
used because some sending paths do not lock the socket. For
MSG_PROBE we use the slow lookup (dst_confirm_neigh).

- there are also 2 cases that need the slow lookup:
__ip6_rt_update_pmtu and rt6_do_redirect. I hope
&ipv6_hdr(skb)->saddr is the correct nexthop address to use here.

7. net: pending_confirm is not used anymore

- I failed to understand the CXGB* code, I see dst_confirm()
calls but I'm not sure dst_neigh_output() was called. For now
I just removed the dst->pending_confirm flag and left all
dst_confirm() calls there. Any better idea?

- Now maybe the old function neigh_output() should be restored
instead of dst_neigh_output?


Julian Anastasov (7):
  sock: add sk_dst_pending_confirm flag
  net: add dst_pending_confirm flag to skbuff
  sctp: add dst_pending_confirm flag
  tcp: replace dst_confirm with sk_dst_confirm
  net: add confirm_neigh method to dst_ops
  net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP
  net: pending_confirm is not used anymore

 drivers/net/vrf.c  |  5 -
 include/linux/skbuff.h | 12 
 include/net/arp.h  | 16 
 include/net/dst.h  | 21 +
 include/net/dst_ops.h  |  2 ++
 include/net/ndisc.h| 17 +
 include/net/sctp/sctp.h|  6 ++
 include/net/sctp/structs.h |  4 
 include/net/sock.h | 26 ++
 net/core/dst.c |  1 -
 net/core/sock.c|  2 ++
 net/ipv4/ip_output.c   | 11 ++-
 net/ipv4/ping.c|  3 ++-
 net/ipv4/raw.c |  6 +-
 net/ipv4/route.c   | 19 +++
 net/ipv4/tcp_input.c   | 12 +++-
 net/ipv4/tcp_metrics.c |  7 ++-
 net/ipv4/tcp_output.c  |  2 ++
 net/ipv4/udp.c |  3 ++-
 net/ipv6/ip6_output.c  |  7 +++
 net/ipv6/raw.c |  6 +-
 net/ipv6/route.c   | 43 ++-
 net/ipv6/udp.c |  3 ++-
 net/l2tp/l2tp_ip6.c|  3 ++-
 net/sctp/associola.c   |  3 +--
 net/sctp/output.c  | 10 +-
 net/sctp/outqueue.c|  2 +-
 net/sctp/sm_make_chunk.c   |  6 ++
 net/sctp/sm_sideeffect.c   |  2 +-
 net/sctp/socket.c  |  4 ++--
 net/sctp/transport.c   | 17 -
 net/xfrm/xfrm_policy.c | 16 
 32 files changed, 233 insertions(+), 64 deletions(-)

-- 
1.9.3

[PATCHv3 net-next 2/7] net: add dst_pending_confirm flag to skbuff

2017-01-31 Thread Julian Anastasov

Add new skbuff flag to allow protocols to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.

Add sock_confirm_neigh() helper to confirm the neighbour and
use it for IPv4, IPv6 and VRF before dst_neigh_output.

Signed-off-by: Julian Anastasov 
Acked-by: Eric Dumazet 
---
 drivers/net/vrf.c  |  5 -
 include/linux/skbuff.h | 12 
 include/net/sock.h | 14 ++
 net/ipv4/ip_output.c   |  5 -
 net/ipv6/ip6_output.c  |  1 +
 5 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 264fc15..630eafd 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -378,6 +378,7 @@ static int vrf_finish_output6(struct net *net, struct sock 
*sk,
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
if (!IS_ERR(neigh)) {
+   sock_confirm_neigh(skb, neigh);
ret = dst_neigh_output(dst, neigh, skb);
rcu_read_unlock_bh();
return ret;
@@ -574,8 +575,10 @@ static int vrf_finish_output(struct net *net, struct sock 
*sk, struct sk_buff *s
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
-   if (!IS_ERR(neigh))
+   if (!IS_ERR(neigh)) {
+   sock_confirm_neigh(skb, neigh);
ret = dst_neigh_output(dst, neigh, skb);
+   }
 
rcu_read_unlock_bh();
 err:
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6f63b7e..3ac3c3b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -613,6 +613,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp 
*t1,
  * @wifi_acked_valid: wifi_acked was set
  * @wifi_acked: whether frame was acked on wifi or not
  * @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
+ * @dst_pending_confirm: need to confirm neighbour
   *@napi_id: id of the NAPI struct this skb came from
  * @secmark: security marking
  * @mark: Generic packet mark
@@ -743,6 +744,7 @@ struct sk_buff {
__u8csum_level:2;
__u8csum_bad:1;
 
+   __u8dst_pending_confirm:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8ndisc_nodetype:2;
 #endif
@@ -3694,6 +3696,16 @@ static inline bool skb_rx_queue_recorded(const struct 
sk_buff *skb)
return skb->queue_mapping != 0;
 }
 
+static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
+{
+   skb->dst_pending_confirm = val;
+}
+
+static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
+{
+   return skb->dst_pending_confirm != 0;
+}
+
 static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
 {
 #ifdef CONFIG_XFRM
diff --git a/include/net/sock.h b/include/net/sock.h
index e113786..1bc821e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1821,6 +1821,20 @@ static inline void sk_dst_confirm(struct sock *sk)
sk->sk_dst_pending_confirm = 1;
 }
 
+static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
+{
+   if (skb_get_dst_pending_confirm(skb)) {
+   struct sock *sk = skb->sk;
+   unsigned long now = jiffies;
+
+   /* avoid dirtying neighbour */
+   if (n->confirmed != now)
+   n->confirmed = now;
+   if (sk && sk->sk_dst_pending_confirm)
+   sk->sk_dst_pending_confirm = 0;
+   }
+}
+
 bool sk_mc_loop(struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index fac275c4..27f1db7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock 
*sk, struct sk_buff *s
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh)) {
-   int res = dst_neigh_output(dst, neigh, skb);
+   int res;
+
+   sock_confirm_neigh(skb, neigh);
+   res = dst_neigh_output(dst, neigh, skb);
 
rcu_read_unlock_bh();
return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 38122d0..7d90cab 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -119,6 +119,7 @@ static int ip6_finish_output2(struct net *net, struct sock 
*sk, struct sk_buff *
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
if (!IS_ERR(neigh)) {
+   sock_confirm_neigh(skb, neigh);
ret = dst_neigh_output(dst, neigh, skb);
rcu_read_unlock_bh();
return ret;
-- 
1.9.3

[PATCHv3 net-next 7/7] net: pending_confirm is not used anymore

2017-01-31 Thread Julian Anastasov

When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
As last step, we can remove the pending_confirm flag.

Reported-by: YueHaibing 
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov 
Acked-by: Eric Dumazet 
---
 include/net/dst.h | 14 ++
 net/core/dst.c|  1 -
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 3a3b34b..84a1043 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -59,8 +59,6 @@ struct dst_entry {
 #define DST_XFRM_QUEUE 0x0100
 #define DST_METADATA   0x0200
 
-   unsigned short  pending_confirm;
-
short   error;
 
/* A non-zero value of dst->obsolete forces by-hand validation
@@ -78,6 +76,8 @@ struct dst_entry {
 #define DST_OBSOLETE_KILL  -2
unsigned short  header_len; /* more space at head required 
*/
unsigned short  trailer_len;/* space to reserve at tail */
+   unsigned short  __pad3;
+
 #ifdef CONFIG_IP_ROUTE_CLASSID
__u32   tclassid;
 #else
@@ -440,7 +440,6 @@ static inline void dst_rcu_free(struct rcu_head *head)
 
 static inline void dst_confirm(struct dst_entry *dst)
 {
-   dst->pending_confirm = 1;
 }
 
 static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
@@ -448,15 +447,6 @@ static inline int dst_neigh_output(struct dst_entry *dst, 
struct neighbour *n,
 {
const struct hh_cache *hh;
 
-   if (dst->pending_confirm) {
-   unsigned long now = jiffies;
-
-   dst->pending_confirm = 0;
-   /* avoid dirtying neighbour */
-   if (n->confirmed != now)
-   n->confirmed = now;
-   }
-
hh = &n->hh;
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
return neigh_hh_output(hh, skb);
diff --git a/net/core/dst.c b/net/core/dst.c
index b5cbbe0..960e503 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->__use = 0;
dst->lastuse = jiffies;
dst->flags = flags;
-   dst->pending_confirm = 0;
dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
-- 
1.9.3

Re: [PATCH] rtnetlink: Handle IFLA_MASTER parameter when processing rtnl_newlink

2017-01-31 Thread David Ahern

On 1/30/17 4:23 PM, Theuns Verwoerd wrote:
> Allow a master interface to be specified as one of the parameters when
> creating a new interface via rtnl_newlink.  Previously this would
> require invoking interface creation, waiting for it to complete, and
> then separately binding that new interface to a master.
> 
> In particular, this is used when creating a macvlan child interface for
> VRRP in a VRF configuration, allowing the interface creator to specify
> directly what master interface should be inherited by the child,
> without having to deal with asynchronous complications and potential
> race conditions.
> 
> Signed-off-by: Theuns Verwoerd 
> ---
>  net/core/rtnetlink.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 

I can't see any harm in doing the enslavement in the same command that creates 
the link and it does simplify VRR setup.

Acked-by: David Ahern

[PATCH] net: phy: broadcom: rehook BCM54612E specific init

2017-01-31 Thread Rafał Miłecki

From: Rafał Miłecki 

This extra BCM54612E code in PHY driver isn't really aneg specific. Even
without it aneg works OK but the problem is no packets pass through PHY.

Moreover putting this code inside config_aneg callback didn't allow
resuming PHY correctly. When driver called phy_stop and phy_start it was
putting PHY machine into RESUMING state. After that machine was
switching into AN and NOLINK without ever calling phy_start_aneg. This
prevented this extra setup from being called and PHY didn't work.

This change has been verified to fix network on BCM47186B0 SoC device
with BCM54612E.

Signed-off-by: Rafał Miłecki 
---
 drivers/net/phy/broadcom.c | 67 +++---
 1 file changed, 33 insertions(+), 34 deletions(-)

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 794b9ec81ba5..9cd8b27d1292 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -46,6 +46,34 @@ static int bcm54210e_config_init(struct phy_device *phydev)
return 0;
 }
 
+static int bcm54612e_config_init(struct phy_device *phydev)
+{
+   /* Clear TX internal delay unless requested. */
+   if ((phydev->interface != PHY_INTERFACE_MODE_RGMII_ID) &&
+   (phydev->interface != PHY_INTERFACE_MODE_RGMII_TXID)) {
+   /* Disable TXD to GTXCLK clock delay (default set) */
+   /* Bit 9 is the only field in shadow register 00011 */
+   bcm_phy_write_shadow(phydev, 0x03, 0);
+   }
+
+   /* Clear RX internal delay unless requested. */
+   if ((phydev->interface != PHY_INTERFACE_MODE_RGMII_ID) &&
+   (phydev->interface != PHY_INTERFACE_MODE_RGMII_RXID)) {
+   u16 reg;
+
+   reg = bcm54xx_auxctl_read(phydev,
+ MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
+   /* Disable RXD to RXC delay (default set) */
+   reg &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
+   /* Clear shadow selector field */
+   reg &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MASK;
+   bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+MII_BCM54XX_AUXCTL_MISC_WREN | reg);
+   }
+
+   return 0;
+}
+
 static int bcm54810_config(struct phy_device *phydev)
 {
int rc, val;
@@ -250,6 +278,10 @@ static int bcm54xx_config_init(struct phy_device *phydev)
err = bcm54210e_config_init(phydev);
if (err)
return err;
+   } else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54612E) {
+   err = bcm54612e_config_init(phydev);
+   if (err)
+   return err;
} else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) {
err = bcm54810_config(phydev);
if (err)
@@ -395,39 +427,6 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
return ret;
 }
 
-static int bcm54612e_config_aneg(struct phy_device *phydev)
-{
-   int ret;
-
-   /* First, auto-negotiate. */
-   ret = genphy_config_aneg(phydev);
-
-   /* Clear TX internal delay unless requested. */
-   if ((phydev->interface != PHY_INTERFACE_MODE_RGMII_ID) &&
-   (phydev->interface != PHY_INTERFACE_MODE_RGMII_TXID)) {
-   /* Disable TXD to GTXCLK clock delay (default set) */
-   /* Bit 9 is the only field in shadow register 00011 */
-   bcm_phy_write_shadow(phydev, 0x03, 0);
-   }
-
-   /* Clear RX internal delay unless requested. */
-   if ((phydev->interface != PHY_INTERFACE_MODE_RGMII_ID) &&
-   (phydev->interface != PHY_INTERFACE_MODE_RGMII_RXID)) {
-   u16 reg;
-
-   reg = bcm54xx_auxctl_read(phydev,
- MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
-   /* Disable RXD to RXC delay (default set) */
-   reg &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
-   /* Clear shadow selector field */
-   reg &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MASK;
-   bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
-MII_BCM54XX_AUXCTL_MISC_WREN | reg);
-   }
-
-   return ret;
-}
-
 static int brcm_phy_setbits(struct phy_device *phydev, int reg, int set)
 {
int val;
@@ -590,7 +589,7 @@ static struct phy_driver broadcom_drivers[] = {
.features   = PHY_GBIT_FEATURES,
.flags  = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
.config_init= bcm54xx_config_init,
-   .config_aneg= bcm54612e_config_aneg,
+   .config_aneg= genphy_config_aneg,
.read_status= genphy_read_status,
.ack_interrupt  = bcm_phy_ack_intr,
.config_intr= bcm_phy_config_intr,
-- 
2.11.0

Re: [PATCHv2 RFC net-next 0/7] net: dst_confirm replacement

2017-01-31 Thread Julian Anastasov

Hello,

On Mon, 30 Jan 2017, David Miller wrote:

> From: Julian Anastasov 
> > 
> > So, the following solution is to avoid using
> > dst->pending_confirm.
> 
> For the most part this series looks good to me, nice work.

OK, I'm posting v3 after removing the RFC tag from v2,
I just added the Acked-by tags.

Regards

--
Julian Anastasov

[net-next 7/8] net/mlx5e: XDP Tx, no inline copy on ConnectX-5

2017-01-31 Thread Saeed Mahameed

ConnectX-5 and later HW generations will report min inline mode ==
MLX5_INLINE_MODE_NONE, which means driver is not required to copy packet
headers to inline fields of TX WQE.

Avoid copy to inline segment in XDP TX routine when HW inline mode doesn't
require it.

This will improve CPU utilization and boost XDP TX performance.

Tested with xdp2 single flow:
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
HCA: Mellanox Technologies MT28800 Family [ConnectX-5 Ex]

Before: 7.4Mpps
After:  7.8Mpps
Improvement: 5%

Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 20 +---
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9193111ae05e..a1741d3833db 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -120,8 +120,7 @@
 #define MLX5E_XDP_IHS_DS_COUNT \
DIV_ROUND_UP(MLX5E_XDP_MIN_INLINE - 2, MLX5_SEND_WQE_DS)
 #define MLX5E_XDP_TX_DS_COUNT \
-   (MLX5E_XDP_IHS_DS_COUNT + \
-(sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + 1 /* SG DS */)
+   ((sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + 1 /* SG DS */)
 #define MLX5E_XDP_TX_WQEBBS \
DIV_ROUND_UP(MLX5E_XDP_TX_DS_COUNT, MLX5_SEND_WQEBB_NUM_DS)
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 505f97aeb60c..e47ba0365f20 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1806,8 +1806,7 @@ static void mlx5e_build_xdpsq_param(struct mlx5e_priv 
*priv,
MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size);
 
param->max_inline = priv->params.tx_max_inline;
-   /* FOR XDP SQs will support only L2 inline mode */
-   param->min_inline_mode = MLX5_INLINE_MODE_NONE;
+   param->min_inline_mode = priv->params.tx_min_inline_mode;
param->type = MLX5E_SQ_XDP;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 965e69e9ff1e..b039b87742a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -657,9 +657,10 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq 
*rq,
struct mlx5_wqe_ctrl_seg *cseg = >ctrl;
struct mlx5_wqe_eth_seg  *eseg = >eth;
struct mlx5_wqe_data_seg *dseg;
+   u8 ds_cnt = MLX5E_XDP_TX_DS_COUNT;
 
ptrdiff_t data_offset = xdp->data - xdp->data_hard_start;
-   dma_addr_t dma_addr  = di->addr + data_offset + MLX5E_XDP_MIN_INLINE;
+   dma_addr_t dma_addr  = di->addr + data_offset;
unsigned int dma_len = xdp->data_end - xdp->data;
 
if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE ||
@@ -680,17 +681,22 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq 
*rq,
return false;
}
 
-   dma_len -= MLX5E_XDP_MIN_INLINE;
dma_sync_single_for_device(sq->pdev, dma_addr, dma_len,
   PCI_DMA_TODEVICE);
 
memset(wqe, 0, sizeof(*wqe));
 
-   /* copy the inline part */
-   memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE);
-   eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+   dseg = (struct mlx5_wqe_data_seg *)eseg + 1;
+   /* copy the inline part if required */
+   if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
+   memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE);
+   eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+   dma_len  -= MLX5E_XDP_MIN_INLINE;
+   dma_addr += MLX5E_XDP_MIN_INLINE;
 
-   dseg = (struct mlx5_wqe_data_seg *)cseg + (MLX5E_XDP_TX_DS_COUNT - 1);
+   ds_cnt   += MLX5E_XDP_IHS_DS_COUNT;
+   dseg++;
+   }
 
/* write the dma part */
dseg->addr   = cpu_to_be64(dma_addr);
@@ -698,7 +704,7 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
dseg->lkey   = sq->mkey_be;
 
cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND);
-   cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | MLX5E_XDP_TX_DS_COUNT);
+   cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 
sq->db.xdp.di[pi] = *di;
wi->opcode = MLX5_OPCODE_SEND;
-- 
2.11.0

[net-next 8/8] net/mlx5e: Bring back bfreg uar map dedicated pointer

2017-01-31 Thread Saeed Mahameed

4K Uar series modified the mlx5e driver to use the new bfreg API,
and mistakenly removed the sq->uar_map iomem data path dedicated
pointer, which was meant to be read from xmit path for cache locality
utilization.

Fix that by returning that pointer to the SQ struct.

Fixes: 7309cb4ad71e ("IB/mlx5: Support 4k UAR for libmlx5")
Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 5 +++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index a1741d3833db..562cd8f193a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -475,6 +475,7 @@ struct mlx5e_sq {
/* read only */
struct mlx5_wq_cyc wq;
u32dma_fifo_mask;
+   void __iomem  *uar_map;
struct netdev_queue   *txq;
u32sqn;
u16bf_buf_size;
@@ -830,9 +831,9 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 */
wmb();
if (bf_sz)
-   __iowrite64_copy(sq->bfreg.map + ofst, ctrl, bf_sz);
+   __iowrite64_copy(sq->uar_map + ofst, ctrl, bf_sz);
else
-   mlx5_write64((__be32 *)ctrl, sq->bfreg.map + ofst, NULL);
+   mlx5_write64((__be32 *)ctrl, sq->uar_map + ofst, NULL);
/* flush the write-combining mapped buffer */
wmb();
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e47ba0365f20..4146e8f4cae0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1016,6 +1016,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
if (err)
return err;
 
+   sq->uar_map = sq->bfreg.map;
param->wq.db_numa_node = cpu_to_node(c->cpu);
 
err = mlx5_wq_cyc_create(mdev, >wq, sqc_wq, >wq,
-- 
2.11.0

[net-next 4/8] net/mlx5: TX WQE update

2017-01-31 Thread Saeed Mahameed

Add new TX WQE fields for ConnectX-5 vlan insertion support:
type and vlan_tci. When type == MLX5_ETH_WQE_INSERT_VLAN the
HW will insert the vlan and prio fields (vlan_tci) into the packet.

Those bits and the inline header fields are mutually exclusive, and
valid only when:
MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_NOT_REQUIRED
and MLX5_CAP_ETH(mdev, wqe_vlan_insert),
which will be set in ConnectX-5 and later HW generations.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/infiniband/hw/mlx5/qp.c |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |  8 
 include/linux/mlx5/mlx5_ifc.h   |  3 ++-
 include/linux/mlx5/qp.h | 16 ++--
 5 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6a83fb32599d..e31bf11ae64f 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2984,20 +2984,20 @@ static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg,
 
if (wr->opcode == IB_WR_LSO) {
struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
-   int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
+   int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start);
u64 left, leftlen, copysz;
void *pdata = ud_wr->header;
 
left = ud_wr->hlen;
eseg->mss = cpu_to_be16(ud_wr->mss);
-   eseg->inline_hdr_sz = cpu_to_be16(left);
+   eseg->inline_hdr.sz = cpu_to_be16(left);
 
/*
 * check if there is space till the end of queue, if yes,
 * copy all in one shot, otherwise copy till the end of queue,
 * rollback and than the copy the left
 */
-   leftlen = qend - (void *)eseg->inline_hdr_start;
+   leftlen = qend - (void *)eseg->inline_hdr.start;
copysz = min_t(u64, leftlen, left);
 
memcpy(seg - size_of_inl_hdr_start, pdata, copysz);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index fd8dff6acc12..965e69e9ff1e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -687,8 +687,8 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
memset(wqe, 0, sizeof(*wqe));
 
/* copy the inline part */
-   memcpy(eseg->inline_hdr_start, xdp->data, MLX5E_XDP_MIN_INLINE);
-   eseg->inline_hdr_sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+   memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE);
+   eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
 
dseg = (struct mlx5_wqe_data_seg *)cseg + (MLX5E_XDP_TX_DS_COUNT - 1);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index cfb68371c397..678c07c8fbb0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -284,18 +284,18 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, 
struct sk_buff *skb)
wi->num_bytes = num_bytes;
 
if (skb_vlan_tag_present(skb)) {
-   mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs, _data,
+   mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, _data,
  _len);
ihs += VLAN_HLEN;
} else {
-   memcpy(eseg->inline_hdr_start, skb_data, ihs);
+   memcpy(eseg->inline_hdr.start, skb_data, ihs);
mlx5e_tx_skb_pull_inline(_data, _len, ihs);
}
 
-   eseg->inline_hdr_sz = cpu_to_be16(ihs);
+   eseg->inline_hdr.sz = cpu_to_be16(ihs);
 
ds_cnt  = sizeof(*wqe) / MLX5_SEND_WQE_DS;
-   ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr_start),
+   ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start),
   MLX5_SEND_WQE_DS);
dseg= (struct mlx5_wqe_data_seg *)cseg + ds_cnt;
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cc8ae860cd45..afcd4736d8df 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -577,7 +577,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
u8 lro_cap[0x1];
u8 lro_psh_flag[0x1];
u8 lro_time_stamp[0x1];
-   u8 reserved_at_5[0x3];
+   u8 reserved_at_5[0x2];
+   u8 wqe_vlan_insert[0x1];
u8 self_lb_en_modifiable[0x1];
u8 reserved_at_9[0x2];
u8 max_lso_cap[0x5];
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index

[net-next 5/8] net/mlx5e: Calc vlan_tag_present only once on xmit

2017-01-31 Thread Saeed Mahameed

Cache skb_vlan_tag_present(skb) and pass it wherever needed in xmit
routines.

Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 678c07c8fbb0..ac76fb4f5510 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -148,15 +148,16 @@ static inline int mlx5e_skb_l3_header_offset(struct 
sk_buff *skb)
return mlx5e_skb_l2_header_offset(skb);
 }
 
-static inline unsigned int mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
-struct sk_buff *skb)
+static inline unsigned int
+mlx5e_calc_min_inline(enum mlx5_inline_modes mode, struct sk_buff *skb,
+ bool vlan_present)
 {
int hlen;
 
switch (mode) {
case MLX5_INLINE_MODE_TCP_UDP:
hlen = eth_get_headlen(skb->data, skb_headlen(skb));
-   if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb))
+   if (hlen == ETH_HLEN && !vlan_present)
hlen += VLAN_HLEN;
return hlen;
case MLX5_INLINE_MODE_IP:
@@ -174,7 +175,8 @@ static inline unsigned int mlx5e_calc_min_inline(enum 
mlx5_inline_modes mode,
 }
 
 static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
-   struct sk_buff *skb, bool bf)
+   struct sk_buff *skb,
+   bool vlan_present, bool bf)
 {
/* Some NIC TX decisions, e.g loopback, are based on the packet
 * headers and occur before the data gather.
@@ -183,13 +185,13 @@ static inline u16 mlx5e_get_inline_hdr_size(struct 
mlx5e_sq *sq,
if (bf) {
u16 ihs = skb_headlen(skb);
 
-   if (skb_vlan_tag_present(skb))
+   if (vlan_present)
ihs += VLAN_HLEN;
 
if (ihs <= sq->max_inline)
return skb_headlen(skb);
}
-   return mlx5e_calc_min_inline(sq->min_inline_mode, skb);
+   return mlx5e_calc_min_inline(sq->min_inline_mode, skb, vlan_present);
 }
 
 static inline void mlx5e_tx_skb_pull_inline(unsigned char **skb_data,
@@ -228,6 +230,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, 
struct sk_buff *skb)
struct mlx5_wqe_eth_seg  *eseg = >eth;
struct mlx5_wqe_data_seg *dseg;
 
+   bool vlan_present = skb_vlan_tag_present(skb);
unsigned char *skb_data = skb->data;
unsigned int skb_len = skb->len;
u8  opcode = MLX5_OPCODE_SEND;
@@ -277,15 +280,14 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, 
struct sk_buff *skb)
bf = sq->bf_budget &&
 !skb->xmit_more &&
 !skb_shinfo(skb)->nr_frags;
-   ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
+   ihs = mlx5e_get_inline_hdr_size(sq, skb, vlan_present, bf);
num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
}
 
wi->num_bytes = num_bytes;
 
-   if (skb_vlan_tag_present(skb)) {
-   mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, _data,
- _len);
+   if (vlan_present) {
+   mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, _data, 
_len);
ihs += VLAN_HLEN;
} else {
memcpy(eseg->inline_hdr.start, skb_data, ihs);
-- 
2.11.0

[PATCH net-next] liquidio: fix for iq and droq cnts going negative

2017-01-31 Thread Felix Manlunas

From: Satanand Burla 

Flush the mmio writes before releasing spin locks.
If the maintained counts get too high (> 2M), force
writeback of the counts to clear them.

Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 6 +-
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 6 +-
 drivers/net/ethernet/cavium/liquidio/liquidio_common.h | 1 +
 drivers/net/ethernet/cavium/liquidio/octeon_device.c   | 4 
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index fbe1986..c12cfa4 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2445,7 +2445,11 @@ static int liquidio_napi_poll(struct napi_struct *napi, 
int budget)
__func__, iq_no);
}
 
-   if ((work_done < budget) && (tx_done)) {
+   /* force enable interrupt if reg cnts are high to avoid wraparound */
+   if ((work_done < budget && tx_done) ||
+   (iq->pkt_in_done >= MAX_REG_CNT) ||
+   (droq->pkt_count >= MAX_REG_CNT)) {
+   tx_done = 1;
napi_complete_done(napi, work_done);
octeon_process_droq_poll_cmd(droq->oct_dev, droq->q_no,
 POLL_EVENT_ENABLE_INTR, 0);
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 78cfa8b..631f1c0f 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1630,7 +1630,11 @@ static int liquidio_napi_poll(struct napi_struct *napi, 
int budget)
__func__, iq_no);
}
 
-   if ((work_done < budget) && (tx_done)) {
+   /* force enable interrupt if reg cnts are high to avoid wraparound */
+   if ((work_done < budget && tx_done) ||
+   (iq->pkt_in_done >= MAX_REG_CNT) ||
+   (droq->pkt_count >= MAX_REG_CNT)) {
+   tx_done = 1;
napi_complete_done(napi, work_done);
octeon_process_droq_poll_cmd(droq->oct_dev, droq->q_no,
 POLL_EVENT_ENABLE_INTR, 0);
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h 
b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index bc0af8a..294c6f3 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -99,6 +99,7 @@ enum octeon_tag_type {
 #define CVM_DRV_APP_END (CVM_DRV_INVALID_APP - 1)
 
 #define BYTES_PER_DHLEN_UNIT8
+#define MAX_REG_CNT 200U
 
 static inline u32 incr_index(u32 index, u32 count, u32 max)
 {
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
index a8df493..9675ffb 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
@@ -1361,6 +1361,8 @@ void lio_enable_irq(struct octeon_droq *droq, struct 
octeon_instr_queue *iq)
spin_lock_bh(>lock);
writel(droq->pkt_count, droq->pkts_sent_reg);
droq->pkt_count = 0;
+   /* this write needs to be flushed before we release the lock */
+   mmiowb();
spin_unlock_bh(>lock);
oct = droq->oct_dev;
}
@@ -1368,6 +1370,8 @@ void lio_enable_irq(struct octeon_droq *droq, struct 
octeon_instr_queue *iq)
spin_lock_bh(>lock);
writel(iq->pkt_in_done, iq->inst_cnt_reg);
iq->pkt_in_done = 0;
+   /* this write needs to be flushed before we release the lock */
+   mmiowb();
spin_unlock_bh(>lock);
oct = iq->oct_dev;
}

[net-next 3/8] net/mlx5e: Lower bound MPWRQ stride by HCA cacheline

2017-01-31 Thread Saeed Mahameed

From: Daniel Jurgens 

MPWRQ stride should be at least the HCA cacheline, the current default
is 64B which will cause data integrity issues in case
HCA_CAP.cache_line_128byte capability is set.

Signed-off-by: Daniel Jurgens 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 9 +++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 46f728de9e76..9193111ae05e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -70,8 +70,13 @@
 
 #define MLX5_RX_HEADROOM NET_SKB_PAD
 
-#define MLX5_MPWRQ_LOG_STRIDE_SIZE 6  /* >= 6, HW restriction */
-#define MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS8  /* >= 6, HW 
restriction */
+#define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \
+   (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */
+#define MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, req) \
+   max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
+#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)   
MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
+#define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) 
MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
+
 #define MLX5_MPWRQ_LOG_WQE_SZ  18
 #define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e829143efc14..e7a1da1ea4b7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -89,8 +89,8 @@ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, 
u8 rq_type)
MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
priv->params.mpwqe_log_stride_sz =
MLX5E_GET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS) ?
-   MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS :
-   MLX5_MPWRQ_LOG_STRIDE_SIZE;
+   MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(priv->mdev) :
+   MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(priv->mdev);
priv->params.mpwqe_log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ -
priv->params.mpwqe_log_stride_sz;
break;
-- 
2.11.0

[net-next 6/8] net/mlx5e: Tx, no inline copy on ConnectX-5

2017-01-31 Thread Saeed Mahameed

ConnectX-5 and later HW generations will report min inline mode ==
MLX5_INLINE_MODE_NONE, which means driver is not required to copy packet
headers to inline fields of TX WQE.

When inline is not required, vlan insertion will be handled in the
TX descriptor rather than copy to inline.

For LSO case driver is still required to copy headers, for the HW to
duplicate on wire.

This will improve CPU utilization and boost TX performance.

Tested with pktgen burst single flow:
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
HCA: Mellanox Technologies MT28800 Family [ConnectX-5 Ex]

Before: 15.1Mpps
After:  17.2Mpps
Improvement: 14%

Signed-off-by: Saeed Mahameed 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 29 +--
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e7a1da1ea4b7..505f97aeb60c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1029,9 +1029,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 
sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
sq->max_inline  = param->max_inline;
-   sq->min_inline_mode =
-   MLX5_CAP_ETH(mdev, wqe_inline_mode) == 
MLX5_CAP_INLINE_MODE_VPORT_CONTEXT ?
-   param->min_inline_mode : 0;
+   sq->min_inline_mode = param->min_inline_mode;
 
err = mlx5e_alloc_sq_db(sq, cpu_to_node(c->cpu));
if (err)
@@ -1095,7 +1093,10 @@ static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct 
mlx5e_sq_param *param)
MLX5_SET(sqc,  sqc, tis_num_0, param->type == MLX5E_SQ_ICO ?
   0 : priv->tisn[sq->tc]);
MLX5_SET(sqc,  sqc, cqn,sq->cq.mcq.cqn);
-   MLX5_SET(sqc,  sqc, min_wqe_inline_mode, sq->min_inline_mode);
+
+   if (MLX5_CAP_ETH(mdev, wqe_inline_mode) == 
MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+   MLX5_SET(sqc,  sqc, min_wqe_inline_mode, sq->min_inline_mode);
+
MLX5_SET(sqc,  sqc, state,  MLX5_SQC_STATE_RST);
MLX5_SET(sqc,  sqc, tis_lst_sz, param->type == MLX5E_SQ_ICO ? 0 : 1);
 
@@ -3533,6 +3534,10 @@ static void mlx5e_build_nic_netdev_priv(struct 
mlx5_core_dev *mdev,
MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev);
mlx5_query_min_inline(mdev, >params.tx_min_inline_mode);
+   if (priv->params.tx_min_inline_mode == MLX5_INLINE_MODE_NONE &&
+   !MLX5_CAP_ETH(mdev, wqe_vlan_insert))
+   priv->params.tx_min_inline_mode = MLX5_INLINE_MODE_L2;
+
priv->params.num_tc= 1;
priv->params.rss_hfunc = ETH_RSS_HASH_XOR;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index ac76fb4f5510..27f70580e0e7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -155,6 +155,8 @@ mlx5e_calc_min_inline(enum mlx5_inline_modes mode, struct 
sk_buff *skb,
int hlen;
 
switch (mode) {
+   case MLX5_INLINE_MODE_NONE:
+   return 0;
case MLX5_INLINE_MODE_TCP_UDP:
hlen = eth_get_headlen(skb->data, skb_headlen(skb));
if (hlen == ETH_HLEN && !vlan_present)
@@ -286,20 +288,23 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, 
struct sk_buff *skb)
 
wi->num_bytes = num_bytes;
 
-   if (vlan_present) {
-   mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, _data, 
_len);
-   ihs += VLAN_HLEN;
-   } else {
-   memcpy(eseg->inline_hdr.start, skb_data, ihs);
-   mlx5e_tx_skb_pull_inline(_data, _len, ihs);
+   ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
+   if (ihs) {
+   if (vlan_present) {
+   mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, 
_data, _len);
+   ihs += VLAN_HLEN;
+   } else {
+   memcpy(eseg->inline_hdr.start, skb_data, ihs);
+   mlx5e_tx_skb_pull_inline(_data, _len, ihs);
+   }
+   eseg->inline_hdr.sz = cpu_to_be16(ihs);
+   ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), 
MLX5_SEND_WQE_DS);
+   } else if (vlan_present) {
+   eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN);
+   eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb));
}
 
-   eseg->inline_hdr.sz = cpu_to_be16(ihs);
-
-   ds_cnt  = sizeof(*wqe) / MLX5_SEND_WQE_DS;
-   ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start),
-

[pull request][net-next 0/8] Mellanox mlx5 updates 2017-01-31

2017-01-31 Thread Saeed Mahameed

Hi Dave,

This pull request includes two new features and two small fixes for net-next,
Details are below.

Please pull and let me know if there's any problem.

Thanks,
Saeed.

---

The following changes since commit 624374a56419c2d6d428c862f32cc1b20519095d:

  Merge branch 'bgmac-phy-init' (2017-01-31 13:44:50 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5-updates-2017-01-31

for you to fetch changes up to 4abee9190fcac371116aa73c2e0f86c6b72f8fe3:

  net/mlx5e: Bring back bfreg uar map dedicated pointer (2017-01-31 21:34:10 
+0200)


mlx5-updates-2017-01-31

This series includes some updates to mlx5 core and ethernet driver.

We got one patch from Or to fix some static checker warnings.

2nd and 3rd patches from Dan add support for 128B cache lines
in the HCA, which configures the hardware to use 128B alignment only
on systems with 128B cache lines; otherwise it is kept at the current
default of 64B.

From me, four patches to support no inline copy on TX on ConnectX-5 and
later HCAs.  Starting with two small infrastructure changes and
refactoring patches followed by two patches to add the actual support for
both xmit ndo and XDP xmit routines.
Last patch is a simple fix to restore a mistakenly removed pointer in the
SQ structure, which was removed in a previous submission of mlx5 4K UAR.

Saeed.


Daniel Jurgens (2):
  net/mlx5: Configure cache line size for start and end padding
  net/mlx5e: Lower bound MPWRQ stride by HCA cacheline

Or Gerlitz (1):
  net/mlx5: Fixed static checker warnings

Saeed Mahameed (5):
  net/mlx5: TX WQE update
  net/mlx5e: Calc vlan_tag_present only once on xmit
  net/mlx5e: Tx, no inline copy on ConnectX-5
  net/mlx5e: XDP Tx, no inline copy on ConnectX-5
  net/mlx5e: Bring back bfreg uar map dedicated pointer

 drivers/infiniband/hw/mlx5/qp.c   |  6 +--
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 17 +---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 21 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 20 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   |  8 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 47 +--
 drivers/net/ethernet/mellanox/mlx5/core/main.c|  6 +++
 include/linux/mlx5/device.h   |  3 +-
 include/linux/mlx5/mlx5_ifc.h |  9 +++--
 include/linux/mlx5/qp.h   | 16 +++-
 10 files changed, 99 insertions(+), 54 deletions(-)

[net-next 2/8] net/mlx5: Configure cache line size for start and end padding

2017-01-31 Thread Saeed Mahameed

From: Daniel Jurgens 

There is a hardware feature that will pad the start or end of a DMA to
be cache line aligned to avoid RMWs on the last cache line. The default
cache line size setting for this feature is 64B. This change configures
the hardware to use 128B alignment on systems with 128B cache lines.

Signed-off-by: Daniel Jurgens 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 ++
 include/linux/mlx5/mlx5_ifc.h  | 6 --
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 84f7970c5080..ca09895b3a05 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -543,6 +543,12 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 
MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12);
 
+   if (MLX5_CAP_GEN_MAX(dev, cache_line_128byte))
+   MLX5_SET(cmd_hca_cap,
+set_hca_cap,
+cache_line_128byte,
+cache_line_size() == 128 ? 1 : 0);
+
err = set_caps(dev, set_ctx, set_sz,
   MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a919dfb920ae..cc8ae860cd45 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -804,10 +804,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_at_150[0xa];
u8 log_max_ra_res_qp[0x6];
 
-   u8 pad_cap[0x1];
+   u8 end_pad[0x1];
u8 cc_query_allowed[0x1];
u8 cc_modify_allowed[0x1];
-   u8 reserved_at_163[0xd];
+   u8 start_pad[0x1];
+   u8 cache_line_128byte[0x1];
+   u8 reserved_at_163[0xb];
u8 gid_table_size[0x10];
 
u8 out_of_seq_cnt[0x1];
-- 
2.11.0

[net-next 1/8] net/mlx5: Fixed static checker warnings

2017-01-31 Thread Saeed Mahameed

From: Or Gerlitz 

For some reason, sparse doesn't like using an expression of type (!x)
with a bitwise | and &.  In order to mitigate that, we use a local
variable.

Since getting a typeof(bitfield) is incorrect, we cast such cases.

This removes the following sparse complaints on the core driver
(and similar ones on the IB driver too):

drivers/net/ethernet/mellanox/mlx5/core/srq.c:83:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/srq.c:96:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/port.c:59:9: warning: dubious: !x & y
drivers/net/ethernet/mellanox/mlx5/core/vport.c:561:9: warning: dubious: !x & y

Signed-off-by: Or Gerlitz 
Signed-off-by: Matan Barak 
Reported-by: Bart Van Assche 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 
 include/linux/mlx5/device.h | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 640f10f2e994..f00855920894 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -487,11 +487,11 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 
1);
MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 
1);
 
-   MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, 
mask->vlan_id);
-   MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, 
key->vlan_id);
+   MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, 
(u16)mask->vlan_id);
+   MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, 
(u16)key->vlan_id);
 
-   MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, 
mask->vlan_priority);
-   MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, 
key->vlan_priority);
+   MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, 
(u8)mask->vlan_priority);
+   MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, 
(u8)key->vlan_priority);
}
}
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 7b6cd67a263f..dd0b253bd15d 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -67,10 +67,11 @@
 
 /* insert a value to a struct */
 #define MLX5_SET(typ, p, fld, v) do { \
+   typeof(v) _v = v; \
BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \
*((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \
cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \
-(~__mlx5_dw_mask(typ, fld))) | (((v) & __mlx5_mask(typ, 
fld)) \
+(~__mlx5_dw_mask(typ, fld))) | (((_v) & __mlx5_mask(typ, 
fld)) \
 << __mlx5_dw_bit_off(typ, fld))); \
 } while (0)
 
-- 
2.11.0

Re: [PATCH net-next 0/5] bridge: per vlan dst_metadata support

2017-01-31 Thread Roopa Prabhu

On 1/31/17, 8:41 AM, Stephen Hemminger wrote:
> On Mon, 30 Jan 2017 21:57:10 -0800
> Roopa Prabhu  wrote:
>
>> From: Roopa Prabhu 
>>
>> High level summary:
>> lwt and dst_metadata have enabled vxlan l3 deployments
>> to use a single vxlan netdev for multiple vnis eliminating the scalability
>> problem with using a single vxlan netdev per vni. This series tries to
>> do the same for vxlan netdevs in pure l2 bridged networks.
>> Use-case/deployment and details are below.
>>
>> Deployment scenario details:
>> As we know VXLAN is used to build layer 2 virtual networks across the
>> underlay layer3 infrastructure. A VXLAN tunnel endpoint (VTEP)
>> originates and terminates VXLAN tunnels. And a VTEP can be a TOR switch
>> or a vswitch in the hypervisor. This patch series mainly
>> focuses on the TOR switch configured as a Vtep. Vxlan segment ID (vni)
>> along with vlan id is used to identify layer 2 segments in a vxlan
>> overlay network. Vxlan bridging is the function provided by Vteps to 
>> terminate
>> vxlan tunnels and map the vxlan vni to traditional end host vlan. This is
>> covered in the "VXLAN Deployment Scenarios" in sections 6 and 6.1 in RFC 
>> 7348.
>> To provide vxlan bridging function, a vtep has to map vlan to a vni. The rfc
>> says that the ingress VTEP device shall remove the IEEE 802.1Q VLAN tag in
>> the original Layer 2 packet if there is one before encapsulating the packet
>> into the VXLAN format to transmit it through the underlay network. The remote
>> VTEP devices have information about the VLAN in which the packet will be
>> placed based on their own VLAN-to-VXLAN VNI mapping configurations.
>>
>> Existing solution:
>> Without this patch series one can deploy such a vtep configuration by
>> adding the local ports and vxlan netdevs into a vlan filtering bridge.
>> The local ports are configured as trunk ports carrying all vlans.
>> A vxlan netdev per vni is added to the bridge. Vlan mapping to vni is
>> achieved by configuring the vlan as pvid on the corresponding vxlan netdev.
>> The vxlan netdev only receives traffic corresponding to the vlan it is mapped
>> to. This configuration maps traffic belonging to a vlan to the corresponding
>> vxlan segment.
>>
>>   ---
>>  |  bridge   |
>>  |   |
>>   ---
>> |100,200   |100 (pvid)|200 (pvid)
>> |  |  |
>>swp1  vxlan1000  vxlan2000
>> 
>> This provides the required vxlan bridging function but poses a
>> scalability problem with using a separate vxlan netdev for each vni.
>>
>> Solution in this patch series:
>> The Goal is to use a single vxlan device to carry all vnis similar
>> to the vxlan collect metadata mode but additionally allowing the bridge
>> and vxlan driver to carry all the forwarding information and also learn.
>> This implementation uses the existing dst_metadata infrastructure to map
>> vlan to a tunnel id.
>> - vxlan driver changes:
>> - enable collect metadata mode to be used with learning,
>>   replication and fdb
>> - A single fdb table hashed by (mac, vni)
>> - rx path already has the vni
>> - tx path expects a vni in the packet with dst_metadata and relies
>>   on learnt or static forwarding information table to forward the packet
>>
>> - Bridge driver changes: per vlan dst_metadata support:
>> - Our use case is vxlan and 1-1 mapping between vlan and vni, but I have
>>   kept the api generic for any tunnel info
>> - Uapi to configure/unconfigure/dump per vlan tunnel data
>> - new bridge port flag to turn this feature on/off. off by default
>> - ingress hook:
>> - if port is a tunnel port, use tunnel info in
>>   attached dst_metadata to map it to a local vlan
>> - egress hook:
>> - if port is a tunnel port, use tunnel info attached to vlan
>>   to set dst_metadata on the skb
>>
>> Other approaches tried and vetoed:
>> - tc vlan push/pop and tunnel metadata dst:
>> - though tc can be used to do part of this, these patches address a 
>> deployment
>>   case where bridge driver vlan filtering and forwarding information
>>   database along with vxlan driver forwarding information table and 
>> learning
>>   are required.
>> - making vxlan driver understand vlan-vni mapping:
>> - I had a series almost ready with this one but soon realized
>>   it duplicated a lot of vlan handling code in the vxlan driver
>>
>> Roopa Prabhu (5):
>>   ip_tunnels: new IP_TUNNEL_INFO_BRIDGE flag for ip_tunnel_info mode
>>   vxlan: support fdb and learning in COLLECT_METADATA mode
>>   bridge: uapi: add per vlan tunnel info
>>   bridge: per vlan dst_metadata netlink support
>>   bridge: vlan dst_metadata hooks in ingress and egress paths
>>

Re: [PATCH net-next 5/6] drivers: net: xgene-v2: Add transmit and receive

2017-01-31 Thread kbuild test robot

Hi Iyappan,

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Iyappan-Subramanian/drivers-net-xgene-v2-Add-RGMII-based-1G-driver/20170201-034317
config: parisc-allyesconfig (attached as .config)
compiler: hppa-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=parisc 

All warnings (new ones prefixed by >>):

   In file included from include/linux/swab.h:4:0,
from include/uapi/linux/byteorder/big_endian.h:12,
from include/linux/byteorder/big_endian.h:4,
from arch/parisc/include/uapi/asm/byteorder.h:4,
from arch/parisc/include/asm/bitops.h:10,
from include/linux/bitops.h:36,
from include/linux/kernel.h:10,
from include/linux/list.h:8,
from include/linux/resource_ext.h:17,
from include/linux/acpi.h:26,
from drivers/net/ethernet/apm/xgene-v2/main.h:25,
from drivers/net/ethernet/apm/xgene-v2/main.c:22:
   drivers/net/ethernet/apm/xgene-v2/main.c: In function 'xge_refill_buffers':
   drivers/net/ethernet/apm/xgene-v2/main.c:162:20: warning: right shift count 
>= width of type [-Wshift-count-overflow]
  dma_addr >> PKT_ADDRL_LEN));
   ^
   include/uapi/linux/swab.h:129:32: note: in definition of macro '__swab64'
 (__builtin_constant_p((__u64)(x)) ? \
   ^
>> include/linux/byteorder/generic.h:85:21: note: in expansion of macro 
>> '__cpu_to_le64'
#define cpu_to_le64 __cpu_to_le64
^
   drivers/net/ethernet/apm/xgene-v2/main.c:161:9: note: in expansion of macro 
'SET_BITS'
SET_BITS(PKT_ADDRH,
^~~~
   drivers/net/ethernet/apm/xgene-v2/main.c:162:20: warning: right shift count 
>= width of type [-Wshift-count-overflow]
  dma_addr >> PKT_ADDRL_LEN));
   ^
   include/uapi/linux/swab.h:131:12: note: in definition of macro '__swab64'
 __fswab64(x))
   ^
>> include/linux/byteorder/generic.h:85:21: note: in expansion of macro 
>> '__cpu_to_le64'
#define cpu_to_le64 __cpu_to_le64
^
   drivers/net/ethernet/apm/xgene-v2/main.c:161:9: note: in expansion of macro 
'SET_BITS'
SET_BITS(PKT_ADDRH,
^~~~
   drivers/net/ethernet/apm/xgene-v2/main.c: In function 'xge_start_xmit':
   drivers/net/ethernet/apm/xgene-v2/main.c:346:19: warning: right shift count 
>= width of type [-Wshift-count-overflow]
 dma_addr >> PKT_ADDRL_LEN));
  ^
   include/uapi/linux/swab.h:129:32: note: in definition of macro '__swab64'
 (__builtin_constant_p((__u64)(x)) ? \
   ^
>> include/linux/byteorder/generic.h:85:21: note: in expansion of macro 
>> '__cpu_to_le64'
#define cpu_to_le64 __cpu_to_le64
^
   drivers/net/ethernet/apm/xgene-v2/main.c:345:8: note: in expansion of macro 
'SET_BITS'
   SET_BITS(PKT_ADDRH,
   ^~~~
   drivers/net/ethernet/apm/xgene-v2/main.c:346:19: warning: right shift count 
>= width of type [-Wshift-count-overflow]
 dma_addr >> PKT_ADDRL_LEN));
  ^
   include/uapi/linux/swab.h:131:12: note: in definition of macro '__swab64'
 __fswab64(x))
   ^
>> include/linux/byteorder/generic.h:85:21: note: in expansion of macro 
>> '__cpu_to_le64'
#define cpu_to_le64 __cpu_to_le64
^
   drivers/net/ethernet/apm/xgene-v2/main.c:345:8: note: in expansion of macro 
'SET_BITS'
   SET_BITS(PKT_ADDRH,
   ^~~~
   drivers/net/ethernet/apm/xgene-v2/main.c: In function 'xge_rx_poll':
   drivers/net/ethernet/apm/xgene-v2/main.c:453:23: warning: left shift count 
>= width of type [-Wshift-count-overflow]
  dma_addr = (addr_hi << PKT_ADDRL_LEN) | addr_lo;
  ^~

vim +/__cpu_to_le64 +85 include/linux/byteorder/generic.h

^1da177e Linus Torvalds 2005-04-16  69   *  cpu_to_[bl]eXX(__uXX x)
^1da177e Linus Torvalds 2005-04-16  70   *  [bl]eXX_to_cpu(__uXX x)
^1da177e Linus Torvalds 2005-04-16  71   *
^1da177e Linus Torvalds 2005-04-16  72   * The same, but takes a pointer to the 
value to convert
^1da177e Linus Torvalds 2005-04-16  73   *  cpu_to_[bl]eXXp(__uXX x)
^1da177e Linus Torvalds 2005-04-16  74   *  [bl]eXX_to_cpup(__uXX x)
^1da177e Linus Torvalds 2005-04-16  75   *
^1da177e Linus Torvalds 2005-04-16  76   * The same, but change in situ
^1da177e Linus Torvalds 2005-04-16  77   *  cpu_to_[bl]eXXs(__uXX x)
^1da177e Linus Torvalds

Re: [PATCH] rtnetlink: Handle IFLA_MASTER parameter when processing rtnl_newlink

2017-01-31 Thread David Miller

From: David Ahern 
Date: Tue, 31 Jan 2017 13:25:32 -0700

> On 1/30/17 11:49 PM, Cong Wang wrote:
>> On Mon, Jan 30, 2017 at 3:23 PM, Theuns Verwoerd
>>  wrote:
>>> @@ -2653,6 +2653,11 @@ static int rtnl_newlink(struct sk_buff *skb, struct 
>>> nlmsghdr *nlh)
>>> if (err < 0)
>>> goto out_unregister;
>>> }
>>> +   if (tb[IFLA_MASTER]) {
>>> +   err = do_set_master(dev, 
>>> nla_get_u32(tb[IFLA_MASTER]));
>>> +   if (err)
>>> +   goto out_unregister;
>>> +   }
>>>  out:
>>> if (link_net)
>>> put_net(link_net);
>> 
>> Not sure if it is too late to call do_set_master() after
>> dev_change_net_namespace().
>> 
> 
> The master device index is relative to a namespace. If both are given then 
> the namespace change must be done first.

Right and this is exactly how do_setlink() handles this.

Re: [PATCH net-next 5/6] drivers: net: xgene-v2: Add transmit and receive

2017-01-31 Thread kbuild test robot

Hi Iyappan,

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Iyappan-Subramanian/drivers-net-xgene-v2-Add-RGMII-based-1G-driver/20170201-034317
config: i386-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from include/linux/byteorder/little_endian.h:4:0,
from arch/x86/include/uapi/asm/byteorder.h:4,
from include/asm-generic/bitops/le.h:5,
from arch/x86/include/asm/bitops.h:517,
from include/linux/bitops.h:36,
from include/linux/kernel.h:10,
from include/linux/list.h:8,
from include/linux/resource_ext.h:17,
from include/linux/acpi.h:26,
from drivers/net/ethernet/apm/xgene-v2/main.h:25,
from drivers/net/ethernet/apm/xgene-v2/main.c:22:
   drivers/net/ethernet/apm/xgene-v2/main.c: In function 'xge_refill_buffers':
>> drivers/net/ethernet/apm/xgene-v2/main.c:162:20: warning: right shift count 
>> >= width of type [-Wshift-count-overflow]
  dma_addr >> PKT_ADDRL_LEN));
   ^
   include/uapi/linux/byteorder/little_endian.h:30:51: note: in definition of 
macro '__cpu_to_le64'
#define __cpu_to_le64(x) ((__force __le64)(__u64)(x))
  ^
>> drivers/net/ethernet/apm/xgene-v2/main.c:161:9: note: in expansion of macro 
>> 'SET_BITS'
SET_BITS(PKT_ADDRH,
^~~~
   drivers/net/ethernet/apm/xgene-v2/main.c: In function 'xge_start_xmit':
   drivers/net/ethernet/apm/xgene-v2/main.c:346:19: warning: right shift count 
>= width of type [-Wshift-count-overflow]
 dma_addr >> PKT_ADDRL_LEN));
  ^
   include/uapi/linux/byteorder/little_endian.h:30:51: note: in definition of 
macro '__cpu_to_le64'
#define __cpu_to_le64(x) ((__force __le64)(__u64)(x))
  ^
   drivers/net/ethernet/apm/xgene-v2/main.c:345:8: note: in expansion of macro 
'SET_BITS'
   SET_BITS(PKT_ADDRH,
   ^~~~
   drivers/net/ethernet/apm/xgene-v2/main.c: In function 'xge_rx_poll':
>> drivers/net/ethernet/apm/xgene-v2/main.c:453:23: warning: left shift count 
>> >= width of type [-Wshift-count-overflow]
  dma_addr = (addr_hi << PKT_ADDRL_LEN) | addr_lo;
  ^~
--
   drivers/net/ethernet/apm/xgene-v2/ring.c: In function 'xge_setup_desc':
>> drivers/net/ethernet/apm/xgene-v2/ring.c:40:20: warning: right shift count 
>> >= width of type [-Wshift-count-overflow]
  dma_h = next_dma >> NEXT_DESC_ADDRL_LEN;
   ^~
   drivers/net/ethernet/apm/xgene-v2/ring.c: In function 
'xge_update_tx_desc_addr':
   drivers/net/ethernet/apm/xgene-v2/ring.c:51:42: warning: right shift count 
>= width of type [-Wshift-count-overflow]
 xge_wr_csr(pdata, DMATXDESCH, (dma_addr >> NEXT_DESC_ADDRL_LEN));
 ^~
   drivers/net/ethernet/apm/xgene-v2/ring.c: In function 
'xge_update_rx_desc_addr':
   drivers/net/ethernet/apm/xgene-v2/ring.c:59:42: warning: right shift count 
>= width of type [-Wshift-count-overflow]
 xge_wr_csr(pdata, DMARXDESCH, (dma_addr >> NEXT_DESC_ADDRL_LEN));
 ^~

vim +162 drivers/net/ethernet/apm/xgene-v2/main.c

90db21d34 Iyappan Subramanian 2017-01-31   16   * GNU General Public License 
for more details.
90db21d34 Iyappan Subramanian 2017-01-31   17   *
90db21d34 Iyappan Subramanian 2017-01-31   18   * You should have received a 
copy of the GNU General Public License
90db21d34 Iyappan Subramanian 2017-01-31   19   * along with this program.  If 
not, see <http://www.gnu.org/licenses/>.
90db21d34 Iyappan Subramanian 2017-01-31   20   */
90db21d34 Iyappan Subramanian 2017-01-31   21  
90db21d34 Iyappan Subramanian 2017-01-31  @22  #include "main.h"
90db21d34 Iyappan Subramanian 2017-01-31   23  
90db21d34 Iyappan Subramanian 2017-01-31   24  static const struct 
acpi_device_id xge_acpi_match[];
90db21d34 Iyappan Subramanian 2017-01-31   25  
90db21d34 Iyappan Subramanian 2017-01-31   26  static int 
xge_get_resources(struct xge_pdata *pdata)
90db21d34 Iyappan Subramanian 2017-01-31   27  {
90db21d34 Iyappan Subramanian 2017-01-31   28   struct platform_device *pdev;
90db21d34 Iyappan Subramanian 2017-01-31   29   struct net_device *ndev;
90db21d34 Iyappan Subramanian 2017-01-31   30   struct device *dev;
90db21d34 Iyappan Subramanian 2017-01-31   31   struct resource *res;
90db21d34 Iyappan Subramanian 2017-01-31   32   int phy_mode, ret = 0;
90db21d34 Iyappan Subramanian 2017-01-31   33  
90db21d34 Iyappan Subramanian 2017-01-31   34   pdev = pdata->pdev;
90db21d34 Iyappan Subramanian

Re: [PATCH] rtnetlink: Handle IFLA_MASTER parameter when processing rtnl_newlink

2017-01-31 Thread David Ahern

On 1/30/17 11:49 PM, Cong Wang wrote:
> On Mon, Jan 30, 2017 at 3:23 PM, Theuns Verwoerd
>  wrote:
>> @@ -2653,6 +2653,11 @@ static int rtnl_newlink(struct sk_buff *skb, struct 
>> nlmsghdr *nlh)
>> if (err < 0)
>> goto out_unregister;
>> }
>> +   if (tb[IFLA_MASTER]) {
>> +   err = do_set_master(dev, 
>> nla_get_u32(tb[IFLA_MASTER]));
>> +   if (err)
>> +   goto out_unregister;
>> +   }
>>  out:
>> if (link_net)
>> put_net(link_net);
> 
> Not sure if it is too late to call do_set_master() after
> dev_change_net_namespace().
> 

The master device index is relative to a namespace. If both are given then the 
namespace change must be done first.

Re: [PATCH net-next 5/6] drivers: net: xgene-v2: Add transmit and receive

2017-01-31 Thread Florian Fainelli

On 01/31/2017 11:03 AM, Iyappan Subramanian wrote:
> This patch adds,
> - Transmit
> - Transmit completion poll
> - Receive poll
> - NAPI handler
> 
> and enables the driver.
> 
> Signed-off-by: Iyappan Subramanian 
> Signed-off-by: Keyur Chudgar 
> ---

> +
> + tx_ring = pdata->tx_ring;
> + tail = tx_ring->tail;
> + len = skb_headlen(skb);
> + raw_desc = &tx_ring->raw_desc[tail];
> +
> + /* Tx descriptor not available */
> + if (!GET_BITS(E, le64_to_cpu(raw_desc->m0)) ||
> + GET_BITS(PKT_SIZE, le64_to_cpu(raw_desc->m0)))
> + return NETDEV_TX_BUSY;
> +
> + /* Packet buffers should be 64B aligned */
> + pkt_buf = dma_alloc_coherent(dev, XGENE_ENET_STD_MTU, &dma_addr,
> +  GFP_ATOMIC);
> + if (unlikely(!pkt_buf))
> + goto out;

Can't you obtain a DMA-API mapping for skb->data and pass it down to the
hardware? This copy here is inefficient.

> +
> + memcpy(pkt_buf, skb->data, len);
> +
> + addr_hi = GET_BITS(NEXT_DESC_ADDRH, le64_to_cpu(raw_desc->m1));
> + addr_lo = GET_BITS(NEXT_DESC_ADDRL, le64_to_cpu(raw_desc->m1));
> + raw_desc->m1 = cpu_to_le64(SET_BITS(NEXT_DESC_ADDRL, addr_lo) |
> +SET_BITS(NEXT_DESC_ADDRH, addr_hi) |
> +SET_BITS(PKT_ADDRH,
> + dma_addr >> PKT_ADDRL_LEN));
> +
> + dma_wmb();
> +
> + raw_desc->m0 = cpu_to_le64(SET_BITS(PKT_ADDRL, dma_addr) |
> +SET_BITS(PKT_SIZE, len) |
> +SET_BITS(E, 0));
> +
> + skb_tx_timestamp(skb);
> + xge_wr_csr(pdata, DMATXCTRL, 1);
> +
> + pdata->stats.tx_packets++;
> + pdata->stats.tx_bytes += skb->len;

This is both racy and incorrect. Racy because after you wrote DMATXCTRL,
your TX completion can run, and it can do that while interrupting your
CPU presumably, and free the SKB, therefore making you access a freed
SKB (or it should, if it does not), it's also incorrect, because before
you get signaled a TX completion, there is no guarantee that the packets
did actually make it through, you must update your stats in the TX
completion handler.

> +
> + tx_ring->skbs[tail] = skb;
> + tx_ring->pkt_bufs[tail] = pkt_buf;
> + tx_ring->tail = (tail + 1) & (XGENE_ENET_NUM_DESC - 1);
> +
> +out:
> + dev_kfree_skb_any(skb);

Don't do this, remember a pointer to the SKB, free the SKB in TX
completion handler, preferably in NAPI context.

> +
> + return NETDEV_TX_OK;
> +}
> +
> +static void xge_txc_poll(struct net_device *ndev, unsigned int budget)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + struct device *dev = &pdata->pdev->dev;
> + struct xge_desc_ring *tx_ring;
> + struct xge_raw_desc *raw_desc;
> + u64 addr_lo, addr_hi;
> + dma_addr_t dma_addr;
> + void *pkt_buf;
> + bool pktsent;
> + u32 data;
> + u8 head;
> + int i;
> +
> + tx_ring = pdata->tx_ring;
> + head = tx_ring->head;
> +
> + data = xge_rd_csr(pdata, DMATXSTATUS);
> + pktsent = data & TX_PKT_SENT;
> + if (unlikely(!pktsent))
> + return;
> +
> + for (i = 0; i < budget; i++) {

TX completion handlers should run unbound and free the entire TX ring,
don't make it obey to an upper bound.

> + raw_desc = &tx_ring->raw_desc[head];
> +
> + if (!GET_BITS(E, le64_to_cpu(raw_desc->m0)))
> + break;
> +
> + dma_rmb();
> +
> + addr_hi = GET_BITS(PKT_ADDRH, le64_to_cpu(raw_desc->m1));
> + addr_lo = GET_BITS(PKT_ADDRL, le64_to_cpu(raw_desc->m0));
> + dma_addr = (addr_hi << PKT_ADDRL_LEN) | addr_lo;
> +
> + pkt_buf = tx_ring->pkt_bufs[head];
> +
> + /* clear pktstart address and pktsize */
> + raw_desc->m0 = cpu_to_le64(SET_BITS(E, 1) |
> +SET_BITS(PKT_SIZE, 0));
> + xge_wr_csr(pdata, DMATXSTATUS, 1);
> +
> + dma_free_coherent(dev, XGENE_ENET_STD_MTU, pkt_buf, dma_addr);
> +
> + head = (head + 1) & (XGENE_ENET_NUM_DESC - 1);
> + }
> +
> + tx_ring->head = head;
> +}
> +
> +static int xge_rx_poll(struct net_device *ndev, unsigned int budget)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + struct device *dev = &pdata->pdev->dev;
> + dma_addr_t addr_hi, addr_lo, dma_addr;
> + struct xge_desc_ring *rx_ring;
> + struct xge_raw_desc *raw_desc;
> + struct sk_buff *skb;
> + int i, npkts, ret = 0;
> + bool pktrcvd;
> + u32 data;
> + u8 head;
> + u16 len;
> +
> + rx_ring = pdata->rx_ring;
> + head = rx_ring->head;
> +
> + data = xge_rd_csr(pdata, DMARXSTATUS);
> + pktrcvd = data & RXSTATUS_RXPKTRCVD;
> +
> + if (unlikely(!pktrcvd))
> + return 0;
> +
> + npkts = 0;
> + for (i = 0; i < budget;

Re: [PATCH net-next 4/6] drivers: net: xgene-v2: Add base driver

2017-01-31 Thread Florian Fainelli

On 01/31/2017 11:03 AM, Iyappan Subramanian wrote:
> This patch adds,
> 
>  - probe, remove, shutdown
>  - open, close and stats
>  - create and delete ring
>  - request and delete irq
> 
> Signed-off-by: Iyappan Subramanian 
> Signed-off-by: Keyur Chudgar 
> ---

> +static void xge_delete_desc_rings(struct net_device *ndev)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + struct device *dev = &pdata->pdev->dev;
> + struct xge_desc_ring *ring;
> +
> + ring = pdata->tx_ring;
> + if (ring) {
> + if (ring->skbs)
> + devm_kfree(dev, ring->skbs);
> + if (ring->pkt_bufs)
> + devm_kfree(dev, ring->pkt_bufs);
> + devm_kfree(dev, ring);
> + }

The very fact that you have to do the devm_kfree suggests that the way
you manage the lifetime of the ring is not appropriate, and in fact, if
we look at how xge_create_desc_ring() is called, in the driver's probe
function indicates that if the network interface is never opened, we
are just wasting memory sitting there and doing nothing. You should
consider moving this to the ndo_open(), resp. ndo_close() functions to
optimize memory consumption wrt. the network interface state.

> +
> + ring = pdata->rx_ring;
> + if (ring) {
> + if (ring->skbs)
> + devm_kfree(dev, ring->skbs);
> + devm_kfree(dev, ring);
> + }
> +}
> +
> +static struct xge_desc_ring *xge_create_desc_ring(struct net_device *ndev)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + struct device *dev = &pdata->pdev->dev;
> + struct xge_desc_ring *ring;
> + u16 size;
> +
> + ring = devm_kzalloc(dev, sizeof(struct xge_desc_ring), GFP_KERNEL);
> + if (!ring)
> + return NULL;
> +
> + ring->ndev = ndev;
> +
> + size = XGENE_ENET_DESC_SIZE * XGENE_ENET_NUM_DESC;
> + ring->desc_addr = dmam_alloc_coherent(dev, size, &ring->dma_addr,
> +   GFP_KERNEL | __GFP_ZERO);

There is no dmam_zalloc_coherent()? Then again, that seems to be a
candidate for dma_zalloc_coherent() and moving this to the ndo_open()
function.

> + if (!ring->desc_addr) {
> + devm_kfree(dev, ring);
> + return NULL;
> + }
> +
> + xge_setup_desc(ring);
> +
> + return ring;
> +}
> +
> +static int xge_refill_buffers(struct net_device *ndev, u32 nbuf)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + struct xge_desc_ring *ring = pdata->rx_ring;
> + const u8 slots = XGENE_ENET_NUM_DESC - 1;
> + struct device *dev = &pdata->pdev->dev;
> + struct xge_raw_desc *raw_desc;
> + u64 addr_lo, addr_hi;
> + u8 tail = ring->tail;
> + struct sk_buff *skb;
> + dma_addr_t dma_addr;
> + u16 len;
> + int i;
> +
> + for (i = 0; i < nbuf; i++) {
> + raw_desc = &ring->raw_desc[tail];
> +
> + len = XGENE_ENET_STD_MTU;
> + skb = netdev_alloc_skb(ndev, len);
> + if (unlikely(!skb))
> + return -ENOMEM;

Are not you leaving holes in your RX ring if you do that?

> +
> + dma_addr = dma_map_single(dev, skb->data, len, DMA_FROM_DEVICE);
> + if (dma_mapping_error(dev, dma_addr)) {
> + netdev_err(ndev, "DMA mapping error\n");
> + dev_kfree_skb_any(skb);
> + return -EINVAL;
> + }


> +static void xge_timeout(struct net_device *ndev)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + struct netdev_queue *txq;
> +
> + xge_mac_reset(pdata);
> +
> + txq = netdev_get_tx_queue(ndev, 0);
> + txq->trans_start = jiffies;
> + netif_tx_start_queue(txq);

It most likely is not that simple, don't you want to walk the list of
pending transmitted SKBs and free them all?

> +}
> +
> +static void xge_get_stats64(struct net_device *ndev,
> + struct rtnl_link_stats64 *storage)
> +{
> + struct xge_pdata *pdata = netdev_priv(ndev);
> + struct xge_stats *stats = &pdata->stats;
> +
> + storage->tx_packets += stats->tx_packets;
> + storage->tx_bytes += stats->tx_bytes;
> +
> + storage->rx_packets += stats->rx_packets;
> + storage->rx_bytes += stats->rx_bytes;

Pretty sure you need some synchronization primitives here for non 64-bit
architectures (maybe this driver is not used outside of 64-bit, but still).


> +
> + ndev->hw_features = ndev->features;
> +
> + ret = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(64));
> + if (ret) {
> + netdev_err(ndev, "No usable DMA configuration\n");
> + goto err;
> + }
> +
> + ret = xge_init_hw(ndev);
> + if (ret)
> + goto err;

Missing netif_carrier_off() right before the register_netdev().

> +
> + ret = register_netdev(ndev);
> + if (ret) {
> + netdev_err(ndev, "Failed to register

Re: [PATCH 4.10-rc3 08/13] net: emac: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Timur Tabi


On 01/31/2017 01:19 PM, Russell King wrote:

drivers/net/ethernet/qualcomm/emac/emac-sgmii.c:58:12: error: dereferencing 
pointer to incomplete type 'struct phy_device'

Add linux/phy.h to emac-sgmii.c

Signed-off-by: Russell King 
---
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c | 1 +


The version of emac-sgmii.c on net-next does not need this fixed.  I already 
removed all references to phy_device in commit "net: qcom/emac: always use 
autonegotiation to configure the SGMII link".


--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.

Re: [PATCH net-next 0/6] drivers: net: xgene-v2: Add RGMII based 1G driver

2017-01-31 Thread Andrew Lunn

On Tue, Jan 31, 2017 at 11:03:15AM -0800, Iyappan Subramanian wrote:
> This patch set adds support for RGMII based 1GbE hardware which uses a linked
> list of DMA descriptor architecture (v2) for APM X-Gene SoCs.

Hi Iyappan

Should we assume there are more patches to follow adding MDIO bus
support and phylib integration?

Andrew

Re: [PATCH] net:phy fix driver reference count error when attach and detach phy device

2017-01-31 Thread Florian Fainelli

On 01/23/2017 01:33 AM, maowenan wrote:
> 
> 
> On 2017/1/6 12:48, Florian Fainelli wrote:
>> Le 01/05/17 à 19:39, maowenan a écrit :
>>>
>>>
>>> On 2017/1/6 11:21, Florian Fainelli wrote:
 +Andrew,

 Le 01/05/17 à 18:29, maowenan a écrit :
>>> @Florian Fainelli, what's your comments about this patch?
>>
>> I am trying to reproduce what you are seeing, but at first glance it 
>> looks like an
>> appropriate solution to me. Do you mind giving me a couple more days?
>>
>> Thanks!
>> --
>> Florian
>
> Hi Florian, 
>   Do you have any update about this patch?

 Your patch is not complete, there are now MDIO device (which PHY devices
 are a superset of) that would also need a similar fix.

>>> ok, is there any patch to fix MDIO yet?  if not, i will verify it and give 
>>> a fix patch?
>>>
>>
>> No, there is not a patch yet, your approach looks okay, but need to be
>> made general and cover MDIO devices as well.
>>
>> Thank you!
>>
> 
> Hi Florian,
> Sorry I can't get you. There has already existed codes which are not 
> originally written by me to cover MDIO device in phy_attach_direct and 
> phy_detach in my patch .
> Please help check, thank you.
> phy_attach_direct:
> struct device *d = &phydev->mdio.dev;
> ...
> get_device(d);
> ...
> 
> phy_detach:
>   put_device(&phydev->mdio.dev);   /*--MDIO device--*/
> + module_put(phydev->mdio.dev.driver->owner);
>   module_put(bus->owner);

Took me a while, but I can finally reproduce this here as well, will
come up with a fix, thanks for your patience!
-- 
Florian

[PATCH] net: ethernet: ti: cpsw: fix NULL pointer dereference in switch mode

2017-01-31 Thread Grygorii Strashko

In switch mode the struct cpsw_slave->ndev field will be initialized with a
proper value only for one cpsw slave port; as a result,
cpsw_get_usage_count() will generate "Unable to handle kernel NULL pointer
dereference" exception when first ethernet interface is opening
cpsw_ndo_open(). This issue causes boot regression on AM335x EVM and
reproducible on am57xx-evm (switch mode).
Fix it by adding additional check for !cpsw->slaves[i].ndev in
cpsw_get_usage_count().

Cc: Ivan Khoronzhuk 
fixes: 03fd01ad0eea ("net: ethernet: ti: cpsw: don't duplicate ndev_running")
Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 67b7323..35a95dc 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -677,7 +677,7 @@ static int cpsw_get_usage_count(struct cpsw_common *cpsw)
u32 usage_count = 0;
 
for (i = 0; i < cpsw->data.slaves; i++)
-   if (netif_running(cpsw->slaves[i].ndev))
+   if (cpsw->slaves[i].ndev && netif_running(cpsw->slaves[i].ndev))
usage_count++;
 
return usage_count;
-- 
2.10.1.dirty

Re: [PATCH net-next 4/6] drivers: net: xgene-v2: Add base driver

2017-01-31 Thread Andrew Lunn

> + phy_mode = device_get_phy_mode(dev);
> + if (phy_mode < 0) {
> + dev_err(dev, "Unable to get phy-connection-type\n");
> + return phy_mode;
> + }
> + pdata->resources.phy_mode = phy_mode;
> +
> + if (pdata->resources.phy_mode != PHY_INTERFACE_MODE_RGMII) {
> + dev_err(dev, "Incorrect phy-connection-type specified\n");
> + return -ENODEV;
> + }

This seems a bit limiting. What if you need to use:

PHY_INTERFACE_MODE_RGMII_ID,
PHY_INTERFACE_MODE_RGMII_RXID,
PHY_INTERFACE_MODE_RGMII_TXID,

in order to set the RGMII delays.

   Andrew

Re: [PATCH] r8152: Allocate interrupt buffer as part of struct r8152

2017-01-31 Thread Eric Dumazet

On Tue, 2017-01-31 at 11:06 -0800, Guenter Roeck wrote:
> When unloading the r8152 driver using the 'unbind' sysfs attribute
> in a system with KASAN enabled, the following error message is seen
> on a regular basis.

>  
>  static int alloc_all_mem(struct r8152 *tp)
> @@ -1423,10 +1420,6 @@ static int alloc_all_mem(struct r8152 *tp)
>   if (!tp->intr_urb)
>   goto err1;
>  
> - tp->intr_buff = kmalloc(INTBUFSIZE, GFP_KERNEL);
> - if (!tp->intr_buff)
> - goto err1;
> -
>   tp->intr_interval = (int)ep_intr->desc.bInterval;
>   usb_fill_int_urb(tp->intr_urb, tp->udev, usb_rcvintpipe(tp->udev, 3),
>tp->intr_buff, INTBUFSIZE, intr_callback,

This might lead to intr_buff being backed by vzalloc() instead of
kzalloc() (check alloc_netdev_mqs())

It looks like it could cause a bug.

Re: [PATCH] r8152: Allocate interrupt buffer as part of struct r8152

2017-01-31 Thread Alan Stern

On Tue, 31 Jan 2017, Guenter Roeck wrote:

> When unloading the r8152 driver using the 'unbind' sysfs attribute
> in a system with KASAN enabled, the following error message is seen
> on a regular basis.

...

> The two-byte allocation in conjunction with code analysis suggests that
> the interrupt buffer has been overwritten. Added instrumentation in the
> driver shows that the interrupt handler is called after RTL8152_UNPLUG
> was set, and that this event is associated with the error message above.
> This suggests that there are situations where the interrupt buffer is used
> after it has been freed.
> 
> To avoid the problem, allocate the interrupt buffer as part of struct
> r8152.
> 
> Cc: Hayes Wang 
> Signed-off-by: Guenter Roeck 
> ---
> The problem is seen in chromeos-4.4, but there is no reason to believe
> that it does not occur with the upstream kernel. It is still seen in
> chromeos-4.4 after all patches from upstream and linux-next have been
> applied to the driver.
> 
> While relatively simple, I am not really convinced that this is the best
> (or even an acceptable) solution for this problem. I am open to suggestions
> for a better fix.

The proper approach is to keep the allocation as it is, but _before_
deallocating the buffer, make sure that the interrupt buffer won't be
accessed any more.  This may involve calling usb_kill_urb(), or
synchronize_irq(), or something similar.

Alan Stern

Re: [PATCH 4.10-rc3 01/13] net: sunrpc: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Anna Schumaker

Hi Russell,

On 01/31/2017 02:18 PM, Russell King wrote:
> Removing linux/phy.h from net/dsa.h reveals a build error in the sunrpc
> code:
> 
> net/sunrpc/xprtrdma/svc_rdma_backchannel.c: In function 'xprt_rdma_bc_put':
> net/sunrpc/xprtrdma/svc_rdma_backchannel.c:277:2: error: implicit declaration 
> of function 'module_put' [-Werror=implicit-function-declaration]
> net/sunrpc/xprtrdma/svc_rdma_backchannel.c: In function 'xprt_setup_rdma_bc':
> net/sunrpc/xprtrdma/svc_rdma_backchannel.c:348:7: error: implicit declaration 
> of function 'try_module_get' [-Werror=implicit-function-declaration]
> 
> Fix this by adding linux/module.h to svc_rdma_backchannel.c
> 
> Signed-off-by: Russell King 

This patch looks okay to me:

Acked-by: Anna Schumaker 

> ---
>  net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c 
> b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
> index 288e35c2d8f4..cb1e48e54eb1 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
> @@ -4,6 +4,7 @@
>   * Support for backward direction RPCs on RPC/RDMA (server-side).
>   */
>  
> +#include 
>  #include 
>  #include "xprt_rdma.h"
>  
>

Re: [RFC PATCH 0/2] rx zero copy interface for af_packet

2017-01-31 Thread tndave




On 01/27/2017 01:33 PM, John Fastabend wrote:

This is an experimental implementation of rx zero copy for af_packet.
It's a bit rough and likely has errors but the plan is to clean it up
over the next few months.

And seeing I said I would post it in another thread a few days back
here it is.


This sounds good (believe me, I have been thinking along the same lines :)
From the driver Rx side, we always premap RX buffers, so it is best to map
them to shmem for PF_PACKET sockets.
Also, I like the idea that the user can put a selected queue (maybe queues
in the future?) into PF_PACKET mode, keeping the rest of the queues as is.
Zero copy and removing skb setup & processing overhead on RX certainly
make things faster and help latency. Zero copy is good on Tx too; however,
without an skb we should figure out how to use the segmentation and checksum
offloading features of the HW. Can this be considered in the tpacket V4 hdr?


-Tushar

Re: [PATCH 4.10-rc3 09/13] iscsi: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Bart Van Assche

On Tue, 2017-01-31 at 19:19 +0000, Russell King wrote:
> drivers/target/iscsi/iscsi_target_login.c:1135:7: error: implicit declaration 
> of function 'try_module_get' [-Werror=implicit-function-declaration]
> 
> Add linux/module.h to iscsi_target_login.c.
> 
> Signed-off-by: Russell King 
> ---
>  drivers/target/iscsi/iscsi_target_login.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/target/iscsi/iscsi_target_login.c 
> b/drivers/target/iscsi/iscsi_target_login.c
> index 450f51deb2a2..eab274d17b5c 100644
> --- a/drivers/target/iscsi/iscsi_target_login.c
> +++ b/drivers/target/iscsi/iscsi_target_login.c
> @@ -17,6 +17,7 @@
>   
> **/
>  
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 

Reviewed-by: Bart Van Assche

[PATCH 4.10-rc3 10/13] MIPS: Octeon: Remove unnecessary MODULE_*()

2017-01-31 Thread Russell King

octeon-platform.c can not be built as a module for two reasons:

(a) the Makefile doesn't allow it:
obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o

(b) the multiple *_initcall() statements, each of which is translated
to a module_init() call when attempting a module build, become
aliases to init_module().  Having more than one alias will cause a
build error.

Hence, rather than adding a linux/module.h include, remove the redundant
MODULE_*() from this file.

Acked-by: David Daney 
Signed-off-by: Russell King 
---
 arch/mips/cavium-octeon/octeon-platform.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/mips/cavium-octeon/octeon-platform.c 
b/arch/mips/cavium-octeon/octeon-platform.c
index 37a932d9148c..8297ce714c5e 100644
--- a/arch/mips/cavium-octeon/octeon-platform.c
+++ b/arch/mips/cavium-octeon/octeon-platform.c
@@ -1060,7 +1060,3 @@ static int __init octeon_publish_devices(void)
return of_platform_bus_probe(NULL, octeon_ids, NULL);
 }
 arch_initcall(octeon_publish_devices);
-
-MODULE_AUTHOR("David Daney ");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Platform driver for Octeon SOC");
-- 
2.7.4

[PATCH 4.10-rc3 11/13] net: liquidio: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:30: error: expected 
declaration specifiers or '...' before string constant
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:30: warning: data definition 
has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:30: error: type defaults to 
'int' in declaration of 'MODULE_AUTHOR'
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:30: error: function 
declaration isn't a prototype
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:31: error: expected 
declaration specifiers or '...' before string constant
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:31: warning: data definition 
has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:31: error: type defaults to 
'int' in declaration of 'MODULE_DESCRIPTION'
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:31: error: function 
declaration isn't a prototype
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:32: error: expected 
declaration specifiers or '...' before string constant
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:32: warning: data definition 
has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:32: error: type defaults to 
'int' in declaration of 'MODULE_LICENSE'
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:32: error: function 
declaration isn't a prototype
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:33: error: expected 
declaration specifiers or '...' before string constant
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:33: warning: data definition 
has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:33: error: type defaults to 
'int' in declaration of 'MODULE_VERSION'
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:33: error: function 
declaration isn't a prototype
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:36: error: expected ')' 
before 'int'
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:37: error: expected ')' 
before string constant
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:325: warning: data 
definition has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:325: error: type defaults to 
'int' in declaration of 'MODULE_DEVICE_TABLE'
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:325: warning: parameter 
names (without types) in function declaration
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:3250: warning: data 
definition has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:3250: error: type defaults 
to 'int' in declaration of 'module_init'
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:3250: warning: parameter 
names (without types) in function declaration
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:3251: warning: data 
definition has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:3251: error: type defaults 
to 'int' in declaration of 'module_exit'
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c:3251: warning: parameter 
names (without types) in function declaration
drivers/net/ethernet/cavium/liquidio/lio_main.c:36: error: expected declaration 
specifiers or '...' before string constant
drivers/net/ethernet/cavium/liquidio/lio_main.c:36: warning: data definition 
has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_main.c:36: error: type defaults to 
'int' in declaration of 'MODULE_AUTHOR'
drivers/net/ethernet/cavium/liquidio/lio_main.c:36: error: function declaration 
isn't a prototype
drivers/net/ethernet/cavium/liquidio/lio_main.c:37: error: expected declaration 
specifiers or '...' before string constant
drivers/net/ethernet/cavium/liquidio/lio_main.c:37: warning: data definition 
has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_main.c:37: error: type defaults to 
'int' in declaration of 'MODULE_DESCRIPTION'
drivers/net/ethernet/cavium/liquidio/lio_main.c:37: error: function declaration 
isn't a prototype
drivers/net/ethernet/cavium/liquidio/lio_main.c:38: error: expected declaration 
specifiers or '...' before string constant
drivers/net/ethernet/cavium/liquidio/lio_main.c:38: warning: data definition 
has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_main.c:38: error: type defaults to 
'int' in declaration of 'MODULE_LICENSE'
drivers/net/ethernet/cavium/liquidio/lio_main.c:38: error: function declaration 
isn't a prototype
drivers/net/ethernet/cavium/liquidio/lio_main.c:39: error: expected declaration 
specifiers or '...' before string constant
drivers/net/ethernet/cavium/liquidio/lio_main.c:39: warning: data definition 
has no type or storage class
drivers/net/ethernet/cavium/liquidio/lio_main.c:39: error: type defaults to 
'int' in declaration of 'MODULE_VERSION'
drivers/net/ethernet/cavium/liquidio/lio_main.c:39: error: function declaration 
isn't a prototype
drivers/net/ethernet/cavium/liquidio/lio_main.c:40: error: expected declaration 
specifiers or '...'

[PATCH 4.10-rc3 07/13] net: mvneta: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

drivers/net/ethernet/marvell/mvneta.c:2694:26: error: storage size of 'status' 
isn't known
drivers/net/ethernet/marvell/mvneta.c:2695:26: error: storage size of 'changed' 
isn't known
drivers/net/ethernet/marvell/mvneta.c:2695:9: error: variable 'changed' has 
initializer but incomplete type
drivers/net/ethernet/marvell/mvneta.c:2709:2: error: implicit declaration of 
function 'fixed_phy_update_state' [-Werror=implicit-function-declaration]

Add linux/phy_fixed.h to mvneta.c

Signed-off-by: Russell King 
---
 drivers/net/ethernet/marvell/mvneta.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/marvell/mvneta.c 
b/drivers/net/ethernet/marvell/mvneta.c
index e05e22705cf7..eb0eb3e62ca0 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
2.7.4

[PATCH 4.10-rc3 04/13] net: lan78xx: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

drivers/net/usb/lan78xx.c:394:33: sparse: expected ; at end of declaration
drivers/net/usb/lan78xx.c:394:33: sparse: Expected } at end of 
struct-union-enum-specifier
drivers/net/usb/lan78xx.c:394:33: sparse: got interface
drivers/net/usb/lan78xx.c:403:1: sparse: Expected ; at the end of type 
declaration
drivers/net/usb/lan78xx.c:403:1: sparse: got }

Add linux/phy.h to lan78xx.c

Signed-off-by: Russell King 
---
 drivers/net/usb/lan78xx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 08f8703e4d54..9889a70ff4f6 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "lan78xx.h"
 
 #define DRIVER_AUTHOR  "WOOJUNG HUH "
-- 
2.7.4

[PATCH 4.10-rc3 09/13] iscsi: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

drivers/target/iscsi/iscsi_target_login.c:1135:7: error: implicit declaration 
of function 'try_module_get' [-Werror=implicit-function-declaration]

Add linux/module.h to iscsi_target_login.c.

Signed-off-by: Russell King 
---
 drivers/target/iscsi/iscsi_target_login.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/target/iscsi/iscsi_target_login.c 
b/drivers/target/iscsi/iscsi_target_login.c
index 450f51deb2a2..eab274d17b5c 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -17,6 +17,7 @@
  
**/
 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
2.7.4

[PATCH 4.10-rc3 02/13] net: cgroups: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

net/core/netprio_cgroup.c:303:16: error: expected declaration specifiers or 
'...' before string constant
MODULE_LICENSE("GPL v2");
   ^~~~

Add linux/module.h to fix this.

Signed-off-by: Russell King 
---
 net/core/netprio_cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 2ec86fc552df..756637dc7a57 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -13,6 +13,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
2.7.4

[PATCH 4.10-rc3 05/13] net: bgmac: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

drivers/net/ethernet/broadcom/bgmac.c:1015:17: error: dereferencing pointer to 
incomplete type 'struct mii_bus'
drivers/net/ethernet/broadcom/bgmac.c:1185:2: error: implicit declaration of 
function 'phy_start' [-Werror=implicit-function-declaration]
drivers/net/ethernet/broadcom/bgmac.c:1198:2: error: implicit declaration of 
function 'phy_stop' [-Werror=implicit-function-declaration]
drivers/net/ethernet/broadcom/bgmac.c:1239:9: error: implicit declaration of 
function 'phy_mii_ioctl' [-Werror=implicit-function-declaration]
drivers/net/ethernet/broadcom/bgmac.c:1389:28: error: 
'phy_ethtool_get_link_ksettings' undeclared here (not in a function)
drivers/net/ethernet/broadcom/bgmac.c:1390:28: error: 
'phy_ethtool_set_link_ksettings' undeclared here (not in a function)
drivers/net/ethernet/broadcom/bgmac.c:1403:13: error: dereferencing pointer to 
incomplete type 'struct phy_device'
drivers/net/ethernet/broadcom/bgmac.c:1417:3: error: implicit declaration of 
function 'phy_print_status' [-Werror=implicit-function-declaration]
drivers/net/ethernet/broadcom/bgmac.c:1424:26: error: storage size of 
'fphy_status' isn't known
drivers/net/ethernet/broadcom/bgmac.c:1424:9: error: variable 'fphy_status' has 
initializer but incomplete type
drivers/net/ethernet/broadcom/bgmac.c:1425:11: warning: excess elements in 
struct initializer
drivers/net/ethernet/broadcom/bgmac.c:1425:3: error: unknown field 'link' 
specified in initializer
drivers/net/ethernet/broadcom/bgmac.c:1426:12: note: in expansion of macro 
'SPEED_1000'
drivers/net/ethernet/broadcom/bgmac.c:1426:3: error: unknown field 'speed' 
specified in initializer
drivers/net/ethernet/broadcom/bgmac.c:1427:13: note: in expansion of macro 
'DUPLEX_FULL'
drivers/net/ethernet/broadcom/bgmac.c:1427:3: error: unknown field 'duplex' 
specified in initializer
drivers/net/ethernet/broadcom/bgmac.c:1432:12: error: implicit declaration of 
function 'fixed_phy_register' [-Werror=implicit-function-declaration]
drivers/net/ethernet/broadcom/bgmac.c:1432:31: error: 'PHY_POLL' undeclared 
(first use in this function)
drivers/net/ethernet/broadcom/bgmac.c:1438:8: error: implicit declaration of 
function 'phy_connect_direct' [-Werror=implicit-function-declaration]
drivers/net/ethernet/broadcom/bgmac.c:1439:6: error: 'PHY_INTERFACE_MODE_MII' 
undeclared (first use in this function)
drivers/net/ethernet/broadcom/bgmac.c:1521:2: error: implicit declaration of 
function 'phy_disconnect' [-Werror=implicit-function-declaration]
drivers/net/ethernet/broadcom/bgmac.c:1541:15: error: expected declaration 
specifiers or '...' before string constant

Add linux/phy.h to bgmac.c

Signed-off-by: Russell King 
---
 drivers/net/ethernet/broadcom/bgmac.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 0e066dc6b8cc..58a2bd3c0458 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -12,6 +12,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "bgmac.h"
 
 static bool bgmac_wait_value(struct bgmac *bgmac, u16 reg, u32 mask,
-- 
2.7.4

[PATCH 4.10-rc3 03/13] net: macb: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

drivers/net/ethernet/cadence/macb.h:862:33: sparse: expected ; at end of 
declaration
drivers/net/ethernet/cadence/macb.h:862:33: sparse: Expected } at end of 
struct-union-enum-specifier
drivers/net/ethernet/cadence/macb.h:862:33: sparse: got phy_interface
drivers/net/ethernet/cadence/macb.h:877:1: sparse: Expected ; at the end of 
type declaration
drivers/net/ethernet/cadence/macb.h:877:1: sparse: got }
In file included from drivers/net/ethernet/cadence/macb_pci.c:29:0:
drivers/net/ethernet/cadence/macb.h:862:2: error: unknown type name 
'phy_interface_t'
 phy_interface_t  phy_interface;
 ^~~

Add linux/phy.h to macb.h

Signed-off-by: Russell King 
---
 drivers/net/ethernet/cadence/macb.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index d67adad67be1..383da8cf5f6d 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -10,6 +10,8 @@
 #ifndef _MACB_H
 #define _MACB_H
 
+#include 
+
 #define MACB_GREGS_NBR 16
 #define MACB_GREGS_VERSION 2
 #define MACB_MAX_QUEUES 8
-- 
2.7.4

[PATCH 4.10-rc3 08/13] net: emac: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

drivers/net/ethernet/qualcomm/emac/emac-sgmii.c:58:12: error: dereferencing 
pointer to incomplete type 'struct phy_device'

Add linux/phy.h to emac-sgmii.c

Signed-off-by: Russell King 
---
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c 
b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
index bf722a9bb09d..5e31fb7e4ab8 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "emac.h"
 #include "emac-mac.h"
 #include "emac-sgmii.h"
-- 
2.7.4

[PATCH 4.10-rc3 12/13] net: ath5k: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

Fix these errors reported by the 0-day builder by replacing the
linux/export.h include with linux/module.h.

In file included from include/linux/platform_device.h:14:0,
 from drivers/net/wireless/ath/ath5k/ahb.c:20:
include/linux/device.h:1463:1: warning: data definition has no type or storage 
class
 module_init(__driver##_init); \
 ^
include/linux/platform_device.h:228:2: note: in expansion of macro 
'module_driver'
  module_driver(__platform_driver, platform_driver_register, \
  ^
drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
'module_platform_driver'
 module_platform_driver(ath_ahb_driver);
 ^~
include/linux/device.h:1463:1: error: type defaults to 'int' in declaration of 
'module_init' [-Werror=implicit-int]
 module_init(__driver##_init); \
 ^
include/linux/platform_device.h:228:2: note: in expansion of macro 
'module_driver'
  module_driver(__platform_driver, platform_driver_register, \
  ^
drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
'module_platform_driver'
 module_platform_driver(ath_ahb_driver);
 ^~
drivers/net/wireless/ath/ath5k/ahb.c:233:1: warning: parameter names (without 
types) in function declaration
In file included from include/linux/platform_device.h:14:0,
 from drivers/net/wireless/ath/ath5k/ahb.c:20:
include/linux/device.h:1468:1: warning: data definition has no type or storage 
class
 module_exit(__driver##_exit);
 ^
include/linux/platform_device.h:228:2: note: in expansion of macro 
'module_driver'
  module_driver(__platform_driver, platform_driver_register, \
  ^
drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
'module_platform_driver'
 module_platform_driver(ath_ahb_driver);
 ^~
include/linux/device.h:1468:1: error: type defaults to 'int' in declaration of 
'module_exit' [-Werror=implicit-int]
 module_exit(__driver##_exit);
 ^
include/linux/platform_device.h:228:2: note: in expansion of macro 
'module_driver'
  module_driver(__platform_driver, platform_driver_register, \
  ^
drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
'module_platform_driver'
 module_platform_driver(ath_ahb_driver);
 ^~
drivers/net/wireless/ath/ath5k/ahb.c:233:1: warning: parameter names (without 
types) in function declaration
In file included from include/linux/platform_device.h:14:0,
 from drivers/net/wireless/ath/ath5k/ahb.c:20:
drivers/net/wireless/ath/ath5k/ahb.c:233:24: warning: 'ath_ahb_driver_exit' 
defined but not used [-Wunused-function]
 module_platform_driver(ath_ahb_driver);
^
include/linux/device.h:1464:20: note: in definition of macro 'module_driver'
 static void __exit __driver##_exit(void) \
^~~~
drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
'module_platform_driver'
 module_platform_driver(ath_ahb_driver);
 ^~
drivers/net/wireless/ath/ath5k/ahb.c:233:24: warning: 'ath_ahb_driver_init' 
defined but not used [-Wunused-function]
 module_platform_driver(ath_ahb_driver);
^
include/linux/device.h:1459:19: note: in definition of macro 'module_driver'
 static int __init __driver##_init(void) \
   ^~~~
drivers/net/wireless/ath/ath5k/ahb.c:233:1: note: in expansion of macro 
'module_platform_driver'
 module_platform_driver(ath_ahb_driver);
 ^~

Signed-off-by: Russell King 
---
 drivers/net/wireless/ath/ath5k/ahb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath5k/ahb.c 
b/drivers/net/wireless/ath/ath5k/ahb.c
index 2ca88b593e4c..c0794f5988b3 100644
--- a/drivers/net/wireless/ath/ath5k/ahb.c
+++ b/drivers/net/wireless/ath/ath5k/ahb.c
@@ -16,10 +16,10 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include 
 #include 
 #include 
 #include 
-#include 
 #include 
 #include "ath5k.h"
 #include "debug.h"
-- 
2.7.4

[PATCH 4.10-rc3 06/13] net: fman: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

drivers/net/ethernet/freescale/fman/fman_memac.c:519:21: error: dereferencing 
pointer to incomplete type 'struct fixed_phy_status'

Add linux/phy_fixed.h to fman_memac.c

Signed-off-by: Russell King 
---
 drivers/net/ethernet/freescale/fman/fman_memac.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/freescale/fman/fman_memac.c 
b/drivers/net/ethernet/freescale/fman/fman_memac.c
index 71a5ded9d1de..cd6a53eaf161 100644
--- a/drivers/net/ethernet/freescale/fman/fman_memac.c
+++ b/drivers/net/ethernet/freescale/fman/fman_memac.c
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* PCS registers */
-- 
2.7.4

[PATCH 4.10-rc3 13/13] net: dsa: remove unnecessary phy*.h includes

2017-01-31 Thread Russell King

Including phy.h and phy_fixed.h into net/dsa.h causes phy*.h to be an
unnecessary dependency for quite a large amount of the kernel.  There's
very little which actually requires definitions from phy.h in net/dsa.h
- the include itself only wants the declaration of a couple of
structures and IFNAMSIZ.

Add linux/if.h for IFNAMSIZ, declarations for the structures, phy.h to
mv88e6xxx.h as it needs it for phy_interface_t, and remove both phy.h
and phy_fixed.h from net/dsa.h.

This patch reduces from around 800 files rebuilt to around 40 - even
with ccache, the time difference is noticeable.

Tested-by: Vivien Didelot 
Reviewed-by: Florian Fainelli 
Signed-off-by: Russell King 
---
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 1 +
 include/net/dsa.h | 6 --
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h 
b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
index af54baea47cf..3a949095068a 100644
--- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifndef UINT64_MAX
 #define UINT64_MAX (u64)(~((u64)0))
diff --git a/include/net/dsa.h b/include/net/dsa.h
index b122196d5a1f..887b2f98f9ea 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -11,15 +11,17 @@
 #ifndef __LINUX_NET_DSA_H
 #define __LINUX_NET_DSA_H
 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
-#include 
-#include 
 #include 
 
+struct phy_device;
+struct fixed_phy_status;
+
 enum dsa_tag_protocol {
DSA_TAG_PROTO_NONE = 0,
DSA_TAG_PROTO_DSA,
-- 
2.7.4

[PATCH 4.10-rc3 01/13] net: sunrpc: fix build errors when linux/phy*.h is removed from net/dsa.h

2017-01-31 Thread Russell King

Removing linux/phy.h from net/dsa.h reveals a build error in the sunrpc
code:

net/sunrpc/xprtrdma/svc_rdma_backchannel.c: In function 'xprt_rdma_bc_put':
net/sunrpc/xprtrdma/svc_rdma_backchannel.c:277:2: error: implicit declaration 
of function 'module_put' [-Werror=implicit-function-declaration]
net/sunrpc/xprtrdma/svc_rdma_backchannel.c: In function 'xprt_setup_rdma_bc':
net/sunrpc/xprtrdma/svc_rdma_backchannel.c:348:7: error: implicit declaration 
of function 'try_module_get' [-Werror=implicit-function-declaration]

Fix this by adding linux/module.h to svc_rdma_backchannel.c

Signed-off-by: Russell King 
---
 net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c 
b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 288e35c2d8f4..cb1e48e54eb1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -4,6 +4,7 @@
  * Support for backward direction RPCs on RPC/RDMA (server-side).
  */
 
+#include 
 #include 
 #include "xprt_rdma.h"
 
-- 
2.7.4

[PATCH 4.10-rc3 00/13] net: dsa: remove unnecessary phy.h include

2017-01-31 Thread Russell King - ARM Linux

Including phy.h and phy_fixed.h into net/dsa.h causes phy*.h to be an
unnecessary dependency for quite a large amount of the kernel.  There's
very little which actually requires definitions from phy.h in net/dsa.h
- the include itself only wants the declaration of a couple of
structures and IFNAMSIZ.

Add linux/if.h for IFNAMSIZ, declarations for the structures, phy.h to
mv88e6xxx.h as it needs it for phy_interface_t, and remove both phy.h
and phy_fixed.h from net/dsa.h.

This patch reduces from around 800 files rebuilt to around 40 - even
with ccache, the time difference is noticeable.

In order to make this change, several drivers need to be updated to
include necessary headers that they were picking up through this
include.  This has resulted in a much larger patch series.

I'm assuming the 0-day builder has had 24 hours with this series, and
hasn't reported any further issues with it - the last issue was two
weeks ago (before I became ill) which I fixed over the last weekend.

I'm hoping this doesn't conflict with what's already in net-next...

 arch/mips/cavium-octeon/octeon-platform.c | 4 
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 1 +
 drivers/net/ethernet/broadcom/bgmac.c | 2 ++
 drivers/net/ethernet/cadence/macb.h   | 2 ++
 drivers/net/ethernet/cavium/liquidio/lio_main.c   | 1 +
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c| 1 +
 drivers/net/ethernet/cavium/liquidio/octeon_console.c | 1 +
 drivers/net/ethernet/freescale/fman/fman_memac.c  | 1 +
 drivers/net/ethernet/marvell/mvneta.c | 1 +
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c   | 1 +
 drivers/net/usb/lan78xx.c | 1 +
 drivers/net/wireless/ath/ath5k/ahb.c  | 2 +-
 drivers/target/iscsi/iscsi_target_login.c | 1 +
 include/net/dsa.h | 6 --
 net/core/netprio_cgroup.c | 1 +
 net/sunrpc/xprtrdma/svc_rdma_backchannel.c| 1 +
 16 files changed, 20 insertions(+), 7 deletions(-)

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

[PATCH net v2] be2net: fix initial MAC setting

2017-01-31 Thread Ivan Vecera

Recent commit 34393529163a ("be2net: fix MAC addr setting on privileged
BE3 VFs") allows privileged BE3 VFs to set their MAC address during
initialization. Although the initial MAC for such VFs is already
programmed by the parent PF, the subsequent setting performed by the VF
is OK; in certain cases (after a fresh boot), however, this command in
the VF can fail.

The MAC should be initialized only when:
1) no MAC is programmed (always except BE3 VFs during first init)
2) programmed MAC is different from requested (e.g. MAC is set when
   interface is down). In this case the initial MAC programmed by PF
   needs to be deleted.

The adapter->dev_mac contains MAC address currently programmed in HW so
it should be zeroed when the MAC is deleted from HW and should not be
filled when MAC is set when interface is down in be_mac_addr_set() as
no programming is performed in this case.

Example of failure without the fix (immediately after fresh boot):

# ip link set eth0 up  <- eth0 is BE3 PF
be2net 0000:01:00.0 eth0: Link is Up

# echo 1 > /sys/class/net/eth0/device/sriov_numvfs  <- Create 1 VF
...
be2net 0000:01:04.0: Emulex OneConnect(be3): VF  port 0

# ip link set eth8 up  <- eth8 is created privileged VF
be2net 0000:01:04.0: opcode 59-1 failed:status 1-76
RTNETLINK answers: Input/output error

# echo 0 > /sys/class/net/eth0/device/sriov_numvfs  <- Delete VF
iommu: Removing device 0000:01:04.0 from group 33
...

# echo 1 > /sys/class/net/eth0/device/sriov_numvfs  <- Create it again
iommu: Removing device 0000:01:04.0 from group 33
...

# ip link set eth8 up
be2net 0000:01:04.0 eth8: Link is Up

Initialization is now OK.

v2 - Corrected the comment and condition check suggested by Suresh & Harsha

Fixes: 34393529163a ("be2net: fix MAC addr setting on privileged BE3 VFs")
Cc: Sathya Perla 
Cc: Ajit Khaparde 
Cc: Sriharsha Basavapatna 
Cc: Somnath Kotur 
Signed-off-by: Ivan Vecera 
---
 drivers/net/ethernet/emulex/benet/be_main.c | 33 -
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index 1a7f8ad7b9c6..cd49a54c538d 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -362,8 +362,10 @@ static int be_mac_addr_set(struct net_device *netdev, void 
*p)
status = -EPERM;
goto err;
}
-done:
+
+   /* Remember currently programmed MAC */
ether_addr_copy(adapter->dev_mac, addr->sa_data);
+done:
ether_addr_copy(netdev->dev_addr, addr->sa_data);
dev_info(dev, "MAC address changed to %pM\n", addr->sa_data);
return 0;
@@ -3618,8 +3620,10 @@ static void be_disable_if_filters(struct be_adapter 
*adapter)
 {
/* Don't delete MAC on BE3 VFs without FILTMGMT privilege  */
if (!BEx_chip(adapter) || !be_virtfn(adapter) ||
-   check_privilege(adapter, BE_PRIV_FILTMGMT))
+   check_privilege(adapter, BE_PRIV_FILTMGMT)) {
be_dev_mac_del(adapter, adapter->pmac_id[0]);
+   eth_zero_addr(adapter->dev_mac);
+   }
 
be_clear_uc_list(adapter);
be_clear_mc_list(adapter);
@@ -3773,12 +3777,27 @@ static int be_enable_if_filters(struct be_adapter 
*adapter)
if (status)
return status;
 
-   /* Don't add MAC on BE3 VFs without FILTMGMT privilege */
-   if (!BEx_chip(adapter) || !be_virtfn(adapter) ||
-   check_privilege(adapter, BE_PRIV_FILTMGMT)) {
+   /* Normally this condition is true as the ->dev_mac is zeroed.
+* But on BE3 VFs the initial MAC is pre-programmed by PF and
+* subsequent be_dev_mac_add() can fail (after fresh boot)
+*/
+   if (!ether_addr_equal(adapter->dev_mac, adapter->netdev->dev_addr)) {
+   int old_pmac_id = -1;
+
+   /* Remember old programmed MAC if any - can happen on BE3 VF */
+   if (!is_zero_ether_addr(adapter->dev_mac))
+   old_pmac_id = adapter->pmac_id[0];
+
status = be_dev_mac_add(adapter, adapter->netdev->dev_addr);
if (status)
return status;
+
+   /* Delete the old programmed MAC as we successfully programmed
+* a new MAC
+*/
+   if (old_pmac_id >= 0 && old_pmac_id != adapter->pmac_id[0])
+   be_dev_mac_del(adapter, old_pmac_id);
+
ether_addr_copy(adapter->dev_mac, adapter->netdev->dev_addr);
}
 
@@ -4552,6 +4571,10 @@ static int be_mac_setup(struct be_adapter *adapter)
 
memcpy(adapter->netdev->dev_addr, mac, ETH_ALEN);
memcpy(adapter->netdev->perm_addr, mac, ETH_ALEN);
+
+   /* Initial MAC for BE3 VFs is already programmed by PF */
+

[PATCH] r8152: Allocate interrupt buffer as part of struct r8152

2017-01-31 Thread Guenter Roeck

When unloading the r8152 driver using the 'unbind' sysfs attribute
in a system with KASAN enabled, the following error message is seen
on a regular basis.

BUG kmalloc-128 (Not tainted): Poison overwritten
-
INFO: 0xffc0a9522a00-0xffc0a9522a01. First byte 0xee instead of 0x6b
INFO: Allocated in rtl8152_open+0x318/0x5dc [r8152] age=69847 cpu=4 pid=1345
init: mtpd main process (2372) terminated with status 253
init: mtpd main process ended, respawning
alloc_debug_processing+0x124/0x178
___slab_alloc.constprop.59+0x530/0x65c
__slab_alloc.isra.56.constprop.58+0x48/0x74
kmem_cache_alloc_trace+0xd8/0x34c
rtl8152_open+0x318/0x5dc [r8152]
__dev_open+0xcc/0x140
__dev_change_flags+0xc8/0x1a8
dev_change_flags+0x50/0xa0
do_setlink+0x440/0xcd4
rtnl_newlink+0x414/0x7cc
rtnetlink_rcv_msg+0x238/0x268
netlink_rcv_skb+0xa4/0x128
rtnetlink_rcv+0x2c/0x3c
netlink_unicast+0x1e8/0x2e0
netlink_sendmsg+0x4c0/0x4e4
sock_sendmsg+0x70/0x8c
INFO: Freed in free_all_mem+0x10c/0x12c [r8152] age=271 cpu=2 pid=5992
free_debug_processing+0x278/0x37c
__slab_free+0x84/0x440
kfree+0x2d4/0x37c
free_all_mem+0x10c/0x12c [r8152]
rtl8152_close+0xf4/0x10c [r8152]
__dev_close_many+0xe0/0x118
dev_close_many+0xb8/0x174
rollback_registered_many+0x19c/0x3fc
unregister_netdevice_queue+0xe4/0x188
unregister_netdev+0x28/0x38
rtl8152_disconnect+0x7c/0xb0 [r8152]
usb_unbind_interface+0xd8/0x2cc
__device_release_driver+0x10c/0x1a8
device_release_driver+0x30/0x44
bus_remove_device+0x1e0/0x208
device_del+0x21c/0x2cc
INFO: Slab 0xffbdc2a5c880 objects=16 used=14 fp=0xffc0a9523400 
flags=0x4080
INFO: Object 0xffc0a9522a00 @offset=2560 fp=0xffc0a9522200

Bytes b4 ffc0a95229f0: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  

Object ffc0a9522a00: ee 30 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  
.0kk
Object ffc0a9522a10: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  

Object ffc0a9522a20: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  

Object ffc0a9522a30: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  

Object ffc0a9522a40: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  

Object ffc0a9522a50: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  

Object ffc0a9522a60: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  

Object ffc0a9522a70: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5  
kkk.
Redzone ffc0a9522a80: bb bb bb bb bb bb bb bb  

Padding ffc0a9522bc0: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  


The two-byte allocation in conjunction with code analysis suggests that
the interrupt buffer has been overwritten. Added instrumentation in the
driver shows that the interrupt handler is called after RTL8152_UNPLUG
was set, and that this event is associated with the error message above.
This suggests that there are situations where the interrupt buffer is used
after it has been freed.

To avoid the problem, allocate the interrupt buffer as part of struct
r8152.

Cc: Hayes Wang 
Signed-off-by: Guenter Roeck 
---
The problem is seen in chromeos-4.4, but there is no reason to believe
that it does not occur with the upstream kernel. It is still seen in
chromeos-4.4 after all patches from upstream and linux-next have been
applied to the driver.

While this change is relatively simple, I am not really convinced that it
is the best (or even an acceptable) solution for this problem. I am open
to suggestions for a better fix.

 drivers/net/usb/r8152.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index ad42295356dd..afbfa728b48e 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -641,7 +641,7 @@ struct r8152 {
u32 coalesce;
u16 ocp_base;
u16 speed;
-   u8 *intr_buff;
+   u8 intr_buff[INTBUFSIZE] cacheline_aligned;
u8 version;
u8 duplex;
u8 autoneg;
@@ -1342,9 +1342,6 @@ static void free_all_mem(struct r8152 *tp)
 
usb_free_urb(tp->intr_urb);
tp->intr_urb = NULL;
-
-   kfree(tp->intr_buff);
-   tp->intr_buff = NULL;
 }
 
 static int alloc_all_mem(struct r8152 *tp)
@@ -1423,10 +1420,6 @@ static int alloc_all_mem(struct r8152 *tp)
if (!tp->intr_urb)
goto err1;
 
-   tp->intr_buff = kmalloc(INTBUFSIZE, GFP_KERNEL);
-   if (!tp->intr_buff)
-   goto err1;
-
tp->intr_interval = (int)ep_intr->desc.bInterval;
usb_fill_int_urb(tp->intr_urb, tp->udev, usb_rcvintpipe(tp->udev, 3),
 tp->intr_buff, INTBUFSIZE, intr_callback,
-- 
2.7.4

[PATCH net-next 3/6] drivers: net: xgene-v2: Add ethernet hardware configuration

2017-01-31 Thread Iyappan Subramanian

This patch adds functions to configure ethernet hardware.

Signed-off-by: Iyappan Subramanian 
Signed-off-by: Keyur Chudgar 
---
 drivers/net/ethernet/apm/xgene-v2/enet.c | 71 
 drivers/net/ethernet/apm/xgene-v2/enet.h | 43 +++
 2 files changed, 114 insertions(+)
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/enet.c
 create mode 100644 drivers/net/ethernet/apm/xgene-v2/enet.h

diff --git a/drivers/net/ethernet/apm/xgene-v2/enet.c 
b/drivers/net/ethernet/apm/xgene-v2/enet.c
new file mode 100644
index 000..b49edee
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/enet.c
@@ -0,0 +1,71 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan Subramanian 
+ *   Keyur Chudgar 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include "main.h"
+
+void xge_wr_csr(struct xge_pdata *pdata, u32 offset, u32 val)
+{
+   void __iomem *addr = pdata->resources.base_addr + offset;
+
+   iowrite32(val, addr);
+}
+
+u32 xge_rd_csr(struct xge_pdata *pdata, u32 offset)
+{
+   void __iomem *addr = pdata->resources.base_addr + offset;
+
+   return ioread32(addr);
+}
+
+int xge_port_reset(struct net_device *ndev)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+
+   xge_wr_csr(pdata, ENET_SRST, 0x3);
+   xge_wr_csr(pdata, ENET_SRST, 0x2);
+   xge_wr_csr(pdata, ENET_SRST, 0x0);
+
+   xge_wr_csr(pdata, ENET_SHIM, DEVM_ARAUX_COH | DEVM_AWAUX_COH);
+
+   return 0;
+}
+
+static void xge_traffic_resume(struct net_device *ndev)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+
+   xge_wr_csr(pdata, CFG_FORCE_LINK_STATUS_EN, 1);
+   xge_wr_csr(pdata, FORCE_LINK_STATUS, 1);
+
+   xge_wr_csr(pdata, CFG_LINK_AGGR_RESUME, 1);
+   xge_wr_csr(pdata, RX_DV_GATE_REG, 1);
+}
+
+int xge_port_init(struct net_device *ndev)
+{
+   struct xge_pdata *pdata = netdev_priv(ndev);
+
+   pdata->phy_speed = SPEED_1000;
+   xge_mac_init(pdata);
+   xge_traffic_resume(ndev);
+
+   return 0;
+}
diff --git a/drivers/net/ethernet/apm/xgene-v2/enet.h 
b/drivers/net/ethernet/apm/xgene-v2/enet.h
new file mode 100644
index 000..40371cf
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene-v2/enet.h
@@ -0,0 +1,43 @@
+/*
+ * Applied Micro X-Gene SoC Ethernet v2 Driver
+ *
+ * Copyright (c) 2017, Applied Micro Circuits Corporation
+ * Author(s): Iyappan Subramanian 
+ *   Keyur Chudgar 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#ifndef __XGENE_ENET_V2_ENET_H__
+#define __XGENE_ENET_V2_ENET_H__
+
+#define ENET_CLKEN 0xc008
+#define ENET_SRST  0xc000
+#define ENET_SHIM  0xc010
+#define CFG_MEM_RAM_SHUTDOWN   0xd070
+#define BLOCK_MEM_RDY  0xd074
+
+#define DEVM_ARAUX_COH BIT(19)
+#define DEVM_AWAUX_COH BIT(3)
+
+#define CFG_FORCE_LINK_STATUS_EN   0x229c
+#define FORCE_LINK_STATUS  0x22a0
+#define CFG_LINK_AGGR_RESUME   0x27c8
+#define RX_DV_GATE_REG 0x2dfc
+
+void xge_wr_csr(struct xge_pdata *pdata, u32 offset, u32 val);
+u32 xge_rd_csr(struct xge_pdata *pdata, u32 offset);
+int xge_port_reset(struct net_device *ndev);
+
+#endif  /* __XGENE_ENET_V2_ENET_H__ */
-- 
1.9.1

1 2 3 >

1 - 100 of 254 matches

Mail list logo