date:20150811

[PATCH iproute2] ip-link: fix a typo in help message

2015-08-11 Thread Zhang Shengju

fix a typo: "noarp" -> "arp"

Signed-off-by: Zhang Shengju 
---
 ip/iplink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/iplink.c b/ip/iplink.c
index 7e5c466..fa8aa47 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -507,7 +507,7 @@ int iplink_parse(int argc, char **argv, struct iplink_req 
*req,
} else if (strcmp(*argv, "off") == 0) {
req->i.ifi_flags |= IFF_NOARP;
} else
-   return on_off("noarp", *argv);
+   return on_off("arp", *argv);
} else if (strcmp(*argv, "vf") == 0) {
struct rtattr *vflist;
NEXT_ARG();
-- 
1.8.3.1



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next] net: atl1c: add BQL support

2015-08-11 Thread Ron Angeles

This BQL implementation is mostly derived from its related driver, alx.
Tested on AR8131 (rev c0) [1969:1063]. Saturated a 100mbps link with 5
concurrent runs of netperf. Ping latency dropped from 14ms to 3ms.

Signed-off-by: Ron Angeles 
---
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 
b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 932bd18..2795d6d 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -874,6 +874,8 @@ static void atl1c_clean_tx_ring(struct atl1c_adapter 
*adapter,
atl1c_clean_buffer(pdev, buffer_info);
}
 
+   netdev_reset_queue(adapter->netdev);
+
/* Zero out Tx-buffers */
memset(tpd_ring->desc, 0, sizeof(struct atl1c_tpd_desc) *
ring_count);
@@ -1551,6 +1553,7 @@ static bool atl1c_clean_tx_irq(struct atl1c_adapter 
*adapter,
u16 next_to_clean = atomic_read(&tpd_ring->next_to_clean);
u16 hw_next_to_clean;
u16 reg;
+   unsigned int total_bytes = 0, total_packets = 0;
 
reg = type == atl1c_trans_high ? REG_TPD_PRI1_CIDX : REG_TPD_PRI0_CIDX;
 
@@ -1558,12 +1561,18 @@ static bool atl1c_clean_tx_irq(struct atl1c_adapter 
*adapter,
 
while (next_to_clean != hw_next_to_clean) {
buffer_info = &tpd_ring->buffer_info[next_to_clean];
+   if (buffer_info->skb) {
+   total_bytes += buffer_info->skb->len;
+   total_packets++;
+   }
atl1c_clean_buffer(pdev, buffer_info);
if (++next_to_clean == tpd_ring->count)
next_to_clean = 0;
atomic_set(&tpd_ring->next_to_clean, next_to_clean);
}
 
+   netdev_completed_queue(adapter->netdev, total_packets, total_bytes);
+
if (netif_queue_stopped(adapter->netdev) &&
netif_carrier_ok(adapter->netdev)) {
netif_wake_queue(adapter->netdev);
@@ -2256,6 +2265,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb,
spin_unlock_irqrestore(&adapter->tx_lock, flags);
dev_kfree_skb_any(skb);
} else {
+   netdev_sent_queue(adapter->netdev, skb->len);
atl1c_tx_queue(adapter, skb, tpd, type);
spin_unlock_irqrestore(&adapter->tx_lock, flags);
}
-- 
2.4.6
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH iproute2 1/3] iplink: add missing link type

2015-08-11 Thread Zhang Shengju

Add missing link type "bridge_slave".

Signed-off-by: Zhang Shengju 
---
 ip/iplink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/iplink.c b/ip/iplink.c
index de5a3c9..7e5c466 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -93,7 +93,7 @@ void iplink_usage(void)
fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | 
macvlan | macvtap |\n");
fprintf(stderr, "  bridge | bond | ipoib | ip6tnl | 
ipip | sit | vxlan |\n");
fprintf(stderr, "  gre | gretap | ip6gre | ip6gretap | 
vti | nlmon |\n");
-   fprintf(stderr, "  bond_slave | ipvlan | geneve }\n");
+   fprintf(stderr, "  bond_slave | ipvlan | geneve | 
bridge_slave }\n");
}
exit(-1);
 }
-- 
1.8.3.1



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH iproute2 2/3] iplink: use the short format to print help info

2015-08-11 Thread Zhang Shengju

Allow printing the link type usage via: ip link help bridge

Signed-off-by: Zhang Shengju 
---
 ip/iplink_bridge.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/ip/iplink_bridge.c b/ip/iplink_bridge.c
index 297160c..1e69960 100644
--- a/ip/iplink_bridge.c
+++ b/ip/iplink_bridge.c
@@ -17,9 +17,9 @@
 #include "utils.h"
 #include "ip_common.h"
 
-static void explain(void)
+static void print_explain(FILE *f)
 {
-   fprintf(stderr,
+   fprintf(f,
"Usage: ... bridge [ forward_delay FORWARD_DELAY ]\n"
"  [ hello_time HELLO_TIME ]\n"
"  [ max_age MAX_AGE ]\n"
@@ -29,6 +29,11 @@ static void explain(void)
);
 }
 
+static void explain(void)
+{
+   print_explain(stderr);
+}
+
 static int bridge_parse_opt(struct link_util *lu, int argc, char **argv,
struct nlmsghdr *n)
 {
@@ -111,9 +116,16 @@ static void bridge_print_opt(struct link_util *lu, FILE 
*f, struct rtattr *tb[])
rta_getattr_u32(tb[IFLA_BR_MAX_AGE]));
 }
 
+static void bridge_print_help(struct link_util *lu, int argc, char **argv,
+   FILE *f)
+{
+   print_explain(f);
+}
+
 struct link_util bridge_link_util = {
.id = "bridge",
.maxattr= IFLA_BR_MAX,
.parse_opt  = bridge_parse_opt,
.print_opt  = bridge_print_opt,
+   .print_help = bridge_print_help,
 };
-- 
1.8.3.1



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH iproute2 0/3] iplink: shortify printing the usage of bridge

2015-08-11 Thread Zhang Shengju

This patch set enables bridge-related link types to print their usage in the
short format. Otherwise, the following long form has to be used:

  ip link { add | del | set } type TYPE help

Zhang Shengju (3):
  iplink: add missing link type
  iplink: use the short format to print help info
  iplink: shortify printing the usage of link type

 ip/iplink.c  |  2 +-
 ip/iplink_bridge.c   | 16 ++--
 ip/iplink_bridge_slave.c | 16 ++--
 3 files changed, 29 insertions(+), 5 deletions(-)

-- 
1.8.3.1



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH iproute2 3/3] iplink: shortify printing the usage of link type

2015-08-11 Thread Zhang Shengju

Allow printing the link type usage via: ip link help bridge_slave

Signed-off-by: Zhang Shengju 
---
 ip/iplink_bridge_slave.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/ip/iplink_bridge_slave.c b/ip/iplink_bridge_slave.c
index a285185..4593872 100644
--- a/ip/iplink_bridge_slave.c
+++ b/ip/iplink_bridge_slave.c
@@ -19,9 +19,9 @@
 #include "utils.h"
 #include "ip_common.h"
 
-static void explain(void)
+static void print_explain(FILE *f)
 {
-   fprintf(stderr,
+   fprintf(f,
"Usage: ... bridge_slave [ state STATE ] [ priority PRIO ] 
[cost COST ]\n"
"[ guard {on | off} ]\n"
"[ hairpin {on | off} ] \n"
@@ -32,6 +32,11 @@ static void explain(void)
);
 }
 
+static void explain(void)
+{
+   print_explain(stderr);
+}
+
 static const char *port_states[] = {
[BR_STATE_DISABLED] = "disabled",
[BR_STATE_LISTENING] = "listening",
@@ -172,10 +177,17 @@ static int bridge_slave_parse_opt(struct link_util *lu, 
int argc, char **argv,
return 0;
 }
 
+static void bridge_slave_print_help(struct link_util *lu, int argc, char 
**argv,
+   FILE *f)
+{
+   print_explain(f);
+}
+
 struct link_util bridge_slave_link_util = {
.id = "bridge",
.maxattr= IFLA_BRPORT_MAX,
.print_opt  = bridge_slave_print_opt,
.parse_opt  = bridge_slave_parse_opt,
+   .print_help = bridge_slave_print_help,
.slave  = true,
 };
-- 
1.8.3.1



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 6/6] geneve: Remove duplicate dev list

2015-08-11 Thread Pravin B Shelar

The Geneve driver maintains both a list and a hash table of Geneve devices.
The following patch removes the duplicate Geneve list and iterates
over the hash table when it is needed.

Signed-off-by: Pravin B Shelar 
---
 drivers/net/geneve.c | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index af50061..ad1cb45 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -39,7 +39,6 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with 
corrupted ECN");
 
 /* per-network namespace private data for this module */
 struct geneve_net {
-   struct list_head  geneve_list;
struct hlist_head vni_list[VNI_HASH_SIZE];
struct list_headsock_list;
 };
@@ -56,7 +55,6 @@ struct geneve_dev {
u8 ttl; /* TTL override */
u8 tos; /* TOS override */
struct sockaddr_in remote;  /* IPv4 address for link partner */
-   struct list_head   next;/* geneve's per namespace list */
__be16 dst_port;
bool   collect_md;
 };
@@ -818,7 +816,6 @@ static int geneve_configure(struct net *net, struct 
net_device *dev,
if (err)
return err;
 
-   list_add(&geneve->next, &gn->geneve_list);
hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]);
return 0;
 }
@@ -864,7 +861,6 @@ static void geneve_dellink(struct net_device *dev, struct 
list_head *head)
if (!hlist_unhashed(&geneve->hlist))
hlist_del_rcu(&geneve->hlist);
 
-   list_del(&geneve->next);
unregister_netdevice_queue(dev, head);
 }
 
@@ -952,7 +948,6 @@ static __net_init int geneve_init_net(struct net *net)
struct geneve_net *gn = net_generic(net, geneve_net_id);
unsigned int h;
 
-   INIT_LIST_HEAD(&gn->geneve_list);
INIT_LIST_HEAD(&gn->sock_list);
 
for (h = 0; h < VNI_HASH_SIZE; ++h)
@@ -964,8 +959,11 @@ static __net_init int geneve_init_net(struct net *net)
 static void __net_exit geneve_exit_net(struct net *net)
 {
struct geneve_net *gn = net_generic(net, geneve_net_id);
-   struct geneve_dev *geneve, *next;
+   struct hlist_head *vni_list_head;
+   struct hlist_node *next;
struct net_device *dev, *aux;
+   struct geneve_dev *geneve;
+   unsigned int h;
LIST_HEAD(list);
 
rtnl_lock();
@@ -975,13 +973,19 @@ static void __net_exit geneve_exit_net(struct net *net)
if (dev->rtnl_link_ops == &geneve_link_ops)
unregister_netdevice_queue(dev, &list);
 
-   /* now gather any other geneve devices that were created in this ns */
-   list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) {
-   /* If geneve->dev is in the same netns, it was already added
-* to the list by the previous loop.
+   for (h = 0; h < VNI_HASH_SIZE; ++h) {
+   vni_list_head = &gn->vni_list[h];
+
+   /* now gather any other geneve devices that were created
+* in this ns
 */
-   if (!net_eq(dev_net(geneve->dev), net))
-   unregister_netdevice_queue(geneve->dev, &list);
+   hlist_for_each_entry_safe(geneve, next, vni_list_head, hlist) {
+   /* If geneve->dev is in the same netns, it was
+* already added to the list by the previous loop.
+*/
+   if (!net_eq(dev_net(geneve->dev), net))
+   unregister_netdevice_queue(geneve->dev, &list);
+   }
}
 
/* unregister the devices gathered above */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 4/6] openvswitch: Use Geneve device.

2015-08-11 Thread Pravin B Shelar

With the help of tunnel metadata mode, OVS can directly use
Geneve devices to implement Geneve tunnels.
This patch removes all of the OVS-specific Geneve code
and makes OVS use a Geneve net_device. The basic geneve vport
is still there to handle compatibility with current
userspace applications.

Signed-off-by: Pravin B Shelar 
---
 net/openvswitch/Kconfig|   2 +-
 net/openvswitch/vport-geneve.c | 179 -
 2 files changed, 33 insertions(+), 148 deletions(-)

diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 422dc05..87b98c01 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -59,7 +59,7 @@ config OPENVSWITCH_VXLAN
 config OPENVSWITCH_GENEVE
tristate "Open vSwitch Geneve tunneling support"
depends on OPENVSWITCH
-   depends on GENEVE_CORE
+   depends on GENEVE
default OPENVSWITCH
---help---
  If you say Y here, then the Open vSwitch will be able create geneve 
vport.
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 1da3a14..fa37c95 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -26,95 +26,44 @@
 
 #include "datapath.h"
 #include "vport.h"
+#include "vport-netdev.h"
 
 static struct vport_ops ovs_geneve_vport_ops;
-
 /**
  * struct geneve_port - Keeps track of open UDP ports
- * @gs: The socket created for this port number.
- * @name: vport name.
+ * @dst_port: destination port.
  */
 struct geneve_port {
-   struct geneve_sock *gs;
-   char name[IFNAMSIZ];
+   u16 port_no;
 };
 
-static LIST_HEAD(geneve_ports);
-
 static inline struct geneve_port *geneve_vport(const struct vport *vport)
 {
return vport_priv(vport);
 }
 
-/* Convert 64 bit tunnel ID to 24 bit VNI. */
-static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
-{
-#ifdef __BIG_ENDIAN
-   vni[0] = (__force __u8)(tun_id >> 16);
-   vni[1] = (__force __u8)(tun_id >> 8);
-   vni[2] = (__force __u8)tun_id;
-#else
-   vni[0] = (__force __u8)((__force u64)tun_id >> 40);
-   vni[1] = (__force __u8)((__force u64)tun_id >> 48);
-   vni[2] = (__force __u8)((__force u64)tun_id >> 56);
-#endif
-}
-
-/* Convert 24 bit VNI to 64 bit tunnel ID. */
-static __be64 vni_to_tunnel_id(const __u8 *vni)
-{
-#ifdef __BIG_ENDIAN
-   return (vni[0] << 16) | (vni[1] << 8) | vni[2];
-#else
-   return (__force __be64)(((__force u64)vni[0] << 40) |
-   ((__force u64)vni[1] << 48) |
-   ((__force u64)vni[2] << 56));
-#endif
-}
-
-static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
-{
-   struct vport *vport = gs->rcv_data;
-   struct genevehdr *geneveh = geneve_hdr(skb);
-   int opts_len;
-   struct ip_tunnel_info tun_info;
-   __be64 key;
-   __be16 flags;
-
-   opts_len = geneveh->opt_len * 4;
-
-   flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
-   (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
-   (geneveh->oam ? TUNNEL_OAM : 0) |
-   (geneveh->critical ? TUNNEL_CRIT_OPT : 0);
-
-   key = vni_to_tunnel_id(geneveh->vni);
-
-   ip_tunnel_info_init(&tun_info, ip_hdr(skb),
-   udp_hdr(skb)->source, udp_hdr(skb)->dest,
-   key, flags, geneveh->options, opts_len);
-
-   ovs_vport_receive(vport, skb, &tun_info);
-}
-
 static int geneve_get_options(const struct vport *vport,
  struct sk_buff *skb)
 {
struct geneve_port *geneve_port = geneve_vport(vport);
-   struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk);
 
-   if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport)))
+   if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no))
return -EMSGSIZE;
return 0;
 }
 
-static void geneve_tnl_destroy(struct vport *vport)
+static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct ip_tunnel_info *egress_tun_info)
 {
struct geneve_port *geneve_port = geneve_vport(vport);
+   struct net *net = ovs_dp_get_net(vport->dp);
+   __be16 dport = htons(geneve_port->port_no);
+   __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
 
-   geneve_sock_release(geneve_port->gs);
-
-   ovs_vport_deferred_free(vport);
+   return ovs_tunnel_get_egress_info(egress_tun_info,
+ ovs_dp_get_net(vport->dp),
+ OVS_CB(skb)->egress_tun_info,
+ IPPROTO_UDP, skb->mark, sport, dport);
 }
 
 static struct vport *geneve_tnl_create(const struct vport_parms *parms)
@@ -122,11 +71,11 @@ static struct vport *geneve_tnl_create(const struct 
vport_parms *parms)
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;

[PATCH net-next 5/6] geneve: Consolidate Geneve functionality in single module.

2015-08-11 Thread Pravin B Shelar

The geneve_core module handles send and receive functionality.
This way OVS could use the Geneve API. Now, with the use of
tunnel metadata mode, OVS can directly use the Geneve netdevice,
so there is no need for a separate module for Geneve. The following
patch consolidates Geneve protocol processing in a single module.

Signed-off-by: Pravin B Shelar 
---
 drivers/net/Kconfig|   2 +-
 drivers/net/geneve.c   | 433 ---
 include/net/geneve.h   |  34 
 net/ipv4/Kconfig   |  14 --
 net/ipv4/Makefile  |   1 -
 net/ipv4/geneve_core.c | 447 -
 6 files changed, 375 insertions(+), 556 deletions(-)
 delete mode 100644 net/ipv4/geneve_core.c

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index c18f9e6..0002ab7 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -181,7 +181,7 @@ config VXLAN
 
 config GENEVE
tristate "Generic Network Virtualization Encapsulation netdev"
-   depends on INET && GENEVE_CORE
+   depends on INET
select NET_IP_TUNNEL
---help---
  This allows one to create geneve virtual interfaces that provide
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index a463383..af50061 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define GENEVE_NETDEV_VER  "0.6"
 
@@ -33,13 +34,18 @@ static bool log_ecn_error = true;
 module_param(log_ecn_error, bool, 0644);
 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 
+#define GENEVE_VER 0
+#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
+
 /* per-network namespace private data for this module */
 struct geneve_net {
struct list_head  geneve_list;
struct hlist_head vni_list[VNI_HASH_SIZE];
-   struct geneve_dev __rcu *collect_md_tun;
+   struct list_headsock_list;
 };
 
+static int geneve_net_id;
+
 /* Pseudo network device */
 struct geneve_dev {
struct hlist_node  hlist;   /* vni hash table */
@@ -55,7 +61,15 @@ struct geneve_dev {
bool   collect_md;
 };
 
-static int geneve_net_id;
+struct geneve_sock {
+   struct geneve_dev __rcu *collect_md_tun;
+   struct geneve_net   *gn;
+   struct list_headlist;
+   struct socket   *sock;
+   struct rcu_head rcu;
+   int refcnt;
+   struct udp_offload  udp_offloads;
+};
 
 static inline __u32 geneve_net_vni_hash(u8 vni[3])
 {
@@ -77,6 +91,7 @@ static __be64 vni_to_tunnel_id(const __u8 *vni)
 }
 
 static struct geneve_dev *geneve_lookup(struct geneve_net *gn,
+   struct geneve_sock *gs,
struct iphdr *iph,
struct genevehdr *gnvh)
 {
@@ -94,23 +109,28 @@ static struct geneve_dev *geneve_lookup(struct geneve_net 
*gn,
}
}
 
-   return rcu_dereference(gn->collect_md_tun);
+   return rcu_dereference(gs->collect_md_tun);
+}
+
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+   return (struct genevehdr *)(udp_hdr(skb) + 1);
 }
 
 /* geneve receive/decap routine */
 static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
 {
struct genevehdr *gnvh = geneve_hdr(skb);
+   struct geneve_net *gn = gs->gn;
struct metadata_dst *tun_dst = NULL;
struct geneve_dev *geneve = NULL;
struct pcpu_sw_netstats *stats;
-   struct geneve_net *gn;
struct iphdr *iph;
+   bool xnet;
int err;
 
iph = ip_hdr(skb); /* Still outer IP header... */
-   gn = gs->rcv_data;
-   geneve = geneve_lookup(gn, iph, gnvh);
+   geneve = geneve_lookup(gn, gs, iph, gnvh);
if (!geneve)
goto drop;
 
@@ -132,16 +152,18 @@ static void geneve_rx(struct geneve_sock *gs, struct 
sk_buff *skb)
opts = ip_tunnel_info_opts(&tun_dst->u.tun_info,
   gnvh->opt_len * 4);
memcpy(opts, gnvh->options, gnvh->opt_len * 4);
+   xnet = false;
} else {
/* Drop packets w/ critical options,
 * since we don't support any...
 */
if (gnvh->critical)
goto drop;
+   xnet = !net_eq(geneve->net, dev_net(geneve->dev));
}
 
skb_reset_mac_header(skb);
-   skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev)));
+   skb_scrub_packet(skb, xnet);
skb->protocol = eth_type_trans(skb, geneve->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
@@ -173,7 +195,6 @@ static void geneve_rx(struct geneve_sock *gs, struct 
sk_buff *skb)
skb_dst_set(skb, (struct dst_entry *)tun_dst);
 
netif_rx(skb);
-
return;
 drop:
if (geneve

[PATCH net-next 0/6] Geneve: Add support for tunnel metadata.

2015-08-11 Thread Pravin B Shelar

The following patches add support for Geneve tunnel metadata
mode. OVS can make use of the Geneve net-device with the tunnel
metadata API from the kernel.

This also allows us to consolidate the Geneve implementation
from two kernel modules, geneve_core and geneve, into a single
geneve module. The geneve_core module was targeted to share
Geneve encap and decap code between the Geneve netdevice and
the OVS Geneve tunnel implementation. Since OVS no longer
needs this API, the Geneve code can be consolidated into a
single geneve module.


Pravin B Shelar (6):
  tunnel: introduce udp_tun_rx_dst()
  geneve: Make dst-port configurable.
  geneve: Add support to collect tunnel metadata.
  openvswitch: Use Geneve device.
  geneve: Consolidate Geneve functionality in single module.
  geneve: Remove duplicate dev list

 drivers/net/Kconfig|   2 +-
 drivers/net/geneve.c   | 731 ++---
 drivers/net/vxlan.c|  18 +-
 include/net/dst_metadata.h |  27 ++
 include/net/geneve.h   |  35 +-
 include/net/udp_tunnel.h   |   3 +
 include/uapi/linux/if_link.h   |   2 +
 net/ipv4/Kconfig   |  14 -
 net/ipv4/Makefile  |   1 -
 net/ipv4/geneve_core.c | 447 -
 net/ipv4/ip_gre.c  |  21 +-
 net/ipv4/udp_tunnel.c  |  24 +-
 net/openvswitch/Kconfig|   2 +-
 net/openvswitch/vport-geneve.c | 179 ++
 14 files changed, 719 insertions(+), 787 deletions(-)
 delete mode 100644 net/ipv4/geneve_core.c

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 3/6] geneve: Add support to collect tunnel metadata.

2015-08-11 Thread Pravin B Shelar

The following patch creates a new tunnel flag which enables
tunnel metadata collection on a given device. These devices
can be used by tunnel-metadata-based routing or by OVS.

Signed-off-by: Pravin B Shelar 
---
 drivers/net/geneve.c | 350 ---
 include/net/geneve.h |   3 +
 include/uapi/linux/if_link.h |   1 +
 3 files changed, 270 insertions(+), 84 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 5e9bab8..a463383 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -36,6 +37,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with 
corrupted ECN");
 struct geneve_net {
struct list_head  geneve_list;
struct hlist_head vni_list[VNI_HASH_SIZE];
+   struct geneve_dev __rcu *collect_md_tun;
 };
 
 /* Pseudo network device */
@@ -50,6 +52,7 @@ struct geneve_dev {
struct sockaddr_in remote;  /* IPv4 address for link partner */
struct list_head   next;/* geneve's per namespace list */
__be16 dst_port;
+   bool   collect_md;
 };
 
 static int geneve_net_id;
@@ -62,40 +65,80 @@ static inline __u32 geneve_net_vni_hash(u8 vni[3])
return hash_32(vnid, VNI_HASH_BITS);
 }
 
-/* geneve receive/decap routine */
-static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
+static __be64 vni_to_tunnel_id(const __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+   return (vni[0] << 16) | (vni[1] << 8) | vni[2];
+#else
+   return (__force __be64)(((__force u64)vni[0] << 40) |
+   ((__force u64)vni[1] << 48) |
+   ((__force u64)vni[2] << 56));
+#endif
+}
+
+static struct geneve_dev *geneve_lookup(struct geneve_net *gn,
+   struct iphdr *iph,
+   struct genevehdr *gnvh)
 {
-   struct genevehdr *gnvh = geneve_hdr(skb);
-   struct geneve_dev *dummy, *geneve = NULL;
-   struct geneve_net *gn;
-   struct iphdr *iph = NULL;
-   struct pcpu_sw_netstats *stats;
struct hlist_head *vni_list_head;
-   int err = 0;
+   struct geneve_dev *geneve;
__u32 hash;
 
-   iph = ip_hdr(skb); /* Still outer IP header... */
-
-   gn = gs->rcv_data;
-
/* Find the device for this VNI */
hash = geneve_net_vni_hash(gnvh->vni);
vni_list_head = &gn->vni_list[hash];
-   hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) {
-   if (!memcmp(gnvh->vni, dummy->vni, sizeof(dummy->vni)) &&
-   iph->saddr == dummy->remote.sin_addr.s_addr) {
-   geneve = dummy;
-   break;
+   hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
+   if (!memcmp(gnvh->vni, geneve->vni, sizeof(geneve->vni)) &&
+   iph->saddr == geneve->remote.sin_addr.s_addr) {
+   return geneve;
}
}
+
+   return rcu_dereference(gn->collect_md_tun);
+}
+
+/* geneve receive/decap routine */
+static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
+{
+   struct genevehdr *gnvh = geneve_hdr(skb);
+   struct metadata_dst *tun_dst = NULL;
+   struct geneve_dev *geneve = NULL;
+   struct pcpu_sw_netstats *stats;
+   struct geneve_net *gn;
+   struct iphdr *iph;
+   int err;
+
+   iph = ip_hdr(skb); /* Still outer IP header... */
+   gn = gs->rcv_data;
+   geneve = geneve_lookup(gn, iph, gnvh);
if (!geneve)
goto drop;
 
-   /* Drop packets w/ critical options,
-* since we don't support any...
-*/
-   if (gnvh->critical)
-   goto drop;
+   if (geneve->collect_md) {
+   __be16 flags;
+   void *opts;
+
+   flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
+   (gnvh->oam ? TUNNEL_OAM : 0) |
+   (gnvh->critical ? TUNNEL_CRIT_OPT : 0);
+
+   tun_dst = udp_tun_rx_dst(skb, flags,
+vni_to_tunnel_id(gnvh->vni),
+gnvh->opt_len * 4);
+   if (!tun_dst)
+   goto drop;
+
+   /* Update tunnel dst according to Geneve options. */
+   opts = ip_tunnel_info_opts(&tun_dst->u.tun_info,
+  gnvh->opt_len * 4);
+   memcpy(opts, gnvh->options, gnvh->opt_len * 4);
+   } else {
+   /* Drop packets w/ critical options,
+* since we don't support any...
+*/
+   if (gnvh->critical)
+   goto drop;
+   }
 
skb_reset_mac_header(skb);
skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev)));
@@ -103,7 +146,8 @@ static void geneve_rx(struct g

[PATCH net-next 1/6] tunnel: introduce udp_tun_rx_dst()

2015-08-11 Thread Pravin B Shelar

Introduce function udp_tun_rx_dst() to initialize tunnel dst on
receive path.

Signed-off-by: Pravin B Shelar 
---
 drivers/net/vxlan.c| 18 ++
 include/net/dst_metadata.h | 27 +++
 include/net/udp_tunnel.h   |  3 +++
 net/ipv4/ip_gre.c  | 21 +
 net/ipv4/udp_tunnel.c  | 24 +++-
 5 files changed, 60 insertions(+), 33 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 06c0731..94a12ef 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1269,26 +1269,12 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
}
 
if (vxlan_collect_metadata(vs)) {
-   const struct iphdr *iph = ip_hdr(skb);
-
-   tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
+   tun_dst = udp_tun_rx_dst(skb, TUNNEL_KEY,
+cpu_to_be64(vni >> 8), sizeof(*md));
if (!tun_dst)
goto drop;
 
info = &tun_dst->u.tun_info;
-   info->key.ipv4_src = iph->saddr;
-   info->key.ipv4_dst = iph->daddr;
-   info->key.ipv4_tos = iph->tos;
-   info->key.ipv4_ttl = iph->ttl;
-   info->key.tp_src = udp_hdr(skb)->source;
-   info->key.tp_dst = udp_hdr(skb)->dest;
-
-   info->mode = IP_TUNNEL_INFO_RX;
-   info->key.tun_flags = TUNNEL_KEY;
-   info->key.tun_id = cpu_to_be64(vni >> 8);
-   if (udp_hdr(skb)->check != 0)
-   info->key.tun_flags |= TUNNEL_CSUM;
-
md = ip_tunnel_info_opts(info, sizeof(*md));
} else {
memset(md, 0, sizeof(*md));
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 075f523..c0934bd 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -53,4 +53,31 @@ static inline bool skb_valid_dst(const struct sk_buff *skb)
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t 
flags);
 
+static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
+__be16 flags,
+__be64 tunnel_id,
+int md_size)
+{
+   const struct iphdr *iph = ip_hdr(skb);
+   struct metadata_dst *tun_dst;
+   struct ip_tunnel_info *info;
+
+   tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC);
+   if (!tun_dst)
+   return NULL;
+
+   info = &tun_dst->u.tun_info;
+   info->key.ipv4_src = iph->saddr;
+   info->key.ipv4_dst = iph->daddr;
+   info->key.ipv4_tos = iph->tos;
+   info->key.ipv4_ttl = iph->ttl;
+
+   info->mode = IP_TUNNEL_INFO_RX;
+   info->key.tun_flags = flags;
+   info->key.tun_id = tunnel_id;
+   info->key.tp_src = 0;
+   info->key.tp_dst = 0;
+   return tun_dst;
+}
+
 #endif /* __NET_DST_METADATA_H */
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index c491c12..0ca17873 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -93,6 +93,9 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock 
*sk,
 
 void udp_tunnel_sock_release(struct socket *sock);
 
+struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, __be16 flags,
+__be64 tunnel_id, int md_size);
+
 static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb,
 bool udp_csum)
 {
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index fb44d69..51f722a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -400,25 +400,14 @@ static int ipgre_rcv(struct sk_buff *skb, const struct 
tnl_ptk_info *tpi)
if (tunnel) {
skb_pop_mac_header(skb);
if (tunnel->collect_md) {
-   struct ip_tunnel_info *info;
+   __be16 flags;
+   __be64 tun_id;
 
-   tun_dst = metadata_dst_alloc(0, GFP_ATOMIC);
+   flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+   tun_id = key_to_tunnel_id(tpi->key);
+   tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
return PACKET_REJECT;
-
-   info = &tun_dst->u.tun_info;
-   info->key.ipv4_src = iph->saddr;
-   info->key.ipv4_dst = iph->daddr;
-   info->key.ipv4_tos = iph->tos;
-   info->key.ipv4_ttl = iph->ttl;
-
-   info->mode = IP_TUNNEL_INFO_RX;
-   info->key.tun_flags = tpi->flags &
- (TUNNEL_CSUM |

[PATCH net-next 2/6] geneve: Make dst-port configurable.

2015-08-11 Thread Pravin B Shelar

Add a netlink interface to configure the Geneve UDP port number,
so that the user can configure it for a Geneve device.

Signed-off-by: Pravin B Shelar 
---
 drivers/net/geneve.c | 14 --
 include/uapi/linux/if_link.h |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 78d49d1..5e9bab8 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -49,6 +49,7 @@ struct geneve_dev {
u8 tos; /* TOS override */
struct sockaddr_in remote;  /* IPv4 address for link partner */
struct list_head   next;/* geneve's per namespace list */
+   __be16 dst_port;
 };
 
 static int geneve_net_id;
@@ -157,7 +158,7 @@ static int geneve_open(struct net_device *dev)
struct geneve_net *gn = net_generic(geneve->net, geneve_net_id);
struct geneve_sock *gs;
 
-   gs = geneve_sock_add(net, htons(GENEVE_UDP_PORT), geneve_rx, gn,
+   gs = geneve_sock_add(net, geneve->dst_port, geneve_rx, gn,
 false, false);
if (IS_ERR(gs))
return PTR_ERR(gs);
@@ -225,7 +226,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct 
net_device *dev)
/* no need to handle local destination and encap bypass...yet... */
 
err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr,
- tos, ttl, 0, sport, htons(GENEVE_UDP_PORT), 0,
+ tos, ttl, 0, sport, geneve->dst_port, 0,
  geneve->vni, 0, NULL, false,
  !net_eq(geneve->net, dev_net(geneve->dev)));
if (err < 0)
@@ -378,6 +379,11 @@ static int geneve_newlink(struct net *net, struct 
net_device *dev,
if (data[IFLA_GENEVE_TOS])
geneve->tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
+   if (data[IFLA_GENEVE_PORT])
+   geneve->dst_port = htons(nla_get_u16(data[IFLA_GENEVE_PORT]));
+   else
+   geneve->dst_port = htons(GENEVE_UDP_PORT);
+
list_add(&geneve->next, &gn->geneve_list);
 
hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]);
@@ -402,6 +408,7 @@ static size_t geneve_get_size(const struct net_device *dev)
nla_total_size(sizeof(struct in_addr)) + /* IFLA_GENEVE_REMOTE 
*/
nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
+   nla_total_size(sizeof(__u16)) +  /* IFLA_GENEVE_PORT */
0;
 }
 
@@ -422,6 +429,9 @@ static int geneve_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos))
goto nla_put_failure;
 
+   if (nla_put_u32(skb, IFLA_GENEVE_PORT, ntohs(geneve->dst_port)))
+   goto nla_put_failure;
+
return 0;
 
 nla_put_failure:
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d450be3..26d4412 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -400,6 +400,7 @@ enum {
IFLA_GENEVE_REMOTE,
IFLA_GENEVE_TTL,
IFLA_GENEVE_TOS,
+   IFLA_GENEVE_PORT,   /* destination port */
__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX(__IFLA_GENEVE_MAX - 1)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipv6_mc_check_mld - kernel BUG at net/core/skbuff.c:1128

2015-08-11 Thread David Miller

From: Linus Lüssing 
Date: Tue, 11 Aug 2015 23:47:25 +0200

> On Tue, Aug 11, 2015 at 10:51:40PM +0200, Linus Lüssing wrote:
>> On Mon, Aug 10, 2015 at 02:56:12PM -0700, Brenden Blanco wrote:
>> > Doing some code reading with Alexei, we found a suspect commit, which
>> > introduces an skb_get and skb_may_pull of the same skb, which leads to the 
>> > BUG
>> > when skb->len == len.
>> 
>> Urgh, didn't know that pskb_may_pull() doesn't like an skb with a
>> reference count greater than one... But yes, the BUG() call in
>> skbuff.c:1128 / pskb_expand_head() says that (though in this case
>> the BUG() in skbuff.c call actually seems kinda weird (/"wrong"?), as
>> it isn't shared between different code paths).
> 
> The more I think about it, I'm tending to remove the BUG() call in
> pskb_expand_head() as in this case it obviously isn't a bug.

Calling pskb_expand_head() with a shared SKB is absolutely,
positively, a bug.  You just don't understand why it is.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] net: Use SK_MEM_QUANTUM as minimum for tcp/udp rmem/wmem

2015-08-11 Thread Calvin Owens

Commit 8133534c760d4083 ("net: limit tcp/udp rmem/wmem to
SOCK_{RCV,SND}BUF_MIN") modified four sysctls to enforce that the values
written to them are not less than SOCK_MIN_{RCV,SND}BUF.

That change causes 4096 (or SK_MEM_QUANTUM) to no longer be accepted as
a valid value for 'min' in tcp_wmem and udp_wmem_min. 4096 has been the
default for both of those sysctls for a long time, and unfortunately
seems to be an extremely popular setting. This change breaks a large
number of sysctl configurations at FB.

That commit referred to b1cb59cf2efe7971 ("net: sysctl_net_core: check
SNDBUF and RCVBUF for min length"), which chose to use the SOCK_MIN
constants as the lower limits to avoid nasty bugs. But AFAICS, a limit
of SOCK_MIN_SNDBUF isn't necessary to do that: the BUG_ON cited in the
commit message seems to have happened because unix_stream_sendmsg()
expects a minimum of a full page (ie SK_MEM_QUANTUM) and the math broke,
not because it had less than SOCK_MIN_SNDBUF allocated.

Nothing seems to assume that it has at least SOCK_MIN_SNDBUF to play
with, so I think enforcing a minimum of SK_MEM_QUANTUM avoids the sort
of bugs 8133534c was trying to avoid, and it does so without breaking
anybody's sysctl configurations.

Fixes: 8133534c760d4083 ("net: limit tcp/udp rmem/wmem to SOCK_MIN...")
Signed-off-by: Calvin Owens 
---
 net/ipv4/sysctl_net_ipv4.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 433231c..a214b6a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -41,8 +41,7 @@ static int tcp_syn_retries_min = 1;
 static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
-static int min_sndbuf = SOCK_MIN_SNDBUF;
-static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int min_buf = SK_MEM_QUANTUM;
 
 /* Update system visible IP port range */
 static void set_local_port_range(struct net *net, int range[2])
@@ -530,7 +529,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_tcp_wmem),
.mode   = 0644,
.proc_handler   = proc_dointvec_minmax,
-   .extra1 = &min_sndbuf,
+   .extra1 = &min_buf,
},
{
.procname   = "tcp_notsent_lowat",
@@ -545,7 +544,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_tcp_rmem),
.mode   = 0644,
.proc_handler   = proc_dointvec_minmax,
-   .extra1 = &min_rcvbuf,
+   .extra1 = &min_buf,
},
{
.procname   = "tcp_app_win",
@@ -758,7 +757,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_rmem_min),
.mode   = 0644,
.proc_handler   = proc_dointvec_minmax,
-   .extra1 = &min_rcvbuf,
+   .extra1 = &min_buf,
},
{
.procname   = "udp_wmem_min",
@@ -766,7 +765,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_wmem_min),
.mode   = 0644,
.proc_handler   = proc_dointvec_minmax,
-   .extra1 = &min_sndbuf,
+   .extra1 = &min_buf,
},
{ }
 };
-- 
1.8.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net] netconsole: Check for carrier before calling netpoll_send_udp()

2015-08-11 Thread Jonathan Maxwell

> I personally think that drivers need to synchronize such things
> internally.  They are the only entity which knows when it's "OK"
> to do whatever the netpoll method does, and they are also the only
> entity which can properly synchronize such checks.

Thanks, agreed. I am testing the following ixgbe patch on my reproducer
that checks for resetting/removing/down state flags in ixgbe_poll() and
bails if true. It does that check in other ixgbe routines as well. It's working
fine so far. We will need to do something similar for vmxnet3 as well and
possibly other drivers.

--- a/drivers/net/ixgbe/ixgbe_main.c 2015-08-10 17:13:02.899400508 +1000
+++ b/drivers/net/ixgbe/ixgbe_main.c.patch 2015-08-12 11:34:49.951053887 +1000
@@ -2672,6 +2672,11 @@
  int per_ring_budget;
  bool clean_complete = true;

+ if (test_bit(__IXGBE_DOWN, &adapter->state) ||
+test_bit(__IXGBE_REMOVING, &adapter->state) ||
+test_bit(__IXGBE_RESETTING, &adapter->state))
+ return budget;
+
 #ifdef CONFIG_IXGBE_DCA
  if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
  ixgbe_update_dca(q_vector);
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v7 5/5] samples/bpf: example of get selected PMU counter value

2015-08-11 Thread Alexei Starovoitov


On 8/6/15 12:02 AM, Kaixu Xia wrote:

This is a simple example and shows how to use the new ability
to get the selected Hardware PMU counter value.

Signed-off-by: Kaixu Xia 
---
  samples/bpf/Makefile   |  4 +++
  samples/bpf/bpf_helpers.h  |  2 ++
  samples/bpf/tracex6_kern.c | 26 ++
  samples/bpf/tracex6_user.c | 68 ++


1.
I see a bunch of warnings building it:
  HOSTCC  samples/bpf/tracex6_user.o
../samples/bpf/tracex6_user.c: In function ‘test_bpf_perf_event’:
../samples/bpf/tracex6_user.c:49:2: warning: passing argument 1 of 
‘close’ makes integer from pointer without a cast [enabled by default]

In file included from ../samples/bpf/tracex6_user.c:2:0:
/usr/include/unistd.h:354:12: note: expected ‘int’ but argument is of 
type ‘int *’
../samples/bpf/tracex6_user.c:20:16: warning: unused variable ‘value’ 
[-Wunused-variable]
../samples/bpf/tracex6_user.c:42:8: warning: ignoring return value of 
‘system’, declared with attribute warn_unused_result [-Wunused-result]
../samples/bpf/tracex6_user.c:43:8: warning: ignoring return value of 
‘system’, declared with attribute warn_unused_result [-Wunused-result]
../samples/bpf/tracex6_user.c:44:8: warning: ignoring return value of 
‘system’, declared with attribute warn_unused_result [-Wunused-result]

  HOSTLD  samples/bpf/tracex6
  HOSTCC  samples/bpf/lathist_user.o
  HOSTLD  samples/bpf/lathist
clang  -nostdinc -isystem /usr/lib/gcc/x86_64-linux-gnu/4.7/include 
-I../arch/x86/include -Iarch/x86/include/generated/uapi 
-Iarch/x86/include/generated  -I../include -Iinclude 
-I../arch/x86/include/uapi -Iarch/x86/include/generated/uapi 
-I../include/uapi -Iinclude/generated/uapi -include 
../include/linux/kconfig.h  \

-D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
		-O2 -emit-llvm -c ../samples/bpf/tracex6_kern.c -o -| 
../tools/bpf/llvm/bld/Debug+Asserts/bin/llc -march=bpf -filetype=obj -o 
samples/bpf/tracex6_kern.o
../samples/bpf/tracex6_kern.c:13:22: warning: declaration of 'struct 
pt_regs' will not be visible outside of this function [-Wvisibility]

int bpf_prog1(struct pt_regs *ctx)

Please fix.

2.
the example is incomplete.
Please add read_trace_pipe() otherwise it exits without printing
anything useful.

3.
please replace ls and pwd with ls > /dev/null
the spam on the screen is unnecessary.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [net-next PATCH 1/3] net: make default tx_queue_len configurable

2015-08-11 Thread Alexei Starovoitov

On Tue, Aug 11, 2015 at 06:23:35PM +0200, Phil Sutter wrote:
> 
> I have an unfinished solution in the oven, but being kept busy with
> other things for now. The action plan is as follows:
> 
> 1) Introduce IFF_NO_QUEUE net_device->priv_flag.
> 2) Have attach_default_qdiscs() and attach_one_default_qdisc() treat
>IFF_NO_QUEUE as alternative to tx_queue_len == 0.
> 3) Add warning to register_netdevice() if tx_queue_len == 0.
> 4) Change virtual NIC drivers to set IFF_NO_QUEUE and leave tx_queue_len
>alone.
> 5) Eventually drop all special handling for tx_queue_len == 0.
> 
> I am currently somewhere in 2) and need to implement 4) for veth as PoC to
> check if 2) suffices in all situations we want. Not sure if 3) is
> desirable at all or if there are valid cases for a literally zero
> length TX queue length.

sounds like you want to change default qdisc from pfifo_fast to noqueue
for veth, right?
In general 'changing the default' may be an acceptable thing, but then
it needs to be strongly justified. How much performance does it bring?
Also why introduce the flag? Why not just add 'tx_queue_len = 0;' 
to veth_setup() like the whole bunch of devices do?

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipv6_mc_check_mld - kernel BUG at net/core/skbuff.c:1128

2015-08-11 Thread Alexei Starovoitov

On Tue, Aug 11, 2015 at 11:47:25PM +0200, Linus Lüssing wrote:
> On Tue, Aug 11, 2015 at 10:51:40PM +0200, Linus Lüssing wrote:
> > On Mon, Aug 10, 2015 at 02:56:12PM -0700, Brenden Blanco wrote:
> > > Doing some code reading with Alexei, we found a suspect commit, which
> > > introduces an skb_get and skb_may_pull of the same skb, which leads to 
> > > the BUG
> > > when skb->len == len.
> > 
> > Urgh, didn't know that pskb_may_pull() doesn't like an skb with a
> > reference count greater than one... But yes, the BUG() call in
> > skbuff.c:1128 / pskb_expand_head() says that (though in this case
> > the BUG() in skbuff.c call actually seems kinda weird (/"wrong"?), as
> > it isn't shared between different code paths).
> 
> The more I think about it, I'm tending to remove the BUG() call in
> pskb_expand_head() as in this case it obviously isn't a bug.
> 
> The skb_get() allows a simple and in my opinion easy to read cleanup
> part of skb_trimmed for any caller of ip{v6,}_mc_check_mld(). No need
> to check whether skb == skb_trimmed for a caller for instance,
> simply checking whether skb_trimmed exists is enough.
> 
> 
> Any objections to remove the "if (skb_shared(skb)) BUG()" part in
> pskb_expand_head()? Or would there be any other undesired side
> effects in utilising skb_get() like that?

That fundamental check was there for 10+ years and cannot be removed.
bridge already did skb_share_check() before reaching this
__ipv6_mc_check_mld() path.
There is no reason to do skb_get() there.
It wasn't there before commit 9afd85c9e4552 which claims to do:
'Some small refactoring was done to enhance readibility',
but does skb_get()+pskb_may_pull(), which is incorrect.
Avoiding unnecessary skb_clone() is a good thing, but it should be
done without messing with skb->users, since this code path
already owns skb.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv2 0/3] gianfar: filer changes

2015-08-11 Thread Jakub Kicinski

From: Jakub Kicinski 

Hi,

respinning with examples as requested.

Jakub Kicinski (3):
  gianfar: correct filer table writing
  gianfar: correct list membership accounting
  gianfar: remove faulty filer optimizer

 drivers/net/ethernet/freescale/gianfar_ethtool.c | 345 +--
 1 file changed, 4 insertions(+), 341 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv2 3/3] gianfar: remove faulty filer optimizer

2015-08-11 Thread Jakub Kicinski

From: Jakub Kicinski 

Current filer rule optimization is broken in several ways:
 (1) Can perform reads/writes beyond end of allocated tables.
 (gianfar_ethtool.c:1326).

(2) It breaks badly for rules with more than 2 specifiers
 (e.g. matching ip, port, tos).

Example:
# ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.1 dst-port 1 tos 1 action 1
Added rule with ID 254
# ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.2 dst-port 2 tos 2 action 9
Added rule with ID 253
# ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.3 dst-port 3 tos 3 action 17
Added rule with ID 252
# ./filer_decode /sys/kernel/debug/gfar1/filer_raw
00: MASK == 0210 AND Q:00   ctrl:0080 prop:0210
01: FPR  == 0210 AND CLE Q:00   ctrl:0281 prop:0210
02: MASK ==  AND Q:00   ctrl:0080 prop:
03: DPT  == 0003 AND Q:00   ctrl:008e prop:0003
04: TOS  == 0003 AND Q:00   ctrl:008a prop:0003
05: DIA  == 0a03 AND Q:11   ctrl:448c prop:0a03
06: DPT  == 0002 AND Q:00   ctrl:008e prop:0002
07: TOS  == 0002 AND Q:00   ctrl:008a prop:0002
08: DIA  == 0a02 AND Q:09   ctrl:248c prop:0a02
09: DIA  == 0a01 AND Q:00   ctrl:008c prop:0a01
0a: DPT  == 0001 AND Q:00   ctrl:008e prop:0001
0b: TOS  == 0001 CLE Q:01   ctrl:060a prop:0001
ff: MASK >=  Q:00   ctrl:0020 prop:

(Entire cluster gets AND-ed together).

 (3) We observed that the masking rules it generates do not
 play well with clustering on P2020.  Only first rule
 of the cluster would ever fire.  Given that optimizer
 relies heavily on masking this is very hard to fix.

Example:
# ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.1 dst-port 1  action 1
Added rule with ID 254
# ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.2 dst-port 2  action 9
Added rule with ID 253
# ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.3 dst-port 3  action 17
Added rule with ID 252
# ./filer_decode /sys/kernel/debug/gfar1/filer_raw
00: MASK == 0210 AND Q:00   ctrl:0080 prop:0210
01: FPR  == 0210 AND CLE Q:00   ctrl:0281 prop:0210
02: MASK ==  AND Q:00   ctrl:0080 prop:
03: DPT  == 0003 AND Q:00   ctrl:008e prop:0003
04: DIA  == 0a03 Q:11   ctrl:440c prop:0a03
05: DPT  == 0002 AND Q:00   ctrl:008e prop:0002
06: DIA  == 0a02 Q:09   ctrl:240c prop:0a02
07: DIA  == 0a01 AND Q:00   ctrl:008c prop:0a01
08: DPT  == 0001 CLE Q:01   ctrl:060e prop:0001
ff: MASK >=  Q:00   ctrl:0020 prop:

Which looks correct according to the spec but only the first
(eth id 252)/last added rule for 10.0.0.3 will ever trigger.
As if filer did not treat the AND CLE as cluster start but
also kept AND-ing the rules.  We found no errata covering this.


The fact that nobody noticed (2) or (3) makes me think
that this feature is not very widely used and we should just
remove it.

Reported-by: Aleksander Dutkowski 
Signed-off-by: Jakub Kicinski 
---
v2: - add examples
- drop rule reordering from problems as it doesn't
  actually happen
---
 drivers/net/ethernet/freescale/gianfar_ethtool.c | 337 ---
 1 file changed, 337 deletions(-)

diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c 
b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index b955ed83ca98..6bdc89179b72 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -902,27 +902,6 @@ static int gfar_check_filer_hardware(struct gfar_private 
*priv)
return 0;
 }
 
-static int gfar_comp_asc(const void *a, const void *b)
-{
-   return memcmp(a, b, 4);
-}
-
-static int gfar_comp_desc(const void *a, const void *b)
-{
-   return -memcmp(a, b, 4);
-}
-
-static void gfar_swap(void *a, void *b, int size)
-{
-   u32 *_a = a;
-   u32 *_b = b;
-
-   swap(_a[0], _b[0]);
-   swap(_a[1], _b[1]);
-   swap(_a[2], _b[2]);
-   swap(_a[3], _b[3]);
-}
-
 /* Write a mask to filer cache */
 static void gfar_set_mask(u32 mask, struct filer_table *tab)
 {
@@ -1272,310 +1251,6 @@ static int gfar_convert_to_filer(struct 
ethtool_rx_flow_spec *rule,
return 0;
 }
 
-/* Copy size filer entries */
-static void gfar_copy_filer_entries(struct gfar_filer_entry dst[0],
-   struct gfar_filer_entry src[0], s32 size)
-{
-   while (size > 0) {
-   size--;
-   dst[size].ctrl = src[size].ctrl;
-   dst[size].prop = src[size].prop;
-   }
-}
-
-/* D

[PATCHv2 2/3] gianfar: correct list membership accounting

2015-08-11 Thread Jakub Kicinski

From: Jakub Kicinski 

At a cost of one line let's make sure .count is correct
when calling gfar_process_filer_changes().

Signed-off-by: Jakub Kicinski 
---
v2: no change
---
 drivers/net/ethernet/freescale/gianfar_ethtool.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c 
b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index e543d3b01838..b955ed83ca98 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -1723,13 +1723,14 @@ static int gfar_add_cls(struct gfar_private *priv,
}
 
 process:
+   priv->rx_list.count++;
ret = gfar_process_filer_changes(priv);
if (ret)
goto clean_list;
-   priv->rx_list.count++;
return ret;
 
 clean_list:
+   priv->rx_list.count--;
list_del(&temp->list);
 clean_mem:
kfree(temp);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv2 1/3] gianfar: correct filer table writing

2015-08-11 Thread Jakub Kicinski

From: Jakub Kicinski 

MAX_FILER_IDX is the last usable index.  Using less-than
will already guarantee that one entry for catch-all rule
will be left, no need to subtract 1 here.

Signed-off-by: Jakub Kicinski 
---
v2: no change
---
 drivers/net/ethernet/freescale/gianfar_ethtool.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c 
b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index 555e461b0cfe..e543d3b01838 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -1585,11 +1585,10 @@ static int gfar_write_filer_table(struct gfar_private 
*priv,
return -EBUSY;
 
/* Fill regular entries */
-   for (; i < MAX_FILER_IDX - 1 && (tab->fe[i].ctrl | tab->fe[i].prop);
-i++)
+   for (; i < MAX_FILER_IDX && (tab->fe[i].ctrl | tab->fe[i].prop); i++)
gfar_write_filer(priv, i, tab->fe[i].ctrl, tab->fe[i].prop);
/* Fill the rest with fall-troughs */
-   for (; i < MAX_FILER_IDX - 1; i++)
+   for (; i < MAX_FILER_IDX; i++)
gfar_write_filer(priv, i, 0x60, 0x);
/* Last entry must be default accept
 * because that's what people expect
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 2/2] hv_netvsc: Implement set_channels ethtool op

2015-08-11 Thread Andrew Schwartzmeyer

This enables the use of ethtool --set-channels devname combined N to
change the number of vRSS queues. Separate rx, tx, and other parameters
are not supported. The maximum is rsscap.num_recv_que. It passes the
given value to rndis_filter_device_add through the device_info->num_chn
field.

If the procedure fails, it attempts to recover to the prior state. If
the recovery fails, it logs an error and aborts.

Current num_chn is saved and restored when changing the MTU.

Signed-off-by: Andrew Schwartzmeyer 
---
 drivers/net/hyperv/netvsc_drv.c | 97 +
 1 file changed, 97 insertions(+)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 21845202a52d..f3b9d3eb753b 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -770,6 +770,101 @@ static void netvsc_get_channels(struct net_device *net,
}
 }
 
+static int netvsc_set_channels(struct net_device *net,
+  struct ethtool_channels *channels)
+{
+   struct net_device_context *net_device_ctx = netdev_priv(net);
+   struct hv_device *dev = net_device_ctx->device_ctx;
+   struct netvsc_device *nvdev = hv_get_drvdata(dev);
+   struct netvsc_device_info device_info;
+   const u32 num_chn = nvdev->num_chn;
+   const u32 max_chn = min_t(u32, nvdev->max_chn, num_online_cpus());
+   int ret = 0;
+   bool recovering = false;
+
+   if (!nvdev || nvdev->destroy)
+   return -ENODEV;
+
+   if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5) {
+   pr_info("vRSS unsupported before NVSP Version 5\n");
+   return -EINVAL;
+   }
+
+   /* We do not support rx, tx, or other */
+   if (!channels ||
+   channels->rx_count ||
+   channels->tx_count ||
+   channels->other_count ||
+   (channels->combined_count < 1))
+   return -EINVAL;
+
+   if (channels->combined_count > max_chn) {
+   pr_info("combined channels too high, using %d\n", max_chn);
+   channels->combined_count = max_chn;
+   }
+
+   ret = netvsc_close(net);
+   if (ret)
+   goto out;
+
+ do_set:
+   nvdev->start_remove = true;
+   rndis_filter_device_remove(dev);
+
+   nvdev->num_chn = channels->combined_count;
+
+   net_device_ctx->device_ctx = dev;
+   hv_set_drvdata(dev, net);
+
+   memset(&device_info, 0, sizeof(device_info));
+   device_info.num_chn = nvdev->num_chn; /* passed to RNDIS */
+   device_info.ring_size = ring_size;
+   device_info.max_num_vrss_chns = max_num_vrss_chns;
+
+   ret = rndis_filter_device_add(dev, &device_info);
+   if (ret) {
+   if (recovering) {
+   netdev_err(net, "unable to add netvsc device (ret 
%d)\n", ret);
+   return ret;
+   }
+   goto recover;
+   }
+
+   nvdev = hv_get_drvdata(dev);
+
+   ret = netif_set_real_num_tx_queues(net, nvdev->num_chn);
+   if (ret) {
+   if (recovering) {
+   netdev_err(net, "could not set tx queue count (ret 
%d)\n", ret);
+   return ret;
+   }
+   goto recover;
+   }
+
+   ret = netif_set_real_num_rx_queues(net, nvdev->num_chn);
+   if (ret) {
+   if (recovering) {
+   netdev_err(net, "could not set rx queue count (ret 
%d)\n", ret);
+   return ret;
+   }
+   goto recover;
+   }
+
+ out:
+   netvsc_open(net);
+
+   return ret;
+
+ recover:
+   /* If the above failed, we attempt to recover through the same
+* process but with the original number of channels.
+*/
+   netdev_err(net, "could not set channels, recovering\n");
+   recovering = true;
+   channels->combined_count = num_chn;
+   goto do_set;
+}
+
 static int netvsc_change_mtu(struct net_device *ndev, int mtu)
 {
struct net_device_context *ndevctx = netdev_priv(ndev);
@@ -802,6 +897,7 @@ static int netvsc_change_mtu(struct net_device *ndev, int 
mtu)
 
memset(&device_info, 0, sizeof(device_info));
device_info.ring_size = ring_size;
+   device_info.num_chn = nvdev->num_chn;
device_info.max_num_vrss_chns = max_num_vrss_chns;
rndis_filter_device_add(hdev, &device_info);
 
@@ -891,6 +987,7 @@ static const struct ethtool_ops ethtool_ops = {
.get_drvinfo= netvsc_get_drvinfo,
.get_link   = ethtool_op_get_link,
.get_channels   = netvsc_get_channels,
+   .set_channels   = netvsc_set_channels,
 };
 
 static const struct net_device_ops device_ops = {
-- 
2.4.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 1/2] hv_netvsc: Set vRSS with num_chn in RNDIS filter

2015-08-11 Thread Andrew Schwartzmeyer

Uses device_info->num_chn to pass user provided number of vRSS
queues (from ethtool --set-channels) to rndis_filter_device_add. If
nonzero and less than the maximum, set net_device->num_chn to the given
value; else default to prior algorithm.

Always initialize struct device_info to 0, otherwise not all its fields
are guaranteed to be 0, which is necessary when checking if num_chn has
been purposefully set.

Signed-off-by: Andrew Schwartzmeyer 
---
 drivers/net/hyperv/hyperv_net.h   | 1 +
 drivers/net/hyperv/netvsc_drv.c   | 3 +++
 drivers/net/hyperv/rndis_filter.c | 7 ++-
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 5ce7020ca530..5fa98f599b3d 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -162,6 +162,7 @@ struct netvsc_device_info {
bool link_state;/* 0 - link up, 1 - link down */
int  ring_size;
u32  max_num_vrss_chns;
+   u32  num_chn;
 };
 
 enum rndis_device_state {
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 7b36d5fecc1f..21845202a52d 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -799,6 +799,8 @@ static int netvsc_change_mtu(struct net_device *ndev, int 
mtu)
 
ndevctx->device_ctx = hdev;
hv_set_drvdata(hdev, ndev);
+
+   memset(&device_info, 0, sizeof(device_info));
device_info.ring_size = ring_size;
device_info.max_num_vrss_chns = max_num_vrss_chns;
rndis_filter_device_add(hdev, &device_info);
@@ -1022,6 +1024,7 @@ static int netvsc_probe(struct hv_device *dev,
net->needed_headroom = max_needed_headroom;
 
/* Notify the netvsc driver of the new device */
+   memset(&device_info, 0, sizeof(device_info));
device_info.ring_size = ring_size;
device_info.max_num_vrss_chns = max_num_vrss_chns;
ret = rndis_filter_device_add(dev, &device_info);
diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 9b8263db49cc..5931a799aa17 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1125,7 +1125,12 @@ int rndis_filter_device_add(struct hv_device *dev,
 */
node_cpu_mask = cpumask_of_node(cpu_to_node(dev->channel->target_cpu));
num_possible_rss_qs = cpumask_weight(node_cpu_mask);
-   net_device->num_chn = min(num_possible_rss_qs, num_rss_qs);
+
+   /* We will use the given number of channels if available. */
+   if (device_info->num_chn && device_info->num_chn < net_device->max_chn)
+   net_device->num_chn = device_info->num_chn;
+   else
+   net_device->num_chn = min(num_possible_rss_qs, num_rss_qs);
 
num_rss_qs = net_device->num_chn - 1;
net_device->num_sc_offered = num_rss_qs;
-- 
2.4.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv3 net-next 03/10] ipv6: Export nf_ct_frag6_gather()

2015-08-11 Thread Joe Stringer

Signed-off-by: Joe Stringer 
Acked-by: Thomas Graf 
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6d02498..701cd2b 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -633,6 +633,7 @@ ret_orig:
kfree_skb(clone);
return skb;
 }
+EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
 
 void nf_ct_frag6_consume_orig(struct sk_buff *skb)
 {
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv3 net-next 02/10] openvswitch: Move MASKED* macros to datapath.h

2015-08-11 Thread Joe Stringer

This will allow the ovs-conntrack code to reuse these macros.

Signed-off-by: Joe Stringer 
Acked-by: Thomas Graf 
---
 net/openvswitch/actions.c  | 52 ++
 net/openvswitch/datapath.h |  4 
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a0ac410..a75f320 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -185,10 +185,6 @@ static int pop_mpls(struct sk_buff *skb, struct 
sw_flow_key *key,
return 0;
 }
 
-/* 'KEY' must not have any bits set outside of the 'MASK' */
-#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
-#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK))
-
 static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
const __be32 *mpls_lse, const __be32 *mask)
 {
@@ -201,7 +197,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
return err;
 
stack = (__be32 *)skb_mpls_header(skb);
-   lse = MASKED(*stack, *mpls_lse, *mask);
+   lse = OVS_MASKED(*stack, *mpls_lse, *mask);
if (skb->ip_summed == CHECKSUM_COMPLETE) {
__be32 diff[] = { ~(*stack), lse };
 
@@ -244,9 +240,9 @@ static void ether_addr_copy_masked(u8 *dst_, const u8 
*src_, const u8 *mask_)
const u16 *src = (const u16 *)src_;
const u16 *mask = (const u16 *)mask_;
 
-   SET_MASKED(dst[0], src[0], mask[0]);
-   SET_MASKED(dst[1], src[1], mask[1]);
-   SET_MASKED(dst[2], src[2], mask[2]);
+   OVS_SET_MASKED(dst[0], src[0], mask[0]);
+   OVS_SET_MASKED(dst[1], src[1], mask[1]);
+   OVS_SET_MASKED(dst[2], src[2], mask[2]);
 }
 
 static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
@@ -330,10 +326,10 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 
l4_proto,
 static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4],
   const __be32 mask[4], __be32 masked[4])
 {
-   masked[0] = MASKED(old[0], addr[0], mask[0]);
-   masked[1] = MASKED(old[1], addr[1], mask[1]);
-   masked[2] = MASKED(old[2], addr[2], mask[2]);
-   masked[3] = MASKED(old[3], addr[3], mask[3]);
+   masked[0] = OVS_MASKED(old[0], addr[0], mask[0]);
+   masked[1] = OVS_MASKED(old[1], addr[1], mask[1]);
+   masked[2] = OVS_MASKED(old[2], addr[2], mask[2]);
+   masked[3] = OVS_MASKED(old[3], addr[3], mask[3]);
 }
 
 static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
@@ -350,15 +346,15 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 
l4_proto,
 static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask)
 {
/* Bits 21-24 are always unmasked, so this retains their values. */
-   SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
-   SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
-   SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
+   OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
+   OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
+   OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
 }
 
 static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl,
   u8 mask)
 {
-   new_ttl = MASKED(nh->ttl, new_ttl, mask);
+   new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask);
 
csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
nh->ttl = new_ttl;
@@ -384,7 +380,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
 * makes sense to check if the value actually changed.
 */
if (mask->ipv4_src) {
-   new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
+   new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
 
if (unlikely(new_addr != nh->saddr)) {
set_ip_addr(skb, nh, &nh->saddr, new_addr);
@@ -392,7 +388,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
}
}
if (mask->ipv4_dst) {
-   new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
+   new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
 
if (unlikely(new_addr != nh->daddr)) {
set_ip_addr(skb, nh, &nh->daddr, new_addr);
@@ -480,7 +476,8 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
*(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
}
if (mask->ipv6_hlimit) {
-   SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit);
+   OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit,
+  mask->ipv6_hlimit);
flow_key->ip.ttl = nh->hop_limit;
}
return 0;
@@ -509,8 +506,8 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key 
*flow_key,

[PATCHv3 net-next 01/10] openvswitch: Serialize acts with original netlink len

2015-08-11 Thread Joe Stringer

Previously, we used the kernel-internal netlink actions length to
calculate the size of messages to serialize back to userspace.
However, the sw_flow_actions may not be formatted exactly the same as the
actions on the wire, so store the original actions length when
de-serializing and re-use the original length when serializing.

Signed-off-by: Joe Stringer 
---
v2: No change.
v3: Preserve original length across buffer resize.
---
 net/openvswitch/datapath.c | 2 +-
 net/openvswitch/flow.h | 1 +
 net/openvswitch/flow_netlink.c | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ffe984f..d5b5473 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -713,7 +713,7 @@ static size_t ovs_flow_cmd_msg_size(const struct 
sw_flow_actions *acts,
 
/* OVS_FLOW_ATTR_ACTIONS */
if (should_fill_actions(ufid_flags))
-   len += nla_total_size(acts->actions_len);
+   len += nla_total_size(acts->orig_len);
 
return len
+ nla_total_size(sizeof(struct ovs_flow_stats)) /* 
OVS_FLOW_ATTR_STATS */
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index b62cdb3..082a87b 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -144,6 +144,7 @@ struct sw_flow_id {
 
 struct sw_flow_actions {
struct rcu_head rcu;
+   size_t orig_len;/* From flow_cmd_new netlink actions size */
u32 actions_len;
struct nlattr actions[];
 };
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index a6eb77a..96cad8c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1619,6 +1619,7 @@ static struct nlattr *reserve_sfa_size(struct 
sw_flow_actions **sfa,
 
memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
acts->actions_len = (*sfa)->actions_len;
+   acts->orig_len = (*sfa)->orig_len;
kfree(*sfa);
*sfa = acts;
 
@@ -2223,6 +2224,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
if (IS_ERR(*sfa))
return PTR_ERR(*sfa);
 
+   (*sfa)->orig_len = nla_len(attr);
err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type,
 key->eth.tci, log);
if (err)
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv3 net-next 04/10] dst: Add __skb_dst_copy() variation

2015-08-11 Thread Joe Stringer

This variation on skb_dst_copy() doesn't require two skbs.

Signed-off-by: Joe Stringer 
---
 include/net/dst.h | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 2578811..0539940 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -285,13 +285,18 @@ static inline void skb_dst_drop(struct sk_buff *skb)
}
 }
 
-static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff 
*oskb)
+static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
 {
-   nskb->_skb_refdst = oskb->_skb_refdst;
+   nskb->_skb_refdst = refdst;
if (!(nskb->_skb_refdst & SKB_DST_NOREF))
dst_clone(skb_dst(nskb));
 }
 
+static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff 
*oskb)
+{
+   __skb_dst_copy(nskb, oskb->_skb_refdst);
+}
+
 /**
  * skb_dst_force - makes sure skb dst is refcounted
  * @skb: buffer
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv3 net-next 00/10] OVS conntrack support

2015-08-11 Thread Joe Stringer

The goal of this series is to allow OVS to send packets through the Linux
kernel connection tracker, and subsequently match on fields populated by
conntrack.

This version addresses the feedback from v2, mostly going over the dst
reference taking and skb metadata to check that they are restored correctly
after fragmentation.

This functionality is enabled through the CONFIG_OPENVSWITCH_CONNTRACK option.

The branch below has been updated with the corresponding userspace pieces:
https://github.com/justinpettit/ovs conntrack

v3: Better handling of skb->dst in output/fragmentation
Address cases involving l2 metadata and checksums
Make MRU types more consistent
Better cleanup in error paths
Fix sparse warnings

v2: Split out per-netns connlabel width setting functions
Simplify reference tracking in output path.
Handle output cases where flow key is invalidated by prior push/pop
Store entire L2 header to apply to fragments
Various bits of refactoring, comments, styles, log improvements
Defer patch to scrub skb
Rebase

v1: First non-RFC post.
Fragment handling.
Conntrack label support.

Joe Stringer (9):
  openvswitch: Serialize acts with original netlink len
  openvswitch: Move MASKED* macros to datapath.h
  ipv6: Export nf_ct_frag6_gather()
  dst: Add __skb_dst_copy() variation
  openvswitch: Add conntrack action
  netfilter: Always export nf_connlabels_replace()
  netfilter: connlabels: Export setting connlabel length
  openvswitch: Allow matching on conntrack label
  openvswitch: Allow attaching helpers to ct action

Justin Pettit (1):
  openvswitch: Allow matching on conntrack mark

 include/net/dst.h   |   9 +-
 include/net/netfilter/nf_conntrack_labels.h |   4 +
 include/uapi/linux/openvswitch.h|  49 ++
 net/ipv6/netfilter/nf_conntrack_reasm.c |   1 +
 net/netfilter/nf_conntrack_labels.c |  34 +-
 net/netfilter/xt_connlabel.c|  16 +-
 net/openvswitch/Kconfig |  11 +
 net/openvswitch/Makefile|   2 +
 net/openvswitch/actions.c   | 232 --
 net/openvswitch/conntrack.c | 688 
 net/openvswitch/conntrack.h | 126 +
 net/openvswitch/datapath.c  |  74 ++-
 net/openvswitch/datapath.h  |  12 +
 net/openvswitch/flow.c  |   5 +
 net/openvswitch/flow.h  |   9 +
 net/openvswitch/flow_netlink.c  | 103 -
 net/openvswitch/flow_netlink.h  |   4 +-
 net/openvswitch/vport.c |   1 +
 18 files changed, 1299 insertions(+), 81 deletions(-)
 create mode 100644 net/openvswitch/conntrack.c
 create mode 100644 net/openvswitch/conntrack.h

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv3 net-next 08/10] netfilter: connlabels: Export setting connlabel length

2015-08-11 Thread Joe Stringer

Add functions to change connlabel length into nf_conntrack_labels.c so
they may be reused by other modules like OVS and nftables without
needing to jump through xt_match_check() hoops.

Suggested-by: Florian Westphal 
Signed-off-by: Joe Stringer 
---
v2: Protect connlabel modification with spinlock.
Fix reference leak in error case.
Style fixups.
---
 include/net/netfilter/nf_conntrack_labels.h |  4 
 net/netfilter/nf_conntrack_labels.c | 32 +
 net/netfilter/xt_connlabel.c| 16 ---
 3 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_labels.h 
b/include/net/netfilter/nf_conntrack_labels.h
index dec6336..7e2b1d0 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -54,7 +54,11 @@ int nf_connlabels_replace(struct nf_conn *ct,
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 int nf_conntrack_labels_init(void);
 void nf_conntrack_labels_fini(void);
+int nf_connlabels_get(struct net *net, unsigned int n_bits);
+void nf_connlabels_put(struct net *net);
 #else
 static inline int nf_conntrack_labels_init(void) { return 0; }
 static inline void nf_conntrack_labels_fini(void) {}
+static inline int nf_connlabels_get(struct net *net, unsigned int n_bits) { 
return 0; }
+static inline void nf_connlabels_put(struct net *net) {}
 #endif
diff --git a/net/netfilter/nf_conntrack_labels.c 
b/net/netfilter/nf_conntrack_labels.c
index daa7c13..3ce5c31 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -14,6 +14,8 @@
 #include 
 #include 
 
+static spinlock_t nf_connlabels_lock;
+
 static unsigned int label_bits(const struct nf_conn_labels *l)
 {
unsigned int longs = l->words;
@@ -89,6 +91,35 @@ int nf_connlabels_replace(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_replace);
 
+int nf_connlabels_get(struct net *net, unsigned int n_bits)
+{
+   size_t words;
+
+   if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE))
+   return -ERANGE;
+
+   words = BITS_TO_LONGS(n_bits);
+
+   spin_lock(&nf_connlabels_lock);
+   net->ct.labels_used++;
+   if (words > net->ct.label_words)
+   net->ct.label_words = words;
+   spin_unlock(&nf_connlabels_lock);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_get);
+
+void nf_connlabels_put(struct net *net)
+{
+   spin_lock(&nf_connlabels_lock);
+   net->ct.labels_used--;
+   if (net->ct.labels_used == 0)
+   net->ct.label_words = 0;
+   spin_unlock(&nf_connlabels_lock);
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_put);
+
 static struct nf_ct_ext_type labels_extend __read_mostly = {
.len= sizeof(struct nf_conn_labels),
.align  = __alignof__(struct nf_conn_labels),
@@ -97,6 +128,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = {
 
 int nf_conntrack_labels_init(void)
 {
+   spin_lock_init(&nf_connlabels_lock);
return nf_ct_extend_register(&labels_extend);
 }
 
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index 9f8719d..bb9cbeb 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param 
*par)
XT_CONNLABEL_OP_SET;
struct xt_connlabel_mtinfo *info = par->matchinfo;
int ret;
-   size_t words;
-
-   if (info->bit > XT_CONNLABEL_MAXBIT)
-   return -ERANGE;
 
if (info->options & ~options) {
pr_err("Unknown options in mask %x\n", info->options);
@@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param 
*par)
return ret;
}
 
-   par->net->ct.labels_used++;
-   words = BITS_TO_LONGS(info->bit+1);
-   if (words > par->net->ct.label_words)
-   par->net->ct.label_words = words;
-
+   ret = nf_connlabels_get(par->net, info->bit + 1);
+   if (ret < 0)
+   nf_ct_l3proto_module_put(par->family);
return ret;
 }
 
 static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
 {
-   par->net->ct.labels_used--;
-   if (par->net->ct.labels_used == 0)
-   par->net->ct.label_words = 0;
+   nf_connlabels_put(par->net);
nf_ct_l3proto_module_put(par->family);
 }
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv3 net-next 07/10] netfilter: Always export nf_connlabels_replace()

2015-08-11 Thread Joe Stringer

The following patches will reuse this code from OVS.

Signed-off-by: Joe Stringer 
---
 net/netfilter/nf_conntrack_labels.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_labels.c 
b/net/netfilter/nf_conntrack_labels.c
index bb53f12..daa7c13 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -48,7 +48,6 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
 }
 EXPORT_SYMBOL_GPL(nf_connlabel_set);
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 static void replace_u32(u32 *address, u32 mask, u32 new)
 {
u32 old, tmp;
@@ -89,7 +88,6 @@ int nf_connlabels_replace(struct nf_conn *ct,
return 0;
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_replace);
-#endif
 
 static struct nf_ct_ext_type labels_extend __read_mostly = {
.len= sizeof(struct nf_conn_labels),
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv3 net-next 05/10] openvswitch: Add conntrack action

2015-08-11 Thread Joe Stringer

Expose the kernel connection tracker via OVS. Userspace components can
make use of the "ct()" action, followed by "recirculate", to populate
the conntracking state in the OVS flow key, and subsequently match on
that state.

Example ODP flows allowing traffic from 1->2, only replies from 2->1:
in_port=1,tcp,action=ct(commit,zone=1),2
in_port=2,ct_state=-trk,tcp,action=ct(zone=1),recirc(1)
recirc_id=1,in_port=2,ct_state=+trk+est-new,tcp,action=1

IP fragments are handled by transparently assembling them as part of the
ct action. The maximum received unit (MRU) size is tracked so that
refragmentation can occur during output.

IP frag handling contributed by Andy Zhou.

Signed-off-by: Joe Stringer 
Signed-off-by: Justin Pettit 
Signed-off-by: Andy Zhou 
---
This can be tested with the corresponding userspace component here:
https://www.github.com/justinpettit/openvswitch conntrack

v2: Don't take references to devs or dsts in output path.
Shift ovs_ct_init()/ovs_ct_exit() into this patch
Handle output case where flow key is invalidated
Store the entire L2 header to apply to fragments
Various minor simplifications
Improve comments/logs
Style fixes
Rebase
v3: Clone dst in output, free final dst reference properly.
Handle CHECKSUM_COMPLETE after fragmentation
Restore L2 skb metadata after fragmentation
Make MRU types more consistent
Better cleanup in error paths
Fix sparse warnings
---
 include/uapi/linux/openvswitch.h |  41 
 net/openvswitch/Kconfig  |  11 +
 net/openvswitch/Makefile |   2 +
 net/openvswitch/actions.c| 170 +-
 net/openvswitch/conntrack.c  | 475 +++
 net/openvswitch/conntrack.h  |  97 
 net/openvswitch/datapath.c   |  72 --
 net/openvswitch/datapath.h   |   8 +
 net/openvswitch/flow.c   |   3 +
 net/openvswitch/flow.h   |   6 +
 net/openvswitch/flow_netlink.c   |  72 --
 net/openvswitch/flow_netlink.h   |   4 +-
 net/openvswitch/vport.c  |   1 +
 13 files changed, 925 insertions(+), 37 deletions(-)
 create mode 100644 net/openvswitch/conntrack.c
 create mode 100644 net/openvswitch/conntrack.h

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d6b8854..1dae30a 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -164,6 +164,9 @@ enum ovs_packet_cmd {
  * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the
  * output port is actually a tunnel port. Contains the output tunnel key
  * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
+ * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
+ * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
+ * size.
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
@@ -180,6 +183,7 @@ enum ovs_packet_attr {
OVS_PACKET_ATTR_UNUSED2,
OVS_PACKET_ATTR_PROBE,  /* Packet operation is a feature probe,
   error logging should be suppressed. */
+   OVS_PACKET_ATTR_MRU,/* Maximum received IP fragment size. */
__OVS_PACKET_ATTR_MAX
 };
 
@@ -319,6 +323,8 @@ enum ovs_key_attr {
OVS_KEY_ATTR_MPLS,  /* array of struct ovs_key_mpls.
 * The implementation may restrict
 * the accepted length of the array. */
+   OVS_KEY_ATTR_CT_STATE,  /* u8 bitmask of OVS_CS_F_* */
+   OVS_KEY_ATTR_CT_ZONE,   /* u16 connection tracking zone. */
 
 #ifdef __KERNEL__
OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -431,6 +437,15 @@ struct ovs_key_nd {
__u8nd_tll[ETH_ALEN];
 };
 
+/* OVS_KEY_ATTR_CT_STATE flags */
+#define OVS_CS_F_NEW   0x01 /* Beginning of a new connection. */
+#define OVS_CS_F_ESTABLISHED   0x02 /* Part of an existing connection. */
+#define OVS_CS_F_RELATED   0x04 /* Related to an established
+* connection. */
+#define OVS_CS_F_INVALID   0x20 /* Could not track connection. */
+#define OVS_CS_F_REPLY_DIR 0x40 /* Flow is in the reply direction. */
+#define OVS_CS_F_TRACKED   0x80 /* Conntrack has occurred. */
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -595,6 +610,29 @@ struct ovs_action_hash {
 };
 
 /**
+ * enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action.
+ * @OVS_CT_ATTR_FLAGS: u32 connection tracking flags.
+ * @OVS_CT_ATTR_ZONE: u16 connection tracking zone.
+ * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ */
+enum ovs_ct_attr {
+   OVS_CT_ATTR_UNSPEC,
+   OVS_CT_ATTR_FLAGS,  /* u8 bitmask of OVS_CT_F_*. */
+   OVS_CT_ATTR_ZONE,   /* u16 zone id. */
+

[PATCHv3 net-next 06/10] openvswitch: Allow matching on conntrack mark

2015-08-11 Thread Joe Stringer

From: Justin Pettit 

Allow matching and setting the conntrack mark field. As with conntrack
state and zone, these are populated by executing the ct() action. Unlike
these, the ct_mark is also a writable field. The set_field() action may
be used to modify the mark, which will take effect on the most recent
conntrack entry.

E.g.: actions:ct(zone=0),ct(zone=1),set_field(1->ct_mark)

This will perform conntrack lookup in zone 0, then lookup in zone 1,
then modify the mark for the entry in zone 1. The mark for the entry in
zone 0 is unchanged. The conntrack entry itself must be committed using
the "commit" flag in the conntrack action flags for this change to persist.

Signed-off-by: Justin Pettit 
Signed-off-by: Joe Stringer 
---
 include/uapi/linux/openvswitch.h |  1 +
 net/openvswitch/actions.c|  6 ++
 net/openvswitch/conntrack.c  | 40 
 net/openvswitch/conntrack.h  | 14 ++
 net/openvswitch/flow.c   |  1 +
 net/openvswitch/flow.h   |  1 +
 net/openvswitch/flow_netlink.c   | 15 ++-
 7 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 1dae30a..207788c 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -325,6 +325,7 @@ enum ovs_key_attr {
 * the accepted length of the array. */
OVS_KEY_ATTR_CT_STATE,  /* u8 bitmask of OVS_CS_F_* */
OVS_KEY_ATTR_CT_ZONE,   /* u16 connection tracking zone. */
+   OVS_KEY_ATTR_CT_MARK,   /* u32 connection tracking mark */
 
 #ifdef __KERNEL__
OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 5e1ed86..5acd7e7 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -957,6 +957,12 @@ static int execute_masked_set_action(struct sk_buff *skb,
err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
__be32 *));
break;
+
+   case OVS_KEY_ATTR_CT_MARK:
+   err = ovs_ct_set_mark(skb, flow_key, nla_get_u32(a),
+ *get_mask(a, u32 *));
+   break;
+
}
 
return err;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 586ce66..81b80da 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -101,6 +101,15 @@ u16 ovs_ct_get_zone(const struct sk_buff *skb)
return ct ? nf_ct_zone(ct) : NF_CT_DEFAULT_ZONE;
 }
 
+u32 ovs_ct_get_mark(const struct sk_buff *skb)
+{
+   enum ip_conntrack_info ctinfo;
+   struct nf_conn *ct;
+
+   ct = nf_ct_get(skb, &ctinfo);
+   return ct ? ct->mark : 0;
+}
+
 static bool __ovs_ct_state_valid(u8 state)
 {
return (state && !(state & OVS_CS_F_INVALID));
@@ -192,6 +201,7 @@ static void __ovs_ct_update_key(struct sk_buff *skb, struct 
sw_flow_key *key,
 {
key->ct.state = state;
key->ct.zone = zone;
+   key->ct.mark = ovs_ct_get_mark(skb);
 }
 
 static void ovs_ct_update_key(struct sk_buff *skb, struct sw_flow_key *key,
@@ -323,6 +333,32 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
return err;
 }
 
+int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
+   u32 ct_mark, u32 mask)
+{
+#ifdef CONFIG_NF_CONNTRACK_MARK
+   enum ip_conntrack_info ctinfo;
+   struct nf_conn *ct;
+   u32 new_mark;
+
+   /* This must happen directly after lookup/commit. */
+   ct = nf_ct_get(skb, &ctinfo);
+   if (!ct)
+   return -EINVAL;
+
+   new_mark = ct_mark | (ct->mark & ~(mask));
+   if (ct->mark != new_mark) {
+   ct->mark = new_mark;
+   nf_conntrack_event_cache(IPCT_MARK, ct);
+   key->ct.mark = ct_mark;
+   }
+
+   return 0;
+#else
+   return -ENOTSUPP;
+#endif
+}
+
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_FLAGS] = { .minlen = sizeof(u32),
.maxlen = sizeof(u32) },
@@ -386,6 +422,10 @@ bool ovs_ct_verify(enum ovs_key_attr attr)
if (attr & OVS_KEY_ATTR_CT_ZONE)
return true;
 #endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+   if (attr & OVS_KEY_ATTR_CT_MARK)
+   return true;
+#endif
 
return false;
 }
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 0e09a6d..b0f06b4 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,6 +37,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, 
struct sk_buff *);
 int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
   const struct ovs_conntrack_info *);
 
+int ovs_ct_set_mark(struct sk_buff *, struct sw_flow_key *, u32 ct_mark,
+

[PATCHv3 net-next 10/10] openvswitch: Allow attaching helpers to ct action

2015-08-11 Thread Joe Stringer

Add support for using conntrack helpers to assist protocol detection.
The new OVS_CT_ATTR_HELPER attribute of the ct action specifies a helper
to be used for this connection.

Example ODP flows allowing FTP connections from ports 1->2:
in_port=1,tcp,action=ct(helper=ftp,commit),2
in_port=2,tcp,ct_state=-trk,action=ct(),recirc(1)
recirc_id=1,in_port=2,tcp,ct_state=+trk-new+est,action=1
recirc_id=1,in_port=2,tcp,ct_state=+trk+rel,action=1

Signed-off-by: Joe Stringer 
---
 include/uapi/linux/openvswitch.h |   1 +
 net/openvswitch/conntrack.c  | 109 ++-
 2 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index f360dc9..e816170 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -626,6 +626,7 @@ enum ovs_ct_attr {
OVS_CT_ATTR_UNSPEC,
OVS_CT_ATTR_FLAGS,  /* u8 bitmask of OVS_CT_F_*. */
OVS_CT_ATTR_ZONE,   /* u16 zone id. */
+   OVS_CT_ATTR_HELPER,
__OVS_CT_ATTR_MAX
 };
 
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 6a64a32..1f2a9bc 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -30,6 +31,7 @@ struct ovs_ct_len_tbl {
 };
 
 struct ovs_conntrack_info {
+   struct nf_conntrack_helper *helper;
struct nf_conn *ct;
u32 flags;
u16 zone;
@@ -145,6 +147,51 @@ bool ovs_ct_state_valid(const struct sw_flow_key *key)
return __ovs_ct_state_valid(key->ct.state);
 }
 
+/* 'skb' should already be pulled to nh_ofs. */
+static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
+{
+   const struct nf_conntrack_helper *helper;
+   const struct nf_conn_help *help;
+   enum ip_conntrack_info ctinfo;
+   unsigned int protoff;
+   struct nf_conn *ct;
+
+   ct = nf_ct_get(skb, &ctinfo);
+   if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+   return NF_ACCEPT;
+
+   help = nfct_help(ct);
+   if (!help)
+   return NF_ACCEPT;
+
+   helper = rcu_dereference(help->helper);
+   if (!helper)
+   return NF_ACCEPT;
+
+   switch (proto) {
+   case NFPROTO_IPV4:
+   protoff = ip_hdrlen(skb);
+   break;
+   case NFPROTO_IPV6: {
+   u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+   __be16 frag_off;
+
+   protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+  &nexthdr, &frag_off);
+   if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+   pr_debug("proto header not found\n");
+   return NF_ACCEPT;
+   }
+   break;
+   }
+   default:
+   WARN_ONCE(1, "helper invoked on non-IP family!");
+   return NF_DROP;
+   }
+
+   return helper->help(skb, protoff, ct, ctinfo);
+}
+
 static int handle_fragments(struct net *net, struct sw_flow_key *key,
u16 zone, struct sk_buff *skb)
 {
@@ -217,6 +264,13 @@ static bool skb_nfct_cached(const struct net *net, const 
struct sk_buff *skb,
return false;
if (info->zone != nf_ct_zone(ct))
return false;
+   if (info->helper) {
+   struct nf_conn_help *help;
+
+   help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
+   if (help && help->helper != info->helper)
+   return false;
+   }
 
return true;
 }
@@ -274,6 +328,11 @@ static int __ovs_ct_lookup(struct net *net, const struct 
sw_flow_key *key,
if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING,
skb) != NF_ACCEPT)
return -ENOENT;
+
+   if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+   WARN_ONCE(1, "helper rejected packet");
+   return -EINVAL;
+   }
}
 
return 0;
@@ -420,15 +479,41 @@ int ovs_ct_set_label(struct sk_buff *skb, struct 
sw_flow_key *key,
 #endif
 }
 
+static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
+const struct sw_flow_key *key, bool log)
+{
+   struct nf_conntrack_helper *helper;
+   struct nf_conn_help *help;
+
+   helper = nf_conntrack_helper_try_module_get(name, info->family,
+   key->ip.proto);
+   if (!helper) {
+   OVS_NLERR(log, "Unknown helper \"%s\"", name);
+   return -ENOENT;
+   }
+
+   help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+   if (!help) {
+   module_put(helper->me);
+   return -ENOMEM;
+   }
+
+   help->helper = helper;
+   info->helper = helper;
+

[PATCHv3 net-next 09/10] openvswitch: Allow matching on conntrack label

2015-08-11 Thread Joe Stringer

Allow matching and setting the conntrack label field. As with ct_mark,
this is populated by executing the ct() action, and is a writable field.
The set_field() action may be used to modify the label, which will take
effect on the most recent conntrack entry.

E.g.: actions:ct(zone=1),set_field(1->ct_label)

This will perform conntrack lookup in zone 1, then modify the label for
that entry. The conntrack entry itself must be committed using the
"commit" flag in the conntrack action flags for this change to persist.

Signed-off-by: Joe Stringer 
---
v2: Split out setting the connlabel size for the current namespace.
---
 include/uapi/linux/openvswitch.h |  6 
 net/openvswitch/actions.c|  4 +++
 net/openvswitch/conntrack.c  | 68 
 net/openvswitch/conntrack.h  | 15 +
 net/openvswitch/flow.c   |  1 +
 net/openvswitch/flow.h   |  1 +
 net/openvswitch/flow_netlink.c   | 18 ++-
 7 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 207788c..f360dc9 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -326,6 +326,7 @@ enum ovs_key_attr {
OVS_KEY_ATTR_CT_STATE,  /* u8 bitmask of OVS_CS_F_* */
OVS_KEY_ATTR_CT_ZONE,   /* u16 connection tracking zone. */
OVS_KEY_ATTR_CT_MARK,   /* u32 connection tracking mark */
+   OVS_KEY_ATTR_CT_LABEL,  /* 16-octet connection tracking label */
 
 #ifdef __KERNEL__
OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -438,6 +439,11 @@ struct ovs_key_nd {
__u8nd_tll[ETH_ALEN];
 };
 
+#define OVS_CT_LABEL_LEN   16
+struct ovs_key_ct_label {
+   __u8ct_label[OVS_CT_LABEL_LEN];
+};
+
 /* OVS_KEY_ATTR_CT_STATE flags */
 #define OVS_CS_F_NEW   0x01 /* Beginning of a new connection. */
 #define OVS_CS_F_ESTABLISHED   0x02 /* Part of an existing connection. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 5acd7e7..74524e4 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -963,6 +963,10 @@ static int execute_masked_set_action(struct sk_buff *skb,
  *get_mask(a, u32 *));
break;
 
+   case OVS_KEY_ATTR_CT_LABEL:
+   err = ovs_ct_set_label(skb, flow_key, nla_data(a),
+   get_mask(a, struct ovs_key_ct_label *));
+   break;
}
 
return err;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 81b80da..6a64a32 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -110,6 +111,30 @@ u32 ovs_ct_get_mark(const struct sk_buff *skb)
return ct ? ct->mark : 0;
 }
 
+void ovs_ct_get_label(const struct sk_buff *skb,
+ struct ovs_key_ct_label *label)
+{
+   enum ip_conntrack_info ctinfo;
+   struct nf_conn_labels *cl = NULL;
+   struct nf_conn *ct;
+
+   ct = nf_ct_get(skb, &ctinfo);
+   if (ct)
+   cl = nf_ct_labels_find(ct);
+
+   if (cl) {
+   size_t len = cl->words * sizeof(long);
+
+   if (len > OVS_CT_LABEL_LEN)
+   len = OVS_CT_LABEL_LEN;
+   else if (len < OVS_CT_LABEL_LEN)
+   memset(label, 0, OVS_CT_LABEL_LEN);
+   memcpy(label, cl->bits, len);
+   } else {
+   memset(label, 0, OVS_CT_LABEL_LEN);
+   }
+}
+
 static bool __ovs_ct_state_valid(u8 state)
 {
return (state && !(state & OVS_CS_F_INVALID));
@@ -202,6 +227,7 @@ static void __ovs_ct_update_key(struct sk_buff *skb, struct 
sw_flow_key *key,
key->ct.state = state;
key->ct.zone = zone;
key->ct.mark = ovs_ct_get_mark(skb);
+   ovs_ct_get_label(skb, &key->ct.label);
 }
 
 static void ovs_ct_update_key(struct sk_buff *skb, struct sw_flow_key *key,
@@ -359,6 +385,41 @@ int ovs_ct_set_mark(struct sk_buff *skb, struct 
sw_flow_key *key,
 #endif
 }
 
+int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key,
+const struct ovs_key_ct_label *label,
+const struct ovs_key_ct_label *mask)
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+   enum ip_conntrack_info ctinfo;
+   struct nf_conn_labels *cl;
+   struct nf_conn *ct;
+   int err;
+
+   /* This must happen directly after lookup/commit. */
+   ct = nf_ct_get(skb, &ctinfo);
+   if (!ct)
+   return -EINVAL;
+
+   cl = nf_ct_labels_find(ct);
+   if (!cl) {
+   nf_ct_labels_ext_add(ct);
+   cl = nf_ct_labels_find(ct);
+   }
+   if (!cl || cl->words * sizeof(long) < OVS_CT_LABEL_LEN)
+   return -ENOSPC;
+
+   err = nf_connlabels_replace(ct, (u32 *)label, (u32 *)mask,
+

[PATCH 4/6] dlm: use sctp 1-to-1 API

2015-08-11 Thread Marcelo Ricardo Leitner

DLM is using 1-to-many API but in a 1-to-1 fashion. That is, it's not
needed but this causes it to use sctp_do_peeloff() to mimic a
kernel_accept() and this causes a symbol dependency on the sctp module.

By switching it to 1-to-1 API we can avoid this dependency and also
reduce quite a lot of SCTP-specific code in lowcomms.c.

The caveat is that now DLM won't always use the same src port. It will
choose a random one, just like TCP code. This allows the peers to
attempt simultaneous connections, which now are handled just like for
TCP.

Even more sharing between TCP and SCTP code on DLM is possible, but it
is intentionally left for a later commit.

Note that to use nodes with this commit, you must have at least the
earlier fixes from this patchset applied; otherwise it will trigger some
issues on old nodes.

Signed-off-by: Marcelo Ricardo Leitner 
---
 fs/dlm/lowcomms.c | 671 +++---
 1 file changed, 237 insertions(+), 434 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 
54a0031067de3ae6d3cd0106d7b9d4e30c956cfd..856d750be96b64d06c48a1a4e2a68785e8dda048
 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -120,12 +120,10 @@ struct connection {
struct cbuf cb;
int retries;
 #define MAX_CONNECT_RETRIES 3
-   int sctp_assoc;
struct hlist_node list;
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
-   bool try_new_addr;
 };
 #define sock2con(x) ((struct connection *)(x)->sk_user_data)
 
@@ -252,26 +250,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t 
allocation)
return con;
 }
 
-/* This is a bit drastic, but only called when things go wrong */
-static struct connection *assoc2con(int assoc_id)
-{
-   int i;
-   struct connection *con;
-
-   mutex_lock(&connections_lock);
-
-   for (i = 0 ; i < CONN_HASH_SIZE; i++) {
-   hlist_for_each_entry(con, &connection_hash[i], list) {
-   if (con->sctp_assoc == assoc_id) {
-   mutex_unlock(&connections_lock);
-   return con;
-   }
-   }
-   }
-   mutex_unlock(&connections_lock);
-   return NULL;
-}
-
 static struct dlm_node_addr *find_node_addr(int nodeid)
 {
struct dlm_node_addr *na;
@@ -322,14 +300,14 @@ static int nodeid_to_addr(int nodeid, struct 
sockaddr_storage *sas_out,
spin_lock(&dlm_node_addrs_spin);
na = find_node_addr(nodeid);
if (na && na->addr_count) {
+   memcpy(&sas, na->addr[na->curr_addr_index],
+  sizeof(struct sockaddr_storage));
+
if (try_new_addr) {
na->curr_addr_index++;
if (na->curr_addr_index == na->addr_count)
na->curr_addr_index = 0;
}
-
-   memcpy(&sas, na->addr[na->curr_addr_index ],
-   sizeof(struct sockaddr_storage));
}
spin_unlock(&dlm_node_addrs_spin);
 
@@ -459,18 +437,23 @@ static inline void lowcomms_connect_sock(struct 
connection *con)
 
 static void lowcomms_state_change(struct sock *sk)
 {
-   if (sk->sk_state == TCP_ESTABLISHED)
+   /* SCTP layer is not calling sk_data_ready when the connection
+* is done, so we catch the signal through here. Also, it
+* doesn't switch socket state when entering shutdown, so we
+* skip the write in that case.
+*/
+   if (sk->sk_shutdown) {
+   if (sk->sk_shutdown == RCV_SHUTDOWN)
+   lowcomms_data_ready(sk);
+   } else if (sk->sk_state == TCP_ESTABLISHED) {
lowcomms_write_space(sk);
+   }
 }
 
 int dlm_lowcomms_connect_node(int nodeid)
 {
struct connection *con;
 
-   /* with sctp there's no connecting without sending */
-   if (dlm_config.ci_protocol != 0)
-   return 0;
-
if (nodeid == dlm_our_nodeid())
return 0;
 
@@ -542,264 +525,6 @@ static void close_connection(struct connection *con, bool 
and_other,
mutex_unlock(&con->sock_mutex);
 }
 
-/* We only send shutdown messages to nodes that are not part of the cluster
- * or if we get multiple connections from a node.
- */
-static void sctp_send_shutdown(sctp_assoc_t associd)
-{
-   static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-   struct msghdr outmessage;
-   struct cmsghdr *cmsg;
-   struct sctp_sndrcvinfo *sinfo;
-   int ret;
-   struct connection *con;
-
-   con = nodeid2con(0,0);
-   BUG_ON(con == NULL);
-
-   outmessage.msg_name = NULL;
-   outmessage.msg_namelen = 0;
-   outmessage.msg_control = outcmsg;
-   outmessage.msg_controllen = sizeof(outcmsg);
-   outmessage.msg_flags = MSG_EOR;
-
-   cmsg = CMSG_FIRSTHDR(&outmessage);

[PATCH 1/6] dlm: fix connection stealing if using SCTP

2015-08-11 Thread Marcelo Ricardo Leitner

When using SCTP and accepting a new connection, DLM currently validates
if the peer trying to connect to it is one of the cluster nodes, but it
doesn't check if it already has a connection to it or not.

If it already had a connection, it will be overwritten, and the new one
will be used for writes, possibly causing the node to leave the cluster
due to communication breakage.

Still, one could DoS the node by attempting N connections and keeping
them open.

As said, but being explicit, both situations are only triggerable from
other cluster nodes, but are doable with only user-level perms.

Signed-off-by: Marcelo Ricardo Leitner 
---
 fs/dlm/lowcomms.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 
754fd6c0b7470bab272b071e6ca6e4969e4e4209..bc04f5e3af7ac5fe107a7a26555777364de8bc15
 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -535,7 +535,9 @@ static void close_connection(struct connection *con, bool 
and_other)
mutex_unlock(&con->sock_mutex);
 }
 
-/* We only send shutdown messages to nodes that are not part of the cluster */
+/* We only send shutdown messages to nodes that are not part of the cluster
+ * or if we get multiple connections from a node.
+ */
 static void sctp_send_shutdown(sctp_assoc_t associd)
 {
static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
@@ -718,6 +720,14 @@ static void process_sctp_notification(struct connection 
*con,
if (!new_con)
return;
 
+   if (new_con->sock) {
+   log_print("reject connect from node %d: "
+ "already has a connection.",
+ nodeid);
+   sctp_send_shutdown(prim.ssp_assoc_id);
+   return;
+   }
+
/* Peel off a new sock */
lock_sock(con->sock->sk);
ret = sctp_do_peeloff(con->sock->sk,
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/6] dlm: fix not reconnecting on connecting error handling

2015-08-11 Thread Marcelo Ricardo Leitner

If we don't clear that bit, lowcomms_connect_sock() will not schedule
another attempt, and no further attempt will be done.

Signed-off-by: Marcelo Ricardo Leitner 
---
 fs/dlm/lowcomms.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 
749deb3b69b2932fb18e7ae70c06e4ced15bd9b6..54a0031067de3ae6d3cd0106d7b9d4e30c956cfd
 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1253,6 +1253,7 @@ out_err:
  con->retries, result);
mutex_unlock(&con->sock_mutex);
msleep(1000);
+   clear_bit(CF_CONNECT_PENDING, &con->flags);
lowcomms_connect_sock(con);
return;
}
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/6] use sctp 1-to-1 API

2015-08-11 Thread Marcelo Ricardo Leitner

Cc'ing netdev and sctp maintainers so we keep everyone in the loop.
This series will be handled via dlm tree now.

Currently, loading dlm module will require loading sctp module too and
this is not wanted. This link is caused only by the usage of
sctp_do_peeloff(), and nothing else.

Previously this operation was performed through getsockopt() but it
caused a file descriptor to be allocated, as it was initially intended
only for userspace, and this caused serious issues. Please see
2f2d76cc3e93 ("dlm: Do not allocate a fd for peeloff") for more info on
that issue.

Previous attempts on breaking such link by either creating a new
getsockopt() option or by changing the current one were rejected on
netdev, as both were deemed impractical.

So, as Vlad Yasevich noticed, DLM has no bigger reason to use that call.
Thus we can avoid using it by switching to 1-to-1 (TCP style) API.

Such a move also simplifies how DLM deals with SCTP sockets because now
it can deal with them almost like TCP ones. I.e., there is no need to
handle SCTP events and error recovery is simplified in some places.

While switching to 1-to-1 API a couple of issues were noticed on DLM.
They are fixed in the first patches of this series and are the minimum
requirements to have both implementations compatible. That said, usage
of mixed versions without them will cause instability.

Tested with test applications kindly provided by David Teigland, on a
two node cluster, with TCP and SCTP with 1 and 2 addresses each.

Many thanks,

Marcelo Ricardo Leitner (6):
  dlm: fix connection stealing if using SCTP
  dlm: fix race while closing connections
  dlm: fix not reconnecting on connecting error handling
  dlm: use sctp 1-to-1 API
  dlm: replace BUG_ON with a less severe handling
  dlm: fix reconnecting but not sending data

 fs/dlm/lowcomms.c | 704 --
 1 file changed, 261 insertions(+), 443 deletions(-)

-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/6] dlm: fix race while closing connections

2015-08-11 Thread Marcelo Ricardo Leitner

When a connection has issues, DLM may need to close it.  Therefore we
should also cancel pending workqueues for such connection at that time,
and not just when dlm is not willing to use this connection anymore.

Also, if we don't clear CF_CONNECT_PENDING flag, the error handling
routines won't be able to re-connect as lowcomms_connect_sock() will
check for it.

Signed-off-by: Marcelo Ricardo Leitner 
---
 fs/dlm/lowcomms.c | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 
bc04f5e3af7ac5fe107a7a26555777364de8bc15..749deb3b69b2932fb18e7ae70c06e4ced15bd9b6
 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -514,17 +514,24 @@ static void make_sockaddr(struct sockaddr_storage *saddr, 
uint16_t port,
 }
 
 /* Close a remote connection and tidy up */
-static void close_connection(struct connection *con, bool and_other)
+static void close_connection(struct connection *con, bool and_other,
+bool tx, bool rx)
 {
-   mutex_lock(&con->sock_mutex);
+   clear_bit(CF_CONNECT_PENDING, &con->flags);
+   clear_bit(CF_WRITE_PENDING, &con->flags);
+   if (tx && cancel_work_sync(&con->swork))
+   log_print("canceled swork for node %d", con->nodeid);
+   if (rx && cancel_work_sync(&con->rwork))
+   log_print("canceled rwork for node %d", con->nodeid);
 
+   mutex_lock(&con->sock_mutex);
if (con->sock) {
sock_release(con->sock);
con->sock = NULL;
}
if (con->othercon && and_other) {
/* Will only re-enter once. */
-   close_connection(con->othercon, false);
+   close_connection(con->othercon, false, true, true);
}
if (con->rx_page) {
__free_page(con->rx_page);
@@ -902,7 +909,7 @@ out_resched:
 out_close:
mutex_unlock(&con->sock_mutex);
if (ret != -EAGAIN) {
-   close_connection(con, false);
+   close_connection(con, false, true, false);
/* Reconnect when there is something to send */
}
/* Don't return success if we really got EOF */
@@ -1622,7 +1629,7 @@ out:
 
 send_error:
mutex_unlock(&con->sock_mutex);
-   close_connection(con, false);
+   close_connection(con, false, false, true);
lowcomms_connect_sock(con);
return;
 
@@ -1654,15 +1661,9 @@ int dlm_lowcomms_close(int nodeid)
log_print("closing connection to node %d", nodeid);
con = nodeid2con(nodeid, 0);
if (con) {
-   clear_bit(CF_CONNECT_PENDING, &con->flags);
-   clear_bit(CF_WRITE_PENDING, &con->flags);
set_bit(CF_CLOSE, &con->flags);
-   if (cancel_work_sync(&con->swork))
-   log_print("canceled swork for node %d", nodeid);
-   if (cancel_work_sync(&con->rwork))
-   log_print("canceled rwork for node %d", nodeid);
+   close_connection(con, true, true, true);
clean_one_writequeue(con);
-   close_connection(con, true);
}
 
spin_lock(&dlm_node_addrs_spin);
@@ -1745,7 +1746,7 @@ static void stop_conn(struct connection *con)
 
 static void free_conn(struct connection *con)
 {
-   close_connection(con, true);
+   close_connection(con, true, true, true);
if (con->othercon)
kmem_cache_free(con_cache, con->othercon);
hlist_del(&con->list);
@@ -1816,7 +1817,7 @@ fail_unlisten:
dlm_allow_conn = 0;
con = nodeid2con(0,0);
if (con) {
-   close_connection(con, false);
+   close_connection(con, false, true, true);
kmem_cache_free(con_cache, con);
}
 fail_destroy:
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 5/6] dlm: replace BUG_ON with a less severe handling

2015-08-11 Thread Marcelo Ricardo Leitner

BUG_ON() is a severe action for this case, especially now that DLM with
SCTP will use 1 socket per association. Instead, we can just close the
socket on this error condition and return from the function.

Also move the check to an earlier stage as it won't change and thus we
can abort as soon as possible.

Although this issue was reported when still using SCTP with 1-to-many
API, this cleanup wouldn't be that simple back then because we couldn't
close the socket and making sure such event would cease would be hard.
And actually, previous code was closing the association, yet SCTP layer
is still raising the new data event. Probably a bug to be fixed in SCTP.

Reported-by: 
Signed-off-by: Marcelo Ricardo Leitner 
---
 fs/dlm/lowcomms.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 
856d750be96b64d06c48a1a4e2a68785e8dda048..4ea64e93e6b19ea24132606cffaca8bb502d18ab
 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -542,6 +542,10 @@ static int receive_from_sock(struct connection *con)
ret = -EAGAIN;
goto out_close;
}
+   if (con->nodeid == 0) {
+   ret = -EINVAL;
+   goto out_close;
+   }
 
if (con->rx_page == NULL) {
/*
@@ -582,8 +586,6 @@ static int receive_from_sock(struct connection *con)
else if (ret == len)
call_again_soon = 1;
 
-   BUG_ON(con->nodeid == 0);
-
cbuf_add(&con->cb, ret);
ret = dlm_process_incoming_buffer(con->nodeid,
  page_address(con->rx_page),
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 6/6] dlm: fix reconnecting but not sending data

2015-08-11 Thread Marcelo Ricardo Leitner

There are cases on which lowcomms_connect_sock() is called directly,
which caused the CF_WRITE_PENDING flag to not be set upon reconnect,
especially in send_to_sock() error handling. In the latter case, the flag
was already cleared and no further attempt on transmitting would be done.

As dlm tends to connect when it needs to transmit something, it makes
sense to always mark this flag right after the connect.

Signed-off-by: Marcelo Ricardo Leitner 
---
 fs/dlm/lowcomms.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 
4ea64e93e6b19ea24132606cffaca8bb502d18ab..cd008c94efb8bbec0cc1f0123c9bebf7a58b48b5
 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1020,6 +1020,7 @@ socket_err:
 
 out:
mutex_unlock(&con->sock_mutex);
+   set_bit(CF_WRITE_PENDING, &con->flags);
 }
 
 /* Connect a new socket to its peer */
@@ -1114,6 +1115,7 @@ out_err:
}
 out:
mutex_unlock(&con->sock_mutex);
+   set_bit(CF_WRITE_PENDING, &con->flags);
return;
 }
 
@@ -1502,10 +1504,8 @@ static void process_send_sockets(struct work_struct 
*work)
 {
struct connection *con = container_of(work, struct connection, swork);
 
-   if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+   if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags))
con->connect_action(con);
-   set_bit(CF_WRITE_PENDING, &con->flags);
-   }
if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
send_to_sock(con);
 }
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next] documentation: bring vxlan documentation more up-to-date

2015-08-11 Thread Rick Jones


On 08/11/2015 03:09 PM, Stephen Hemminger wrote:

On Tue, 11 Aug 2015 13:47:16 -0700 (PDT)
r...@tardy.usa.hp.com (Rick Jones) wrote:


+  # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device named vxlan0.  The device uses the
+multicast group 239.1.1.1 over eth1 to handle traffic for which there
+is no entry is in the forwarding table.  The Linux implementation of
+VXLAN pre-dates the IANA's selection of a standard destination port
+number and uses the Linux-selected value by default to maintain
+backwards compatibility.  If you wish to use the IANA-assigned
+destination port number of 4789 you can add "dstport 4789" to the
+command line above.


This example should be changed to use "dstport 4789".
Almost anyone reading the documentation will want to use the IANA value.


I'll spin a v2.


Note: ip command will give a noisy warning if dstport is not set.


Indeed. It was that noisy warning which sent me down this path in the 
first place :)


rick jones

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next] documentation: bring vxlan documentation more up-to-date

2015-08-11 Thread Stephen Hemminger

On Tue, 11 Aug 2015 13:47:16 -0700 (PDT)
r...@tardy.usa.hp.com (Rick Jones) wrote:

> +  # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
> +
> +This creates a new device named vxlan0.  The device uses the
> +multicast group 239.1.1.1 over eth1 to handle traffic for which there
> +is no entry is in the forwarding table.  The Linux implementation of
> +VXLAN pre-dates the IANA's selection of a standard destination port
> +number and uses the Linux-selected value by default to maintain
> +backwards compatibility.  If you wish to use the IANA-assigned
> +destination port number of 4789 you can add "dstport 4789" to the
> +command line above.

This example should be changed to use "dstport 4789".
Almost anyone reading the documentation will want to use the IANA value.

Note: ip command will give a noisy warning if dstport is not set.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipv6_mc_check_mld - kernel BUG at net/core/skbuff.c:1128

2015-08-11 Thread Linus Lüssing

On Tue, Aug 11, 2015 at 10:51:40PM +0200, Linus Lüssing wrote:
> On Mon, Aug 10, 2015 at 02:56:12PM -0700, Brenden Blanco wrote:
> > Doing some code reading with Alexei, we found a suspect commit, which
> > introduces an skb_get and skb_may_pull of the same skb, which leads to the 
> > BUG
> > when skb->len == len.
> 
> Urgh, didn't know that pskb_may_pull() doesn't like an skb with a
> reference count greater than one... But yes, the BUG() call in
> skbuff.c:1128 / pskb_expand_head() says that (though in this case
> the BUG() in skbuff.c call actually seems kinda weird (/"wrong"?), as
> it isn't shared between different code paths).

The more I think about it, I'm tending to remove the BUG() call in
pskb_expand_head() as in this case it obviously isn't a bug.

The skb_get() allows a simple and in my opinion easy to read cleanup
part of skb_trimmed for any caller of ip{v6,}_mc_check_mld(). No need
to check whether skb == skb_trimmed for a caller for instance,
simply checking whether skb_trimmed exists is enough.

Any objections to remove the "if (skb_shared(skb)) BUG()" part in
pskb_expand_head()? Or would there be any other undesired side
effects in utilising skb_get() like that?

Cheers, Linus
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 2/3] mpls: consistently use u8 to store number of labels

2015-08-11 Thread Roopa Prabhu

From: Roopa Prabhu 

change all types representing number of labels to u8
to be consistent.

This also changes labels to u8 in the light weight
mpls_tunnel_encap structure. This is because the
light weight mpls iptunnel code shares some of the label
encoding functions like nla_get/put_labels with the af_mpls
code.

Signed-off-by: Roopa Prabhu 
---
 include/net/mpls_iptunnel.h |2 +-
 net/mpls/af_mpls.c  |   10 +-
 net/mpls/internal.h |2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
index 4757997..179253f 100644
--- a/include/net/mpls_iptunnel.h
+++ b/include/net/mpls_iptunnel.h
@@ -18,7 +18,7 @@
 
 struct mpls_iptunnel_encap {
u32 label[MAX_NEW_LABELS];
-   u32 labels;
+   u8  labels;
 };
 
 static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct 
lwtunnel_state *lwtstate)
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index cf86e9d..eb089ef 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -243,11 +243,11 @@ static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] 
= {
 struct mpls_route_config {
u32 rc_protocol;
u32 rc_ifindex;
-   u16 rc_via_table;
-   u16 rc_via_alen;
+   u8  rc_via_table;
+   u8  rc_via_alen;
u8  rc_via[MAX_VIA_ALEN];
+   u8  rc_output_labels;
u32 rc_label;
-   u32 rc_output_labels;
u32 rc_output_label[MAX_NEW_LABELS];
u32 rc_nlflags;
enum mpls_payload_type  rc_payload_type;
@@ -751,7 +751,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
 EXPORT_SYMBOL_GPL(nla_put_labels);
 
 int nla_get_labels(const struct nlattr *nla,
-  u32 max_labels, u32 *labels, u32 label[])
+  u32 max_labels, u8 *labels, u32 label[])
 {
unsigned len = nla_len(nla);
unsigned nla_labels;
@@ -859,7 +859,7 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct 
nlmsghdr *nlh,
break;
case RTA_DST:
{
-   u32 label_count;
+   u8 label_count;
if (nla_get_labels(nla, 1, &label_count,
   &cfg->rc_label))
goto errout;
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index f05e2e8..f5dafcaf 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -87,7 +87,7 @@ static inline struct mpls_entry_decoded 
mpls_entry_decode(struct mpls_shim_hdr *
 
 int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
   const u32 label[]);
-int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels,
+int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels,
   u32 label[]);
 bool mpls_output_possible(const struct net_device *dev);
 unsigned int mpls_dev_mtu(const struct net_device *dev);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 1/3] mpls: move mpls_route nexthop fields to a new nhlfe struct

2015-08-11 Thread Roopa Prabhu

From: Roopa Prabhu 

moves mpls_route nexthop fields to a new mpls_nhlfe
struct. mpls_nhlfe represents a mpls nexthop label forwarding entry.
It prepares mpls route structure for multipath support.

In the process moves mpls_route structure into internal.h.
Moves some of the code from mpls_route_add into a separate mpls
nhlfe build function. changed mpls_rt_alloc to take number of
nexthops as argument.

A mpls route can point to multiple mpls_nhlfe. This patch
does not support multipath yet, hence the rest of the changes
assume that a mpls route points to a single mpls_nhlfe

Signed-off-by: Roopa Prabhu 
---
 net/mpls/af_mpls.c  |  225 ---
 net/mpls/internal.h |   35 
 2 files changed, 158 insertions(+), 102 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 8c5707d..cf86e9d 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -21,35 +21,6 @@
 #endif
 #include "internal.h"
 
-#define LABEL_NOT_SPECIFIED (1<<20)
-#define MAX_NEW_LABELS 2
-
-/* This maximum ha length copied from the definition of struct neighbour */
-#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
-
-enum mpls_payload_type {
-   MPT_UNSPEC, /* IPv4 or IPv6 */
-   MPT_IPV4 = 4,
-   MPT_IPV6 = 6,
-
-   /* Other types not implemented:
-*  - Pseudo-wire with or without control word (RFC4385)
-*  - GAL (RFC5586)
-*/
-};
-
-struct mpls_route { /* next hop label forwarding entry */
-   struct net_device __rcu *rt_dev;
-   struct rcu_head rt_rcu;
-   u32 rt_label[MAX_NEW_LABELS];
-   u8  rt_protocol; /* routing protocol that set this 
entry */
-   u8  rt_payload_type;
-   u8  rt_labels;
-   u8  rt_via_alen;
-   u8  rt_via_table;
-   u8  rt_via[0];
-};
-
 static int zero = 0;
 static int label_limit = (1 << 20) - 1;
 
@@ -83,7 +54,7 @@ EXPORT_SYMBOL_GPL(mpls_output_possible);
 static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
 {
/* The size of the layer 2.5 labels to be added for this route */
-   return rt->rt_labels * sizeof(struct mpls_shim_hdr);
+   return rt->rt_nh->nh_labels * sizeof(struct mpls_shim_hdr);
 }
 
 unsigned int mpls_dev_mtu(const struct net_device *dev)
@@ -124,7 +95,7 @@ static bool mpls_egress(struct mpls_route *rt, struct 
sk_buff *skb,
if (!pskb_may_pull(skb, 12))
return false;
 
-   payload_type = rt->rt_payload_type;
+   payload_type = rt->rt_nh->nh_payload_type;
if (payload_type == MPT_UNSPEC)
payload_type = ip_hdr(skb)->version;
 
@@ -197,7 +168,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
net_device *dev,
goto drop;
 
/* Find the output device */
-   out_dev = rcu_dereference(rt->rt_dev);
+   out_dev = rcu_dereference(rt->rt_nh->nh_dev);
if (!mpls_output_possible(out_dev))
goto drop;
 
@@ -240,13 +211,15 @@ static int mpls_forward(struct sk_buff *skb, struct 
net_device *dev,
/* Push the new labels */
hdr = mpls_hdr(skb);
bos = dec.bos;
-   for (i = rt->rt_labels - 1; i >= 0; i--) {
-   hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, 
bos);
+   for (i = rt->rt_nh->nh_labels - 1; i >= 0; i--) {
+   hdr[i] = mpls_entry_encode(rt->rt_nh->nh_label[i],
+  dec.ttl, 0, bos);
bos = false;
}
}
 
-   err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb);
+   err = neigh_xmit(rt->rt_nh->nh_via_table, out_dev, rt->rt_nh->nh_via,
+skb);
if (err)
net_dbg_ratelimited("%s: packet transmission failed: %d\n",
__func__, err);
@@ -281,13 +254,15 @@ struct mpls_route_config {
struct nl_info  rc_nlinfo;
 };
 
-static struct mpls_route *mpls_rt_alloc(size_t alen)
+static struct mpls_route *mpls_rt_alloc(int num_nh)
 {
struct mpls_route *rt;
 
-   rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
+   rt = kzalloc(sizeof(*rt) + (num_nh * sizeof(struct mpls_nhlfe)),
+GFP_KERNEL);
if (rt)
-   rt->rt_via_alen = alen;
+   rt->rt_nhn = num_nh;
+
return rt;
 }
 
@@ -322,7 +297,7 @@ static void mpls_route_update(struct net *net, unsigned 
index,
 
platform_label = rtnl_dereference(net->mpls.platform_label);
rt = rtnl_dereference(platform_label[index]);
-   if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) {
+   if (!dev || (rt && (rtnl_dereference(rt->rt_nh->nh_dev) == dev))) {
rcu_assign_pointer(platform_label[index], new);

[PATCH net-next 0/3] mpls: multipath support

2015-08-11 Thread Roopa Prabhu

From: Roopa Prabhu 

This patch series adds multipath support to mpls routes.

It resembles ipv4 multipath support. The multipath route nexthop
selection algorithm is the same code as in ipv4 fib code.

I understand that the multipath algorithm in ipv4 is undergoing
some changes and will move mpls to similar algo if applicable once
those get merged.

mpls multipath support can be moved under CONFIG_MPLS_ROUTE_MULTIPATH if
needed similar to CONFIG_IP_ROUTE_MULTIPATH. I started with that
but that resulted in too many #ifdef CONFIG_MPLS_ROUTE_MULTIPATH
throughout the af_mpls code. If there is a strong reason
to introduce a config option, I will respin v2 with
CONFIG_MPLS_ROUTE_MULTIPATH. These multipath patches do not introduce
any UAPI changes.

example iproute2 usage:
$ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \
nexthop as 300 via inet 10.1.1.6 dev swp2

$ip -f mpls route show
100 
nexthop as to 200 via inet 10.1.1.2  dev swp1
nexthop as to 300 via inet 10.1.1.6  dev swp2


Roopa Prabhu (3):
  mpls: move mpls_route nexthop fields to a new nhlfe struct
  mpls: consistently use u8 to store number of labels
  mpls: add multipath route support

 include/net/mpls_iptunnel.h |2 +-
 net/mpls/af_mpls.c  |  519 ---
 net/mpls/internal.h |   44 +++-
 3 files changed, 437 insertions(+), 128 deletions(-)

-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 3/3] mpls: add multipath route support

2015-08-11 Thread Roopa Prabhu

From: Roopa Prabhu 

Adds support for MPLS multipath routes.
supports parse/fill of RTA_MULTIPATH netlink attribute
for multipath routes similar to ipv4 fib. Mostly based on
multipath handling in ipv4 fib code.

The multipath route nexthop selection algorithm is the same
code as in ipv4 fib.

This patch also adds new functions to parse multipath attributes
from route config into mpls_nhlfe.

note that it also simplifies mpls_route_update. Removes handling
route updates based on dev argument. The reason for
doing that is, the function was not being used for route updates
based on dev and if we do need to support route updates based
on dev in the future it will have to be done differently.

Signed-off-by: Roopa Prabhu 
---
 net/mpls/af_mpls.c  |  378 +--
 net/mpls/internal.h |   19 +++
 2 files changed, 323 insertions(+), 74 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index eb089ef..de5ae29 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -19,10 +19,12 @@
 #include 
 #include 
 #endif
+#include 
 #include "internal.h"
 
 static int zero = 0;
 static int label_limit = (1 << 20) - 1;
+static DEFINE_SPINLOCK(mpls_multipath_lock);
 
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
   struct nlmsghdr *nlh, struct net *net, u32 portid,
@@ -51,10 +53,10 @@ bool mpls_output_possible(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(mpls_output_possible);
 
-static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
+static unsigned int mpls_nhlfe_header_size(const struct mpls_nhlfe *nhlfe)
 {
/* The size of the layer 2.5 labels to be added for this route */
-   return rt->rt_nh->nh_labels * sizeof(struct mpls_shim_hdr);
+   return nhlfe->nh_labels * sizeof(struct mpls_shim_hdr);
 }
 
 unsigned int mpls_dev_mtu(const struct net_device *dev)
@@ -76,7 +78,52 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned 
int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
-static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
+/* This is a cut/copy/modify from fib_select_multipath */
+static void mpls_select_multipath(struct mpls_route *rt, int *nhidx)
+{
+   int w;
+
+   spin_lock_bh(&mpls_multipath_lock);
+   if (rt->rt_power <= 0) {
+   int power = 0;
+
+   change_nexthops(rt) {
+   power += nhlfe->nh_weight;
+   nhlfe->nh_power = nhlfe->nh_weight;
+   } endfor_nexthops(rt);
+   rt->rt_power = power;
+   if (power <= 0) {
+   spin_unlock_bh(&mpls_multipath_lock);
+   /* Race condition: route has just become dead. */
+   *nhidx = 0;
+   return;
+   }
+   }
+
+   /* w should be random number [0..rt->rt_power-1],
+* it is pretty bad approximation.
+*/
+   w = jiffies % rt->rt_power;
+
+   change_nexthops(rt) {
+   if (nhlfe->nh_power) {
+   w -= nhlfe->nh_power;
+   if (w <= 0) {
+   nhlfe->nh_power--;
+   rt->rt_power--;
+   *nhidx = nhsel;
+   spin_unlock_bh(&mpls_multipath_lock);
+   return;
+   }
+   }
+   } endfor_nexthops(rt);
+
+   /* Race condition: route has just become dead. */
+   *nhidx = 0;
+   spin_unlock_bh(&mpls_multipath_lock);
+}
+
+static bool mpls_egress(struct mpls_nhlfe *nhlfe, struct sk_buff *skb,
struct mpls_entry_decoded dec)
 {
enum mpls_payload_type payload_type;
@@ -95,7 +142,7 @@ static bool mpls_egress(struct mpls_route *rt, struct 
sk_buff *skb,
if (!pskb_may_pull(skb, 12))
return false;
 
-   payload_type = rt->rt_nh->nh_payload_type;
+   payload_type = nhlfe->nh_payload_type;
if (payload_type == MPT_UNSPEC)
payload_type = ip_hdr(skb)->version;
 
@@ -130,6 +177,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
net_device *dev,
struct net *net = dev_net(dev);
struct mpls_shim_hdr *hdr;
struct mpls_route *rt;
+   struct mpls_nhlfe *nhlfe;
struct mpls_entry_decoded dec;
struct net_device *out_dev;
struct mpls_dev *mdev;
@@ -137,6 +185,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
net_device *dev,
unsigned int new_header_size;
unsigned int mtu;
int err;
+   int nhidx;
 
/* Careful this entire function runs inside of an rcu critical section 
*/
 
@@ -167,9 +216,12 @@ static int mpls_forward(struct sk_buff *skb, struct 
net_device *dev,
if (!rt)
goto drop;
 
+   mpls_select_multipath(rt, &nhidx);
+   nhlfe = &rt->rt_nh[nhidx];
+
/* Find

Re: pull request: bluetooth 2015-08-11

2015-08-11 Thread David Miller

From: Johan Hedberg 
Date: Tue, 11 Aug 2015 23:04:06 +0300

> Here's an important regression fix for the 4.2-rc series that ensures
> user space isn't given invalid LTK values. The bug essentially prevents
> the encryption of subsequent LE connections, i.e. makes it impossible to
> pair devices over LE.
> 
> Let me know if there are any issues pulling. Thanks.

Pulled, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [BUG] net/ipv4: inconsistent routing table

2015-08-11 Thread David Miller

From: Alexander Duyck 
Date: Tue, 11 Aug 2015 13:52:27 -0700

> On 08/10/2015 04:50 AM, Hannes Frederic Sowa wrote:
>>> 4. document it
>> I prefer that one :)
> 
> Yeah, me too.  The fact is things have worked this way up until now
> and I suspect the reason why this hasn't been reported until now is
> simply because in many cases it works since routes are usually updated
> if you are moving the gateway onto the local system.

+1
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipv6_mc_check_mld - kernel BUG at net/core/skbuff.c:1128

2015-08-11 Thread Linus Lüssing

On Mon, Aug 10, 2015 at 02:56:12PM -0700, Brenden Blanco wrote:
> Doing some code reading with Alexei, we found a suspect commit, which
> introduces an skb_get and skb_may_pull of the same skb, which leads to the BUG
> when skb->len == len.

Urgh, didn't know that pskb_may_pull() doesn't like an skb with a
reference count greater than one... But yes, the BUG() call in
skbuff.c:1128 / pskb_expand_head() says that (though in this case
the BUG() in skbuff.c call actually seems kinda weird (/"wrong"?), as
it isn't shared between different code paths).

Thanks for the thorough analysis, going to provide a patch within
the next 24h (hopefully).

Cheers, Linus
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [BUG] net/ipv4: inconsistent routing table

2015-08-11 Thread Alexander Duyck


On 08/10/2015 04:50 AM, Hannes Frederic Sowa wrote:

Hello,

Zang MingJie  writes:


Here comes several options:

1. reject local next hop w/ EINVAL
2. delete route when local next hop removed

Will also cause some people to complain.


3. transition between RT_SCOPE_HOST and RT_SCOPE_LINK

I don't understand the scope transition. I know Alex mentioned it for
the first time. Maybe he can explain?


If I am not mistaken part of the issue in terms of the behaviour being 
seen is due to the fact that the nexthop scope is recorded only when the 
route is added, and there is code in place in rt_set_nexthop which will 
only use the gateway if the scope is RT_SCOPE_LINK.  So what we would 
probably need to do is go through and audit any routes on a given 
interface every time an address is added or removed and if the nh_gw is 
equal to the address added or removed we would need to transition 
between RT_SCOPE_LINK and RT_SCOPE_HOST since the gateway is 
transitioning between the local system and somewhere on the other side 
of the link.


The problem is that this would still be a behaviour change and there may 
be somebody that has heartburn about it.



4. document it

I prefer that one :)


Yeah, me too.  The fact is things have worked this way up until now and 
I suspect the reason why this hasn't been reported until now is simply 
because in many cases it works since routes are usually updated if you 
are moving the gateway onto the local system.


- Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [net-next PATCH] net: Document xfrm4_gc_thresh and xfrm6_gc_thresh

2015-08-11 Thread David Miller

From: Alexander Duyck 
Date: Tue, 11 Aug 2015 13:35:01 -0700

> This change adds documentation for xfrm4_gc_thresh and xfrm6_gc_thresh
> based on the comments in commit eeb1b73378b56 ("xfrm: Increase the garbage
> collector threshold").
> 
> Signed-off-by: Alexander Duyck 

Thanks, I'll let Steffen review and pick this up.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 0/2] net: thunder: Add ACPI support.

2015-08-11 Thread David Miller

From: Robert Richter 
Date: Tue, 11 Aug 2015 22:12:37 +0200

> On 11.08.15 13:04:55, David Daney wrote:
>> >In the future it might be better structured to try and get the OF
>> >node, and if that fails then try and use the ACPI method to obtain
>> >these values.
>> 
>> Our current approach, as you can see in the patch, is the opposite.  If ACPI
>> is being used, prefer that over the OF device tree.
>> 
>> You seem to be recommending precedence for OF.  It should be consistent
>> across all drivers/sub-systems, so do you really think that OF before ACPI
>> is the way to go?
> 
> If ACPI is enabled then no OF function may be called at all.

That makes no sense to me at all.

If ACPI is enabled, the OF routines should return no nodes etc.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 0/2] net: thunder: Add ACPI support.

2015-08-11 Thread David Miller

From: David Daney 
Date: Tue, 11 Aug 2015 13:04:55 -0700

> You seem to be recommending precedence for OF.  It should be
> consistent across all drivers/sub-systems, so do you really think
> that OF before ACPI is the way to go?

I just think it's more hackish to test acpi_disabled than to
simply see if the matching OF node even exists.

If ACPI is enabled, no OF node will be found.

It could just be my preference for such things.

I really wish it just fell out from the probing method, but
we're using PCI for that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next] documentation: bring vxlan documentation more up-to-date

2015-08-11 Thread Rick Jones

From: Rick Jones 

A few things have changed since the previous version of the vxlan
documentation was written, so update it and correct some grammar and
such while we are at it.

Signed-off-by: Rick Jones 

diff --git a/Documentation/networking/vxlan.txt 
b/Documentation/networking/vxlan.txt
index 6d99351..4126031 100644
--- a/Documentation/networking/vxlan.txt
+++ b/Documentation/networking/vxlan.txt
@@ -1,32 +1,38 @@
 Virtual eXtensible Local Area Networking documentation
 ==
 
-The VXLAN protocol is a tunnelling protocol that is designed to
-solve the problem of limited number of available VLAN's (4096).
-With VXLAN identifier is expanded to 24 bits.
+The VXLAN protocol is a tunnelling protocol that is designed to solve
+the problem of the limited number of available VLAN IDs (4096) in IEEE
+802.1q.  With VXLAN the size of the identifier is expanded to 24 bits
+(16777216).
 
-It is a draft RFC standard, that is implemented by Cisco Nexus,
-Vmware and Brocade. The protocol runs over UDP using a single
-destination port (still not standardized by IANA).
-This document describes the Linux kernel tunnel device,
-there is also an implantation of VXLAN for Openvswitch.
+VXLAN is described by IETF RFC 7348, and has been implemented by a
+number of vendors.  The protocol runs over UDP using a single
+destination port.  This document describes the Linux kernel tunnel
+device, there is also a separate implementation of VXLAN for
+Openvswitch.
 
 Unlike most tunnels, a VXLAN is a 1 to N network, not just point
 to point. A VXLAN device can either dynamically learn the IP address
 of the other end, in a manner similar to a learning bridge, or the
 forwarding entries can be configured statically.
 
-The management of vxlan is done in a similar fashion to it's
-too closest neighbors GRE and VLAN. Configuring VXLAN requires
-the version of iproute2 that matches the kernel release
-where VXLAN was first merged upstream.
+The management of vxlan is done in a similar fashion to its two
+closest neighbors GRE and VLAN. Configuring VXLAN requires the version
+of iproute2 that matches the kernel release where VXLAN was first
+merged upstream.
 
 1. Create vxlan device
-  # ip li add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
-
-This creates a new device (vxlan0). The device uses the
-the multicast group 239.1.1.1 over eth1 to handle packets where
-no entry is in the forwarding table.
+  # ip link add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device named vxlan0.  The device uses the
+multicast group 239.1.1.1 over eth1 to handle traffic for which there
+is no entry in the forwarding table.  The Linux implementation of
+VXLAN pre-dates the IANA's selection of a standard destination port
+number and uses the Linux-selected value by default to maintain
+backwards compatibility.  If you wish to use the IANA-assigned
+destination port number of 4789 you can add "dstport 4789" to the
+command line above.
 
 2. Delete vxlan device
   # ip link delete vxlan0
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[net-next PATCH] net: Document xfrm4_gc_thresh and xfrm6_gc_thresh

2015-08-11 Thread Alexander Duyck

This change adds documentation for xfrm4_gc_thresh and xfrm6_gc_thresh
based on the comments in commit eeb1b73378b56 ("xfrm: Increase the garbage
collector threshold").

Signed-off-by: Alexander Duyck 
---
 Documentation/networking/ip-sysctl.txt |   10 ++
 1 file changed, 10 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 56db1efd7189..46e88ed7f41d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1181,6 +1181,11 @@ tag - INTEGER
Allows you to write a number, which can be used as required.
Default value is 0.
 
+xfrm4_gc_thresh - INTEGER
+   The threshold at which we will start garbage collecting for IPv4
+   destination cache entries.  At twice this value the system will
+   refuse new allocations.
+
 Alexey Kuznetsov.
 kuz...@ms2.inr.ac.ru
 
@@ -1617,6 +1622,11 @@ ratelimit - INTEGER
otherwise the minimal space between responses in milliseconds.
Default: 1000
 
+xfrm6_gc_thresh - INTEGER
+   The threshold at which we will start garbage collecting for IPv6
+   destination cache entries.  At twice this value the system will
+   refuse new allocations.
+
 
 IPv6 Update by:
 Pekka Savola 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Question on behavior of tg3_self_test() (ethtool -t on tg3 driver)

2015-08-11 Thread Michael Chan

On Tue, 2015-08-11 at 14:24 -0500, Douglas Miller wrote: 
> Yes, the "wrap plugs" are the loopback cables/plugs. It is my 
> understanding that the "offline" tests do not require anything to be 
> plugged into the ports, as they do not in any way touch the "external" 
> port. They perform an "internal loopback" test which does not depend on 
> any external connection.

Correct.

> 
>  From what I can tell, the only difference between "offline" and 
> "external_lb" is that "external_lb" performs the external loopback 
> tests, *in addition to* all the tests done for "offline".

Correct.

> This would 
> imply that the only tests that depend on anything connected to the 
> physical port is "external_lb", and there is no requirement that the 
> wrap plugs be removed/replaced in order to run "offline" tests.

When you do external loopback test, we skip the link test because you no
longer have normal connection to the network.  You now use a special
loopback cable, which will fail the link up test because the link up
test assumes connection to the network using normal cable.

> 
> In the case I was debugging, wrap plugs were installed because the ports 
> were, later, being tested in an "external loopback" way.
> 
> What I am observing is that it takes about 20 seconds for the kernel to 
> declare that the link is up, after running the "offline" or 
> "external_lb" test. In the case of "offline" I cannot run the test again 
> until the kernel declares the link up. In the case of "external_lb" I 
> can run the test again immediately and it passes.

As stated earlier, because we skip the link test when we are performing
external_lb.

So, you should always do ethtool -t  external_lb if you have a
loopback cable connected.  We will perform the external loopback test
and skip the link test.

If you don't have an external loopback cable connected, you should run
ethtool -t  offline.  It will not do the external loopback test and
will do the link test for proper link up with the network.

> This suggests to me 
> that the "external_lb" case (again, it is a superset of "offline") is 
> performing some configuration on the port that allows the subsequent 
> test to work. The one significant difference between "offline" and 
> "external_lb" is that "external_lb" performs the
> "tg3_phy_lpbk_set(tp, 0, true);" changes to configuration (immediately 
> prior to running the loopback tests again). I believe this call is to 
> switch from "internal loopback" to "normal", in order to leverage the 
> wrap plugs and perform the external loopback tests. But this call is not 
> made for "offline" and I am wondering if that leaves the port in a state 
> where it cannot be used until the kernel completes the "link up".
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 0/2] net: thunder: Add ACPI support.

2015-08-11 Thread Robert Richter

On 11.08.15 13:04:55, David Daney wrote:
> >In the future it might be better structured to try and get the OF
> >node, and if that fails then try and use the ACPI method to obtain
> >these values.
> 
> Our current approach, as you can see in the patch, is the opposite.  If ACPI
> is being used, prefer that over the OF device tree.
> 
> You seem to be recommending precedence for OF.  It should be consistent
> across all drivers/sub-systems, so do you really think that OF before ACPI
> is the way to go?

If ACPI is enabled then no OF function may be called at all.

With !ACPI or acpi=no kernel parameter, then acpi_disabled is set and
no ACPI function should be called. It always falls back to and only
uses OF/devicetree in this case.

So there is no way to try devicetree first and then use acpi or vice
versa. There is no mixup using acpi or devicetree with the same boot,
either one or the other.

-Robert
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2 0/7] net: dsa: mv88e6xxx: support switchdev FDB objects

2015-08-11 Thread Vivien Didelot

Hi David,

On 15-08-11 12:05:18, David Miller wrote:
> From: David Miller 
> Date: Tue, 11 Aug 2015 12:00:27 -0700 (PDT)
> 
> > Ok, if you guys really want me to I'll do the revert-reapply thing.
> 
> Done.

Thank you, this is much appreciated.

-v
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 0/2] net: thunder: Add ACPI support.

2015-08-11 Thread David Daney

On 08/11/2015 11:49 AM, David Miller wrote:

From: David Daney 
Date: Mon, 10 Aug 2015 17:58:35 -0700

Change from v1:  Drop PHY binding part, use fwnode_property* APIs.

The first patch (1/2) rearranges the existing code a little with no
functional change to get ready for the second.  The second (2/2) does
the actual work of adding support to extract the needed information
from the ACPI tables.

Series applied.

Thank you very much.

In the future it might be better structured to try and get the OF
node, and if that fails then try and use the ACPI method to obtain
these values.

Our current approach, as you can see in the patch, is the opposite.  If 
ACPI is being used, prefer that over the OF device tree.

You seem to be recommending precedence for OF.  It should be consistent 
across all drivers/sub-systems, so do you really think that OF before 
ACPI is the way to go?

Thanks,
David Daney

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

pull request: bluetooth 2015-08-11

2015-08-11 Thread Johan Hedberg

Hi Dave,

Here's an important regression fix for the 4.2-rc series that ensures
user space isn't given invalid LTK values. The bug essentially prevents
the encryption of subsequent LE connections, i.e. makes it impossible to
pair devices over LE.

Let me know if there are any issues pulling. Thanks.

Johan

---
The following changes since commit 2475b22526d70234ecfe4a1ff88aed69badefba9:

  xen-netback: Allocate fraglist early to avoid complex rollback (2015-08-03 
22:23:03 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git 
for-upstream

for you to fetch changes up to cb92205bad2e4dd630b884142dd707b72504c200:

  Bluetooth: fix MGMT_EV_NEW_LONG_TERM_KEY event (2015-08-06 16:36:03 +0200)


Jakub Pawlowski (1):
  Bluetooth: fix MGMT_EV_NEW_LONG_TERM_KEY event

 net/bluetooth/mgmt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


pgpVjr3jMhqxm.pgp
Description: PGP signature

Re: [PATCH 04/10] batman-adv: Make NC capability changes atomic

2015-08-11 Thread Sergei Shtylyov


On 08/11/2015 07:35 PM, Antonio Quartulli wrote:


From: Linus Lüssing 

Bitwise OR/AND assignments in C aren't guaranteed to be atomic. One
OGM handler might undo the set/clear of a specific bit from another
handler run in between.

Fix this by using the atomic set_bit()/clear_bit()/test_bit() functions.

Fixes: 3f4841ffb336 ("batman-adv: tvlv - add network coding container")
Signed-off-by: Linus Lüssing 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
  net/batman-adv/network-coding.c | 7 ---
  net/batman-adv/types.h  | 2 +-
  2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index f0a50f3..cfdc80d 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c

[...]

@@ -894,7 +895,7 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
goto out;

/* check if orig node is network coding enabled */
-   if (!(orig_node->capabilities & BATADV_ORIG_CAPA_HAS_NC))
+   if (!(test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)))


   Likewise, () around the call not needed.

[...]

MBR, Sergei

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 03/10] batman-adv: Make DAT capability changes atomic

2015-08-11 Thread Sergei Shtylyov


Hello.

On 08/11/2015 07:35 PM, Antonio Quartulli wrote:


From: Linus Lüssing 



Bitwise OR/AND assignments in C aren't guaranteed to be atomic. One
OGM handler might undo the set/clear of a specific bit from another
handler run in between.



Fix this by using the atomic set_bit()/clear_bit()/test_bit() functions.



Fixes: 17cf0ea455f1 ("batman-adv: tvlv - add distributed arp table container")
Signed-off-by: Linus Lüssing 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
  net/batman-adv/distributed-arp-table.c | 7 ---
  net/batman-adv/types.h | 4 ++--
  2 files changed, 6 insertions(+), 5 deletions(-)



diff --git a/net/batman-adv/distributed-arp-table.c 
b/net/batman-adv/distributed-arp-table.c
index fb54e6a..244dcf6 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c

[...]

@@ -453,7 +454,7 @@ static bool batadv_is_orig_node_eligible(struct 
batadv_dat_candidate *res,
int j;

/* check if orig node candidate is running DAT */
-   if (!(candidate->capabilities & BATADV_ORIG_CAPA_HAS_DAT))
+   if (!(test_bit(BATADV_ORIG_CAPA_HAS_DAT, &candidate->capabilities)))


   () around the test_bit() call not needed.

[...]

MBR, Sergei

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Question on behavior of tg3_self_test() (ethtool -t on tg3 driver)

2015-08-11 Thread Douglas Miller


Thanks Michael for getting back to me.

Yes, the "wrap plugs" are the loopback cables/plugs. It is my 
understanding that the "offline" tests do not require anything to be 
plugged into the ports, as they do not in any way touch the "external" 
port. They perform an "internal loopback" test which does not depend on 
any external connection.


From what I can tell, the only difference between "offline" and 
"external_lb" is that "external_lb" performs the external loopback 
tests, *in addition to* all the tests done for "offline". This would 
imply that the only tests that depend on anything connected to the 
physical port is "external_lb", and there is no requirement that the 
wrap plugs be removed/replaced in order to run "offline" tests.


In the case I was debugging, wrap plugs were installed because the ports 
were, later, being tested in an "external loopback" way.


What I am observing is that it takes about 20 seconds for the kernel to 
declare that the link is up, after running the "offline" or 
"external_lb" test. In the case of "offline" I cannot run the test again 
until the kernel declares the link up. In the case of "external_lb" I 
can run the test again immediately and it passes. This suggests to me 
that the "external_lb" case (again, it is a superset of "offline") is 
performing some configuration on the port that allows the subsequent 
test to work. The one significant difference between "offline" and 
"external_lb" is that "external_lb" performs the
"tg3_phy_lpbk_set(tp, 0, true);" changes to configuration (immediately 
prior to running the loopback tests again). I believe this call is to 
switch from "internal loopback" to "normal", in order to leverage the 
wrap plugs and perform the external loopback tests. But this call is not 
made for "offline" and I am wondering if that leaves the port in a state 
where it cannot be used until the kernel completes the "link up".


Thanks,
Doug



On 08/11/2015 12:41 PM, Michael Chan wrote:

On Tue, 2015-08-11 at 10:59 -0500, Douglas Miller wrote:

(Sorry if you got several duplicates, am trying to work through rejected
messages due to supposed HTML content)

The following behavior is being observed when running "ethtool -t 
offline" on ports on the Broadcom BCM5719 adapter (tg3 driver). The
ports have wrap plugs on them, although I'm not sure why that would have
any effect.

I'm not sure what are wrap plugs.


The test "ethtool -t  offline" was being run continuously. The
first invocation passes, all subsequent ones fail (at least in the "link
test" step) after ~20 second timeout. When running the test once, I see
the following: Looking at /var/log/messages, I see a "Link is down"
message during the test. Then, 20 seconds after the test completes,
there is a "Link is up..." message. If I wait for the "Link is up..."
message I can run the test without problems. If the test is run again
while the link is still down, it fails and seems to delay the "link up"
by an additional 20 seconds.

When you do offline test, the chip is reset and the PHY is also reset,
causing the link to go down.  Normally, link should come back up within
a few seconds.  The selftest code will wait for 6 seconds for copper and
2 seconds for serdes link to be up before declaring there is no link.

So for whatever reason, the link in your setup takes longer than that to
come up and therefore it fails the link test when you run it in a loop
starting on the 2nd iteration.



If I run "external_lb" instead of "offline", I am able to run the test
repeatedly without error. So it seems that some action taken in the
"external_lb" case actually "repairs" the port. But the "external_lb"
test also exhibits the link-down for 20 seconds symptom, although it can
be run while the link is considered "down" without failure.

External loopback requires a loopback cable.  So you must have a
loopback cable for this test to pass.  Maybe that's what you meant by
wrap plugs.


The first question is whether we should expect to be able to run
"ethtool -t  offline" continually, with no delay between runs. I
presume this is supported.

If your intention is to run external loopback, yes you should specify
external loopback.  Otherwise the driver expects normal link behavior
and that's why it fails.

If you connect a normal cable, then ethtool -t  offline works
repeatedly, right?


Second question, I would like someone with experience with the tg3
driver and this adapter to comment on what might be done to fix this. My
first, simple, guess would be move the "tg3_phy_lpbk_set(tp, 0, true);"
setting (in tg3_test_loopback()) to be done for both "offline" and
"external_lb" cases. I am awaiting time on a system with this adapter in
order to try out some possible fixes and/or debug what might be
wrong/different with the configuration after the "offline" test.

I would appreciate any help,
Thanks,
Doug Miller





--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message

Re: [PATCH] iproute2: Add support for VRF device

2015-08-11 Thread Nikolay Aleksandrov

Hi,
Since there will be another version a few minor nits below,

> On Aug 10, 2015, at 8:50 PM, David Ahern  wrote:
> 
> Allow user to create a vrf device and specify its table binding.
> Based on the iplink_vlan implementation.
> 
> Signed-off-by: Shrijeet Mukherjee 
> Signed-off-by: David Ahern 
> ---
> include/linux/if_link.h |  8 +
> ip/Makefile |  2 +-
> ip/iplink.c |  2 +-
> ip/iplink_vrf.c | 85 +
> 4 files changed, 95 insertions(+), 2 deletions(-)
> create mode 100644 ip/iplink_vrf.c
> 
> diff --git a/include/linux/if_link.h b/include/linux/if_link.h
> index b905cf7f4948..74dedf4320b8 100644
> --- a/include/linux/if_link.h
> +++ b/include/linux/if_link.h
> @@ -338,6 +338,14 @@ enum macvlan_macaddr_mode {
> 
> #define MACVLAN_FLAG_NOPROMISC1
> 
> +/* VRF section */
> +enum {
> + IFLA_VRF_UNSPEC,
> + IFLA_VRF_TABLE,
> + __IFLA_VRF_MAX
> +};
> +
> +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
> /* IPVLAN section */
> enum {
>   IFLA_IPVLAN_UNSPEC,
> diff --git a/ip/Makefile b/ip/Makefile
> index 77653ecc5785..d8b38ac2e44b 100644
> --- a/ip/Makefile
> +++ b/ip/Makefile
> @@ -7,7 +7,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o 
> ipnetns.o \
> iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \
> link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
> iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \
> -iplink_geneve.o
> +iplink_geneve.o iplink_vrf.o
> 
> RTMONOBJ=rtmon.o
> 
> diff --git a/ip/iplink.c b/ip/iplink.c
> index 369d50eab94e..14bf7211a447 100644
> --- a/ip/iplink.c
> +++ b/ip/iplink.c
> @@ -94,7 +94,7 @@ void iplink_usage(void)
>   fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | 
> macvlan | macvtap |\n");
>   fprintf(stderr, "  bridge | bond | ipoib | ip6tnl | 
> ipip | sit | vxlan |\n");
>   fprintf(stderr, "  gre | gretap | ip6gre | ip6gretap | 
> vti | nlmon |\n");
> - fprintf(stderr, "  bond_slave | ipvlan | geneve }\n");
> + fprintf(stderr, "  bond_slave | ipvlan | geneve | vrf 
> }\n");
>   }
>   exit(-1);
> }
> diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
> new file mode 100644
> index ..0d7e21c7c152
> --- /dev/null
> +++ b/ip/iplink_vrf.c
> @@ -0,0 +1,85 @@
> +/* iplink_vrf.c  VRF device support
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License
> + *  as published by the Free Software Foundation; either version
> + *  2 of the License, or (at your option) any later version.
> + *
> + * Authors: Shrijeet Mukherjee 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "rt_names.h"
> +#include "utils.h"
> +#include "ip_common.h"
> +
> +static void vrf_explain(FILE *f)
> +{
> + fprintf(f, "Usage: ... vrf table TABLEID \n");
> +}
> +
> +static void explain(void)
> +{
> + vrf_explain(stderr);
> +}
> +
> +static int table_arg(void)
> +{
> + fprintf(stderr,"Error: argument of \"table\" must be 0-32767 and 
> currently unused\n");
> + return -1;
> +}
> +
> +static int vrf_parse_opt(struct link_util *lu, int argc, char **argv,
> + struct nlmsghdr *n)
> +{
> + while (argc > 0) {
> + if (matches(*argv, "table") == 0) {
> + __u32 table = 0;
> + NEXT_ARG();
^^^
Please leave a line between local variable definitions and code. Also no need
to initialize table to 0.

> +
> + table = atoi(*argv);
> + if (table < 0 || table > 32767)
^^^
table is unsigned, so < 0 will be always false.

> + return table_arg();
> + addattr32(n, 1024, IFLA_VRF_TABLE, table);
> + } else if (matches(*argv, "help") == 0) {
> + explain();
> + return -1;
> + } else {
> + fprintf(stderr, "vrf: unknown option \"%s\"?\n",
> + *argv);
> + explain();
> + return -1;
> + }
> + argc--, argv++;
> + }
> +
> + return 0;
> +}
> +
> +static void vrf_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
> +{
> + if (!tb)
> + return;
> +
> + if (tb[IFLA_VRF_TABLE])
> + fprintf(f, "table %u ", rta_getattr_u32(tb[IFLA_VRF_TABLE]));
> +}
> +
> +static void vrf_print_help(struct link_util *lu, int argc, char **argv,
> +   FILE *f)
> +{
> + vrf_explain(f);
> +}
> +
> +struct link_util vrf_link_util = {
> + .id = "vrf",
> + .maxattr= IFLA_VRF_MAX,
> + .parse_opt  = vrf_parse_opt,
> +

Re: [PATCH] net: fs_enet: mask interrupts for TX partial frames.

2015-08-11 Thread David Miller

From: Christophe Leroy 
Date: Tue, 11 Aug 2015 12:11:03 +0200 (CEST)

> We are not interested in interrupts for partially transmitted frames.
> Unlike SCC and FCC, the FEC doesn't handle the I bit in buffer
> descriptors, instead it defines two interrupt bits, TXB and TXF.
> 
> We have to mask TXB in order to only get interrupts once the
> frame is fully transmitted.
> 
> Signed-off-by: Christophe Leroy 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] net: fs_enet: explicitly remove I flag on TX partial frames

2015-08-11 Thread David Miller

From: Christophe Leroy 
Date: Tue, 11 Aug 2015 12:11:00 +0200 (CEST)

> We are not interested in interrupts for partially transmitted frames,
> we have to clear BD_ENET_TX_INTR explicitly otherwise it may remain
> from a previously used descriptor.
> 
> Signed-off-by: Christophe Leroy 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2 0/7] net: dsa: mv88e6xxx: support switchdev FDB objects

2015-08-11 Thread David Miller

From: David Miller 
Date: Tue, 11 Aug 2015 12:00:27 -0700 (PDT)

> Ok, if you guys really want me to I'll do the revert-reapply thing.

Done.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: [Intel-wired-lan] [PATCH v2 1/2] igb: Teardown SR-IOV before unregister_netdev()

2015-08-11 Thread Brown, Aaron F

> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On
> Behalf Of Alex Williamson
> Sent: Wednesday, July 29, 2015 1:38 PM
> To: intel-wired-...@lists.osuosl.org; da...@davemloft.net; Kirsher,
> Jeffrey T
> Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org
> Subject: [Intel-wired-lan] [PATCH v2 1/2] igb: Teardown SR-IOV before
> unregister_netdev()
> 
> When the .remove() callback for a PF is called, SR-IOV support for the
> device is disabled, which requires unbinding and removing the VFs.
> The VFs may be in-use either by the host kernel or userspace, such as
> assigned to a VM through vfio-pci.  In this latter case, the VFs may
> be removed either by shutting down the VM or hot-unplugging the
> devices from the VM.  Unfortunately in the case of a Windows 2012 R2
> guest, hot-unplug is broken due to the ordering of the PF driver
> teardown.  Disabling SR-IOV prior to unregister_netdev() avoids this
> issue.
> 
> Signed-off-by: Alex Williamson 
> Acked-by: Mitch Williams 
> ---
>  drivers/net/ethernet/intel/igb/igb_main.c |8 
>  1 file changed, 4 insertions(+), 4 deletions(-)

Tested-by: Aaron Brown 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2 0/7] net: dsa: mv88e6xxx: support switchdev FDB objects

2015-08-11 Thread David Miller

From: David Miller 
Date: Tue, 11 Aug 2015 11:52:49 -0700 (PDT)

> From: Vivien Didelot 
> Date: Tue, 11 Aug 2015 14:18:42 -0400 (EDT)
> 
>> On Aug 11, 2015, at 2:07 PM, David da...@davemloft.net wrote:
>> 
>>> From: Florian Fainelli 
>>> Date: Tue, 11 Aug 2015 11:03:35 -0700
>>> 
>>>> Put differently, my question is how do you value not rewriting
>>>> history vs. breaking bisectability (by accident of course)?
>>> 
>>> I never will rewrite history, ever.
>>> 
>>> Too many people clone my tree and depend upon it.
>> 
>> Sorry, I still don't understand. What are the consequences of:
>> 
>> git revert -m 1 f1d5ca4
>> 
>> Then applying v3?
> 
> In this scenario I think a relative fixup works better.
> 
>> You already did that in the past:
>> https://github.com/torvalds/linux/commit/1f2cd84
> 
> Each and every situation is evaluated by me on a case by case
> basis.

Ok, if you guys really want me to I'll do the revert-reapply thing.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 net-next] sky2: use random address if EEPROM is bad

2015-08-11 Thread Sergei Shtylyov


Hello.

On 08/11/2015 05:35 PM, Liviu Dudau wrote:


On some embedded systems the EEPROM does not contain a valid MAC address.
In that case it is better to fallback to a generated mac address and
let init scripts fix the value later.



Reported-by: Liviu Dudau 
Signed-off-by: Stephen Hemminger 
[Changed handcoded setup to use eth_hw_addr_random() instead]
Signed-off-by: Liviu Dudau 
---
I have tested this on my Juno platform and I can successfully do an nfsroot 
boot.



Best regards,
Liviu



  drivers/net/ethernet/marvell/sky2.c | 7 +++
  1 file changed, 7 insertions(+)



diff --git a/drivers/net/ethernet/marvell/sky2.c 
b/drivers/net/ethernet/marvell/sky2.c
index d9f4498..c309879 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -4819,6 +4819,13 @@ static struct net_device *sky2_init_netdev(struct 
sky2_hw *hw, unsigned port,
memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8,
  ETH_ALEN);

+   /* if the address is invalid, use a random value */
+   if (!is_valid_ether_addr(dev->dev_addr)) {
+   netdev_warn(dev,
+   "Invalid MAC address, defaulting to random\n");


   Please start the continuation line right under 'dev' on the broken up line.


+   eth_hw_addr_random(dev);
+   }
+
return dev;
  }


MBR, Sergei

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2 0/7] net: dsa: mv88e6xxx: support switchdev FDB objects

2015-08-11 Thread David Miller

From: Vivien Didelot 
Date: Tue, 11 Aug 2015 14:18:42 -0400 (EDT)

> On Aug 11, 2015, at 2:07 PM, David da...@davemloft.net wrote:
> 
>> From: Florian Fainelli 
>> Date: Tue, 11 Aug 2015 11:03:35 -0700
>> 
>>> Put differently, my question is how do you value not rewriting
>>> history vs. breaking bisectability (by accident of course)?
>> 
>> I never will rewrite history, ever.
>> 
>> Too many people clone my tree and depend upon it.
> 
> Sorry, I still don't understand. What are the consequences of:
> 
> git revert -m 1 f1d5ca4
> 
> Then applying v3?

In this scenario I think a relative fixup works better.

> You already did that in the past:
> https://github.com/torvalds/linux/commit/1f2cd84

Each and every situation is evaluated by me on a case by case
basis.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next 6/9] net: Fix up inet_addr_type checks

2015-08-11 Thread David Miller

From: David Ahern 
Date: Tue, 11 Aug 2015 12:18:20 -0600

> The intent here was to default to current behavior and to keep the
> details of that in one place. If you prefer table id to always enter
> with the right value I can make that happen.

I think it looks better that way.

People reading individual pieces of code can tell what is happening
much more easily.

As currently structured, they have to know the internal details of a
helper function to understand what '0' means.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 net-next] bpf: s390: Fix build error caused by the struct bpf_array member name changed

2015-08-11 Thread David Miller

From: Kaixu Xia 
Date: Tue, 11 Aug 2015 08:56:51 +

> There is a build error that "'struct bpf_array' has no member
> named 'prog'" on s390. In commit 2a36f0b92eb6 ("bpf: Make the
> bpf_prog_array_map more generic"), the member 'prog' of struct
> bpf_array is replaced by 'ptrs'. So this patch fixes it.
> 
> Fixes: 2a36f0b92eb6 ("bpf: Make the bpf_prog_array_map more generic")
> Reported-by: Wu Fengguang 
> Signed-off-by: Kaixu Xia 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 0/2] net: thunder: Add ACPI support.

2015-08-11 Thread David Miller

From: David Daney 
Date: Mon, 10 Aug 2015 17:58:35 -0700

> Change from v1:  Drop PHY binding part, use fwnode_property* APIs.
> 
> The first patch (1/2) rearranges the existing code a little with no
> functional change to get ready for the second.  The second (2/2) does
> the actual work of adding support to extract the needed information
> from the ACPI tables.

Series applied.

In the future it might be better structured to try and get the OF
node, and if that fails then try and use the ACPI method to obtain
these values.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/3] gianfar: remove faulty filer optimizer

2015-08-11 Thread David Miller

From: Jakub Kiciński 
Date: Tue, 11 Aug 2015 16:51:09 +0200

> On Tue, 11 Aug 2015 14:00:23 +, Manoil Claudiu wrote:
>> >-Original Message-
>> >From: Jakub Kicinski [mailto:moorr...@wp.pl]
>> >Sent: Monday, August 10, 2015 11:12 PM
>> >To: David S. Miller; Manoil Claudiu-B08782
>> >Cc: netdev@vger.kernel.org; Jakub Kicinski
>> >Subject: [PATCH 3/3] gianfar: remove faulty filer optimizer
>> >
>> >From: Jakub Kicinski 
>> >
>> >Current filer rule optimization is broken in several ways:
>> > (1) It destroys rule ordering.
>> > (2) It performs reads/writes beyond end of allocated tables.
>> > (3) It breaks badly for rules with more than 2 specifiers
>> > (e.g. matching ip, port, tos).
>> > (4) We observed that the masking rules it generates do not
>> > play well with clustering on P2020.  Only first rule
>> > of the cluster would ever fire.  Given that optimizer
>> > relies heavily on masking this is very hard to fix.
>> >
>> >The fact that nobody noticed (1), (3) or (4) makes me think
>> >that this feature is not very widely used and we should just
>> >remove it.
>> 
>> I'm not familiar with this filer classification code and its
>> author is no longer active apparently.   There is not much of a
>> choice here since this optimization feature is too complex and
>> poorly documented to be reviewed and validated in a reasonable
>> time span.
>> An example, a simple use case showing expected behavior vs.
>> actual behavior would help.
> 
> Sure, sorry, should be part of the submission quite honestly...

I think removing this optimizer is the thing to do as well.

Please respin this patch series with the examples added to the
commit message of patch #3.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2 0/7] net: dsa: mv88e6xxx: support switchdev FDB objects

2015-08-11 Thread Vivien Didelot

Hi David,

On Aug 11, 2015, at 2:07 PM, David da...@davemloft.net wrote:

> From: Florian Fainelli 
> Date: Tue, 11 Aug 2015 11:03:35 -0700
> 
>> Put differently, my question is how do you value not rewriting
>> history vs. breaking bisectability (by accident of course)?
> 
> I never will rewrite history, ever.
> 
> Too many people clone my tree and depend upon it.

Sorry, I still don't understand. What are the consequences of:

git revert -m 1 f1d5ca4

Then applying v3?

You already did that in the past:
https://github.com/torvalds/linux/commit/1f2cd84

Thanks,
-v
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next 6/9] net: Fix up inet_addr_type checks

2015-08-11 Thread David Ahern


On 8/11/15 12:14 PM, David Miller wrote:

From: David Ahern 
Date: Mon, 10 Aug 2015 11:50:33 -0600


@@ -427,6 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
+   int tb_id = 0;
int err;

/* If the socket has its own bind function then use it. (RAW) */
@@ -448,7 +450,16 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
goto out;
}

-   chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+   if (sk->sk_bound_dev_if) {
+   struct net_device *dev;
+
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+   if (dev)
+   tb_id = vrf_dev_table_rcu(dev);
+   rcu_read_unlock();
+   }
+   chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);

/* Not specified by any standard per-se, however it breaks too
 * many applications when removed.  It is unfortunate since

  ...

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b11321a8e58d..d84ae0e30369 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -226,6 +226,9 @@ static inline unsigned int __inet_dev_addr_type(struct net 
*net,

rcu_read_lock();

+   if (!tb_id)
+   tb_id = RT_TABLE_LOCAL;
+
table = fib_get_table(net, tb_id);


All of this code that quietly translates table ID zero into RT_TABLE_LOCAL is
confusing.

It would be so much easier to understand if the code was structured like:

int tb_id = RT_TABLE_LOCAL;

if (doing_vrf_stuff)
tb_id = foo;



The intent here was to default to current behavior and to keep the 
details of that in one place. If you prefer table id to always enter 
with the right value I can make that happen.


David
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next 6/9] net: Fix up inet_addr_type checks

2015-08-11 Thread David Miller

From: David Ahern 
Date: Mon, 10 Aug 2015 11:50:33 -0600

> @@ -427,6 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr 
> *uaddr, int addr_len)
>   struct net *net = sock_net(sk);
>   unsigned short snum;
>   int chk_addr_ret;
> + int tb_id = 0;
>   int err;
>  
>   /* If the socket has its own bind function then use it. (RAW) */
> @@ -448,7 +450,16 @@ int inet_bind(struct socket *sock, struct sockaddr 
> *uaddr, int addr_len)
>   goto out;
>   }
>  
> - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
> + if (sk->sk_bound_dev_if) {
> + struct net_device *dev;
> +
> + rcu_read_lock();
> + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
> + if (dev)
> + tb_id = vrf_dev_table_rcu(dev);
> + rcu_read_unlock();
> + }
> + chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
>  
>   /* Not specified by any standard per-se, however it breaks too
>* many applications when removed.  It is unfortunate since
 ...
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index b11321a8e58d..d84ae0e30369 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -226,6 +226,9 @@ static inline unsigned int __inet_dev_addr_type(struct 
> net *net,
>  
>   rcu_read_lock();
>  
> + if (!tb_id)
> + tb_id = RT_TABLE_LOCAL;
> +
>   table = fib_get_table(net, tb_id);

All of this code that quietly translates table ID zero into RT_TABLE_LOCAL is
confusing.

It would be so much easier to understand if the code was structured like:

int tb_id = RT_TABLE_LOCAL;

if (doing_vrf_stuff)
tb_id = foo;
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next 1/9] net: Introduce VRF related flags and helpers

2015-08-11 Thread David Miller

From: David Ahern 
Date: Mon, 10 Aug 2015 11:50:28 -0600

> +static inline int vrf_dev_table(const struct net_device *dev)
> +{
> + int tb_id = 0;
> +
> + rcu_read_lock();
> + tb_id = vrf_dev_table_rcu(dev);
> + rcu_read_unlock();
> +
> + return tb_id;
> +}

The initialization of "tb_id" to zero in the variable declaration is
unnecessary, please remove it.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2 0/7] net: dsa: mv88e6xxx: support switchdev FDB objects

2015-08-11 Thread David Miller

From: Florian Fainelli 
Date: Tue, 11 Aug 2015 11:03:35 -0700

> Put differently, my question is how do you value not rewriting
> history vs. breaking bisectability (by accident of course)?

I never will rewrite history, ever.

Too many people clone my tree and depend upon it.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2 0/7] net: dsa: mv88e6xxx: support switchdev FDB objects

2015-08-11 Thread Florian Fainelli

On 11/08/15 10:38, David Miller wrote:
> From: Vivien Didelot 
> Date: Tue, 11 Aug 2015 12:25:06 -0400 (EDT)
> 
>> I can work on fixup patches to restore v3 changes on top of v2, but this
>> won't fix the bisectability issue.
>>
>> Instead of fixing individual portions, reverting the merge commit
>> f1d5ca4: "Merge branch 'mv88e6xxx-switchdev-fdb'" would undo all the v2
>> series at once, then v3 can be merged on top of it.
>>
>> Can you consider this as an option?
> 
> Nothing will fix bisectability, so don't try.
> 
> Reverting an entire series when you have the fix available
> already is excessive.
> 
> So as I have already asked you, send a relative fixup to clear
> up this situation.

What if the fix is to actually not break bisectability? Put differently,
my question is how do you value not rewriting history vs. breaking
bisectability (by accident of course)?
-- 
Florian
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: VxLAN support question

2015-08-11 Thread Andrew Qu

Hi Alexei,

I support using mcast group for all VNIs in control plane/data plane.
But in case there is no mcast routing enabled, I need to support
P2P vxlan underlay, hence use of the configuration I showed.

Thanks for the confirmation.

By the way, any informational doc I can read to know about the metadata mode of 
vxlan?

Andrew

 please ignore my company added confidentiality notice on this thread 
=

-Original Message-
From: Alexei Starovoitov [mailto:a...@plumgrid.com] 
Sent: Monday, August 10, 2015 10:37 PM
To: Andrew Qu; David Miller
Cc: tg...@suug.ch; je...@nicira.com; pshe...@nicira.com; netdev@vger.kernel.org
Subject: Re: VxLAN support question

On 8/10/15 4:47 PM, Andrew Qu wrote:
>
> Pretty much what I want is that  kernel will have about 1K interfaces 
> (something like Tunnel100.1-tunnel100.1000 To be created and attached 
> to 1K bridge domains on which each VNI is associated with given VNI to 
> bridge-domain will be assigned using other CLIs)

creating 1k vxlan devices is doable, but you probably want to take a look at 
recently added metadata mode of vxlan.
Also sounds like for each vni you'd need a different multicast group?
What fabric going to support that?

> * Email Confidentiality Notice 

please avoid such banners.

* Email Confidentiality Notice 
The information contained in this e-mail message (including any 
attachments) may be confidential, proprietary, privileged, or otherwise
exempt from disclosure under applicable laws. It is intended to be 
conveyed only to the designated recipient(s). Any use, dissemination, 
distribution, printing, retaining or copying of this e-mail (including its 
attachments) by unintended recipient(s) is strictly prohibited and may 
be unlawful. If you are not an intended recipient of this e-mail, or believe 
that you have received this e-mail in error, please notify the sender 
immediately (by replying to this e-mail), delete any and all copies of 
this e-mail (including any attachments) from your system, and do not
disclose the content of this e-mail to any other person. Thank you!

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

REDIRECT and UDP in client

2015-08-11 Thread Madhan

Hi All ,
I am trying to redirect the udp packet to particular port where I run
my proxy (SOCKS) . I am supposed to obtain the original destination
ipaddress of the client to add to the data field of socks message as
per the RFC .

However , when I use REDIRECT chain as it sets the dest addr to the LO
address i get the LO address 127.0.0.1 and the port where socks proxy
client runs .

I tried patching SO_ORIGINAL_DST to obtain for UDP protocol also . In
this case it would work if I use single FD for single dest . However
when I use single FD for multiple connection it might not work .


I cannot add as part of rsvmsg , as I have to send to the proxy server
in advance .

TPROXY works in prerouting , I am not sure how we use in the client
side to send the first packet with the dest address . (I am not sure
about the TPROXYing though)


Can you please suggest the method to overcome the problem . Thanks in advance.

Regards,
Madhan Raj K ,
Senior Software Engineer - Android R&D ,
Samsung Research Institute - Bangalore .
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2] tcp: reduce cpu usage under tcp memory pressure when SO_SNDBUF is set

2015-08-11 Thread Jason Baron



On 08/11/2015 12:12 PM, Eric Dumazet wrote:
> On Tue, 2015-08-11 at 11:03 -0400, Jason Baron wrote:
> 
>>
>> Yes, so the test case I'm using to test against is somewhat contrived.
>> In that I am simply allocating around 40,000 sockets that are idle to
>> create a 'permanent' memory pressure in the background. Then, I have
>> just 1 flow that sets SO_SNDBUF, which results in the: poll(), write() loop.
>>
>> That said, we encountered this issue initially where we had 10,000+
>> flows and whenever the system would get into memory pressure, we would
>> see all the cpus spin at 100%.
>>
>> So the testcase I wrote, was just a simplistic version for testing. But
>> I am going to try and test against the more realistic workload where
>> this issue was initially observed.
>>
> 
> Note that I am still trying to understand why we need to increase socket
> structure, for something which is inherently a problem of sharing memory
> with an unknown (potentially big) number of sockets.
> 

I was trying to mirror the wakeups when SO_SNDBUF is not set, where we
continue to trigger on 1/3 of the buffer being available, as the
sk->sndbuf is shrunk. And I saw this value as dynamic depending on
number of sockets and read/write buffer usage. So that's where I was
coming from with it.

Also, at least with the .config I have the tcp_sock structure didn't
increase in size (although struct sock did go up by 8 and not 4).

> I suggested to use a flag (one bit).
> 
> If set, then we should fallback to tcp_wmem[0] (each socket has 4096
> bytes, so that we can avoid starvation)
> 
> 
> 

Ok, I will test this approach.

Thanks,

-Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Question on behavior of tg3_self_test() (ethtool -t on tg3 driver)

2015-08-11 Thread Michael Chan

On Tue, 2015-08-11 at 10:59 -0500, Douglas Miller wrote: 
> (Sorry if you got several duplicates, am trying to work through rejected 
> messages due to supposed HTML content)
> 
> The following behavior is being observed when running "ethtool -t  
> offline" on ports on the Broadcom BCM5719 adapter (tg3 driver). The 
> ports have wrap plugs on them, although I'm not sure why that would have 
> any effect.

I'm not sure what are wrap plugs.

> 
> The test "ethtool -t  offline" was being run continuously. The 
> first invocation passes, all subsequent ones fail (at least in the "link 
> test" step) after ~20 second timeout. When running the test once, I see 
> the following: Looking at /var/log/messages, I see a "Link is down" 
> message during the test. Then, 20 seconds after the test completes, 
> there is a "Link is up..." message. If I wait for the "Link is up..." 
> message I can run the test without problems. If the test is run again 
> while the link is still down, it fails and seems to delay the "link up" 
> by an additional 20 seconds.

When you do offline test, the chip is reset and the PHY is also reset,
causing the link to go down.  Normally, link should come back up within
a few seconds.  The selftest code will wait for 6 seconds for copper and
2 seconds for serdes link to be up before declaring there is no link.

So for whatever reason, the link in your setup takes longer than that to
come up and therefore it fails the link test when you run it in a loop
starting on the 2nd iteration.


> If I run "external_lb" instead of "offline", I am able to run the test 
> repeatedly without error. So it seems that some action taken in the 
> "external_lb" case actually "repairs" the port. But the "external_lb" 
> test also exhibits the link-down for 20 seconds symptom, although it can 
> be run while the link is considered "down" without failure.

External loopback requires a loopback cable.  So you must have a
loopback cable for this test to pass.  Maybe that's what you meant by
wrap plugs.

> 
> The first question is whether we should expect to be able to run 
> "ethtool -t  offline" continually, with no delay between runs. I 
> presume this is supported.

If your intention is to run external loopback, yes you should specify
external loopback.  Otherwise the driver expects normal link behavior
and that's why it fails.

If you connect a normal cable, then ethtool -t  offline works
repeatedly, right?

> 
> Second question, I would like someone with experience with the tg3 
> driver and this adapter to comment on what might be done to fix this. My 
> first, simple, guess would be move the "tg3_phy_lpbk_set(tp, 0, true);" 
> setting (in tg3_test_loopback()) to be done for both "offline" and 
> "external_lb" cases. I am awaiting time on a system with this adapter in 
> order to try out some possible fixes and/or debug what might be 
> wrong/different with the configuration after the "offline" test.
> 
> I would appreciate any help,
> Thanks,
> Doug Miller
> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next v2 0/7] net: dsa: mv88e6xxx: support switchdev FDB objects

2015-08-11 Thread David Miller

From: Vivien Didelot 
Date: Tue, 11 Aug 2015 12:25:06 -0400 (EDT)

> I can work on fixup patches to restore v3 changes on top of v2, but this
> won't fix the bisectability issue.
> 
> Instead of fixing individual portions, reverting the merge commit
> f1d5ca4: "Merge branch 'mv88e6xxx-switchdev-fdb'" would undo all the v2
> series at once, then v3 can be merged on top of it.
> 
> Can you consider this as an option?

Nothing will fix bisectability, so don't try.

Reverting an entire series when you have the fix available
already is excessive.

So as I have already asked you, send a relative fixup to clear
up this situation.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Xen-devel] [PATCH v3 0/9] Use correctly the Xen memory terminologies

2015-08-11 Thread David Vrabel

On 07/08/15 17:34, Julien Grall wrote:
> Hi all,
> 
> This patch series aims to use the memory terminologies described in
> include/xen/mm.h [1] for Linux xen code.

Applied to for-linus-4.3, thanks.

David
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net] netconsole: Check for carrier before calling netpoll_send_udp()

2015-08-11 Thread David Miller

From: Jonathan Maxwell 
Date: Tue, 11 Aug 2015 15:53:18 +1000

>> What if the carrier check passes, and then the chip reset starts on
>> another cpu?  You'll have the same problem.
> 
> Okay, let me see if I can come up with a better way to mitigate this.

I personally think that drivers need to synchronize such things
internally.  They are the only entity which knows when it's "OK"
to do whatever the netpoll method does, and they are also the only
entity which can properly synchronize such checks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[iproute PATCH] ip-link: fix minor typo in manpage

2015-08-11 Thread Phil Sutter

Change '-human-readble' to '-human-readable'.

Signed-off-by: Phil Sutter 
---
 man/man8/ip-link.8.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index c123fcc..1c312af 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -889,7 +889,7 @@ output more statistics about packet usage.
 output more detailed information.
 
 .TP
-.BR "\-h", " \-human", " \-human-readble"
+.BR "\-h", " \-human", " \-human-readable"
 output statistics with human readable values number followed by suffix
 
 .TP
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 net-next] sky2: use random address if EEPROM is bad

2015-08-11 Thread Stephen Hemminger

On Tue, 11 Aug 2015 15:35:56 +0100
Liviu Dudau  wrote:

> On some embedded systems the EEPROM does not contain a valid MAC address.
> In that case it is better to fallback to a generated mac address and
> let init scripts fix the value later.
> 
> Reported-by: Liviu Dudau 
> Signed-off-by: Stephen Hemminger 
> [Changed handcoded setup to use eth_hw_addr_random() instead]
> Signed-off-by: Liviu Dudau 
> ---
> I have tested this on my Juno platform and I can successfully do an nfsroot 
> boot.
> 
> Best regards,
> Liviu
> 
>  drivers/net/ethernet/marvell/sky2.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/drivers/net/ethernet/marvell/sky2.c 
> b/drivers/net/ethernet/marvell/sky2.c
> index d9f4498..c309879 100644
> --- a/drivers/net/ethernet/marvell/sky2.c
> +++ b/drivers/net/ethernet/marvell/sky2.c
> @@ -4819,6 +4819,13 @@ static struct net_device *sky2_init_netdev(struct 
> sky2_hw *hw, unsigned port,
>   memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8,
> ETH_ALEN);
>  
> + /* if the address is invalid, use a random value */
> + if (!is_valid_ether_addr(dev->dev_addr)) {
> + netdev_warn(dev,
> + "Invalid MAC address, defaulting to random\n");
> + eth_hw_addr_random(dev);
> + }
> +
>   return dev;
>  }
>  

This is not enough, you need to program the hardware with the new random MAC
address. The easiest way is calling sky2_set_mac_address, but you need to 
convert
the address from array back to sockaddr.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 0/2] of: fsl/fman: reuse the fixed node parsing code

2015-08-11 Thread Stas Sergeev


11.08.2015 19:33, Madalin-Cristian Bucur пишет:

+ Joakim, Shaohui


-Original Message-
From: Stas Sergeev [mailto:s...@list.ru]

08.08.2015 20:32, Florian Fainelli пишет:

CC'ing Stas,

Hi.


Le 08/05/15 07:42, Madalin Bucur a écrit :

The FMan MAC configuration code needs the speed and duplex

information

for fixed-link interfaces that is parsed now by the of function
of_phy_register_fixed_link(). This parses the fixed-link parameters but
does not expose to the caller neither the phy_device pointer nor the
status struct where it loads the fixed-link params.

I have only barely touched that code, but IMO both things
are by design. There are some API deficiencies, and so, many
drivers still use of_phy_find_device() to circumvent the encapsulation
and get the phy_device pointer, but this is unlikely a good thing
to do. I even proposed some API extensions, but there was no
interest.


   By extracting the
fixed-link parsing code from of_phy_register_fixed_link() into a
separate function the parsed values are made available without changing
the existing API. This change also removes a small redundancy in the
previous code calling fixed_phy_register().

Today, the fixed_link is not always fixed.
See for example this patch (already mainlined):
https://lkml.org/lkml/2015/7/20/711
of_phy_is_fixed_link() returns 'true' if you have
managed="in-band-status", and so the SGMII in-band status
can update fixed-link params.

So my question is: why do you even need to know whether
the link is fixed or not? IIRC you can check the phy_device
pointer in the adjust_link callback of of_phy_connect() to get
the current link status values. Why is this not enough for your
task? Maybe the patch description should be updated to include
why the current technique is bad, what is actually fixed by the
change.
I think using the fixed-link DT values directly is not something
to be done. The encapsulation is there for a reason, so maybe
instead we can see what API additions do we need to avoid the
current limitations that force people to use of_phy_find_device()
and other work-arounds.

I need to be able to determine the imposed speed and duplex for fixed link
external PHYs because I need to configure the internal PHY with matching
values. If I do not set the same speed, given the fact that AN needs to be off,
there will be no link and no adjust link to fix things later (and the internal 
PHY is
not updated by adjust link anyway). I do not have access at the phy pointer at
the time I need the speed and duplex, to retrieve the defaults from there and
I've tried to make the smallest changes that allow me to retrieve those without
modifying existing API.
Why is it important to hide the default values from the MAC driver?

My worry is that the fixed values are not really fixed, and
therefore are not always useful to access directly. It is likely
not a problem for your use-case, as, as you say, the AN is
disabled, but this is probably not the best to do in general.
And also you do:
---

-   err = of_phy_register_fixed_link(mac_node);
-   if (err)
+   struct phy_device *phy;
+
+   mac_dev->fixed_link = kzalloc(sizeof(*mac_dev->fixed_link),
+ GFP_KERNEL);
+   if (of_phy_parse_fixed_link(mac_node, mac_dev->fixed_link))
+   goto _return_dev_set_drvdata;
+
+   phy = fixed_phy_register(PHY_POLL, mac_dev->fixed_link,
+mac_node);

---

which means you really want to circumvent the current OF
api quite a lot, without saying why in the patch description.
As such, it may be difficult to review. Could you please write
a more complete description to the patch?

As to your problem: would it be possible to set speed & duplex
after you do of_phy_connect()? It returns the phy_device
pointer, and perhaps you can look into phydev->speed and
phydev->duplex at that point?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2/4] batman-adv: fix kernel crash due to missing NULL checks

2015-08-11 Thread Antonio Quartulli

On 05/08/15 15:15, David Laight wrote:
> So is this test just hiding another bug somewhere??

Hi David and thanks for your feedback.

The point is that we got several bug reports of kernel crashes due to
NULL pointer dereferences in these lines and after having debugged this
problem for quite a while we preferred to move on and propose this patch.

Still, I am personally debugging this part of the code to understand if
we really have something wrong or if this NULL pointer is something we
should expect (and therefore check).

For the time being we think this patch is better than having horrible
kernel crashes, but I hope to come to a definitive conclusion soon.

Cheers,


-- 
Antonio Quartulli



signature.asc
Description: OpenPGP digital signature

[PATCH 10/10] batman-adv: Fix potentially broken skb network header access

2015-08-11 Thread Antonio Quartulli

From: Linus Lüssing 

The two commits noted below added calls to ip_hdr() and ipv6_hdr(). They
need a correctly set skb network header.

Unfortunately we cannot rely on the device drivers to set it for us.
Therefore setting it in the beginning of the according ndo_start_xmit
handler.

Fixes: 1d8ab8d3c176 ("batman-adv: Modified forwarding behaviour for multicast 
packets")
Fixes: ab49886e3da7 ("batman-adv: Add IPv4 link-local/IPv6-ll-all-nodes 
multicast support")
Signed-off-by: Linus Lüssing 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/soft-interface.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index c002961..926292d 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -202,6 +202,7 @@ static int batadv_interface_tx(struct sk_buff *skb,
int gw_mode;
enum batadv_forw_mode forw_mode;
struct batadv_orig_node *mcast_single_orig = NULL;
+   int network_offset = ETH_HLEN;
 
if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
goto dropped;
@@ -214,14 +215,18 @@ static int batadv_interface_tx(struct sk_buff *skb,
case ETH_P_8021Q:
vhdr = vlan_eth_hdr(skb);
 
-   if (vhdr->h_vlan_encapsulated_proto != ethertype)
+   if (vhdr->h_vlan_encapsulated_proto != ethertype) {
+   network_offset += VLAN_HLEN;
break;
+   }
 
/* fall through */
case ETH_P_BATMAN:
goto dropped;
}
 
+   skb_set_network_header(skb, network_offset);
+
if (batadv_bla_tx(bat_priv, skb, vid))
goto dropped;
 
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 09/10] batman-adv: remove broadcast packets scheduled for purged outgoing if

2015-08-11 Thread Antonio Quartulli

From: Simon Wunderlich 

When an interface is purged, the broadcast packets scheduled for this
interface should get purged as well.

Signed-off-by: Simon Wunderlich 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/send.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 0a01992..191076e 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -616,7 +616,8 @@ batadv_purge_outstanding_packets(struct batadv_priv 
*bat_priv,
 * we delete only packets belonging to the given interface
 */
if ((hard_iface) &&
-   (forw_packet->if_incoming != hard_iface))
+   (forw_packet->if_incoming != hard_iface) &&
+   (forw_packet->if_outgoing != hard_iface))
continue;
 
spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 02/10] batman-adv: Avoid u32 overflow during gateway select

2015-08-11 Thread Antonio Quartulli

From: Ruben Wisniewski 

The gateway selection based on fast connections is using a single value
calculated from the average tq (0-255) and the download bandwidth (in
100Kibit). The formula for the first step (tq ** 2 * 10000 * bandwidth)
tends to overflow a u32 with low bandwidth settings like 50 [100KiBit]
and a tq value of over 92.

Changing this to a 64-bit unsigned integer makes it possible to support a
bandwidth_down of up to ~2.8e10 [100KiBit] with a perfect tq of 255. This
is ~6.6 times higher than the maximum possible value of the gateway
announcement TVLV.

This problem only affects the non-default gw_sel_class 1.

Signed-off-by: Ruben Wisniewski 
[s...@narfation.org: rewritten commit message]
Signed-off-by: Sven Eckelmann 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/gateway_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index e1e1f31..4ac24d8 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -153,7 +153,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
struct batadv_neigh_node *router;
struct batadv_neigh_ifinfo *router_ifinfo;
struct batadv_gw_node *gw_node, *curr_gw = NULL;
-   uint32_t max_gw_factor = 0, tmp_gw_factor = 0;
+   uint64_t max_gw_factor = 0, tmp_gw_factor = 0;
uint8_t max_tq = 0;
uint8_t tq_avg;
struct batadv_orig_node *orig_node;
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 04/10] batman-adv: Make NC capability changes atomic

2015-08-11 Thread Antonio Quartulli

From: Linus Lüssing 

Bitwise OR/AND assignments in C aren't guaranteed to be atomic. One
OGM handler might undo the set/clear of a specific bit from another
handler run in between.

Fix this by using the atomic set_bit()/clear_bit()/test_bit() functions.

Fixes: 3f4841ffb336 ("batman-adv: tvlv - add network coding container")
Signed-off-by: Linus Lüssing 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/network-coding.c | 7 ---
 net/batman-adv/types.h  | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index f0a50f3..cfdc80d 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -19,6 +19,7 @@
 #include "main.h"
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -134,9 +135,9 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct 
batadv_priv *bat_priv,
  uint16_t tvlv_value_len)
 {
if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND)
-   orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_NC;
+   clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities);
else
-   orig->capabilities |= BATADV_ORIG_CAPA_HAS_NC;
+   set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities);
 }
 
 /**
@@ -894,7 +895,7 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
goto out;
 
/* check if orig node is network coding enabled */
-   if (!(orig_node->capabilities & BATADV_ORIG_CAPA_HAS_NC))
+   if (!(test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)))
goto out;
 
/* accept ogms from 'good' neighbors and single hop neighbors */
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 29fd625..ed4aec5 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -314,7 +314,7 @@ struct batadv_orig_node {
  */
 enum batadv_orig_capabilities {
BATADV_ORIG_CAPA_HAS_DAT,
-   BATADV_ORIG_CAPA_HAS_NC = BIT(1),
+   BATADV_ORIG_CAPA_HAS_NC,
BATADV_ORIG_CAPA_HAS_TT = BIT(2),
BATADV_ORIG_CAPA_HAS_MCAST = BIT(3),
 };
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

1 2 >

1 - 100 of 140 matches

Mail list logo