[PATCH next 3/3] ipvlan: misc changes

2016-02-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

1. Scope corrections for a few functions that are used only in a single file.
2. Adjust variables used in the fast path so they fit into a single cacheline
   (a quick layout check follows below).
3. Update rcv_frame() to skip the skb-share check for frames arriving over the wire.
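
Point (2) can be sanity-checked outside the kernel. A rough user-space model
of struct ipvl_port (stand-in types; 64-bit pointers and 64-byte cachelines
assumed, so the offsets are illustrative only) showing the hot 'mode' field
landing well before the cold RCU head:

#include <stdio.h>
#include <stddef.h>

struct list_head { void *next, *prev; };

struct ipvl_port_model {
	void *dev;
	struct list_head hlhead[8];	/* stand-in for the addr hash table */
	struct list_head ipvlans;
	unsigned short mode;		/* fast path: moved up in this patch */
	/* ... wq, backlog, count elided ... */
	char rcu[16];			/* cold: rcu head moved to the tail */
};

int main(void)
{
	printf("mode at offset %zu (cacheline %zu)\n",
	       offsetof(struct ipvl_port_model, mode),
	       offsetof(struct ipvl_port_model, mode) / 64);
	printf("rcu  at offset %zu (cacheline %zu)\n",
	       offsetof(struct ipvl_port_model, rcu),
	       offsetof(struct ipvl_port_model, rcu) / 64);
	return 0;
}

'mode' is read on every packet while the RCU head is only touched at
teardown, hence the reordering.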

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan.h  |  9 +++--
 drivers/net/ipvlan/ipvlan_core.c | 27 ---
 drivers/net/ipvlan/ipvlan_main.c |  2 +-
 3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 817cab1a7959..695a5dc9ace3 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -84,19 +84,19 @@ struct ipvl_addr {
 #define ip4addr ipu.ip4
struct hlist_node   hlnode;  /* Hash-table linkage */
struct list_headanode;   /* logical-interface linkage */
-   struct rcu_head rcu;
ipvl_hdr_type   atype;
+   struct rcu_head rcu;
 };
 
 struct ipvl_port {
struct net_device   *dev;
struct hlist_head   hlhead[IPVLAN_HASH_SIZE];
struct list_headipvlans;
-   struct rcu_head rcu;
+   u16 mode;
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
-   u16 mode;
+   struct rcu_head rcu;
 };
 
 static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d)
@@ -114,7 +114,6 @@ static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
return rtnl_dereference(d->rx_handler_data);
 }
 
-void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev);
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
@@ -124,7 +123,5 @@ void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
 struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
   const void *iaddr, bool is_v6);
 bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
-struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
-   const void *iaddr, bool is_v6);
 void ipvlan_ht_addr_del(struct ipvl_addr *addr);
 #endif /* __IPVLAN_H */
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 21c380f9ccd5..5f79cd3eb8be 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -53,8 +53,8 @@ static u8 ipvlan_get_v4_hash(const void *iaddr)
   IPVLAN_HASH_MASK;
 }
 
-struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
-   const void *iaddr, bool is_v6)
+static struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
+  const void *iaddr, bool is_v6)
 {
struct ipvl_addr *addr;
u8 hash;
@@ -265,20 +265,25 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb,
struct sk_buff *skb = *pskb;
 
len = skb->len + ETH_HLEN;
-   if (unlikely(!(dev->flags & IFF_UP))) {
-   kfree_skb(skb);
-   goto out;
-   }
+   /* Only packets exchanged between two local slaves need to have
+* device-up check as well as skb-share check.
+*/
+   if (local) {
+   if (unlikely(!(dev->flags & IFF_UP))) {
+   kfree_skb(skb);
+   goto out;
+   }
 
-   skb = skb_share_check(skb, GFP_ATOMIC);
-   if (!skb)
-   goto out;
+   skb = skb_share_check(skb, GFP_ATOMIC);
+   if (!skb)
+   goto out;
 
-   *pskb = skb;
+   *pskb = skb;
+   }
skb->dev = dev;
-   skb->pkt_type = PACKET_HOST;
 
if (local) {
+   skb->pkt_type = PACKET_HOST;
if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
success = true;
} else {
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 54c542526262..b20951583b46 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -9,7 +9,7 @@
 
 #include "ipvlan.h"
 
-void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
+static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
 {
ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj;
 }
-- 
2.7.0.rc3.207.g0ac5344



[PATCH net-next 5/6] tcp: make congestion control more robust against reordering

2016-02-02 Thread Yuchung Cheng
This change enables congestion control to update cwnd based not only
on packets cumulatively acked but also on packets delivered
out-of-order. This makes congestion control robust against packet
reordering, because cwnd may keep growing as long as packets are being
delivered once reordering has been detected (i.e., it only cares about
the number of packets delivered, not the ordering among them).

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Signed-off-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2d6eaad..39c5326 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3662,7 +3662,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_cwnd_reduction(sk, acked_sacked, flag);
} else if (tcp_may_raise_cwnd(sk, flag)) {
/* Advance cwnd if state allows */
-   tcp_cong_avoid(sk, ack, acked);
+   tcp_cong_avoid(sk, ack, acked_sacked);
}
 
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
-- 
2.7.0.rc3.207.g0ac5344



[PATCH net-next 4/6] tcp: refactor pkts acked accounting

2016-02-02 Thread Yuchung Cheng
A small refactoring that obtains the number of packets cumulatively
acked directly from tcp_clean_rtx_queue().

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Signed-off-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e7a8ce7..2d6eaad 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3106,7 +3106,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  * arrived at the other end.
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-  u32 prior_snd_una,
+  u32 prior_snd_una, int *acked,
   struct tcp_sacktag_state *sack)
 {
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3279,6 +3279,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
}
 #endif
+   *acked = pkts_acked;
return flag;
 }
 
@@ -3644,10 +3645,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto no_queue;
 
/* See if we can take anything off of the retransmit queue. */
-   acked = tp->packets_out;
-   flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+   flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
   &sack_state);
-   acked -= tp->packets_out;
 
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-- 
2.7.0.rc3.207.g0ac5344



[PATCH net-next 3/6] tcp: new delivery accounting

2016-02-02 Thread Yuchung Cheng
This patch changes the accounting of how many packets are
newly acked or sacked when the sender receives an ACK.

The current approach basically computes

   newly_acked_sacked = (prior_packets - prior_sacked) -
(tp->packets_out - tp->sacked_out)

   where prior_packets and prior_sacked are snapshots taken
   at the beginning of the ACK processing.

The new approach tracks the delivery information via a new
TCP state variable "delivered", which monotonically increases
as new packets are delivered, in order or out-of-order.

The reason for this change is that the current approach is
brittle and produces negative or inaccurate estimates.

   1) For non-SACK connections, an ACK that advances the SND.UNA
   could reset the DUPACK counters (tp->sacked_out) in
   tcp_process_loss() or tcp_fastretrans_alert(). This inflates
   the inflight suddenly and causes under-estimate or even
   negative estimate. Here is a real example:

                before   after (processing ACK)
   packets_out  75       73
   sacked_out   23        0
   ca state     Loss     Open

   The old approach computes (75-23) - (73-0) = -21 delivered,
   while the new approach computes 1 delivered, since it
   considers the 2nd-24th packets to be delivered out-of-order.

   2) An MSS change would re-count packets_out and sacked_out, so
   the estimate is inaccurate and can even become negative.
   E.g., the inflight is doubled when MSS is halved.

   3) Spurious retransmissions signaled by DSACK are not accounted for.

The new approach is simpler and more robust. For SACK connections,
tp->delivered increments as packets are being acked or sacked in
SACK and ACK processing.

For non-SACK connections, it's done in tcp_remove_reno_sacks() and
tcp_add_reno_sack(). When an ACK advances SND.UNA, tp->delivered
is incremented by the number of packets ACKed (less the current
number of DUPACKs received plus one packet hole).  Upon receiving
a DUPACK, tp->delivered is incremented assuming one out-of-order
packet is delivered.

Upon receiving a DSACK, tp->delivered is incremented assuming one
retransmission is delivered, in tcp_sacktag_write_queue().
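
The arithmetic above can be reproduced in isolation. A standalone
model (not the kernel code) using the snapshot values from the example:

#include <stdio.h>

int main(void)
{
	/* Snapshots from the example above (non-SACK, Loss -> Open). */
	int prior_packets = 75, prior_sacked = 23;
	int packets_out = 73, sacked_out = 0;	/* DUPACK count was reset */

	/* Old estimate: goes negative when sacked_out is reset. */
	int old = (prior_packets - prior_sacked) - (packets_out - sacked_out);
	printf("old newly_acked_sacked = %d\n", old);	/* -21 */

	/* New scheme, modeling tcp_remove_reno_sacks():
	 * delivered += max(acked - sacked_out, 1),
	 * with acked = 2 cumulatively ACKed and sacked_out still 23. */
	int acked = prior_packets - packets_out;	/* 2 */
	int delta = acked - prior_sacked > 1 ? acked - prior_sacked : 1;
	printf("new delivered delta   = %d\n", delta);	/* 1 */
	return 0;
}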

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Signed-off-by: Eric Dumazet 
---
 include/linux/tcp.h  |  1 +
 net/ipv4/tcp_input.c | 21 +++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b386361..d909fee 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -256,6 +256,7 @@ struct tcp_sock {
u32 prr_delivered;  /* Number of newly delivered packets to
 * receiver in Recovery. */
u32 prr_out;/* Total number of pkts sent during Recovery. */
+   u32 delivered;  /* Total data packets delivered incl. rexmits */
 
u32 rcv_wnd;/* Current receiver window  */
u32 write_seq;  /* Tail(+1) of data held in tcp send buffer */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f2ebe8b..e7a8ce7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1214,6 +1214,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
+   tp->delivered += pcount;  /* Out-of-order packets delivered */
 
fack_count += pcount;
 
@@ -1825,8 +1826,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 static void tcp_add_reno_sack(struct sock *sk)
 {
struct tcp_sock *tp = tcp_sk(sk);
+   u32 prior_sacked = tp->sacked_out;
+
tp->sacked_out++;
tcp_check_reno_reordering(sk, 0);
+   if (tp->sacked_out > prior_sacked)
+   tp->delivered++; /* Some out-of-order packet is delivered */
tcp_verify_left_out(tp);
 }
 
@@ -1838,6 +1843,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked)
 
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
+   tp->delivered += max_t(int, acked - tp->sacked_out, 1);
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
@@ -3158,10 +3164,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_ORIG_SACK_ACKED;
}
 
-   if (sacked & TCPCB_SACKED_ACKED)
+   if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
-   else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
-   tcp_rack_advance(tp, >skb_mstamp, sacked);
+   } else if (tcp_is_sack(tp)) {
+   tp->delivered += acked_pcount;
+   if (!tcp_skb_spurious_retrans(tp, skb))

Re: [PATCH v2] openvswitch: allow management from inside user namespaces

2016-02-02 Thread Serge Hallyn
Quoting Tycho Andersen (tycho.ander...@canonical.com):
> Operations with the GENL_ADMIN_PERM flag fail permissions checks because
> this flag means we call netlink_capable, which uses the init user ns.
> 
> Instead, let's introduce a new flag, GENL_UNS_ADMIN_PERM for operations
> which should be allowed inside a user namespace.
> 
> The motivation for this is to be able to run openvswitch in unprivileged
> containers. I've tested this and it seems to work, but I really have no
> idea about the security consequences of this patch, so thoughts would be
> much appreciated.
> 
> v2: use the GENL_UNS_ADMIN_PERM flag instead of a check in each function
> 
> Reported-by: James Page 
> Signed-off-by: Tycho Andersen 
> CC: Eric Biederman 
> CC: Pravin Shelar 
> CC: Justin Pettit 
> CC: "David S. Miller" 
> ---
>  include/uapi/linux/genetlink.h |  1 +
>  net/netlink/genetlink.c|  6 --
>  net/openvswitch/datapath.c | 20 ++--
>  3 files changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
> index c3363ba..5512c90 100644
> --- a/include/uapi/linux/genetlink.h
> +++ b/include/uapi/linux/genetlink.h
> @@ -21,6 +21,7 @@ struct genlmsghdr {
>  #define GENL_CMD_CAP_DO  0x02
>  #define GENL_CMD_CAP_DUMP0x04
>  #define GENL_CMD_CAP_HASPOL  0x08
> +#define GENL_UNS_ADMIN_PERM  0x10
>  
>  /*
>   * List of reserved static generic netlink identifiers:
> diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
> index f830326..6bbb3eb 100644
> --- a/net/netlink/genetlink.c
> +++ b/net/netlink/genetlink.c
> @@ -576,8 +576,10 @@ static int genl_family_rcv_msg(struct genl_family *family,
>   if (ops == NULL)
>   return -EOPNOTSUPP;
>  
> - if ((ops->flags & GENL_ADMIN_PERM) &&
> - !netlink_capable(skb, CAP_NET_ADMIN))
> + if (((ops->flags & GENL_ADMIN_PERM) &&
> + !netlink_capable(skb, CAP_NET_ADMIN)) ||

Seems like this would be a lot clearer if you split it up, i.e.:

/* CAP_NET_ADMIN required against initial user_ns */
if ((ops->flags & GENL_ADMIN_PERM) &&
!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;

/* CAP_NET_ADMIN required against device user_ns */
if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;

> + ((ops->flags & GENL_UNS_ADMIN_PERM) &&
> + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)))
>   return -EPERM;
>  
>   if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index deadfda..d6f7fe9 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -654,7 +654,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
>  
>  static const struct genl_ops dp_packet_genl_ops[] = {
>   { .cmd = OVS_PACKET_CMD_EXECUTE,
> -   .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> +   .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */

Hm, I'd like to suggest adding 'over netns', but I guess that breaks 80 cols...

> .policy = packet_policy,
> .doit = ovs_packet_cmd_execute
>   }
> @@ -1391,12 +1391,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
>  
>  static const struct genl_ops dp_flow_genl_ops[] = {
>   { .cmd = OVS_FLOW_CMD_NEW,
> -   .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> +   .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> .policy = flow_policy,
> .doit = ovs_flow_cmd_new
>   },
>   { .cmd = OVS_FLOW_CMD_DEL,
> -   .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> +   .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> .policy = flow_policy,
> .doit = ovs_flow_cmd_del
>   },
> @@ -1407,7 +1407,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
> .dumpit = ovs_flow_cmd_dump
>   },
>   { .cmd = OVS_FLOW_CMD_SET,
> -   .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> +   .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> .policy = flow_policy,
> .doit = ovs_flow_cmd_set,
>   },
> @@ -1777,12 +1777,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
>  
>  static const struct genl_ops dp_datapath_genl_ops[] = {
>   { .cmd = OVS_DP_CMD_NEW,
> -   .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> +   .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
> .policy = datapath_policy,
> .doit = 

[PATCH net-next 1/6] tcp: retransmit after recovery processing and congestion control

2016-02-02 Thread Yuchung Cheng
The retransmission and F-RTO transmission currently happen inside
recovery state processing (tcp_fastretrans_alert) but before
congestion control.  This refactoring moves the logic after both,
so that we can determine how much to send (cwnd) before deciding
what to send.

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Signed-off-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 58 +---
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0003d40..482c0b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -126,6 +126,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
 #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
 
+#define REXMIT_NONE0 /* no loss recovery to do */
+#define REXMIT_LOST1 /* retransmit packets marked lost */
+#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+
 /* Adapt the MSS value used to make delayed ack decision to the
  * real world.
  */
@@ -2664,7 +2668,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
  * recovered or spurious. Otherwise retransmits more on partial ACKs.
  */
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+int *rexmit)
 {
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
@@ -2686,10 +2691,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
-   __tcp_push_pending_frames(sk, tcp_current_mss(sk),
- TCP_NAGLE_OFF);
-   if (after(tp->snd_nxt, tp->high_seq))
-   return; /* Step 2.b */
+   /* Step 2.b. Try send new data (but deferred until cwnd
+* is updated in tcp_ack()). Otherwise fall back to
+* the conventional recovery.
+*/
+   if (tcp_send_head(sk) &&
+   after(tcp_wnd_end(tp), tp->snd_nxt)) {
+   *rexmit = REXMIT_NEW;
+   return;
+   }
tp->frto = 0;
}
}
@@ -2708,7 +2718,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
-   tcp_xmit_retransmit_queue(sk);
+   *rexmit = REXMIT_LOST;
 }
 
 /* Undo during fast recovery after partial ACK. */
@@ -2758,7 +2768,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
  const int prior_unsacked,
- bool is_dupack, int flag)
+ bool is_dupack, int flag, int *rexmit)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2833,7 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
break;
case TCP_CA_Loss:
-   tcp_process_loss(sk, flag, is_dupack);
+   tcp_process_loss(sk, flag, is_dupack, rexmit);
if (icsk->icsk_ca_state != TCP_CA_Open &&
!(flag & FLAG_LOST_RETRANS))
return;
@@ -2873,7 +2883,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
-   tcp_xmit_retransmit_queue(sk);
+   *rexmit = REXMIT_LOST;
 }
 
 /* Kathleen Nichols' algorithm for tracking the minimum value of
@@ -3508,6 +3518,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
icsk->icsk_ca_ops->in_ack_event(sk, flags);
 }
 
+/* Congestion control has updated the cwnd already. So if we're in
+ * loss recovery then now we do any new sends (for FRTO) or
+ * retransmits (for CA_Loss or CA_recovery) that make sense.
+ */
+static void tcp_xmit_recovery(struct sock *sk, int rexmit)
+{
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   if (rexmit == REXMIT_NONE)
+   return;
+
+   if (unlikely(rexmit == 2)) {
+   __tcp_push_pending_frames(sk, 

[PATCH net-next 6/6] tcp: tcp_cong_control helper

2016-02-02 Thread Yuchung Cheng
Refactor and consolidate cwnd and rate updates into a new function
tcp_cong_control().

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Signed-off-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 31 +++
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 39c5326..52aa5df 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3325,6 +3325,24 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
return flag & FLAG_DATA_ACKED;
 }
 
+/* The "ultimate" congestion control function that aims to replace the rigid
+ * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
+ * It's called toward the end of processing an ACK with precise rate
+ * information. All transmission or retransmission are delayed afterwards.
+ */
+static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+int flag)
+{
+   if (tcp_in_cwnd_reduction(sk)) {
+   /* Reduce cwnd if state mandates */
+   tcp_cwnd_reduction(sk, acked_sacked, flag);
+   } else if (tcp_may_raise_cwnd(sk, flag)) {
+   /* Advance cwnd if state allows */
+   tcp_cong_avoid(sk, ack, acked_sacked);
+   }
+   tcp_update_pacing_rate(sk);
+}
+
 /* Check that window update is acceptable.
  * The function assumes that snd_una<=ack<=snd_next.
  */
@@ -3555,7 +3573,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets = tp->packets_out;
u32 prior_delivered = tp->delivered;
int acked = 0; /* Number of packets newly acked */
-   u32 acked_sacked; /* Number of packets newly acked or sacked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 
sack_state.first_sackt.v64 = 0;
@@ -3655,16 +3672,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
 
-   acked_sacked = tp->delivered - prior_delivered;
-   /* Advance cwnd if state allows */
-   if (tcp_in_cwnd_reduction(sk)) {
-   /* Reduce cwnd if state mandates */
-   tcp_cwnd_reduction(sk, acked_sacked, flag);
-   } else if (tcp_may_raise_cwnd(sk, flag)) {
-   /* Advance cwnd if state allows */
-   tcp_cong_avoid(sk, ack, acked_sacked);
-   }
-
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
@@ -3673,7 +3680,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
-   tcp_update_pacing_rate(sk);
+   tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
tcp_xmit_recovery(sk, rexmit);
return 1;
 
-- 
2.7.0.rc3.207.g0ac5344



[PATCH v3 net-next 0/2] tcp: Redundant Data Bundling (RDB)

2016-02-02 Thread Bendik Rønning Opstad

Redundant Data Bundling (RDB) is a mechanism for TCP aimed at reducing
the latency for applications sending time-dependent data.
Latency-sensitive applications or services, such as online games and
remote desktop, produce traffic with thin-stream characteristics:
small packets and a relatively high ITT. By bundling
already sent data in packets with new data, RDB alleviates head-of-line
blocking by reducing the need to retransmit data segments when packets
are lost. RDB is a continuation of the work on latency improvements for
TCP in Linux, previously resulting in two thin-stream mechanisms in the
Linux kernel
(https://github.com/torvalds/linux/blob/master/Documentation/networking/tcp-thin.txt).

The RDB implementation has been thoroughly tested, and shows
significant latency reductions when packet loss occurs[1]. The tests
show that, by imposing restrictions on the bundling rate, RDB can be
kept from unfairly affecting competing traffic.

Note: Current patch set depends on the patch "tcp: refactor struct tcp_skb_cb"
(http://patchwork.ozlabs.org/patch/510674)

These patches have also been tested with a set of packetdrill scripts
located at
https://github.com/bendikro/packetdrill/tree/master/gtests/net/packetdrill/tests/linux/rdb
(The tests require patching packetdrill with a new socket option:
https://github.com/bendikro/packetdrill/commit/9916b6c53e33dd04329d29b7d8baf703b2c2ac1b)

Detailed info about the RDB mechanism can be found at
http://mlab.no/blog/2015/10/redundant-data-bundling-in-tcp, as well as
in the paper "Latency and Fairness Trade-Off for Thin Streams using
Redundant Data Bundling in TCP"[2].

[1] http://home.ifi.uio.no/paalh/students/BendikOpstad.pdf
[2] http://home.ifi.uio.no/bendiko/rdb_fairness_tradeoff.pdf

Changes:

v3 (PATCH):
 * tcp-Add-Redundant-Data-Bundling-RDB:
   * Changed name of sysctl variable from tcp_rdb_max_skbs to
 tcp_rdb_max_packets after comment from Eric Dumazet about
 not exposing internal (kernel) names like skb.
   * Formatting and function docs fixes

v2 (RFC/PATCH):
 * tcp-Add-DPIFL-thin-stream-detection-mechanism:
   * Change calculation in tcp_stream_is_thin_dpifl based on
 feedback from Eric Dumazet.

 * tcp-Add-Redundant-Data-Bundling-RDB:
   * Removed setting nonagle in do_tcp_setsockopt (TCP_RDB)
 to reduce complexity as commented by Neal Cardwell.
   * Cleaned up loss detection code in rdb_check_rtx_queue_loss

v1 (RFC/PATCH)


Bendik Rønning Opstad (2):
  tcp: Add DPIFL thin stream detection mechanism
  tcp: Add Redundant Data Bundling (RDB)

 Documentation/networking/ip-sysctl.txt |  23 +++
 include/linux/skbuff.h |   1 +
 include/linux/tcp.h|   3 +-
 include/net/tcp.h  |  35 +
 include/uapi/linux/tcp.h   |   1 +
 net/core/skbuff.c  |   3 +-
 net/ipv4/Makefile  |   3 +-
 net/ipv4/sysctl_net_ipv4.c |  35 +
 net/ipv4/tcp.c |  16 +-
 net/ipv4/tcp_input.c   |   3 +
 net/ipv4/tcp_output.c  |  11 +-
 net/ipv4/tcp_rdb.c | 273 +
 12 files changed, 399 insertions(+), 8 deletions(-)
 create mode 100644 net/ipv4/tcp_rdb.c

-- 
1.9.1



[PATCH v3 net-next 2/2] tcp: Add Redundant Data Bundling (RDB)

2016-02-02 Thread Bendik Rønning Opstad
RDB is a mechanism that enables a TCP sender to bundle redundant
(already sent) data with TCP packets containing new data. By bundling
(retransmitting) already sent data with each TCP packet containing new
data, the connection will be more resistant to sporadic packet loss
which reduces the application layer latency significantly in congested
scenarios.

The main functionality added:

  o Loss detection of hidden loss events: When bundling redundant data
with each packet, packet loss can be hidden from the TCP engine due
to lack of dupACKs. This is because the loss is "repaired" by the
redundant data in the packet coming after the lost packet. Based on
incoming ACKs, such hidden loss events are detected, and CWR state
is entered.

  o When packets are scheduled for transmission, RDB replaces the SKB to
be sent with a modified SKB containing the redundant data of
previously sent data segments from the TCP output queue.

  o RDB will only be used for streams classified as thin by the function
tcp_stream_is_thin_dpifl(). This enforces a lower bound on the ITT
for streams that may benefit from RDB, controlled by the sysctl
variable tcp_thin_dpifl_itt_lower_bound.

RDB is enabled on a connection with the socket option TCP_RDB, or on all
new connections by setting the sysctl variable tcp_rdb=1.
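
For illustration, a minimal user-space sketch of the per-connection
knob. The TCP_RDB option value comes from this series' change to
include/uapi/linux/tcp.h and is not reproduced here, so the #ifdef
keeps this compilable against unpatched headers:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int on = 1;

	if (fd < 0)
		return 1;
#ifdef TCP_RDB
	/* Enable redundant data bundling on this connection only. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_RDB, &on, sizeof(on)) < 0)
		perror("setsockopt(TCP_RDB)");
#else
	fprintf(stderr, "TCP_RDB not in these headers (series not applied)\n");
#endif
	return 0;
}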

Cc: Andreas Petlund 
Cc: Carsten Griwodz 
Cc: Pål Halvorsen 
Cc: Jonas Markussen 
Cc: Kristian Evensen 
Cc: Kenneth Klette Jonassen 
Signed-off-by: Bendik Rønning Opstad 
---
 Documentation/networking/ip-sysctl.txt |  15 ++
 include/linux/skbuff.h |   1 +
 include/linux/tcp.h|   3 +-
 include/net/tcp.h  |  14 ++
 include/uapi/linux/tcp.h   |   1 +
 net/core/skbuff.c  |   3 +-
 net/ipv4/Makefile  |   3 +-
 net/ipv4/sysctl_net_ipv4.c |  26 
 net/ipv4/tcp.c |  14 +-
 net/ipv4/tcp_input.c   |   3 +
 net/ipv4/tcp_output.c  |  11 +-
 net/ipv4/tcp_rdb.c | 273 +
 12 files changed, 359 insertions(+), 8 deletions(-)
 create mode 100644 net/ipv4/tcp_rdb.c

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index eb42853..14f960d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -716,6 +716,21 @@ tcp_thin_dpifl_itt_lower_bound - INTEGER
calculated, which is used to classify whether a stream is thin.
	Default: 10000
 
+tcp_rdb - BOOLEAN
+   Enable RDB for all new TCP connections.
+   Default: 0
+
+tcp_rdb_max_bytes - INTEGER
+   Enable restriction on how many bytes an RDB packet can contain.
+   This is the total amount of payload including the new unsent data.
+   Default: 0
+
+tcp_rdb_max_packets - INTEGER
+   Enable restriction on how many previous packets in the output queue
+   RDB may include data from. A value of 1 will restrict bundling to
+   only the data from the last packet that was sent.
+   Default: 1
+
 tcp_limit_output_bytes - INTEGER
Controls TCP Small Queue limit per tcp socket.
TCP bulk sender tends to increase packets in flight until it
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 11f935c..eb81877 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2914,6 +2914,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
+void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b386361..da6dae8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -202,9 +202,10 @@ struct tcp_sock {
} rack;
u16 advmss; /* Advertised MSS   */
u8  unused;
-   u8  nonagle : 4,/* Disable Nagle algorithm? */
+   u8  nonagle : 3,/* Disable Nagle algorithm? */
thin_lto: 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack  */
+   rdb : 1,/* Redundant Data Bundling enabled  */
repair  : 1,
frto: 1;/* F-RTO (RFC5682) activated in 

[PATCH net-next 1/2] ethtool: add IPv6 to the NFC API

2016-02-02 Thread Edward Cree
Signed-off-by: Edward Cree 
---
 include/uapi/linux/ethtool.h | 70 
 1 file changed, 64 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 57fa390..3b6af3e 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -748,6 +748,56 @@ struct ethtool_usrip4_spec {
	__u8		proto;
 };
 
+/**
+ * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc.
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @psrc: Source port
+ * @pdst: Destination port
+ * @tos: Type-of-service
+ *
+ * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow.
+ */
+struct ethtool_tcpip6_spec {
+   __be32  ip6src[4];
+   __be32  ip6dst[4];
+   __be16  psrc;
+   __be16  pdst;
+   __u8    tos;
+};
+
+/**
+ * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @spi: Security parameters index
+ * @tos: Type-of-service
+ *
+ * This can be used to specify an IPsec transport or tunnel over IPv6.
+ */
+struct ethtool_ah_espip6_spec {
+   __be32  ip6src[4];
+   __be32  ip6dst[4];
+   __be32  spi;
+   __u8    tos;
+};
+
+/**
+ * struct ethtool_usrip6_spec - general flow specification for IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @l4_4_bytes: First 4 bytes of transport (layer 4) header
+ * @tos: Type-of-service
+ * @l4_proto: Transport protocol number (nexthdr after any Extension Headers)
+ */
+struct ethtool_usrip6_spec {
+   __be32  ip6src[4];
+   __be32  ip6dst[4];
+   __be32  l4_4_bytes;
+   __u8    tos;
+   __u8    l4_proto;
+};
+
 union ethtool_flow_union {
struct ethtool_tcpip4_spec  tcp_ip4_spec;
struct ethtool_tcpip4_spec  udp_ip4_spec;
@@ -755,6 +805,12 @@ union ethtool_flow_union {
struct ethtool_ah_espip4_spec   ah_ip4_spec;
struct ethtool_ah_espip4_spec   esp_ip4_spec;
struct ethtool_usrip4_spec  usr_ip4_spec;
+   struct ethtool_tcpip6_spec  tcp_ip6_spec;
+   struct ethtool_tcpip6_spec  udp_ip6_spec;
+   struct ethtool_tcpip6_spec  sctp_ip6_spec;
+   struct ethtool_ah_espip6_spec   ah_ip6_spec;
+   struct ethtool_ah_espip6_spec   esp_ip6_spec;
+   struct ethtool_usrip6_spec  usr_ip6_spec;
struct ethhdr   ether_spec;
	__u8		hdata[52];
 };
@@ -1367,15 +1423,17 @@ enum ethtool_sfeatures_retval_bits {
 #define	UDP_V4_FLOW	0x02	/* hash or spec (udp_ip4_spec) */
 #define	SCTP_V4_FLOW	0x03	/* hash or spec (sctp_ip4_spec) */
 #define	AH_ESP_V4_FLOW	0x04	/* hash only */
-#define	TCP_V6_FLOW	0x05	/* hash only */
-#define	UDP_V6_FLOW	0x06	/* hash only */
-#define	SCTP_V6_FLOW	0x07	/* hash only */
+#define	TCP_V6_FLOW	0x05	/* hash or spec (tcp_ip6_spec; nfc only) */
+#define	UDP_V6_FLOW	0x06	/* hash or spec (udp_ip6_spec; nfc only) */
+#define	SCTP_V6_FLOW	0x07	/* hash or spec (sctp_ip6_spec; nfc only) */
 #define	AH_ESP_V6_FLOW	0x08	/* hash only */
 #define	AH_V4_FLOW	0x09	/* hash or spec (ah_ip4_spec) */
 #define	ESP_V4_FLOW	0x0a	/* hash or spec (esp_ip4_spec) */
-#define	AH_V6_FLOW	0x0b	/* hash only */
-#define	ESP_V6_FLOW	0x0c	/* hash only */
-#define	IP_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
+#define	AH_V6_FLOW	0x0b	/* hash or spec (ah_ip6_spec; nfc only) */
+#define	ESP_V6_FLOW	0x0c	/* hash or spec (esp_ip6_spec; nfc only) */
+#define	IPV4_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
+#define	IP_USER_FLOW	IPV4_USER_FLOW
+#define	IPV6_USER_FLOW	0x0e	/* spec only (usr_ip6_spec; nfc only) */
 #define	IPV4_FLOW	0x10	/* hash only */
 #define	IPV6_FLOW	0x11	/* hash only */
 #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
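
As a rough sketch of how user space would describe one of the new
flows, here is the TCP/IPv6 spec from above filled in with example
values (the struct is copied from this patch; the surrounding
ethtool_rx_flow_spec / ETHTOOL_SRXCLSRLINS plumbing is omitted):

#include <stdio.h>
#include <arpa/inet.h>
#include <linux/types.h>

struct ethtool_tcpip6_spec {	/* as added by this patch */
	__be32	ip6src[4];
	__be32	ip6dst[4];
	__be16	psrc;
	__be16	pdst;
	__u8	tos;
};

int main(void)
{
	struct ethtool_tcpip6_spec spec = { { 0 } };

	/* Example values: match TCP/IPv6 traffic to 2001:db8::1, port 443. */
	inet_pton(AF_INET6, "2001:db8::1", spec.ip6dst);
	spec.pdst = htons(443);

	printf("tcp6 flow, dst port %u\n", ntohs(spec.pdst));
	return 0;
}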



[PATCH net-next 2/2] sunvnet: perf tracepoint invocations to trace LDC state machine

2016-02-02 Thread Sowmini Varadhan
Use sunvnet perf trace macros to monitor LDC message exchange state.

Signed-off-by: Sowmini Varadhan 
---
 drivers/net/ethernet/sun/sunvnet.c |   24 ++--
 1 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index 23fa298..942a95d 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -17,6 +17,8 @@
 #include 
 #include 
 #include 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sunvnet.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include 
@@ -540,6 +542,8 @@ static int vnet_walk_rx_one(struct vnet_port *port,
err = vnet_rx_one(port, desc);
if (err == -ECONNRESET)
return err;
+   trace_vnet_rx_one(port->vio._local_sid, port->vio._peer_sid,
+ index, desc->hdr.ack);
desc->hdr.state = VIO_DESC_DONE;
err = put_rx_desc(port, dr, desc, index);
if (err < 0)
@@ -587,9 +591,15 @@ static int vnet_walk_rx(struct vnet_port *port, struct vio_dring_state *dr,
ack_start = ack_end = vio_dring_prev(dr, start);
if (send_ack) {
port->napi_resume = false;
+   trace_vnet_tx_send_stopped_ack(port->vio._local_sid,
+  port->vio._peer_sid,
+  ack_end, *npkts);
return vnet_send_ack(port, dr, ack_start, ack_end,
 VIO_DRING_STOPPED);
} else  {
+   trace_vnet_tx_defer_stopped_ack(port->vio._local_sid,
+   port->vio._peer_sid,
+   ack_end, *npkts);
port->napi_resume = true;
port->napi_stop_idx = ack_end;
return 1;
@@ -663,6 +673,8 @@ static int vnet_ack(struct vnet_port *port, void *msgbuf)
/* sync for race conditions with vnet_start_xmit() and tell xmit it
 * is time to send a trigger.
 */
+   trace_vnet_rx_stopped_ack(port->vio._local_sid,
+ port->vio._peer_sid, end);
dr->cons = vio_dring_next(dr, end);
desc = vio_dring_entry(dr, dr->cons);
if (desc->hdr.state == VIO_DESC_READY && !port->start_cons) {
@@ -886,6 +898,9 @@ static int __vnet_tx_trigger(struct vnet_port *port, u32 start)
int retries = 0;
 
if (port->stop_rx) {
+   trace_vnet_tx_pending_stopped_ack(port->vio._local_sid,
+ port->vio._peer_sid,
+ port->stop_rx_idx, -1);
err = vnet_send_ack(port,
			    &port->vio.drings[VIO_DRIVER_RX_RING],
port->stop_rx_idx, -1,
@@ -908,6 +923,8 @@ static int __vnet_tx_trigger(struct vnet_port *port, u32 start)
if (retries++ > VNET_MAX_RETRIES)
break;
} while (err == -EAGAIN);
+   trace_vnet_tx_trigger(port->vio._local_sid,
+ port->vio._peer_sid, start, err);
 
return err;
 }
@@ -1414,8 +1431,11 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 * producer to consumer announcement that work is available to the
 * consumer
 */
-   if (!port->start_cons)
-   goto ldc_start_done; /* previous trigger suffices */
+   if (!port->start_cons) { /* previous trigger suffices */
+   trace_vnet_skip_tx_trigger(port->vio._local_sid,
+  port->vio._peer_sid, dr->cons);
+   goto ldc_start_done;
+   }
 
err = __vnet_tx_trigger(port, dr->cons);
if (unlikely(err < 0)) {
-- 
1.7.1



[PATCH net-next 1/2] sunvnet: Add support for perf LDC event tracing

2016-02-02 Thread Sowmini Varadhan
Add perf event macros for support of tracing and instrumentation
of LDC state machine

Signed-off-by: Sowmini Varadhan 
---
 include/trace/events/sunvnet.h |  139 
 1 files changed, 139 insertions(+), 0 deletions(-)
 create mode 100644 include/trace/events/sunvnet.h

diff --git a/include/trace/events/sunvnet.h b/include/trace/events/sunvnet.h
new file mode 100644
index 000..eb080b2
--- /dev/null
+++ b/include/trace/events/sunvnet.h
@@ -0,0 +1,139 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sunvnet
+
+#if !defined(_TRACE_SUNVNET_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SUNVNET_H
+
+#include 
+
+TRACE_EVENT(vnet_rx_one,
+
+   TP_PROTO(int lsid, int rsid, int index, int needs_ack),
+
+   TP_ARGS(lsid, rsid, index, needs_ack),
+
+   TP_STRUCT__entry(
+   __field(int, lsid)
+   __field(int, rsid)
+   __field(int, index)
+   __field(int, needs_ack)
+   ),
+
+   TP_fast_assign(
+   __entry->lsid = lsid;
+   __entry->rsid = rsid;
+   __entry->index = index;
+   __entry->needs_ack = needs_ack;
+   ),
+
+   TP_printk("(%x:%x) walk_rx_one index %d; needs_ack %d",
+   __entry->lsid, __entry->rsid,
+   __entry->index, __entry->needs_ack)
+);
+
+DECLARE_EVENT_CLASS(vnet_tx_stopped_ack_template,
+
+   TP_PROTO(int lsid, int rsid, int ack_end, int npkts),
+
+   TP_ARGS(lsid, rsid, ack_end, npkts),
+
+   TP_STRUCT__entry(
+   __field(int, lsid)
+   __field(int, rsid)
+   __field(int, ack_end)
+   __field(int, npkts)
+   ),
+
+   TP_fast_assign(
+   __entry->lsid = lsid;
+   __entry->rsid = rsid;
+   __entry->ack_end = ack_end;
+   __entry->npkts = npkts;
+   ),
+
+   TP_printk("(%x:%x) stopped ack for %d; npkts %d",
+   __entry->lsid, __entry->rsid,
+   __entry->ack_end, __entry->npkts)
+);
+DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_send_stopped_ack,
+TP_PROTO(int lsid, int rsid, int ack_end, int npkts),
+TP_ARGS(lsid, rsid, ack_end, npkts));
+DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_defer_stopped_ack,
+TP_PROTO(int lsid, int rsid, int ack_end, int npkts),
+TP_ARGS(lsid, rsid, ack_end, npkts));
+DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_pending_stopped_ack,
+TP_PROTO(int lsid, int rsid, int ack_end, int npkts),
+TP_ARGS(lsid, rsid, ack_end, npkts));
+
+TRACE_EVENT(vnet_rx_stopped_ack,
+
+   TP_PROTO(int lsid, int rsid, int end),
+
+   TP_ARGS(lsid, rsid, end),
+
+   TP_STRUCT__entry(
+   __field(int, lsid)
+   __field(int, rsid)
+   __field(int, end)
+   ),
+
+   TP_fast_assign(
+   __entry->lsid = lsid;
+   __entry->rsid = rsid;
+   __entry->end = end;
+   ),
+
+   TP_printk("(%x:%x) stopped ack for index %d",
+   __entry->lsid, __entry->rsid, __entry->end)
+);
+
+TRACE_EVENT(vnet_tx_trigger,
+
+   TP_PROTO(int lsid, int rsid, int start, int err),
+
+   TP_ARGS(lsid, rsid, start, err),
+
+   TP_STRUCT__entry(
+   __field(int, lsid)
+   __field(int, rsid)
+   __field(int, start)
+   __field(int, err)
+   ),
+
+   TP_fast_assign(
+   __entry->lsid = lsid;
+   __entry->rsid = rsid;
+   __entry->start = start;
+   __entry->err = err;
+   ),
+
+   TP_printk("(%x:%x) Tx trigger for %d sent with err %d %s",
+   __entry->lsid, __entry->rsid, __entry->start,
+   __entry->err, __entry->err > 0 ? "(ok)" : " ")
+);
+
+TRACE_EVENT(vnet_skip_tx_trigger,
+
+   TP_PROTO(int lsid, int rsid, int last),
+
+   TP_ARGS(lsid, rsid, last),
+
+   TP_STRUCT__entry(
+   __field(int, lsid)
+   __field(int, rsid)
+   __field(int, last)
+   ),
+
+   TP_fast_assign(
+   __entry->lsid = lsid;
+   __entry->rsid = rsid;
+   __entry->last = last;
+   ),
+
+   TP_printk("(%x:%x) Skip Tx trigger. Last trigger sent was %d",
+   __entry->lsid, __entry->rsid, __entry->last)
+);
+#endif /* _TRACE_SUNVNET_H */
+
+/* This part must be outside protection */
+#include 
-- 
1.7.1



[PATCH net-next 0/2] sunvnet: perf tracepoint hooks

2016-02-02 Thread Sowmini Varadhan
Added some perf tracepoints to help track and debug sunvnet 
descriptor state at run-time.
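
Once the series is applied, the events can be toggled like any other
tracepoints. A small sketch (assumes tracefs under
/sys/kernel/debug/tracing; the mount point varies by system):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/events/sunvnet/enable";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);	/* enable every TRACE_SYSTEM "sunvnet" event */
	fclose(f);
	return 0;
}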

Sowmini Varadhan (2):
  sunvnet: Add support for perf LDC event tracing
  sunvnet: perf tracepoint invocations to trace LDC state machine

 drivers/net/ethernet/sun/sunvnet.c |   24 ++-
 include/trace/events/sunvnet.h |  139 
 2 files changed, 161 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/sunvnet.h



Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Hannes Frederic Sowa

On 02.02.2016 18:34, David Herrmann wrote:

Hi

On Sun, Jan 10, 2016 at 7:54 AM, Willy Tarreau  wrote:

It is possible for a process to allocate and accumulate far more FDs than
the process' limit by sending them over a unix socket then closing them
to keep the process' fd count low.

This change addresses this problem by keeping track of the number of FDs
in flight per user and preventing non-privileged processes from having
more FDs in flight than their configured FD limit.


This is not really what this patch does. This patch rather prevents
processes from having more of their owned *files* in flight than their
configured FD limit. See below for details..


Reported-by: socketp...@gmail.com
Reported-by: Tetsuo Handa 
Mitigates: CVE-2013-4312 (Linux 2.0+)
Suggested-by: Linus Torvalds 
Acked-by: Hannes Frederic Sowa 
Signed-off-by: Willy Tarreau 
---
v2: add reported-by, mitigates and acked-by.

It would be nice if (if accepted) it would be backported to -stable as the
issue is currently exploitable.
---
  include/linux/sched.h |  1 +
  net/unix/af_unix.c| 24 
  net/unix/garbage.c| 13 -
  3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index edad7a4..fbf25f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -830,6 +830,7 @@ struct user_struct {
 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
  #endif
 unsigned long locked_shm; /* How many pages of mlocked shm ? */
+   unsigned long unix_inflight;    /* How many files in flight in unix sockets */

  #ifdef CONFIG_KEYS
 struct key *uid_keyring;/* UID specific keyring */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 45aebd9..d6d7b43 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1499,6 +1499,21 @@ static void unix_destruct_scm(struct sk_buff *skb)
 sock_wfree(skb);
  }

+/*
+ * The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.


This limit is checked once per transaction. IIRC, a transaction can
carry 255 files. Thus, it is easy to exceed this limit without any
race involved.


+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+   struct user_struct *user = current_user();
+
+   if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+   return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+   return false;
+}
+
  #define MAX_RECURSION_LEVEL 4

  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
@@ -1507,6 +1522,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 unsigned char max_level = 0;
 int unix_sock_count = 0;

+   if (too_many_unix_fds(current))
+   return -ETOOMANYREFS;
+


Ignoring the capabilities, this effectively resolves to:

 if (current_cred()->user->unix_inflight > rlimit(RLIMIT_NOFILE))
 return -ETOOMANYREFS;

It limits the number of inflight FDs of the *current* user to its *own* limit.

But..


 for (i = scm->fp->count - 1; i >= 0; i--) {
 struct sock *sk = unix_get_socket(scm->fp->fp[i]);

@@ -1528,10 +1546,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 if (!UNIXCB(skb).fp)
 return -ENOMEM;

-   if (unix_sock_count) {
-   for (i = scm->fp->count - 1; i >= 0; i--)
-   unix_inflight(scm->fp->fp[i]);
-   }
+   for (i = scm->fp->count - 1; i >= 0; i--)
+   unix_inflight(scm->fp->fp[i]);
 return max_level;
  }

diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a73a226..8fcdc22 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -120,11 +120,11 @@ void unix_inflight(struct file *fp)
  {
 struct sock *s = unix_get_socket(fp);

+   spin_lock(&unix_gc_lock);
+
 if (s) {
 struct unix_sock *u = unix_sk(s);

-   spin_lock(&unix_gc_lock);
-
  if (atomic_long_inc_return(&u->inflight) == 1) {
 BUG_ON(!list_empty(>link));
  list_add_tail(&u->link, &gc_inflight_list);
@@ -132,25 +132,28 @@ void unix_inflight(struct file *fp)
  BUG_ON(list_empty(&u->link));
 }
 unix_tot_inflight++;
-   spin_unlock(&unix_gc_lock);
 }
+   fp->f_cred->user->unix_inflight++;


..this modifies the limit of the owner of the file you send. As such,
if you only send files that you don't own, you will continuously bump
the limit of the file-owner, but never your own. As such, the
protection 

[PATCH next 2/3] ipvlan: mode is u16

2016-02-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The mode argument was erroneously defined as u32, but it has always
been u16.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan.h  | 1 -
 drivers/net/ipvlan/ipvlan_main.c | 9 ++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 9542b7bac61a..817cab1a7959 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -115,7 +115,6 @@ static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
 }
 
 void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev);
-void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval);
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index f94392d07126..54c542526262 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -14,7 +14,7 @@ void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj;
 }
 
-void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval)
+static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
 {
struct ipvl_dev *ipvlan;
 
@@ -442,6 +442,7 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
struct ipvl_port *port;
struct net_device *phy_dev;
int err;
+   u16 mode = IPVLAN_MODE_L3;
 
if (!tb[IFLA_LINK])
return -EINVAL;
@@ -460,10 +461,10 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
return err;
}
 
-   port = ipvlan_port_get_rtnl(phy_dev);
if (data && data[IFLA_IPVLAN_MODE])
-   port->mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+   mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
 
+   port = ipvlan_port_get_rtnl(phy_dev);
ipvlan->phy_dev = phy_dev;
ipvlan->dev = dev;
ipvlan->port = port;
@@ -488,6 +489,8 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
goto ipvlan_destroy_port;
 
  list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
+   ipvlan_set_port_mode(port, mode);
+
netif_stacked_transfer_operstate(phy_dev, dev);
return 0;
 
-- 
2.7.0.rc3.207.g0ac5344



[PATCH next 1/3] ipvlan: scrub skb before routing in L3 mode.

2016-02-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Scrub the skb before handing it to the L3 stack so that packets
actually traverse the iptables hooks.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 8c48bb2a94ea..21c380f9ccd5 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -365,7 +365,7 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb)
ip_rt_put(rt);
goto err;
}
-   skb_dst_drop(skb);
+   skb_scrub_packet(skb, false);
skb_dst_set(skb, >dst);
err = ip_local_out(net, skb->sk, skb);
if (unlikely(net_xmit_eval(err)))
@@ -403,7 +403,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
dst_release(dst);
goto err;
}
-   skb_dst_drop(skb);
+   skb_scrub_packet(skb, false);
skb_dst_set(skb, dst);
err = ip6_local_out(net, skb->sk, skb);
if (unlikely(net_xmit_eval(err)))
-- 
2.7.0.rc3.207.g0ac5344



[PATCH v3 net-next 1/2] tcp: Add DPIFL thin stream detection mechanism

2016-02-02 Thread Bendik Rønning Opstad
The existing mechanism for detecting thin streams (tcp_stream_is_thin)
is based on a static limit of fewer than 4 packets in flight. This
treats streams differently depending on the connection's RTT, such that
a stream on a high-RTT link may never be considered thin, whereas the
same application would produce a stream that is always thin in a
low-RTT scenario (e.g. a data center).

By calculating a dynamic packets in flight limit (DPIFL), the thin stream
detection will be independent of the RTT and treat streams equally based
on the transmission pattern, i.e. the inter-transmission time (ITT).
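
A standalone model of the resulting check (srtt_us mirrors the kernel
field, which stores 8x the smoothed RTT in microseconds; the values
below are illustrative):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool stream_is_thin_dpifl(uint32_t pif, uint32_t itt_lower_bound_us,
				 uint32_t srtt_us)
{
	/* Thin iff PIF < RTT / ITT-lower-bound, kept in multiplied form. */
	return (uint64_t)pif * itt_lower_bound_us < (srtt_us >> 3);
}

int main(void)
{
	/* RTT = 100 ms, lower bound = 10 ms: dynamic limit is 10 packets. */
	printf("9 in flight:  %d\n", stream_is_thin_dpifl(9, 10000, 800000));
	printf("10 in flight: %d\n", stream_is_thin_dpifl(10, 10000, 800000));
	return 0;
}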

Cc: Andreas Petlund 
Cc: Carsten Griwodz 
Cc: Pål Halvorsen 
Cc: Jonas Markussen 
Cc: Kristian Evensen 
Cc: Kenneth Klette Jonassen 
Signed-off-by: Bendik Rønning Opstad 
---
 Documentation/networking/ip-sysctl.txt |  8 
 include/net/tcp.h  | 21 +
 net/ipv4/sysctl_net_ipv4.c |  9 +
 net/ipv4/tcp.c |  2 ++
 4 files changed, 40 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 73b36d7..eb42853 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -708,6 +708,14 @@ tcp_thin_dupack - BOOLEAN
Documentation/networking/tcp-thin.txt
Default: 0
 
+tcp_thin_dpifl_itt_lower_bound - INTEGER
+   Controls the lower bound inter-transmission time (ITT) threshold
+   for when a stream is considered thin. The value is specified in
+   microseconds, and may not be lower than 10000 (10 ms). Based on
+   this threshold, a dynamic packets in flight limit (DPIFL) is
+   calculated, which is used to classify whether a stream is thin.
+   Default: 10000
+
 tcp_limit_output_bytes - INTEGER
Controls TCP Small Queue limit per tcp socket.
TCP bulk sender tends to increase packets in flight until it
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3dd20fe..2d86bd7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -215,6 +215,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 /* TCP thin-stream limits */
 #define TCP_THIN_LINEAR_RETRIES 6   /* After 6 linear retries, do exp. backoff */
+/* Lowest possible DPIFL lower bound ITT is 10 ms (10000 usec) */
+#define TCP_THIN_DPIFL_ITT_LOWER_BOUND_MIN 10000
 
 /* TCP initial congestion window as per rfc6928 */
 #define TCP_INIT_CWND  10
@@ -271,6 +273,7 @@ extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
+extern int sysctl_tcp_thin_dpifl_itt_lower_bound;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
@@ -1649,6 +1652,24 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp);
 }
 
+/**
+ * tcp_stream_is_thin_dpifl() - Tests if the stream is thin based on dynamic PIF
+ *  limit
+ * @tp: the tcp_sock struct
+ *
+ * Return: true if current packets in flight (PIF) count is lower than
+ * the dynamic PIF limit, else false
+ */
+static inline bool tcp_stream_is_thin_dpifl(const struct tcp_sock *tp)
+{
+   /* Calculate the maximum allowed PIF limit by dividing the RTT by
+* the minimum allowed inter-transmission time (ITT).
+* Tests if PIF < RTT / ITT-lower-bound
+*/
+   return (u64) tcp_packets_in_flight(tp) *
+   sysctl_tcp_thin_dpifl_itt_lower_bound < (tp->srtt_us >> 3);
+}
+
 /* /proc */
 enum tcp_seq_states {
TCP_SEQ_STATE_LISTENING,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4d367b4..6014bc4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -41,6 +41,7 @@ static int tcp_syn_retries_min = 1;
 static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static int tcp_thin_dpifl_itt_lower_bound_min = TCP_THIN_DPIFL_ITT_LOWER_BOUND_MIN;
 
 /* Update system visible IP port range */
 static void set_local_port_range(struct net *net, int range[2])
@@ -687,6 +688,14 @@ static struct ctl_table ipv4_table[] = {
.proc_handler   = proc_dointvec
},
{
+   .procname   = "tcp_thin_dpifl_itt_lower_bound",
+   .data   = &sysctl_tcp_thin_dpifl_itt_lower_bound,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = &proc_dointvec_minmax,
+   .extra1 = &tcp_thin_dpifl_itt_lower_bound_min,
+   },
+   {

[PATCH net v4] r8169: Add 2 new interrupts tested on my hardware.

2016-02-02 Thread Corcodel Marian
With SWInt and TxDescUnavail we now have full interrupt support
(maybe). These new interrupts were tested on my RTL 8102 hardware;
other hardware still needs to be tested.

Signed-off-by: Corcodel Marian 
---
 drivers/net/ethernet/realtek/r8169.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 5186ef9..7588ba7 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -8037,7 +8037,7 @@ static const struct rtl_cfg_info {
.region = 2,
.align  = 8,
.event_slow = SYSErr | LinkChg | RxOverflow | RxFIFOOver |
- PCSTimeout,
+ PCSTimeout | SWInt | TxDescUnavail,
.features   = RTL_FEATURE_MSI,
.default_ver= RTL_GIGA_MAC_VER_13,
}
-- 
2.5.0



Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Hannes Frederic Sowa

On 02.02.2016 20:29, Linus Torvalds wrote:

On Tue, Feb 2, 2016 at 10:29 AM, Hannes Frederic Sowa
 wrote:


Anyway, can someone provide a high-level description of what exactly
this patch is supposed to do? Which operation should be limited, who
should inflight FDs be accounted on, and which rlimit should be used
on each operation? I'm having a hard time auditing existing
user-space, given just the scarce description of this commit.


Yes, all your observations are true. I think we explicitly need to
refer to the sending socket while attaching the fds.


I don't think that really helps. Maybe somebody passed a unix domain
socket around, and now we're crediting the wrong socket again.


I was struggling a bit with what you meant, but I think you are
referring to the following scenario:


process-1 opens up a unix socket and passes it to process-2 (this 
process has different credentials) via af-unix. Process-2 then sends 
multiple fds to another destination over this transferred unix-fd.


True, in this case we again account the fds to the wrong process, which 
is bad.



So how about we actually add a "struct cred *" to the scm_cookie
itself, and we initialize it to "get_current_cred()". And then always
use that.


Unfortunately we never transfer the scm_cookie via the skbs, but merely
use it to initialize the unix_skb_parms structure in skb->cb and
destroy it afterwards.


But "struct pid *" in unix_skb_parms should be enough to get us to 
corresponding "struct cred *" so we can decrement the correct counter 
during skb destruction.


So:

We increment current task's unix_inflight and also check the current 
task's limit during attaching fds to skbs and decrement the inflight 
counter via "struct pid *". This looks like it should work.



That way it's always the person who actually does the send (rather
than the opener of the socket _or_ the opener of the file that gets
passed around) that gets credited, and thanks to the cred pointer we
can then de-credit them properly.


Exactly, I try to implement that. Thanks a lot!

Hannes
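
For reference, the mechanism whose accounting is being debated: a file
descriptor sent over AF_UNIX with SCM_RIGHTS stays "in flight" from
sendmsg() until the receiver picks it up (or the GC reaps it), even if
the sender closes its own copy. A minimal sketch, error handling
omitted:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

static ssize_t send_fd(int sock, int fd)
{
	char dummy = '*';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	char ctrl[CMSG_SPACE(sizeof(int))];
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sock, &msg, 0);	/* fd is now "in flight" */
}

int main(void)
{
	int sv[2], fd = dup(0);

	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
	send_fd(sv[0], fd);
	close(fd);	/* our copy is gone; the file is still in flight */
	return 0;
}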



[PATCH net-next 0/6] tcp: congestion control refactoring

2016-02-02 Thread Yuchung Cheng
This patch set refactors the sequence of congestion control,
loss recovery, and transmission logic in TCP ack processing.

The design goal is to decouple and sequence them in the following order:

  0. ACK accounting: free or tag sent packets [unchanged]

  1. loss recovery: identify lost/ecn packets and update congestion state

  2. congestion control: up/down cwnd and pacing rate based on (1)

  3. transmission: send new or retransmit old based on (1) and (2)

This refactoring makes the cwnd changes clearer because they are done
in one place. The packet accounting is also more robust, especially
for connections that do not support SACK. Patches 1-4 and 6 are
refactoring and patch 5 improves TCP performance under reordering.
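
As a toy skeleton of that ordering (my naming, not the kernel's):

enum ca_state { CA_OPEN, CA_RECOVERY };

struct conn {
        enum ca_state state;
        int cwnd;
        int need_rexmit;
};

static void loss_recovery(struct conn *c, int lost)
{
        if (lost)                       /* (1) only updates state */
                c->state = CA_RECOVERY;
}

static void cong_control(struct conn *c)
{
        if (c->state == CA_RECOVERY)    /* (2) all cwnd changes here */
                c->cwnd -= c->cwnd / 2;
        else
                c->cwnd += 1;
}

static void transmit(struct conn *c)
{
        c->need_rexmit = (c->state == CA_RECOVERY);     /* (3) */
}

static void process_ack(struct conn *c, int lost)
{
        loss_recovery(c, lost);
        cong_control(c);
        transmit(c);
}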

Yuchung Cheng (6):
  tcp: retransmit after recovery processing and congestion control
  tcp: move cwnd reduction after recovery state processing
  tcp: new delivery accounting
  tcp: refactor pkts acked accounting
  tcp: make congestion control more robust against reordering
  tcp: tcp_cong_control helper

 include/linux/tcp.h  |   1 +
 net/ipv4/tcp_input.c | 149 +--
 2 files changed, 98 insertions(+), 52 deletions(-)

-- 
2.7.0.rc3.207.g0ac5344



[PATCH nf] netfilter: nfnetlink: correctly validate length of batch messages

2016-02-02 Thread phil . turnbull
From: Phil Turnbull 

If nlh->nlmsg_len is zero then an infinite loop is triggered because
'skb_pull(skb, msglen);' pulls zero bytes.

The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len <
NLMSG_HDRLEN', which bypasses the length validation and will later
trigger an out-of-bounds read.
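
For illustration, the bad comparison can be reproduced standalone
(simplified struct, not the kernel headers):

#include <stdio.h>
#include <stdint.h>

#define NLMSG_HDRLEN 16

struct nfgenmsg { uint8_t family, version; uint16_t res_id; };

/* mirrors the kernel's nlmsg_len() helper, which returns int */
static int nlmsg_payload(uint32_t nlmsg_len)
{
        return nlmsg_len - NLMSG_HDRLEN;        /* -12 for nlmsg_len == 4 */
}

int main(void)
{
        uint32_t evil = 4;                      /* < NLMSG_HDRLEN */

        /* the old check was: nlmsg_len(nlh) < sizeof(struct nfgenmsg).
         * The negative int is converted to a huge size_t for the
         * comparison, so the "too short" test is bypassed: */
        if (nlmsg_payload(evil) < sizeof(struct nfgenmsg))
                printf("rejected\n");
        else
                printf("bypassed, payload = %d\n", nlmsg_payload(evil));
        return 0;
}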

If the length validation does fail then the malformed batch message is
copied back to userspace. However, we cannot do this because the
nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in
netlink_ack:

[   41.455421] 
==
[   41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr 
880119e79340
[   41.456431] Read of size 4294967280 by task a.out/987
[   41.456431] 
=
[   41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected
[   41.456431] 
-
...
[   41.456431] Bytes b4 880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe 
ff 00 00 00 00  
[   41.456431] Object 880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 
00 00 00 00   ...
[   41.456431] Object 880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 
33 10 00 05  ...@EV."3...
[   41.456431] Object 880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 
00 06 fe fb  
^^ start of batch nlmsg with
   nlmsg_len=4294967280
...
[   41.456431] Memory state around the buggy address:
[   41.456431]  880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00
[   41.456431]  880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
00 00
[   41.456431] >880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc 
fc fc
[   41.456431]^
[   41.456431]  880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc 
fc fc
[   41.456431]  880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb 
fb fb
[   41.456431] 
==

Fix this with better validation of nlh->nlmsg_len and by setting
NFNL_BATCH_FAILURE if any batch message fails length validation.

CAP_NET_ADMIN is required to trigger the bugs.

Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from 
batch")
Signed-off-by: Phil Turnbull 
---
 net/netfilter/nfnetlink.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 62e92af..857ae89 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -328,10 +328,12 @@ replay:
nlh = nlmsg_hdr(skb);
err = 0;
 
-   if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
-   skb->len < nlh->nlmsg_len) {
-   err = -EINVAL;
-   goto ack;
+   if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+   skb->len < nlh->nlmsg_len ||
+   nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
+   nfnl_err_reset(&err_list);
+   status |= NFNL_BATCH_FAILURE;
+   goto done;
}
 
/* Only requests are handled by the kernel */
-- 
1.9.1



[PATCH net-next 2/6] tcp: move cwnd reduction after recovery state processing

2016-02-02 Thread Yuchung Cheng
Currently the cwnd is reduced and increased in various different
places. The reduction happens at various points in the recovery
state processing (tcp_fastretrans_alert) while the increase
happens afterward.

A better sequence is to identify lost packets and update
the congestion control state (icsk_ca_state) first. Then, based
on the new state, raise or lower the cwnd in one central place.
This makes the cwnd changes easier to reason about.
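
A rough standalone model of the central reduction (my simplification;
the real code also scales sndcnt by prr_delivered/prior_cwnd while
above ssthresh):

#include <stdio.h>

static int cwnd_reduction(int ssthresh, int in_flight,
                          int newly_acked_sacked, int prr_out)
{
        int delta = ssthresh - in_flight;
        int sndcnt = 0;

        if (delta > 0) {
                /* below ssthresh: slow start back up, but never send
                 * more than was newly delivered */
                sndcnt = delta < newly_acked_sacked ?
                         delta : newly_acked_sacked;
        }
        /* force a fast retransmit upon entering fast recovery;
         * mirrors: sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)) */
        if (!prr_out && sndcnt < 1)
                sndcnt = 1;
        return in_flight + sndcnt;      /* new cwnd */
}

int main(void)
{
        /* entering recovery with more in flight than ssthresh */
        printf("new cwnd = %d\n", cwnd_reduction(10, 14, 2, 0)); /* 15 */
        return 0;
}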

Signed-off-by: Yuchung Cheng 
Signed-off-by: Neal Cardwell 
Signed-off-by: Eric Dumazet 
---
 net/ipv4/tcp_input.c | 60 
 1 file changed, 28 insertions(+), 32 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 482c0b4..f2ebe8b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2473,14 +2473,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp);
 }
 
-static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
-  int fast_rexmit, int flag)
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+  int flag)
 {
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
-   int newly_acked_sacked = prior_unsacked -
-(tp->packets_out - tp->sacked_out);
 
if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
return;
@@ -2498,7 +2496,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int 
prior_unsacked,
} else {
sndcnt = min(delta, newly_acked_sacked);
}
-   sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+   /* Force a fast retransmit upon entering fast recovery */
+   sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
 }
 
@@ -2543,7 +2542,7 @@ static void tcp_try_keep_open(struct sock *sk)
}
 }
 
-static void tcp_try_to_open(struct sock *sk, int flag, const int 
prior_unsacked)
+static void tcp_try_to_open(struct sock *sk, int flag)
 {
struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2557,8 +2556,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, 
const int prior_unsacked)
 
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
-   } else {
-   tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
}
 }
 
@@ -2722,8 +2719,7 @@ static void tcp_process_loss(struct sock *sk, int flag, 
bool is_dupack,
 }
 
 /* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, const int acked,
-const int prior_unsacked, int flag)
+static bool tcp_try_undo_partial(struct sock *sk, const int acked)
 {
struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2738,10 +2734,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const 
int acked,
 * can undo. Otherwise we clock out new packets but do not
 * mark more packets lost or retransmit more.
 */
-   if (tp->retrans_out) {
-   tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
+   if (tp->retrans_out)
return true;
-   }
 
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
@@ -2760,21 +2754,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const 
int acked,
  * taking into account both packets sitting in receiver's buffer and
  * packets lost by network.
  *
- * Besides that it does CWND reduction, when packet loss is detected
- * and changes state of machine.
+ * Besides that it updates the congestion state when packet loss or ECN
+ * is detected. But it does not reduce the cwnd, it is done by the
+ * congestion control later.
  *
  * It does _not_ decide what to send, it is made in function
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- const int prior_unsacked,
- bool is_dupack, int flag, int *rexmit)
+ bool is_dupack, int *ack_flag, int *rexmit)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+   int fast_rexmit = 0, flag = *ack_flag;
bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
-   int fast_rexmit = 0;
 
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
@@ -2821,8 +2815,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const 
int acked,
 
/* Use RACK to detect loss */
if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
-   

[PATCH net-next 2/2] sfc: implement IPv6 NFC (and IPV4_USER_FLOW)

2016-02-02 Thread Edward Cree
Signed-off-by: Edward Cree 
---
 drivers/net/ethernet/sfc/ethtool.c | 176 +
 1 file changed, 176 insertions(+)

diff --git a/drivers/net/ethernet/sfc/ethtool.c 
b/drivers/net/ethernet/sfc/ethtool.c
index 0347976..49fac36 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -783,14 +783,26 @@ static int efx_ethtool_reset(struct net_device *net_dev, 
u32 *flags)
 static const u8 mac_addr_ig_mask[ETH_ALEN] __aligned(2) = {0x01, 0, 0, 0, 0, 
0};
 
 #define IP4_ADDR_FULL_MASK ((__force __be32)~0)
+#define IP_PROTO_FULL_MASK 0xFF
 #define PORT_FULL_MASK ((__force __be16)~0)
 #define ETHER_TYPE_FULL_MASK   ((__force __be16)~0)
 
+static inline void ip6_fill_mask(__be32 *mask)
+{
+   mask[0] = mask[1] = mask[2] = mask[3] = ~(__be32)0;
+}
+
 static int efx_ethtool_get_class_rule(struct efx_nic *efx,
  struct ethtool_rx_flow_spec *rule)
 {
struct ethtool_tcpip4_spec *ip_entry = &rule->h_u.tcp_ip4_spec;
struct ethtool_tcpip4_spec *ip_mask = &rule->m_u.tcp_ip4_spec;
+   struct ethtool_usrip4_spec *uip_entry = &rule->h_u.usr_ip4_spec;
+   struct ethtool_usrip4_spec *uip_mask = &rule->m_u.usr_ip4_spec;
+   struct ethtool_tcpip6_spec *ip6_entry = &rule->h_u.tcp_ip6_spec;
+   struct ethtool_tcpip6_spec *ip6_mask = &rule->m_u.tcp_ip6_spec;
+   struct ethtool_usrip6_spec *uip6_entry = &rule->h_u.usr_ip6_spec;
+   struct ethtool_usrip6_spec *uip6_mask = &rule->m_u.usr_ip6_spec;
struct ethhdr *mac_entry = &rule->h_u.ether_spec;
struct ethhdr *mac_mask = &rule->m_u.ether_spec;
struct efx_filter_spec spec;
@@ -833,6 +845,35 @@ static int efx_ethtool_get_class_rule(struct efx_nic *efx,
ip_entry->psrc = spec.rem_port;
ip_mask->psrc = PORT_FULL_MASK;
}
+   } else if ((spec.match_flags & EFX_FILTER_MATCH_ETHER_TYPE) &&
+   spec.ether_type == htons(ETH_P_IPV6) &&
+   (spec.match_flags & EFX_FILTER_MATCH_IP_PROTO) &&
+   (spec.ip_proto == IPPROTO_TCP || spec.ip_proto == IPPROTO_UDP) &&
+   !(spec.match_flags &
+ ~(EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_OUTER_VID |
+   EFX_FILTER_MATCH_LOC_HOST | EFX_FILTER_MATCH_REM_HOST |
+   EFX_FILTER_MATCH_IP_PROTO |
+   EFX_FILTER_MATCH_LOC_PORT | EFX_FILTER_MATCH_REM_PORT))) {
+   rule->flow_type = ((spec.ip_proto == IPPROTO_TCP) ?
+  TCP_V6_FLOW : UDP_V6_FLOW);
+   if (spec.match_flags & EFX_FILTER_MATCH_LOC_HOST) {
+   memcpy(ip6_entry->ip6dst, spec.loc_host,
+  sizeof(ip6_entry->ip6dst));
+   ip6_fill_mask(ip6_mask->ip6dst);
+   }
+   if (spec.match_flags & EFX_FILTER_MATCH_REM_HOST) {
+   memcpy(ip6_entry->ip6src, spec.rem_host,
+  sizeof(ip6_entry->ip6src));
+   ip6_fill_mask(ip6_mask->ip6src);
+   }
+   if (spec.match_flags & EFX_FILTER_MATCH_LOC_PORT) {
+   ip6_entry->pdst = spec.loc_port;
+   ip6_mask->pdst = PORT_FULL_MASK;
+   }
+   if (spec.match_flags & EFX_FILTER_MATCH_REM_PORT) {
+   ip6_entry->psrc = spec.rem_port;
+   ip6_mask->psrc = PORT_FULL_MASK;
+   }
} else if (!(spec.match_flags &
 ~(EFX_FILTER_MATCH_LOC_MAC | EFX_FILTER_MATCH_LOC_MAC_IG |
   EFX_FILTER_MATCH_REM_MAC | EFX_FILTER_MATCH_ETHER_TYPE |
@@ -855,6 +896,39 @@ static int efx_ethtool_get_class_rule(struct efx_nic *efx,
mac_entry->h_proto = spec.ether_type;
mac_mask->h_proto = ETHER_TYPE_FULL_MASK;
}
+   } else if (spec.match_flags & EFX_FILTER_MATCH_ETHER_TYPE &&
+  spec.ether_type == htons(ETH_P_IP)) {
+   rule->flow_type = IPV4_USER_FLOW;
+   uip_entry->ip_ver = ETH_RX_NFC_IP4;
+   if (spec.match_flags & EFX_FILTER_MATCH_IP_PROTO) {
+   uip_mask->proto = IP_PROTO_FULL_MASK;
+   uip_entry->proto = spec.ip_proto;
+   }
+   if (spec.match_flags & EFX_FILTER_MATCH_LOC_HOST) {
+   uip_entry->ip4dst = spec.loc_host[0];
+   uip_mask->ip4dst = IP4_ADDR_FULL_MASK;
+   }
+   if (spec.match_flags & EFX_FILTER_MATCH_REM_HOST) {
+   uip_entry->ip4src = spec.rem_host[0];
+   uip_mask->ip4src = IP4_ADDR_FULL_MASK;
+   }
+   } else if (spec.match_flags & EFX_FILTER_MATCH_ETHER_TYPE &&
+  spec.ether_type == htons(ETH_P_IPV6)) {
+   rule->flow_type = IPV6_USER_FLOW;
+   if 

Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Linus Torvalds
On Tue, Feb 2, 2016 at 12:49 PM, Willy Tarreau wrote:
> On Tue, Feb 02, 2016 at 12:44:54PM -0800, Linus Torvalds wrote:
>>
>> Umm. I think the "struct cred" may change in between, can't it?
>
> You mean for example in case of setuid() or something like this ?

Yeah. I'd be worried about looking up the creds or user structure
later, and possibly getting a different one.

I'd much rather look it up at attach time, and just carry an extra
pointer around. That seems to be an inherently safer model where
there's no worry about "what happens if the user does X in the
meantime".

  Linus


Re: [PATCH net-next 2/4] net: dev: add batching to net_device notifiers

2016-02-02 Thread Julian Anastasov

Hello,

On Mon, 1 Feb 2016, Salam Noureddine wrote:

> @@ -1572,8 +1582,12 @@ rollback:
>   call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
>   dev);
>   call_netdevice_notifier(nb, NETDEV_DOWN, dev);
> + call_netdevice_notifier(nb, NETDEV_DOWN_BATCH,
> + dev);
>   }
>   call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
> + call_netdevice_notifier(nb, NETDEV_UNREGISTER_BATCH,
> + dev);

If the rule is once per net, the above call...

>   }

should be here:

call_netdevice_notifier(nb, NETDEV_UNREGISTER_BATCH,
net->loopback_dev);

and also once after outroll label?:

call_netdevice_notifier(nb, NETDEV_UNREGISTER_BATCH, last);

>   }
>  
> @@ -1614,8 +1628,12 @@ int unregister_netdevice_notifier(struct 
> notifier_block *nb)
>   call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
>   dev);
>   call_netdevice_notifier(nb, NETDEV_DOWN, dev);
> + call_netdevice_notifier(nb, NETDEV_DOWN_BATCH,
> + dev);
>   }
>   call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
> + call_netdevice_notifier(nb, NETDEV_UNREGISTER_BATCH,
> + dev);

Above call...

>   }

should be here, for net->loopback_dev?
Also, is it ok to call NETDEV_DOWN_BATCH many times, as result,
sometimes after NETDEV_UNREGISTER?

>   }
>  unlock:

> @@ -6427,7 +6447,9 @@ static void net_set_todo(struct net_device *dev)
>  static void rollback_registered_many(struct list_head *head)
>  {
>   struct net_device *dev, *tmp;
> + struct net *net, *net_tmp;
>   LIST_HEAD(close_head);
> + LIST_HEAD(net_head);
>  
>   BUG_ON(dev_boot_phase);
>   ASSERT_RTNL();
> @@ -6504,6 +6526,15 @@ static void rollback_registered_many(struct list_head 
> *head)
>  #endif
>   }
>  
> + list_for_each_entry(dev, head, unreg_list) {
> + net_add_event_list(&net_head, dev_net(dev));
> + }
> + list_for_each_entry_safe(net, net_tmp, &net_head, event_list) {
> + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH,
> +  net->loopback_dev);
> + net_del_event_list(net);
> + }
> +

NETDEV_UNREGISTER_BATCH should not be called before the
following synchronize_net and NETDEV_UNREGISTER. Maybe
we should split the loop: a loop doing (dev_shutdown + NETDEV_UNREGISTER),
followed by the above NETDEV_UNREGISTER_BATCH, then again the
loop for all remaining calls.

>   synchronize_net();
>  
>   list_for_each_entry(dev, head, unreg_list)

Regards

--
Julian Anastasov 


Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Hannes Frederic Sowa

On 02.02.2016 21:44, Linus Torvalds wrote:

On Tue, Feb 2, 2016 at 12:32 PM, Hannes Frederic Sowa wrote:


Unfortunately we never transfer a scm_cookie via the skbs but merely use it
to initialize unix_skb_parms structure in skb->cb and destroy it afterwards.


Ok, I obviously didn't check very closely.


But "struct pid *" in unix_skb_parms should be enough to get us to
corresponding "struct cred *" so we can decrement the correct counter during
skb destruction.


Umm. I think the "struct cred" may change in between, can't it?


While reviewing the task_struct->cred/real_cred assignments, I noticed 
that, too. I already went the same way and added a "struct cred *" to 
unix_skb_parms.



So I don't think you can later look up the cred based on the pid.


Yep, it also looked too dangerous to me.


Could we add the cred pointer (or just the user pointer) to the unix_skb_parms?

Or maybe just add it to the "struct scm_fp_list"?


scm_fp_list seems to be an even better place. I have a look, thanks!

Hannes




[net-next PATCH 11/11] RFC: net: RPS bulk enqueue to backlog

2016-02-02 Thread Jesper Dangaard Brouer
NEED TO CLEAN UP PATCH (likely still contains bugs...)

When enabling Receive Packet Steering (RPS) like :

 echo 32768 > /proc/sys/net/core/rps_sock_flow_entries

 for N in $(seq 0 7) ; do
echo 4096 > /sys/class/net/${DEV}/queues/rx-$N/rps_flow_cnt
echo f >  /sys/class/net/${DEV}/queues/rx-$N/rps_cpus
grep -H . /sys/class/net/${DEV}/queues/rx-$N/rps_cpus
 done

I noticed high contention on enqueue_to_backlog().  To mitigate
this, introduce enqueue_list_to_backlog(), which allows enqueueing
an entire skb list, instead of having to take the per-packet cost.

The skb lists needed for bulk enqueueing are constructed in the
per-CPU area of softnet_data, and can thus be constructed
without heavy CPU synchronization.

I'm excited about the performance improvement of this patch:

Before:
 Ethtool(mlx5p2  ) stat:  7246630 (  7,246,630) <= rx3_packets /sec
After:
 Ethtool(mlx5p2  ) stat:  9182886 (  9,182,886) <= rx3_packets /sec
Improvement:
 (1/9182886-1/7246630)*10^9 = saving -29.0 ns

The benchmark is a single pktgen flow, which is RPS directed to
another CPU, for further processing via process_backlog().  The
remote CPU cannot handle the load and this CPU drops packets when
it cannot get them enqueued.
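
Schematically, the win is taking the remote CPU's queue lock once per
list instead of once per packet (toy illustration; a pthread mutex
stands in for the rps_lock()/irq handling):

#include <stddef.h>
#include <pthread.h>

struct item { struct item *next; };

struct queue {
        pthread_mutex_t lock;
        struct item **tail;     /* initialized to point at the head slot */
};

/* before: one lock round-trip per packet */
static void enqueue_one(struct queue *q, struct item *it)
{
        pthread_mutex_lock(&q->lock);
        it->next = NULL;
        *q->tail = it;
        q->tail = &it->next;
        pthread_mutex_unlock(&q->lock);
}

/* after: one lock round-trip per locally built list;
 * tail_next points at the last element's next pointer */
static void enqueue_list(struct queue *q, struct item *head,
                         struct item **tail_next)
{
        pthread_mutex_lock(&q->lock);
        *q->tail = head;
        q->tail = tail_next;
        pthread_mutex_unlock(&q->lock);
}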
---
 include/linux/netdevice.h |   10 +++
 net/core/dev.c|  133 -
 2 files changed, 140 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 11df9af41a3c..dc5baef95d27 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -633,6 +633,15 @@ struct rps_dev_flow {
 };
#define RPS_NO_FILTER 0xffff
 
+struct rps_cpu_queue {
+   struct sk_buff_head skb_list;
+   int to_cpu;
+   struct rps_dev_flow *rflow;
+   struct net_device *dev;
+};
+#define RPS_CPU_QUEUES 2 /* Must be power of 2 */
+#define RPS_CPU_QUEUES_MASK (RPS_CPU_QUEUES - 1)
+
 /*
  * The rps_dev_flow_table structure contains a table of flow mappings.
  */
@@ -2662,6 +2671,7 @@ struct softnet_data {
unsigned intreceived_rps;
 #ifdef CONFIG_RPS
struct softnet_data *rps_ipi_list;
+   struct rps_cpu_queue local_rps_queue[RPS_CPU_QUEUES];
 #endif
 #ifdef CONFIG_NET_FLOW_LIMIT
struct sd_flow_limit __rcu *flow_limit;
diff --git a/net/core/dev.c b/net/core/dev.c
index 35c92a968937..0a231529bc0c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3736,6 +3736,60 @@ drop:
return NET_RX_DROP;
 }
 
+static int enqueue_list_to_backlog(struct sk_buff_head *skb_list, int cpu,
+  unsigned int *qtail, struct net_device *dev)
+{
+   unsigned int qlen, qlen_drop;
+   struct softnet_data *sd;
+   struct sk_buff *skb;
+   unsigned long flags;
+
+   sd = &per_cpu(softnet_data, cpu);
+
+   local_irq_save(flags);
+
+   rps_lock(sd);
+   if (!netif_running(dev))
+   goto drop;
+   qlen = skb_queue_len(&sd->input_pkt_queue);
+   /* NOTICE: Had to drop !skb_flow_limit(skb, qlen) check here */
+   if (qlen <= netdev_max_backlog) {
+   if (qlen) {
+enqueue:
+   //__skb_queue_tail(&sd->input_pkt_queue, skb);
+   skb_queue_splice_tail_init(skb_list,
+  &sd->input_pkt_queue);
+   input_queue_tail_incr_save(sd, qtail);
+   rps_unlock(sd);
+   local_irq_restore(flags);
+   return NET_RX_SUCCESS;
+   }
+
+   /* Schedule NAPI for backlog device
+* We can use non atomic operation since we own the queue lock
+*/
+   if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
+   if (!rps_ipi_queued(sd))
+   ____napi_schedule(sd, &sd->backlog);
+   }
+   goto enqueue;
+   }
+
+drop:
+   qlen_drop = skb_queue_len(skb_list);
+   sd->dropped += qlen_drop;
+   rps_unlock(sd);
+
+   local_irq_restore(flags);
+
+   atomic_long_add(qlen_drop, &dev->rx_dropped);
+   while ((skb = __skb_dequeue(skb_list)) != NULL) {
+   __kfree_skb_defer(skb);
+   }
+   return NET_RX_DROP;
+}
+
+
 static int netif_rx_internal(struct sk_buff *skb)
 {
int ret;
@@ -4211,14 +4265,43 @@ static int netif_receive_skb_internal(struct sk_buff 
*skb)
 #ifdef CONFIG_RPS
if (static_key_false(_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
+   struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+   struct rps_cpu_queue *lq; /* softnet cpu local queue (lq) */
+
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+   if (cpu < 0)
+   goto no_rps;
+
+   /* RPS destinated packet */
+   // XXX: is local_irq_disable needed here?
+   sd = 

Re: Keystone 2 boards boot failure

2016-02-02 Thread Arnd Bergmann
On Tuesday 02 February 2016 15:01:33 Franklin S Cooper Jr. wrote:
> 
> Yes. Here is a boot log on the latest master with the below
> three patches reverted.
> http://pastebin.com/W7RWSHpE (Working)
> 
> I reverted these three patches. The two latest patches seem
> to be trying to correct/expand upon the last patch on this list.
> 
> commit 958d104e3d40eef5148c402887138f6594ff7e1e
> netcp: fix regression in receive processing
> 
> commit 9dd2d6c5c9755b160fe0111bcdad9491676feea8
> netcp: add more __le32 annotations
> 
> commit 899077791403ff7a2d8cfaa87bd1a82d729463e2
> netcp: try to reduce type confusion in descriptors
> 

The middle patch should have no effect on generated code, so I'm ignoring
that for now.

The next thing to rule out is an endianness bug. I assume you
are running this with a little-endian kernel, correct? If
you are running big-endian, the base assumption that the driver needs
to swap the data was flawed, and that portion needs to be undone.

If you are running little-endian 32-bit, please try the partial
revert below, which just undoes the attempt to make it work with
64-bit kernels.

Arnd

diff --git a/drivers/net/ethernet/ti/netcp_core.c 
b/drivers/net/ethernet/ti/netcp_core.c
index c61d66d38634..7e291c04a81a 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -117,20 +117,10 @@ static void get_pkt_info(dma_addr_t *buff, u32 *buff_len, 
dma_addr_t *ndesc,
*ndesc = le32_to_cpu(desc->next_desc);
 }
 
-static void get_pad_info(u32 *pad0, u32 *pad1, u32 *pad2, struct knav_dma_desc 
*desc)
+static void get_pad_info(u32 *pad0, u32 *pad1, struct knav_dma_desc *desc)
 {
*pad0 = le32_to_cpu(desc->pad[0]);
*pad1 = le32_to_cpu(desc->pad[1]);
-   *pad2 = le32_to_cpu(desc->pad[2]);
-}
-
-static void get_pad_ptr(void **padptr, struct knav_dma_desc *desc)
-{
-   u64 pad64;
-
-   pad64 = le32_to_cpu(desc->pad[0]) +
-   ((u64)le32_to_cpu(desc->pad[1]) << 32);
-   *padptr = (void *)(uintptr_t)pad64;
 }
 
 static void get_org_pkt_info(dma_addr_t *buff, u32 *buff_len,
@@ -163,11 +153,10 @@ static void set_desc_info(u32 desc_info, u32 pkt_info,
desc->packet_info = cpu_to_le32(pkt_info);
 }
 
-static void set_pad_info(u32 pad0, u32 pad1, u32 pad2, struct knav_dma_desc 
*desc)
+static void set_pad_info(u32 pad0, u32 pad1, struct knav_dma_desc *desc)
 {
desc->pad[0] = cpu_to_le32(pad0);
desc->pad[1] = cpu_to_le32(pad1);
-   desc->pad[2] = cpu_to_le32(pad1);
 }
 
 static void set_org_pkt_info(dma_addr_t buff, u32 buff_len,
@@ -581,7 +570,6 @@ static void netcp_free_rx_desc_chain(struct netcp_intf 
*netcp,
dma_addr_t dma_desc, dma_buf;
unsigned int buf_len, dma_sz = sizeof(*ndesc);
void *buf_ptr;
-   u32 pad[2];
u32 tmp;
 
get_words(&dma_desc, 1, &desc->next_desc);
@@ -593,15 +581,13 @@ static void netcp_free_rx_desc_chain(struct netcp_intf 
*netcp,
break;
}
get_pkt_info(&dma_buf, &tmp, &dma_desc, ndesc);
-   get_pad_ptr(&buf_ptr, ndesc);
+   get_pad_info((u32 *)&buf_ptr, &tmp, ndesc);
dma_unmap_page(netcp->dev, dma_buf, PAGE_SIZE, DMA_FROM_DEVICE);
__free_page(buf_ptr);
knav_pool_desc_put(netcp->rx_pool, desc);
}
 
-   get_pad_info(&pad[0], &pad[1], &buf_len, desc);
-   buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32));
-
+   get_pad_info((u32 *)&buf_ptr, &buf_len, desc);
if (buf_ptr)
netcp_frag_free(buf_len <= PAGE_SIZE, buf_ptr);
knav_pool_desc_put(netcp->rx_pool, desc);
@@ -639,8 +625,8 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
*netcp)
dma_addr_t dma_desc, dma_buff;
struct netcp_packet p_info;
struct sk_buff *skb;
-   u32 pad[2];
void *org_buf_ptr;
+   u32 tmp;
 
dma_desc = knav_queue_pop(netcp->rx_queue, &dma_sz);
if (!dma_desc)
@@ -653,8 +639,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
*netcp)
}
 
get_pkt_info(&dma_buff, &buf_len, &dma_desc, desc);
-   get_pad_info(&pad[0], &pad[1], &org_buf_len, desc);
-   org_buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32));
+   get_pad_info((u32 *)&org_buf_ptr, &org_buf_len, desc);
 
if (unlikely(!org_buf_ptr)) {
dev_err(netcp->ndev_dev, "NULL bufptr in desc\n");
@@ -679,7 +664,6 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
*netcp)
/* Fill in the page fragment list */
while (dma_desc) {
struct page *page;
-   void *ptr;
 
ndesc = knav_pool_desc_unmap(netcp->rx_pool, dma_desc, dma_sz);
if (unlikely(!ndesc)) {
@@ -688,8 +672,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
*netcp)
}
 
get_pkt_info(&dma_buff, &buf_len, &dma_desc, ndesc);
-   get_pad_ptr(&ptr, ndesc);
-   page = ptr;
+ 

Re: Keystone 2 boards boot failure

2016-02-02 Thread Arnd Bergmann
On Tuesday 02 February 2016 10:50:41 Franklin S Cooper Jr. wrote:
> Latest mainline is currently failing to boot for Keystone 2
> Hawking but I'm assuming its true for other Keystone 2
> boards. Bisect shows that this issue popped up after the
> patch "netcp: try to reduce type confusion in descriptors"
> (commit 89907779) was introduced. There was another patch
> "netcp: fix regression in receive processing" that seems to
> fix some bugs that the prior patch introduced however it
> still did not resolve the boot failure and was documented as
> not being tested.
> 
> Should we revert these commits or does anyone have any
> suggestions on how to fix these failures? I would be more
> than happy to test any fix.
> 

Have you tried to see if a revert fixes the problem on a
current kernel?

Arnd


Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Willy Tarreau
On Tue, Feb 02, 2016 at 09:32:56PM +0100, Hannes Frederic Sowa wrote:
> But "struct pid *" in unix_skb_parms should be enough to get us to 
> corresponding "struct cred *" so we can decrement the correct counter 
> during skb destruction.
> 
> So:
> 
> We increment current task's unix_inflight and also check the current 
> task's limit during attaching fds to skbs and decrement the inflight 
> counter via "struct pid *". This looks like it should work.

I like it as well, the principle sounds sane.

> >That way it's always the person who actually does the send (rather
> >than the opener of the socket _or_ the opener of the file that gets
> >passed around) that gets credited, and thanks to the cred pointer we
> >can then de-credit them properly.
> 
> Exactly, I try to implement that. Thanks a lot!

Thanks to you Hannes, I appreciate that you work on it; it would take
me much more time to dig into this.

Willy



Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Linus Torvalds
On Tue, Feb 2, 2016 at 12:32 PM, Hannes Frederic Sowa wrote:
>
> Unfortunately we never transfer a scm_cookie via the skbs but merely use it
> to initialize unix_skb_parms structure in skb->cb and destroy it afterwards.

Ok, I obviously didn't check very closely.

> But "struct pid *" in unix_skb_parms should be enough to get us to
> corresponding "struct cred *" so we can decrement the correct counter during
> skb destruction.

Umm. I think the "struct cred" may change in between, can't it?

So I don't think you can later look up the cred based on the pid.

Could we add the cred pointer (or just the user pointer) to the unix_skb_parms?

Or maybe just add it to the "struct scm_fp_list"?

   Linus


[net-next PATCH 10/11] RFC: net: API for RX handover of multiple SKBs to stack

2016-02-02 Thread Jesper Dangaard Brouer
Introduce napi_gro_receive_list() which takes a full SKB-list
for processing by the stack.

It also takes over invoking eth_type_trans().

One purpose is to disconnect the icache usage/sharing between
driver level RX (NAPI loop) and upper RX network stack.

Another advantage is that the stack now knows how many packets it
received, and can do the appropriate packet bundling inside the
stack, e.g. flush/process these bundles when the skb list is empty.

PITFALLS: It is slightly overkill to use a struct sk_buff_head (24
bytes), allocated on the caller's stack, to hand over packets.  It
also maintains a qlen, which is unnecessary in this hotpath code.
A simple list within the first SKB could be a minimal solution.

Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |   12 +---
 include/linux/netdevice.h   |3 +++
 net/core/dev.c  |   18 ++
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 88f88d354abc..b6e7cc29f02c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -230,7 +230,6 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 {
struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
struct sk_buff_head rx_skb_list;
-   struct sk_buff *rx_skb;
int work_done;
 
/* Using SKB list infrastructure, even-though some instructions
@@ -281,16 +280,7 @@ wq_ll_pop:
mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
   &wqe->next.next_wqe_index);
}
-
-   while ((rx_skb = __skb_dequeue(&rx_skb_list)) != NULL) {
-   rx_skb->protocol = eth_type_trans(rx_skb, rq->netdev);
-   napi_gro_receive(cq->napi, rx_skb);
-
-   /* NOT FOR UPSTREAM INCLUSION:
-* How I did isolated testing of driver RX, I here called:
-*  napi_consume_skb(rx_skb, budget);
-*/
-   }
+   napi_gro_receive_list(cq->napi, &rx_skb_list, rq->netdev);
 
mlx5_cqwq_update_db_record(&cq->wq);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5ac140dcb789..11df9af41a3c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3142,6 +3142,9 @@ int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
+void napi_gro_receive_list(struct napi_struct *napi,
+  struct sk_buff_head *skb_list,
+  struct net_device *netdev);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
diff --git a/net/core/dev.c b/net/core/dev.c
index 24be1d07d854..35c92a968937 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4579,6 +4579,24 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, 
struct sk_buff *skb)
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
+void napi_gro_receive_list(struct napi_struct *napi,
+  struct sk_buff_head *skb_list,
+  struct net_device *netdev)
+{
+   struct sk_buff *skb;
+
+   while ((skb = __skb_dequeue(skb_list)) != NULL) {
+   skb->protocol = eth_type_trans(skb, netdev);
+
+   skb_mark_napi_id(skb, napi);
+   trace_napi_gro_receive_entry(skb);
+
+   skb_gro_reset_offset(skb);
+   napi_skb_finish(dev_gro_receive(napi, skb), skb);
+   }
+}
+EXPORT_SYMBOL(napi_gro_receive_list);
+
 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 {
if (unlikely(skb->pfmemalloc)) {



[net-next PATCH 09/11] RFC: dummy: bulk free SKBs

2016-02-02 Thread Jesper Dangaard Brouer
Normal TX completion uses napi_consume_skb(); thus also make the dummy
driver use it, as that makes it easier to see the effect of bulk freeing SKBs.

---
 drivers/net/dummy.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 69fc8409a973..985565ec5d60 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -85,7 +85,8 @@ static netdev_tx_t dummy_xmit(struct sk_buff *skb, struct 
net_device *dev)
dstats->tx_bytes += skb->len;
u64_stats_update_end(>syncp);
 
-   dev_kfree_skb(skb);
+   //dev_kfree_skb(skb);
+   napi_consume_skb(skb, 1);
return NETDEV_TX_OK;
 }
 



[PATCH net v2] bonding: Fix ARP monitor validation

2016-02-02 Thread Jay Vosburgh

The current logic in bond_arp_rcv will accept an incoming ARP for
validation if (a) the receiving slave is either "active" (which includes
the currently active slave, or the current ARP slave) or, (b) there is a
currently active slave, and it has received an ARP since it became active.
For case (b), the receiving slave isn't the currently active slave, and is
receiving the original broadcast ARP request, not an ARP reply from the
target.

This logic can fail if there is no currently active slave.  In
this situation, the ARP probe logic cycles through all slaves, assigning
each in turn as the "current_arp_slave" for one arp_interval, then setting
that one as "active," and sending an ARP probe from that slave.  The
current logic expects the ARP reply to arrive on the sending
current_arp_slave, however, due to switch FDB updating delays, the reply
may be directed to another slave.

This can arise if the bonding slaves and switch are working, but
the ARP target is not responding.  When the ARP target recovers, a
condition may result wherein the ARP target host replies faster than the
switch can update its forwarding table, causing each ARP reply to be sent
to the previous current_arp_slave.  This will never pass the logic in
bond_arp_rcv, as neither of the above conditions (a) or (b) are met.

Some experimentation on a LAN shows ARP reply round trips in the
200 usec range, but my available switches never update their FDB in less
than 4000 usec.

This patch changes the logic in bond_arp_rcv to additionally
accept an ARP reply for validation on any slave if there is a current ARP
slave and it sent an ARP probe during the previous arp_interval.
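
For clarity, the acceptance test after this patch reduces to roughly
the following predicate (toy model, my field names):

#include <stdbool.h>

struct arp_rcv_state {
        bool slave_is_active;   /* receiving slave is active */
        bool have_curr_active;  /* a curr_active_slave exists */
        bool active_seen_arp;   /* it got an ARP since becoming active */
        bool arp_slave_probed;  /* curr_arp_slave sent a probe in the
                                 * previous arp_interval */
};

static bool bond_accept_arp(const struct arp_rcv_state *s)
{
        return s->slave_is_active ||                            /* (a) */
               (s->have_curr_active && s->active_seen_arp) ||   /* (b) */
               s->arp_slave_probed;                             /* (c) */
}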

Fixes: aeea64ac717a ("bonding: don't trust arp requests unless active slave 
really works")
Cc: Veaceslav Falico 
Cc: Andy Gospodarek 
Signed-off-by: Jay Vosburgh 

---
v2: more detail in log and comment; no code change.

 drivers/net/bonding/bond_main.c | 39 ---
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 56b560558884..65a4107749df 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -214,6 +214,8 @@ static void bond_uninit(struct net_device *bond_dev);
 static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
struct rtnl_link_stats64 
*stats);
 static void bond_slave_arr_handler(struct work_struct *work);
+static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
+ int mod);
 
 /* General routines -*/
 
@@ -2459,7 +2461,7 @@ int bond_arp_rcv(const struct sk_buff *skb, struct 
bonding *bond,
 struct slave *slave)
 {
struct arphdr *arp = (struct arphdr *)skb->data;
-   struct slave *curr_active_slave;
+   struct slave *curr_active_slave, *curr_arp_slave;
unsigned char *arp_ptr;
__be32 sip, tip;
int alen, is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
@@ -2506,26 +2508,41 @@ int bond_arp_rcv(const struct sk_buff *skb, struct 
bonding *bond,
&sip, &tip);
 
curr_active_slave = rcu_dereference(bond->curr_active_slave);
+   curr_arp_slave = rcu_dereference(bond->current_arp_slave);
 
-   /* Backup slaves won't see the ARP reply, but do come through
-* here for each ARP probe (so we swap the sip/tip to validate
-* the probe).  In a "redundant switch, common router" type of
-* configuration, the ARP probe will (hopefully) travel from
-* the active, through one switch, the router, then the other
-* switch before reaching the backup.
+   /* We 'trust' the received ARP enough to validate it if:
+*
+* (a) the slave receiving the ARP is active (which includes the
+* current ARP slave, if any), or
+*
+* (b) the receiving slave isn't active, but there is a currently
+* active slave and it received valid arp reply(s) after it became
+* the currently active slave, or
+*
+* (c) there is an ARP slave that sent an ARP during the prior ARP
+* interval, and we receive an ARP reply on any slave.  We accept
+* these because switch FDB update delays may deliver the ARP
+* reply to a slave other than the sender of the ARP request.
 *
-* We 'trust' the arp requests if there is an active slave and
-* it received valid arp reply(s) after it became active. This
-* is done to avoid endless looping when we can't reach the
+* Note: for (b), backup slaves are receiving the broadcast ARP
+* request, not a reply.  This request passes from the sending
+* slave through the L2 

Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Hannes Frederic Sowa

Hi Willy,

On 02.02.2016 21:39, Willy Tarreau wrote:

On Tue, Feb 02, 2016 at 09:32:56PM +0100, Hannes Frederic Sowa wrote:

But "struct pid *" in unix_skb_parms should be enough to get us to
corresponding "struct cred *" so we can decrement the correct counter
during skb destruction.

So:

We increment current task's unix_inflight and also check the current
task's limit during attaching fds to skbs and decrement the inflight
counter via "struct pid *". This looks like it should work.


I like it as well, the principle sounds sane.


That way it's always the person who actually does the send (rather
than the opener of the socket _or_ the opener of the file that gets
passed around) that gets credited, and thanks to the cred pointer we
can then de-credit them properly.


Exactly, I try to implement that. Thanks a lot!


Thanks to you Hannes, I appreciate that you work on it; it would take
me much more time to dig into this.


I lightly tested the attached patch. If you have the original
reproducer available, could you also give it a try? Unfortunately I
can't find it right now and am limited in time this evening.


Thanks a lot,
Hannes

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 2a91a0561a4783..4567dbe04f274d 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -6,8 +6,8 @@
 #include 
 #include 

-void unix_inflight(struct file *fp);
-void unix_notinflight(struct file *fp);
+void unix_inflight(const struct cred *cred, struct file *fp);
+void unix_notinflight(const struct cred *cred, struct file *fp);
 void unix_gc(void);
 void wait_for_unix_gc(void);
 struct sock *unix_get_socket(struct file *filp);
diff --git a/include/net/scm.h b/include/net/scm.h
index 262532d111f51e..8bf7d496545bf8 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -21,6 +21,7 @@ struct scm_creds {
 struct scm_fp_list {
short   count;
short   max;
+   const struct cred   *cred;
struct file *fp[SCM_MAX_FD];
 };

diff --git a/net/core/scm.c b/net/core/scm.c
index 14596fb3717270..6b02b574e283f6 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct 
scm_fp_list **fplp)

*fplp = fpl;
fpl->count = 0;
fpl->max = SCM_MAX_FD;
+   fpl->cred = NULL;
}
fpp = &fpl->fp[fpl->count];

@@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct 
scm_fp_list **fplp)

*fpp++ = file;
fpl->count++;
}
+
+   if (fpl->cred)
+   put_cred(fpl->cred);
+   fpl->cred = get_current_cred();
return num;
 }

@@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm)
scm->fp = NULL;
for (i=fpl->count-1; i>=0; i--)
fput(fpl->fp[i]);
+   put_cred(fpl->cred);
kfree(fpl);
}
 }
@@ -336,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
+   new_fpl->cred = get_cred(fpl->cred);
}
return new_fpl;
 }
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 49d5093eb0553a..ba5058682419ba 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie 
*scm, struct sk_buff *skb)

UNIXCB(skb).fp = NULL;

for (i = scm->fp->count-1; i >= 0; i--)
-   unix_notinflight(scm->fp->fp[i]);
+   unix_notinflight(scm->fp->cred, scm->fp->fp[i]);
 }

 static void unix_destruct_scm(struct sk_buff *skb)
@@ -1561,7 +1561,7 @@ static int unix_attach_fds(struct scm_cookie *scm, 
struct sk_buff *skb)

return -ENOMEM;

for (i = scm->fp->count - 1; i >= 0; i--)
-   unix_inflight(scm->fp->fp[i]);
+   unix_inflight(scm->fp->cred, scm->fp->fp[i]);
return max_level;
 }

diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 8fcdc2283af50c..30b03e7547 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -116,7 +116,7 @@ struct sock *unix_get_socket(struct file *filp)
  * descriptor if it is for an AF_UNIX socket.
  */

-void unix_inflight(struct file *fp)
+void unix_inflight(const struct cred *cred, struct file *fp)
 {
struct sock *s = unix_get_socket(fp);

@@ -133,11 +133,11 @@ void unix_inflight(struct file *fp)
}
unix_tot_inflight++;
}
-   fp->f_cred->user->unix_inflight++;
+   cred->user->unix_inflight++;
spin_unlock(&unix_gc_lock);
 }

-void unix_notinflight(struct file *fp)
+void unix_notinflight(const struct cred *cred, struct file *fp)
 {
struct sock *s = unix_get_socket(fp);

@@ -152,7 +152,7 @@ void unix_notinflight(struct file *fp)
  

Re: [PATCH v3 net-next 2/2] tcp: Add Redundant Data Bundling (RDB)

2016-02-02 Thread Eric Dumazet
On Tue, 2016-02-02 at 20:23 +0100, Bendik Rønning Opstad wrote:
> RDB is a mechanism that enables a TCP sender to bundle redundant
> (already sent) data with TCP packets containing new data. By bundling
> (retransmitting) already sent data with each TCP packet containing new
> data, the connection will be more resistant to sporadic packet loss
> which reduces the application layer latency significantly in congested
> scenarios.
> 
> The main functionality added:
> 
>   o Loss detection of hidden loss events: When bundling redundant data
> with each packet, packet loss can be hidden from the TCP engine due
> to lack of dupACKs. This is because the loss is "repaired" by the
> redundant data in the packet coming after the lost packet. Based on
> incoming ACKs, such hidden loss events are detected, and CWR state
> is entered.
> 
>   o When packets are scheduled for transmission, RDB replaces the SKB to
> be sent with a modified SKB containing the redundant data of
> previously sent data segments from the TCP output queue.

Really this looks very complicated.

Why not simply append the new skb content to the prior one?

skb_still_in_host_queue(sk, prior_skb) would also tell you if the skb is
really available (i.e. its clone is not sitting/waiting in a qdisc on the
host)

Note : select_size() always allocate skb with SKB_WITH_OVERHEAD(2048 -
MAX_TCP_HEADER) available bytes in skb->data.

Also note that tcp_collapse_retrans() is very similar to your needs. You
might simply expand it.





Re: Keystone 2 boards boot failure

2016-02-02 Thread Franklin S Cooper Jr.


On 02/02/2016 02:41 PM, Arnd Bergmann wrote:
> On Tuesday 02 February 2016 10:50:41 Franklin S Cooper Jr. wrote:
>> Latest mainline is currently failing to boot for Keystone 2
>> Hawking but I'm assuming its true for other Keystone 2
>> boards. Bisect shows that this issue popped up after the
>> patch "netcp: try to reduce type confusion in descriptors"
>> (commit 89907779) was introduced. There was another patch
>> "netcp: fix regression in receive processing" that seems to
>> fix some bugs that the prior patch introduced however it
>> still did not resolve the boot failure and was documented as
>> not being tested.
>>
>> Should we revert these commits or does anyone have any
>> suggestions on how to fix these failures? I would be more
>> than happy to test any fix.
>>
> Have you tried to see if a revert fixes the problem on a
> current kernel?

Yes. Here is a boot log on the latest master with the below
three patches reverted.
http://pastebin.com/W7RWSHpE (Working)

I reverted these three patches. The two latest patches seem
to be trying to correct/expand upon the last patch on this list.

commit 958d104e3d40eef5148c402887138f6594ff7e1e
netcp: fix regression in receive processing

commit 9dd2d6c5c9755b160fe0111bcdad9491676feea8
netcp: add more __le32 annotations

commit 899077791403ff7a2d8cfaa87bd1a82d729463e2
netcp: try to reduce type confusion in descriptors



>
>   Arnd



[net-next PATCH 08/11] mlx5: hint the NAPI alloc skb API about the expected bulk size

2016-02-02 Thread Jesper Dangaard Brouer
Use the newly introduced napi_alloc_skb_hint() API to get the underlying
slab bulk allocation sizes to align with what the mlx5 driver needs for
refilling its RX ring queue.

Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   10 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c |2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3e531fae9ed3..b2aba498e8d4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -593,7 +593,8 @@ void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum 
mlx5_event event);
 int mlx5e_napi_poll(struct napi_struct *napi, int budget);
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
-bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq, struct napi_struct *napi);
+bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq, struct napi_struct *napi,
+   unsigned int bulk_hint);
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 5d96d6682db0..88f88d354abc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -43,12 +43,13 @@ static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp 
*tstamp)
 
 static inline int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
 struct mlx5e_rx_wqe *wqe, u16 ix,
-struct napi_struct *napi)
+struct napi_struct *napi,
+unsigned int bulk_hint)
 {
struct sk_buff *skb;
dma_addr_t dma_addr;
 
-   skb = napi_alloc_skb(napi, rq->wqe_sz);
+   skb = napi_alloc_skb_hint(napi, rq->wqe_sz, bulk_hint);
if (unlikely(!skb))
return -ENOMEM;
 
@@ -77,7 +78,8 @@ err_free_skb:
return -ENOMEM;
 }
 
-bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq, struct napi_struct *napi)
+bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq, struct napi_struct *napi,
+   unsigned int hint)
 {
struct mlx5_wq_ll *wq = &rq->wq;
 
@@ -87,7 +89,7 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq, struct 
napi_struct *napi)
while (!mlx5_wq_ll_is_full(wq)) {
struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
-   if (unlikely(mlx5e_alloc_rx_wqe(rq, wqe, wq->head, napi)))
+   if (unlikely(mlx5e_alloc_rx_wqe(rq, wqe, wq->head, napi, hint)))
break;
 
mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 8fd07c8087e3..6488404edff6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -64,7 +64,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 
work_done = mlx5e_poll_rx_cq(&c->rq.cq, budget);
busy |= work_done == budget;
-   busy |= mlx5e_post_rx_wqes(&c->rq, napi);
+   busy |= mlx5e_post_rx_wqes(&c->rq, napi, work_done);
 
if (busy)
return budget;



[net-next PATCH 07/11] net: introduce napi_alloc_skb_hint() for more use-cases

2016-02-02 Thread Jesper Dangaard Brouer
The default bulk alloc size was arbitrarily chosen (to be 8) and might
not suit all use-cases. This introduces a function napi_alloc_skb_hint()
that allows the caller to specify the bulk size hint they are expecting.
It is a hint because __napi_alloc_skb() limits the bulk size to
the array size.

One user is the mlx5 driver, which bulk re-populates its RX ring
with both SKBs and pages.  Thus, it would like to work with
bigger bulk alloc chunks.

Signed-off-by: Jesper Dangaard Brouer 
---
 include/linux/skbuff.h |   19 +++
 net/core/skbuff.c  |8 +++-
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b06ba2e07c89..4d0c0eacbc34 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2391,14 +2391,25 @@ static inline void skb_free_frag(void *addr)
__free_page_frag(addr);
 }
 
+#define NAPI_SKB_CACHE_SIZE 64U /* Used in struct napi_alloc_cache */
+#define NAPI_SKB_BULK_ALLOC 8U /* Default slab bulk alloc in NAPI */
+
 void *napi_alloc_frag(unsigned int fragsz);
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
-unsigned int length, gfp_t gfp_mask);
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
+unsigned int bulk_hint, gfp_t gfp_mask);
 static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
-unsigned int length)
+unsigned int len)
+{
+   return __napi_alloc_skb(napi, len, NAPI_SKB_BULK_ALLOC, GFP_ATOMIC);
+}
+static inline struct sk_buff *napi_alloc_skb_hint(struct napi_struct *napi,
+ unsigned int len,
+ unsigned int bulk_hint)
 {
-   return __napi_alloc_skb(napi, length, GFP_ATOMIC);
+   bulk_hint = bulk_hint ? : 1;
+   return __napi_alloc_skb(napi, len, bulk_hint, GFP_ATOMIC);
 }
+
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
 void __kfree_skb_flush(void);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ae8cdbec90ee..f77209fb5361 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -347,8 +347,6 @@ struct sk_buff *build_skb(void *data, unsigned int 
frag_size)
 }
 EXPORT_SYMBOL(build_skb);
 
-#define NAPI_SKB_CACHE_SIZE 64
-
 struct napi_alloc_cache {
struct page_frag_cache page;
size_t skb_count;
@@ -480,9 +478,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
  * %NULL is returned if there is no free memory.
  */
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
-gfp_t gfp_mask)
+unsigned int bulk_hint, gfp_t gfp_mask)
 {
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+   unsigned int bulk_sz = min(bulk_hint, NAPI_SKB_CACHE_SIZE);
struct skb_shared_info *shinfo;
struct sk_buff *skb;
void *data;
@@ -507,10 +506,9 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, 
unsigned int len,
if (unlikely(!data))
return NULL;
 
-#define BULK_ALLOC_SIZE 8
if (!nc->skb_count) {
nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
- gfp_mask, BULK_ALLOC_SIZE,
+ gfp_mask, bulk_sz,
  nc->skb_cache);
}
if (likely(nc->skb_count)) {



Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Willy Tarreau
On Tue, Feb 02, 2016 at 12:53:20PM -0800, Linus Torvalds wrote:
> On Tue, Feb 2, 2016 at 12:49 PM, Willy Tarreau wrote:
> > On Tue, Feb 02, 2016 at 12:44:54PM -0800, Linus Torvalds wrote:
> >>
> >> Umm. I think the "struct cred" may change in between, can't it?
> >
> > You mean for example in case of setuid() or something like this ?
> 
> Yeah. I'd be worried about looking up the creds or user structure
> later, and possibly getting a different one.
> 
> I'd much rather look it up at attach time, and just carry an extra
> pointer around. That seems to be an inherently safer model where
> there's no worry about "what happens if the user does X in the
> meantime".

Normally we can only change from root to non-root, and we don't apply
the limits to root. So if we have the ability to store one bit
indicating "not tracked", or to simply nullify one pointer so that
in-flight FDs are not counted for root, we don't take the risk of
recrediting them to the target user after a change.

I just don't know if we can do that though :-/

Willy



[net-next PATCH 06/11] RFC: mlx5: RX bulking or bundling of packets before calling network stack

2016-02-02 Thread Jesper Dangaard Brouer
There are several techniques/concepts combined in this optimization.
It is both a data-cache and instruction-cache optimization.

First of all, this is primarily about delaying touching
packet-data, which happens in eth_type_trans, until the prefetch
has had time to fetch.  Thus, hopefully avoiding a cache-miss on
packet data.

Secondly, the instruction-cache optimization is about not
calling the network stack for every packet that is pulled out
of the RX ring.  Calling the full stack likely removes/flushes
the instruction cache every time.

Thus, use two loops: one loop pulling packets out of the RX
ring and starting the prefetching, and a second loop calling
eth_type_trans() and invoking the stack via napi_gro_receive().

Signed-off-by: Jesper Dangaard Brouer 


Notes:
This is the patch that gave a speed-up from 6.2 Mpps to 12 Mpps, when
trying to measure the lowest RX level, by dropping the packets in the
driver itself (drop point marked with a comment).

For now, the ring is emptied up to the budget.  I don't know if it
would be better to chunk it up more?

In the future, I imagine we can call the stack with the full
SKB-list instead of this local loop.  But then it would look a
bit strange, to call eth_type_trans() as the only function...
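
The shape of the two-loop split, as a standalone toy (not the mlx5
code):

#include <stddef.h>

struct pkt {
        struct pkt *next;
        char data[64];
};

static int consume(struct pkt *p)       /* stand-in for the stack */
{
        return p->data[0];
}

static int rx_poll(struct pkt **ring, int n)
{
        struct pkt *head = NULL, **tail = &head;
        int i, sum = 0;

        /* Loop 1: pull from the RX ring, start prefetching data */
        for (i = 0; i < n; i++) {
                __builtin_prefetch(ring[i]->data);
                *tail = ring[i];
                tail = &ring[i]->next;
        }
        *tail = NULL;

        /* Loop 2: data has (hopefully) arrived in cache by now */
        for (; head; head = head->next)
                sum += consume(head);

        return sum;
}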
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |   23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index e923f4adc0f8..5d96d6682db0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -214,8 +214,6 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 
*cqe,
 
mlx5e_handle_csum(netdev, cqe, rq, skb);
 
-   skb->protocol = eth_type_trans(skb, netdev);
-
skb_record_rx_queue(skb, rq->ix);
 
if (likely(netdev->features & NETIF_F_RXHASH))
@@ -229,8 +227,15 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 
*cqe,
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 {
struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
+   struct sk_buff_head rx_skb_list;
+   struct sk_buff *rx_skb;
int work_done;
 
+   /* Using SKB list infrastructure, even-though some instructions
+* could be saved by open-coding it on skb->next directly.
+*/
+   __skb_queue_head_init(&rx_skb_list);
+
/* avoid accessing cq (dma coherent memory) if not needed */
if (!test_and_clear_bit(MLX5E_CQ_HAS_CQES, &cq->flags))
return 0;
@@ -252,7 +257,6 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
wqe_counter = be16_to_cpu(wqe_counter_be);
wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
skb = rq->skb[wqe_counter];
-   prefetch(skb->data);
rq->skb[wqe_counter] = NULL;
 
dma_unmap_single(rq->pdev,
@@ -265,16 +269,27 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
dev_kfree_skb(skb);
goto wq_ll_pop;
}
+   prefetch(skb->data);
 
mlx5e_build_rx_skb(cqe, rq, skb);
rq->stats.packets++;
-   napi_gro_receive(cq->napi, skb);
+   __skb_queue_tail(&rx_skb_list, skb);
 
 wq_ll_pop:
mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
   &wqe->next.next_wqe_index);
}
 
+   while ((rx_skb = __skb_dequeue(&rx_skb_list)) != NULL) {
+   rx_skb->protocol = eth_type_trans(rx_skb, rq->netdev);
+   napi_gro_receive(cq->napi, rx_skb);
+
+   /* NOT FOR UPSTREAM INCLUSION:
+* How I did isolated testing of driver RX, I here called:
+*  napi_consume_skb(rx_skb, budget);
+*/
+   }
+
mlx5_cqwq_update_db_record(&cq->wq);
 
/* ensure cq space is freed before enabling more cqes */



[net-next PATCH 05/11] mlx5: use napi_*_skb APIs to get bulk alloc and free

2016-02-02 Thread Jesper Dangaard Brouer
Bulk alloc and free of SKBs happen transparently via the API calls
napi_alloc_skb() and napi_consume_skb().

The mlx5 driver benefits particularly from these changes,
because it already has a loop refilling its RX ring queue.  I
considered whether the alloc API should be allowed to request larger
allocations, knowing the number of objects it needs to refill.  For
now, just use the default bulk size hidden inside napi_alloc_skb().

Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |9 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c |4 ++--
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9ea49a893323..3e531fae9ed3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -591,9 +591,9 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct 
net_device *dev);
 void mlx5e_completion_event(struct mlx5_core_cq *mcq);
 void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
 int mlx5e_napi_poll(struct napi_struct *napi, int budget);
-bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq);
+bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
-bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
+bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq, struct napi_struct *napi);
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index dd959d929aad..e923f4adc0f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -42,12 +42,13 @@ static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp 
*tstamp)
 }
 
 static inline int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
-struct mlx5e_rx_wqe *wqe, u16 ix)
+struct mlx5e_rx_wqe *wqe, u16 ix,
+struct napi_struct *napi)
 {
struct sk_buff *skb;
dma_addr_t dma_addr;
 
-   skb = netdev_alloc_skb(rq->netdev, rq->wqe_sz);
+   skb = napi_alloc_skb(napi, rq->wqe_sz);
if (unlikely(!skb))
return -ENOMEM;
 
@@ -76,7 +77,7 @@ err_free_skb:
return -ENOMEM;
 }
 
-bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
+bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq, struct napi_struct *napi)
 {
struct mlx5_wq_ll *wq = &rq->wq;
 
@@ -86,7 +87,7 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
while (!mlx5_wq_ll_is_full(wq)) {
struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
-   if (unlikely(mlx5e_alloc_rx_wqe(rq, wqe, wq->head)))
+   if (unlikely(mlx5e_alloc_rx_wqe(rq, wqe, wq->head, napi)))
break;
 
mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 2c3fba0fff54..06a29c8f5712 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -326,7 +326,7 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct 
net_device *dev)
return mlx5e_sq_xmit(sq, skb);
 }
 
-bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq)
+bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 {
struct mlx5e_sq *sq;
u32 dma_fifo_cc;
@@ -402,7 +402,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq)
npkts++;
nbytes += wi->num_bytes;
sqcc += wi->num_wqebbs;
-   dev_kfree_skb(skb);
+   napi_consume_skb(skb, napi_budget);
} while (!last_wqe);
}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 4ac8d716dbdd..8fd07c8087e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -60,11 +60,11 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
clear_bit(MLX5E_CHANNEL_NAPI_SCHED, &c->flags);
 
for (i = 0; i < c->num_tc; i++)
-   busy |= mlx5e_poll_tx_cq(&c->sq[i].cq);
+   busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget);
 
work_done = mlx5e_poll_rx_cq(&c->rq.cq, budget);
busy |= work_done == budget;
-   busy |= mlx5e_post_rx_wqes(&c->rq);
+   busy |= mlx5e_post_rx_wqes(&c->rq, napi);
 
if (busy)
return budget;



Re: [RFC PATCH 3/7] net: sched: add cls_u32 offload hooks for netdevs

2016-02-02 Thread Or Gerlitz
On Tue, Feb 2, 2016 at 6:42 PM, John Fastabend wrote:
> [..] I added
> flower support to the driver with about 100 lines of code; fwiw I'll
> send the patch out later today,

That would be very helpful; I would appreciate it if you post the code
that supports flower to the list or on your github.

> sure I skipped populating all the
> fields by breaking out of some case statements but not that many.


Re: [PATCH] net: drop write-only stack variable

2016-02-02 Thread Hannes Frederic Sowa

On 02.02.2016 18:17, David Herrmann wrote:

Remove a write-only stack variable from unix_attach_fds(). This is a
left-over from the security fix in:

 commit 712f4aad406bb1ed67f3f98d04c044191f0ff593
 Author: willy tarreau 
 Date:   Sun Jan 10 07:54:56 2016 +0100

 unix: properly account for FDs passed over unix sockets

Signed-off-by: David Herrmann 


I don't think this specifically needs a Fixes tag. Seems more like a 
clean-up.


Anyway, I also noticed this during review and it can be removed safely.

Acked-by: Hannes Frederic Sowa 

Thanks,
Hannes




Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Willy Tarreau
On Tue, Feb 02, 2016 at 12:44:54PM -0800, Linus Torvalds wrote:
> On Tue, Feb 2, 2016 at 12:32 PM, Hannes Frederic Sowa
>  wrote:
> > But "struct pid *" in unix_skb_parms should be enough to get us to
> > corresponding "struct cred *" so we can decrement the correct counter during
> > skb destruction.
> 
> Umm. I think the "struct cred" may change in between, can't it?

You mean for example in case of setuid() or something like this ?

willy



[net-next PATCH 01/11] net: bulk free infrastructure for NAPI context, use napi_consume_skb

2016-02-02 Thread Jesper Dangaard Brouer
Discovered that the network stack was hitting the kmem_cache/SLUB
slowpath when freeing SKBs.  Doing bulk free with kmem_cache_free_bulk
can speed up this slowpath.

NAPI context is a bit special; let's take advantage of that for bulk
free'ing SKBs.

In NAPI context we are running in softirq, which gives us certain
protection.  A softirq can run on several CPUs at once.  BUT the
important part is a softirq will never preempt another softirq running
on the same CPU.  This gives us the opportunity to access per-cpu
variables in softirq context.
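
As a hedged sketch (not part of the patch itself), the access pattern
this softirq property permits looks roughly like the following; the
napi_alloc_cache structure is the one introduced below, and the
example_ name is hypothetical:

    static void example_defer_skb(struct sk_buff *skb)
    {
        /* No IRQ disabling needed: softirqs never preempt one
         * another on the same CPU, so the per-CPU cache is safe.
         */
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

        nc->skb_cache[nc->skb_count++] = skb; /* bounds check elided */
    }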

Extend napi_alloc_cache (before only contained page_frag_cache) to be
a struct with a small array based stack for holding SKBs.  Introduce a
SKB defer and flush API for accessing this.

Introduce napi_consume_skb() as replacement for e.g. dev_consume_skb_any()
when running in NAPI context.  A small trick to handle/detect if we
are called from netpoll is to see if budget is 0.  In that case, we
need to invoke dev_consume_skb_irq().
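
As a rough usage sketch (the example_ names are hypothetical, not from
this patch), a TX-completion path simply passes its NAPI budget
through; a budget of 0 makes napi_consume_skb() fall back to the
IRQ-safe free instead of the bulk-free path:

    static void example_clean_tx(struct example_ring *ring, int napi_budget)
    {
        struct sk_buff *skb;

        while ((skb = example_next_completed_skb(ring)) != NULL)
            napi_consume_skb(skb, napi_budget);
    }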

Joint work with Alexander Duyck.

Signed-off-by: Jesper Dangaard Brouer 
Signed-off-by: Alexander Duyck 
---
 include/linux/skbuff.h |3 ++
 net/core/dev.c |1 +
 net/core/skbuff.c  |   83 +---
 3 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 11f935c1a090..3c8d348223d7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2399,6 +2399,9 @@ static inline struct sk_buff *napi_alloc_skb(struct 
napi_struct *napi,
 {
return __napi_alloc_skb(napi, length, GFP_ATOMIC);
 }
+void napi_consume_skb(struct sk_buff *skb, int budget);
+
+void __kfree_skb_flush(void);
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index cc9e3652cf93..73e6cbc10ac6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5149,6 +5149,7 @@ static void net_rx_action(struct softirq_action *h)
}
}
 
+   __kfree_skb_flush();
local_irq_disable();
 
list_splice_tail_init(&sd->poll_list, &list);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b2df375ec9c2..e26bb2b1dba4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -347,8 +347,16 @@ struct sk_buff *build_skb(void *data, unsigned int 
frag_size)
 }
 EXPORT_SYMBOL(build_skb);
 
+#define NAPI_SKB_CACHE_SIZE	64
+
+struct napi_alloc_cache {
+   struct page_frag_cache page;
+   size_t skb_count;
+   void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
@@ -378,9 +386,9 @@ EXPORT_SYMBOL(netdev_alloc_frag);
 
 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
-   struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+   struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

-   return __alloc_page_frag(nc, fragsz, gfp_mask);
+   return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
 }
 
 void *napi_alloc_frag(unsigned int fragsz)
@@ -474,7 +482,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 gfp_t gfp_mask)
 {
-   struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+   struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
void *data;
 
@@ -494,7 +502,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, 
unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
 
-   data = __alloc_page_frag(nc, len, gfp_mask);
+   data = __alloc_page_frag(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
 
@@ -505,7 +513,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, 
unsigned int len,
}
 
/* use OR instead of assignment to avoid clearing of bits in mask */
-   if (nc->pfmemalloc)
+   if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
 
@@ -747,6 +755,69 @@ void consume_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(consume_skb);
 
+void __kfree_skb_flush(void)
+{
+   struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+   /* flush skb_cache if containing objects */
+   if (nc->skb_count) {
+   kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
+nc->skb_cache);
+   nc->skb_count = 0;
+   }
+}
+
+static void __kfree_skb_defer(struct sk_buff *skb)
+{
+   struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+   /* drop skb->head and call any destructors for packet */
+   skb_release_all(skb);
+
+   /* record skb to CPU local list */
+   nc->skb_cache[nc->skb_count++] = skb;
+
+#ifdef CONFIG_SLUB
+   /* SLUB writes into objects when freeing */
+   prefetchw(skb);
+#endif
+
+   /* flush skb_cache if it is filled */
+   if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+   kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
+nc->skb_cache);
+   nc->skb_count = 0;
+   }
+}

[net-next PATCH 02/11] net: bulk free SKBs that were delay free'ed due to IRQ context

2016-02-02 Thread Jesper Dangaard Brouer
The network stack defers freeing SKBs when the free happens in IRQ
context or when IRQs are disabled. This happens in __dev_kfree_skb_irq(),
which moves SKBs that were free'ed during IRQ to the softirq completion
queue (softnet_data.completion_queue).

These SKBs are naturally delayed, and cleaned up during NET_TX_SOFTIRQ
in function net_tx_action().  Take advantage of this and use the skb
defer and flush API, as we are already in softirq context.

For modern drivers this rarely happens, although most drivers do call
dev_kfree_skb_any(), which detects the situation and calls
__dev_kfree_skb_irq() when needed.  This is because netpoll can call
from IRQ context.

Signed-off-by: Alexander Duyck 
Signed-off-by: Jesper Dangaard Brouer 
---
 include/linux/skbuff.h |1 +
 net/core/dev.c |8 +++-
 net/core/skbuff.c  |8 ++--
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3c8d348223d7..b06ba2e07c89 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2402,6 +2402,7 @@ static inline struct sk_buff *napi_alloc_skb(struct 
napi_struct *napi,
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
 void __kfree_skb_flush(void);
+void __kfree_skb_defer(struct sk_buff *skb);
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index 73e6cbc10ac6..24be1d07d854 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3829,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h)
trace_consume_skb(skb);
else
trace_kfree_skb(skb, net_tx_action);
-   __kfree_skb(skb);
+
+   if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+   __kfree_skb(skb);
+   else
+   __kfree_skb_defer(skb);
}
+
+   __kfree_skb_flush();
}
 
if (sd->output_queue) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e26bb2b1dba4..d278e51789e9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -767,7 +767,7 @@ void __kfree_skb_flush(void)
}
 }
 
-static void __kfree_skb_defer(struct sk_buff *skb)
+static inline void _kfree_skb_defer(struct sk_buff *skb)
 {
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
@@ -789,6 +789,10 @@ static void __kfree_skb_defer(struct sk_buff *skb)
nc->skb_count = 0;
}
 }
+void __kfree_skb_defer(struct sk_buff *skb)
+{
+   _kfree_skb_defer(skb);
+}
 
 void napi_consume_skb(struct sk_buff *skb, int budget)
 {
@@ -814,7 +818,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
 
-   __kfree_skb_defer(skb);
+   _kfree_skb_defer(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
 



[net-next PATCH 00/11] net: mitigating kmem_cache slowpath and BoF discussion patches

2016-02-02 Thread Jesper Dangaard Brouer
This patchset is relevant for my NetDev 1.1 "Network Performance BoF" [1].

The first 4 patches are a repost[2] of the first real use-case of the
kmem_cache bulk alloc and free API.  They were adjusted slightly to
accommodate my last slab API changes.  They should be ready for
inclusion in net-next, as the needed MM changes are available in net-next.

Patch 5 is also enabling the SKB bulk API for mlx5.

Thus, patches 1-5 should be ready for net-next.

After patch 5, the experimental patches begin, which are Proof-of-Concept
code for what we will be discussing during the Network Performance BoF [1].

[1] 
http://netdevconf.org/1.1/bof-network-performance-bof-jesper-dangaard-brouer.html
[2] http://thread.gmane.org/gmane.linux.network/384302/

---

Jesper Dangaard Brouer (11):
  net: bulk free infrastructure for NAPI context, use napi_consume_skb
  net: bulk free SKBs that were delay free'ed due to IRQ context
  ixgbe: bulk free SKBs during TX completion cleanup cycle
  net: bulk alloc and reuse of SKBs in NAPI context
  mlx5: use napi_*_skb APIs to get bulk alloc and free
  mlx5: RX bulking or bundling of packets before calling network stack
  net: introduce napi_alloc_skb_bulk() for more use-cases
  mlx5: hint the NAPI alloc skb API about the expected bulk size
  EXPERIMENT: dummy: bulk free
  net: API for RX handover of multiple SKBs to stack
  net: RPS bulk enqueue to backlog


 drivers/net/dummy.c   |3 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |6 -
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |5 -
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   24 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |4 -
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c |4 -
 include/linux/netdevice.h |   13 ++
 include/linux/skbuff.h|   23 ++-
 net/core/dev.c|  160 -
 net/core/skbuff.c |  122 +++-
 10 files changed, 327 insertions(+), 37 deletions(-)

--


[net-next PATCH 04/11] net: bulk alloc and reuse of SKBs in NAPI context

2016-02-02 Thread Jesper Dangaard Brouer
Think twice before applying
 - This patch can potentially introduce added latency in some workloads

This patch introduces bulk alloc of SKBs and allows reuse of SKBs
free'ed in the same softirq cycle.  SKBs are normally free'ed during TX
completion, but most high speed drivers also cleanup TX ring during
NAPI RX poll cycle.  Thus, if using napi_consume_skb/__kfree_skb_defer,
SKBs will be avail in the napi_alloc_cache->skb_cache.

If no SKBs are avail for reuse, then only bulk alloc 8 SKBs, to limit
the potential overshoot of unused SKBs that need to be free'ed when the
NAPI cycle ends (flushed in net_rx_action via __kfree_skb_flush()).

Benchmarking IPv4-forwarding, on CPU i7-4790K @4.2GHz (no turbo boost)
(GCC version 5.1.1 20150618 (Red Hat 5.1.1-4))
 Allocator SLUB:
  Single CPU/flow numbers: before: 2064446 pps -> after: 2083031 pps
  Improvement: +18585 pps, -4.3 nanosec, +0.9%
 Allocator SLAB:
  Single CPU/flow numbers: before: 2035949 pps -> after: 2033567 pps
  Regression: -2382 pps, +0.57 nanosec, -0.1 %
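
(For reference, the nanosecond deltas follow from the packet rates:
for SLUB, 1/2064446 s is about 484.4 ns per packet before versus
1/2083031 s at about 480.1 ns after, a saving of roughly 4.3 ns.)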

Even though benchmarking does show an improvement for SLUB(+0.9%), I'm
not convinced bulk alloc will be a win in all situations:
 * I see stalls on walking the SLUB freelist (normally hidden by prefetch)
 * In case RX queue is not full, alloc and free more SKBs than needed

More testing is needed with more real life benchmarks.

Joint work with Alexander Duyck.

Signed-off-by: Jesper Dangaard Brouer 
Signed-off-by: Alexander Duyck 
---
 net/core/skbuff.c |   35 +++
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d278e51789e9..ae8cdbec90ee 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -483,13 +483,14 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct 
*napi, unsigned int len,
 gfp_t gfp_mask)
 {
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+   struct skb_shared_info *shinfo;
struct sk_buff *skb;
void *data;
 
len += NET_SKB_PAD + NET_IP_ALIGN;
 
-   if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
-   (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+   if (unlikely((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA)))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
goto skb_fail;
@@ -506,12 +507,38 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct 
*napi, unsigned int len,
if (unlikely(!data))
return NULL;
 
-   skb = __build_skb(data, len);
-   if (unlikely(!skb)) {
+#define BULK_ALLOC_SIZE 8
+   if (!nc->skb_count) {
+   nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+ gfp_mask, BULK_ALLOC_SIZE,
+ nc->skb_cache);
+   }
+   if (likely(nc->skb_count)) {
+   skb = (struct sk_buff *)nc->skb_cache[--nc->skb_count];
+   } else {
+   /* alloc bulk failed */
skb_free_frag(data);
return NULL;
}
 
+   len -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+   memset(skb, 0, offsetof(struct sk_buff, tail));
+   skb->truesize = SKB_TRUESIZE(len);
+   atomic_set(&skb->users, 1);
+   skb->head = data;
+   skb->data = data;
+   skb_reset_tail_pointer(skb);
+   skb->end = skb->tail + len;
+   skb->mac_header = (typeof(skb->mac_header))~0U;
+   skb->transport_header = (typeof(skb->transport_header))~0U;
+
+   /* make sure we initialize shinfo sequentially */
+   shinfo = skb_shinfo(skb);
+   memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+   atomic_set(&shinfo->dataref, 1);
+   kmemcheck_annotate_variable(shinfo->destructor_arg);
+
/* use OR instead of assignment to avoid clearing of bits in mask */
if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;



[net-next PATCH 03/11] ixgbe: bulk free SKBs during TX completion cleanup cycle

2016-02-02 Thread Jesper Dangaard Brouer
There is an opportunity to bulk free SKBs during reclaiming of
resources after DMA transmit completes in ixgbe_clean_tx_irq.  Thus,
bulk freeing at this point does not introduce any added latency.

Simply use napi_consume_skb() which was recently introduced.  The
napi_budget parameter is needed by napi_consume_skb() to detect if it
is called from netpoll.

Benchmarking IPv4-forwarding, on CPU i7-4790K @4.2GHz (no turbo boost)
 Single CPU/flow numbers: before: 1982144 pps ->  after : 2064446 pps
 Improvement: +82302 pps, -20 nanosec, +4.1%
 (SLUB and GCC version 5.1.1 20150618 (Red Hat 5.1.1-4))
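
(For reference, the nanosecond delta follows from the packet rates:
1/1982144 s is about 504.5 ns per packet before versus 1/2064446 s at
about 484.4 ns after, a saving of roughly 20 ns.)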

Joint work with Alexander Duyck.

Signed-off-by: Alexander Duyck 
Signed-off-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c4003a88bbf6..0c701b8438b6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1089,7 +1089,7 @@ static void ixgbe_tx_timeout_reset(struct ixgbe_adapter 
*adapter)
  * @tx_ring: tx ring to clean
  **/
 static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
-  struct ixgbe_ring *tx_ring)
+  struct ixgbe_ring *tx_ring, int napi_budget)
 {
struct ixgbe_adapter *adapter = q_vector->adapter;
struct ixgbe_tx_buffer *tx_buffer;
@@ -1127,7 +1127,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector 
*q_vector,
total_packets += tx_buffer->gso_segs;
 
/* free the skb */
-   dev_consume_skb_any(tx_buffer->skb);
+   napi_consume_skb(tx_buffer->skb, napi_budget);
 
/* unmap skb header data */
dma_unmap_single(tx_ring->dev,
@@ -2784,7 +2784,7 @@ int ixgbe_poll(struct napi_struct *napi, int budget)
 #endif
 
ixgbe_for_each_ring(ring, q_vector->tx)
-   clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring);
+   clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring, budget);
 
/* Exit if we are called by netpoll or busy polling is active */
if ((budget <= 0) || !ixgbe_qv_lock_napi(q_vector))



Re: [PATCH net-next 6/6] e1000e: call ndo_stop() instead of dev_close() when running offline selftest

2016-02-02 Thread David Miller

Always, when sending an updated version of a patch which is part of a
series, always resubmit the entire series rather than just the patch
which is changing.

Thanks.


Re: [PATCH] rtlwifi: Fix reusable codes in core.c

2016-02-02 Thread Julian Calaby
Hi Byeoungwook,

On Wed, Feb 3, 2016 at 2:48 AM, Byeoungwook Kim  wrote:
> The rtl_*_delay() functions reused the same code for the addr variable,
> so I converted that code to call rtl_addr_delay().
>
> The conditional code in rtl_addr_delay() is improved in readability and
> performance by using a switch statement.
>
> Signed-off-by: Byeoungwook Kim 
> ---
>  drivers/net/wireless/realtek/rtlwifi/core.c | 48 
> +++--
>  1 file changed, 18 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c 
> b/drivers/net/wireless/realtek/rtlwifi/core.c
> index 4ae421e..c1193d1 100644
> --- a/drivers/net/wireless/realtek/rtlwifi/core.c
> +++ b/drivers/net/wireless/realtek/rtlwifi/core.c
> @@ -37,36 +37,34 @@
>
>  void rtl_addr_delay(u32 addr)
>  {
> -   if (addr == 0xfe)
> +   switch (addr) {
> +   case 0xfe:
> mdelay(50);
> -   else if (addr == 0xfd)
> +   break;
> +   case 0xfd:
> mdelay(5);
> -   else if (addr == 0xfc)
> +   break;
> +   case 0xfc:
> mdelay(1);
> -   else if (addr == 0xfb)
> +   break;
> +   case 0xfb:
> udelay(50);
> -   else if (addr == 0xfa)
> +   break;
> +   case 0xfa:
> udelay(5);
> -   else if (addr == 0xf9)
> +   break;
> +   case 0xf9:
> udelay(1);
> +   break;
> +   };

As you're introducing a case statement here, you could consolidate the
addresses that have the same delays, i.e.

case 0xfe:
case 0xfb:
mdelay(50);
break;

also, you should arguably be doing this cleanup in a separate patch, i.e.

1. Convert open coded instances to use this function (i.e. the changes
below this comment)
2. Improve the function

>  }
>  EXPORT_SYMBOL(rtl_addr_delay);
>
>  void rtl_rfreg_delay(struct ieee80211_hw *hw, enum radio_path rfpath, u32 
> addr,
>  u32 mask, u32 data)
>  {
> -   if (addr == 0xfe) {
> -   mdelay(50);
> -   } else if (addr == 0xfd) {
> -   mdelay(5);
> -   } else if (addr == 0xfc) {
> -   mdelay(1);
> -   } else if (addr == 0xfb) {
> -   udelay(50);
> -   } else if (addr == 0xfa) {
> -   udelay(5);
> -   } else if (addr == 0xf9) {
> -   udelay(1);
> +   if (addr >= 0xf9 && addr <= 0xfe) {
> +   rtl_addr_delay(addr);
> } else {
> rtl_set_rfreg(hw, rfpath, addr, mask, data);
> udelay(1);
> @@ -76,18 +74,8 @@ EXPORT_SYMBOL(rtl_rfreg_delay);
>
>  void rtl_bb_delay(struct ieee80211_hw *hw, u32 addr, u32 data)
>  {
> -   if (addr == 0xfe) {
> -   mdelay(50);
> -   } else if (addr == 0xfd) {
> -   mdelay(5);
> -   } else if (addr == 0xfc) {
> -   mdelay(1);
> -   } else if (addr == 0xfb) {
> -   udelay(50);
> -   } else if (addr == 0xfa) {
> -   udelay(5);
> -   } else if (addr == 0xf9) {
> -   udelay(1);
> +   if (addr >= 0xf9 && addr <= 0xfe) {
> +   rtl_addr_delay(addr);
> } else {
> rtl_set_bbreg(hw, addr, MASKDWORD, data);
> udelay(1);

Thanks,

-- 
Julian Calaby

Email: julian.cal...@gmail.com
Profile: http://www.google.com/profiles/julian.calaby/


[PATCH net-next] hv_netvsc: Increase delay for RNDIS_STATUS_NETWORK_CHANGE

2016-02-02 Thread Haiyang Zhang
We simulate a link down period for the RNDIS_STATUS_NETWORK_CHANGE message
to trigger a DHCP renew. User daemons may need multiple seconds to register
the link down event (e.g. ifplugd: 5 sec, network-manager: 4 sec), so update
this link down period to 10 sec to reliably trigger the DHCP renew.
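
(The 10 sec total is the existing LINKCHANGE_INT of 2 * HZ plus the new
LINKCHANGE_DELAY of 8 * HZ defined below.)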

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c |   10 --
 1 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 1d3a665..6f23973 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -43,6 +43,8 @@
 
 #define RING_SIZE_MIN 64
 #define LINKCHANGE_INT (2 * HZ)
+/* Extra delay for RNDIS_STATUS_NETWORK_CHANGE: */
+#define LINKCHANGE_DELAY (8 * HZ)
 static int ring_size = 128;
 module_param(ring_size, int, S_IRUGO);
 MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
@@ -964,6 +966,7 @@ static void netvsc_link_change(struct work_struct *w)
return;
}
ndev_ctx->last_reconfig = jiffies;
+   delay = LINKCHANGE_INT;
 
spin_lock_irqsave(&ndev_ctx->lock, flags);
if (!list_empty(&ndev_ctx->reconfig_events)) {
@@ -1009,8 +1012,11 @@ static void netvsc_link_change(struct work_struct *w)
netif_tx_stop_all_queues(net);
event->event = RNDIS_STATUS_MEDIA_CONNECT;
spin_lock_irqsave(&ndev_ctx->lock, flags);
-   list_add_tail(&event->list, &ndev_ctx->reconfig_events);
+   list_add(&event->list, &ndev_ctx->reconfig_events);
spin_unlock_irqrestore(&ndev_ctx->lock, flags);
+
+   ndev_ctx->last_reconfig += LINKCHANGE_DELAY;
+   delay = LINKCHANGE_INT + LINKCHANGE_DELAY;
reschedule = true;
}
break;
@@ -1025,7 +1031,7 @@ static void netvsc_link_change(struct work_struct *w)
 * second, handle next reconfig event in 2 seconds.
 */
if (reschedule)
-   schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
+   schedule_delayed_work(&ndev_ctx->dwork, delay);
 }
 
 static void netvsc_free_netdev(struct net_device *netdev)
-- 
1.7.4.1



[RFC] Inverse of flowi{4,6}_oif: flowi{4,6}_not_oif

2016-02-02 Thread Jason A. Donenfeld
Hi folks,

Sometimes it is useful to ask, "what is the route for 1.2.3.4/32 if we
*exclude* routes that go out through eth8?" Currently, the only way of
doing this is to read the entire routing table in userspace, and then
reimplement all of the logic for the various tables and metrics and
complex logic of the FIB, remove the routes you want, and then
calculate the answer. This is, obviously, far from satisfactory, as
it's not really feasible to accurately reimplement that. Of course,
another obviously flawed way would be to just remove those routes for
"dev eth8", do the lookup, and then re-add them, but this is
disruptive.

The best solution for this is to add a flowi4_not_oif and
flowi6_not_oif member which looks up a route that doesn't use the
specified netdev.

What are the use cases of this? Several.

In userspace, the most obvious usage is this: OpenVPN or OpenConnect
or any other similar application receives routes from a server. It
wants to add those routes to the routing table. But, it needs to make
sure that the OpenVPN endpoint is still accessible over the actual
network interface, especially in the case of being pushed "0/1 and
128/1". So, before adding those routes, it looks up what the existing
route is, and then adds that route explicitly: "ip route add
1.2.3.4/32 via ". Then it can add routes that
might potentially override this, while keeping the tunnel working.

However, there are big problems with this naive (yet "state of the
art") approach. What if the former default route changes (because of,
say, dhclient)? In this case, the explicit route to the endpoint is
not updated. Or worse, what if several complicated changes are made at
once to the routing table? The *only* way to reliably figure out the
new explicit route to the tunnel endpoint is to remove the tunnel's
existing routes (!), query the route for the endpoint, and then re-add
them. Not only does this affect availability due to its blatant lack
of atomicity, but it also is an issue from a network security
perspective. Another problem -- which affects me personally on a daily
basis -- is: what happens when the device that previously routed the
endpoint goes down, and then back up again? This happens with wireless
cards, for example, when a laptop suspends. On an OpenVPN laptop with
"0/1 and 128/1" routes, upon resuming from suspend and reconnecting to
a wireless network, one must manually reconfigure the explicit route
to the endpoint, since it has been automatically garbage collected
when the interface went down. No, this isn't a userspace problem: as
previously mentioned, userspace cannot reliably make the calculations
necessary to add such endpoint routes without affecting availability
and/or security.

There's another use case, inside the kernel. Geneve, vxlan, and many
other tunnel devices have this copy codeblock:

if (rt->dst.dev == dev) { /* is this necessary? */
netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr);
ip_rt_put(rt);
return ERR_PTR(-ELOOP);
}

While it remains up for debate (and potential configuration flags)
whether one would want such an "automagical" solution, it is possible
to imagine "rt->dst" here being calculated with "flowi{4,6}_not_oif"
in mind, which would eliminate this loop detection need and generally
lead to having a happier network administrator.

In private discussions with several system admins and kernel
developers alike, the response has been, "oh my God, I know - I hate
this issue. What an elegant solution! Have you written to davem &
friends about this?" to which I respond, "maybe some day I'll have the
courage..." Well, this is it guys.

So, what I propose is adding this "flowi{4,6}_not_oif", for an
extremely common and only-properly-solved-by-the-kernel problem. The
first step would be augmenting fib4 and fib6, and the second step
would be adding support for this to ip-route(8) and the rtnetlink
layer.
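
To make that concrete, here is a hedged sketch of the intended lookup.
flowi4_not_oif is the proposed, not-yet-existing member; the rest is
the current FIB lookup API, and endpoint_addr, net, and tun_dev are
placeholders:

    struct flowi4 fl4 = {
        .daddr          = endpoint_addr,    /* e.g. the tunnel endpoint */
        .flowi4_not_oif = tun_dev->ifindex, /* exclude routes via tun_dev */
    };
    struct rtable *rt = ip_route_output_key(net, &fl4);

    if (!IS_ERR(rt)) {
        /* best route that does not loop back through tun_dev */
        ip_rt_put(rt);
    }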

I stress again: there is no feasible userspace solution to this problem.

So, this [RFC] is to determine the following:

(1) Would you merge a patch that adds this functionality?
(2) Is there someone intimately familiar with the FIB who would be
willing to write this patch?

- If 1 && 2, awesome! I owe you a steak dinner.
- If !1, why? You best have quite a good alternative solution for this
issue (that doesn't include the words "install NetworkManager").
- If 1 && !2, I'll do a thorough study of the FIB code and write it myself.
- If !1 and 2, um, well, join the cause I guess.

Hope to hear from you soon.

Thanks,
Jason


[PATCH] ethtool: add support for dynamic mode in {SG}RXFH commands

2016-02-02 Thread Jacob Keller
Signed-off-by: Jacob Keller 
---
 ethtool-copy.h |  8 +++-
 ethtool.c  | 36 +++-
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/ethtool-copy.h b/ethtool-copy.h
index d23ffc4c38b4..620dcea06d25 100644
--- a/ethtool-copy.h
+++ b/ethtool-copy.h
@@ -890,6 +890,10 @@ struct ethtool_rxfh_indir {
  * hardware hash key.
  * @hfunc: Defines the current RSS hash function used by HW (or to be set to).
  * Valid values are one of the %ETH_RSS_HASH_*.
+ * @dynamic: Indicate whether the device driver may use dynamic RSS settings
+ * which change due to various run time factors, such as number of
+ * queues. When false driver must attempt to preserve RSS settings when
+ * possible. When true driver may override any requested RSS settings.
  * @rsvd:  Reserved for future extensions.
  * @rss_config: RX ring/queue index for each hash value i.e., indirection table
  * of @indir_size __u32 elements, followed by hash key of @key_size
@@ -900,6 +904,7 @@ struct ethtool_rxfh_indir {
  * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested
  * and a @indir_size of zero means the indir table should be reset to default
  * values. An hfunc of zero means that hash function setting is not requested.
+ * If dynamic is true, driver may ignore any other settings requested.
  */
 struct ethtool_rxfh {
__u32   cmd;
@@ -907,7 +912,8 @@ struct ethtool_rxfh {
__u32   indir_size;
__u32   key_size;
__u8    hfunc;
-   __u8    rsvd8[3];
+   __u8    dynamic;
+   __u8    rsvd8[2];
__u32   rsvd32;
__u32   rss_config[0];
 };
diff --git a/ethtool.c b/ethtool.c
index 92c40b823f2c..29b9279b6b1c 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -917,6 +917,19 @@ static int convert_string_to_hashkey(char *rss_hkey, u32 
key_size,
return 2;
 }
 
+static u8 parse_dynamic(const char *rss_dynamic, u8 *dynamic)
+{
+   if (!strcmp(rss_dynamic, "on")) {
+   *dynamic = 1;
+   return 0;
+   } else if (!strcmp(rss_dynamic, "off")) {
+   *dynamic = 0;
+   return 0;
+   } else {
+   return 2;
+   }
+}
+
 static int parse_hkey(char **rss_hkey, u32 key_size,
  const char *rss_hkey_string)
 {
@@ -3213,6 +3226,11 @@ static int do_grxfh(struct cmd_context *ctx)
indir_bytes = rss->indir_size * sizeof(rss->rss_config[0]);
hkey = ((char *)rss->rss_config + indir_bytes);
 
+   if (rss->dynamic)
+   printf("Dynamic mode enabled\n");
+   else
+   printf("Static mode enabled\n");
+
printf("RSS hash key:\n");
if (!rss->key_size)
printf("Operation not supported\n");
@@ -3326,11 +3344,13 @@ static int do_srxfh(struct cmd_context *ctx)
int rxfhindir_equal = 0;
char **rxfhindir_weight = NULL;
char *rxfhindir_key = NULL;
+   char *rxfh_dynamic = NULL;
char *hkey = NULL;
int err = 0;
u32 arg_num = 0, indir_bytes = 0;
u32 entry_size = sizeof(rss_head.rss_config[0]);
u32 num_weights = 0;
+   u8 dynamic = 0;
 
if (ctx->argc < 2)
exit_bad_args();
@@ -3357,6 +3377,12 @@ static int do_srxfh(struct cmd_context *ctx)
if (!rxfhindir_key)
exit_bad_args();
++arg_num;
+   } else if (!strcmp(ctx->argp[arg_num], "dynamic")) {
+   ++arg_num;
+   rxfh_dynamic = ctx->argp[arg_num];
+   if (!rxfh_dynamic)
+   exit_bad_args();
+   ++arg_num;
} else {
exit_bad_args();
}
@@ -3392,6 +3418,12 @@ static int do_srxfh(struct cmd_context *ctx)
return err;
}
 
+   if (rxfh_dynamic) {
+   err = parse_dynamic(rxfh_dynamic, &dynamic);
+   if (err)
+   return err;
+   }
+
if (rxfhindir_equal || rxfhindir_weight)
indir_bytes = rss_head.indir_size * entry_size;
 
@@ -3403,6 +3435,7 @@ static int do_srxfh(struct cmd_context *ctx)
rss->cmd = ETHTOOL_SRSSH;
rss->indir_size = rss_head.indir_size;
rss->key_size = rss_head.key_size;
+   rss->dynamic = dynamic;
 
if (fill_indir_table(&rss->indir_size, rss->rss_config, rxfhindir_equal,
 rxfhindir_weight, num_weights)) {
@@ -4112,7 +4145,8 @@ static const struct option {
{ "-X|--set-rxfh-indir|--rxfh", 1, do_srxfh,
  "Set Rx flow hash indirection and/or hash key",
  " [ equal N | weight W0 W1 ... ]\n"
- " [ hkey %x:%x:%x:%x:%x: ]\n" },
+ " [ hkey %x:%x:%x:%x:%x: ]\n"
+ " [ dynamic on|off ]\n" },
{ 

[PATCH 1/2] ethtool: add dynamic flag to ETHTOOL_{GS}RXFH commands

2016-02-02 Thread Jacob Keller
Ethtool supports a few operations for modifying and controlling
a device's RSS table. Sometimes, changes in other features of the device
may require (or make desirable) changes to the RSS table. Currently there is no
method to indicate to the driver whether the current RSS table settings
should be maintained or overridden.

A simple example of this is for when the number of receive queues is
changed, there are two possibilities. First, the number of queues is
decreased. This must result in a reprogramming of the RSS table since it
will no longer match correctly and may attempt to assign traffic to
a queue which is now disabled. In this case drivers have a clear
indication of what to do.

The second case is when the number of queues has increased. In this
case, the current RSS table may be preserved. However, doing so would
result in the new queues being unused for RSS. But if the driver chooses
to destroy the RSS configuration it may result in unwanted behavior as
now the user's configured changes are lost.

This patch attempts to resolve this (and other similar) issues by
indicating a new flag "dynamic" which can be set by the user when
calling the ethtool interface.

This flag indicates to the driver that it may overwrite settings in the
RSS table. If false, it indicates the driver should do what it can to
preserve the RSS table changes requested by the user. That is, for cases
where it can preserve the table it must. If the value is set true, it
means the driver may or may not apply the current settings and is free
to change the values as necessary. The current default is set to false,
as this is how most drivers appear to behave today.
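
As a hedged sketch (the example_ names are hypothetical), a driver's
.set_rxfh implementation could track the flag the way the fm10k patch
in this series does:

    static int example_set_rxfh(struct net_device *netdev, const u32 *indir,
                                const u8 *key, const u8 hfunc, u8 dynamic)
    {
        struct example_priv *priv = netdev_priv(netdev);

        if (dynamic)
            priv->flags &= ~EXAMPLE_FLAG_STATIC_RETA; /* may override */
        else
            priv->flags |= EXAMPLE_FLAG_STATIC_RETA;  /* preserve table */

        return example_write_reta(netdev, indir);
    }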

Signed-off-by: Jacob Keller 
Cc: Lendacky, Thomas 
Cc: Yuval Mintz 
Cc: Michael Chan 
Cc: Matt Carlson 
Cc: Sunil Goutham 
Cc: Hariprasad Shenai 
Cc: Govindarajulu Varadarajan <_gov...@gmx.com>
Cc: Kalesh AP 
Cc: Andrew Lunn 
Cc: Shannon Nelson 
Cc: Mitch Williams 
Cc: Carolyn Wyborny 
Cc: Emil Tantilov 
Cc: Thomas Petazzoni 
Cc: Amir Vadai 
Cc: Achiad Shochat 
Cc: Ben Hutchings 
Cc: Michał Mirosław 
Cc: Alexander Duyck 

---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c   |  7 --
 .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c|  7 --
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  |  5 +++-
 drivers/net/ethernet/broadcom/tg3.c|  8 +--
 .../net/ethernet/cavium/thunder/nicvf_ethtool.c|  7 --
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c |  7 --
 drivers/net/ethernet/cisco/enic/enic_ethtool.c |  7 --
 drivers/net/ethernet/emulex/benet/be_ethtool.c |  7 --
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c   |  8 +--
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c   |  7 --
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  7 --
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c |  6 +++--
 drivers/net/ethernet/intel/igb/igb_ethtool.c   |  6 +++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c   |  7 --
 drivers/net/ethernet/intel/ixgbevf/ethtool.c   |  5 +++-
 drivers/net/ethernet/marvell/mvneta.c  |  8 +--
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c|  7 --
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  7 --
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  7 --
 drivers/net/ethernet/sfc/ethtool.c |  7 --
 include/linux/ethtool.h|  4 ++--
 include/uapi/linux/ethtool.h   |  8 ++-
 net/core/ethtool.c | 27 +-
 23 files changed, 124 insertions(+), 52 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 6040293db9c1..4eecd225db7c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -509,11 +509,14 @@ static u32 xgbe_get_rxfh_indir_size(struct net_device 
*netdev)
 }
 
 static int xgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-u8 *hfunc)
+u8 *hfunc, u8 *dynamic)
 {
struct xgbe_prv_data *pdata = netdev_priv(netdev);
unsigned int i;
 
+   if (dynamic)
+   *dynamic = false;
+
if (indir) {
for (i = 0; i < ARRAY_SIZE(pdata->rss_table); i++)
indir[i] = XGMAC_GET_BITS(pdata->rss_table[i],
@@ -530,7 +533,7 @@ static int xgbe_get_rxfh(struct net_device *netdev, u32 
*indir, u8 *key,
 }
 
 

[PATCH 2/2] fm10k: support dynamic mode for RSS table control

2016-02-02 Thread Jacob Keller
Add support for the new dynamic flag from set_rxfh and get_rxfh. For now
the only known dynamic reason to change RSS is when number of queues is
changed. The default mode for the driver will be dynamic, indicating
that the driver is free to change its own default setting as it sees
fit. Since the current default userspace ethtool operation will only
send false as the value of dynamic, the resulting behavior is that the
driver-configured RSS settings can be modified, but user-supplied
changes will be preserved as long as possible.

In practice, this resolves an issue where decreasing the number of
queues and then increasing them shall no longer break RSS unless the
user has manually configured a non-dynamic RSS table setting.

If the user has configured the RSS table without setting dynamic, the
driver will do its best to maintain the configuration. If this is not
possible, we will indicate that the mode is now dynamic and reset to
driver defaults.

Signed-off-by: Jacob Keller 
---
 drivers/net/ethernet/intel/fm10k/fm10k.h |  1 +
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 10 +-
 drivers/net/ethernet/intel/fm10k/fm10k_main.c|  8 +---
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h 
b/drivers/net/ethernet/intel/fm10k/fm10k.h
index 83f386714e87..5402b5c55247 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -268,6 +268,7 @@ struct fm10k_intfc {
 #define FM10K_FLAG_RX_TS_ENABLED   (u32)(BIT(3))
 #define FM10K_FLAG_SWPRI_CONFIG(u32)(BIT(4))
 #define FM10K_FLAG_DEBUG_STATS (u32)(BIT(5))
+#define FM10K_FLAG_STATIC_RETA_TBL (u32)(BIT(6))
int xcast_mode;
 
/* Tx fast path data */
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c 
b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index af11c4c1b256..8799296ff86e 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -1100,7 +1100,7 @@ static int fm10k_get_rssh(struct net_device *netdev, u32 
*indir, u8 *key,
*hfunc = ETH_RSS_HASH_TOP;
 
if (dynamic)
-   *dynamic = false;
+   *dynamic = !(interface->flags & FM10K_FLAG_STATIC_RETA_TBL);
 
err = fm10k_get_reta(netdev, indir);
if (err || !key)
@@ -1123,6 +1123,14 @@ static int fm10k_set_rssh(struct net_device *netdev, 
const u32 *indir,
if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
return -EOPNOTSUPP;
 
+   /* If dynamic mode is not requested, enable the static flag. We'll
+* still attempt to populate the RETA table using the provided
+* settings if possible.
+*/
+   if (dynamic)
+   interface->flags &= ~FM10K_FLAG_STATIC_RETA_TBL;
+   else
+   interface->flags |= FM10K_FLAG_STATIC_RETA_TBL;
+
err = fm10k_set_reta(netdev, indir);
if (err || !key)
return err;
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c 
b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 134ce4daa994..8c75f60028a6 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -1932,13 +1932,13 @@ static void fm10k_assign_rings(struct fm10k_intfc 
*interface)
fm10k_cache_ring_rss(interface);
 }
 
-static void fm10k_init_reta(struct fm10k_intfc *interface)
+void fm10k_init_reta(struct fm10k_intfc *interface)
 {
u16 i, rss_i = interface->ring_feature[RING_F_RSS].indices;
u32 reta, base;
 
-   /* If the netdev is initialized we have to maintain table if possible */
-   if (interface->netdev->reg_state != NETREG_UNINITIALIZED) {
+   /* Maintain static user provided table if possible. */
+   if (interface->flags & FM10K_FLAG_STATIC_RETA_TBL) {
for (i = FM10K_RETA_SIZE; i--;) {
reta = interface->reta[i];
if ((((reta << 24) >> 24) < rss_i) &&
@@ -1954,6 +1954,8 @@ static void fm10k_init_reta(struct fm10k_intfc *interface)
}
 
 repopulate_reta:
+   interface->flags &= ~FM10K_FLAG_STATIC_RETA_TBL;
+
/* Populate the redirection table 4 entries at a time.  To do this
 * we are generating the results for n and n+2 and then interleaving
 * those with the results with n+1 and n+3.
-- 
2.6.3.505.g5cc1fd1



[PATCH net-next v4 1/2] ethtool: add speed/duplex validation functions

2016-02-02 Thread Nikolay Aleksandrov
From: Nikolay Aleksandrov 

Add functions which check if the speed/duplex are defined.

Signed-off-by: Nikolay Aleksandrov 
Acked-by: Michael S. Tsirkin 
---
v2: new patch
v3: added Michael's ack
v4: no change

 include/uapi/linux/ethtool.h | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 57fa39005e79..b2e180181629 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1319,11 +1319,45 @@ enum ethtool_sfeatures_retval_bits {
 
 #define SPEED_UNKNOWN  -1
 
+static inline int ethtool_validate_speed(__u32 speed)
+{
+   switch (speed) {
+   case SPEED_10:
+   case SPEED_100:
+   case SPEED_1000:
+   case SPEED_2500:
+   case SPEED_5000:
+   case SPEED_10000:
+   case SPEED_20000:
+   case SPEED_25000:
+   case SPEED_40000:
+   case SPEED_50000:
+   case SPEED_56000:
+   case SPEED_100000:
+   case SPEED_UNKNOWN:
+   return 1;
+   }
+
+   return 0;
+}
+
 /* Duplex, half or full. */
 #define DUPLEX_HALF0x00
 #define DUPLEX_FULL0x01
 #define DUPLEX_UNKNOWN 0xff
 
+static inline int ethtool_validate_duplex(__u8 duplex)
+{
+   switch (duplex) {
+   case DUPLEX_HALF:
+   case DUPLEX_FULL:
+   case DUPLEX_UNKNOWN:
+   return 1;
+   }
+
+   return 0;
+}
+
 /* Which connector port. */
 #define PORT_TP0x00
 #define PORT_AUI   0x01
-- 
2.4.3



[PATCH net-next v4 0/2] virtio_net: add ethtool get/set settings support

2016-02-02 Thread Nikolay Aleksandrov
From: Nikolay Aleksandrov 

Hi,
Patch 1 adds ethtool speed/duplex validation functions which check if the
value is defined. Patch 2 adds support for ethtool (get|set)_settings and
uses the validation functions to check the user-supplied values.

v2: split in 2 patches to allow everyone to make use of the validation
functions and allow virtio_net devices to be half duplex
v3: added a check to return error if the user tries to change anything else
besides duplex/speed as per Michael's comment
v4: Set port type to PORT_OTHER

Cheers,
 Nik

Nikolay Aleksandrov (2):
  ethtool: add speed/duplex validation functions
  virtio_net: add ethtool support for set and get of settings

 drivers/net/virtio_net.c | 59 
 include/uapi/linux/ethtool.h | 34 +
 2 files changed, 93 insertions(+)

-- 
2.4.3



[PATCH] ethtool: add dynamic flag to {SG}RXFH

2016-02-02 Thread Jacob Keller
This patch series proposes the addition of a dynamic flag to the
ethtool {SG}RXFH operations. The primary reasoning for this is so that
drivers may indicate when they destroyed configured RSS settings, and
can determine when they have more liberty to remove user's settings.
The default mode shall be the current static mode, where drivers
should make best effort to maintain the RSS table settings if
possible. However, if the user sets dynamic mode flag, then the driver
is free (if necessary or useful) to modify the requested settings. The
primary reason for this is when queue sizes change dynamically. If you
increase the number of available queues, the RSS table may be
ill-configured and the driver might wish to change the settings.
Today, most drivers attempt to maintain the RSS table when possible.
This means that a user can observe functioning RSS, decrease the
number of queues, and then increase them again. Under current
functionality, drivers may end up never re-writing the RSS table back
to the default when the queues are increased. Even worse, if a driver
does do this today, they may have destroyed some specific settings the
user configured in the RSS table. Instead, use the dynamic mode value
which the driver will use to indicate whether or not the current
settings might change due to dynamic factors.

This series includes support to fix all the driver function pointers,
and a patch to enhance fm10k driver to support this feature. The
previous behavior of the fm10k driver was especially problematic, and
the current patch series attempts to resolve this.

It is possible to have most of the behavior handled purely in driver,
but then we lose any ability to communicate this to the user via
ethtool.

Jacob Keller (3):
  ethtool: add dynamic flag to ETHTOOL_{GS}RXFH commands
  fm10k: support dynamic mode for RSS table control
  ethtool: add support for dynamic mode in {SG}RXFH commands

 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |  7 +--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c  |  7 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c|  5 -
 drivers/net/ethernet/broadcom/tg3.c  |  8 ++--
 drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c  |  7 +--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c   |  7 +--
 drivers/net/ethernet/cisco/enic/enic_ethtool.c   |  7 +--
 drivers/net/ethernet/emulex/benet/be_ethtool.c   |  7 +--
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c |  8 ++--
 drivers/net/ethernet/intel/fm10k/fm10k.h |  1 +
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 15 +--
 drivers/net/ethernet/intel/fm10k/fm10k_main.c|  8 +---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c   |  7 +--
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c   |  6 --
 drivers/net/ethernet/intel/igb/igb_ethtool.c |  6 --
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |  7 +--
 drivers/net/ethernet/intel/ixgbevf/ethtool.c |  5 -
 drivers/net/ethernet/marvell/mvneta.c|  8 ++--
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c  |  7 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c |  7 +--
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c |  7 +--
 drivers/net/ethernet/sfc/ethtool.c   |  7 +--
 include/linux/ethtool.h  |  4 ++--
 include/uapi/linux/ethtool.h |  8 +++-
 net/core/ethtool.c   | 27 
---
 25 files changed, 138 insertions(+), 55 deletions(-)

 ethtool-copy.h |  8 +++-
 ethtool.c  | 36 +++-
 2 files changed, 42 insertions(+), 2 deletions(-)

-- 
2.6.3.505.g5cc1fd1



Re: [PATCH net-next 1/7] tcp: apply Kern's check on RTTs used for congestion control

2016-02-02 Thread Yuchung Cheng
On Tue, Feb 2, 2016 at 11:30 AM, Kenneth Klette Jonassen
 wrote:
>
> On Sat, Oct 17, 2015 at 6:57 AM, Yuchung Cheng  wrote:
> > Currently ca_seq_rtt_us does not use Kern's check. Fix that by
> > checking if any packet acked is a retransmit, for both RTT used
> > for RTT estimation and congestion control.
> >
> *snip*
>
> This patch (commit 9e45a3e) puzzles me, because Karn's check was
> already in effect:
> http://lxr.free-electrons.com/source/net/ipv4/tcp_input.c?v=4.3#L3117
>
> Since first_ackt/last_ackt is only set when non-retransmitted packets
> are ACKed (sequentially), we know them to be unambiguous samples for
> RTTM. Even if a (sequential) ACK covers retransmitted packets, we can
> still make a valid RTTM if that ACK also covers non-retransmitted
> packets. But this patch seems to prevent that?
Perhaps the commit message is not clear. Here is an example: an ACK
acks 2 packets where the 1st packet was retransmitted but the 2nd
packet was not.

Since we don't know if the ACK was caused by the retransmission (plugs
a hole) or by the original, we should not take an RTT sample. In other
words, we should refrain from taking an RTT sample as long as the ACK
covers any retransmitted sequence.


[PATCH net-next v4 2/2] virtio_net: add ethtool support for set and get of settings

2016-02-02 Thread Nikolay Aleksandrov
From: Nikolay Aleksandrov 

This patch allows the user to set and retrieve speed and duplex of the
virtio_net device via ethtool. Having this functionality is very helpful
for simulating different environments and also enables the virtio_net
device to participate in operations where proper speed and duplex are
required (e.g. currently bonding lacp mode requires full duplex). Custom
speed and duplex are not allowed, the user-supplied settings are validated
before applying.

Example:
$ ethtool eth1
Settings for eth1:
...
Speed: Unknown!
Duplex: Unknown! (255)
$ ethtool -s eth1 speed 1000 duplex full
$ ethtool eth1
Settings for eth1:
...
Speed: 1000Mb/s
Duplex: Full

Based on a patch by Roopa Prabhu.

Signed-off-by: Nikolay Aleksandrov 
---
v2: use the new ethtool speed/duplex validation functions and allow half
duplex to be set
v3: return error if the user tries to change anything besides speed/duplex
as per Michael's comment
We have to zero-out advertising as it gets set automatically by ethtool if
setting speed and duplex together.
v4: Set port type to PORT_OTHER

 drivers/net/virtio_net.c | 59 
 1 file changed, 59 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 767ab11a6e9f..3acffc82960a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -146,6 +146,10 @@ struct virtnet_info {
virtio_net_ctrl_ack ctrl_status;
u8 ctrl_promisc;
u8 ctrl_allmulti;
+
+   /* Ethtool settings */
+   u8 duplex;
+   u32 speed;
 };
 
 struct padded_vnet_hdr {
@@ -1376,6 +1380,57 @@ static void virtnet_get_channels(struct net_device *dev,
channels->other_count = 0;
 }
 
+/* Check if the user is trying to change anything besides speed/duplex */
+static bool virtnet_validate_ethtool_cmd(const struct ethtool_cmd *cmd)
+{
+   struct ethtool_cmd diff1 = *cmd;
+   struct ethtool_cmd diff2 = {};
+
+   /* advertising and cmd are usually set */
+   ethtool_cmd_speed_set(&diff1, 0);
+   diff1.advertising = 0;
+   diff1.duplex = 0;
+   diff1.cmd = 0;
+
+   return !memcmp(&diff1, &diff2, sizeof(diff1));
+}
+
+static int virtnet_set_settings(struct net_device *dev, struct ethtool_cmd 
*cmd)
+{
+   struct virtnet_info *vi = netdev_priv(dev);
+   u32 speed;
+
+   speed = ethtool_cmd_speed(cmd);
+   /* don't allow custom speed and duplex */
+   if (!ethtool_validate_speed(speed) ||
+   !ethtool_validate_duplex(cmd->duplex) ||
+   !virtnet_validate_ethtool_cmd(cmd))
+   return -EINVAL;
+   vi->speed = speed;
+   vi->duplex = cmd->duplex;
+
+   return 0;
+}
+
+static int virtnet_get_settings(struct net_device *dev, struct ethtool_cmd 
*cmd)
+{
+   struct virtnet_info *vi = netdev_priv(dev);
+
+   ethtool_cmd_speed_set(cmd, vi->speed);
+   cmd->duplex = vi->duplex;
+   cmd->port = PORT_OTHER;
+
+   return 0;
+}
+
+static void virtnet_init_settings(struct net_device *dev)
+{
+   struct virtnet_info *vi = netdev_priv(dev);
+
+   vi->speed = SPEED_UNKNOWN;
+   vi->duplex = DUPLEX_UNKNOWN;
+}
+
 static const struct ethtool_ops virtnet_ethtool_ops = {
.get_drvinfo = virtnet_get_drvinfo,
.get_link = ethtool_op_get_link,
@@ -1383,6 +1438,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
.set_channels = virtnet_set_channels,
.get_channels = virtnet_get_channels,
.get_ts_info = ethtool_op_get_ts_info,
+   .get_settings = virtnet_get_settings,
+   .set_settings = virtnet_set_settings,
 };
 
 #define MIN_MTU 68
@@ -1855,6 +1912,8 @@ static int virtnet_probe(struct virtio_device *vdev)
netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
 
+   virtnet_init_settings(dev);
+
err = register_netdev(dev);
if (err) {
pr_debug("virtio_net: registering device failed\n");
-- 
2.4.3



Re: [net-next PATCH 07/11] net: introduce napi_alloc_skb_hint() for more use-cases

2016-02-02 Thread kbuild test robot
Hi Jesper,

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Jesper-Dangaard-Brouer/net-mitigating-kmem_cache-slowpath-and-BoF-discussion-patches/20160203-051706
reproduce: make htmldocs

All warnings (new ones prefixed by >>):

   include/linux/skbuff.h:922: warning: No description found for parameter 'sk'
>> net/core/skbuff.c:482: warning: No description found for parameter 
>> 'bulk_hint'
   net/core/gen_stats.c:155: warning: No description found for parameter 'cpu'
   net/core/gen_estimator.c:212: warning: No description found for parameter 
'cpu_bstats'
   net/core/gen_estimator.c:303: warning: No description found for parameter 
'cpu_bstats'
   net/core/dev.c:6450: warning: No description found for parameter 'len'
   include/linux/netdevice.h:1321: warning: Enum value 
'IFF_XMIT_DST_RELEASE_PERM' not described in enum 'netdev_priv_flags'
   include/linux/netdevice.h:1321: warning: Enum value 'IFF_IPVLAN_MASTER' not 
described in enum 'netdev_priv_flags'
   include/linux/netdevice.h:1321: warning: Enum value 'IFF_IPVLAN_SLAVE' not 
described in enum 'netdev_priv_flags'
   include/linux/netdevice.h:1826: warning: No description found for parameter 
'ptype_all'
   include/linux/netdevice.h:1826: warning: No description found for parameter 
'ptype_specific'

vim +/bulk_hint +482 net/core/skbuff.c

^1da177e Linus Torvalds 2005-04-16  466  
fd11a83d Alexander Duyck2014-12-09  467  /**
fd11a83d Alexander Duyck2014-12-09  468   * __napi_alloc_skb - 
allocate skbuff for rx in a specific NAPI instance
fd11a83d Alexander Duyck2014-12-09  469   * @napi: napi instance 
this buffer was allocated for
d7499160 Masanari Iida  2015-08-24  470   * @len: length to allocate
fd11a83d Alexander Duyck2014-12-09  471   * @gfp_mask: 
get_free_pages mask, passed to alloc_skb and alloc_pages
fd11a83d Alexander Duyck2014-12-09  472   *
fd11a83d Alexander Duyck2014-12-09  473   * Allocate a new sk_buff 
for use in NAPI receive.  This buffer will
fd11a83d Alexander Duyck2014-12-09  474   * attempt to allocate the 
head from a special reserved region used
fd11a83d Alexander Duyck2014-12-09  475   * only for NAPI Rx 
allocation.  By doing this we can save several
fd11a83d Alexander Duyck2014-12-09  476   * CPU cycles by avoiding 
having to disable and re-enable IRQs.
fd11a83d Alexander Duyck2014-12-09  477   *
fd11a83d Alexander Duyck2014-12-09  478   * %NULL is returned if 
there is no free memory.
fd11a83d Alexander Duyck2014-12-09  479   */
9451980a Alexander Duyck2015-05-06  480  struct sk_buff 
*__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
c24f01ac Jesper Dangaard Brouer 2016-02-02  481 
 unsigned int bulk_hint, gfp_t gfp_mask)
fd11a83d Alexander Duyck2014-12-09 @482  {
1ec46e92 Jesper Dangaard Brouer 2016-02-02  483 struct napi_alloc_cache 
*nc = this_cpu_ptr(&napi_alloc_cache);
c24f01ac Jesper Dangaard Brouer 2016-02-02  484 unsigned int bulk_sz = 
min(bulk_hint, NAPI_SKB_CACHE_SIZE);
fc755a89 Jesper Dangaard Brouer 2016-02-02  485 struct skb_shared_info 
*shinfo;
fd11a83d Alexander Duyck2014-12-09  486 struct sk_buff *skb;
9451980a Alexander Duyck2015-05-06  487 void *data;
fd11a83d Alexander Duyck2014-12-09  488  
9451980a Alexander Duyck2015-05-06  489 len += NET_SKB_PAD + 
NET_IP_ALIGN;
9451980a Alexander Duyck2015-05-06  490  

:: The code at line 482 was first introduced by commit
:: fd11a83dd3630ec6a60f8a702446532c5c7e1991 net: Pull out core bits of __netdev_alloc_skb and add __napi_alloc_skb

:: TO: Alexander Duyck 
:: CC: David S. Miller 
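
The flagged warning just means the added 'bulk_hint' parameter has no
kernel-doc entry yet. A minimal sketch of the missing line, assuming the
clamping behaviour visible at line 484 (the @bulk_hint wording below is
illustrative, not taken from the patch):

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @bulk_hint: hint for bulk-refilling the per-CPU skb cache, clamped to
 *	NAPI_SKB_CACHE_SIZE (assumed semantics)
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * %NULL is returned if there is no free memory.
 */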

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation




Re: Keystone 2 boards boot failure

2016-02-02 Thread Franklin Cooper
On 02/02/2016 03:26 PM, Arnd Bergmann wrote:
> On Tuesday 02 February 2016 15:01:33 Franklin S Cooper Jr. wrote:
>>
>> Yes. Here is a boot log on the latest master with the below
>> three patches reverted.
>> http://pastebin.com/W7RWSHpE (Working)
>>
>> I reverted these three patches. The two latest patches seem
>> to be trying to correct/expand upon the last patch on this list.
>>
>> commit 958d104e3d40eef5148c402887138f6594ff7e1e
>> netcp: fix regression in receive processing
>>
>> commit 9dd2d6c5c9755b160fe0111bcdad9491676feea8
>> netcp: add more __le32 annotations
>>
>> commit 899077791403ff7a2d8cfaa87bd1a82d729463e2
>> netcp: try to reduce type confusion in descriptors
>>
> 
> The middle patch should have no effect on generated code, so I'm ignoring
> that for now.
> 
> The next thing to rule out is an endianness bug. I assume you
> are running this with a little-endian kernel, correct? If
> you are running big-endian, the base assumption that the driver needs
> to swap the data was flawed and that portion needs to be redone.
> 
> If you are running little-endian 32-bit, please try the partial
> revert below, which just undoes the attempt to make it work with
> 64-bit kernels.

Keystone 2 devices are little-endian 32-bit devices.

This partial revert fixes the boot problem for me.
> 
>   Arnd
>   
> diff --git a/drivers/net/ethernet/ti/netcp_core.c 
> b/drivers/net/ethernet/ti/netcp_core.c
> index c61d66d38634..7e291c04a81a 100644
> --- a/drivers/net/ethernet/ti/netcp_core.c
> +++ b/drivers/net/ethernet/ti/netcp_core.c
> @@ -117,20 +117,10 @@ static void get_pkt_info(dma_addr_t *buff, u32 
> *buff_len, dma_addr_t *ndesc,
>   *ndesc = le32_to_cpu(desc->next_desc);
>  }
>  
> -static void get_pad_info(u32 *pad0, u32 *pad1, u32 *pad2, struct 
> knav_dma_desc *desc)
> +static void get_pad_info(u32 *pad0, u32 *pad1, struct knav_dma_desc *desc)
>  {
>   *pad0 = le32_to_cpu(desc->pad[0]);
>   *pad1 = le32_to_cpu(desc->pad[1]);
> - *pad2 = le32_to_cpu(desc->pad[2]);
> -}
> -
> -static void get_pad_ptr(void **padptr, struct knav_dma_desc *desc)
> -{
> - u64 pad64;
> -
> - pad64 = le32_to_cpu(desc->pad[0]) +
> - ((u64)le32_to_cpu(desc->pad[1]) << 32);
> - *padptr = (void *)(uintptr_t)pad64;
>  }
>  
>  static void get_org_pkt_info(dma_addr_t *buff, u32 *buff_len,
> @@ -163,11 +153,10 @@ static void set_desc_info(u32 desc_info, u32 pkt_info,
>   desc->packet_info = cpu_to_le32(pkt_info);
>  }
>  
> -static void set_pad_info(u32 pad0, u32 pad1, u32 pad2, struct knav_dma_desc 
> *desc)
> +static void set_pad_info(u32 pad0, u32 pad1, struct knav_dma_desc *desc)
>  {
>   desc->pad[0] = cpu_to_le32(pad0);
>   desc->pad[1] = cpu_to_le32(pad1);
> - desc->pad[2] = cpu_to_le32(pad1);
>  }
>  
>  static void set_org_pkt_info(dma_addr_t buff, u32 buff_len,
> @@ -581,7 +570,6 @@ static void netcp_free_rx_desc_chain(struct netcp_intf 
> *netcp,
>   dma_addr_t dma_desc, dma_buf;
>   unsigned int buf_len, dma_sz = sizeof(*ndesc);
>   void *buf_ptr;
> - u32 pad[2];
>   u32 tmp;
>  
> get_words(&dma_desc, 1, &desc->next_desc);
> @@ -593,15 +581,13 @@ static void netcp_free_rx_desc_chain(struct netcp_intf 
> *netcp,
>   break;
>   }
>   get_pkt_info(&dma_buf, &tmp, &dma_desc, ndesc);
> - get_pad_ptr(&buf_ptr, ndesc);
> + get_pad_info((u32 *)&buf_ptr, &tmp, ndesc);
>   dma_unmap_page(netcp->dev, dma_buf, PAGE_SIZE, DMA_FROM_DEVICE);
>   __free_page(buf_ptr);
>   knav_pool_desc_put(netcp->rx_pool, desc);
>   }
>  
> - get_pad_info(&pad[0], &pad[1], &buf_len, desc);
> - buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32));
> -
> + get_pad_info((u32 *)&buf_ptr, &buf_len, desc);
>   if (buf_ptr)
>   netcp_frag_free(buf_len <= PAGE_SIZE, buf_ptr);
>   knav_pool_desc_put(netcp->rx_pool, desc);
> @@ -639,8 +625,8 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
> *netcp)
>   dma_addr_t dma_desc, dma_buff;
>   struct netcp_packet p_info;
>   struct sk_buff *skb;
> - u32 pad[2];
>   void *org_buf_ptr;
> + u32 tmp;
>  
>   dma_desc = knav_queue_pop(netcp->rx_queue, &dma_sz);
>   if (!dma_desc)
> @@ -653,8 +639,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
> *netcp)
>   }
>  
>   get_pkt_info(&dma_buff, &buff_len, &dma_desc, desc);
> - get_pad_info(&pad[0], &pad[1], &org_buf_len, desc);
> - org_buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32));
> + get_pad_info((u32 *)&org_buf_ptr, &org_buf_len, desc);
>  
>   if (unlikely(!org_buf_ptr)) {
>   dev_err(netcp->ndev_dev, "NULL bufptr in desc\n");
> @@ -679,7 +664,6 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
> *netcp)
>   /* Fill in the page fragment list */
>   while (dma_desc) {
>   struct page *page;
> - void *ptr;
>  
>   ndesc = knav_pool_desc_unmap(netcp->rx_pool, dma_desc, 
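
The crux of this discussion is how the driver smuggles a host pointer
through the descriptor's two 32-bit pad words. A standalone sketch of
that packing, assuming the layout in the patch above; it deliberately
omits the cpu_to_le32()/le32_to_cpu() byte-swapping the real driver does:

#include <stdint.h>
#include <stdio.h>

struct pad_words {		/* stand-in for knav_dma_desc's pads */
	uint32_t pad[3];
};

/* split a host pointer across two 32-bit pad words */
static void set_pad_ptr(struct pad_words *d, void *ptr)
{
	uint64_t v = (uintptr_t)ptr;

	d->pad[0] = (uint32_t)v;		/* low 32 bits */
	d->pad[1] = (uint32_t)(v >> 32);	/* high bits; 0 on 32-bit */
}

/* recover it, mirroring what the removed get_pad_ptr() did */
static void *get_pad_ptr(struct pad_words *d)
{
	uint64_t v = d->pad[0] | ((uint64_t)d->pad[1] << 32);

	return (void *)(uintptr_t)v;
}

int main(void)
{
	struct pad_words d;
	int x = 42;

	set_pad_ptr(&d, &x);
	printf("roundtrip ok: %d\n", get_pad_ptr(&d) == (void *)&x);
	return 0;
}

On a 32-bit kernel the high word is always zero, so the two-pad scheme
degenerates to the old single-word pointer.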

Re: Keystone 2 boards boot failure

2016-02-02 Thread Arnd Bergmann
On Tuesday 02 February 2016 16:59:34 Franklin Cooper wrote:
> On 02/02/2016 03:26 PM, Arnd Bergmann wrote:
> > On Tuesday 02 February 2016 15:01:33 Franklin S Cooper Jr. wrote:
> >>
> >> Yes. Here is a boot log on the latest master with the below
> >> three patches reverted.
> >> http://pastebin.com/W7RWSHpE (Working)
> >>
> >> I reverted these three patches. The two latest patches seem
> >> to be trying to correct/expand upon the last patch on this list.
> >>
> >> commit 958d104e3d40eef5148c402887138f6594ff7e1e
> >> netcp: fix regression in receive processing
> >>
> >> commit 9dd2d6c5c9755b160fe0111bcdad9491676feea8
> >> netcp: add more __le32 annotations
> >>
> >> commit 899077791403ff7a2d8cfaa87bd1a82d729463e2
> >> netcp: try to reduce type confusion in descriptors
> >>
> > 
> > The middle patch should have no effect on generated code, so I'm ignoring
> > that for now.
> > 
> > The next thing to rule out is an endianness bug. I assume you
> > are running this with a little-endian kernel, correct? If
> > you are running big-endian, the base assumption that the driver needs
> > to swap the data was flawed and that portion needs to be redone.
> > 
> > If you are running little-endian 32-bit, please try the partial
> > revert below, which just undoes the attempt to make it work with
> > 64-bit kernels.
> 
> Keystone 2 devices are little-endian 32-bit devices.

I meant the kernel you are running on it, not the hardware.
You should always be able to run both a big-endian kernel and
a little-endian kernel on any ARMv7 machine, and a couple of
platforms use 64-bit physical addresses even on 32-bit machines
(with the normal 32-bit instruction set).

I wasn't completely sure if there are already keystone-derived
products with 64-bit CPU cores, but I guess the driver would
fail really badly on those (with or without the patch).

> This partial revert fixes the boot problem for me.

Ok.


I tried to create a smaller version and stumbled over
a typo, maybe that's the whole problem. Can you try this one:

diff --git a/drivers/net/ethernet/ti/netcp_core.c 
b/drivers/net/ethernet/ti/netcp_core.c
index c61d66d38634..8490804416dd 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -167,7 +167,7 @@ static void set_pad_info(u32 pad0, u32 pad1, u32 pad2, 
struct knav_dma_desc *des
 {
desc->pad[0] = cpu_to_le32(pad0);
desc->pad[1] = cpu_to_le32(pad1);
-   desc->pad[2] = cpu_to_le32(pad1);
+   desc->pad[2] = cpu_to_le32(pad2);
 }
 
 static void set_org_pkt_info(dma_addr_t buff, u32 buff_len,


Arnd


Re: [net-next PATCH 04/11] net: bulk alloc and reuse of SKBs in NAPI context

2016-02-02 Thread Alexei Starovoitov
On Tue, Feb 02, 2016 at 10:12:01PM +0100, Jesper Dangaard Brouer wrote:
> Think twice before applying
>  - This patch can potentially introduce added latency in some workloads
> 
> This patch introduce bulk alloc of SKBs and allow reuse of SKBs
> free'ed in same softirq cycle.  SKBs are normally free'ed during TX
> completion, but most high speed drivers also cleanup TX ring during
> NAPI RX poll cycle.  Thus, if using napi_consume_skb/__kfree_skb_defer,
> SKBs will be avail in the napi_alloc_cache->skb_cache.
> 
> If no SKBs are avail for reuse, then only bulk alloc 8 SKBs, to limit
> the potential overshoot of unused SKBs that need to be free'd when the
> NAPI cycle ends (flushed in net_rx_action via __kfree_skb_flush()).
> 
> Benchmarking IPv4-forwarding, on CPU i7-4790K @4.2GHz (no turbo boost)
> (GCC version 5.1.1 20150618 (Red Hat 5.1.1-4))
>  Allocator SLUB:
>   Single CPU/flow numbers: before: 2064446 pps -> after: 2083031 pps
>   Improvement: +18585 pps, -4.3 nanosec, +0.9%
>  Allocator SLAB:
>   Single CPU/flow numbers: before: 2035949 pps -> after: 2033567 pps
>   Regression: -2382 pps, +0.57 nanosec, -0.1 %
> 
> Even though benchmarking does show an improvement for SLUB (+0.9%), I'm
> not convinced bulk alloc will be a win in all situations:
>  * I see stalls on walking the SLUB freelist (normally hidden by prefetch)
>  * In case RX queue is not full, alloc and free more SKBs than needed
> 
> More testing is needed with more real life benchmarks.
> 
> Joint work with Alexander Duyck.
> 
> Signed-off-by: Jesper Dangaard Brouer 
> Signed-off-by: Alexander Duyck 
...
> - skb = __build_skb(data, len);
> - if (unlikely(!skb)) {
> +#define BULK_ALLOC_SIZE 8
> + if (!nc->skb_count) {
> + nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
> +   gfp_mask, BULK_ALLOC_SIZE,
> +   nc->skb_cache);
> + }
> + if (likely(nc->skb_count)) {
> + skb = (struct sk_buff *)nc->skb_cache[--nc->skb_count];
> + } else {
> + /* alloc bulk failed */
>   skb_free_frag(data);
>   return NULL;
>   }
>  
> + len -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +
> + memset(skb, 0, offsetof(struct sk_buff, tail));
> + skb->truesize = SKB_TRUESIZE(len);
> + atomic_set(&skb->users, 1);
> + skb->head = data;
> + skb->data = data;
> + skb_reset_tail_pointer(skb);
> + skb->end = skb->tail + len;
> + skb->mac_header = (typeof(skb->mac_header))~0U;
> + skb->transport_header = (typeof(skb->transport_header))~0U;
> +
> + /* make sure we initialize shinfo sequentially */
> + shinfo = skb_shinfo(skb);
> + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
> + atomic_set(&shinfo->dataref, 1);
> + kmemcheck_annotate_variable(shinfo->destructor_arg);

copy-pasting from __build_skb()...
Either new helper is needed or extend __build_skb() to take
pre-allocated 'raw_skb' pointer.
This interface is questionable until patch 7 comes to use it.
Would have helped if they were back to back.

Overall I like the first 3 patches. I think they're useful
on their own.
As far as bulk alloc... have you considered splitting
bulk alloc of skb and init of skb?
Like in the above
+   skb = (struct sk_buff *)nc->skb_cache[--nc->skb_count];
will give a cold pointer and the first memset() will miss the cache.
Either prefetch is needed the way slab_alloc_node() is doing
in the line prefetch_freepointer(s, next_object);
or bulk_alloc_skb and bulk_init_skb need to be two loops
driven by drivers.
Another idea is we can move skb_init all the way up till
eth_type_trans() and the driver should prefetch both
skb->data and skb pointers. Then eth_type_trans_and_skb_init()
helper will read from cache and store into cache.
Rephrasing the idea:
when the drivers do napi_alloc_skb() they don't really
need initialized 'struct sk_buff'. They either need skb->data
to copy headers into or shinfo->frags to add a page to,
the full init can wait till eth_type_trans_and_init()
right before napi_gro_receive().
Thoughts?
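
For reference, a sketch of the two-loop split suggested here; the helper
name (bulk_alloc_skb) is illustrative, not existing kernel API, and the
fragment assumes the napi_alloc_cache/BULK_ALLOC_SIZE definitions from
the patch:

/* refill the per-CPU cache in bulk, prefetch the next object, and
 * hand out an skb whose line is already warm; full init is left to
 * the consumer (e.g. an eth_type_trans()-time helper as proposed)
 */
static struct sk_buff *bulk_alloc_skb(struct napi_alloc_cache *nc,
				      gfp_t gfp_mask)
{
	if (!nc->skb_count)
		nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
						      gfp_mask,
						      BULK_ALLOC_SIZE,
						      nc->skb_cache);
	if (unlikely(!nc->skb_count))
		return NULL;

	/* warm the cache line of the object handed out on the next
	 * call, analogous to slab's prefetch_freepointer()
	 */
	if (nc->skb_count > 1)
		prefetch(nc->skb_cache[nc->skb_count - 2]);

	return (struct sk_buff *)nc->skb_cache[--nc->skb_count];
}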



[PATCH iproute2] ipmonitor: match user option 'all' before 'all-nsid'

2016-02-02 Thread Roopa Prabhu
From: Roopa Prabhu 

'ip monitor all' is broken on older kernels.
This patch fixes 'ip monitor all' to match
'all' and not 'all-nsid'.

It moves parsing arg 'all-nsid' to after parsing
'all'.

Before:
$ip monitor all
NETLINK_LISTEN_ALL_NSID: Protocol not available

After:
$ip monitor all
[NEIGH]Deleted 10.0.0.1 dev eth1 lladdr c4:54:44:4f:b2:dd STALE

Fixes: 449b824ad196 ("ipmonitor: allows to monitor in several netns")
Signed-off-by: Roopa Prabhu 
---
 ip/ipmonitor.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c
index 99a237f..7aeccd2 100644
--- a/ip/ipmonitor.c
+++ b/ip/ipmonitor.c
@@ -186,8 +186,6 @@ int do_ipmonitor(int argc, char **argv)
file = *argv;
} else if (matches(*argv, "label") == 0) {
prefix_banner = 1;
-   } else if (matches(*argv, "all-nsid") == 0) {
-   listen_all_nsid = 1;
} else if (matches(*argv, "link") == 0) {
llink=1;
groups = 0;
@@ -217,6 +215,8 @@ int do_ipmonitor(int argc, char **argv)
groups = 0;
} else if (strcmp(*argv, "all") == 0) {
prefix_banner=1;
+   } else if (matches(*argv, "all-nsid") == 0) {
+   listen_all_nsid = 1;
} else if (matches(*argv, "help") == 0) {
usage();
} else if (strcmp(*argv, "dev") == 0) {
-- 
1.9.1
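
For readers unfamiliar with iproute2's option parsing: matches() accepts
any abbreviation of a keyword, so the user's "all" also matched
"all-nsid" while that branch was tested first. A self-contained sketch
of that semantic (simplified from iproute2's matches(); 0 means "match"):

#include <stdio.h>
#include <string.h>

static int matches(const char *arg, const char *keyword)
{
	size_t len = strlen(arg);

	if (len > strlen(keyword))
		return -1;
	return strncmp(keyword, arg, len);
}

int main(void)
{
	/* 0: "all" abbreviates "all-nsid" -- the old bug */
	printf("matches(\"all\", \"all-nsid\") = %d\n",
	       matches("all", "all-nsid"));
	/* the patch relies on the exact strcmp() test for "all"
	 * running before the matches() test for "all-nsid" */
	printf("strcmp(\"all\", \"all\") = %d\n", strcmp("all", "all"));
	return 0;
}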



Re: [PATCH net-next 1/4] net: add event_list to struct net and provide utility functions

2016-02-02 Thread Salam Noureddine
On Tue, Feb 2, 2016 at 12:01 PM, Julian Anastasov  wrote:

>> +#ifdef CONFIG_NET_NS
>> +static inline void net_add_event_list(struct list_head *head, struct net 
>> *net)
>> +{
>> + if (!list_empty(&net->event_list))
>
> Above check looks inverted, it works may be
> because INIT_LIST_HEAD(>event_list) is missing.
>
>> + list_add_tail(&net->event_list, head);
>> +}
>> +

Thanks for catching this! I ran my benchmark again with the corrected check
and I still get the same benefits with respect to rtnl_lock hold time.

Salam


Fw: [Bug 111771] New: deadlock in ppp/l2tp

2016-02-02 Thread Stephen Hemminger
Please excuse URL mangling, my bugzilla address appears to route through
stupid corporate firewall.

Begin forwarded message:

Date: Tue, 2 Feb 2016 18:38:41 +
From: "bugzilla-dae...@bugzilla.kernel.org" 

To: "shemmin...@linux-foundation.org" 
Subject: [Bug 111771] New: deadlock in ppp/l2tp


https://urldefense.proofpoint.com/v2/url?u=https-3A__bugzilla.kernel.org_show-5Fbug.cgi-3Fid-3D111771=CwICaQ=IL_XqQWOjubgfqINi2jTzg=q_lvUiVm1uM6QEw9TPH-6jiV__hsrE6xXUAtATPE9x0=QRVzJYt9nD-EOW0XdrPpw2-kYmZu0sg62aaPeiiLI_Q=l3HC8fgAgyPVwSgMaX2Hjr8GL3P5j2fL1kDXhEW-v9w=
 

Bug ID: 111771
   Summary: deadlock in ppp/l2tp
   Product: Networking
   Version: 2.5
Kernel Version: 4.3.2
  Hardware: x86-64
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: Other
  Assignee: shemmin...@linux-foundation.org
  Reporter: sor...@gmail.com
Regression: No

Created attachment 202771
  --> 
https://urldefense.proofpoint.com/v2/url?u=https-3A__bugzilla.kernel.org_attachment.cgi-3Fid-3D202771-26action-3Dedit=CwICaQ=IL_XqQWOjubgfqINi2jTzg=q_lvUiVm1uM6QEw9TPH-6jiV__hsrE6xXUAtATPE9x0=QRVzJYt9nD-EOW0XdrPpw2-kYmZu0sg62aaPeiiLI_Q=2a_KCXgtoQ6NLxDon6_3flSUMpb7Tjj8WhGPZ09E8Vo=
 
kernel.log, config, cpuinfo

I'm getting a deadlock and the computer is unresponsive shortly after setting
up a xl2tpd/ipsec vpn connection. The deadlock occurs every time, it is very
reproducible on my computer.

It is the mainline kernel, version 4.3.2.

The processor is given in a copy /proc/cpuinfo that I've attached.

uname:
Linux version 4.3.2 (gcc version 5.3.1 20160121 (Debian 5.3.1-7) ) #5 SMP
PREEMPT Sat Jan 30 00:05:40 CET 2016

ppp is from the debian package of their unstable distribution:
ppp_2.4.7-1+2_amd64

I've attached the kernel config file and the kernel log.

-- 
You are receiving this mail because:
You are the assignee for the bug.


Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Hannes Frederic Sowa

On 03.02.2016 01:57, Hannes Frederic Sowa wrote:

On 02.02.2016 23:11, Linus Torvalds wrote:

But I'm OK with that patch as is if you prefer it that way (maybe you
want
to use the cred to then test for root separately etc, or maybe there
already was some use of cred as cred that I just missed when reading the
patch on my phone..)


I don't see any reason to switch over to struct user_struct. I tested a
patch and will send it out soon.


I meant that there is no reason not to switch over to struct 
user_struct. Patch is sent out.


Thanks for looking into this!

Hannes



Re: Keystone 2 boards boot failure

2016-02-02 Thread Franklin S Cooper Jr.


On 02/02/2016 05:26 PM, Arnd Bergmann wrote:
> On Tuesday 02 February 2016 16:59:34 Franklin Cooper wrote:
>> On 02/02/2016 03:26 PM, Arnd Bergmann wrote:
>>> On Tuesday 02 February 2016 15:01:33 Franklin S Cooper Jr. wrote:
 Yes. Here is a boot log on the latest master with the below
 three patches reverted.
 http://pastebin.com/W7RWSHpE (Working)

 I reverted these three patches. The two latest patches seem
 to be trying to correct/expand upon the last patch on this list.

 commit 958d104e3d40eef5148c402887138f6594ff7e1e
 netcp: fix regression in receive processing

 commit 9dd2d6c5c9755b160fe0111bcdad9491676feea8
 netcp: add more __le32 annotations

 commit 899077791403ff7a2d8cfaa87bd1a82d729463e2
 netcp: try to reduce type confusion in descriptors

>>> The middle patch should have no effect on generated code, so I'm ignoring
>>> that for now.
>>>
>>> The next thing to rule out is an endianness bug. I assume you
>>> are running this with a little-endian kernel, correct? If
>>> you are running big-endian, the base assumption that the driver needs
>>> to swap the data was flawed and that portion needs to be redone.
>>>
>>> If you are running little-endian 32-bit, please try the partial
>>> revert below, which just undoes the attempt to make it work with
>>> 64-bit kernels.
>> Keystone 2 devices are little-endian 32-bit devices.
> I meant the kernel you are running on it, not the hardware.
> You should always be able to run both a big-endian kernel and
> a little-endian kernel on any ARMv7 machine, and a couple of
> platforms use 64-bit physical addresses even on 32-bit machines
> (with the normal 32-bit instruction set).

I'm not sure if Keystone 2 devices support this or if we
have support for this. I'll have to double check.
>
> I wasn't completely sure if there are already keystone-derived
> products with 64-bit CPU cores, but I guess the driver would
> fail really badly on those (with or without the patch).
>
>> This partial revert fixes the boot problem for me.
> Ok.
>
>
> I tried to create a smaller version and stumbled over
> a typo, maybe that's the whole problem. Can you try this one:
>
> diff --git a/drivers/net/ethernet/ti/netcp_core.c 
> b/drivers/net/ethernet/ti/netcp_core.c
> index c61d66d38634..8490804416dd 100644
> --- a/drivers/net/ethernet/ti/netcp_core.c
> +++ b/drivers/net/ethernet/ti/netcp_core.c
> @@ -167,7 +167,7 @@ static void set_pad_info(u32 pad0, u32 pad1, u32 pad2, 
> struct knav_dma_desc *des
>  {
>   desc->pad[0] = cpu_to_le32(pad0);
>   desc->pad[1] = cpu_to_le32(pad1);
> - desc->pad[2] = cpu_to_le32(pad1);
> + desc->pad[2] = cpu_to_le32(pad2);
>  }
>  
>  static void set_org_pkt_info(dma_addr_t buff, u32 buff_len,
>
>
>   Arnd

So only making this change on the latest master with no
other changes I see the boot problem again.


bnx2x commits needed to use 7.51.10 firmware?

2016-02-02 Thread Dan Streetman
Hi Ariel,

I'm trying to update the bnx2x driver in Ubuntu trusty (3.13 kernel)
release to use the 7.51.10 firmware; can you help me determine which
commits need to be backported?

Some reference is in Launchpad bug 1454286:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1454286

basically, there are 87 commits between the current bnx2x driver level
in Ubuntu trusty's kernel, and commit e42780b where the bnx2x driver
is actually updated to use the 7.51.10 firmware.  Can you provide
guidance to which commits should be pulled back?  Do all 87 of them
need to be included?

Thanks!


Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Hannes Frederic Sowa

On 02.02.2016 23:11, Linus Torvalds wrote:

[ sorry for the html mail, I'm out grocery shopping ]

On Feb 2, 2016 13:55, "Hannes Frederic Sowa" 
wrote:


I slightly tested the attached patch.


Looks fine. I do wonder: if the only thing we use that "struct cred" for is
to do that ->user lookup, maybe we should just use "struct user_struct"
directly, and skip the cred entirely.

Something like

 fp->user = get_uid(current_user());

and then

 put_uid(fp->user);

But I'm OK with that patch as is if you prefer it that way (maybe you want
to use the cred to then test for root separately etc, or maybe there
already was some use of cred as cred that I just missed when reading the
patch on my phone..)


I don't see any reason to switch over to struct user_struct. I tested a 
patch and will send it out soon.


Bye,
Hannes




[PATCH net] unix: correctly track in-flight fds in sending process user_struct

2016-02-02 Thread Hannes Frederic Sowa
The commit referenced in the Fixes tag incorrectly accounted the number
of in-flight fds over a unix domain socket to the original opener
of the file-descriptor. This allows another process to arbitrarily
deplete the original file-opener's resource limit for the maximum
number of open files. Instead the sending process and its struct cred
should be charged.

To do so, we add a reference counted struct user_struct pointer to the
scm_fp_list and use it to account for the number of inflight unix fds.

Fixes: 712f4aad406bb1 ("unix: properly account for FDs passed over unix 
sockets")
Reported-by: David Herrmann 
Cc: David Herrmann 
Cc: Willy Tarreau 
Cc: Linus Torvalds 
Suggested-by: Linus Torvalds 
Signed-off-by: Hannes Frederic Sowa 
---
 include/net/af_unix.h | 4 ++--
 include/net/scm.h | 1 +
 net/core/scm.c| 7 +++
 net/unix/af_unix.c| 4 ++--
 net/unix/garbage.c| 8 
 5 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 2a91a0561a4783..9b4c418bebd84a 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -6,8 +6,8 @@
 #include <linux/mutex.h>
 #include <net/sock.h>
 
-void unix_inflight(struct file *fp);
-void unix_notinflight(struct file *fp);
+void unix_inflight(struct user_struct *user, struct file *fp);
+void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_gc(void);
 void wait_for_unix_gc(void);
 struct sock *unix_get_socket(struct file *filp);
diff --git a/include/net/scm.h b/include/net/scm.h
index 262532d111f51e..59fa93c01d2a16 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -21,6 +21,7 @@ struct scm_creds {
 struct scm_fp_list {
short   count;
short   max;
+   struct user_struct  *user;
struct file *fp[SCM_MAX_FD];
 };
 
diff --git a/net/core/scm.c b/net/core/scm.c
index 14596fb3717270..2696aefdc14888 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct 
scm_fp_list **fplp)
*fplp = fpl;
fpl->count = 0;
fpl->max = SCM_MAX_FD;
+   fpl->user = NULL;
}
fpp = &fpl->fp[fpl->count];
 
@@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct 
scm_fp_list **fplp)
*fpp++ = file;
fpl->count++;
}
+
+   if (!fpl->user)
+   fpl->user = get_uid(current_user());
+
return num;
 }
 
@@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm)
scm->fp = NULL;
for (i=fpl->count-1; i>=0; i--)
fput(fpl->fp[i]);
+   free_uid(fpl->user);
kfree(fpl);
}
 }
@@ -336,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
+   new_fpl->user = get_uid(fpl->user);
}
return new_fpl;
 }
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 49d5093eb0553a..29be035f9c6502 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, 
struct sk_buff *skb)
UNIXCB(skb).fp = NULL;
 
for (i = scm->fp->count-1; i >= 0; i--)
-   unix_notinflight(scm->fp->fp[i]);
+   unix_notinflight(scm->fp->user, scm->fp->fp[i]);
 }
 
 static void unix_destruct_scm(struct sk_buff *skb)
@@ -1561,7 +1561,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct 
sk_buff *skb)
return -ENOMEM;
 
for (i = scm->fp->count - 1; i >= 0; i--)
-   unix_inflight(scm->fp->fp[i]);
+   unix_inflight(scm->fp->user, scm->fp->fp[i]);
return max_level;
 }
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 8fcdc2283af50c..6a0d48525fcf9a 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -116,7 +116,7 @@ struct sock *unix_get_socket(struct file *filp)
  * descriptor if it is for an AF_UNIX socket.
  */
 
-void unix_inflight(struct file *fp)
+void unix_inflight(struct user_struct *user, struct file *fp)
 {
struct sock *s = unix_get_socket(fp);
 
@@ -133,11 +133,11 @@ void unix_inflight(struct file *fp)
}
unix_tot_inflight++;
}
-   fp->f_cred->user->unix_inflight++;
+   user->unix_inflight++;
spin_unlock(&unix_gc_lock);
 }
 
-void unix_notinflight(struct file *fp)
+void unix_notinflight(struct user_struct *user, struct file *fp)
 {
struct sock *s = unix_get_socket(fp);
 
@@ -152,7 +152,7 @@ void unix_notinflight(struct file *fp)
list_del_init(&u->link);
unix_tot_inflight--;
  

[PATCH net] ipv6: fix a lockdep splat

2016-02-02 Thread Eric Dumazet
From: Eric Dumazet 

Silence lockdep false positive about rcu_dereference() being
used in the wrong context.

First one should use rcu_dereference_protected() as we own the spinlock.

Second one should be a normal assignation, as no barrier is needed.

Fixes: 18367681a10bd ("ipv6 flowlabel: Convert np->ipv6_fl_list to RCU.")
Reported-by: Dave Jones 
Signed-off-by: Eric Dumazet 
---
 net/ipv6/ip6_flowlabel.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 1f9ebe3cbb4a..dc2db4f7b182 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
 		}
 		spin_lock_bh(&ip6_sk_fl_lock);
 		for (sflp = &np->ipv6_fl_list;
-		     (sfl = rcu_dereference(*sflp)) != NULL;
+		     (sfl = rcu_dereference_protected(*sflp,
+						      lockdep_is_held(&ip6_sk_fl_lock))) != NULL;
 		     sflp = &sfl->next) {
 			if (sfl->fl->label == freq.flr_label) {
 				if (freq.flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK))
 					np->flow_label &= ~IPV6_FLOWLABEL_MASK;
-				*sflp = rcu_dereference(sfl->next);
+				*sflp = sfl->next;
 				spin_unlock_bh(&ip6_sk_fl_lock);
 				fl_release(sfl->fl);
 				kfree_rcu(sfl, rcu);
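
The same pattern in isolation -- a schematic fragment, not the kernel
source: under the update-side lock, rcu_dereference_protected() with a
lockdep expression both silences the splat and documents the locking,
and the unlink can be a plain store because concurrent updaters are
excluded; only lock-free readers need the rcu_dereference() barrier.

/* writer, serialized by the spinlock */
spin_lock_bh(&list_lock);
p = rcu_dereference_protected(*pp, lockdep_is_held(&list_lock));
*pp = p->next;			/* plain store is enough here */
spin_unlock_bh(&list_lock);

/* reader, no lock held */
rcu_read_lock();
for (p = rcu_dereference(head); p; p = rcu_dereference(p->next))
	/* use p */;
rcu_read_unlock();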




Re: [PATCH net] net: Allow flow dissector to handle non 4-byte aligned headers

2016-02-02 Thread Sowmini Varadhan
On (02/01/16 19:56), Alexander Duyck wrote:
> > @@ -394,7 +407,7 @@ ip_proto_again:
> >data, hlen, &_eth);
> > if (!eth)
> > goto out_bad;
> > -   proto = eth->h_proto;
> > +   proto = get_unaligned_be16(&eth->h_proto);
> > nhoff += sizeof(*eth);
> > }
> 
> This piece doesn't make any sense to me.  It is already only 2 bytes
> wide.  I'm not sure why we should be seeing this trigger an unaligned
> access.  Are you sure it wasn't something like the keyid causing the
> issue?  I'd be interested in seeing what the compiler did here that it
> is triggering the problem.

You're right- I was getting blinded by all the unaligned-access
messages swimming by and making a mistake. It was actually the 

memcpy(&key_addrs->v4addrs, &iph->saddr,
       sizeof(key_addrs->v4addrs));

The assembler code is this:

   0x8d3298 <__skb_flow_dissect+500>:   ld  [ %l5 + 0xc ], %g3
   0x8d329c <__skb_flow_dissect+504>:   add  %i2, %g1, %g2
   0x8d32a0 <__skb_flow_dissect+508>:   st  %g3, [ %i2 + %g1 ]
   0x8d32a4 <__skb_flow_dissect+512>:   ld  [ %l5 + 0x10 ], %g1
   0x8d32a8 <__skb_flow_dissect+516>:   st  %g1, [ %g2 + 4 ]
   0x8d32ac <__skb_flow_dissect+520>:   mov  2, %g1

I get unaligned access traps at __skb_flow_dissect+500 and
__skb_flow_dissect+512 (corresponding to saddr and daddr), once for
each interface (gretap/eth0 and eth1). 

--Sowmini
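
A standalone approximation of the failure mode, assuming the usual
14-byte Ethernet header that leaves IP fields 2-byte aligned: give the
compiler a pointer type that promises alignment and it may emit word
loads (the SPARC 'ld' above) that trap; keep the source typed as bytes
and the copy stays safe.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t frame[64] = {0};
	const uint8_t *iph = frame + 14;	/* 2 mod 4: odd for u32 */
	uint32_t saddr;

	/* byte-typed source: the compiler must not assume 4-byte
	 * alignment, so this memcpy() is safe on strict-alignment
	 * CPUs; copying from a struct iphdr * (alignment 4) is what
	 * allowed the aligned loads that trapped */
	memcpy(&saddr, iph + 12, sizeof(saddr));
	printf("saddr=%u\n", saddr);
	return 0;
}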






[PATCH net-next 0/2] IPv6 NFC

2016-02-02 Thread Edward Cree
This series adds support for steering IPv6 flows using the ethtool NFC
 interface, and implements it for sfc devices.
Tested using an in-development patch to the ethtool utility.

Edward Cree (2):
  ethtool: add IPv6 to the NFC API
  sfc: implement IPv6 NFC (and IPV4_USER_FLOW)

 drivers/net/ethernet/sfc/ethtool.c | 176 +
 include/uapi/linux/ethtool.h   |  70 +--
 2 files changed, 240 insertions(+), 6 deletions(-)



Re: [PATCH net-next 1/7] tcp: apply Kern's check on RTTs used for congestion control

2016-02-02 Thread Kenneth Klette Jonassen
On Sat, Oct 17, 2015 at 6:57 AM, Yuchung Cheng  wrote:
> Currently ca_seq_rtt_us does not use Kern's check. Fix that by
> checking if any packet acked is a retransmit, for both RTT used
> for RTT estimation and congestion control.
>
*snip*

This patch (commit 9e45a3e) puzzles me, because Karn's check was
already in effect:
http://lxr.free-electrons.com/source/net/ipv4/tcp_input.c?v=4.3#L3117

Since first_ackt/last_ackt is only set when non-retransmitted packets
are ACKed (sequentially), we know them to be unambiguous samples for
RTTM. Even if a (sequential) ACK covers retransmitted packets, we can
still make a valid RTTM if that ACK also covers non-retransmitted
packets. But this patch seems to prevent that?


Re: [PATCH v2] unix: properly account for FDs passed over unix sockets

2016-02-02 Thread Linus Torvalds
On Tue, Feb 2, 2016 at 10:29 AM, Hannes Frederic Sowa
 wrote:
>>
>> Anyway, can someone provide a high-level description of what exactly
>> this patch is supposed to do? Which operation should be limited, who
>> should inflight FDs be accounted on, and which rlimit should be used
>> on each operation? I'm having a hard time auditing existing
>> user-space, given just the scarce description of this commit.
>
> Yes, all your observations are true. I think we need to explicitly
> refer to the sending socket while attaching the fds.

I don't think that really helps. Maybe somebody passed a unix domain
socket around, and now we're crediting the wrong socket again.

So how about we actually add a "struct cred *" to the scm_cookie
itself, and we initialize it to "get_current_cred()". And then always
use that.

That way it's always the person who actually does the send (rather
than the opener of the socket _or_ the opener of the file that gets
passed around) that gets credited, and thanks to the cred pointer we
can then de-credit them properly.

Hmm?

Linus
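
A hypothetical shape of that suggestion -- illustrative only, not merged
code, and the structure and field names are invented: record the
sender's cred when the cookie is built, so the charge and the later
de-credit always hit the same user regardless of who opened the socket
or the passed file.

struct scm_cookie_sketch {
	struct scm_fp_list *fp;		/* fds being passed */
	const struct cred *sender;	/* who performed the send */
};

static void scm_cookie_sketch_init(struct scm_cookie_sketch *scm)
{
	scm->fp = NULL;
	scm->sender = get_current_cred();  /* paired with put_cred() */
}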


net/ipv6/ip6_flowlabel.c:543 suspicious rcu_dereference_check() usage!

2016-02-02 Thread Dave Jones
===
[ INFO: suspicious RCU usage. ]
4.5.0-rc2-think+ #2 Tainted: GW  
---
net/ipv6/ip6_flowlabel.c:543 suspicious rcu_dereference_check() usage!

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 1
2 locks held by trinity-c0/21982:
 #0:  (sk_lock-AF_INET6){+.+.+.}, at: [] 
do_ipv6_setsockopt.isra.5+0x1bc/0x1c20
 #1:  (ip6_sk_fl_lock){+.}, at: [] 
ipv6_flowlabel_opt+0x584/0x1b00

stack backtrace:
CPU: 1 PID: 21982 Comm: trinity-c0 Tainted: GW   4.5.0-rc2-think+ #2
 b1821f20 e98d3751 88032ccaf798 b076bd50
 41b58ab3 b19d0db1 b076bca5 0001
 8803011308e8 e98d3751 88032ccaf770 0003
Call Trace:
 [] dump_stack+0xab/0x11b
 [] ? _atomic_dec_and_lock+0x95/0x95
 [] lockdep_rcu_suspicious+0xe6/0x100
 [] ipv6_flowlabel_opt+0x62b/0x1b00
 [] ? ipv6_flowlabel_opt_get+0x6b0/0x6b0
 [] ? preempt_count_sub+0x1a/0x140
 [] ? mark_held_locks+0xc8/0x140
 [] ? __local_bh_enable_ip+0x64/0xc0
 [] ? lock_sock_nested+0x5a/0xe0
 [] ? trace_hardirqs_on+0xd/0x10
 [] ? __local_bh_enable_ip+0x64/0xc0
 [] ? lock_sock_nested+0xa9/0xe0
 [] do_ipv6_setsockopt.isra.5+0xbc4/0x1c20
 [] ? debug_lockdep_rcu_enabled.part.34+0x1f/0x40
 [] ? ip6_ra_control+0x1d0/0x1d0
 [] ? mark_lock+0xf4/0xc90
 [] ? print_usage_bug+0x520/0x520
 [] ? print_usage_bug+0x520/0x520
 [] ? native_sched_clock+0x69/0x160
 [] ? __lock_acquire+0x7ee/0x2ec0
 [] ? mark_lock+0xf4/0xc90
 [] ? print_usage_bug+0x520/0x520
 [] ? debug_check_no_locks_freed+0x200/0x200
 [] ? __lock_acquire+0x7ee/0x2ec0
 [] ? preempt_count_sub+0xc9/0x140
 [] ? debug_smp_processor_id+0x17/0x20
 [] ? get_lock_stats+0x4e/0xf0
 [] ? preempt_count_sub+0xc9/0x140
 [] ? trace_hardirqs_off_caller+0x73/0x150
 [] ? trace_hardirqs_off+0xd/0x10
 [] ? __acct_update_integrals+0x108/0x260
 [] ? taskstats_exit+0x5f0/0x5f0
 [] ? debug_smp_processor_id+0x17/0x20
 [] ? get_lock_stats+0x4e/0xf0
 [] ? preempt_count_sub+0xc9/0x140
 [] ? account_user_time+0x141/0x200
 [] ipv6_setsockopt+0x31/0xe0
 [] tcp_setsockopt+0x71/0xd0
 [] sock_common_setsockopt+0x6c/0xb0
 [] SyS_setsockopt+0x116/0x200
 [] ? SyS_recv+0x20/0x20
 [] ? int_ret_from_sys_call+0x52/0x9f
 [] ? trace_hardirqs_on_thunk+0x17/0x19
 [] entry_SYSCALL_64_fastpath+0x12/0x6b



Re: [PATCH] rtlwifi: fix semicolon.cocci warnings

2016-02-02 Thread Larry Finger

On 02/02/2016 09:40 PM, kbuild test robot wrote:

drivers/net/wireless/realtek/rtlwifi/core.c:59:2-3: Unneeded semicolon


  Remove unneeded semicolon.

Generated by: scripts/coccinelle/misc/semicolon.cocci

CC: Byeoungwook Kim 
Signed-off-by: Fengguang Wu 
---


Acked-by: Larry Finger 

Thanks,

Larry



  core.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -56,7 +56,7 @@ void rtl_addr_delay(u32 addr)
case 0xf9:
udelay(1);
break;
-   };
+   }
  }
  EXPORT_SYMBOL(rtl_addr_delay);






[net] bonding: use return instead of goto

2016-02-02 Thread Zhang Shengju
Replace 'goto' with 'return' to remove the unnecessary check at the
err_undo_flags label.

Signed-off-by: Zhang Shengju 
---
 drivers/net/bonding/bond_main.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index bcc7b19..abe014f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1379,8 +1379,7 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
if (slave_dev->flags & IFF_UP) {
netdev_err(bond_dev, "%s is up - this may be due to an out of 
date ifenslave\n",
   slave_dev->name);
-   res = -EPERM;
-   goto err_undo_flags;
+   return -EPERM;
}
 
/* set bonding device ether type by slave - bonding netdevices are
@@ -1400,8 +1399,7 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
res = notifier_to_errno(res);
if (res) {
netdev_err(bond_dev, "refused to change device 
type\n");
-   res = -EBUSY;
-   goto err_undo_flags;
+   return -EBUSY;
}
 
/* Flush unicast and multicast addresses */
@@ -1421,8 +1419,7 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
} else if (bond_dev->type != slave_dev->type) {
netdev_err(bond_dev, "%s ether type (%d) is different from 
other slaves (%d), can not enslave it\n",
   slave_dev->name, slave_dev->type, bond_dev->type);
-   res = -EINVAL;
-   goto err_undo_flags;
+   return -EINVAL;
}
 
if (slave_ops->ndo_set_mac_address == NULL) {
-- 
1.8.3.1





[PATCH 3.13.y-ckt 125/136] veth: don’t modify ip_summed; doing so treats packets with bad checksums as good.

2016-02-02 Thread Kamal Mostafa
3.13.11-ckt34 -stable review patch.  If anyone has any objections, please let 
me know.

---8<

From: Vijay Pandurangan 

[ Upstream commit ce8c839b74e3017996fad4e1b7ba2e2625ede82f ]

Packets that arrive from real hardware devices have ip_summed ==
CHECKSUM_UNNECESSARY if the hardware verified the checksums, or
CHECKSUM_NONE if the packet is bad or it was unable to verify it. The
current version of veth will replace CHECKSUM_NONE with
CHECKSUM_UNNECESSARY, which causes corrupt packets routed from hardware to
a veth device to be delivered to the application. This caused applications
at Twitter to receive corrupt data when network hardware was corrupting
packets.

We believe this was added as an optimization to skip computing and
verifying checksums for communication between containers. However, locally
generated packets have ip_summed == CHECKSUM_PARTIAL, so the code as
written does nothing for them. As far as we can tell, after removing this
code, these packets are transmitted from one stack to another unmodified
(tcpdump shows invalid checksums on both sides, as expected), and they are
delivered correctly to applications. We didn’t test every possible network
configuration, but we tried a few common ones such as bridging containers,
using NAT between the host and a container, and routing from hardware
devices to containers. We have effectively deployed this in production at
Twitter (by disabling RX checksum offloading on veth devices).

This code dates back to the first version of the driver, commit
 ("[NET]: Virtual ethernet device driver"), so I
suspect this bug occurred mostly because the driver API has evolved
significantly since then. Commit <0b7967503dc97864f283a> ("net/veth: Fix
packet checksumming") (in December 2010) fixed this for packets that get
created locally and sent to hardware devices, by not changing
CHECKSUM_PARTIAL. However, the same issue still occurs for packets coming
in from hardware devices.

Co-authored-by: Evan Jones 
Signed-off-by: Evan Jones 
Cc: Nicolas Dichtel 
Cc: Phil Sutter 
Cc: Toshiaki Makita 
Cc: netdev@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
Signed-off-by: Vijay Pandurangan 
Acked-by: Cong Wang 
Signed-off-by: David S. Miller 
Signed-off-by: Kamal Mostafa 
---
 drivers/net/veth.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 5b37437..887e698 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -116,12 +116,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct 
net_device *dev)
kfree_skb(skb);
goto drop;
}
-   /* don't change ip_summed == CHECKSUM_PARTIAL, as that
-* will cause bad checksum on forwarded packets
-*/
-   if (skb->ip_summed == CHECKSUM_NONE &&
-   rcv->features & NETIF_F_RXCSUM)
-   skb->ip_summed = CHECKSUM_UNNECESSARY;
 
if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
-- 
1.9.1
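
A sketch of the ip_summed semantics at issue; the enum values below are
illustrative (the real ones live in include/linux/skbuff.h), but the
state meanings follow the commit message above:

#include <stdio.h>

enum ip_summed { CHECKSUM_NONE, CHECKSUM_UNNECESSARY, CHECKSUM_PARTIAL };

/* CHECKSUM_NONE: not verified (possibly bad) -- stack must check */
static int stack_must_verify(enum ip_summed s)
{
	return s == CHECKSUM_NONE;
}

int main(void)
{
	enum ip_summed from_wire = CHECKSUM_NONE;	 /* bad csum seen */
	enum ip_summed rewritten = CHECKSUM_UNNECESSARY; /* removed code */

	printf("verify before rewrite: %d, after: %d\n",
	       stack_must_verify(from_wire), stack_must_verify(rewritten));
	return 0;
}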



Re: [PATCH net-next 1/4] net: add event_list to struct net and provide utility functions

2016-02-02 Thread Julian Anastasov

Hello,

On Mon, 1 Feb 2016, Salam Noureddine wrote:

> +#ifdef CONFIG_NET_NS
> +static inline void net_add_event_list(struct list_head *head, struct net 
> *net)
> +{
> + if (!list_empty(&net->event_list))

Above check looks inverted, it works may be
because INIT_LIST_HEAD(>event_list) is missing.

> + list_add_tail(&net->event_list, head);
> +}
> +

Regards

--
Julian Anastasov 
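
Spelled out, the corrected helper would look like this -- a sketch
assuming net->event_list is set up with INIT_LIST_HEAD() during net
namespace initialization:

static inline void net_add_event_list(struct list_head *head,
				      struct net *net)
{
	/* queue the net only if it is not already on a list */
	if (list_empty(&net->event_list))
		list_add_tail(&net->event_list, head);
}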


[PATCH v2 net-next 3/8] be2net: Fix be_vlan_rem_vid() to check vlan id being removed

2016-02-02 Thread Sriharsha Basavapatna
The driver decrements its vlan count without checking if it is really
present in its list. This results in an invalid vlan count and impacts
subsequent vlan add/rem ops. The function be_vlan_rem_vid() should be
updated to fix this.

Signed-off-by: Sriharsha Basavapatna 
---
 drivers/net/ethernet/emulex/benet/be_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index f99de36..09e6f2c 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1463,6 +1463,9 @@ static int be_vlan_rem_vid(struct net_device *netdev, 
__be16 proto, u16 vid)
if (lancer_chip(adapter) && vid == 0)
return 0;
 
+   if (!test_bit(vid, adapter->vids))
+   return 0;
+
clear_bit(vid, adapter->vids);
adapter->vlans_added--;
 
-- 
2.3.0.rc2



[PATCH v2 net-next 5/8] be2net: Don't run ethtool self-tests for VFs

2016-02-02 Thread Sriharsha Basavapatna
From: Somnath Kotur 

The CMD_SUBSYSTEM_LOWLEVEL cmds need DEV_CFG Privilege to run
which VFs don't have by default.
Self-tests need to be issued only for PFs.

Signed-off-by: Somnath Kotur 
Signed-off-by: Sriharsha Basavapatna 
---
 drivers/net/ethernet/emulex/benet/be_cmds.c | 29 -
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c 
b/drivers/net/ethernet/emulex/benet/be_cmds.c
index 3b665f1..7d51d47 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -65,7 +65,22 @@ static struct be_cmd_priv_map cmd_priv_map[] = {
CMD_SUBSYSTEM_COMMON,
BE_PRIV_LNKMGMT | BE_PRIV_VHADM |
BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
-   }
+   },
+   {
+   OPCODE_LOWLEVEL_HOST_DDR_DMA,
+   CMD_SUBSYSTEM_LOWLEVEL,
+   BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
+   },
+   {
+   OPCODE_LOWLEVEL_LOOPBACK_TEST,
+   CMD_SUBSYSTEM_LOWLEVEL,
+   BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
+   },
+   {
+   OPCODE_LOWLEVEL_SET_LOOPBACK_MODE,
+   CMD_SUBSYSTEM_LOWLEVEL,
+   BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
+   },
 };
 
 static bool be_cmd_allowed(struct be_adapter *adapter, u8 opcode, u8 subsystem)
@@ -3169,6 +3184,10 @@ int be_cmd_set_loopback(struct be_adapter *adapter, u8 
port_num,
struct be_cmd_req_set_lmode *req;
int status;
 
+   if (!be_cmd_allowed(adapter, OPCODE_LOWLEVEL_SET_LOOPBACK_MODE,
+   CMD_SUBSYSTEM_LOWLEVEL))
+   return -EPERM;
+
spin_lock_bh(&adapter->mcc_lock);
 
wrb = wrb_from_mccq(adapter);
@@ -3214,6 +3233,10 @@ int be_cmd_loopback_test(struct be_adapter *adapter, u32 
port_num,
struct be_cmd_resp_loopback_test *resp;
int status;
 
+   if (!be_cmd_allowed(adapter, OPCODE_LOWLEVEL_LOOPBACK_TEST,
+   CMD_SUBSYSTEM_LOWLEVEL))
+   return -EPERM;
+
spin_lock_bh(&adapter->mcc_lock);
 
wrb = wrb_from_mccq(adapter);
@@ -3260,6 +3283,10 @@ int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 
pattern,
int status;
int i, j = 0;
 
+   if (!be_cmd_allowed(adapter, OPCODE_LOWLEVEL_HOST_DDR_DMA,
+   CMD_SUBSYSTEM_LOWLEVEL))
+   return -EPERM;
+
spin_lock_bh(&adapter->mcc_lock);
 
wrb = wrb_from_mccq(adapter);
-- 
2.3.0.rc2



[PATCH v2 net-next 8/8] be2net: Fix interval calculation in interrupt moderation

2016-02-02 Thread Sriharsha Basavapatna
From: Padmanabh Ratnakar 

Interrupt moderation parameters need to be recalculated only
after a time interval of 1 ms. The interval calculation is wrong
when jiffies rolls over. Use the recommended jiffies-based way of
calculating the interval to fix this.

Signed-off-by: Padmanabh Ratnakar 
Signed-off-by: Sriharsha Basavapatna 
---
 drivers/net/ethernet/emulex/benet/be_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index d5286d3..9c1fc9d 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1917,8 +1917,7 @@ static u32 be_get_eq_delay_mult_enc(struct be_eq_obj *eqo)
if (!aic->enable)
return 0;
 
-   if (time_before_eq(now, aic->jiffies) ||
-   jiffies_to_msecs(now - aic->jiffies) < 1)
+   if (jiffies_to_msecs(now - aic->jiffies) < 1)
eqd = aic->prev_eqd;
else
eqd = be_get_new_eqd(eqo);
-- 
2.3.0.rc2
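
Why the plain subtraction is the recommended form -- a userspace
approximation with a 32-bit tick counter, showing that unsigned
arithmetic keeps the elapsed delta correct across a rollover:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t then = 0xfffffff0u;	/* sampled just before the wrap */
	uint32_t now  = 0x00000010u;	/* sampled just after the wrap  */

	/* unsigned wrap-around: still the true elapsed 0x20 ticks,
	 * which is what jiffies_to_msecs(now - aic->jiffies) relies on */
	printf("elapsed = %u ticks\n", now - then);
	return 0;
}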



[PATCH v2 net-next 6/8] be2net: Fix Lancer error recovery

2016-02-02 Thread Sriharsha Basavapatna
From: Padmanabh Ratnakar 

After an error is detected, wait for the adapter to move to the ready
state before destroying queues and cleaning up other resources. Also
skip performing any cleanup for non-Lancer chips and move the debug
messages to the correct routine.

Signed-off-by: Padmanabh Ratnakar 
Signed-off-by: Sriharsha Basavapatna 
---
 drivers/net/ethernet/emulex/benet/be_main.c | 51 +++--
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index 62f6fbb..6eb3aba 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -4859,21 +4859,27 @@ static int be_resume(struct be_adapter *adapter)
 
 static int be_err_recover(struct be_adapter *adapter)
 {
-   struct device *dev = &adapter->pdev->dev;
int status;
 
+   /* Error recovery is supported only Lancer as of now */
+   if (!lancer_chip(adapter))
+   return -EIO;
+
+   /* Wait for adapter to reach quiescent state before
+* destroying queues
+*/
+   status = be_fw_wait_ready(adapter);
+   if (status)
+   goto err;
+
+   be_cleanup(adapter);
+
status = be_resume(adapter);
if (status)
goto err;
 
-   dev_info(dev, "Adapter recovery successful\n");
return 0;
 err:
-   if (be_physfn(adapter))
-   dev_err(dev, "Adapter recovery failed\n");
-   else
-   dev_err(dev, "Re-trying adapter recovery\n");
-
return status;
 }
 
@@ -4882,21 +4888,32 @@ static void be_err_detection_task(struct work_struct 
*work)
struct be_adapter *adapter =
container_of(work, struct be_adapter,
 be_err_detection_work.work);
-   int status = 0;
+   struct device *dev = &adapter->pdev->dev;
+   int recovery_status;
 
be_detect_error(adapter);
 
-   if (be_check_error(adapter, BE_ERROR_HW)) {
-   be_cleanup(adapter);
-
-   /* As of now error recovery support is in Lancer only */
-   if (lancer_chip(adapter))
-   status = be_err_recover(adapter);
+   if (be_check_error(adapter, BE_ERROR_HW))
+   recovery_status = be_err_recover(adapter);
+   else
+   goto reschedule_task;
+
+   if (!recovery_status) {
+   dev_info(dev, "Adapter recovery successful\n");
+   goto reschedule_task;
+   } else if (be_virtfn(adapter)) {
+   /* For VFs, check if PF have allocated resources
+* every second.
+*/
+   dev_err(dev, "Re-trying adapter recovery\n");
+   goto reschedule_task;
+   } else {
+   dev_err(dev, "Adapter recovery failed\n");
}
 
-   /* Always attempt recovery on VFs */
-   if (!status || be_virtfn(adapter))
-   be_schedule_err_detection(adapter);
+   return;
+reschedule_task:
+   be_schedule_err_detection(adapter);
 }
 
 static void be_log_sfp_info(struct be_adapter *adapter)
-- 
2.3.0.rc2



[PATCH v2 net-next 1/8] be2net: return error status from be_set_phys_id()

2016-02-02 Thread Sriharsha Basavapatna
From: Suresh Reddy 

be_set_phys_id() returns 0 to ethtool when the command fails in the FW.

This patch fixes the set_phys_id() to return -EIO in case the FW cmd fails.

Signed-off-by: Suresh Reddy 
Signed-off-by: Sriharsha Basavapatna 
---
 drivers/net/ethernet/emulex/benet/be_ethtool.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c 
b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index a19ac44..2ff6916 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -720,29 +720,32 @@ static int be_set_phys_id(struct net_device *netdev,
  enum ethtool_phys_id_state state)
 {
struct be_adapter *adapter = netdev_priv(netdev);
+   int status = 0;
 
switch (state) {
case ETHTOOL_ID_ACTIVE:
-   be_cmd_get_beacon_state(adapter, adapter->hba_port_num,
-   &adapter->beacon_state);
-   return 1;   /* cycle on/off once per second */
+   status = be_cmd_get_beacon_state(adapter, adapter->hba_port_num,
+    &adapter->beacon_state);
+   if (status)
+   return be_cmd_status(status);
+   return 1;   /* cycle on/off once per second */
 
case ETHTOOL_ID_ON:
-   be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0,
-   BEACON_STATE_ENABLED);
+   status = be_cmd_set_beacon_state(adapter, adapter->hba_port_num,
+0, 0, BEACON_STATE_ENABLED);
break;
 
case ETHTOOL_ID_OFF:
-   be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0,
-   BEACON_STATE_DISABLED);
+   status = be_cmd_set_beacon_state(adapter, adapter->hba_port_num,
+0, 0, BEACON_STATE_DISABLED);
break;
 
case ETHTOOL_ID_INACTIVE:
-   be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0,
-   adapter->beacon_state);
+   status = be_cmd_set_beacon_state(adapter, adapter->hba_port_num,
+0, 0, adapter->beacon_state);
}
 
-   return 0;
+   return be_cmd_status(status);
 }
 
 static int be_set_dump(struct net_device *netdev, struct ethtool_dump *dump)
-- 
2.3.0.rc2



[PATCH v2 net-next 7/8] be2net: Add retry in case of error recovery failure

2016-02-02 Thread Sriharsha Basavapatna
From: Padmanabh Ratnakar 

Retry error recovery MAX_ERR_RECOVERY_RETRY_COUNT times in case of
failure during error recovery.

Signed-off-by: Padmanabh Ratnakar 
Signed-off-by: Sriharsha Basavapatna 
---
 drivers/net/ethernet/emulex/benet/be.h  |  5 +
 drivers/net/ethernet/emulex/benet/be_main.c | 23 +--
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h 
b/drivers/net/ethernet/emulex/benet/be.h
index de88c30..515e206 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -397,6 +397,10 @@ enum vf_state {
 #define BE_UC_PMAC_COUNT   30
 #define BE_VF_UC_PMAC_COUNT2
 
+#define MAX_ERR_RECOVERY_RETRY_COUNT   3
+#define ERR_DETECTION_DELAY    1000
+#define ERR_RECOVERY_RETRY_DELAY   30000
+
 /* Ethtool set_dump flags */
 #define LANCER_INITIATE_FW_DUMP0x1
 #define LANCER_DELETE_FW_DUMP  0x2
@@ -534,6 +538,7 @@ struct be_adapter {
u16 work_counter;
 
struct delayed_work be_err_detection_work;
+   u8 recovery_retries;
u8 err_flags;
u32 flags;
u32 cmd_privileges;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index 6eb3aba..d5286d3 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -4265,10 +4265,10 @@ static void be_schedule_worker(struct be_adapter 
*adapter)
adapter->flags |= BE_FLAGS_WORKER_SCHEDULED;
 }
 
-static void be_schedule_err_detection(struct be_adapter *adapter)
+static void be_schedule_err_detection(struct be_adapter *adapter, u32 delay)
 {
schedule_delayed_work(&adapter->be_err_detection_work,
- msecs_to_jiffies(1000));
+ msecs_to_jiffies(delay));
adapter->flags |= BE_FLAGS_ERR_DETECTION_SCHEDULED;
 }
 
@@ -4890,6 +4890,7 @@ static void be_err_detection_task(struct work_struct 
*work)
 be_err_detection_work.work);
struct device *dev = &adapter->pdev->dev;
int recovery_status;
+   int delay = ERR_DETECTION_DELAY;
 
be_detect_error(adapter);
 
@@ -4899,6 +4900,7 @@ static void be_err_detection_task(struct work_struct 
*work)
goto reschedule_task;
 
if (!recovery_status) {
+   adapter->recovery_retries = 0;
dev_info(dev, "Adapter recovery successful\n");
goto reschedule_task;
} else if (be_virtfn(adapter)) {
@@ -4907,13 +4909,22 @@ static void be_err_detection_task(struct work_struct 
*work)
 */
dev_err(dev, "Re-trying adapter recovery\n");
goto reschedule_task;
+   } else if (adapter->recovery_retries++ <
+  MAX_ERR_RECOVERY_RETRY_COUNT) {
+   /* In case of another error during recovery, it takes 30 sec
+* for adapter to come out of error. Retry error recovery after
+* this time interval.
+*/
dev_err(&adapter->pdev->dev, "Re-trying adapter recovery\n");
+   delay = ERR_RECOVERY_RETRY_DELAY;
+   goto reschedule_task;
} else {
dev_err(dev, "Adapter recovery failed\n");
}
 
return;
 reschedule_task:
-   be_schedule_err_detection(adapter);
+   be_schedule_err_detection(adapter, delay);
 }
 
 static void be_log_sfp_info(struct be_adapter *adapter)
@@ -5309,7 +5320,7 @@ static int be_probe(struct pci_dev *pdev, const struct 
pci_device_id *pdev_id)
 
be_roce_dev_add(adapter);
 
-   be_schedule_err_detection(adapter);
+   be_schedule_err_detection(adapter, ERR_DETECTION_DELAY);
 
/* On Die temperature not supported for VF. */
if (be_physfn(adapter) && IS_ENABLED(CONFIG_BE2NET_HWMON)) {
@@ -5376,7 +5387,7 @@ static int be_pci_resume(struct pci_dev *pdev)
if (status)
return status;
 
-   be_schedule_err_detection(adapter);
+   be_schedule_err_detection(adapter, ERR_DETECTION_DELAY);
 
if (adapter->wol_en)
be_setup_wol(adapter, false);
@@ -5476,7 +5487,7 @@ static void be_eeh_resume(struct pci_dev *pdev)
if (status)
goto err;
 
-   be_schedule_err_detection(adapter);
+   be_schedule_err_detection(adapter, ERR_DETECTION_DELAY);
return;
 err:
dev_err(>pdev->dev, "EEH resume failed\n");
-- 
2.3.0.rc2



[PATCH v2 net-next 2/8] be2net: check for INSUFFICIENT_PRIVILEGES error

2016-02-02 Thread Sriharsha Basavapatna
From: Suresh Reddy 

The driver currently logs the message "VF is not privileged to issue
opcode" by checking only the base_status field for UNAUTHORIZED_REQUEST.
Add a check for INSUFFICIENT_PRIVILEGES in the additional status
field as well, since not all commands fail with that base status.

Signed-off-by: Suresh Reddy 
Signed-off-by: Sriharsha Basavapatna 
---
 drivers/net/ethernet/emulex/benet/be_cmds.c | 3 ++-
 drivers/net/ethernet/emulex/benet/be_cmds.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c 
b/drivers/net/ethernet/emulex/benet/be_cmds.c
index b63d8ad..3b665f1 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -236,7 +236,8 @@ static int be_mcc_compl_process(struct be_adapter *adapter,
 
if (base_status != MCC_STATUS_SUCCESS &&
!be_skip_err_log(opcode, base_status, addl_status)) {
-   if (base_status == MCC_STATUS_UNAUTHORIZED_REQUEST) {
+   if (base_status == MCC_STATUS_UNAUTHORIZED_REQUEST ||
+   addl_status == MCC_ADDL_STATUS_INSUFFICIENT_PRIVILEGES) {
dev_warn(&adapter->pdev->dev,
 "VF is not privileged to issue opcode %d-%d\n",
 opcode, subsystem);
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h 
b/drivers/net/ethernet/emulex/benet/be_cmds.h
index 241819b..f260ef3 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.h
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.h
@@ -68,7 +68,8 @@ enum mcc_addl_status {
MCC_ADDL_STATUS_TOO_MANY_INTERFACES = 0x4a,
MCC_ADDL_STATUS_INSUFFICIENT_VLANS = 0xab,
MCC_ADDL_STATUS_INVALID_SIGNATURE = 0x56,
-   MCC_ADDL_STATUS_MISSING_SIGNATURE = 0x57
+   MCC_ADDL_STATUS_MISSING_SIGNATURE = 0x57,
+   MCC_ADDL_STATUS_INSUFFICIENT_PRIVILEGES = 0x60
 };
 
 #define CQE_BASE_STATUS_MASK   0x
-- 
2.3.0.rc2



[PATCH v2 net-next 4/8] be2net: SRIOV Queue distribution should factor in EQ-count of VFs

2016-02-02 Thread Sriharsha Basavapatna
The SRIOV resource distribution logic for RX/TX queue counts is not optimal
when a small number of VFs are enabled. It does not take into account the
VF's EQ count while computing the queue counts. Because of this, the VF
gets a large number of queues, though it doesn't have sufficient EQs,
resulting in wasted queue resources. And the PF gets a smaller share of
queues though it has more EQs. Fix this by capping the VF queue count at
its EQ count.

Signed-off-by: Sriharsha Basavapatna 
---
 drivers/net/ethernet/emulex/benet/be.h  |  4 
 drivers/net/ethernet/emulex/benet/be_main.c | 15 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h 
b/drivers/net/ethernet/emulex/benet/be.h
index cf83783..de88c30 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -89,6 +89,10 @@
 #define BE3_MAX_TX_QS  16
 #define BE3_MAX_EVT_QS 16
 #define BE3_SRIOV_MAX_EVT_QS   8
+#define SH_VF_MAX_NIC_EQS  3   /* Skyhawk VFs can have a max of 4 EQs
+                                * and at least 1 is granted to either
+                                * SURF/DPDK
+                                */
 
 #define MAX_RSS_IFACES 15
 #define MAX_RX_QS  32
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index 09e6f2c..62f6fbb 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -3792,18 +3792,15 @@ static u16 be_calculate_vf_qs(struct be_adapter 
*adapter, u16 num_vfs)
struct be_resources res = adapter->pool_res;
u16 num_vf_qs = 1;
 
-   /* Distribute the queue resources equally among the PF and it's VFs
+   /* Distribute the queue resources among the PF and it's VFs
 * Do not distribute queue resources in multi-channel configuration.
 */
if (num_vfs && !be_is_mc(adapter)) {
-   /* If number of VFs requested is 8 less than max supported,
-* assign 8 queue pairs to the PF and divide the remaining
-* resources evenly among the VFs
-*/
-   if (num_vfs < (be_max_vfs(adapter) - 8))
-   num_vf_qs = (res.max_rss_qs - 8) / num_vfs;
-   else
-   num_vf_qs = res.max_rss_qs / num_vfs;
+/* Divide the qpairs evenly among the VFs and the PF, capped
+ * at VF-EQ-count. Any remainder qpairs belong to the PF.
+ */
+   num_vf_qs = min(SH_VF_MAX_NIC_EQS,
+   res.max_rss_qs / (num_vfs + 1));
 
/* Skyhawk-R chip supports only MAX_RSS_IFACES RSS capable
 * interfaces per port. Provide RSS on VFs, only if number
-- 
2.3.0.rc2
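
A worked example of the new distribution, using assumed numbers rather
than anything from the patch: with max_rss_qs = 32 and num_vfs = 7, each
VF gets min(SH_VF_MAX_NIC_EQS, 32 / (7 + 1)) = min(3, 4) = 3 queue
pairs, and the PF keeps the remainder.

#include <stdio.h>

#define SH_VF_MAX_NIC_EQS 3

int main(void)
{
	unsigned int max_rss_qs = 32, num_vfs = 7;	/* assumed */
	unsigned int share = max_rss_qs / (num_vfs + 1);
	unsigned int num_vf_qs =
		share < SH_VF_MAX_NIC_EQS ? share : SH_VF_MAX_NIC_EQS;

	printf("per-VF qpairs: %u, PF keeps: %u\n",
	       num_vf_qs, max_rss_qs - num_vf_qs * num_vfs);
	return 0;
}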



[PATCH v2 net-next 0/8] be2net patch-set

2016-02-02 Thread Sriharsha Basavapatna
v2 changes:
Patch-4:Changed a tab to space in be.h
Patches-6,7,8:  Updated commit log summary line: benet --> be2net

Hi David,

The following patch set contains a few non-critical bug fixes. Please
consider applying this to the net-next tree. Thanks.

Patch-1 fixes be_set_phys_id() ethtool function to return an error code.
Patch-2 fixes a warning when some commands fail for VFs.
Patch-3 fixes be_vlan_rem_vid() to verify vlan being removed is in the list.
Patch-4 improves SRIOV queue distribution logic.
Patch-5 avoids running self test on VFs.
Patch-6 fixes error recovery in Lancer to clean up after moving to ready state.
Patch-7 adds retry logic to error recovery in case of recovery failures.
Patch-8 fixes the time interval used in the eq delay computation routine.


Padmanabh Ratnakar (3):
  be2net: Fix Lancer error recovery
  be2net: Add retry in case of error recovery failure
  be2net: Fix interval calculation in interrupt moderation

Somnath Kotur (1):
  be2net: Don't run ethtool self-tests for VFs

Sriharsha Basavapatna (2):
  be2net: Fix be_vlan_rem_vid() to check vlan id being removed
  be2net: SRIOV Queue distribution should factor in EQ-count of VFs

Suresh Reddy (2):
  be2net: return error status from be_set_phys_id()
  be2net: check for INSUFFICIENT_PRIVILEGES error

 drivers/net/ethernet/emulex/benet/be.h |  9 +++
 drivers/net/ethernet/emulex/benet/be_cmds.c| 32 -
 drivers/net/ethernet/emulex/benet/be_cmds.h|  3 +-
 drivers/net/ethernet/emulex/benet/be_ethtool.c | 23 ---
 drivers/net/ethernet/emulex/benet/be_main.c| 93 +-
 5 files changed, 114 insertions(+), 46 deletions(-)

-- 
2.3.0.rc2



[PATCH v3] rtlwifi: Improve function 'rtl_addr_delay()' in core.c

2016-02-02 Thread Byeoungwook Kim
Convert the if-else-if chain in rtl_addr_delay() to a switch statement
to improve readability and give the compiler room to optimize the dispatch.
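
For context, these magic addresses are delay markers embedded in the
driver's register-initialization tables rather than real registers. A
standalone model of how a table walker consumes them (the table
contents and the write path are illustrative, not from this patch):

/* Model of an rtlwifi-style init-table walk: pseudo-addresses
 * 0xf9..0xfe request a delay instead of a register write.
 */
#include <stdint.h>
#include <stdio.h>

static void rtl_addr_delay_model(uint32_t addr) /* mirrors the new switch */
{
        switch (addr) {
        case 0xfe: puts("mdelay(50)"); break;
        case 0xfd: puts("mdelay(5)"); break;
        case 0xfc: puts("mdelay(1)"); break;
        case 0xfb: puts("udelay(50)"); break;
        case 0xfa: puts("udelay(5)"); break;
        case 0xf9: puts("udelay(1)"); break;
        }
}

struct reg_entry { uint32_t addr, value; };

int main(void)
{
        /* hypothetical table: write one register, wait 1ms, write another */
        const struct reg_entry tbl[] = { {0x800, 0x1}, {0xfc, 0}, {0x804, 0x2} };
        size_t i;

        for (i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++) {
                if (tbl[i].addr >= 0xf9 && tbl[i].addr <= 0xfe)
                        rtl_addr_delay_model(tbl[i].addr);
                else
                        printf("write reg 0x%x = 0x%x\n",
                               (unsigned)tbl[i].addr, (unsigned)tbl[i].value);
        }
        return 0;
}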

Reviewed-by: Julian Calaby 
Signed-off-by: Byeoungwook Kim 
Signed-off-by: Fengguang Wu 
---
V3 removed an unneeded semicolon.
V2 split into separate patches.
 drivers/net/wireless/realtek/rtlwifi/core.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 4ae421e..63cda78 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -37,18 +37,26 @@
 
 void rtl_addr_delay(u32 addr)
 {
-   if (addr == 0xfe)
+   switch (addr) {
+   case 0xfe:
mdelay(50);
-   else if (addr == 0xfd)
+   break;
+   case 0xfd:
mdelay(5);
-   else if (addr == 0xfc)
+   break;
+   case 0xfc:
mdelay(1);
-   else if (addr == 0xfb)
+   break;
+   case 0xfb:
udelay(50);
-   else if (addr == 0xfa)
+   break;
+   case 0xfa:
udelay(5);
-   else if (addr == 0xf9)
+   break;
+   case 0xf9:
udelay(1);
+   break;
+   }
 }
 EXPORT_SYMBOL(rtl_addr_delay);
 
-- 
2.5.0



[PATCH net-next 4/4] dtb: xgene: Add irqs to support multi queue

2016-02-02 Thread Iyappan Subramanian
Signed-off-by: Iyappan Subramanian 
Signed-off-by: Khuong Dinh 
Signed-off-by: Tanmay Inamdar 
Tested-by: Toan Le 
---
 arch/arm64/boot/dts/apm/apm-shadowcat.dtsi | 8 +++-
 arch/arm64/boot/dts/apm/apm-storm.dtsi | 8 +++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/boot/dts/apm/apm-shadowcat.dtsi b/arch/arm64/boot/dts/apm/apm-shadowcat.dtsi
index 5d87a3d..278f106 100644
--- a/arch/arm64/boot/dts/apm/apm-shadowcat.dtsi
+++ b/arch/arm64/boot/dts/apm/apm-shadowcat.dtsi
@@ -621,7 +621,13 @@
  <0x0 0x1f60 0x0 0Xd100>,
  <0x0 0x2000 0x0 0X22>;
interrupts = <0 108 4>,
-<0 109 4>;
+<0 109 4>,
+<0 110 4>,
+<0 111 4>,
+<0 112 4>,
+<0 113 4>,
+<0 114 4>,
+<0 115 4>;
port-id = <1>;
dma-coherent;
clocks = < 0>;
diff --git a/arch/arm64/boot/dts/apm/apm-storm.dtsi b/arch/arm64/boot/dts/apm/apm-storm.dtsi
index fe30f76..cafb2c2 100644
--- a/arch/arm64/boot/dts/apm/apm-storm.dtsi
+++ b/arch/arm64/boot/dts/apm/apm-storm.dtsi
@@ -958,7 +958,13 @@
  <0x0 0x1800 0x0 0X200>;
reg-names = "enet_csr", "ring_csr", "ring_cmd";
interrupts = <0x0 0x60 0x4>,
-<0x0 0x61 0x4>;
+<0x0 0x61 0x4>,
+<0x0 0x62 0x4>,
+<0x0 0x63 0x4>,
+<0x0 0x64 0x4>,
+<0x0 0x65 0x4>,
+<0x0 0x66 0x4>,
+<0x0 0x67 0x4>;
dma-coherent;
clocks = < 0>;
/* mac address will be overwritten by the bootloader */
-- 
1.9.1



[PATCH net-next 1/4] drivers: net: xgene: Add support for Classifier engine

2016-02-02 Thread Iyappan Subramanian
Signed-off-by: Iyappan Subramanian 
Signed-off-by: Khuong Dinh 
Signed-off-by: Tanmay Inamdar 
Tested-by: Toan Le 
---
 drivers/net/ethernet/apm/xgene/Makefile  |   3 +-
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.c  | 357 +++
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.h  | 264 +
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.h   |   1 +
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c |  29 +-
 drivers/net/ethernet/apm/xgene/xgene_enet_main.h |  14 +
 6 files changed, 659 insertions(+), 9 deletions(-)
 create mode 100644 drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
 create mode 100644 drivers/net/ethernet/apm/xgene/xgene_enet_cle.h

diff --git a/drivers/net/ethernet/apm/xgene/Makefile b/drivers/net/ethernet/apm/xgene/Makefile
index 700b5ab..f46321f 100644
--- a/drivers/net/ethernet/apm/xgene/Makefile
+++ b/drivers/net/ethernet/apm/xgene/Makefile
@@ -3,5 +3,6 @@
 #
 
 xgene-enet-objs := xgene_enet_hw.o xgene_enet_sgmac.o xgene_enet_xgmac.o \
-  xgene_enet_main.o xgene_enet_ring2.o xgene_enet_ethtool.o
+  xgene_enet_main.o xgene_enet_ring2.o xgene_enet_ethtool.o \
+  xgene_enet_cle.o
 obj-$(CONFIG_NET_XGENE) += xgene-enet.o
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
new file mode 100644
index 000..7eea982
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
@@ -0,0 +1,357 @@
+/* Applied Micro X-Gene SoC Ethernet Classifier structures
+ *
+ * Copyright (c) 2016, Applied Micro Circuits Corporation
+ * Authors: Khuong Dinh 
+ *  Tanmay Inamdar 
+ *  Iyappan Subramanian 
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include "xgene_enet_main.h"
+
+static void xgene_cle_dbptr_to_hw(struct xgene_enet_pdata *pdata,
+ struct xgene_cle_dbptr *dbptr, u32 *buf)
+{
+   buf[4] = SET_VAL(CLE_FPSEL, dbptr->fpsel) |
+SET_VAL(CLE_DSTQIDL, dbptr->dstqid);
+
+   buf[5] = SET_VAL(CLE_DSTQIDH, (u32)dbptr->dstqid >> CLE_DSTQIDL_LEN) |
+SET_VAL(CLE_PRIORITY, dbptr->cle_priority);
+}
+
+static void xgene_cle_kn_to_hw(struct xgene_cle_ptree_kn *kn, u32 *buf)
+{
+   u32 i, j = 0;
+   u32 data;
+
+   buf[j++] = SET_VAL(CLE_TYPE, kn->node_type);
+   for (i = 0; i < kn->num_keys; i++) {
+   struct xgene_cle_ptree_key *key = &kn->key[i];
+
+   if (!(i % 2)) {
+   buf[j] = SET_VAL(CLE_KN_PRIO, key->priority) |
+SET_VAL(CLE_KN_RPTR, key->result_pointer);
+   } else {
+   data = SET_VAL(CLE_KN_PRIO, key->priority) |
+  SET_VAL(CLE_KN_RPTR, key->result_pointer);
+   buf[j++] |= (data << 16);
+   }
+   }
+}
+
+static void xgene_cle_dn_to_hw(struct xgene_cle_ptree_ewdn *dn,
+  u32 *buf, u32 jb)
+{
+   u32 i, j = 0;
+   u32 npp;
+   struct xgene_cle_ptree_branch *br;
+
+   buf[j++] = SET_VAL(CLE_DN_TYPE, dn->node_type) |
+  SET_VAL(CLE_DN_LASTN, dn->last_node) |
+  SET_VAL(CLE_DN_HLS, dn->hdr_len_store) |
+  SET_VAL(CLE_DN_EXT, dn->hdr_extn) |
+  SET_VAL(CLE_DN_BSTOR, dn->byte_store) |
+  SET_VAL(CLE_DN_SBSTOR, dn->search_byte_store) |
+  SET_VAL(CLE_DN_RPTR, dn->result_pointer);
+
+   for (i = 0; i < dn->num_branches; i++) {
+   br = &dn->branch[i];
+   npp = br->next_packet_pointer;
+
+   if ((br->jump_rel == JMP_ABS) && (npp < CLE_PKTRAM_SIZE))
+   npp += jb;
+
+   buf[j++] = SET_VAL(CLE_BR_VALID, br->valid) |
+  SET_VAL(CLE_BR_NPPTR, npp) |
+  SET_VAL(CLE_BR_JB, br->jump_bw) |
+  SET_VAL(CLE_BR_JR, br->jump_rel) |
+  SET_VAL(CLE_BR_OP, br->operation) |
+  SET_VAL(CLE_BR_NNODE, br->next_node) |
+  SET_VAL(CLE_BR_NBR, br->next_branch);
+
+   buf[j++] = SET_VAL(CLE_BR_DATA, br->data) |
+

[RESEND PATCH 1/9] ipv4: Namespaceify tcp syn retries sysctl knob

2016-02-02 Thread Nikolay Borisov
Signed-off-by: Nikolay Borisov 
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h  |  1 -
 net/ipv4/sysctl_net_ipv4.c | 18 +-
 net/ipv4/tcp.c |  3 ++-
 net/ipv4/tcp_ipv4.c|  2 ++
 net/ipv4/tcp_timer.c   |  4 ++--
 6 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index ffa2777b6475..59c6155e4896 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -95,6 +95,8 @@ struct netns_ipv4 {
int sysctl_tcp_keepalive_probes;
int sysctl_tcp_keepalive_intvl;
 
+   int sysctl_tcp_syn_retries;
+
struct ping_group_range ping_group_range;
 
atomic_t dev_addr_genid;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3ed10fc89c7d..a7f6f25297d7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_syn_retries;
 extern int sysctl_tcp_synack_retries;
 extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index fccf8e92bf81..db95287d2b94 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -293,15 +293,6 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &ip_ttl_max,
},
{
-   .procname   = "tcp_syn_retries",
-   .data   = &sysctl_tcp_syn_retries,
-   .maxlen = sizeof(int),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
-   .extra1 = &tcp_syn_retries_min,
-   .extra2 = &tcp_syn_retries_max
-   },
-   {
.procname   = "tcp_synack_retries",
.data   = _tcp_synack_retries,
.maxlen = sizeof(int),
@@ -950,6 +941,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec_jiffies,
},
+   {
+   .procname   = "tcp_syn_retries",
+   .data   = &init_net.ipv4.sysctl_tcp_syn_retries,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = &tcp_syn_retries_min,
+   .extra2 = &tcp_syn_retries_max
+   },
{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c82cca18c90f..bb36a39b5685 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2722,6 +2722,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+   struct net *net = sock_net(sk);
int val, len;
 
if (get_user(len, optlen))
@@ -2756,7 +2757,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = keepalive_probes(tp);
break;
case TCP_SYNCNT:
-   val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+   val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
break;
case TCP_LINGER2:
val = tp->linger2;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9db9bdb14449..c9944e0c48d3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2382,6 +2382,8 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
 
+   net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+
return 0;
 fail:
tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a4730a28b220..c5d51f530c65 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 
-int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
 int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
@@ -157,6 +156,7 @@ static int tcp_write_timeout(struct sock *sk)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+   struct net *net = sock_net(sk);
int retry_until;
bool do_reset, syn_set = false;
 
@@ -169,7 +169,7 @@ static int tcp_write_timeout(struct sock *sk)
NET_INC_STATS_BH(sock_net(sk),
 
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
-   retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+   retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
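
As the `?:' fallback above shows, a nonzero per-socket TCP_SYNCNT still
takes precedence over the new per-namespace default. A minimal userspace
sketch of that override (the retry count of 3 is arbitrary):

/* Override the namespace-wide tcp_syn_retries for one socket via
 * TCP_SYNCNT; the kernel falls back to net.ipv4.tcp_syn_retries only
 * when this option is left unset.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int syn_retries = 3;    /* retransmit the SYN at most 3 times */

        if (fd < 0 || setsockopt(fd, IPPROTO_TCP, TCP_SYNCNT,
                                 &syn_retries, sizeof(syn_retries)) < 0) {
                perror("TCP_SYNCNT");
                return 1;
        }
        printf("per-socket SYN retry count set to %d\n", syn_retries);
        return 0;
}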

[RESEND PATCH 2/9] ipv4: Namespaceify tcp synack retries sysctl knob

2016-02-02 Thread Nikolay Borisov
Signed-off-by: Nikolay Borisov 
---
 include/net/netns/ipv4.h|  1 +
 include/net/tcp.h   |  1 -
 net/ipv4/inet_connection_sock.c |  7 ++-
 net/ipv4/sysctl_net_ipv4.c  | 14 +++---
 net/ipv4/tcp_ipv4.c |  1 +
 net/ipv4/tcp_timer.c|  3 +--
 6 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 59c6155e4896..bca049102441 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -96,6 +96,7 @@ struct netns_ipv4 {
int sysctl_tcp_keepalive_intvl;
 
int sysctl_tcp_syn_retries;
+   int sysctl_tcp_synack_retries;
 
struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a7f6f25297d7..5a162875e80c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_synack_retries;
 extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 46b9c887bede..9b17c1792dce 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -482,10 +482,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
 #define AF_INET_FAMILY(fam) true
 #endif
 
-/* Only thing we need from tcp.h */
-extern int sysctl_tcp_synack_retries;
-
-
 /* Decide when to expire the request and when to resend SYN-ACK */
 static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
  const int max_retries,
@@ -557,6 +553,7 @@ static void reqsk_timer_handler(unsigned long data)
 {
struct request_sock *req = (struct request_sock *)data;
struct sock *sk_listener = req->rsk_listener;
+   struct net *net = sock_net(sk_listener);
struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
int qlen, expire = 0, resend = 0;
@@ -566,7 +563,7 @@ static void reqsk_timer_handler(unsigned long data)
if (sk_state_load(sk_listener) != TCP_LISTEN)
goto drop;
 
-   max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+   max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
thresh = max_retries;
/* Normally all the openreqs are young and become mature
 * (i.e. converted to established socket) for first timeout.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index db95287d2b94..5dd89de5bf8d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -293,13 +293,6 @@ static struct ctl_table ipv4_table[] = {
.extra2 = _ttl_max,
},
{
-   .procname   = "tcp_synack_retries",
-   .data   = &sysctl_tcp_synack_retries,
-   .maxlen = sizeof(int),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec
-   },
-   {
.procname   = "tcp_max_orphans",
.data   = &sysctl_tcp_max_orphans,
.maxlen = sizeof(int),
@@ -950,6 +943,13 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = &tcp_syn_retries_min,
.extra2 = &tcp_syn_retries_max
},
+   {
+   .procname   = "tcp_synack_retries",
+   .data   = &init_net.ipv4.sysctl_tcp_synack_retries,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec
+   },
{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c9944e0c48d3..a5268576021c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2383,6 +2383,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
 
net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+   net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
 
return 0;
 fail:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c5d51f530c65..ca25fdf0c525 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 
-int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
@@ -332,7 +331,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
int max_retries = icsk->icsk_syn_retries ? :
-   sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+   net->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */

[PATCH net-next 3/4] drivers: net: xgene: Add support for multiple queues

2016-02-02 Thread Iyappan Subramanian
Signed-off-by: Iyappan Subramanian 
Signed-off-by: Khuong Dinh 
Signed-off-by: Tanmay Inamdar 
Tested-by: Toan Le 
---
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.c  |  11 +-
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 452 ++-
 drivers/net/ethernet/apm/xgene/xgene_enet_main.h |  20 +-
 3 files changed, 289 insertions(+), 194 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
index d51268f..ae0c93c 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
@@ -333,13 +333,14 @@ static int xgene_cle_set_rss_skeys(struct xgene_enet_cle *cle)
 static int xgene_cle_set_rss_idt(struct xgene_enet_pdata *pdata)
 {
int i, ret = 0;
-   u32 fpsel, dstqid, nfpsel, idt_reg;
+   u32 fpsel, dstqid, nfpsel, idt_reg, idx;
u16 pool_id;
 
for (i = 0; i < XGENE_CLE_IDT_ENTRIES; i++) {
-   pool_id = pdata->rx_ring->buf_pool->id;
+   idx = i % pdata->rxq_cnt;
+   pool_id = pdata->rx_ring[idx]->buf_pool->id;
fpsel = xgene_enet_ring_bufnum(pool_id) - 0x20;
-   dstqid = xgene_enet_dst_ring_num(pdata->rx_ring);
+   dstqid = xgene_enet_dst_ring_num(pdata->rx_ring[idx]);
nfpsel = 0;
idt_reg = 0;
 
@@ -695,8 +696,8 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
br->mask = 0x;
}
 
-   def_qid = xgene_enet_dst_ring_num(pdata->rx_ring);
-   pool_id = pdata->rx_ring->buf_pool->id;
+   def_qid = xgene_enet_dst_ring_num(pdata->rx_ring[0]);
+   pool_id = pdata->rx_ring[0]->buf_pool->id;
def_fpsel = xgene_enet_ring_bufnum(pool_id) - 0x20;
 
memset(dbptr, 0, sizeof(struct xgene_cle_dbptr) * DB_MAX_PTRS);
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index 2716d45..1653345 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -182,7 +182,6 @@ static int xgene_enet_tx_completion(struct xgene_enet_desc_ring *cp_ring,
 static u64 xgene_enet_work_msg(struct sk_buff *skb)
 {
struct net_device *ndev = skb->dev;
-   struct xgene_enet_pdata *pdata = netdev_priv(ndev);
struct iphdr *iph;
u8 l3hlen = 0, l4hlen = 0;
u8 ethhdr, proto = 0, csum_enable = 0;
@@ -228,10 +227,6 @@ static u64 xgene_enet_work_msg(struct sk_buff *skb)
if (!mss || ((skb->len - hdr_len) <= mss))
goto out;
 
-   if (mss != pdata->mss) {
-   pdata->mss = mss;
-   pdata->mac_ops->set_mss(pdata);
-   }
hopinfo |= SET_BIT(ET);
}
} else if (iph->protocol == IPPROTO_UDP) {
@@ -413,7 +408,7 @@ out:
raw_desc->m0 = cpu_to_le64(SET_VAL(LL, ll) | SET_VAL(NV, nv) |
   SET_VAL(USERINFO, tx_ring->tail));
tx_ring->cp_ring->cp_skb[tx_ring->tail] = skb;
-   pdata->tx_level += count;
+   pdata->tx_level[tx_ring->cp_ring->index] += count;
tx_ring->tail = tail;
 
return count;
@@ -423,15 +418,17 @@ static netdev_tx_t xgene_enet_start_xmit(struct sk_buff *skb,
 struct net_device *ndev)
 {
struct xgene_enet_pdata *pdata = netdev_priv(ndev);
-   struct xgene_enet_desc_ring *tx_ring = pdata->tx_ring;
-   u32 tx_level = pdata->tx_level;
+   struct xgene_enet_desc_ring *tx_ring;
+   int index = skb->queue_mapping;
+   u32 tx_level = pdata->tx_level[index];
int count;
 
-   if (tx_level < pdata->txc_level)
-   tx_level += ((typeof(pdata->tx_level))~0U);
+   tx_ring = pdata->tx_ring[index];
+   if (tx_level < pdata->txc_level[index])
+   tx_level += ((typeof(pdata->tx_level[index]))~0U);
 
-   if ((tx_level - pdata->txc_level) > pdata->tx_qcnt_hi) {
-   netif_stop_queue(ndev);
+   if ((tx_level - pdata->txc_level[index]) > pdata->tx_qcnt_hi) {
+   netif_stop_subqueue(ndev, index);
return NETDEV_TX_BUSY;
}
 
@@ -529,7 +526,8 @@ static bool is_rx_desc(struct xgene_enet_raw_desc *raw_desc)
 static int xgene_enet_process_ring(struct xgene_enet_desc_ring *ring,
   int budget)
 {
-   struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+   struct net_device *ndev = ring->ndev;
+   struct xgene_enet_pdata *pdata = netdev_priv(ndev);
struct xgene_enet_raw_desc *raw_desc, *exp_desc;
u16 head = ring->head;
u16 slots = ring->slots - 1;
@@ -573,7 +571,7 @@ static int xgene_enet_process_ring(struct xgene_enet_desc_ring *ring,

[RESEND PATCH 0/9] Namespaceify more of the tcp sysctl knobs

2016-02-02 Thread Nikolay Borisov
This patch series continues making more of the tcp-related
sysctl knobs per net-namespace. Most of these apply per
socket and have global defaults, so the change should be
safe and I don't expect any breakage.

Having these knobs per net-namespace is useful when multiple
containers are hosted and the tcp settings need to be tuned
for each of them independently of the host node.

I've split the patches per-sysctl; if the review outcome is
positive, I'm happy to either resend them as one big patch or
leave them split as they are.
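
Each patch follows the same mechanical pattern; a toy, compilable model
of it is below, borrowing the struct and field names from patch 1 (the
userspace scaffolding is obviously not part of the series):

/* Per-"namespace" copies of a knob, each seeded from the old global
 * default, so one container's tuning never leaks into another's.
 */
#include <stdio.h>

#define TCP_SYN_RETRIES 6   /* the former global default */

struct netns_ipv4 { int sysctl_tcp_syn_retries; };
struct net { struct netns_ipv4 ipv4; };

static void tcp_sk_init(struct net *net)    /* mirrors the kernel init hook */
{
        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
}

int main(void)
{
        struct net host, container;

        tcp_sk_init(&host);
        tcp_sk_init(&container);
        container.ipv4.sysctl_tcp_syn_retries = 3; /* tuned in one netns only */

        printf("host=%d container=%d\n",
               host.ipv4.sysctl_tcp_syn_retries,
               container.ipv4.sysctl_tcp_syn_retries);
        return 0;
}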

Nikolay Borisov (9):
  ipv4: Namespaceify tcp syn retries sysctl knob
  ipv4: Namespaceify tcp synack retries sysctl knob
  ipv4: Namespaceify tcp syncookies sysctl knob
  ipv4: Namespaceify tcp reordering sysctl knob
  ipv4: Namespaceify tcp_retries1 sysctl knob
  ipv4: Namespaceify tcp_retries2 sysctl knob
  ipv4: Namespaceify tcp_orphan_retries sysctl knob
  ipv4: Namespaceify tcp_fin_timeout sysctl knob
  ipv4: Namespaceify tcp_notsent_lowat sysctl knob

 include/net/netns/ipv4.h|  10 +++
 include/net/tcp.h   |  17 ++---
 net/ipv4/inet_connection_sock.c |   7 +--
 net/ipv4/syncookies.c   |   4 +-
 net/ipv4/sysctl_net_ipv4.c  | 136 
 net/ipv4/tcp.c  |  12 ++--
 net/ipv4/tcp_input.c|  22 ---
 net/ipv4/tcp_ipv4.c |  11 +++-
 net/ipv4/tcp_metrics.c  |   3 +-
 net/ipv4/tcp_minisocks.c|   3 -
 net/ipv4/tcp_output.c   |   6 +-
 net/ipv4/tcp_timer.c|  23 +++
 net/ipv6/syncookies.c   |   2 +-
 13 files changed, 130 insertions(+), 126 deletions(-)

-- 
2.5.0


