Re: [PATCH net-next] ifb: fix packets checksum

2018-05-28 Thread Jonathan Maxwell
On Sat, May 26, 2018 at 6:43 AM, David Miller  wrote:
> From: Jon Maxwell 
> Date: Fri, 25 May 2018 07:38:29 +1000
>
>> Fixup the checksum for CHECKSUM_COMPLETE when pulling skbs on RX path.
>> Otherwise we get splats when tc mirred is used to redirect packets to ifb.
>>
>> Before fix:
>>
>> nic: hw csum failure
>>
>> Signed-off-by: Jon Maxwell 
>
> This definitely seems correct, but I am really surprised a bug like this has
> lasted as long as it has.
>

Sorry for the late reply I have been away for a few days. The customer never
saw this on bnx2x. Then they switched to the mlx5 driver and it started
happening continuously when doing iperf3 tests and also for other TCP traffic.

mlx5 uses CHECKSUM_COMPLETE. I think that bnx2x uses CHECKSUM_UNNECESSARY
which avoided the code path that triggers the csum failure message in
__skb_checksum_complete(). It only logs the message for CHECKSUM_COMPLETE
skbs. Probably few Linux users are using a combination of NIC drivers that
use CHECKSUM_COMPLETE, tc mirred and ifb, which is why this has never been
reported before.

This is very similar to commit 7be709af2b65.

> So I'll let this sit for another day or two for review.


Re: [PATCH net-next v1] tcp: Add mark for TIMEWAIT sockets

2018-05-10 Thread Jonathan Maxwell
On Thu, May 10, 2018 at 3:45 PM, Eric Dumazet  wrote:
>
>
> On 05/09/2018 10:21 PM, Jon Maxwell wrote:
>
> ...
>
>>   if (th->rst)
>> @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, 
>> struct sk_buff *skb)
>>   arg.tos = ip_hdr(skb)->tos;
>>   arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
>>   local_bh_disable();
>> - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
>> + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
>> + if (sk && sk->sk_state == TCP_TIME_WAIT)
>> + ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
>> + else if (sk && sk_fullsock(sk))
>
> I do not believe we could have a non fullsock here ?
>

Okay thanks I'll make these changes to v2.

> A request socket (SYN_RECV state) at this point is not expected.
>
>
> So you can factorize :
>
> if (sk)
> ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
>   inet_twsk(sk)->tw_mark : sk->sk_mark;
>
> (same remark for IPv6)
>
>
>> + ctl_sk->sk_mark = sk->sk_mark;
>> + ip_send_unicast_reply(ctl_sk,
>> skb, _SKB_CB(skb)->header.h4.opt,
>> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>> , arg.iov[0].iov_len);
>
>


Re: [PATCH net-next] tcp: Add mark for TIMEWAIT sockets

2018-05-09 Thread Jonathan Maxwell
On Thu, May 10, 2018 at 1:32 PM, Eric Dumazet  wrote:
>
>
> On 05/09/2018 07:07 PM, Jon Maxwell wrote:
>> Aidan McGurn from Openwave Mobility systems reported the following bug:
>>
>> "Marked routing is broken on customer deployment. Its effects are large
>> increase in Uplink retransmissions caused by the client never receiving
>> the final ACK to their FINACK - this ACK misses the mark and routes out
>> of the incorrect route."
>>
>> Currently marks are added to sk_buffs for replies when the "fwmark_reflect"
>> sysctl is enabled. But not for TIME_WAIT sockets where the original socket 
>> had
>> sk->sk_mark set via setsockopt(SO_MARK..).
>>
>> Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the
>> original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark 
>> location.
>> Then copy this into ctl_sk->sk_mark so that the skb gets sent with the 
>> correct
>> mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence 
>> over
>> sk->sk_mark so that netfilter rules are still honored.
>>
>> Signed-off-by: Jon Maxwell 
>> ---
>>  include/net/inet_timewait_sock.h |  1 +
>>  net/ipv4/ip_output.c |  3 ++-
>>  net/ipv4/tcp_ipv4.c  | 18 --
>>  net/ipv4/tcp_minisocks.c |  1 +
>>  net/ipv6/tcp_ipv6.c  |  8 +++-
>>  5 files changed, 27 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/net/inet_timewait_sock.h 
>> b/include/net/inet_timewait_sock.h
>> index c7be1ca8e562..659d8ed5a3bc 100644
>> --- a/include/net/inet_timewait_sock.h
>> +++ b/include/net/inet_timewait_sock.h
>> @@ -62,6 +62,7 @@ struct inet_timewait_sock {
>>  #define tw_dr__tw_common.skc_tw_dr
>>
>>   int tw_timeout;
>> + __u32   tw_mark;
>>   volatile unsigned char  tw_substate;
>>   unsigned char   tw_rcv_wscale;
>>
>> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
>> index 95adb171f852..cca4412dc4cb 100644
>> --- a/net/ipv4/ip_output.c
>> +++ b/net/ipv4/ip_output.c
>> @@ -1539,6 +1539,7 @@ void ip_send_unicast_reply(struct sock *sk, struct 
>> sk_buff *skb,
>>   struct sk_buff *nskb;
>>   int err;
>>   int oif;
>> + __u32 mark = IP4_REPLY_MARK(net, skb->mark);
>>
>>   if (__ip_options_echo(net, , skb, sopt))
>>   return;
>> @@ -1561,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct 
>> sk_buff *skb,
>>   oif = skb->skb_iif;
>>
>>   flowi4_init_output(, oif,
>> -IP4_REPLY_MARK(net, skb->mark),
>> +mark ? (mark) : sk->sk_mark,
>
> You can avoid the declaration of mark variable and simply use here :
>
> IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
>

Thanks for the advice and suggestions Eric. That is more elegant. Will do in v1.

>>  RT_TOS(arg->tos),
>>  RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
>>  ip_reply_arg_flowi_flags(arg),
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index f70586b50838..fbee36579c83 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, 
>> struct sk_buff *skb)
>>   struct sock *sk1 = NULL;
>>  #endif
>>   struct net *net;
>> + struct sock *ctl_sk;
>>
>>   /* Never send a reset in response to a reset. */
>>   if (th->rst)
>> @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, 
>> struct sk_buff *skb)
>>   arg.tos = ip_hdr(skb)->tos;
>>   arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
>>   local_bh_disable();
>> - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
>> + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
>> + if (sk && sk->sk_state == TCP_TIME_WAIT)
>> + ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
>> + else if (sk && sk_fullsock(sk))
>> + ctl_sk->sk_mark = sk->sk_mark;
>> + ip_send_unicast_reply(ctl_sk,
>> skb, _SKB_CB(skb)->header.h4.opt,
>> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
>> , arg.iov[0].iov_len);
>>
>> + ctl_sk->sk_mark = 0;
>>   __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
>>   __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
>>   local_bh_enable();
>> @@ -759,6 +766,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
>>   } rep;
>>   struct net *net = sock_net(sk);
>>   struct ip_reply_arg arg;
>> + struct sock *ctl_sk;
>>
>>   memset(, 0, sizeof(struct tcphdr));
>>   memset(, 0, sizeof(arg));
>> @@ -809,11 +817,17 @@ static void tcp_v4_send_ack(const struct sock *sk,
>>   arg.tos = tos;
>>   arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
>>   local_bh_disable();
>> - 

Re: [PATCH net, v1] dccp/tcp: fix routing redirect race

2017-03-09 Thread Jonathan Maxwell
On Fri, Mar 10, 2017 at 3:23 PM, Eric Dumazet  wrote:
> On Fri, 2017-03-10 at 14:31 +1100, Jon Maxwell wrote:
>> As Eric Dumazet pointed out this also needs to be fixed in IPv6.
>> v1: Contains the IPv6 patch as well.
>
>> Fixes: ceb3320610d6 ("ipv4: Kill routes during PMTU/redirect updates.")
>> Cc: Eric Garver 
>> Cc: Hannes Sowa 
>> Signed-off-by: Jon Maxwell 
>> ---
>>  net/dccp/ipv4.c | 3 ++-
>>  net/ipv4/tcp_ipv4.c | 3 ++-
>>  net/ipv6/tcp_ipv6.c | 8 +---
>
> Hi Jon.
>
> You forgot net/dccp/ipv6.c
>
> dccp_v6_err() has the same issue.
>
>

Thanks Eric,

My bad. I'll fix that and post another patch.


Re: [PATCH net] dccp/tcp: fix routing redirect race

2017-03-08 Thread Jonathan Maxwell
On Thu, Mar 9, 2017 at 3:40 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Thu, 2017-03-09 at 14:42 +1100, Jonathan Maxwell wrote:
>> Sorry let me resend in plain text mode.
>>
>> On Thu, Mar 9, 2017 at 1:10 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
>> > On Thu, 2017-03-09 at 12:15 +1100, Jon Maxwell wrote:
>> >> We have seen a few incidents lately where a dst_entry has been freed
>> >> with a dangling TCP socket reference (sk->sk_dst_cache) pointing to that
>> >> dst_entry. If the conditions/timings are right a crash then ensues when 
>> >> the
>> >> freed dst_entry is referenced later on. A common crashing backtrace is:
>> >
>> > Very nice catch !
>> >
>>
>> Thanks Eric.
>>
>> > Don't we have a similar issue for IPv6 ?
>> >
>> >
>>
>> Good point.
>>
>> We checked and as far as we can tell IPv6 does not invalidate the route.
>> So it should be safer.
>
> Simply doing :
>
> __sk_dst_check(sk, np->dst_cookie);
>
> is racy, even before calling dst->ops->redirect(dst, sk, skb);
>
> (if socket is owned by user)
>
>
>

Okay, I will add a similar patch for IPv6 to also protect from that.


Re: [PATCH net] dccp/tcp: fix routing redirect race

2017-03-08 Thread Jonathan Maxwell
Sorry let me resend in plain text mode.

On Thu, Mar 9, 2017 at 1:10 PM, Eric Dumazet  wrote:
> On Thu, 2017-03-09 at 12:15 +1100, Jon Maxwell wrote:
>> We have seen a few incidents lately where a dst_entry has been freed
>> with a dangling TCP socket reference (sk->sk_dst_cache) pointing to that
>> dst_entry. If the conditions/timings are right a crash then ensues when the
>> freed dst_entry is referenced later on. A common crashing backtrace is:
>
> Very nice catch !
>

Thanks Eric.

> Don't we have a similar issue for IPv6 ?
>
>

Good point.

We checked and as far as we can tell IPv6 does not invalidate the route.
So it should be safer.


Re: [PATCHv1] net-next: treewide use is_vlan_dev() helper function.

2017-02-05 Thread Jonathan Maxwell
On Sun, Feb 5, 2017 at 4:00 AM, Parav Pandit  wrote:
> This patch makes use of is_vlan_dev() function instead of flag
> comparison which is exactly done by is_vlan_dev() helper function.
>
> Signed-off-by: Parav Pandit 
> Reviewed-by: Daniel Jurgens 
> ---
>  drivers/infiniband/core/cma.c|  6 ++
>  drivers/infiniband/sw/rxe/rxe_net.c  |  2 +-
>  drivers/net/ethernet/broadcom/cnic.c |  2 +-
>  drivers/net/ethernet/chelsio/cxgb3/l2t.c |  2 +-
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  |  4 ++--
>  drivers/net/ethernet/chelsio/cxgb4/l2t.c |  2 +-
>  drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |  8 
>  drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  4 ++--
>  drivers/net/hyperv/netvsc_drv.c  |  2 +-
>  drivers/scsi/bnx2fc/bnx2fc_fcoe.c|  6 +++---
>  drivers/scsi/cxgbi/libcxgbi.c|  6 +++---
>  drivers/scsi/fcoe/fcoe.c | 13 ++---
>  include/rdma/ib_addr.h   |  6 ++
>  net/hsr/hsr_slave.c  |  3 ++-
>  14 files changed, 31 insertions(+), 35 deletions(-)
>

Neatens the code up nicely.

Acked-by: Jon Maxwell 


Re: [PATCH net] ibmveth: calculate gso_segs for large packets

2016-12-15 Thread Jonathan Maxwell
On Wed, Dec 14, 2016 at 11:15 AM, Thomas Falcon
<tlfal...@linux.vnet.ibm.com> wrote:
> Include calculations to compute the number of segments
> that comprise an aggregated large packet.
>
> Signed-off-by: Thomas Falcon <tlfal...@linux.vnet.ibm.com>

Reviewed-by: Jonathan Maxwell <jmaxwel...@gmail.com>

> ---
>  drivers/net/ethernet/ibm/ibmveth.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
> b/drivers/net/ethernet/ibm/ibmveth.c
> index fbece63..a831f94 100644
> --- a/drivers/net/ethernet/ibm/ibmveth.c
> +++ b/drivers/net/ethernet/ibm/ibmveth.c
> @@ -1181,7 +1181,9 @@ static netdev_tx_t ibmveth_start_xmit(struct sk_buff 
> *skb,
>
>  static void ibmveth_rx_mss_helper(struct sk_buff *skb, u16 mss, int lrg_pkt)
>  {
> +   struct tcphdr *tcph;
> int offset = 0;
> +   int hdr_len;
>
> /* only TCP packets will be aggregated */
> if (skb->protocol == htons(ETH_P_IP)) {
> @@ -1208,14 +1210,20 @@ static void ibmveth_rx_mss_helper(struct sk_buff 
> *skb, u16 mss, int lrg_pkt)
> /* if mss is not set through Large Packet bit/mss in rx buffer,
>  * expect that the mss will be written to the tcp header checksum.
>  */
> +   tcph = (struct tcphdr *)(skb->data + offset);
> if (lrg_pkt) {
> skb_shinfo(skb)->gso_size = mss;
> } else if (offset) {
> -   struct tcphdr *tcph = (struct tcphdr *)(skb->data + offset);
> -
> skb_shinfo(skb)->gso_size = ntohs(tcph->check);
> tcph->check = 0;
> }
> +
> +   if (skb_shinfo(skb)->gso_size) {
> +   hdr_len = offset + tcph->doff * 4;
> +   skb_shinfo(skb)->gso_segs =
> +   DIV_ROUND_UP(skb->len - hdr_len,
> +skb_shinfo(skb)->gso_size);
> +   }
>  }
>
>  static int ibmveth_poll(struct napi_struct *napi, int budget)
> --
> 1.8.3.1
>


Re: [PATCH net-next] ibmveth: v1 calculate correct gso_size and set gso_type

2016-11-06 Thread Jonathan Maxwell
On Thu, Nov 3, 2016 at 8:40 AM, Brian King  wrote:
> On 10/27/2016 10:26 AM, Eric Dumazet wrote:
>> On Wed, 2016-10-26 at 11:09 +1100, Jon Maxwell wrote:
>>> We recently encountered a bug where a few customers using ibmveth on the
>>> same LPAR hit an issue where a TCP session hung when large receive was
>>> enabled. Closer analysis revealed that the session was stuck because the
>>> one side was advertising a zero window repeatedly.
>>>
>>> We narrowed this down to the fact the ibmveth driver did not set gso_size
>>> which is translated by TCP into the MSS later up the stack. The MSS is
>>> used to calculate the TCP window size and as that was abnormally large,
>>> it was calculating a zero window, even although the sockets receive buffer
>>> was completely empty.
>>>
>>> We were able to reproduce this and worked with IBM to fix this. Thanks Tom
>>> and Marcelo for all your help and review on this.
>>>
>>> The patch fixes both our internal reproduction tests and our customers 
>>> tests.
>>>
>>> Signed-off-by: Jon Maxwell 
>>> ---
>>>  drivers/net/ethernet/ibm/ibmveth.c | 20 
>>>  1 file changed, 20 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
>>> b/drivers/net/ethernet/ibm/ibmveth.c
>>> index 29c05d0..c51717e 100644
>>> --- a/drivers/net/ethernet/ibm/ibmveth.c
>>> +++ b/drivers/net/ethernet/ibm/ibmveth.c
>>> @@ -1182,6 +1182,8 @@ static int ibmveth_poll(struct napi_struct *napi, int 
>>> budget)
>>>  int frames_processed = 0;
>>>  unsigned long lpar_rc;
>>>  struct iphdr *iph;
>>> +bool large_packet = 0;
>>> +u16 hdr_len = ETH_HLEN + sizeof(struct tcphdr);
>>>
>>>  restart_poll:
>>>  while (frames_processed < budget) {
>>> @@ -1236,10 +1238,28 @@ static int ibmveth_poll(struct napi_struct *napi, 
>>> int budget)
>>>  iph->check = 0;
>>>  iph->check = 
>>> ip_fast_csum((unsigned char *)iph, iph->ihl);
>>>  adapter->rx_large_packets++;
>>> +large_packet = 1;
>>>  }
>>>  }
>>>  }
>>>
>>> +if (skb->len > netdev->mtu) {
>>> +iph = (struct iphdr *)skb->data;
>>> +if (be16_to_cpu(skb->protocol) == ETH_P_IP &&
>>> +iph->protocol == IPPROTO_TCP) {
>>> +hdr_len += sizeof(struct iphdr);
>>> +skb_shinfo(skb)->gso_type = 
>>> SKB_GSO_TCPV4;
>>> +skb_shinfo(skb)->gso_size = 
>>> netdev->mtu - hdr_len;
>>> +} else if (be16_to_cpu(skb->protocol) == 
>>> ETH_P_IPV6 &&
>>> +   iph->protocol == IPPROTO_TCP) {
>>> +hdr_len += sizeof(struct ipv6hdr);
>>> +skb_shinfo(skb)->gso_type = 
>>> SKB_GSO_TCPV6;
>>> +skb_shinfo(skb)->gso_size = 
>>> netdev->mtu - hdr_len;
>>> +}
>>> +if (!large_packet)
>>> +adapter->rx_large_packets++;
>>> +}
>>> +
>>>
>>
>> This might break forwarding and PMTU discovery.
>>
>> You force gso_size to device mtu, regardless of real MSS used by the TCP
>> sender.
>>
>> Don't you have the MSS provided in RX descriptor, instead of guessing
>> the value ?
>
> We've had some further discussions on this with the Virtual I/O Server (VIOS)
> development team. The large receive aggregation in the VIOS (AIX based) is 
> actually
> being done by software in the VIOS. What they may be able to do is when 
> performing
> this aggregation, they could look at the packet lengths of all the packets 
> being
> aggregated and take the largest packet size within the aggregation unit, 
> minus the
> header length and return that to the virtual ethernet client which we could 
> then stuff
> into gso_size. They are currently assessing how feasible this would be to do 
> and whether
> it would impact other bits of the code. However, assuming this does end up 
> being an option,
> would this address the concerns here or is that going to break something else 
> I'm
> not thinking of?

I was discussing this with a colleague and although this is better than
what we have so far, we wonder if there could be a corner case where
it ends up with a smaller value than the current MSS. For example if
the application sent a burst of small TCP packets with the PUSH
bit set. In that case they may not be coalesced by GRO. The VIOS could
probably be coded to detect that condition and use the previous MSS.
But that may not necessarily be the current MSS.

The ibmveth driver passes 

Re: [PATCH net-next] ibmveth: v1 calculate correct gso_size and set gso_type

2016-10-29 Thread Jonathan Maxwell
On Fri, Oct 28, 2016 at 5:08 AM, Eric Dumazet  wrote:
> On Thu, 2016-10-27 at 12:54 -0500, Thomas Falcon wrote:
>> On 10/27/2016 10:26 AM, Eric Dumazet wrote:
>> > On Wed, 2016-10-26 at 11:09 +1100, Jon Maxwell wrote:
>> >> We recently encountered a bug where a few customers using ibmveth on the
>> >> same LPAR hit an issue where a TCP session hung when large receive was
>> >> enabled. Closer analysis revealed that the session was stuck because the
>> >> one side was advertising a zero window repeatedly.
>> >>
>> >> We narrowed this down to the fact the ibmveth driver did not set gso_size
>> >> which is translated by TCP into the MSS later up the stack. The MSS is
>> >> used to calculate the TCP window size and as that was abnormally large,
>> >> it was calculating a zero window, even although the sockets receive buffer
>> >> was completely empty.
>> >>
>> >> We were able to reproduce this and worked with IBM to fix this. Thanks Tom
>> >> and Marcelo for all your help and review on this.
>> >>
>> >> The patch fixes both our internal reproduction tests and our customers 
>> >> tests.
>> >>
>> >> Signed-off-by: Jon Maxwell 
>> >> ---
>> >>  drivers/net/ethernet/ibm/ibmveth.c | 20 
>> >>  1 file changed, 20 insertions(+)
>> >>
>> >> diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
>> >> b/drivers/net/ethernet/ibm/ibmveth.c
>> >> index 29c05d0..c51717e 100644
>> >> --- a/drivers/net/ethernet/ibm/ibmveth.c
>> >> +++ b/drivers/net/ethernet/ibm/ibmveth.c
>> >> @@ -1182,6 +1182,8 @@ static int ibmveth_poll(struct napi_struct *napi, 
>> >> int budget)
>> >>int frames_processed = 0;
>> >>unsigned long lpar_rc;
>> >>struct iphdr *iph;
>> >> +  bool large_packet = 0;
>> >> +  u16 hdr_len = ETH_HLEN + sizeof(struct tcphdr);
>> >>
>> >>  restart_poll:
>> >>while (frames_processed < budget) {
>> >> @@ -1236,10 +1238,28 @@ static int ibmveth_poll(struct napi_struct *napi, 
>> >> int budget)
>> >>iph->check = 0;
>> >>iph->check = 
>> >> ip_fast_csum((unsigned char *)iph, iph->ihl);
>> >>adapter->rx_large_packets++;
>> >> +  large_packet = 1;
>> >>}
>> >>}
>> >>}
>> >>
>> >> +  if (skb->len > netdev->mtu) {
>> >> +  iph = (struct iphdr *)skb->data;
>> >> +  if (be16_to_cpu(skb->protocol) == ETH_P_IP &&
>> >> +  iph->protocol == IPPROTO_TCP) {
>> >> +  hdr_len += sizeof(struct iphdr);
>> >> +  skb_shinfo(skb)->gso_type = 
>> >> SKB_GSO_TCPV4;
>> >> +  skb_shinfo(skb)->gso_size = 
>> >> netdev->mtu - hdr_len;
>> >> +  } else if (be16_to_cpu(skb->protocol) == 
>> >> ETH_P_IPV6 &&
>> >> + iph->protocol == IPPROTO_TCP) {
>> >> +  hdr_len += sizeof(struct ipv6hdr);
>> >> +  skb_shinfo(skb)->gso_type = 
>> >> SKB_GSO_TCPV6;
>> >> +  skb_shinfo(skb)->gso_size = 
>> >> netdev->mtu - hdr_len;
>> >> +  }
>> >> +  if (!large_packet)
>> >> +  adapter->rx_large_packets++;
>> >> +  }
>> >> +
>> >>
>> > This might break forwarding and PMTU discovery.
>> >
>> > You force gso_size to device mtu, regardless of real MSS used by the TCP
>> > sender.
>> >
>> > Don't you have the MSS provided in RX descriptor, instead of guessing
>> > the value ?
>> >
>> >
>> >
>> The MSS is not always available unfortunately, so this is the best solution 
>> there is at the moment.
>
> Hmm... then what about skb_shinfo(skb)->gso_segs ?
>
> ip_rcv() for example has :
>
> __IP_ADD_STATS(net,
>IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
>max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
>
>

Okay thanks Eric. As the MSS is the main concern, let me work with the
team here and see if we can find a better way to do this. Will take care of
the other points you raised at the same time.

>
> Also prefer : (skb->protocol == htons(ETH_P_IP)) tests
>
> And the ipv6 test is wrong :
>
> } else if (be16_to_cpu(skb->protocol) == ETH_P_IPV6 &&
>iph->protocol == IPPROTO_TCP) {
>
>
> Since iph is a pointer to ipv4 iphdr .
>
>
>


Re: [PATCH net-next] ibmveth: calculate correct gso_size and set gso_type

2016-10-25 Thread Jonathan Maxwell
>> + u16 hdr_len = ETH_HLEN + sizeof(struct tcphdr);

> Compiler may optimize this, but maybe move hdr_len to [*] ?

There are other places in the stack where a u16 is used for the
same purpose. So I'd rather stick to that convention.

I'll make the other formatting changes you suggested and
resubmit as v1.

Thanks

Jon

On Tue, Oct 25, 2016 at 9:31 PM, Marcelo Ricardo Leitner
 wrote:
> On Tue, Oct 25, 2016 at 04:13:41PM +1100, Jon Maxwell wrote:
>> We recently encountered a bug where a few customers using ibmveth on the
>> same LPAR hit an issue where a TCP session hung when large receive was
>> enabled. Closer analysis revealed that the session was stuck because the
>> one side was advertising a zero window repeatedly.
>>
>> We narrowed this down to the fact the ibmveth driver did not set gso_size
>> which is translated by TCP into the MSS later up the stack. The MSS is
>> used to calculate the TCP window size and as that was abnormally large,
>> it was calculating a zero window, even although the sockets receive buffer
>> was completely empty.
>>
>> We were able to reproduce this and worked with IBM to fix this. Thanks Tom
>> and Marcelo for all your help and review on this.
>>
>> The patch fixes both our internal reproduction tests and our customers tests.
>>
>> Signed-off-by: Jon Maxwell 
>> ---
>>  drivers/net/ethernet/ibm/ibmveth.c | 19 +++
>>  1 file changed, 19 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
>> b/drivers/net/ethernet/ibm/ibmveth.c
>> index 29c05d0..3028c33 100644
>> --- a/drivers/net/ethernet/ibm/ibmveth.c
>> +++ b/drivers/net/ethernet/ibm/ibmveth.c
>> @@ -1182,6 +1182,8 @@ static int ibmveth_poll(struct napi_struct *napi, int 
>> budget)
>>   int frames_processed = 0;
>>   unsigned long lpar_rc;
>>   struct iphdr *iph;
>> + bool large_packet = 0;
>> + u16 hdr_len = ETH_HLEN + sizeof(struct tcphdr);
>
> Compiler may optimize this, but maybe move hdr_len to [*] ?
>
>>
>>  restart_poll:
>>   while (frames_processed < budget) {
>> @@ -1236,10 +1238,27 @@ static int ibmveth_poll(struct napi_struct *napi, 
>> int budget)
>>   iph->check = 0;
>>   iph->check = 
>> ip_fast_csum((unsigned char *)iph, iph->ihl);
>>   adapter->rx_large_packets++;
>> + large_packet = 1;
>>   }
>>   }
>>   }
>>
>> + if (skb->len > netdev->mtu) {
>
> [*]
>
>> + iph = (struct iphdr *)skb->data;
>> + if (be16_to_cpu(skb->protocol) == ETH_P_IP && 
>> iph->protocol == IPPROTO_TCP) {
>
> The if line above is too long, should be broken in two.
>
>> + hdr_len += sizeof(struct iphdr);
>> + skb_shinfo(skb)->gso_type = 
>> SKB_GSO_TCPV4;
>> + skb_shinfo(skb)->gso_size = 
>> netdev->mtu - hdr_len;
>> + } else if (be16_to_cpu(skb->protocol) == 
>> ETH_P_IPV6 &&
>> + iph->protocol == IPPROTO_TCP) {
> ^
> And this one should start 3 spaces later, right below be16_
>
>   Marcelo
>
>> + hdr_len += sizeof(struct ipv6hdr);
>> + skb_shinfo(skb)->gso_type = 
>> SKB_GSO_TCPV6;
>> + skb_shinfo(skb)->gso_size = 
>> netdev->mtu - hdr_len;
>> + }
>> + if (!large_packet)
>> + adapter->rx_large_packets++;
>> + }
>> +
>>   napi_gro_receive(napi, skb);/* send it up */
>>
>>   netdev->stats.rx_packets++;
>> --
>> 1.8.3.1
>>


Re: [PATCH net] netconsole: Check for carrier before calling netpoll_send_udp()

2015-08-11 Thread Jonathan Maxwell
 I personally think that drivers need to synchronize such things
 internally.  They are the only entity which knows when it's OK
 to do whatever the netpoll method does, and they are also the only
 entity which can properly synchronize such checks.

Thanks agreed. I am testing the following ixgbe patch on my reproducer
that checks for resetting/removing/down state flags in ixgbe_poll() and
bails if true. It does that check in other ixgbe routines as well. It's working
fine so far. We will need to do something similar for vmxnet3 as well and
possibly other drivers.

--- a/drivers/net/ixgbe/ixgbe_main.c 2015-08-10 17:13:02.899400508 +1000
+++ b/drivers/net/ixgbe/ixgbe_main.c.patch 2015-08-12 11:34:49.951053887 +1000
@@ -2672,6 +2672,11 @@
  int per_ring_budget;
  bool clean_complete = true;

+ if (test_bit(__IXGBE_DOWN, &adapter->state) ||
+test_bit(__IXGBE_REMOVING, &adapter->state) ||
+test_bit(__IXGBE_RESETTING, &adapter->state))
+ return budget;
+
 #ifdef CONFIG_IXGBE_DCA
  if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
  ixgbe_update_dca(q_vector);
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] netconsole: Check for carrier before calling netpoll_send_udp()

2015-08-10 Thread Jonathan Maxwell
 What if the carrier check passes, and then the chip reset starts on
 another cpu?  You'll have the same problem.

Okay, let me see if I can come up with a better way to mitigate this.

On Tue, Aug 11, 2015 at 2:22 PM, David Miller da...@davemloft.net wrote:
 From: Jon Maxwell jmaxwel...@gmail.com
 Date: Tue, 11 Aug 2015 11:32:26 +1000

 We have seen a few crashes recently where a NIC is getting
 reset for some reason and then the driver or another module calls
 printk() which invokes netconsole. Netconsole then calls the
 adapter specific poll routine via netpoll which crashes because
 the adapter is resetting and its structures are being reinitialized.

 This isn't a fix.

 What if the carrier check passes, and then the chip reset starts on
 another cpu?  You'll have the same problem.

 I'm not applying this, sorry.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] tcp: always send a quick ack when quickacks are enabled

2015-07-07 Thread Jonathan Maxwell
 On Tue, 2015-07-07 at 14:22 +1000, Jon Maxwell wrote:


  @@ -4887,6 +4884,7 @@ static inline void tcp_data_snd_check(struct sock
  *sk)
   static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
   {
   struct tcp_sock *tp = tcp_sk(sk);
  +const struct dst_entry *dst = __sk_dst_get(sk);
 
   /* More than one full frame received... */
   if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
  @@ -4896,6 +4894,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int
  ofo_possible)
__tcp_select_window(sk) >= tp->rcv_wnd) ||
   /* We ACK each frame or... */
   tcp_in_quickack_mode(sk) ||
  +/* quickack on dst */
  +(dst && dst_metric(dst, RTAX_QUICKACK)) ||
   /* We have out of order data. */
   (ofo_possible && skb_peek(&tp->out_of_order_queue))) {

 This logic should be moved to tcp_in_quickack_mode() ?

Yes agreed that's a better place for the check seeing that we
already check the other quickack conditions there as well.


 Note I placed the dst test before others, to reduce jump prediction
 misses.

 diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
 index 684f095d196e..69ec8d25a2e5 100644
 --- a/net/ipv4/tcp_input.c
 +++ b/net/ipv4/tcp_input.c
 @@ -196,11 +196,13 @@ static void tcp_enter_quickack_mode(struct sock *sk)
   * and the session is not interactive.
   */

 -static inline bool tcp_in_quickack_mode(const struct sock *sk)
 +static bool tcp_in_quickack_mode(struct sock *sk)
  {
  const struct inet_connection_sock *icsk = inet_csk(sk);
 +const struct dst_entry *dst = __sk_dst_get(sk);

  -return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
  +return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
  +   (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
  }

  static void tcp_ecn_queue_cwr(struct tcp_sock *tp)



On Tue, Jul 7, 2015 at 5:05 PM, Eric Dumazet eric.duma...@gmail.com wrote:
 On Tue, 2015-07-07 at 14:22 +1000, Jon Maxwell wrote:


 @@ -4887,6 +4884,7 @@ static inline void tcp_data_snd_check(struct sock *sk)
  static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
  {
   struct tcp_sock *tp = tcp_sk(sk);
 + const struct dst_entry *dst = __sk_dst_get(sk);

   /* More than one full frame received... */
  if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
 @@ -4896,6 +4894,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int 
 ofo_possible)
__tcp_select_window(sk) >= tp->rcv_wnd) ||
   /* We ACK each frame or... */
   tcp_in_quickack_mode(sk) ||
 + /* quickack on dst */
  + (dst && dst_metric(dst, RTAX_QUICKACK)) ||
   /* We have out of order data. */
   (ofo_possible && skb_peek(&tp->out_of_order_queue))) {

 This logic should be moved to tcp_in_quickack_mode() ?

 Note I placed the dst test before others, to reduce jump prediction
 misses.

 diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
 index 684f095d196e..69ec8d25a2e5 100644
 --- a/net/ipv4/tcp_input.c
 +++ b/net/ipv4/tcp_input.c
 @@ -196,11 +196,13 @@ static void tcp_enter_quickack_mode(struct sock *sk)
   * and the session is not interactive.
   */

 -static inline bool tcp_in_quickack_mode(const struct sock *sk)
 +static bool tcp_in_quickack_mode(struct sock *sk)
  {
 const struct inet_connection_sock *icsk = inet_csk(sk);
 +   const struct dst_entry *dst = __sk_dst_get(sk);

  -   return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
  +   return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
  +  (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
  }

  static void tcp_ecn_queue_cwr(struct tcp_sock *tp)




--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html