Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Willem de Bruijn
On Tue, Feb 28, 2017 at 7:28 PM, Tom Herbert  wrote:
> On Tue, Feb 28, 2017 at 3:22 PM, Eric Dumazet  wrote:
>> On Tue, 2017-02-28 at 14:52 -0800, Andy Lutomirski wrote:
>>
>>> The user pages are a gift to the kernel.  The application  may  not
>>> modify this memory ever, otherwise the page cache and on-disk data may
>>> differ.
>>>
>>> This is just not okay IMO.
>>
>> TCP works just fine in this case.
>>
>> TX checksum will be computed by the NIC after/while data is copied.
>>
>> If really the application changes the data, that will not cause any
>> problems, other than user side consistency.
>>
>> This is why we require a copy (for all buffers that came from zero-copy)
>> if network stack hits a device that can not offload TX checksum.
>>
>> Even pwrite() does not guarantee consistency if multiple threads are
>> using it on overlapping regions.
>>
> The Mellanox team working on TLS offload pointed out to us that if
> data is changed for a retransmit then it becomes trivial for someone
> snooping to break the encryption. Sounds pretty scary and it would be
> a shame if we couldn't use zero-copy in that use case :-( Hopefully we
> can find a solution...
>

This requires collusion by the process initiating the zerocopy send
to help the entity snooping the link. That could be an attack on admin
configured tunnels, but user-directed encryption offload like AF_TLS
can still use zerocopy.


Re: [patch] net/mlx4: && vs & typo

2017-02-28 Thread Julia Lawall
On Tue, 28 Feb 2017, Bart Van Assche wrote:

> On 02/28/2017 02:23 PM, Joe Perches wrote:
> > On Tue, 2017-02-28 at 15:35 +, Bart Van Assche wrote:
> >> On Tue, 2017-02-28 at 15:02 +0300, Dan Carpenter wrote:
> >>> Bitwise & was obviously intended here.
> > []
> >>> diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
> > []
> >>> @@ -109,7 +109,7 @@ static inline void(u8 *addr, u64 mac)
> >>>   int i;
> >>>
> >>>   for (i = ETH_ALEN; i > 0; i--) {
> >>> - addr[i - 1] = mac && 0xFF;
> >>> + addr[i - 1] = mac & 0xFF;
> >>>   mac >>= 8;
> >>>   }
> >>>  }
> >>
> >> Is this the only place where such a loop occurs?
> >
> > Seems to be.
> >
> >> Should a put_unaligned_be48()
> >> function be introduced?
> >
> > Why?  This is used exactly once.
>
> Really? Here is an example of another open-coded version of
> put_unaligned_be48() from arch/mips/cavium-octeon/octeon-platform.c:
>
>   new_mac[0] = (mac >> 40) & 0xff;
>   new_mac[1] = (mac >> 32) & 0xff;
>   new_mac[2] = (mac >> 24) & 0xff;
>   new_mac[3] = (mac >> 16) & 0xff;
>   new_mac[4] = (mac >> 8) & 0xff;
>   new_mac[5] = mac & 0xff;

drivers/media/radio/radio-shark2.c:
for (i = 0; i < 6; i++)
   shark->transfer_buffer[i + 1] = (reg >> (40 - i * 8)) & 0xff;

drivers/rtc/rtc-ab3100.c
   buf[0] = (hw_counter) & 0xFF;
   buf[1] = (hw_counter >> 8) & 0xFF;
   buf[2] = (hw_counter >> 16) & 0xFF;
   buf[3] = (hw_counter >> 24) & 0xFF;
   buf[4] = (hw_counter >> 32) & 0xFF;
   buf[5] = (hw_counter >> 40) & 0xFF;

drivers/net/ethernet/sun/ldmvsw.c
for (i = 0; i < ETH_ALEN; i++)
   port->raddr[i] = (*rmac >> (5 - i) * 8) & 0xff;

drivers/net/ethernet/sun/sunvnet.c
for (i = 0; i < ETH_ALEN; i++)
   dev->dev_addr[i] = (*local_mac >> (5 - i) * 8) & 0xff;

for (i = 0; i < ETH_ALEN; i++)
   port->raddr[i] = (*rmac >> (5 - i) * 8) & 0xff;

julia

>
> Bart.
> --
> To unsubscribe from this list: send the line "unsubscribe kernel-janitors" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>


[PATCH-v4-RESEND 2/4] vhost-vsock: add pkt cancel capability

2017-02-28 Thread Peng Tao
To allow canceling all packets of a connection.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Jorgen Hansen 
Signed-off-by: Peng Tao 
---
 drivers/vhost/vsock.c  | 41 +
 include/net/af_vsock.h |  3 +++
 2 files changed, 44 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index ce5e63d..57babce 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -223,6 +223,46 @@ vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt)
return len;
 }
 
+static int
+vhost_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+   struct vhost_vsock *vsock;
+   struct virtio_vsock_pkt *pkt, *n;
+   int cnt = 0;
+   LIST_HEAD(freeme);
+
+   /* Find the vhost_vsock according to guest context id  */
+   vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
+   if (!vsock)
+   return -ENODEV;
+
+   spin_lock_bh(&vsock->send_pkt_list_lock);
+   list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+   if (pkt->cancel_token != vsk)
+   continue;
+   list_move(&pkt->list, &freeme);
+   }
+   spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+   list_for_each_entry_safe(pkt, n, &freeme, list) {
+   if (pkt->reply)
+   cnt++;
+   list_del(&pkt->list);
+   virtio_transport_free_pkt(pkt);
+   }
+
+   if (cnt) {
+   struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+   int new_cnt;
+
+   new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
+   if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
+   vhost_poll_queue(&tx_vq->poll);
+   }
+
+   return 0;
+}
+
 static struct virtio_vsock_pkt *
 vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
  unsigned int out, unsigned int in)
@@ -675,6 +715,7 @@ static struct virtio_transport vhost_transport = {
.release  = virtio_transport_release,
.connect  = virtio_transport_connect,
.shutdown = virtio_transport_shutdown,
+   .cancel_pkt   = vhost_transport_cancel_pkt,
 
.dgram_enqueue= virtio_transport_dgram_enqueue,
.dgram_dequeue= virtio_transport_dgram_dequeue,
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index f275896..f32ed9a 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -100,6 +100,9 @@ struct vsock_transport {
void (*destruct)(struct vsock_sock *);
void (*release)(struct vsock_sock *);
 
+   /* Cancel all pending packets sent on vsock. */
+   int (*cancel_pkt)(struct vsock_sock *vsk);
+
/* Connections. */
int (*connect)(struct vsock_sock *);
 
-- 
2.7.4



[PATCH-v4-RESEND 3/4] vsock: add pkt cancel capability

2017-02-28 Thread Peng Tao
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Peng Tao 
---
 net/vmw_vsock/virtio_transport.c | 42 
 1 file changed, 42 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 6788264..f5c44b5 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -213,6 +213,47 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
return len;
 }
 
+static int
+virtio_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+   struct virtio_vsock *vsock;
+   struct virtio_vsock_pkt *pkt, *n;
+   int cnt = 0;
+   LIST_HEAD(freeme);
+
+   vsock = virtio_vsock_get();
+   if (!vsock) {
+   return -ENODEV;
+   }
+
+   spin_lock_bh(&vsock->send_pkt_list_lock);
+   list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+   if (pkt->cancel_token != vsk)
+   continue;
+   list_move(&pkt->list, &freeme);
+   }
+   spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+   list_for_each_entry_safe(pkt, n, &freeme, list) {
+   if (pkt->reply)
+   cnt++;
+   list_del(&pkt->list);
+   virtio_transport_free_pkt(pkt);
+   }
+
+   if (cnt) {
+   struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+   int new_cnt;
+
+   new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
+   if (new_cnt + cnt >= virtqueue_get_vring_size(rx_vq) &&
+   new_cnt < virtqueue_get_vring_size(rx_vq))
+   queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+   }
+
+   return 0;
+}
+
 static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
 {
int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
@@ -462,6 +503,7 @@ static struct virtio_transport virtio_transport = {
.release  = virtio_transport_release,
.connect  = virtio_transport_connect,
.shutdown = virtio_transport_shutdown,
+   .cancel_pkt   = virtio_transport_cancel_pkt,
 
.dgram_bind   = virtio_transport_dgram_bind,
.dgram_dequeue= virtio_transport_dgram_dequeue,
-- 
2.7.4



[PATCH-v4-RESEND 4/4] vsock: cancel packets when failing to connect

2017-02-28 Thread Peng Tao
Otherwise we'll leave the packets queued until releasing vsock device.
E.g., if guest is slow to start up, resulting ETIMEDOUT on connect, guest
will get the connect requests from failed host sockets.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Jorgen Hansen 
Signed-off-by: Peng Tao 
---
 net/vmw_vsock/af_vsock.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 8a398b3..c73b03a 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1101,10 +1101,19 @@ static const struct proto_ops vsock_dgram_ops = {
.sendpage = sock_no_sendpage,
 };
 
+static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+   if (!transport->cancel_pkt)
+   return -EOPNOTSUPP;
+
+   return transport->cancel_pkt(vsk);
+}
+
 static void vsock_connect_timeout(struct work_struct *work)
 {
struct sock *sk;
struct vsock_sock *vsk;
+   int cancel = 0;
 
vsk = container_of(work, struct vsock_sock, dwork.work);
sk = sk_vsock(vsk);
@@ -1115,8 +1124,11 @@ static void vsock_connect_timeout(struct work_struct 
*work)
sk->sk_state = SS_UNCONNECTED;
sk->sk_err = ETIMEDOUT;
sk->sk_error_report(sk);
+   cancel = 1;
}
release_sock(sk);
+   if (cancel)
+   vsock_transport_cancel_pkt(vsk);
 
sock_put(sk);
 }
@@ -1223,11 +1235,13 @@ static int vsock_stream_connect(struct socket *sock, 
struct sockaddr *addr,
err = sock_intr_errno(timeout);
sk->sk_state = SS_UNCONNECTED;
sock->state = SS_UNCONNECTED;
+   vsock_transport_cancel_pkt(vsk);
goto out_wait;
} else if (timeout == 0) {
err = -ETIMEDOUT;
sk->sk_state = SS_UNCONNECTED;
sock->state = SS_UNCONNECTED;
+   vsock_transport_cancel_pkt(vsk);
goto out_wait;
}
 
-- 
2.7.4



[PATCH-v4-RESEND 0/4] vsock: cancel connect packets when failing to connect

2017-02-28 Thread Peng Tao
Hi David,

These patchsets were sent before and reviewed by Stefan and Jorgen 
[https://www.spinics.net/lists/kvm/msg142367.html].
If there is any blocker, please do tell and I'll see to it. Thanks!

Currently, if a connect call fails on a signal or timeout (e.g., guest is still
in the process of starting up), we'll just return to caller and leave the 
connect
packet queued and they are sent even though the connection is considered a 
failure,
which can confuse applications with unwanted false connect attempt.

The patchset enables vsock (both host and guest) to cancel queued packets when
a connect attempt is considered to fail.

v4 changelog:
  - drop two unnecessary void * cast
  - update new callback comment
v3 changelog:
  - define cancel_pkt callback in struct vsock_transport rather than struct 
virtio_transport
  - rename virtio_vsock_pkt->vsk to virtio_vsock_pkt->cancel_token
v2 changelog:
  - fix queued_replies counting and resume tx/rx when necessary

Cheers,
Tao

Peng Tao (4):
  vsock: track pkt owner vsock
  vhost-vsock: add pkt cancel capability
  vsock: add pkt cancel capability
  vsock: cancel packets when failing to connect

 drivers/vhost/vsock.c   | 41 
 include/linux/virtio_vsock.h|  2 ++
 include/net/af_vsock.h  |  3 +++
 net/vmw_vsock/af_vsock.c| 14 +++
 net/vmw_vsock/virtio_transport.c| 42 +
 net/vmw_vsock/virtio_transport_common.c |  7 ++
 6 files changed, 109 insertions(+)

-- 
2.7.4



[PATCH-v4-RESEND 1/4] vsock: track pkt owner vsock

2017-02-28 Thread Peng Tao
So that we can cancel a queued pkt later if necessary.

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Peng Tao 
---
 include/linux/virtio_vsock.h| 2 ++
 net/vmw_vsock/virtio_transport_common.c | 7 +++
 2 files changed, 9 insertions(+)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 9638bfe..193ad3a 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -48,6 +48,7 @@ struct virtio_vsock_pkt {
struct virtio_vsock_hdr hdr;
struct work_struct work;
struct list_head list;
+   void *cancel_token; /* only used for cancellation */
void *buf;
u32 len;
u32 off;
@@ -56,6 +57,7 @@ struct virtio_vsock_pkt {
 
 struct virtio_vsock_pkt_info {
u32 remote_cid, remote_port;
+   struct vsock_sock *vsk;
struct msghdr *msg;
u32 pkt_len;
u16 type;
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 849c4ad..ab505f1 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -57,6 +57,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
pkt->len= len;
pkt->hdr.len= cpu_to_le32(len);
pkt->reply  = info->reply;
+   pkt->cancel_token   = info->vsk;
 
if (info->msg && len > 0) {
pkt->buf = kmalloc(len, GFP_KERNEL);
@@ -179,6 +180,7 @@ static int virtio_transport_send_credit_update(struct 
vsock_sock *vsk,
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
.type = type,
+   .vsk = vsk,
};
 
return virtio_transport_send_pkt_info(vsk, &info);
@@ -518,6 +520,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_REQUEST,
.type = VIRTIO_VSOCK_TYPE_STREAM,
+   .vsk = vsk,
};
 
return virtio_transport_send_pkt_info(vsk, &info);
@@ -533,6 +536,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int 
mode)
  VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
 (mode & SEND_SHUTDOWN ?
  VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
+   .vsk = vsk,
};
 
return virtio_transport_send_pkt_info(vsk, &info);
@@ -559,6 +563,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
.type = VIRTIO_VSOCK_TYPE_STREAM,
.msg = msg,
.pkt_len = len,
+   .vsk = vsk,
};
 
return virtio_transport_send_pkt_info(vsk, &info);
@@ -580,6 +585,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
.op = VIRTIO_VSOCK_OP_RST,
.type = VIRTIO_VSOCK_TYPE_STREAM,
.reply = !!pkt,
+   .vsk = vsk,
};
 
/* Send RST only if the original pkt is not a RST pkt */
@@ -825,6 +831,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
.remote_cid = le64_to_cpu(pkt->hdr.src_cid),
.remote_port = le32_to_cpu(pkt->hdr.src_port),
.reply = true,
+   .vsk = vsk,
};
 
return virtio_transport_send_pkt_info(vsk, &info);
-- 
2.7.4



Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Eric Dumazet
On Tue, 2017-02-28 at 22:28 -0500, David Miller wrote:

> These device are already choking, because as I stated this can already
> be done via sendfile().
> 
> Networking card wise this isn't an issue, chips bring the entire packet
> into their FIFO, compute checksums on the fly mid-stream, and then write
> the 16-bit checksum field before starting to write the packet onto the
> wire.
> 
> I think this is completely a non-issue, and we thought about this right
> from the start when sendfile() support was added nearly two decades ago.
> If network cards from back then didn't crap out in this situation I
> think the ones out there now are probably ok.

Well, we had to fix one issue with GSO fall back about 4 years ago.

https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=cef401de7be8c4e155c6746bfccf721a4fa5fab9

So extra scrutiny would be nice.

Zero copy is incredibly hard to get right.





Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread David Miller
From: Andy Lutomirski 
Date: Tue, 28 Feb 2017 13:06:49 -0800

> On Tue, Feb 28, 2017 at 12:43 PM, Willem de Bruijn
>  wrote:
>> On Tue, Feb 28, 2017 at 2:46 PM, Andy Lutomirski  wrote:
>>> On Mon, Feb 27, 2017 at 10:57 AM, Michael Kerrisk
>>>  wrote:
 [CC += linux-...@vger.kernel.org]

 Hi Willem

>>>
> On a send call with MSG_ZEROCOPY, the kernel pins the user pages and
> creates skbuff fragments directly from these pages. On tx completion,
> it notifies the socket owner that it is safe to modify memory by
> queuing a completion notification onto the socket error queue.
>>>
>>> What happens if the user writes to the pages while it's not safe?
>>>
>>> How about if you're writing to an interface or a route that has crypto
>>> involved and a malicious user can make the data change in the middle
>>> of a crypto operation, thus perhaps leaking the entire key?  (I
>>> wouldn't be at all surprised if a lot of provably secure AEAD
>>> constructions are entirely compromised if an attacker can get the
>>> ciphertext and tag computed from a message that changed during the
>>> computation.
>>
>> Operations that read or write payload, such as this crypto example,
>> but also ebpf in tc or iptables, for instance, demand a deep copy using
>> skb_copy_ubufs before the operation.
>>
>> This blacklist approach requires caution, but these paths should be
>> few and countable. It is not possible to predict at the socket layer
>> whether a packet will encounter any such operation, so white-listing
>> a subset of end-to-end paths is not practical.
> 
> How about hardware that malfunctions if the packet changes out from
> under it?  A whitelist seems quite a bit safer.

These device are already choking, because as I stated this can already
be done via sendfile().

Networking card wise this isn't an issue, chips bring the entire packet
into their FIFO, compute checksums on the fly mid-stream, and then write
the 16-bit checksum field before starting to write the packet onto the
wire.

I think this is completely a non-issue, and we thought about this right
from the start when sendfile() support was added nearly two decades ago.
If network cards from back then didn't crap out in this situation I
think the ones out there now are probably ok.


Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread David Miller
From: Andy Lutomirski 
Date: Tue, 28 Feb 2017 11:46:23 -0800

> On Mon, Feb 27, 2017 at 10:57 AM, Michael Kerrisk
>  wrote:
>> [CC += linux-...@vger.kernel.org]
>>
>> Hi Willem
>>
> 
>>> On a send call with MSG_ZEROCOPY, the kernel pins the user pages and
>>> creates skbuff fragments directly from these pages. On tx completion,
>>> it notifies the socket owner that it is safe to modify memory by
>>> queuing a completion notification onto the socket error queue.
> 
> What happens if the user writes to the pages while it's not safe?

Just want to mention that this ability to write to data behind a
network send's back is not a new thing added by MSG_ZEROCOPY.

All of this is already possible with sendfile().  The pages can be
written to completely asynchronously to the data being pulled out of
the page cache into the transmit path.

The crypto case is interesting, but that is a seperate discussion
about an existing problem rather than something specifically new to
the MSG_ZEROCOPY changes.

Thanks.


Re: [PATCH RFC net-next v2 1/4] skbuff: add stub to help computing crc32c on SCTP packets

2017-02-28 Thread Tom Herbert
On Tue, Feb 28, 2017 at 2:46 PM, Alexander Duyck
 wrote:
> On Tue, Feb 28, 2017 at 2:32 AM, Davide Caratti  wrote:
>> sctp_compute_checksum requires crc32c symbol (provided by libcrc32c), so
>> it can't be used in net core. Like it has been done previously with other
>> symbols (e.g. ipv6_dst_lookup), introduce a stub struct skb_checksum_ops
>> to allow computation of SCTP checksum in net core after sctp.ko (and thus
>> libcrc32c) has been loaded.
>
> At a minimum the name really needs to change.  SCTP does not do
> checksums.  It does a CRC, and a CRC is a very different thing.  The
> fact that somebody decided that offloading a CRC could use the same
> framework is very unfortunate, and your patch descriptions in this
> whole set are calling out a CRC as checksums which it is not.
>
> I don't want to see anything "checksum" or "csum" related in the
> naming when it comes to dealing with SCTP unless we absolutely have to
> have it.  So any function names or structures with sctp in the name
> should call out "crc32" or "crc", please don't use checksum.
>
Alexander,

I agree that internal functions to sctp should not refer to checksum,
but I think we need to take care to be consistent with any external
API (even if somebody made a mistake defining it this way :-) ). As
you know the checksum interface must be very precisely defined, there
is no leeway for ambiguity. Many places in the stack use csum and
CHECKSUM_* to refer to the API not the actual algorithm, others don't
(e.g. CHECKSUM_UNNECESSARY can apply to SCTP checksum,
CHECKSUM_COMPLETE must be an Internet checksum).

For instance, in that light skb_sctp_csum_help is appropriately named
I think because this is being called from skb_csum_help and refers to
the interface to resolve a checksum.

Tom

>> Reviewed-by: Marcelo Ricardo Leitner 
>> Signed-off-by: Davide Caratti 
>> ---
>>  include/linux/skbuff.h |  2 ++
>>  net/core/skbuff.c  | 20 
>>  net/sctp/offload.c |  7 +++
>>  3 files changed, 29 insertions(+)
>>
>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>> index 69ccd26..cab9a32 100644
>> --- a/include/linux/skbuff.h
>> +++ b/include/linux/skbuff.h
>> @@ -3125,6 +3125,8 @@ struct skb_checksum_ops {
>> __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
>>  };
>>
>> +extern const struct skb_checksum_ops *sctp_csum_stub __read_mostly;
>> +
>>  __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
>>   __wsum csum, const struct skb_checksum_ops *ops);
>>  __wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f355795..64fd8fd 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -2242,6 +2242,26 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff 
>> *skb, int offset,
>>  }
>>  EXPORT_SYMBOL(skb_copy_and_csum_bits);
>>
>> +static __wsum warn_sctp_csum_update(const void *buff, int len, __wsum sum)
>> +{
>> +   net_warn_ratelimited("attempt to compute crc32c without sctp.ko\n");
>> +   return 0;
>> +}
>> +
>> +static __wsum warn_sctp_csum_combine(__wsum csum, __wsum csum2,
>> +int offset, int len)
>> +{
>> +   net_warn_ratelimited("attempt to compute crc32c without sctp.ko\n");
>> +   return 0;
>> +}
>> +
>> +const struct skb_checksum_ops *sctp_csum_stub __read_mostly =
>> +   &(struct skb_checksum_ops) {
>> +   .update  = warn_sctp_csum_update,
>> +   .combine = warn_sctp_csum_combine,
>> +};
>> +EXPORT_SYMBOL(sctp_csum_stub);
>> +
>>   /**
>>   * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
>>   * @from: source buffer
>> diff --git a/net/sctp/offload.c b/net/sctp/offload.c
>> index 4f5a2b5..e9c3db0 100644
>> --- a/net/sctp/offload.c
>> +++ b/net/sctp/offload.c
>> @@ -98,6 +98,12 @@ static const struct net_offload sctp6_offload = {
>> },
>>  };
>>
>> +static const struct skb_checksum_ops *sctp_csum_ops __read_mostly =
>> +   &(struct skb_checksum_ops) {
>> +   .update  = sctp_csum_update,
>> +   .combine = sctp_csum_combine,
>> +};
>> +
>>  int __init sctp_offload_init(void)
>>  {
>> int ret;
>> @@ -110,6 +116,7 @@ int __init sctp_offload_init(void)
>> if (ret)
>> goto ipv4;
>>
>> +   sctp_csum_stub = sctp_csum_ops;
>> return ret;
>>
>>  ipv4:
>> --
>> 2.7.4
>>


Re: [PATCH v1 2/4] nvmet-rdma: use generic inet_pton_with_scope

2017-02-28 Thread Sagi Grimberg

Please add a changelog and mention that this adds IPv6 support.


Will do, thanks!


[PATCH v2] net: pch_gbe: Fix TX RX descriptor accesses for big endian systems

2017-02-28 Thread Hassan Naveed
Fix pch_gbe driver for ethernet operations for a big endian CPU.
Values written to and read from transmit and receive descriptors
in the pch_gbe driver are byte swapped from the perspective of a
big endian CPU, since the ethernet controller always operates in
little endian mode. Rectify this by appropriately byte swapping
these descriptor field values in the driver software.

Signed-off-by: Hassan Naveed 
Reviewed-by: Paul Burton 
Reviewed-by: Matt Redfearn 
Cc: Paul Burton 
Cc: Matt Redfearn 
Cc: David S. Miller 
Cc: Florian Westphal 
Cc: françois romieu 
---
Changes in v2: Additionally changed transmit and receive descriptors field
types to __le{16,32}. Ran sparse with endianness checking enabled and no
new warnings were generated.


 drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h| 22 
 .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c   | 66 --
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h 
b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
index de1dd08..e279eb2 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
@@ -431,13 +431,13 @@ struct pch_gbe_hw {
  * @reserved2: Reserved
  */
 struct pch_gbe_rx_desc {
-   u32 buffer_addr;
-   u32 tcp_ip_status;
-   u16 rx_words_eob;
-   u16 gbec_status;
+   __le32 buffer_addr;
+   __le32 tcp_ip_status;
+   __le16 rx_words_eob;
+   __le16 gbec_status;
u8 dma_status;
u8 reserved1;
-   u16 reserved2;
+   __le16 reserved2;
 };
 
 /**
@@ -452,14 +452,14 @@ struct pch_gbe_rx_desc {
  * @gbec_status:   GMAC Status
  */
 struct pch_gbe_tx_desc {
-   u32 buffer_addr;
-   u16 length;
-   u16 reserved1;
-   u16 tx_words_eob;
-   u16 tx_frame_ctrl;
+   __le32 buffer_addr;
+   __le16 length;
+   __le16 reserved1;
+   __le16 tx_words_eob;
+   __le16 tx_frame_ctrl;
u8 dma_status;
u8 reserved2;
-   u16 gbec_status;
+   __le16 gbec_status;
 };
 
 
diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c 
b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index d1048dd..6937169 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -1250,11 +1250,11 @@ static void pch_gbe_tx_queue(struct pch_gbe_adapter 
*adapter,
 
/*-- Set Tx descriptor --*/
tx_desc = PCH_GBE_TX_DESC(*tx_ring, ring_num);
-   tx_desc->buffer_addr = (buffer_info->dma);
-   tx_desc->length = skb->len;
-   tx_desc->tx_words_eob = skb->len + 3;
-   tx_desc->tx_frame_ctrl = (frame_ctrl);
-   tx_desc->gbec_status = (DSC_INIT16);
+   tx_desc->buffer_addr = cpu_to_le32(buffer_info->dma);
+   tx_desc->length = cpu_to_le16(skb->len);
+   tx_desc->tx_words_eob = cpu_to_le16(skb->len + 3);
+   tx_desc->tx_frame_ctrl = cpu_to_le16(frame_ctrl);
+   tx_desc->gbec_status = cpu_to_le16(DSC_INIT16);
 
if (unlikely(++ring_num == tx_ring->count))
ring_num = 0;
@@ -1460,8 +1460,8 @@ static irqreturn_t pch_gbe_intr(int irq, void *data)
}
buffer_info->mapped = true;
rx_desc = PCH_GBE_RX_DESC(*rx_ring, i);
-   rx_desc->buffer_addr = (buffer_info->dma);
-   rx_desc->gbec_status = DSC_INIT16;
+   rx_desc->buffer_addr = cpu_to_le32(buffer_info->dma);
+   rx_desc->gbec_status = cpu_to_le16(DSC_INIT16);
 
netdev_dbg(netdev,
   "i = %d  buffer_info->dma = 0x08%llx  
buffer_info->length = 0x%x\n",
@@ -1533,7 +1533,7 @@ static void pch_gbe_alloc_tx_buffers(struct 
pch_gbe_adapter *adapter,
skb_reserve(skb, PCH_GBE_DMA_ALIGN);
buffer_info->skb = skb;
tx_desc = PCH_GBE_TX_DESC(*tx_ring, i);
-   tx_desc->gbec_status = (DSC_INIT16);
+   tx_desc->gbec_status = cpu_to_le16(DSC_INIT16);
}
return;
 }
@@ -1564,11 +1564,12 @@ static void pch_gbe_alloc_tx_buffers(struct 
pch_gbe_adapter *adapter,
i = tx_ring->next_to_clean;
tx_desc = PCH_GBE_TX_DESC(*tx_ring, i);
netdev_dbg(adapter->netdev, "gbec_status:0x%04x  dma_status:0x%04x\n",
-  tx_desc->gbec_status, tx_desc->dma_status);
+  le16_to_cpu(tx_desc->gbec_status), tx_desc->dma_status);
 
unused = PCH_GBE_DESC_UNUSED(tx_ring);
thresh = tx_ring->count - PCH_GBE_TX_WEIGHT;
-   if ((tx_desc->gbec_status == DSC_INIT16) && (unused < thresh))
+   if ((le16_to_cpu(tx_desc->gbec_status) == DSC_INIT16) &&
+   (unused < thresh))
{  /* current marked clean, tx queue filling up, do extra clean */
int j, k;
if (unused < 8) {  /* tx queue nearly full */
@@ -1583,47 +1584,49 @@ static void pch_gbe_alloc_tx_buffers(struct

Re: net: GPF in rt6_nexthop_info

2017-02-28 Thread David Ahern
On 2/28/17 3:14 PM, Eric Dumazet wrote:
> On Tue, Feb 28, 2017 at 3:09 PM, David Ahern  wrote:
>> On 2/28/17 5:10 AM, Eric Dumazet wrote:
>>> David, rt->rt6i_idev can be NULL.
>>
>> Do you know of an example where rt6i_idev can be NULL - besides the
>> null_entry rt which is null only because of init order?
> 
> 
> I might have been mistaken, but many points in net/ipv6/route.c test
> rt->rt6i_idev being NULL or not before dereferencing it.
> 
> Maybe other rts (other than null_entry) share this property.
> 

Seen those and always been confused that an L3 route object does not
require the L3 addr struct for the device.

Code wise the struct is allocated for all net devices at registration as
long as mtu >= 1280. If it is not allocated, routes referencing the
device fail with ENODEV (idev check in ip6_route_info_create). Putting
that together I can't see how a standard ipv6 rt other than the
null_entry can reference a null rt6i_idev.


Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Tom Herbert
On Tue, Feb 28, 2017 at 4:58 PM, Willem de Bruijn
 wrote:
> On Tue, Feb 28, 2017 at 7:28 PM, Tom Herbert  wrote:
>> On Tue, Feb 28, 2017 at 3:22 PM, Eric Dumazet  wrote:
>>> On Tue, 2017-02-28 at 14:52 -0800, Andy Lutomirski wrote:
>>>
 The user pages are a gift to the kernel.  The application  may  not
 modify this memory ever, otherwise the page cache and on-disk data may
 differ.

 This is just not okay IMO.
>>>
>>> TCP works just fine in this case.
>>>
>>> TX checksum will be computed by the NIC after/while data is copied.
>>>
>>> If really the application changes the data, that will not cause any
>>> problems, other than user side consistency.
>>>
>>> This is why we require a copy (for all buffers that came from zero-copy)
>>> if network stack hits a device that can not offload TX checksum.
>>>
>>> Even pwrite() does not guarantee consistency if multiple threads are
>>> using it on overlapping regions.
>>>
>> The Mellanox team working on TLS offload pointed out to us that if
>> data is changed for a retransmit then it becomes trivial for someone
>> snooping to break the encryption. Sounds pretty scary and it would be
>> a shame if we couldn't use zero-copy in that use case :-( Hopefully we
>> can find a solution...
>>
>
> This requires collusion by the process initiating the zerocopy send
> to help the entity snooping the link. That could be an attack on admin
> configured tunnels, but user-directed encryption offload like AF_TLS
> can still use zerocopy.

Yes, but we can't trust the user to always understand or correctly
implement the semantic nuances when security is involved. If we can't
provide a  robust API then the only recourse is to not allow zero copy
in that case. We could suggest COW to solve all problems, but I think
we know where the conversation will go ;-)

Tom


Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Eric Dumazet
On Tue, 2017-02-28 at 14:25 -0800, Andy Lutomirski wrote:
> On Tue, Feb 28, 2017 at 1:47 PM, Eric Dumazet  wrote:
> > On Tue, 2017-02-28 at 13:09 -0800, Andy Lutomirski wrote:
> >
> >> Does this mean that a user program that does a zerocopy send can cause
> >> a retransmitted segment to contain different data than the original
> >> segment?  If so, is that okay?
> >
> > Same remark applies to sendfile() already
> 
> True.
> 
> >, or other zero copy modes
> > (vmsplice() + splice() )
> 
> I hate vmsplice().  I thought I remembered it being essentially
> disabled at some point due to security problems.

Right, zero copy is hard ;)

vmsplice() is not disabled in current kernels, unless I missed
something.






Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Tom Herbert
On Tue, Feb 28, 2017 at 3:22 PM, Eric Dumazet  wrote:
> On Tue, 2017-02-28 at 14:52 -0800, Andy Lutomirski wrote:
>
>> The user pages are a gift to the kernel.  The application  may  not
>> modify this memory ever, otherwise the page cache and on-disk data may
>> differ.
>>
>> This is just not okay IMO.
>
> TCP works just fine in this case.
>
> TX checksum will be computed by the NIC after/while data is copied.
>
> If really the application changes the data, that will not cause any
> problems, other than user side consistency.
>
> This is why we require a copy (for all buffers that came from zero-copy)
> if network stack hits a device that can not offload TX checksum.
>
> Even pwrite() does not guarantee consistency if multiple threads are
> using it on overlapping regions.
>
The Mellanox team working on TLS offload pointed out to us that if
data is changed for a retransmit then it becomes trivial for someone
snooping to break the encryption. Sounds pretty scary and it would be
a shame if we couldn't use zero-copy in that use case :-( Hopefully we
can find a solution...

Tom

>
>


Re: [PATCH v2 net] net: solve a NAPI race

2017-02-28 Thread Stephen Hemminger
On Wed, 1 Mar 2017 01:22:40 +0100
Francois Romieu  wrote:

> David Miller  :
> > From: Eric Dumazet 
> > Date: Mon, 27 Feb 2017 08:44:14 -0800
> >   
> > > Any point doing a napi_schedule() not from device hard irq handler
> > > is subject to the race for NIC using some kind of edge trigger
> > > interrupts.
> > > 
> > > Since we do not provide a ndo to disable device interrupts, the
> > > following can happen.  
> > 
> > Ok, now I understand.
> > 
> > I think even without considering the race you are trying to solve,
> > this situation is really dangerous.
> > 
> > I am sure that every ->poll() handler out there was written by an
> > author who completely assumed that if they are executing then the
> > device's interrupts for that NAPI instance are disabled.  And this is
> > with very few, if any, exceptions.  
> 
> Shareable pci irq used to remind author that such an assumption was
> not always right. Otoh it was still manageable as long as level only
> triggered irq were involved.
> 

When I had to deal with that in sky2, the best way was to have a single
NAPI poll handler shared between both ports. Works well and avoids races
in interrupt handling and enabling.


Re: [PATCH] drivers: net: xgene: Fix crash on DT systems

2017-02-28 Thread Iyappan Subramanian
On Tue, Feb 28, 2017 at 9:08 AM, Alban Bedel
 wrote:
> On DT systems the driver require a clock, but the probe just print a
> warning and continue, leading to a crash when resetting the device.
> To fix this crash and properly handle probe deferals only ignore the
> missing clock if DT isn't used or if the clock doesn't exist.
>
> Signed-off-by: Alban Bedel 
> ---
>  drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c 
> b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
> index d0d0d12b531f..68b48edc5921 100644
> --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
> +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
> @@ -1756,6 +1756,12 @@ static int xgene_enet_get_resources(struct 
> xgene_enet_pdata *pdata)
>
> pdata->clk = devm_clk_get(&pdev->dev, NULL);
> if (IS_ERR(pdata->clk)) {
> +   /* Abort if the clock is defined but couldn't be retrived.
> +* Always abort if the clock is missing on DT system as
> +* the driver can't cope with this case.
> +*/
> +   if (PTR_ERR(pdata->clk) != -ENOENT || dev->of_node)
> +   return PTR_ERR(pdata->clk);
> /* Firmware may have set up the clock already. */
> dev_info(dev, "clocks have been setup already\n");
> }
> --
> 2.11.0
>

Thanks, Alban.

Acked-by: Iyappan Subramanian 


Re: [PATCH v2 net] net: solve a NAPI race

2017-02-28 Thread Francois Romieu
David Miller  :
> From: Eric Dumazet 
> Date: Mon, 27 Feb 2017 08:44:14 -0800
> 
> > Any point doing a napi_schedule() not from device hard irq handler
> > is subject to the race for NIC using some kind of edge trigger
> > interrupts.
> > 
> > Since we do not provide a ndo to disable device interrupts, the
> > following can happen.
> 
> Ok, now I understand.
> 
> I think even without considering the race you are trying to solve,
> this situation is really dangerous.
> 
> I am sure that every ->poll() handler out there was written by an
> author who completely assumed that if they are executing then the
> device's interrupts for that NAPI instance are disabled.  And this is
> with very few, if any, exceptions.

Shareable pci irq used to remind author that such an assumption was
not always right. Otoh it was still manageable as long as level only
triggered irq were involved.

-- 
Ueimor


Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Eric Dumazet
On Tue, 2017-02-28 at 16:28 -0800, Tom Herbert wrote:

> The Mellanox team working on TLS offload pointed out to us that if
> data is changed for a retransmit then it becomes trivial for someone
> snooping to break the encryption. Sounds pretty scary and it would be
> a shame if we couldn't use zero-copy in that use case :-( Hopefully we
> can find a solution...

Right, this is why offloading encryption over TCP is also hard ;)





Re: net: GPF in rt6_nexthop_info

2017-02-28 Thread David Ahern
On 2/28/17 5:10 AM, Eric Dumazet wrote:
> David, rt->rt6i_idev can be NULL.

Do you know of an example where rt6i_idev can be NULL - besides the
null_entry rt which is null only because of init order?


[PATCH RFC net-next] Cancel any pending connection attempts before taking down connection

2017-02-28 Thread Sowmini Varadhan
This is a test patch being supplied for a trial run on syzkaller.

Explicitly cancel the workq before releasing resources that
will allow netns deletion, so that the connect request does
not trip up on a use-after free of the netns afterward.

Signed-off-by: Sowmini Varadhan 
Reported-by: Dmitry Vyukov 
---
 net/rds/tcp.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 57bb523..2568eb1 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -509,6 +509,8 @@ static void rds_tcp_conn_paths_destroy(struct 
rds_connection *conn)
 
for (i = 0; i < RDS_MPATH_WORKERS; i++) {
cp = &conn->c_path[i];
+   if (cancel_delayed_work_sync(&cp->cp_conn_w))
+   pr_info("cancelled on path %d\n", i);
tc = cp->cp_transport_data;
if (!tc->t_sock)
continue;
-- 
1.7.1



Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Sowmini Varadhan

Actually, I'm not sure if I can assert that these are all manifestations
of the same bug- was a netns-delete involved in this one as well?

I see:

> BUG: KASAN: use-after-free in memcmp+0xe3/0x160 lib/string.c:768 at
:
>  memcmp+0xe3/0x160 lib/string.c:768
:
>  rds_find_bound+0x4fe/0x8a0 net/rds/bind.c:63
>  rds_recv_incoming+0x5f3/0x12c0 net/rds/recv.c:349
>  rds_loop_xmit+0x1c5/0x490 net/rds/loop.c:82
:
This appears to be for a looped back packet, and looks like there
are problems with  some rds_sock that got removed from the bind_hash_table..

According to the report, socket was created at
> Allocated:
> PID = 5235
:
>  sk_prot_alloc+0x65/0x2a0 net/core/sock.c:1334
>  sk_alloc+0x105/0x1010 net/core/sock.c:1396
>  rds_create+0x11c/0x600 net/rds/af_rds.c:504

and closed at some point:
> Freed:
> PID = 5235
 :
>  rds_release+0x3a1/0x4d0 net/rds/af_rds.c:89
>  sock_release+0x8d/0x1e0 net/socket.c:599

This is all uspace created rds sockets, and while there may be an
unrelated bug here, I'm not sure I see  the netns/kernel-socket
connection.. can you please clarify if this was also seen in some netns
context?

--Sowmini







[PATCH net v1 3/3] amd-xgbe: Don't overwrite SFP PHY mod_absent settings

2017-02-28 Thread Tom Lendacky
If an SFP module is not present, xgbe_phy_sfp_phy_settings() should
return after applying the default settings. Currently there is no return
statement and the default settings are overwritten.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 04804cb..e707c49 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -716,6 +716,8 @@ static void xgbe_phy_sfp_phy_settings(struct xgbe_prv_data 
*pdata)
pdata->phy.duplex = DUPLEX_UNKNOWN;
pdata->phy.autoneg = AUTONEG_ENABLE;
pdata->phy.advertising = pdata->phy.supported;
+
+   return;
}
 
pdata->phy.advertising &= ~ADVERTISED_Autoneg;



[PATCH] net: qcom/emac: optimize QDF2400 SGMII RX/TX impedence values

2017-02-28 Thread Timur Tabi
Adjust the impedance values of the RX and TX lanes in the SGMII block
so that they are closer to optimal values.

Signed-off-by: Timur Tabi 
---
 drivers/net/ethernet/qualcomm/emac/emac-sgmii-qdf2400.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii-qdf2400.c 
b/drivers/net/ethernet/qualcomm/emac/emac-sgmii-qdf2400.c
index f62c215..7116be4 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii-qdf2400.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii-qdf2400.c
@@ -26,6 +26,7 @@
 
 /* SGMII digital lane registers */
 #define EMAC_SGMII_LN_DRVR_CTRL0   0x000C
+#define EMAC_SGMII_LN_DRVR_CTRL1   0x0010
 #define EMAC_SGMII_LN_DRVR_TAP_EN  0x0018
 #define EMAC_SGMII_LN_TX_MARGINING 0x001C
 #define EMAC_SGMII_LN_TX_PRE   0x0020
@@ -48,6 +49,7 @@
 #define EMAC_SGMII_LN_RX_EN_SIGNAL 0x02AC
 #define EMAC_SGMII_LN_RX_MISC_CNTRL0   0x02B8
 #define EMAC_SGMII_LN_DRVR_LOGIC_CLKDIV0x02C8
+#define EMAC_SGMII_LN_RX_RESECODE_OFFSET   0x02CC
 
 /* SGMII digital lane register values */
 #define UCDR_STEP_BY_TWO_MODE0 BIT(7)
@@ -73,6 +75,8 @@
 #define CML_GEAR_MODE(x)   (((x) & 7) << 3)
 #define CML2CMOS_IBOOST_MODE(x)((x) & 7)
 
+#define RESCODE_OFFSET(x)  ((x) & 0x1f)
+
 #define MIXER_LOADB_MODE(x)(((x) & 0xf) << 2)
 #define MIXER_DATARATE_MODE(x) ((x) & 3)
 
@@ -159,6 +163,8 @@ static void emac_reg_write_all(void __iomem *base,
{EMAC_SGMII_LN_PARALLEL_RATE, PARALLEL_RATE_MODE0(1)},
{EMAC_SGMII_LN_TX_BAND_MODE, BAND_MODE0(1)},
{EMAC_SGMII_LN_RX_BAND, BAND_MODE0(2)},
+   {EMAC_SGMII_LN_DRVR_CTRL1, RESCODE_OFFSET(7)},
+   {EMAC_SGMII_LN_RX_RESECODE_OFFSET, RESCODE_OFFSET(9)},
{EMAC_SGMII_LN_LANE_MODE, LANE_MODE(26)},
{EMAC_SGMII_LN_RX_RCVR_PATH1_MODE0, CDR_PD_SEL_MODE0(2) |
EN_DLL_MODE0 | EN_IQ_DCC_MODE0 | EN_IQCAL_MODE0},
-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.



Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Eric Dumazet
On Tue, 2017-02-28 at 14:52 -0800, Andy Lutomirski wrote:

> The user pages are a gift to the kernel.  The application  may  not
> modify this memory ever, otherwise the page cache and on-disk data may
> differ.
> 
> This is just not okay IMO.

TCP works just fine in this case.

TX checksum will be computed by the NIC after/while data is copied.

If really the application changes the data, that will not cause any
problems, other than user side consistency.

This is why we require a copy (for all buffers that came from zero-copy)
if network stack hits a device that can not offload TX checksum.

Even pwrite() does not guarantee consistency if multiple threads are
using it on overlapping regions.





Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Andy Lutomirski
On Tue, Feb 28, 2017 at 2:40 PM, Eric Dumazet  wrote:
> On Tue, 2017-02-28 at 14:25 -0800, Andy Lutomirski wrote:
>> On Tue, Feb 28, 2017 at 1:47 PM, Eric Dumazet  wrote:
>> > On Tue, 2017-02-28 at 13:09 -0800, Andy Lutomirski wrote:
>> >
>> >> Does this mean that a user program that does a zerocopy send can cause
>> >> a retransmitted segment to contain different data than the original
>> >> segment?  If so, is that okay?
>> >
>> > Same remark applies to sendfile() already
>>
>> True.
>>
>> >, or other zero copy modes
>> > (vmsplice() + splice() )
>>
>> I hate vmsplice().  I thought I remembered it being essentially
>> disabled at some point due to security problems.
>
> Right, zero copy is hard ;)
>
> vmsplice() is not disabled in current kernels, unless I missed
> something.
>

I think you're right.  That being said, from the man page:

The user pages are a gift to the kernel.  The application  may  not
modify this memory ever, otherwise the page cache and on-disk data may
differ.

This is just not okay IMO.


Re: net: GPF in rt6_nexthop_info

2017-02-28 Thread Eric Dumazet
On Tue, Feb 28, 2017 at 3:09 PM, David Ahern  wrote:
> On 2/28/17 5:10 AM, Eric Dumazet wrote:
>> David, rt->rt6i_idev can be NULL.
>
> Do you know of an example where rt6i_idev can be NULL - besides the
> null_entry rt which is null only because of init order?


I might have been mistaken, but many points in net/ipv6/route.c test
rt->rt6i_idev being NULL or not before dereferencing it.

Maybe other rts (other than null_entry) share this property.


Re: [PATCH RFC net-next v2 1/4] skbuff: add stub to help computing crc32c on SCTP packets

2017-02-28 Thread Alexander Duyck
On Tue, Feb 28, 2017 at 2:32 AM, Davide Caratti  wrote:
> sctp_compute_checksum requires crc32c symbol (provided by libcrc32c), so
> it can't be used in net core. Like it has been done previously with other
> symbols (e.g. ipv6_dst_lookup), introduce a stub struct skb_checksum_ops
> to allow computation of SCTP checksum in net core after sctp.ko (and thus
> libcrc32c) has been loaded.

At a minimum the name really needs to change.  SCTP does not do
checksums.  It does a CRC, and a CRC is a very different thing.  The
fact that somebody decided that offloading a CRC could use the same
framework is very unfortunate, and your patch descriptions in this
whole set are calling out a CRC as checksums which it is not.

I don't want to see anything "checksum" or "csum" related in the
naming when it comes to dealing with SCTP unless we absolutely have to
have it.  So any function names or structures with sctp in the name
should call out "crc32" or "crc", please don't use checksum.

> Reviewed-by: Marcelo Ricardo Leitner 
> Signed-off-by: Davide Caratti 
> ---
>  include/linux/skbuff.h |  2 ++
>  net/core/skbuff.c  | 20 
>  net/sctp/offload.c |  7 +++
>  3 files changed, 29 insertions(+)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 69ccd26..cab9a32 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3125,6 +3125,8 @@ struct skb_checksum_ops {
> __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
>  };
>
> +extern const struct skb_checksum_ops *sctp_csum_stub __read_mostly;
> +
>  __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
>   __wsum csum, const struct skb_checksum_ops *ops);
>  __wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f355795..64fd8fd 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -2242,6 +2242,26 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff 
> *skb, int offset,
>  }
>  EXPORT_SYMBOL(skb_copy_and_csum_bits);
>
> +static __wsum warn_sctp_csum_update(const void *buff, int len, __wsum sum)
> +{
> +   net_warn_ratelimited("attempt to compute crc32c without sctp.ko\n");
> +   return 0;
> +}
> +
> +static __wsum warn_sctp_csum_combine(__wsum csum, __wsum csum2,
> +int offset, int len)
> +{
> +   net_warn_ratelimited("attempt to compute crc32c without sctp.ko\n");
> +   return 0;
> +}
> +
> +const struct skb_checksum_ops *sctp_csum_stub __read_mostly =
> +   &(struct skb_checksum_ops) {
> +   .update  = warn_sctp_csum_update,
> +   .combine = warn_sctp_csum_combine,
> +};
> +EXPORT_SYMBOL(sctp_csum_stub);
> +
>   /**
>   * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
>   * @from: source buffer
> diff --git a/net/sctp/offload.c b/net/sctp/offload.c
> index 4f5a2b5..e9c3db0 100644
> --- a/net/sctp/offload.c
> +++ b/net/sctp/offload.c
> @@ -98,6 +98,12 @@ static const struct net_offload sctp6_offload = {
> },
>  };
>
> +static const struct skb_checksum_ops *sctp_csum_ops __read_mostly =
> +   &(struct skb_checksum_ops) {
> +   .update  = sctp_csum_update,
> +   .combine = sctp_csum_combine,
> +};
> +
>  int __init sctp_offload_init(void)
>  {
> int ret;
> @@ -110,6 +116,7 @@ int __init sctp_offload_init(void)
> if (ret)
> goto ipv4;
>
> +   sctp_csum_stub = sctp_csum_ops;
> return ret;
>
>  ipv4:
> --
> 2.7.4
>


[PATCH net-next v3 0/3] net: ethernet: bgmac: PM support and clean-ups

2017-02-28 Thread Jon Mason
Changes in v3:
* Corrected a bug Florian found and added his Reviewed-by

Changes in v2:
* Reworked the PM patch with Florian's suggestions


Add code to support Power Management (only tested on NS2), and add some
code clean-ups


Joey Zhong (1):
  net: ethernet: bgmac: driver power manangement

Jon Mason (2):
  net: ethernet: bgmac: use #defines for MAX size
  net: ethernet: bgmac: unify code of the same family

 drivers/net/ethernet/broadcom/bgmac-bcma.c | 64 +++---
 drivers/net/ethernet/broadcom/bgmac-platform.c | 34 ++
 drivers/net/ethernet/broadcom/bgmac.c  | 51 
 drivers/net/ethernet/broadcom/bgmac.h  |  4 +-
 4 files changed, 116 insertions(+), 37 deletions(-)

-- 
2.7.4



[PATCH net-next v3 1/3] net: ethernet: bgmac: use #defines for MAX size

2017-02-28 Thread Jon Mason
The maximum frame size is really just the standard ethernet frame size
and FCS.  So use those existing defines to make the code a little more
beautiful.

Signed-off-by: Jon Mason 
---
 drivers/net/ethernet/broadcom/bgmac.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac.h 
b/drivers/net/ethernet/broadcom/bgmac.h
index 6d1c6ff..a75ed35 100644
--- a/drivers/net/ethernet/broadcom/bgmac.h
+++ b/drivers/net/ethernet/broadcom/bgmac.h
@@ -402,7 +402,7 @@
 
 #define BGMAC_WEIGHT   64
 
-#define ETHER_MAX_LEN   1518
+#define ETHER_MAX_LEN  (ETH_FRAME_LEN + ETH_FCS_LEN)
 
 /* Feature Flags */
 #define BGMAC_FEAT_TX_MASK_SETUP   BIT(0)
-- 
2.7.4



Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Andy Lutomirski
On Tue, Feb 28, 2017 at 1:47 PM, Eric Dumazet  wrote:
> On Tue, 2017-02-28 at 13:09 -0800, Andy Lutomirski wrote:
>
>> Does this mean that a user program that does a zerocopy send can cause
>> a retransmitted segment to contain different data than the original
>> segment?  If so, is that okay?
>
> Same remark applies to sendfile() already

True.

>, or other zero copy modes
> (vmsplice() + splice() )

I hate vmsplice().  I thought I remembered it being essentially
disabled at some point due to security problems.

>


Re: [PATCH v5 06/10] seccomp,landlock: Handle Landlock events per process hierarchy

2017-02-28 Thread Andy Lutomirski
On Tue, Feb 21, 2017 at 5:26 PM, Mickaël Salaün  wrote:
> The seccomp(2) syscall can be use to apply a Landlock rule to the
> current process. As with a seccomp filter, the Landlock rule is enforced
> for all its future children. An inherited rule tree can be updated
> (append-only) by the owner of inherited Landlock nodes (e.g. a parent
> process that create a new rule)

Can you clarify exaclty what this type of update does?  Is it
something that should be supported by normal seccomp rules as well?

> +/**
> + * landlock_run_prog - run Landlock program for a syscall

Unless this is actually specific to syscalls, s/for a syscall//, perhaps?

> +   if (new_events->nodes[event_idx]->owner ==
> +   &new_events->nodes[event_idx]) {
> +   /* We are the owner, we can then update the node. */
> +   add_landlock_rule(new_events, rule);

This is the part I don't get.  Adding a rule if you're the owner (BTW,
why is ownership visible to userspace at all?) for just yourself and
future children is very different from adding it so it applies to
preexisting children too.


> +   } else if (atomic_read(¤t_events->usage) == 1) {
> +   WARN_ON(new_events->nodes[event_idx]->owner);
> +   /*
> +* We can become the new owner if no other task use 
> it.
> +* This avoid an unnecessary allocation.
> +*/
> +   new_events->nodes[event_idx]->owner =
> +   &new_events->nodes[event_idx];
> +   add_landlock_rule(new_events, rule);
> +   } else {
> +   /*
> +* We are not the owner, we need to fork 
> current_events
> +* and then add a new node.
> +*/
> +   struct landlock_node *node;
> +   size_t i;
> +
> +   node = kmalloc(sizeof(*node), GFP_KERNEL);
> +   if (!node) {
> +   new_events = ERR_PTR(-ENOMEM);
> +   goto put_rule;
> +   }
> +   atomic_set(&node->usage, 1);
> +   /* set the previous node after the new_events
> +* allocation */
> +   node->prev = NULL;
> +   /* do not increment the previous node usage */
> +   node->owner = &new_events->nodes[event_idx];
> +   /* rule->prev is already NULL */
> +   atomic_set(&rule->usage, 1);
> +   node->rule = rule;
> +
> +   new_events = new_raw_landlock_events();
> +   if (IS_ERR(new_events)) {
> +   /* put the rule as well */
> +   put_landlock_node(node);
> +   return ERR_PTR(-ENOMEM);
> +   }
> +   for (i = 0; i < ARRAY_SIZE(new_events->nodes); i++) {
> +   new_events->nodes[i] =
> +   lockless_dereference(
> +   
> current_events->nodes[i]);
> +   if (i == event_idx)
> +   node->prev = new_events->nodes[i];
> +   if (!WARN_ON(!new_events->nodes[i]))
> +   
> atomic_inc(&new_events->nodes[i]->usage);
> +   }
> +   new_events->nodes[event_idx] = node;
> +
> +   /*
> +* @current_events will not be freed here because 
> it's usage
> +* field is > 1. It is only prevented to be freed by 
> another
> +* subject thanks to the caller of 
> landlock_append_prog() which
> +* should be locked if needed.
> +*/
> +   put_landlock_events(current_events);
> +   }
> +   }
> +   return new_events;
> +
> +put_prog:
> +   bpf_prog_put(prog);
> +   return new_events;
> +
> +put_rule:
> +   put_landlock_rule(rule);
> +   return new_events;
> +}
> +
> +/**
> + * landlock_seccomp_append_prog - attach a Landlock rule to the current 
> process
> + *
> + * current->seccomp.landlock_events is lazily allocated. When a process fork,
> + * only a pointer is copied. When a new event is added by a process, if there
> + * is other references to this process' landlock_events, then a new 
> allocation
> + * is made to contains an array pointing to Landlock rule lists. This design
> + * has low-performance impact and is memory efficient while keeping the
> + * property of append-only rul

Re: [patch] net/mlx4: && vs & typo

2017-02-28 Thread Bart Van Assche
On 02/28/2017 02:23 PM, Joe Perches wrote:
> On Tue, 2017-02-28 at 15:35 +, Bart Van Assche wrote:
>> On Tue, 2017-02-28 at 15:02 +0300, Dan Carpenter wrote:
>>> Bitwise & was obviously intended here.
> []
>>> diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
> []
>>> @@ -109,7 +109,7 @@ static inline void  (u8 *addr, u64 mac)
>>> int i;
>>>  
>>> for (i = ETH_ALEN; i > 0; i--) {
>>> -   addr[i - 1] = mac && 0xFF;
>>> +   addr[i - 1] = mac & 0xFF;
>>> mac >>= 8;
>>> }
>>>  }
>>
>> Is this the only place where such a loop occurs?
> 
> Seems to be.
> 
>> Should a put_unaligned_be48()
>> function be introduced?
> 
> Why?  This is used exactly once.

Really? Here is an example of another open-coded version of
put_unaligned_be48() from arch/mips/cavium-octeon/octeon-platform.c:

new_mac[0] = (mac >> 40) & 0xff;
new_mac[1] = (mac >> 32) & 0xff;
new_mac[2] = (mac >> 24) & 0xff;
new_mac[3] = (mac >> 16) & 0xff;
new_mac[4] = (mac >> 8) & 0xff;
new_mac[5] = mac & 0xff;

Bart.


[PATCH] net: smsc: smc911x: use new api ethtool_{get|set}_link_ksettings

2017-02-28 Thread Philippe Reynes
The ethtool api {get|set}_settings is deprecated.
We move this driver to new api {get|set}_link_ksettings.

As I don't have the hardware, I'd be very pleased if
someone may test this patch.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/smsc/smc911x.c |   51 ++
 1 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/smsc/smc911x.c 
b/drivers/net/ethernet/smsc/smc911x.c
index 4f19c61..36307d3 100644
--- a/drivers/net/ethernet/smsc/smc911x.c
+++ b/drivers/net/ethernet/smsc/smc911x.c
@@ -1446,40 +1446,40 @@ static int smc911x_close(struct net_device *dev)
  * Ethtool support
  */
 static int
-smc911x_ethtool_getsettings(struct net_device *dev, struct ethtool_cmd *cmd)
+smc911x_ethtool_get_link_ksettings(struct net_device *dev,
+  struct ethtool_link_ksettings *cmd)
 {
struct smc911x_local *lp = netdev_priv(dev);
int ret, status;
unsigned long flags;
+   u32 supported;
 
DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__);
-   cmd->maxtxpkt = 1;
-   cmd->maxrxpkt = 1;
 
if (lp->phy_type != 0) {
spin_lock_irqsave(&lp->lock, flags);
-   ret = mii_ethtool_gset(&lp->mii, cmd);
+   ret = mii_ethtool_get_link_ksettings(&lp->mii, cmd);
spin_unlock_irqrestore(&lp->lock, flags);
} else {
-   cmd->supported = SUPPORTED_10baseT_Half |
+   supported = SUPPORTED_10baseT_Half |
SUPPORTED_10baseT_Full |
SUPPORTED_TP | SUPPORTED_AUI;
 
if (lp->ctl_rspeed == 10)
-   ethtool_cmd_speed_set(cmd, SPEED_10);
+   cmd->base.speed = SPEED_10;
else if (lp->ctl_rspeed == 100)
-   ethtool_cmd_speed_set(cmd, SPEED_100);
-
-   cmd->autoneg = AUTONEG_DISABLE;
-   if (lp->mii.phy_id==1)
-   cmd->transceiver = XCVR_INTERNAL;
-   else
-   cmd->transceiver = XCVR_EXTERNAL;
-   cmd->port = 0;
+   cmd->base.speed = SPEED_100;
+
+   cmd->base.autoneg = AUTONEG_DISABLE;
+   cmd->base.port = 0;
SMC_GET_PHY_SPECIAL(lp, lp->mii.phy_id, status);
-   cmd->duplex =
+   cmd->base.duplex =
(status & (PHY_SPECIAL_SPD_10FULL_ | 
PHY_SPECIAL_SPD_100FULL_)) ?
DUPLEX_FULL : DUPLEX_HALF;
+
+   ethtool_convert_legacy_u32_to_link_mode(
+   cmd->link_modes.supported, supported);
+
ret = 0;
}
 
@@ -1487,7 +1487,8 @@ static int smc911x_close(struct net_device *dev)
 }
 
 static int
-smc911x_ethtool_setsettings(struct net_device *dev, struct ethtool_cmd *cmd)
+smc911x_ethtool_set_link_ksettings(struct net_device *dev,
+  const struct ethtool_link_ksettings *cmd)
 {
struct smc911x_local *lp = netdev_priv(dev);
int ret;
@@ -1495,16 +1496,18 @@ static int smc911x_close(struct net_device *dev)
 
if (lp->phy_type != 0) {
spin_lock_irqsave(&lp->lock, flags);
-   ret = mii_ethtool_sset(&lp->mii, cmd);
+   ret = mii_ethtool_set_link_ksettings(&lp->mii, cmd);
spin_unlock_irqrestore(&lp->lock, flags);
} else {
-   if (cmd->autoneg != AUTONEG_DISABLE ||
-   cmd->speed != SPEED_10 ||
-   (cmd->duplex != DUPLEX_HALF && cmd->duplex != 
DUPLEX_FULL) ||
-   (cmd->port != PORT_TP && cmd->port != PORT_AUI))
+   if (cmd->base.autoneg != AUTONEG_DISABLE ||
+   cmd->base.speed != SPEED_10 ||
+   (cmd->base.duplex != DUPLEX_HALF &&
+cmd->base.duplex != DUPLEX_FULL) ||
+   (cmd->base.port != PORT_TP &&
+cmd->base.port != PORT_AUI))
return -EINVAL;
 
-   lp->ctl_rfduplx = cmd->duplex == DUPLEX_FULL;
+   lp->ctl_rfduplx = cmd->base.duplex == DUPLEX_FULL;
 
ret = 0;
}
@@ -1686,8 +1689,6 @@ static int smc911x_ethtool_geteeprom_len(struct 
net_device *dev)
 }
 
 static const struct ethtool_ops smc911x_ethtool_ops = {
-   .get_settings= smc911x_ethtool_getsettings,
-   .set_settings= smc911x_ethtool_setsettings,
.get_drvinfo = smc911x_ethtool_getdrvinfo,
.get_msglevel= smc911x_ethtool_getmsglevel,
.set_msglevel= smc911x_ethtool_setmsglevel,
@@ -1698,6 +1699,8 @@ static int smc911x_ethtool_geteeprom_len(struct 
net_device *dev)
.get_eeprom_len = smc911x_ethtool_geteeprom_len,
.get_eeprom = smc911x_ethtool_geteeprom,
.set_eeprom = smc911x_ethtool_seteeprom,
+   .get_link_ksettings  = smc911

Re: [patch] net/mlx4: && vs & typo

2017-02-28 Thread Joe Perches
On Tue, 2017-02-28 at 15:35 +, Bart Van Assche wrote:
> On Tue, 2017-02-28 at 15:02 +0300, Dan Carpenter wrote:
> > Bitwise & was obviously intended here.
[]
> > diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
[]
> > @@ -109,7 +109,7 @@ static inline void  (u8 *addr, u64 mac)
> > int i;
> >  
> > for (i = ETH_ALEN; i > 0; i--) {
> > -   addr[i - 1] = mac && 0xFF;
> > +   addr[i - 1] = mac & 0xFF;
> > mac >>= 8;
> > }
> >  }
> 
> Is this the only place where such a loop occurs?

Seems to be.

> Should a put_unaligned_be48()
> function be introduced?

Why?  This is used exactly once.



Re: [Patch net] ipv6: ignore null_entry in inet6_rtm_getroute() too

2017-02-28 Thread David Ahern
On 2/28/17 11:48 AM, Cong Wang wrote:
> On Tue, Feb 28, 2017 at 11:01 AM, David Ahern  
> wrote:
>> On 2/28/17 10:44 AM, Cong Wang wrote:
>>> Like commit 1f17e2f2c8a8 ("net: ipv6: ignore null_entry on route dumps"),
>>> we need to ignore null entry in inet6_rtm_getroute() too.
>>>
>>> Return -ENOENT here because we return the same errno when deleting
>>> the null entry.
>>>
>>> Fixes: a1a22c1206 ("net: ipv6: Keep nexthop of multipath route on admin 
>>> down")
>>> Reported-by: Dmitry Vyukov 
>>> Cc: David Ahern 
>>> Signed-off-by: Cong Wang 
>>> ---
>>>  net/ipv6/route.c | 6 ++
>>>  1 file changed, 6 insertions(+)
>>>
>>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>>> index f54f426..25590d1 100644
>>> --- a/net/ipv6/route.c
>>> +++ b/net/ipv6/route.c
>>> @@ -3627,6 +3627,12 @@ static int inet6_rtm_getroute(struct sk_buff 
>>> *in_skb, struct nlmsghdr *nlh)
>>>   rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
>>>   }
>>>
>>> + if (rt == net->ipv6.ip6_null_entry) {
>>> + ip6_rt_put(rt);
>>> + err = -ENOENT;
>>> + goto errout;
>>> + }
>>> +
>>>   skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
>>>   if (!skb) {
>>>   ip6_rt_put(rt);
>>>
>>
>> hold on. That test exposed something else, not just a getroute problem.
>> I accidentally ran 'unsahre -n; ip -6 ro ls' on my host machine instead
>> of a VM, so took some time to recover. dumproute already covers the null
>> route.

My host was running a slightly older kernel (did not have the null_entry
check in the dump route path for one).

As for trapping null_entry on getroute, this changes user experience.
Right now you always get a route response for IPv6 with the error set as
rta_error. This patch changes that. I am fine with it since it makes
IPv6 more like IPv4:

# ip -6 ro get 2001:db8:12::1
RTNETLINK answers: Network is unreachable

But, if we are going to do this then err should be set based on
rt->dst.error (ENOENT is not the right error) and the commit message
should state the change.



Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Sowmini Varadhan
On (03/01/17 00:14), Dmitry Vyukov wrote:
> 
> But the other 2 use-after-frees happened on cp->cp_send_w. Shouldn't
> we cancel it as well? And cp_recv_w?

yes, good point, I missed that. let me see if I can refactor the code
to release the netns as the last thing before free.. 


[PATCH] netfilter: Use pr_cont where appropriate

2017-02-28 Thread Joe Perches
Logging output was changed when simple printks without KERN_CONT
are now emitted on a new line and KERN_CONT is required to continue
lines so use pr_cont.

Miscellanea:

o realign arguments
o use print_hex_dump instead of a local variant

Signed-off-by: Joe Perches 
---
 net/bridge/netfilter/ebt_log.c | 34 +-
 net/ipv4/netfilter/nf_nat_snmp_basic.c | 15 ++-
 2 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 98b9c8e8615e..707caea39743 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -62,10 +62,10 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, 
int offset)
pptr = skb_header_pointer(skb, offset,
  sizeof(_ports), &_ports);
if (pptr == NULL) {
-   printk(" INCOMPLETE TCP/UDP header");
+   pr_cont(" INCOMPLETE TCP/UDP header");
return;
}
-   printk(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
+   pr_cont(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
}
 }
 
@@ -100,11 +100,11 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int 
hooknum,
 
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL) {
-   printk(" INCOMPLETE IP header");
+   pr_cont(" INCOMPLETE IP header");
goto out;
}
-   printk(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
-  &ih->saddr, &ih->daddr, ih->tos, ih->protocol);
+   pr_cont(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
+   &ih->saddr, &ih->daddr, ih->tos, ih->protocol);
print_ports(skb, ih->protocol, ih->ihl*4);
goto out;
}
@@ -120,11 +120,11 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int 
hooknum,
 
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL) {
-   printk(" INCOMPLETE IPv6 header");
+   pr_cont(" INCOMPLETE IPv6 header");
goto out;
}
-   printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, 
Next Header=%d",
-  &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
+   pr_cont(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, 
Next Header=%d",
+   &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
nexthdr = ih->nexthdr;
offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr, 
&frag_off);
if (offset_ph == -1)
@@ -142,12 +142,12 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int 
hooknum,
 
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL) {
-   printk(" INCOMPLETE ARP header");
+   pr_cont(" INCOMPLETE ARP header");
goto out;
}
-   printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
-  ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
-  ntohs(ah->ar_op));
+   pr_cont(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
+   ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
+   ntohs(ah->ar_op));
 
/* If it's for Ethernet and the lengths are OK,
 * then log the ARP payload
@@ -161,17 +161,17 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int 
hooknum,
ap = skb_header_pointer(skb, sizeof(_arph),
sizeof(_arpp), &_arpp);
if (ap == NULL) {
-   printk(" INCOMPLETE ARP payload");
+   pr_cont(" INCOMPLETE ARP payload");
goto out;
}
-   printk(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC 
DST=%pM ARP IP DST=%pI4",
-   ap->mac_src, ap->ip_src, ap->mac_dst, 
ap->ip_dst);
+   pr_cont(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC 
DST=%pM ARP IP DST=%pI4",
+   ap->mac_src, ap->ip_src,
+   ap->mac_dst, ap->ip_dst);
}
}
 out:
-   printk("\n");
+   pr_cont("\n");
spin_unlock_bh(&ebt_log_lock);
-
 }
 
 static unsigned int
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c 
b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index c9b52c361da2..ef49989c93b1 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -998,18 +998,6 @@ static unsigned char snmp_t

[PATCH net-next v3 3/3] net: ethernet: bgmac: driver power manangement

2017-02-28 Thread Jon Mason
From: Joey Zhong 

Implement suspend/resume callbacks in the bgmac driver. This makes sure
that we de-initialize and re-initialize the hardware correctly before
entering suspend and when resuming.

Signed-off-by: Joey Zhong 
Signed-off-by: Jon Mason 
Reviewed-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bgmac-platform.c | 34 +
 drivers/net/ethernet/broadcom/bgmac.c  | 51 ++
 drivers/net/ethernet/broadcom/bgmac.h  |  2 +
 3 files changed, 87 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c 
b/drivers/net/ethernet/broadcom/bgmac-platform.c
index 5a3d0b7..ce47728 100644
--- a/drivers/net/ethernet/broadcom/bgmac-platform.c
+++ b/drivers/net/ethernet/broadcom/bgmac-platform.c
@@ -21,8 +21,12 @@
 #include 
 #include "bgmac.h"
 
+#define NICPM_PADRING_CFG  0x0004
 #define NICPM_IOMUX_CTRL   0x0008
 
+#define NICPM_PADRING_CFG_INIT_VAL 0x7400
+#define NICPM_IOMUX_CTRL_INIT_VAL_AX   0x2188
+
 #define NICPM_IOMUX_CTRL_INIT_VAL  0x3196e000
 #define NICPM_IOMUX_CTRL_SPD_SHIFT 10
 #define NICPM_IOMUX_CTRL_SPD_10M   0
@@ -108,6 +112,10 @@ static void bgmac_nicpm_speed_set(struct net_device 
*net_dev)
if (!bgmac->plat.nicpm_base)
return;
 
+   /* SET RGMII IO CONFIG */
+   writel(NICPM_PADRING_CFG_INIT_VAL,
+  bgmac->plat.nicpm_base + NICPM_PADRING_CFG);
+
val = NICPM_IOMUX_CTRL_INIT_VAL;
switch (bgmac->net_dev->phydev->speed) {
default:
@@ -239,6 +247,31 @@ static int bgmac_remove(struct platform_device *pdev)
return 0;
 }
 
+#ifdef CONFIG_PM
+static int bgmac_suspend(struct device *dev)
+{
+   struct bgmac *bgmac = dev_get_drvdata(dev);
+
+   return bgmac_enet_suspend(bgmac);
+}
+
+static int bgmac_resume(struct device *dev)
+{
+   struct bgmac *bgmac = dev_get_drvdata(dev);
+
+   return bgmac_enet_resume(bgmac);
+}
+
+static const struct dev_pm_ops bgmac_pm_ops = {
+   .suspend = bgmac_suspend,
+   .resume = bgmac_resume
+};
+
+#define BGMAC_PM_OPS (&bgmac_pm_ops)
+#else
+#define BGMAC_PM_OPS NULL
+#endif /* CONFIG_PM */
+
 static const struct of_device_id bgmac_of_enet_match[] = {
{.compatible = "brcm,amac",},
{.compatible = "brcm,nsp-amac",},
@@ -252,6 +285,7 @@ static struct platform_driver bgmac_enet_driver = {
.driver = {
.name  = "bgmac-enet",
.of_match_table = bgmac_of_enet_match,
+   .pm = BGMAC_PM_OPS
},
.probe = bgmac_probe,
.remove = bgmac_remove,
diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 6b7782f..7f516a2 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -1480,6 +1480,7 @@ int bgmac_enet_probe(struct bgmac *bgmac)
 
net_dev->irq = bgmac->irq;
SET_NETDEV_DEV(net_dev, bgmac->dev);
+   dev_set_drvdata(bgmac->dev, bgmac);
 
if (!is_valid_ether_addr(net_dev->dev_addr)) {
dev_err(bgmac->dev, "Invalid MAC addr: %pM\n",
@@ -1552,5 +1553,55 @@ void bgmac_enet_remove(struct bgmac *bgmac)
 }
 EXPORT_SYMBOL_GPL(bgmac_enet_remove);
 
+int bgmac_enet_suspend(struct bgmac *bgmac)
+{
+   if (!netif_running(bgmac->net_dev))
+   return 0;
+
+   phy_stop(bgmac->net_dev->phydev);
+
+   netif_stop_queue(bgmac->net_dev);
+
+   napi_disable(&bgmac->napi);
+
+   netif_tx_lock(bgmac->net_dev);
+   netif_device_detach(bgmac->net_dev);
+   netif_tx_unlock(bgmac->net_dev);
+
+   bgmac_chip_intrs_off(bgmac);
+   bgmac_chip_reset(bgmac);
+   bgmac_dma_cleanup(bgmac);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(bgmac_enet_suspend);
+
+int bgmac_enet_resume(struct bgmac *bgmac)
+{
+   int rc;
+
+   if (!netif_running(bgmac->net_dev))
+   return 0;
+
+   rc = bgmac_dma_init(bgmac);
+   if (rc)
+   return rc;
+
+   bgmac_chip_init(bgmac);
+
+   napi_enable(&bgmac->napi);
+
+   netif_tx_lock(bgmac->net_dev);
+   netif_device_attach(bgmac->net_dev);
+   netif_tx_unlock(bgmac->net_dev);
+
+   netif_start_queue(bgmac->net_dev);
+
+   phy_start(bgmac->net_dev->phydev);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(bgmac_enet_resume);
+
 MODULE_AUTHOR("Rafał Miłecki");
 MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/broadcom/bgmac.h 
b/drivers/net/ethernet/broadcom/bgmac.h
index a75ed35..c181876 100644
--- a/drivers/net/ethernet/broadcom/bgmac.h
+++ b/drivers/net/ethernet/broadcom/bgmac.h
@@ -537,6 +537,8 @@ int bgmac_enet_probe(struct bgmac *bgmac);
 void bgmac_enet_remove(struct bgmac *bgmac);
 void bgmac_adjust_link(struct net_device *net_dev);
 int bgmac_phy_connect_direct(struct bgmac *bgmac);
+int bgmac_enet_suspend(struct bgmac *bgmac);
+int bgmac_enet_resume(struct bgmac *bgmac);
 
 struct mii_bus *bcma_mdio_mii_register(struct bgmac *b

Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Eric Dumazet
On Tue, 2017-02-28 at 13:09 -0800, Andy Lutomirski wrote:

> Does this mean that a user program that does a zerocopy send can cause
> a retransmitted segment to contain different data than the original
> segment?  If so, is that okay?

Same remark applies to sendfile() already, or other zero copy modes
(vmsplice() + splice() )






Re: [PATCH] iproute2: show network device dependency tree

2017-02-28 Thread Jiri Pirko
Tue, Feb 28, 2017 at 09:19:23PM CET, zaboj.camp...@post.cz wrote:
>On Mon, 2017-02-27 at 10:55 -0800, Stephen Hemminger wrote:
>> 
>> Another alternative format would be to make -tree a output modifier and 
>> ident (like ps tree options).
>> 
>> $ ip -t link
>> 1: lo:  mtu 65536 qdisc noqueue state UNKNOWN mode 
>> DEFAULT group default qlen 1
>> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
>> 8: bond0  mtu 1500 qdisc pfifo_fast state DOWN mode 
>> DEFAULT group default qlen 1000
>> link/ether 52:54:00:66:24:cd brd ff:ff:ff:ff:ff:ff 
>> 2: eth1:  mtu 1500 qdisc pfifo_fast master bond0 
>> state DOWN mode DEFAULT group default qlen 1000
>> link/ether 52:54:00:66:24:cd brd ff:ff:ff:ff:ff:ff
>
>OK, it looks better because the information is complete.
>
>I looked at several tree printing utilities and I like
>the lsblk format. But tabular format does not fit the ip-show
>well. I will try to indent the current ip-show output may be
>with fancy lines.

I don't really think that the output you provide has any value. As JiriB
wrote, it is really problematic to draw relationships between devices.
Let's take OVS as a very non-trivial example. Your patch aims to handle
only the very basic ones.


[PATCH net v1 0/3] amd-xgbe: AMD XGBE driver fixes 2017-02-28

2017-02-28 Thread Tom Lendacky
This patch series addresses some issues in the AMD XGBE driver.

The following fixes are included in this driver update series:

- Stop the PHY before disabling and releasing device interrupts so that
  MDIO requests issued by the device can be properly handled
- Set the MDIO communication mode on device startup, not just device
  probe
- Do not overwrite SFP settings when mod_absent is detected

This patch series is based on net.

---

Tom Lendacky (3):
  amd-xgbe: Stop the PHY before releasing interrupts
  amd-xgbe: Be sure to set MDIO modes on device (re)start
  amd-xgbe: Don't overwrite SFP PHY mod_absent settings


 drivers/net/ethernet/amd/xgbe/xgbe-dev.c|2 +-
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c|4 ++--
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |   24 
 3 files changed, 27 insertions(+), 3 deletions(-)

-- 
Tom Lendacky


Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Andy Lutomirski
On Tue, Feb 28, 2017 at 12:43 PM, Willem de Bruijn
 wrote:
>
>> I can see this working if you have a special type of skb that
>> indicates that the data might be concurrently written and have all the
>> normal skb APIs (including, especially, anything that clones it) make
>> a copy first.
>
> Support for cloned skbs is required for TCP, both at tcp_transmit_skb
> and segmentation offload. Patch 4 especially adds reference counting
> of shared pages across clones and other sk_buff operations like
> pskb_expand_head. This still allows for deep copy (skb_copy_ubufs)
> on clones in specific datapaths like the above.

Does this mean that a user program that does a zerocopy send can cause
a retransmitted segment to contain different data than the original
segment?  If so, is that okay?


Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Willem de Bruijn
>>> I can see this working if you have a special type of skb that
>>> indicates that the data might be concurrently written and have all the
>>> normal skb APIs (including, especially, anything that clones it) make
>>> a copy first.
>>
>> Support for cloned skbs is required for TCP, both at tcp_transmit_skb
>> and segmentation offload. Patch 4 especially adds reference counting
>> of shared pages across clones and other sk_buff operations like
>> pskb_expand_head. This still allows for deep copy (skb_copy_ubufs)
>> on clones in specific datapaths like the above.
>
> Does this mean that a user program that does a zerocopy send can cause
> a retransmitted segment to contain different data than the original
> segment? If so, is that okay?

That is possible, indeed. The bytestream at the receiver is then
likely undefined. Though integrity of the tcp receive stack should
not be affected. A valid question is whether the stack should protect
against such users. The pattern is reminiscent of evasion attacks. In
the limited case, privileged users already can generate this data
pattern, of course.


Re: [drivers/net/vxlan]Why rcu_read_lock is not obtained before rculist travelling

2017-02-28 Thread Cong Wang
On Tue, Feb 28, 2017 at 6:03 AM, 颜小波  wrote:
> But I don’t find any rcu_read_lock invoked before travelling fdb_head list.  
> In vxlan_xmit and vxlan_snoop function, vxlan_find_mac function is called to 
> search the vxlan_fdb of the dst_mac or src_mac. Then information in vxlan_fdb 
>  is used for further process.  But as no rcu_read_lock is obtained before the 
> list travelling, I am wondering if it is possible that vxlan_fdb is freed 
> when it is being used.
>

In both RX and TX paths, rcu read lock is acquired by upper layer.
Check __dev_queue_xmit() and process_backlog().


Re: Extending socket timestamping API for NTP

2017-02-28 Thread Willem de Bruijn
>> > With this change I'm getting two error messages per transmission, but
>> > it looks like it may need some additional changes.
>> >
>> > If the first error message is received after the HW timestamp was
>> > captured,
>>
>> When does this happen? The first timestamp is generated from
>> skb_tx_timestamp in the device driver's ndo_start_xmit before
>> passing the packet to the NIC, the second when the device
>> driver cleans the tx descriptor on completion.
>
> As I understand it, it happens when the first skb (created by the
> skb_tx_timestamp() call) is received by the application after the
> driver called skb_tstamp_tx() with the HW timestamp. The SW timestamps
> are separate, but the HW timestamp is shared between clones. It

Oh right, the conversion to struct scm_timestamping only happens
on socket read in __sock_recv_timestamp.

> probably doesn't happen with the TSONLY option as it allocates a new
> skb. When I print timestamps from scm_timestamping I see a mix of two
> cases:
>
> TX 1488268812.193945472 0.0 1488286813.273760139
> TX 0.0 0.0 1488286813.273760139
> RX 1488268812.354356188 0.0 1488286813.434096389
>
> TX 1488268816.364407934 0.0 0.0
> TX 0.0 0.0 1488286817.444251014
> RX 1488268816.525150589 0.0 1488286817.604749889
>
> In the first case I assume the HW timestamp was saved before the first
> error message was received, so both error messages have the same HW
> timestamp, but only one has the SW timestamp. In the second case, the
> HW timestamp was saved later, so there is one message with SW
> timestamp and one message with HW timestamp.
>
> From the application point of view it would make sense if in the first
> case there was only one error message containing both timestamps. I'm

Agreed. I just proposed something similar on the error queue for
zerocopy notifications in http://patchwork.ozlabs.org/patch/731214/

> not sure how easy/safe it would be to drop the second skb. The other
> approach would be to not put HW timestamp in the first message when
> this "dual TX timestamping" option is enabled, so each error message
> has only one timestamp.

If it's possible to avoid one skb_clone completely, then that is preferable
over creating both and consuming one. If either approach becomes
complex, then queuing two separate messages is fine. A process
can recvmmsg(), after all. As long as the behavior is consistent.


Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Dmitry Vyukov
On Wed, Mar 1, 2017 at 12:06 AM, Sowmini Varadhan
 wrote:
> Just posted an RFC patch, that I'm also testing here..
> hopefully we'll se the pr_info light up, and know that the problematic
> situation actually happened (I'll remove the pr_info if/when this
> gets submitted as a non-RFC patch).. thanks for helping with testing
> this..


But the other 2 use-after-frees happened on cp->cp_send_w. Shouldn't
we cancel it as well? And cp_recv_w?


[PATCH] net/mlx5e: add IPV6 dependency

2017-02-28 Thread Arnd Bergmann
The ethernet support now calls directly into the ipv6 core code, which
fails if IPV6 is a loadable module but mlx5 is built-in:

drivers/net/ethernet/mellanox/mlx5/core/en_tc.o: In function 
`mlx5e_create_encap_header_ipv6':
en_tc.c:(.text.mlx5e_create_encap_header_ipv6+0x110): undefined reference to 
`ip6_route_output_flags'

This adds a dependency to ensure that MLX5_CORE_EN can only be built
if we are able link the kernel successfully. The downside is that the
ethernet option can be hidden. Alternatively we could make MLX5_CORE
depend on "IPV6 || !IPV6", which would force MLX5_CORE to be a module
when IPV6 is, including in configurations where we don't use the ethernet
support at all.

Signed-off-by: Arnd Bergmann 
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig 
b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index ddb4ca4ff930..117170014e88 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -14,6 +14,7 @@ config MLX5_CORE
 config MLX5_CORE_EN
bool "Mellanox Technologies ConnectX-4 Ethernet support"
depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE
+   depends on IPV6=y || IPV6=n || MLX5_CORE=m
imply PTP_1588_CLOCK
default n
---help---
-- 
2.9.0



[PATCH net v1 1/3] amd-xgbe: Stop the PHY before releasing interrupts

2017-02-28 Thread Tom Lendacky
Some configurations require the use of the hardware's MDIO support to
communicate with external PHYs. The MDIO commands indicate completion
through the device interrupt. When bringing down the device the interrupts
were released before stopping the external PHY, resulting in MDIO command
timeouts. Move the stopping of the PHY to before the releasing of the
interrupts.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 3aa457c..248f60d 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1131,12 +1131,12 @@ static void xgbe_stop(struct xgbe_prv_data *pdata)
hw_if->disable_tx(pdata);
hw_if->disable_rx(pdata);
 
+   phy_if->phy_stop(pdata);
+
xgbe_free_irqs(pdata);
 
xgbe_napi_disable(pdata, 1);
 
-   phy_if->phy_stop(pdata);
-
hw_if->exit(pdata);
 
channel = pdata->channel;



[PATCH net v1 2/3] amd-xgbe: Be sure to set MDIO modes on device (re)start

2017-02-28 Thread Tom Lendacky
The MDIO register mode is set when the device is probed. But when the
device is brought down and then back up, the MDIO register mode has been
reset.  Be sure to reset the mode during device startup and only change
the mode of the address specified.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c|2 +-
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |   22 ++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index a7d16db..937f37a 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -1323,7 +1323,7 @@ static int xgbe_read_ext_mii_regs(struct xgbe_prv_data 
*pdata, int addr,
 static int xgbe_set_ext_mii_mode(struct xgbe_prv_data *pdata, unsigned int 
port,
 enum xgbe_mdio_mode mode)
 {
-   unsigned int reg_val = 0;
+   unsigned int reg_val = XGMAC_IOREAD(pdata, MAC_MDIOCL22R);
 
switch (mode) {
case XGBE_MDIO_MODE_CL22:
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 9d8c9530..04804cb 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -875,6 +875,16 @@ static int xgbe_phy_find_phy_device(struct xgbe_prv_data 
*pdata)
!phy_data->sfp_phy_avail)
return 0;
 
+   /* Set the proper MDIO mode for the PHY */
+   ret = pdata->hw_if.set_ext_mii_mode(pdata, phy_data->mdio_addr,
+   phy_data->phydev_mode);
+   if (ret) {
+   netdev_err(pdata->netdev,
+  "mdio port/clause not compatible (%u/%u)\n",
+  phy_data->mdio_addr, phy_data->phydev_mode);
+   return ret;
+   }
+
/* Create and connect to the PHY device */
phydev = get_phy_device(phy_data->mii, phy_data->mdio_addr,
(phy_data->phydev_mode == XGBE_MDIO_MODE_CL45));
@@ -2722,6 +2732,18 @@ static int xgbe_phy_start(struct xgbe_prv_data *pdata)
if (ret)
return ret;
 
+   /* Set the proper MDIO mode for the re-driver */
+   if (phy_data->redrv && !phy_data->redrv_if) {
+   ret = pdata->hw_if.set_ext_mii_mode(pdata, phy_data->redrv_addr,
+   XGBE_MDIO_MODE_CL22);
+   if (ret) {
+   netdev_err(pdata->netdev,
+  "redriver mdio port not compatible (%u)\n",
+  phy_data->redrv_addr);
+   return ret;
+   }
+   }
+
/* Start in highest supported mode */
xgbe_phy_set_mode(pdata, phy_data->start_mode);
 



ipv6 sysctl

2017-02-28 Thread Ani Sinha
Hi guys,

Commit a79ca223e029 ('ipv6: fix bad free of addrconf_init_net')
introduced in linux 3.9 tries to fix an issue involving free-ing
statically allocated memory. Additionally, it subtly changes behavior
of how certain ipv6 sysctl values are inherited from the default net
namespace to the child namespaces.   Before a79ca223e029, the default
namespace would directly modify the values in statically allocated
struct ipv6_devconf for example and all child namespaces would inherit
these values upon creation (their own private copy was initialized
using the statically allocated ipv6_devconf). After this change, any
sysctl value changes in default net namespace is not seen by any new
child namespaces that are created afterwards. This is because all
network namespaces, including the default namespace has it's own
private copy of  struct ipv6_devconf which is initialized by certain
fixed values. This is in contrast to what we have in ipv4 where child
namespaces continues to inherit values from the default namespace upon
creation.

I see that there was a previous discussion here :
https://patchwork.kernel.org/patch/4639391/

Was the above inconsistency between ipv4 and ipv6 sysctl
initialization intentional or was it an unintended effect of the above
change ? It would be nice to have a symmetric behavior between ipv4
and ipv6. Please share your thoughts on this.

thanks,
ani


RE: [Intel-wired-lan] [PATCH] e1000e: fix timing for 82579 Gigabit Ethernet controller

2017-02-28 Thread Keller, Jacob E
> -Original Message-
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On
> Behalf Of Neftin, Sasha
> Sent: Monday, February 27, 2017 12:40 AM
> To: Bernd Faust ; Kirsher, Jeffrey T
> ; intel-wired-...@lists.osuosl.org;
> netdev@vger.kernel.org; linux-ker...@vger.kernel.org
> Subject: Re: [Intel-wired-lan] [PATCH] e1000e: fix timing for 82579 Gigabit
> Ethernet controller
> 
> On 2/26/2017 11:08, Neftin, Sasha wrote:
> > On 2/19/2017 14:55, Neftin, Sasha wrote:
> >> On 2/16/2017 20:42, Bernd Faust wrote:
> >>> After an upgrade to Linux kernel v4.x the hardware timestamps of the
> >>> 82579 Gigabit Ethernet Controller are different than expected.
> >>> The values that are being read are almost four times as big as before
> >>> the kernel upgrade.
> >>>
> >>> The difference is that after the upgrade the driver sets the clock
> >>> frequency to 25MHz, where before the upgrade it was set to 96MHz. Intel
> >>> confirmed that the correct frequency for this network adapter is 96MHz.
> >>>
> >>> Signed-off-by: Bernd Faust 
> >>> ---
> >>>   drivers/net/ethernet/intel/e1000e/netdev.c | 6 ++
> >>>   1 file changed, 6 insertions(+)
> >>>
> >>> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c
> >>> b/drivers/net/ethernet/intel/e1000e/netdev.c
> >>> index 7017281..8b7113d 100644
> >>> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> >>> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
> >>> @@ -3511,6 +3511,12 @@ s32 e1000e_get_base_timinca(struct
> >>> e1000_adapter *adapter, u32 *timinca)
> >>>
> >>>   switch (hw->mac.type) {
> >>>   case e1000_pch2lan:
> >>> +/* Stable 96MHz frequency */
> >>> +incperiod = INCPERIOD_96MHz;
> >>> +incvalue = INCVALUE_96MHz;
> >>> +shift = INCVALUE_SHIFT_96MHz;
> >>> +adapter->cc.shift = shift + INCPERIOD_SHIFT_96MHz;
> >>> +break;
> >>>   case e1000_pch_lpt:
> >>>   if (er32(TSYNCRXCTL) & E1000_TSYNCRXCTL_SYSCFI) {
> >>>   /* Stable 96MHz frequency */
> >>> --
> >>> 2.7.4
> >>> ___
> >>> Intel-wired-lan mailing list
> >>> intel-wired-...@lists.osuosl.org
> >>> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan
> >>
> >> Hello,
> >>
> >> e1000_pch2lan mac type corresponds to 82579LM and 82579V network
> >> adapters. System clock frequency indication (SYSCFI) for these
> >> devices supports both 25MHz and 96MHz frequency. By default
> >> TSYNCRXCTL.SYSCFI is set to 1 and that means 96MHz frequency is picked.
> >>
> >> It is better to keep the current implementation as it covers all
> >> options.
> >>
> >> Thanks,
> >>
> >> Sasha
> >>
> > Hello,
> >
> > During last couple of weeks I saw few  complaints from community on
> > same timing problem with 82579. I will double check clock definition
> > with HW architecture.
> >
> > Sasha
> >
> I've double checked - 82579 support 96MHz frequency only. So, let's
> accept this suggestion to upstream.
> 
> Ack.
> 

This resolves also a complaint from someone on the LinuxPTP development mailing 
list.

ACK.

Thanks,
Jake



Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Sowmini Varadhan
Just posted an RFC patch, that I'm also testing here..
hopefully we'll se the pr_info light up, and know that the problematic
situation actually happened (I'll remove the pr_info if/when this
gets submitted as a non-RFC patch).. thanks for helping with testing
this..

--Sowmini



Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Andy Lutomirski
On Tue, Feb 28, 2017 at 12:43 PM, Willem de Bruijn
 wrote:
> On Tue, Feb 28, 2017 at 2:46 PM, Andy Lutomirski  wrote:
>> On Mon, Feb 27, 2017 at 10:57 AM, Michael Kerrisk
>>  wrote:
>>> [CC += linux-...@vger.kernel.org]
>>>
>>> Hi Willem
>>>
>>
 On a send call with MSG_ZEROCOPY, the kernel pins the user pages and
 creates skbuff fragments directly from these pages. On tx completion,
 it notifies the socket owner that it is safe to modify memory by
 queuing a completion notification onto the socket error queue.
>>
>> What happens if the user writes to the pages while it's not safe?
>>
>> How about if you're writing to an interface or a route that has crypto
>> involved and a malicious user can make the data change in the middle
>> of a crypto operation, thus perhaps leaking the entire key?  (I
>> wouldn't be at all surprised if a lot of provably secure AEAD
>> constructions are entirely compromised if an attacker can get the
>> ciphertext and tag computed from a message that changed during the
>> computation.
>
> Operations that read or write payload, such as this crypto example,
> but also ebpf in tc or iptables, for instance, demand a deep copy using
> skb_copy_ubufs before the operation.
>
> This blacklist approach requires caution, but these paths should be
> few and countable. It is not possible to predict at the socket layer
> whether a packet will encounter any such operation, so white-listing
> a subset of end-to-end paths is not practical.

How about hardware that malfunctions if the packet changes out from
under it?  A whitelist seems quite a bit safer.

--Andy


Re: [PATCH] iproute2: show network device dependency tree

2017-02-28 Thread Zaboj Campula
On Mon, 2017-02-27 at 10:55 -0800, Stephen Hemminger wrote:
> 
> Another alternative format would be to make -tree a output modifier and ident 
> (like ps tree options).
> 
> $ ip -t link
> 1: lo:  mtu 65536 qdisc noqueue state UNKNOWN mode 
> DEFAULT group default qlen 1
> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> 8: bond0  mtu 1500 qdisc pfifo_fast state DOWN mode 
> DEFAULT group default qlen 1000
> link/ether 52:54:00:66:24:cd brd ff:ff:ff:ff:ff:ff 
> 2: eth1:  mtu 1500 qdisc pfifo_fast master bond0 
> state DOWN mode DEFAULT group default qlen 1000
> link/ether 52:54:00:66:24:cd brd ff:ff:ff:ff:ff:ff

OK, it looks better because the information is complete.

I looked at several tree printing utilities and I like
the lsblk format. But tabular format does not fit the ip-show
well. I will try to indent the current ip-show output may be
with fancy lines.



Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Willem de Bruijn
On Tue, Feb 28, 2017 at 2:46 PM, Andy Lutomirski  wrote:
> On Mon, Feb 27, 2017 at 10:57 AM, Michael Kerrisk
>  wrote:
>> [CC += linux-...@vger.kernel.org]
>>
>> Hi Willem
>>
>
>>> On a send call with MSG_ZEROCOPY, the kernel pins the user pages and
>>> creates skbuff fragments directly from these pages. On tx completion,
>>> it notifies the socket owner that it is safe to modify memory by
>>> queuing a completion notification onto the socket error queue.
>
> What happens if the user writes to the pages while it's not safe?
>
> How about if you're writing to an interface or a route that has crypto
> involved and a malicious user can make the data change in the middle
> of a crypto operation, thus perhaps leaking the entire key?  (I
> wouldn't be at all surprised if a lot of provably secure AEAD
> constructions are entirely compromised if an attacker can get the
> ciphertext and tag computed from a message that changed during the
> computation.

Operations that read or write payload, such as this crypto example,
but also ebpf in tc or iptables, for instance, demand a deep copy using
skb_copy_ubufs before the operation.

This blacklist approach requires caution, but these paths should be
few and countable. It is not possible to predict at the socket layer
whether a packet will encounter any such operation, so white-listing
a subset of end-to-end paths is not practical.

> I can see this working if you have a special type of skb that
> indicates that the data might be concurrently written and have all the
> normal skb APIs (including, especially, anything that clones it) make
> a copy first.

Support for cloned skbs is required for TCP, both at tcp_transmit_skb
and segmentation offload. Patch 4 especially adds reference counting
of shared pages across clones and other sk_buff operations like
pskb_expand_head. This still allows for deep copy (skb_copy_ubufs)
on clones in specific datapaths like the above.


Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Dmitry Vyukov
On Tue, Feb 28, 2017 at 6:33 PM, Sowmini Varadhan
 wrote:
> On (02/28/17 17:51), Dmitry Vyukov wrote:
>> Searching other crashes for "net/rds" I found 2 more crashes that may
>> be related. They suggest that the delayed works are not properly
>> stopped when the socket is destroyed. That would explain how
>> rds_connect_worker accesses freed net, right?
>
> yes, I think we may want to explicitly cancel this workq.. this
> in rds_conn_destroy().
>
> I'm trying to build/sanity-test (if lucky, reproduce the bug)
> as I send this out.. let me get back to you..
>
> If I have a patch against net-next, would you be willing/able to
> try it out? given that this does not show up on demand, I'm not
> sure how we can check that "the fix worked"..


Yes, I can now apply custom patches to the bots. However, it fired
only 3 times, so it will give weak signal. But at least it will test
that the patch does not cause other bad things.


Re: [PATCH RFC v2 00/12] socket sendmsg MSG_ZEROCOPY

2017-02-28 Thread Andy Lutomirski
On Mon, Feb 27, 2017 at 10:57 AM, Michael Kerrisk
 wrote:
> [CC += linux-...@vger.kernel.org]
>
> Hi Willem
>

>> On a send call with MSG_ZEROCOPY, the kernel pins the user pages and
>> creates skbuff fragments directly from these pages. On tx completion,
>> it notifies the socket owner that it is safe to modify memory by
>> queuing a completion notification onto the socket error queue.

What happens if the user writes to the pages while it's not safe?

How about if you're writing to an interface or a route that has crypto
involved and a malicious user can make the data change in the middle
of a crypto operation, thus perhaps leaking the entire key?  (I
wouldn't be at all surprised if a lot of provably secure AEAD
constructions are entirely compromised if an attacker can get the
ciphertext and tag computed from a message that changed during the
computation.

I can see this working if you have a special type of skb that
indicates that the data might be concurrently written and have all the
normal skb APIs (including, especially, anything that clones it) make
a copy first.

--Andy


Re: [PATCH] iproute2: show network device dependency tree

2017-02-28 Thread Zaboj Campula
On Mon, 2017-02-27 at 17:38 +0100, Jiri Benc wrote:
> 
> It produces dot (graphviz) output or json and has no dependencies on
> anything GUI related. Just run it on the remote machine and display the
> output locally.
> 
> ssh root@remote plotnetcfg | dot -Tpdf | whatever_pdf_viewer
> 
> Note that some pdf viewers can't read stdin or require dash as the
> parameter to use stdin.
> 
> I don't think it's possible to enhance iproute2 to display the network
> interface dependencies in an useful way. It's just too complex. It's
> not even a (undirected) tree.

I know there is an option to execute something remotely but I think 
a pure text output may be useful to get a quick overview.

Well it is impossible to draw a simple tree showing the configuration
exactly with all details. May be it is too ambitious to draw a tree
at all. But neither directory structure is a tree (when consider links)
and there are a plenty of tools showing directory tree.


Re: [PATCH RFC net-next v2 3/4] net: more accurate checksumming in validate_xmit_skb

2017-02-28 Thread Tom Herbert
On Tue, Feb 28, 2017 at 2:32 AM, Davide Caratti  wrote:
> Introduce skb->csum_not_inet to identify not-yet-checksummed SCTP packets.
> Use this bit in combination with netdev feature bit in validate_xmit_skb,
> to discriminate whether skb needs crc32c or 2-complement Internet Checksum
> (or none of the two, when the underlying device can do checksum offload).
>
> Signed-off-by: Davide Caratti 
> ---
>  include/linux/skbuff.h|  1 +
>  net/core/dev.c| 14 --
>  net/netfilter/ipvs/ip_vs_proto_sctp.c |  1 +
>  net/netfilter/nf_nat_proto_sctp.c |  1 +
>  net/sched/act_csum.c  |  1 +
>  net/sctp/output.c |  1 +
>  6 files changed, 17 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 0671131..236b7d9 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -759,6 +759,7 @@ struct sk_buff {
> __u8tc_redirected:1;
> __u8tc_from_ingress:1;
>  #endif
> +   __u8csum_not_inet:1;
>
Unfortunately this potentially pushes the skbuf flags over 32 bits if
I count correctly. I suggest that you rename csum_bad to
csum_not_inet. Looks like csum_bad is only set by a grand total of one
driver and I don't believe that is enough to justify its existence.
It's probably a good time to remove it.

>  #ifdef CONFIG_NET_SCHED
> __u16   tc_index;   /* traffic control index */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index b9fb843..fae3217 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2614,6 +2614,7 @@ int skb_sctp_csum_help(struct sk_buff *skb)
> }
> *(__le32 *)(skb->data + offset) = crc32c_csum;
> skb->ip_summed = CHECKSUM_NONE;
> +   skb->csum_not_inet = 0;
>  out:
> return ret;
>  }
> @@ -2960,6 +2961,16 @@ static struct sk_buff *validate_xmit_vlan(struct 
> sk_buff *skb,
> return skb;
>  }
>
> +static int skb_csum_hwoffload_help(struct sk_buff *skb,
> +  netdev_features_t features)
> +{
> +   if (unlikely(skb->csum_not_inet))
> +   return !(features & NETIF_F_SCTP_CRC) ?
> +   skb_sctp_csum_help(skb) : 0;
> +
Return value looks complex. Maybe we should just change
skb_csum_*_help to return bool, true of checksum was handled false if
not.

> +   return !(features & NETIF_F_CSUM_MASK) ? skb_checksum_help(skb) : 0;
> +}
> +
>  static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct 
> net_device *dev)
>  {
> netdev_features_t features;
> @@ -2995,8 +3006,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff 
> *skb, struct net_device
> else
> skb_set_transport_header(skb,
>  
> skb_checksum_start_offset(skb));
> -   if (!(features & NETIF_F_CSUM_MASK) &&
> -   skb_checksum_help(skb))
> +   if (skb_csum_hwoffload_help(skb, features))
> goto out_kfree_skb;
> }
> }
> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c 
> b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> index d952d67..4972a60 100644
> --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
> +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> @@ -81,6 +81,7 @@ static void sctp_nat_csum(struct sk_buff *skb, 
> sctp_sctphdr_t *sctph,
>   unsigned int sctphoff)
>  {
> sctph->checksum = sctp_compute_cksum(skb, sctphoff);
> +   skb->csum_not_inet = 0;
> skb->ip_summed = CHECKSUM_UNNECESSARY;
>  }
>
> diff --git a/net/netfilter/nf_nat_proto_sctp.c 
> b/net/netfilter/nf_nat_proto_sctp.c
> index 31d3586..9459b88 100644
> --- a/net/netfilter/nf_nat_proto_sctp.c
> +++ b/net/netfilter/nf_nat_proto_sctp.c
> @@ -49,6 +49,7 @@ sctp_manip_pkt(struct sk_buff *skb,
>
> if (skb->ip_summed != CHECKSUM_PARTIAL) {
> hdr->checksum = sctp_compute_cksum(skb, hdroff);
> +   skb->csum_not_inet = 0;
> skb->ip_summed = CHECKSUM_NONE;
> }
>
> diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
> index e978ccd4..85cb150 100644
> --- a/net/sched/act_csum.c
> +++ b/net/sched/act_csum.c
> @@ -337,6 +337,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned 
> int ihl,
>
> sctph->checksum = sctp_compute_cksum(skb,
>  skb_network_offset(skb) + ihl);
> +   skb->csum_not_inet = 0;
> skb->ip_summed = CHECKSUM_NONE;
>
> return 1;
> diff --git a/net/sctp/output.c b/net/sctp/output.c
> index 814eac0..0dc227b 100644
> --- a/net/sctp/output.c
> +++ b/net/sctp/output.c
> @@ -528,6 +528,7 @@ static int sctp_packet_pack(struct sctp_packet *packet,
> } else {
>  chksum:
> head-

Re: [Patch net] ipv6: ignore null_entry in inet6_rtm_getroute() too

2017-02-28 Thread Cong Wang
On Tue, Feb 28, 2017 at 11:01 AM, David Ahern  wrote:
> On 2/28/17 10:44 AM, Cong Wang wrote:
>> Like commit 1f17e2f2c8a8 ("net: ipv6: ignore null_entry on route dumps"),
>> we need to ignore null entry in inet6_rtm_getroute() too.
>>
>> Return -ENOENT here because we return the same errno when deleting
>> the null entry.
>>
>> Fixes: a1a22c1206 ("net: ipv6: Keep nexthop of multipath route on admin 
>> down")
>> Reported-by: Dmitry Vyukov 
>> Cc: David Ahern 
>> Signed-off-by: Cong Wang 
>> ---
>>  net/ipv6/route.c | 6 ++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index f54f426..25590d1 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -3627,6 +3627,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, 
>> struct nlmsghdr *nlh)
>>   rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
>>   }
>>
>> + if (rt == net->ipv6.ip6_null_entry) {
>> + ip6_rt_put(rt);
>> + err = -ENOENT;
>> + goto errout;
>> + }
>> +
>>   skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
>>   if (!skb) {
>>   ip6_rt_put(rt);
>>
>
> hold on. That test exposed something else, not just a getroute problem.
> I accidentally ran 'unsahre -n; ip -6 ro ls' on my host machine instead
> of a VM, so took some time to recover. dumproute already covers the null
> route.
>

Of course, you already stated it in your commit:

ip6_null_entry is the root of all ipv6 fib tables making it integrated
into the table and hence passed to the ipv6 route dump code. The
null_entry route uses the loopback device for dst.dev but may not have
rt6i_idev set because of the order in which initializations are done --
ip6_route_net_init is run before addrconf_init has initialized the
loopback device. Fixing the initialization order is a much bigger problem
with no obvious solution thus far.

The BUG is triggered when the loopback is set down and the netif_running
check added by a1a22c1206 fails. The fill_node descends to checking
rt->rt6i_idev for ignore_routes_with_linkdown and since rt6i_idev is
NULL it faults.

The null_entry route should not be processed in a dump request. Catch
and ignore. This check is done in rt6_dump_route as it is the highest
place in the callchain with knowledge of both the route and the network
namespace.

which is why I omit it.

The rt->rt6i_idev = in6_dev_get(loopback_dev) is apparently not correct,
at that time loopback_dev is just registered and not up or running, its
in6_dev pointer should be NULL, we need to listen to inet6addr event to
make it non-NULL. I thought you apparently knew this...


Re: [Patch net] ipv6: ignore null_entry in inet6_rtm_getroute() too

2017-02-28 Thread David Ahern
On 2/28/17 10:44 AM, Cong Wang wrote:
> Like commit 1f17e2f2c8a8 ("net: ipv6: ignore null_entry on route dumps"),
> we need to ignore null entry in inet6_rtm_getroute() too.
> 
> Return -ENOENT here because we return the same errno when deleting
> the null entry.
> 
> Fixes: a1a22c1206 ("net: ipv6: Keep nexthop of multipath route on admin down")
> Reported-by: Dmitry Vyukov 
> Cc: David Ahern 
> Signed-off-by: Cong Wang 
> ---
>  net/ipv6/route.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index f54f426..25590d1 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -3627,6 +3627,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, 
> struct nlmsghdr *nlh)
>   rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
>   }
>  
> + if (rt == net->ipv6.ip6_null_entry) {
> + ip6_rt_put(rt);
> + err = -ENOENT;
> + goto errout;
> + }
> +
>   skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
>   if (!skb) {
>   ip6_rt_put(rt);
> 

hold on. That test exposed something else, not just a getroute problem.
I accidentally ran 'unsahre -n; ip -6 ro ls' on my host machine instead
of a VM, so took some time to recover. dumproute already covers the null
route.

I'll get back to this in the afternoon.


[PATCH v4 net] net: solve a NAPI race

2017-02-28 Thread Eric Dumazet
From: Eric Dumazet 

While playing with mlx4 hardware timestamping of RX packets, I found
that some packets were received by TCP stack with a ~200 ms delay...

Since the timestamp was provided by the NIC, and my probe was added
in tcp_v4_rcv() while in BH handler, I was confident it was not
a sender issue, or a drop in the network.

This would happen with a very low probability, but hurting RPC
workloads.

A NAPI driver normally arms the IRQ after the napi_complete_done(),
after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab
it.

Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit
while IRQ are not disabled, we might have later an IRQ firing and
finding this bit set, right before napi_complete_done() clears it.

This can happen with busy polling users, or if gro_flush_timeout is
used. But some other uses of napi_schedule() in drivers can cause this
as well.

thread 1 thread 2 (could be on same cpu, or not)

// busy polling or napi_watchdog()
napi_schedule();
...
napi->poll()

device polling:
read 2 packets from ring buffer
  Additional 3rd packet is
available.
  device hard irq

  // does nothing because
NAPI_STATE_SCHED bit is owned by thread 1
  napi_schedule();
  
napi_complete_done(napi, 2);
rearm_irq();


Note that rearm_irq() will not force the device to send an additional
IRQ for the packet it already signaled (3rd packet in my example)

This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep()
can set if it could not grab NAPI_STATE_SCHED

Then napi_complete_done() properly reschedules the napi to make sure
we do not miss something.

Since we manipulate multiple bits at once, use cmpxchg() like in
sk_busy_loop() to provide proper transactions.

In v2, I changed napi_watchdog() to use a relaxed variant of
napi_schedule_prep() : No need to set NAPI_STATE_MISSED from this point.

In v3, I added more details in the changelog and clears
NAPI_STATE_MISSED in busy_poll_stop()

In v4, I added the ideas given by Alexander Duyck in v3 review

Signed-off-by: Eric Dumazet 
Cc: Alexander Duyck 
---
 include/linux/netdevice.h |   29 -
 net/core/dev.c|   76 ++--
 2 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 
f40f0ab3847a8caaf46bd4d5f224c65014f501cc..97456b2539e46d6232dda804f6a434db6fd7134f
 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -330,6 +330,7 @@ struct napi_struct {
 
 enum {
NAPI_STATE_SCHED,   /* Poll is scheduled */
+   NAPI_STATE_MISSED,  /* reschedule a napi */
NAPI_STATE_DISABLE, /* Disable pending */
NAPI_STATE_NPSVC,   /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_HASHED,  /* In NAPI hash (busy polling possible) */
@@ -338,12 +339,13 @@ enum {
 };
 
 enum {
-   NAPIF_STATE_SCHED= (1UL << NAPI_STATE_SCHED),
-   NAPIF_STATE_DISABLE  = (1UL << NAPI_STATE_DISABLE),
-   NAPIF_STATE_NPSVC= (1UL << NAPI_STATE_NPSVC),
-   NAPIF_STATE_HASHED   = (1UL << NAPI_STATE_HASHED),
-   NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
-   NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
+   NAPIF_STATE_SCHED= BIT(NAPI_STATE_SCHED),
+   NAPIF_STATE_MISSED   = BIT(NAPI_STATE_MISSED),
+   NAPIF_STATE_DISABLE  = BIT(NAPI_STATE_DISABLE),
+   NAPIF_STATE_NPSVC= BIT(NAPI_STATE_NPSVC),
+   NAPIF_STATE_HASHED   = BIT(NAPI_STATE_HASHED),
+   NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+   NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
@@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct 
*n)
return test_bit(NAPI_STATE_DISABLE, &n->state);
 }
 
-/**
- * napi_schedule_prep - check if NAPI can be scheduled
- * @n: NAPI context
- *
- * Test if NAPI routine is already running, and if not mark
- * it as running.  This is used as a condition variable to
- * insure only one NAPI poll instance runs.  We also make
- * sure there is no pending NAPI disable.
- */
-static inline bool napi_schedule_prep(struct napi_struct *n)
-{
-   return !napi_disable_pending(n) &&
-   !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
-}
+bool napi_schedule_prep(struct napi_struct *n);
 
 /**
  * napi_schedule - schedule NAPI poll
diff --git a/net/core/dev.c b/net/core/dev.c
index 
304f2deae5f9897e60a79ed8b69d6ef208295ded..ab4fe5304834e43790fa023d99d7fd1844f8f728
 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4884,6 +4884,39 @@ void __napi_schedule(struct napi_struct *n)
 EXPORT_SYMBOL(__napi_schedule);
 
 /**

Re: usb/net/hso: WARNING: ungligned urb->setup_dma

2017-02-28 Thread Baruch Siach
Hi Stefan,

On Tue, Feb 28, 2017 at 05:21:18PM +0100, Stefan Wahren wrote:
> Am 28.02.2017 um 13:01 schrieb Baruch Siach:
> > On Tue, Feb 28, 2017 at 10:28:10AM +0200, Baruch Siach wrote:
> > > I'm hitting this warning consistently on my Raspberry Pi 3 running 
> > > kernel
> > > v4.10.1 with some unrelated device tree changes, and a debug print 
> > > (below).
> > > The device identifies as "GlobeTrotter HSDPA Modem", VID: 0af0, PID: 6971.
> > > The warning triggers consistently on first write access to /dev/ttyHS0 
> > > that
> > > ModemManager attempts. The first line in the log is my debug print.
> > I tested the same hardware successfully on an i.MX6 CuBox-i (ARM32) using 
> > the
> > same kernel version (4.10.1), and on an x86_64 PC (4.9). So this seems to be
> > platform specific. I don't have any other ARM64 machine at the moment, 
> > though.
> 
> those platforms usually doesn't use the dwc2 USB host controller. So it
> should be tested with another dwc2 platform.

The code that initializes setup_dma is not under drivers/usb/dwc2/. Though the 
problem looks like memory corruption, so its cause might be anywhere.

> Is the issue reproducible with other USB devices like keyboards?

Other USB devices, including a different cellular USB dongle, do not exhibit 
the same problem.

> Are you using arm64 defconfig?

I'm using that attached arm64 defconfig.

> Are you able to bisect this issue?

I don't have any previously working point.

Thanks,
baruch

-- 
 http://baruch.siach.name/blog/  ~. .~   Tk Open Systems
=}ooO--U--Ooo{=
   - bar...@tkos.co.il - tel: +972.52.368.4656, http://www.tkos.co.il -
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_AUDIT=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_MEMCG=y
CONFIG_MEMCG_SWAP=y
CONFIG_BLK_CGROUP=y
CONFIG_CGROUP_PIDS=y
CONFIG_CGROUP_HUGETLB=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
CONFIG_USER_NS=y
CONFIG_SCHED_AUTOGROUP=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_RD_BZIP2 is not set
# CONFIG_RD_LZMA is not set
# CONFIG_RD_XZ is not set
# CONFIG_RD_LZO is not set
# CONFIG_RD_LZ4 is not set
CONFIG_KALLSYMS_ALL=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_JUMP_LABEL=y
CONFIG_BLK_DEV_INTEGRITY=y
# CONFIG_IOSCHED_DEADLINE is not set
CONFIG_ARCH_BCM2835=y
CONFIG_ARM64_VA_BITS_48=y
CONFIG_SCHED_MC=y
CONFIG_HOTPLUG_CPU=y
CONFIG_PREEMPT=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_CMA=y
CONFIG_SECCOMP=y
CONFIG_CMDLINE="root=/dev/mmcblk0p2 rootwait console=ttyS0,115200"
CONFIG_CMDLINE_FORCE=y
# CONFIG_EFI is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_COMPAT=y
# CONFIG_SUSPEND is not set
CONFIG_PM=y
CONFIG_CPU_IDLE=y
CONFIG_ARM_CPUIDLE=y
CONFIG_CPU_FREQ=y
CONFIG_CPUFREQ_DT=y
CONFIG_ARM_BIG_LITTLE_CPUFREQ=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_NETFILTER=y
CONFIG_NF_CONNTRACK=y
CONFIG_NF_CONNTRACK_EVENTS=y
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=y
CONFIG_NETFILTER_XT_TARGET_LOG=y
CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
CONFIG_NF_CONNTRACK_IPV4=y
CONFIG_IP_NF_IPTABLES=y
CONFIG_IP_NF_FILTER=y
CONFIG_IP_NF_TARGET_REJECT=y
CONFIG_IP_NF_NAT=y
CONFIG_IP_NF_TARGET_MASQUERADE=y
CONFIG_IP_NF_MANGLE=y
CONFIG_NF_CONNTRACK_IPV6=y
CONFIG_IP6_NF_IPTABLES=y
CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
CONFIG_IP6_NF_NAT=y
CONFIG_IP6_NF_TARGET_MASQUERADE=y
CONFIG_BRIDGE=y
CONFIG_BRIDGE_VLAN_FILTERING=y
CONFIG_VLAN_8021Q=y
CONFIG_VLAN_8021Q_GVRP=y
CONFIG_VLAN_8021Q_MVRP=y
CONFIG_BT=y
CONFIG_BT_BNEP=y
CONFIG_BT_BNEP_MC_FILTER=y
CONFIG_BT_BNEP_PROTO_FILTER=y
CONFIG_BT_HIDP=y
# CONFIG_BT_HS is not set
CONFIG_BT_LEDS=y
CONFIG_BT_HCIUART=y
CONFIG_BT_HCIUART_BCM=y
CONFIG_RFKILL=y
CONFIG_RFKILL_REGULATOR=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DMA_CMA=y
CONFIG_VEXPRESS_CONFIG=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_SRAM=y
# CONFIG_VEXPRESS_SYSCFG is not set
CONFIG_SCSI=y
# CONFIG_SCSI_PROC_FS is not set
CONFIG_BLK_DEV_SD=y
CONFIG_CHR_DEV_SG=y
CONFIG_SCSI_SAS_LIBSAS=y
# CONFIG_SCSI_LOWLEVEL is not set
CONFIG_NETDEVICES=y
CONFIG_TUN=y
CONFIG_VETH=y
# CONFIG_ETHERNET is not set
CONFIG_PPP=y
CONFIG_PPP_BSDCOMP=y
CONFIG_PPP_DEFLATE=y
CONFIG_PPP_ASYNC=y
CONFIG_USB_USBNET=y
# CONFIG_USB_NET_AX8817X is not set
# CONFIG_USB_NET_AX88179_178A is not set
CONFIG_USB_NET_HUAWEI_CDC_NCM=y
CONFIG_USB_NET_CDC_MBIM=y
CONFIG_USB_NET_SMSC95XX=y
# CONFIG_USB_NET_NET1080 is not set
# CONFIG_USB_NET_CDC_SUBSET is not set
# CONFIG_USB_NET_ZAURUS is not set
CONFIG_USB_HSO=y
# CONFIG_WLAN is not set
CONFIG_INPUT_EVDEV=y
CONFIG_KEYBOARD_GPIO=y
CONFIG_INPUT_MISC=y
# 

[PATCH net-next v3 2/3] net: ethernet: bgmac: unify code of the same family

2017-02-28 Thread Jon Mason
BCM471X and BCM535X are of the same family (from what I can derive from
internal documents).  Group them into the case statement together, which
results in more code reuse.

Also, use existing helper variables to make the code a little more
readable too.

Signed-off-by: Jon Mason 
---
 drivers/net/ethernet/broadcom/bgmac-bcma.c | 64 +-
 1 file changed, 28 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c 
b/drivers/net/ethernet/broadcom/bgmac-bcma.c
index d59cfcc4..cf15b7e 100644
--- a/drivers/net/ethernet/broadcom/bgmac-bcma.c
+++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c
@@ -192,36 +192,50 @@ static int bgmac_probe(struct bcma_device *core)
goto err1;
}
 
-   bgmac->has_robosw = !!(core->bus->sprom.boardflags_lo &
-  BGMAC_BFL_ENETROBO);
+   bgmac->has_robosw = !!(sprom->boardflags_lo & BGMAC_BFL_ENETROBO);
if (bgmac->has_robosw)
dev_warn(bgmac->dev, "Support for Roboswitch not 
implemented\n");
 
-   if (core->bus->sprom.boardflags_lo & BGMAC_BFL_ENETADM)
+   if (sprom->boardflags_lo & BGMAC_BFL_ENETADM)
dev_warn(bgmac->dev, "Support for ADMtek ethernet switch not 
implemented\n");
 
/* Feature Flags */
-   switch (core->bus->chipinfo.id) {
+   switch (ci->id) {
+   /* BCM 471X/535X family */
+   case BCMA_CHIP_ID_BCM4716:
+   bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
+   /* fallthrough */
+   case BCMA_CHIP_ID_BCM47162:
+   bgmac->feature_flags |= BGMAC_FEAT_FLW_CTRL2;
+   bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK;
+   break;
case BCMA_CHIP_ID_BCM5357:
+   case BCMA_CHIP_ID_BCM53572:
bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK;
bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
bgmac->feature_flags |= BGMAC_FEAT_FLW_CTRL1;
bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_PHY;
-   if (core->bus->chipinfo.pkg == BCMA_PKG_ID_BCM47186) {
-   bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED;
+   if (ci->pkg == BCMA_PKG_ID_BCM47188 ||
+   ci->pkg == BCMA_PKG_ID_BCM47186) {
bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_RGMII;
+   bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED;
}
-   if (core->bus->chipinfo.pkg == BCMA_PKG_ID_BCM5358)
+   if (ci->pkg == BCMA_PKG_ID_BCM5358)
bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_EPHYRMII;
break;
-   case BCMA_CHIP_ID_BCM53572:
-   bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK;
+   case BCMA_CHIP_ID_BCM53573:
bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
-   bgmac->feature_flags |= BGMAC_FEAT_FLW_CTRL1;
-   bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_PHY;
-   if (core->bus->chipinfo.pkg == BCMA_PKG_ID_BCM47188) {
-   bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_RGMII;
+   bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK;
+   if (ci->pkg == BCMA_PKG_ID_BCM47189)
bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED;
+   if (core->core_unit == 0) {
+   bgmac->feature_flags |= BGMAC_FEAT_CC4_IF_SW_TYPE;
+   if (ci->pkg == BCMA_PKG_ID_BCM47189)
+   bgmac->feature_flags |=
+   BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII;
+   } else if (core->core_unit == 1) {
+   bgmac->feature_flags |= BGMAC_FEAT_IRQ_ID_OOB_6;
+   bgmac->feature_flags |= BGMAC_FEAT_CC7_IF_TYPE_RGMII;
}
break;
case BCMA_CHIP_ID_BCM4749:
@@ -229,18 +243,11 @@ static int bgmac_probe(struct bcma_device *core)
bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
bgmac->feature_flags |= BGMAC_FEAT_FLW_CTRL1;
bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_PHY;
-   if (core->bus->chipinfo.pkg == 10) {
+   if (ci->pkg == 10) {
bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_RGMII;
bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED;
}
break;
-   case BCMA_CHIP_ID_BCM4716:
-   bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
-   /* fallthrough */
-   case BCMA_CHIP_ID_BCM47162:
-   bgmac->feature_flags |= BGMAC_FEAT_FLW_CTRL2;
-   bgmac->feature_flags |= BGMAC_FEAT_SET_RXQ_CLK;
-   break;
/* bcm4707_family */
case BCMA_CHIP_ID_BCM4707:
case BCMA_CHIP_ID_BCM47094:
@@ -249,21 +256,6 @@ static int bgmac_probe(struct bcma_device *core)
bgmac->feature_flags |= B

[PATCH net v4 1/2] net: ethernet: bgmac: init sequence bug

2017-02-28 Thread Jon Mason
Fix a bug in the 'bgmac' driver init sequence that blind writes for init
sequence where it should preserve most bits other than the ones it is
deliberately manipulating.

The code now checks to see if the adapter needs to be brought out of
reset (where as before it was doing an IDM write to bring it out of
reset regardless of whether it was in reset or not).  Also, removed
unnecessary usleeps (as there is already a read present to flush the
IDM writes).

Signed-off-by: Zac Schroff 
Signed-off-by: Jon Mason 
Fixes: f6a95a24957 ("net: ethernet: bgmac: Add platform device support")
---
 drivers/net/ethernet/broadcom/bgmac-platform.c | 27 +-
 drivers/net/ethernet/broadcom/bgmac.h  | 16 +++
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c 
b/drivers/net/ethernet/broadcom/bgmac-platform.c
index 7b1af95..da1b8b2 100644
--- a/drivers/net/ethernet/broadcom/bgmac-platform.c
+++ b/drivers/net/ethernet/broadcom/bgmac-platform.c
@@ -51,8 +51,7 @@ static void platform_bgmac_idm_write(struct bgmac *bgmac, u16 
offset, u32 value)
 
 static bool platform_bgmac_clk_enabled(struct bgmac *bgmac)
 {
-   if ((bgmac_idm_read(bgmac, BCMA_IOCTL) &
-(BCMA_IOCTL_CLK | BCMA_IOCTL_FGC)) != BCMA_IOCTL_CLK)
+   if ((bgmac_idm_read(bgmac, BCMA_IOCTL) & BGMAC_CLK_EN) != BGMAC_CLK_EN)
return false;
if (bgmac_idm_read(bgmac, BCMA_RESET_CTL) & BCMA_RESET_CTL_RESET)
return false;
@@ -61,15 +60,25 @@ static bool platform_bgmac_clk_enabled(struct bgmac *bgmac)
 
 static void platform_bgmac_clk_enable(struct bgmac *bgmac, u32 flags)
 {
-   bgmac_idm_write(bgmac, BCMA_IOCTL,
-   (BCMA_IOCTL_CLK | BCMA_IOCTL_FGC | flags));
-   bgmac_idm_read(bgmac, BCMA_IOCTL);
+   u32 val;
 
-   bgmac_idm_write(bgmac, BCMA_RESET_CTL, 0);
-   bgmac_idm_read(bgmac, BCMA_RESET_CTL);
-   udelay(1);
+   /* The Reset Control register only contains a single bit to show if the
+* controller is currently in reset.  Do a sanity check here, just in
+* case the bootloader happened to leave the device in reset.
+*/
+   val = bgmac_idm_read(bgmac, BCMA_RESET_CTL);
+   if (val) {
+   bgmac_idm_write(bgmac, BCMA_RESET_CTL, 0);
+   bgmac_idm_read(bgmac, BCMA_RESET_CTL);
+   udelay(1);
+   }
 
-   bgmac_idm_write(bgmac, BCMA_IOCTL, (BCMA_IOCTL_CLK | flags));
+   val = bgmac_idm_read(bgmac, BCMA_IOCTL);
+   /* Some bits of BCMA_IOCTL set by HW/ATF and should not change */
+   val |= flags & ~(BGMAC_AWCACHE | BGMAC_ARCACHE | BGMAC_AWUSER |
+BGMAC_ARUSER);
+   val |= BGMAC_CLK_EN;
+   bgmac_idm_write(bgmac, BCMA_IOCTL, val);
bgmac_idm_read(bgmac, BCMA_IOCTL);
udelay(1);
 }
diff --git a/drivers/net/ethernet/broadcom/bgmac.h 
b/drivers/net/ethernet/broadcom/bgmac.h
index 248727d..6d1c6ff 100644
--- a/drivers/net/ethernet/broadcom/bgmac.h
+++ b/drivers/net/ethernet/broadcom/bgmac.h
@@ -213,6 +213,22 @@
 /* BCMA GMAC core specific IO Control (BCMA_IOCTL) flags */
 #define BGMAC_BCMA_IOCTL_SW_CLKEN  0x0004  /* PHY Clock 
Enable */
 #define BGMAC_BCMA_IOCTL_SW_RESET  0x0008  /* PHY Reset */
+/* The IOCTL values appear to be different in NS, NSP, and NS2, and do not 
match
+ * the values directly above
+ */
+#define BGMAC_CLK_EN   BIT(0)
+#define BGMAC_RESERVED_0   BIT(1)
+#define BGMAC_SOURCE_SYNC_MODE_EN  BIT(2)
+#define BGMAC_DEST_SYNC_MODE_ENBIT(3)
+#define BGMAC_TX_CLK_OUT_INVERT_EN BIT(4)
+#define BGMAC_DIRECT_GMII_MODE BIT(5)
+#define BGMAC_CLK_250_SEL  BIT(6)
+#define BGMAC_AWCACHE  (0xf << 7)
+#define BGMAC_RESERVED_1   (0x1f << 11)
+#define BGMAC_ARCACHE  (0xf << 16)
+#define BGMAC_AWUSER   (0x3f << 20)
+#define BGMAC_ARUSER   (0x3f << 26)
+#define BGMAC_RESERVED BIT(31)
 
 /* BCMA GMAC core specific IO status (BCMA_IOST) flags */
 #define BGMAC_BCMA_IOST_ATTACHED   0x0800
-- 
2.7.4



[PATCH net v4 0/2] net: ethernet: bgmac: bug fixes

2017-02-28 Thread Jon Mason
Changes in v4:
* Added the udelays from the previous code (per David Miller)

Changes in v3:
* Reworked the init sequence patch to only remove the device reset if
  the device is actually in reset.  Given that this code doesn't bear
  much resemblance to the original code, I'm changing the author of the
  patch.  This was tested on NS2 SVK.

Changes in v2:
* Reworked the first match to make it more obvious what portions of the
  register were being preserved (Per Rafal Mileki)
* Style change to reorder the function variables in patch 2 (per Sergei
  Shtylyov)


Bug fixes for bgmac driver


Hari Vyas (1):
  net: ethernet: bgmac: mac address change bug

Jon Mason (1):
  net: ethernet: bgmac: init sequence bug

 drivers/net/ethernet/broadcom/bgmac-platform.c | 27 +-
 drivers/net/ethernet/broadcom/bgmac.c  |  6 +-
 drivers/net/ethernet/broadcom/bgmac.h  | 16 +++
 3 files changed, 39 insertions(+), 10 deletions(-)

-- 
2.7.4



[Patch net] ipv6: ignore null_entry in inet6_rtm_getroute() too

2017-02-28 Thread Cong Wang
Like commit 1f17e2f2c8a8 ("net: ipv6: ignore null_entry on route dumps"),
we need to ignore null entry in inet6_rtm_getroute() too.

Return -ENOENT here because we return the same errno when deleting
the null entry.

Fixes: a1a22c1206 ("net: ipv6: Keep nexthop of multipath route on admin down")
Reported-by: Dmitry Vyukov 
Cc: David Ahern 
Signed-off-by: Cong Wang 
---
 net/ipv6/route.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f54f426..25590d1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3627,6 +3627,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh)
rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
}
 
+   if (rt == net->ipv6.ip6_null_entry) {
+   ip6_rt_put(rt);
+   err = -ENOENT;
+   goto errout;
+   }
+
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb) {
ip6_rt_put(rt);
-- 
2.5.5



[PATCH net v4 2/2] net: ethernet: bgmac: mac address change bug

2017-02-28 Thread Jon Mason
From: Hari Vyas 

ndo_set_mac_address() passes struct sockaddr * as 2nd parameter to
bgmac_set_mac_address() but code assumed u8 *.  This caused two bytes
chopping and the wrong mac address was configured.

Signed-off-by: Hari Vyas 
Signed-off-by: Jon Mason 
Fixes: 4e209001b86 ("bgmac: write mac address to hardware in 
ndo_set_mac_address")
---
 drivers/net/ethernet/broadcom/bgmac.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 4150467..6b7782f 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -1223,12 +1223,16 @@ static netdev_tx_t bgmac_start_xmit(struct sk_buff *skb,
 static int bgmac_set_mac_address(struct net_device *net_dev, void *addr)
 {
struct bgmac *bgmac = netdev_priv(net_dev);
+   struct sockaddr *sa = addr;
int ret;
 
ret = eth_prepare_mac_addr_change(net_dev, addr);
if (ret < 0)
return ret;
-   bgmac_write_mac_address(bgmac, (u8 *)addr);
+
+   ether_addr_copy(bgmac->mac_addr, sa->sa_data);
+   bgmac_write_mac_address(bgmac, bgmac->mac_addr);
+
eth_commit_mac_addr_change(net_dev, addr);
return 0;
 }
-- 
2.7.4



Re: usb/net/hso: WARNING: ungligned urb->setup_dma

2017-02-28 Thread Stefan Wahren
Hi Baruch,

> Baruch Siach  hat am 28. Februar 2017 um 19:07 geschrieben:
> 
> 
> Hi Stefan,
> 
> On Tue, Feb 28, 2017 at 05:21:18PM +0100, Stefan Wahren wrote:
> > Am 28.02.2017 um 13:01 schrieb Baruch Siach:
> > > On Tue, Feb 28, 2017 at 10:28:10AM +0200, Baruch Siach wrote:
> > > > I'm hitting this warning consistently on my Raspberry Pi 3 running 
> > > > kernel
> > > > v4.10.1 with some unrelated device tree changes, and a debug print 
> > > > (below).
> > > > The device identifies as "GlobeTrotter HSDPA Modem", VID: 0af0, PID: 
> > > > 6971.
> > > > The warning triggers consistently on first write access to /dev/ttyHS0 
> > > > that
> > > > ModemManager attempts. The first line in the log is my debug print.
> > > I tested the same hardware successfully on an i.MX6 CuBox-i (ARM32) using 
> > > the
> > > same kernel version (4.10.1), and on an x86_64 PC (4.9). So this seems to 
> > > be
> > > platform specific. I don't have any other ARM64 machine at the moment, 
> > > though.
> > 
> > those platforms usually doesn't use the dwc2 USB host controller. So it
> > should be tested with another dwc2 platform.
> 
> The code that initializes setup_dma is not under drivers/usb/dwc2/. Though 
> the 
> problem looks like memory corruption, so its cause might be anywhere.

only a suspicion, but could you please try this patch [1]?

[1] - https://patchwork.kernel.org/patch/9166771/


Re: [patch] net/mlx4: && vs & typo

2017-02-28 Thread Tariq Toukan



On 28/02/2017 2:02 PM, Dan Carpenter wrote:

Bitwise & was obviously intended here.


Sure! Thanks for your patch.



Fixes: 745d8ae4622c ("net/mlx4: Spoofcheck and zero MAC can't coexist")
Signed-off-by: Dan Carpenter 
---
Applies to net.git.

diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index e965e5090d96..a858bcb6220b 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -109,7 +109,7 @@ static inline void mlx4_u64_to_mac(u8 *addr, u64 mac)
int i;

for (i = ETH_ALEN; i > 0; i--) {
-   addr[i - 1] = mac && 0xFF;
+   addr[i - 1] = mac & 0xFF;
mac >>= 8;
}
 }
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Reviewed-by: Tariq Toukan 


Re: net: GPF in rt6_nexthop_info

2017-02-28 Thread Cong Wang
On Tue, Feb 28, 2017 at 5:10 AM, Eric Dumazet  wrote:
> On Tue, 2017-02-28 at 12:34 +0100, Dmitry Vyukov wrote:
>> Hello,
>>
>> The following program triggers GPF in rt6_nexthop_info
>>
>> // autogenerated by syzkaller (http://github.com/google/syzkaller)
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>>
>> int main()
>> {
>>   unshare(CLONE_NEWNET);
>>   int fd = socket(PF_NETLINK, SOCK_DGRAM|SOCK_NONBLOCK, 0);
>>   const char* data ="\x22\x00\x00\x00\x1a\x00\x07\x00\x08\x09\x00\x0f"
>> "\x09\x00\x07\x00\x0a\xff\xfc\x03\x1b\x00\xa8\xc6"
>> "\x19\xf3\xff\xff\x05\x00\x10\x00\xbe\x45";
>>   write(fd, data, 34);
>>   return 0;
>> }
>>
>> general protection fault:  [#1] SMP KASAN
>> Modules linked in:
>> CPU: 2 PID: 2965 Comm: a.out Not tainted 4.10.0+ #229
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> task: 8800650fc100 task.stack: 880065378000
>> RIP: 0010:rt6_nexthop_info+0x278/0x3b0 net/ipv6/route.c:3325
>> RSP: 0018:88006537f350 EFLAGS: 00010203
>> RAX: dc00 RBX: 88006865da00 RCX: 
>> RDX:  RSI: 004a RDI: 0254
>> RBP: 88006537f3f0 R08: ed000cc55820 R09: ed000cc55820
>> R10: 0001 R11: ed000cc5581f R12: 11000ca6fe6d
>> R13: 8800662ac0d8 R14: 88006537f3c8 R15: dc00
>> FS:  00c39880() GS:88006d10() knlGS:
>> CS:  0010 DS:  ES:  CR0: 80050033
>> CR2: 004b20e0 CR3: 6cdb7000 CR4: 001406e0
>> Call Trace:
>>  rt6_fill_node.isra.61+0xea4/0x1780 net/ipv6/route.c:3513
>>  inet6_rtm_getroute+0x7da/0xce0 net/ipv6/route.c:3639
>>  rtnetlink_rcv_msg+0x609/0x860 net/core/rtnetlink.c:4104
>>  netlink_rcv_skb+0x2ab/0x390 net/netlink/af_netlink.c:2298
>>  rtnetlink_rcv+0x2a/0x40 net/core/rtnetlink.c:4110
>>  netlink_unicast_kernel net/netlink/af_netlink.c:1231 [inline]
>>  netlink_unicast+0x514/0x730 net/netlink/af_netlink.c:1257
>>  netlink_sendmsg+0xa9f/0xe50 net/netlink/af_netlink.c:1803
>>  sock_sendmsg_nosec net/socket.c:633 [inline]
>>  sock_sendmsg+0xca/0x110 net/socket.c:643
>>  sock_write_iter+0x326/0x600 net/socket.c:846
>>  new_sync_write fs/read_write.c:499 [inline]
>>  __vfs_write+0x483/0x740 fs/read_write.c:512
>>  vfs_write+0x187/0x530 fs/read_write.c:560
>>  SYSC_write fs/read_write.c:607 [inline]
>>  SyS_write+0xfb/0x230 fs/read_write.c:599
>>  entry_SYSCALL_64_fastpath+0x1f/0xc2
>> RIP: 0033:0x433ef0
>> RSP: 002b:7ffcea6171b8 EFLAGS: 0246 ORIG_RAX: 0001
>> RAX: ffda RBX: 00401730 RCX: 00433ef0
>> RDX: 0022 RSI: 00493548 RDI: 0003
>> RBP:  R08: 000b R09: 0004
>> R10: 000d R11: 0246 R12: 004002b0
>> R13: 7ffcea6172c8 R14: 0002 R15: 
>> Code: ff df 80 3c 01 00 0f 85 3c 01 00 00 48 8b 8b 58 01 00 00 48 b8
>> 00 00 00 00 00 fc ff df 48 8d b9 54 02 00 00 48 89 fe 48 c1 ee 03 <0f>
>> b6 34 06 48 89 f8 83 e0 07 83 c0 03 40 38 f0 7c 05 40 84 f6
>> RIP: rt6_nexthop_info+0x278/0x3b0 net/ipv6/route.c:3325 RSP: 88006537f350
>> ---[ end trace 6a59074c79adfb5f ]---
>>
>> On commit e5d56efc97f8240d0b5d66c03949382b6d7e5570
>
> Not sure how you chose all these addresses on CC, while you forgot David
> Ahern. I doubt Yuchung is interested in fixing IPv6 bugs.
>
> David, rt->rt6i_idev can be NULL.

Probably we need to skip null entry again here:

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f54f426..25590d1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3627,6 +3627,12 @@ static int inet6_rtm_getroute(struct sk_buff
*in_skb, struct nlmsghdr *nlh)
rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
}

+   if (rt == net->ipv6.ip6_null_entry) {
+   ip6_rt_put(rt);
+   err = -ENOENT;
+   goto errout;
+   }
+
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb) {
ip6_rt_put(rt);


Re: [PATCH v2 net] net: solve a NAPI race

2017-02-28 Thread Alexander Duyck
On Mon, Feb 27, 2017 at 6:21 AM, Eric Dumazet  wrote:
> From: Eric Dumazet 
>
> While playing with mlx4 hardware timestamping of RX packets, I found
> that some packets were received by TCP stack with a ~200 ms delay...
>
> Since the timestamp was provided by the NIC, and my probe was added
> in tcp_v4_rcv() while in BH handler, I was confident it was not
> a sender issue, or a drop in the network.
>
> This would happen with a very low probability, but hurting RPC
> workloads.
>
> A NAPI driver normally arms the IRQ after the napi_complete_done(),
> after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab
> it.
>
> Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit
> while IRQ are not disabled, we might have later an IRQ firing and
> finding this bit set, right before napi_complete_done() clears it.
>
> This can happen with busy polling users, or if gro_flush_timeout is
> used. But some other uses of napi_schedule() in drivers can cause this
> as well.
>
> This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep()
> can set if it could not grab NAPI_STATE_SCHED
>
> Then napi_complete_done() properly reschedules the napi to make sure
> we do not miss something.
>
> Since we manipulate multiple bits at once, use cmpxchg() like in
> sk_busy_loop() to provide proper transactions.
>
> In v2, I changed napi_watchdog() to use a relaxed variant of
> napi_schedule_prep() : No need to set NAPI_STATE_MISSED from this point.
>
> Signed-off-by: Eric Dumazet 
> ---
>  include/linux/netdevice.h |   29 ++-
>  net/core/dev.c|   53 +---
>  2 files changed, 58 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 
> f40f0ab3847a8caaf46bd4d5f224c65014f501cc..97456b2539e46d6232dda804f6a434db6fd7134f
>  100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -330,6 +330,7 @@ struct napi_struct {
>
>  enum {
> NAPI_STATE_SCHED,   /* Poll is scheduled */
> +   NAPI_STATE_MISSED,  /* reschedule a napi */
> NAPI_STATE_DISABLE, /* Disable pending */
> NAPI_STATE_NPSVC,   /* Netpoll - don't dequeue from poll_list */
> NAPI_STATE_HASHED,  /* In NAPI hash (busy polling possible) */
> @@ -338,12 +339,13 @@ enum {
>  };
>
>  enum {
> -   NAPIF_STATE_SCHED= (1UL << NAPI_STATE_SCHED),
> -   NAPIF_STATE_DISABLE  = (1UL << NAPI_STATE_DISABLE),
> -   NAPIF_STATE_NPSVC= (1UL << NAPI_STATE_NPSVC),
> -   NAPIF_STATE_HASHED   = (1UL << NAPI_STATE_HASHED),
> -   NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
> -   NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
> +   NAPIF_STATE_SCHED= BIT(NAPI_STATE_SCHED),
> +   NAPIF_STATE_MISSED   = BIT(NAPI_STATE_MISSED),
> +   NAPIF_STATE_DISABLE  = BIT(NAPI_STATE_DISABLE),
> +   NAPIF_STATE_NPSVC= BIT(NAPI_STATE_NPSVC),
> +   NAPIF_STATE_HASHED   = BIT(NAPI_STATE_HASHED),
> +   NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
> +   NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>  };
>
>  enum gro_result {
> @@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct 
> napi_struct *n)
> return test_bit(NAPI_STATE_DISABLE, &n->state);
>  }
>
> -/**
> - * napi_schedule_prep - check if NAPI can be scheduled
> - * @n: NAPI context
> - *
> - * Test if NAPI routine is already running, and if not mark
> - * it as running.  This is used as a condition variable to
> - * insure only one NAPI poll instance runs.  We also make
> - * sure there is no pending NAPI disable.
> - */
> -static inline bool napi_schedule_prep(struct napi_struct *n)
> -{
> -   return !napi_disable_pending(n) &&
> -   !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
> -}
> +bool napi_schedule_prep(struct napi_struct *n);
>
>  /**
>   * napi_schedule - schedule NAPI poll
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 
> 304f2deae5f9897e60a79ed8b69d6ef208295ded..edeb916487015f279036ecf7ff5d9096dff365d3
>  100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4884,6 +4884,32 @@ void __napi_schedule(struct napi_struct *n)
>  EXPORT_SYMBOL(__napi_schedule);
>
>  /**
> + * napi_schedule_prep - check if napi can be scheduled
> + * @n: napi context
> + *
> + * Test if NAPI routine is already running, and if not mark
> + * it as running.  This is used as a condition variable
> + * insure only one NAPI poll instance runs.  We also make
> + * sure there is no pending NAPI disable.
> + */
> +bool napi_schedule_prep(struct napi_struct *n)
> +{
> +   unsigned long val, new;
> +
> +   do {
> +   val = READ_ONCE(n->state);
> +   if (unlikely(val & NAPIF_STATE_DISABLE))
> +   return false;
> +   new = val | NAPIF_STATE_SCHED;
> +   if (

Re: [PATCH v1 3/4] nvme-rdma: use inet_pton_with_scope helper

2017-02-28 Thread Sagi Grimberg

Could use a proper changelog.


Will do, thanks!


[PATCH] drivers: net: xgene: Fix crash on DT systems

2017-02-28 Thread Alban Bedel
On DT systems the driver require a clock, but the probe just print a
warning and continue, leading to a crash when resetting the device.
To fix this crash and properly handle probe deferals only ignore the
missing clock if DT isn't used or if the clock doesn't exist.

Signed-off-by: Alban Bedel 
---
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index d0d0d12b531f..68b48edc5921 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -1756,6 +1756,12 @@ static int xgene_enet_get_resources(struct 
xgene_enet_pdata *pdata)
 
pdata->clk = devm_clk_get(&pdev->dev, NULL);
if (IS_ERR(pdata->clk)) {
+   /* Abort if the clock is defined but couldn't be retrived.
+* Always abort if the clock is missing on DT system as
+* the driver can't cope with this case.
+*/
+   if (PTR_ERR(pdata->clk) != -ENOENT || dev->of_node)
+   return PTR_ERR(pdata->clk);
/* Firmware may have set up the clock already. */
dev_info(dev, "clocks have been setup already\n");
}
-- 
2.11.0



Re: [PATCH v1 3/4] nvme-rdma: use inet_pton_with_scope helper

2017-02-28 Thread Christoph Hellwig
Could use a proper changelog.

Otherwise looks fine:

Reviewed-by: Christoph Hellwig 


Re: [PATCH] MAINTAINERS: Orphan usb/net/hso driver

2017-02-28 Thread Greg KH
On Tue, Feb 28, 2017 at 10:38:44AM -0500, David Miller wrote:
> From: Baruch Siach 
> Date: Tue, 28 Feb 2017 10:39:48 +0200
> 
> > The email address of Jan Dumon bounces, and there is not relevant 
> > information
> > in the linked website.
> > 
> > Signed-off-by: Baruch Siach 
> 
> This patch is fine and I will apply it later, however I will make a
> general statement that we list USB networking drivers very
> inconsistently.
> 
> Specifically, some list linux-usb as the mailing list, others list
> netdev.  We should figure out which one (or both) is the best to
> specify, and apply that rule consitently to the MAINTAINERS file.

I don't want them :)

Seriously, I don't have a problem with the patches being cc:ed to
linux-usb as long as you will take them, as often times the issues
involved are USB specific things, not network driver-specific things, so
the developers here can weigh in on them.

Want me to make up a patch to do that?

thanks,

greg k-h


Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Sowmini Varadhan
On (02/28/17 18:45), Dmitry Vyukov wrote:
> 
> Yes, I can now apply custom patches to the bots. However, it fired
> only 3 times, so it will give weak signal. But at least it will test
> that the patch does not cause other bad things.

Ok, let me do my bit of homework on this one and get back to you
(probably by tomorrow)..




Re: [PATCH v2 net] net: solve a NAPI race

2017-02-28 Thread Eric Dumazet
On Tue, 2017-02-28 at 09:20 -0800, Alexander Duyck wrote:
> On Mon, Feb 27, 2017 at 6:21 AM, Eric Dumazet  wrote:

> > +bool napi_schedule_prep(struct napi_struct *n)
> > +{
> > +   unsigned long val, new;
> > +
> > +   do {
> > +   val = READ_ONCE(n->state);
> > +   if (unlikely(val & NAPIF_STATE_DISABLE))
> > +   return false;
> > +   new = val | NAPIF_STATE_SCHED;
> > +   if (unlikely(val & NAPIF_STATE_SCHED))
> > +   new |= NAPIF_STATE_MISSED;
> 
> You might want to consider just using a combination AND, divide,
> multiply, and OR to avoid having to have any conditional branches
> being added due to this code path.  Basically the logic would look
> like:


> new = val | NAPIF_STATE_SCHED;
> new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED;
> 
> In assembler that all ends up getting translated out to AND, SHL, OR.
> You avoid the branching, or MOV/OR/TEST/CMOV type code you would end
> up with otherwise.

Sure, I can try to optimize this a bit ;)


> > +   } while (cmpxchg(&n->state, val, new) != val);
> > +
> > +   if (unlikely(val & NAPIF_STATE_MISSED))
> > +   __napi_schedule(n);
> > +
> > return true;
> >  }
> 
> If you rescheduled napi should you really be returning true?  Seems
> like you should be returning "!(val & NAPIF_STATE_MISSED)" to try to
> avoid letting this occur again.

Good idea.

Hmm... you mean that many drivers test napi_complete_done() return
value ?

;)

Thanks !




Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Sowmini Varadhan
On (02/28/17 17:51), Dmitry Vyukov wrote:
> Searching other crashes for "net/rds" I found 2 more crashes that may
> be related. They suggest that the delayed works are not properly
> stopped when the socket is destroyed. That would explain how
> rds_connect_worker accesses freed net, right?

yes, I think we may want to explicitly cancel this workq.. this
in rds_conn_destroy().

I'm trying to build/sanity-test (if lucky, reproduce the bug)
as I send this out.. let me get back to you..

If I have a patch against net-next, would you be willing/able to 
try it out? given that this does not show up on demand, I'm not
sure how we can check that "the fix worked"..

--Sowmini


Re: [PATCH v3 net] net: solve a NAPI race

2017-02-28 Thread Eric Dumazet
On Tue, 2017-02-28 at 08:17 -0800, Stephen Hemminger wrote:

> Maybe just as simple as using irqsave/irqrestore in driver.


CPU can be differents.

irqsave will not help.





Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Dmitry Vyukov
On Tue, Feb 28, 2017 at 5:38 PM, Sowmini Varadhan
 wrote:
> On (02/28/17 17:32), Dmitry Vyukov wrote:
>> Not reproducible so far.
>>
>> rds is compiled into kernel (no modules):
>> CONFIG_RDS=y
>> CONFIG_RDS_TCP=y
>
> I see. So if it never gets unloaded, the rds_connections "should"
> be around forever.. let me inspect code and see if I spot some
> race-window..
>
>> Also fuzzer actively creates and destroys namespaces.
>> Yes, I don't see socket(0x15) in the log. Probably it was truncated.
>
> I see. May be useful if we coudl get a crash dump to see what
> other threads were going on (might give a hint about which threads
> were racing). I'll try reproducing this at my end too.


Searching other crashes for "net/rds" I found 2 more crashes that may
be related. They suggest that the delayed works are not properly
stopped when the socket is destroyed. That would explain how
rds_connect_worker accesses freed net, right?


BUG: KASAN: use-after-free in memcmp+0xe3/0x160 lib/string.c:768 at
addr 88018d49cb20
Read of size 1 by task kworker/u4:4/3546
CPU: 1 PID: 3546 Comm: kworker/u4:4 Not tainted 4.9.0 #7
Hardware name: Google Google Compute Engine/Google Compute Engine,
BIOS Google 01/01/2011
Workqueue: krdsd rds_send_worker
 8801ccd46628 8234ce1f 0001 1100399a8c58
 ed00399a8c50 41b58ab3 84b38258 8234cb31
  10bf 8156afb0 858c8e58
Call Trace:
 [] __dump_stack lib/dump_stack.c:15 [inline]
 [] dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
 [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:162
 [] print_address_description mm/kasan/report.c:200 [inline]
 [] kasan_report_error mm/kasan/report.c:289 [inline]
 [] kasan_report.part.2+0x1e5/0x4b0 mm/kasan/report.c:311
 [] kasan_report mm/kasan/report.c:329 [inline]
 [] __asan_report_load1_noabort+0x29/0x30
mm/kasan/report.c:329
 [] memcmp+0xe3/0x160 lib/string.c:768
 [] rhashtable_compare include/linux/rhashtable.h:556 [inline]
 [] __rhashtable_lookup
include/linux/rhashtable.h:578 [inline]
 [] rhashtable_lookup include/linux/rhashtable.h:610 [inline]
 [] rhashtable_lookup_fast
include/linux/rhashtable.h:636 [inline]
 [] rds_find_bound+0x4fe/0x8a0 net/rds/bind.c:63
 [] rds_recv_incoming+0x5fc/0x1300 net/rds/recv.c:313
 [] rds_loop_xmit+0x1c5/0x480 net/rds/loop.c:82
 [] rds_send_xmit+0x104a/0x2420 net/rds/send.c:348
 [] rds_send_worker+0x122/0x2a0 net/rds/threads.c:189
 [] process_one_work+0xbd0/0x1c10 kernel/workqueue.c:2096
 [] worker_thread+0x223/0x1990 kernel/workqueue.c:2230
 [] kthread+0x323/0x3e0 kernel/kthread.c:209
 [] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433
Object at 88018d49c6c0, in cache RDS size: 1464
Allocated:
PID = 5431
 [   40.943107] [] save_stack_trace+0x16/0x20
arch/x86/kernel/stacktrace.c:57
 [   40.950346] [] save_stack+0x43/0xd0 mm/kasan/kasan.c:495
 [   40.957064] [] set_track mm/kasan/kasan.c:507 [inline]
 [   40.957064] [] kasan_kmalloc+0xaa/0xd0
mm/kasan/kasan.c:598
 [   40.964040] [] kasan_slab_alloc+0x12/0x20
mm/kasan/kasan.c:537
 [   40.971282] [] kmem_cache_alloc+0x102/0x680 mm/slab.c:3573
 [   40.978696] [] sk_prot_alloc+0x65/0x2a0
net/core/sock.c:1327
 [   40.985766] [] sk_alloc+0x8c/0x460 net/core/sock.c:1389
 [   40.992398] [] rds_create+0x11c/0x5e0 net/rds/af_rds.c:504
 [   40.999296] [] __sock_create+0x4e4/0x870 net/socket.c:1168
 [   41.006446] [] sock_create net/socket.c:1208 [inline]
 [   41.006446] [] SYSC_socket net/socket.c:1238 [inline]
 [   41.006446] [] SyS_socket+0xf9/0x230 net/socket.c:1218
 [   41.013251] [] entry_SYSCALL_64_fastpath+0x1f/0xc2
Freed:
PID = 5431
 [   41.025881] [] save_stack_trace+0x16/0x20
arch/x86/kernel/stacktrace.c:57
 [   41.033124] [] save_stack+0x43/0xd0 mm/kasan/kasan.c:495
 [   41.039840] [] set_track mm/kasan/kasan.c:507 [inline]
 [   41.039840] [] kasan_slab_free+0x6f/0xb0
mm/kasan/kasan.c:571
 [   41.046992] [] __cache_free mm/slab.c:3515 [inline]
 [   41.046992] [] kmem_cache_free+0x71/0x240 mm/slab.c:3775
 [   41.054232] [] sk_prot_free net/core/sock.c:1370 [inline]
 [   41.054232] [] __sk_destruct+0x47d/0x6a0
net/core/sock.c:1445
 [   41.061383] [] sk_destruct+0x47/0x80 net/core/sock.c:1453
 [   41.068199] [] __sk_free+0x57/0x230 net/core/sock.c:1461
 [   41.074921] [] sk_free+0x23/0x30 net/core/sock.c:1472
 [   41.081398] [] sock_put include/net/sock.h:1591 [inline]
 [   41.081398] [] rds_release+0x358/0x500 net/rds/af_rds.c:89
 [   41.088376] [] sock_release+0x8d/0x1e0 net/socket.c:585
 [   41.095358] [] sock_close+0x16/0x20 net/socket.c:1032
 [   41.102083] [] __fput+0x332/0x7f0 fs/file_table.c:208
 [   41.108628] [] fput+0x15/0x20 fs/file_table.c:244
 [   41.115184] [] task_work_run+0x18a/0x260
kernel/task_work.c:116
 [   41.122337] [] tracehook_notify_resume
include/linux/tracehook.h:191 [inline]
 [   41.122337] [] exit_to_usermode_loop+0x23b/0x2a0
arch/x86/entry/common.c:160
 [   41.130193] [] prepare_exit_to_usermode
arch/x86/entry/common.c:190 [inline]
 [   41.1301

Re: [PATCH 1/1] rds: ib: add the static type to the variables

2017-02-28 Thread Santosh Shilimkar

On 2/27/2017 10:45 PM, Zhu Yanjun wrote:

The variables rds_ib_mr_1m_pool_size and rds_ib_mr_8k_pool_size
are used only in the ib.c file. As such, the static type is
added to limit them in this file.

Cc: Joe Jin 
Cc: Junxiao Bi 
Signed-off-by: Zhu Yanjun 
---

Looks good.
Acked-by: Santosh Shilimkar 


Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Sowmini Varadhan
On (02/28/17 17:32), Dmitry Vyukov wrote:
> Not reproducible so far.
> 
> rds is compiled into kernel (no modules):
> CONFIG_RDS=y
> CONFIG_RDS_TCP=y

I see. So if it never gets unloaded, the rds_connections "should"
be around forever.. let me inspect code and see if I spot some 
race-window.. 

> Also fuzzer actively creates and destroys namespaces.
> Yes, I don't see socket(0x15) in the log. Probably it was truncated.

I see. May be useful if we coudl get a crash dump to see what
other threads were going on (might give a hint about which threads
were racing). I'll try reproducing this at my end too.

--Sowmini


Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Dmitry Vyukov
On Tue, Feb 28, 2017 at 5:15 PM, Sowmini Varadhan
 wrote:
> On (02/28/17 16:49), Dmitry Vyukov wrote:
>>
>> Grepping "socket" there, it was doing lots of things with sockets. Are
>> we looking for some particular socket type? If there are few programs
>> that create sockets of that type, then we can narrow down the set:
>
> Yes, we are looking for PF_RDS/AF_RDS - this should be
> #define AF_RDS  21  /* RDS sockets  */
>
> I see PF_KCM there (value 41) but no instances of 0x15.. how did
> the rds_connect_worker thread get kicked off at all?
>
> the way this is supposed to work is
> 1. someone modprobes rds-tcp
> 2. app tries to do rds_sendmsg to some ip address in a netns - this triggers 
> the
>creation of an rds_connection, and subsequent kernel socket TCP connection
>threads (i.e., rds_connect_worker) for that netns
> 3. if you unload rds-tcp, the module_unload should do all the cleanup
>needed via rds_tcp_conn_paths_destroy. This is done
> Its not clear to me that the test is doing any of this...
>
> is this reproducible? let me check if there is some race window where
> we can restart a connection attempt when rds_tcp_kill_sock assumes
> that the connect worker has been quiesced..


Not reproducible so far.

rds is compiled into kernel (no modules):
CONFIG_RDS=y
CONFIG_RDS_TCP=y

Also fuzzer actively creates and destroys namespaces.

Yes, I don't see socket(0x15) in the log. Probably it was truncated.


Re: usb/net/hso: WARNING: ungligned urb->setup_dma

2017-02-28 Thread Stefan Wahren

Hi Baruch,

Am 28.02.2017 um 13:01 schrieb Baruch Siach:

Hi linux-usb list,

(Dropped Jan's bouncing address, added Rpi3 platform lists)

On Tue, Feb 28, 2017 at 10:28:10AM +0200, Baruch Siach wrote:

Hi linux-usb list,

I'm hitting this warning consistently on my Raspberry Pi 3 running kernel
v4.10.1 with some unrelated device tree changes, and a debug print (below).
The device identifies as "GlobeTrotter HSDPA Modem", VID: 0af0, PID: 6971.
The warning triggers consistently on first write access to /dev/ttyHS0 that
ModemManager attempts. The first line in the log is my debug print.

I tested the same hardware successfully on an i.MX6 CuBox-i (ARM32) using the
same kernel version (4.10.1), and on an x86_64 PC (4.9). So this seems to be
platform specific. I don't have any other ARM64 machine at the moment, though.


those platforms usually doesn't use the dwc2 USB host controller. So it 
should be tested with another dwc2 platform.


Is the issue reproducible with other USB devices like keyboards?
Are you using arm64 defconfig?
Are you able to bisect this issue?

Stefan


Re: [PATCH v1 2/4] nvmet-rdma: use generic inet_pton_with_scope

2017-02-28 Thread Christoph Hellwig
Please add a changelog and mention that this adds IPv6 support.

Otherwise looks fine:

Reviewed-by: Christoph Hellwig 


Re: [PATCH v1 1/4] net/utils: generic inet_pton_with_scope helper

2017-02-28 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 


Re: [PATCH v3 net] net: solve a NAPI race

2017-02-28 Thread Stephen Hemminger
On Mon, 27 Feb 2017 12:18:31 -0800
Eric Dumazet  wrote:

> This can happen with busy polling users, or if gro_flush_timeout is
> used. But some other uses of napi_schedule() in drivers can cause this
> as well.

Where were IRQ's re-enabled?


> thread 1 thread 2 (could be on same cpu)
> 
> // busy polling or napi_watchdog()
> napi_schedule();
> ...
> napi->poll()
> 
> device polling:
> read 2 packets from ring buffer
>   Additional 3rd packet is available.
>   device hard irq
> 
>   // does nothing because 
> NAPI_STATE_SCHED bit is owned by thread 1
>   napi_schedule();
>   
> napi_complete_done(napi, 2);
> rearm_irq();

Maybe just as simple as using irqsave/irqrestore in driver.


Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Sowmini Varadhan
On (02/28/17 16:49), Dmitry Vyukov wrote:
> 
> Grepping "socket" there, it was doing lots of things with sockets. Are
> we looking for some particular socket type? If there are few programs
> that create sockets of that type, then we can narrow down the set:

Yes, we are looking for PF_RDS/AF_RDS - this should be 
#define AF_RDS  21  /* RDS sockets  */

I see PF_KCM there (value 41) but no instances of 0x15.. how did 
the rds_connect_worker thread get kicked off at all?

the way this is supposed to work is
1. someone modprobes rds-tcp
2. app tries to do rds_sendmsg to some ip address in a netns - this triggers the
   creation of an rds_connection, and subsequent kernel socket TCP connection
   threads (i.e., rds_connect_worker) for that netns
3. if you unload rds-tcp, the module_unload should do all the cleanup
   needed via rds_tcp_conn_paths_destroy. This is done  
Its not clear to me that the test is doing any of this... 

is this reproducible? let me check if there is some race window where
we can restart a connection attempt when rds_tcp_kill_sock assumes
that the connect worker has been quiesced..

--Sowmini


Re: usb/net/hso: WARNING: ungligned urb->setup_dma

2017-02-28 Thread Michael Zoran
On Tue, 2017-02-28 at 14:01 +0200, Baruch Siach wrote:
> Hi linux-usb list,
> 
> (Dropped Jan's bouncing address, added Rpi3 platform lists)
> 
> On Tue, Feb 28, 2017 at 10:28:10AM +0200, Baruch Siach wrote:
> > Hi linux-usb list,
> > 
> > I'm hitting this warning consistently on my Raspberry Pi 3 running
> > kernel 
> > v4.10.1 with some unrelated device tree changes, and a debug print
> > (below). 
> > The device identifies as "GlobeTrotter HSDPA Modem", VID: 0af0,
> > PID: 6971.
> > The warning triggers consistently on first write access to
> > /dev/ttyHS0 that 
> > ModemManager attempts. The first line in the log is my debug print.
> 
> I tested the same hardware successfully on an i.MX6 CuBox-i (ARM32)
> using the 
> same kernel version (4.10.1), and on an x86_64 PC (4.9). So this
> seems to be 
> platform specific. I don't have any other ARM64 machine at the
> moment, though.
> 
> Has anyone noticed USB host issues on Raspberry Pi 3 with 64bit
> kernel?
> 
> baruch
> 
> > [   65.004892] dwc2_map_urb_for_dma: setup_dma: f5635066
> > [   65.010718] [ cut here ]
> > [   65.016079] WARNING: CPU: 1 PID: 794 at
> > drivers/usb/dwc2/hcd.c:2631 dwc2_map_urb_for_dma+0x94/0x130
> > [   65.025930] 
> > [   65.028143] CPU: 1 PID: 794 Comm: ModemManager Not tainted
> > 4.10.1-8-g76809ec3947f-dirty #719
> > [   65.037747] Hardware name: Raspberry Pi 3 Model B (DT)
> > [   65.043656] task: 8000362e3e80 task.stack: 80003556c000
> > [   65.050359] PC is at dwc2_map_urb_for_dma+0x94/0x130
> > [   65.056090] LR is at dwc2_map_urb_for_dma+0x34/0x130
> > [   65.061819] pc : [] lr : []
> > pstate: 01c5
> > [   65.070018] sp : 80003556fa70
> > [   65.074080] x29: 80003556fa70 x28: 0005 
> > [   65.080167] x27: 80003563d000 x26: 8000350d4600 
> > [   65.086252] x25: 800035686780 x24: 0002 
> > [   65.092247] x23: 0001 x22: 01080020 
> > [   65.097635] x21: 800035425800 x20: 01080020 
> > [   65.103024] x19: 800035b72400 x18: 0010 
> > [   65.108413] x17: 908f5a40 x16: 081abad8 
> > [   65.113801] x15: 0006 x14: 8882c047 
> > [   65.119191] x13: 0882c055 x12: 0882e458 
> > [   65.124579] x11: 80003556f7d0 x10: 087c9a38 
> > [   65.129968] x9 : 083613e0 x8 : 363566203a616d64 
> > [   65.135356] x7 : 5f7075746573203a x6 : 02fc 
> > [   65.140744] x5 :  x4 :  
> > [   65.146132] x3 :  x2 : 8000362e3e80 
> > [   65.151521] x1 : 0001 x0 : 0880e000 
> > [   65.156907] 
> > [   65.158414] ---[ end trace a78d20eaecac455a ]---
> > [   65.163093] Call trace:
> > [   65.165573] Exception stack(0x80003556f8a0 to
> > 0x80003556f9d0)
> > [   65.172108] f8a0: 800035b72400 0001
> > 80003556fa70 083ff0ac
> > [   65.180052] f8c0: 086f8f70 08795000
> > 0882dce8 087fb318
> > [   65.187996] f8e0: 0882c048 00010882bae8
> > 80003556f990 080dc664
> > [   65.195940] f900: 800035b72400 01080020
> > 800035425800 01080020
> > [   65.203883] f920: 0001 0002
> > 800035686780 8000350d4600
> > [   65.211827] f940: 0880e000 0001
> > 8000362e3e80 
> > [   65.219771] f960:  
> > 02fc 5f7075746573203a
> > [   65.227715] f980: 363566203a616d64 083613e0
> > 087c9a38 80003556f7d0
> > [   65.235659] f9a0: 0882e458 0882c055
> > 8882c047 0006
> > [   65.243601] f9c0: 081abad8 908f5a40
> > [   65.248548] [] dwc2_map_urb_for_dma+0x94/0x130
> > [   65.254643] [] usb_hcd_submit_urb+0x8c/0x8c0
> > [   65.260560] [] usb_submit_urb+0x248/0x480
> > [   65.266215] [] mux_device_request+0xdc/0x1f8
> > [   65.272134] []
> > hso_mux_serial_write_data+0x28/0x38
> > [   65.278581] [] hso_kick_transmit+0x88/0xb8
> > [   65.284323] [] hso_serial_write+0x60/0xc0
> > [   65.289979] [] n_tty_write+0x300/0x3e0
> > [   65.295369] [] tty_write+0x170/0x2b8
> > [   65.300585] [] __vfs_write+0x1c/0x100
> > [   65.305884] [] vfs_write+0x9c/0x1a8
> > [   65.311010] [] SyS_write+0x44/0xa0
> > [   65.316049] [] el0_svc_naked+0x24/0x28
> > 
> > The debug print otherwise just shows:
> > 
> > [   64.476785] dwc2_map_urb_for_dma: setup_dma: 
> > 
> > So it looks like urb->setup_dma content is garbage.
> > 
> > Any thoughts?
> > 
> > Thanks,
> > baruch
> 

I've seen similar issues twice before:

1. Back when arm64 was initially being ported to the RPI 3 by people on
the net back in March/April of last year and only the serial console
worked, DMA was broken in the dwc2 driver.  The dwc2 driver would send
and receive garbage data.  The reason was that some of the memory
manager DMA functionality for arm64 wasn't 

Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Dmitry Vyukov
On Tue, Feb 28, 2017 at 4:37 PM, Sowmini Varadhan
 wrote:
> On (02/28/17 15:22), Dmitry Vyukov wrote:
>>
>> Hello,
>>
>> I've got the following report while running syzkaller fuzzer on
>> linux-next/8d01c069486aca75b8f6018a759215b0ed0c91f0. So far it
>> happened only once. net was somehow deleted from underneath
>> inet_create. I've noticed that rds uses sock_create_kern which does
>> not take net reference. What is that that must keep net alive then?
>
> The rds_connection (which is where the net pointer is being obtained from)
> should keep the connection alive. Did you have the rds[_tcp] modules
> loaded at the time of failure? Were there kernel tcp sockets to/from
> the 16385 port? any hints on what else the test was doing (was it
> running a userspace RDS application that triggered the kernel TCP
> connection attempt in the first place)?


Here is syzkaller log before the crash:
https://gist.githubusercontent.com/dvyukov/8bb6a4c6543597c9598d5771258889fe/raw/08bd950bb69071a260046b0bcc5ab85701aea8e7/gistfile1.txt
Separate tests are separated by "executing program" lines. If a crash
happens within a user process context, it's possible to figure out
what exactly program triggered the bug. But this happened in a kernel
thread context, so I have no glues so far.

Grepping "socket" there, it was doing lots of things with sockets. Are
we looking for some particular socket type? If there are few programs
that create sockets of that type, then we can narrow down the set:

r1 = socket(0x11, 0x5, 0xa)
socket(0x4, 0x, 0x0)
socketpair(0x7, 0x805, 0x6,
&(0x7ffd-0x8)={0x, 0x})
socketpair(0x2, 0x80a, 0x8001,
&(0x7ffd1000-0x8)={0x, 0x})
socket$alg(0x26, 0x5, 0x0)
socket$sctp6(0xa, 0x81, 0x84)
r10 = socket(0x10, 0x802, 0x0)
socketpair(0x10, 0x0, 0x3,
&(0x7fe54000)={0x, 0x})
socket(0x2002, 0x1, 0x7f)
r8 = socket$sctp6(0xa, 0x1, 0x84)
socket(0x0, 0xa, 0x0)
socket(0x0, 0x0, 0x1)
socketpair$unix(0x1, 0x1, 0x0,
&(0x7f995000-0x8)={0x,
0x})
r1 = socket(0x2, 0x2, 0x0)
r5 = socket$alg(0x26, 0x5, 0x0)
r6 = socket$kcm(0x29, 0x2, 0x0)
r7 = socket$netlink(0x10, 0x3, 0x0)
r10 = socket(0x10, 0x3, 0x0)
r1 = socket(0x4, 0x, 0x0)
r2 = socket(0xa, 0x6, 0x0)
r6 = socket(0x2, 0x5, 0x0)
r11 = socket(0xa, 0x2, 0x0)
r12 = socket(0xa, 0x2, 0x0)
socket(0x1, 0x80007, 0xfffd)
socketpair$sctp(0x2, 0x1, 0x84,
&(0x7f00)={0x,
0x})
r16 = socket$bt_hci(0x1f, 0x3, 0x1)
r18 = socket(0x1000a, 0x80001, 0x0)
socket$sctp6(0xa, 0x1, 0x84)
socket$alg(0x26, 0x5, 0x0)
socketpair$unix(0x1, 0x4003, 0x0,
&(0x7ffc1000-0x8)={0x, 0x})
socketpair$unix(0x1, 0x40001, 0x0,
&(0x7f194000)={0x,
0x})
socket$bt_bnep(0x1f, 0x3, 0x4)
r0 = socket(0x10, 0x7, 0x8)
r2 = socket$alg(0x26, 0x5, 0x0)
r1 = socket$tcp(0x2, 0x1, 0x0)
r1 = socket(0x0, 0x2, 0x0)
r2 = socket$alg(0x26, 0x5, 0x0)
r4 = socket(0xa, 0x0, 0x40)
r8 = socket$bt_sco(0x1f, 0x5, 0x2)
socketpair$unix(0x1, 0x0, 0x0,
&(0x7f024000-0x8)={0x, 0x})
socket$nfc_raw(0x27, 0x3, 0x0)
r15 = socket(0xb, 0x6, 0x0)
socketpair$unix(0x1, 0x5, 0x0,
&(0x7f02f000-0x8)={0x, 0x})
r16 = socket(0x10, 0x802, 0x80010)
socket$sctp6(0xa, 0x1, 0x84)
socket$alg(0x26, 0x5, 0x0)
r3 = socket(0xa, 0x1, 0x0)
r13 = socket(0x10, 0x802, 0x0)
r0 = socket$netlink(0x10, 0x3, 0x10)
socketpair(0x1, 0x80f, 0x7,
&(0x7fb67000)={0x, 0x})
r2 = socket$alg(0x26, 0x5, 0x0)
socket$bt_hidp(0x1f, 0x3, 0x6)
socket$bt_bnep(0x1f, 0x3, 0x4)
socket$sctp(0x2, 0x1, 0x84)
r2 = socket(0x2, 0x3, 0x6)
r4 = socket(0x11, 0x802, 0x300)
r0 = socket$kcm(0x29, 0x5, 0x0)
r3 = socket$alg(0x26, 0x5, 0x0)
socketpair$unix(0x1, 0x5, 0x0,
&(0x7f51)={0x, 0x})
r1 = socket$alg(0x26, 0x5, 0x0)
r0 = socket$bt_cmtp(0x1f, 0x3, 0x5)
socket$unix(0x1, 0x800200, 0x0)
socketpair$unix(0x1, 0x5, 0x0,
&(0x7fb3)={0x, 0x})
r0 = socket(0xa, 0x1, 0x0)
r7 = socket(0xa, 0x2, 0x41)
r5 = socket(0xa, 0x2, 0x88)
r4 = socket(0xa, 0x2, 0x88)
r0 = socket$icmp6_raw(0xa, 0x3, 0x3a)
r1 = socket(0xa, 0x5, 0x0)
socket$icmp6(0xa, 0x2, 0x3a)
socket$icmp6_raw(0xa, 0x3, 0x3a)


Re: [PATCH] MAINTAINERS: Orphan usb/net/hso driver

2017-02-28 Thread David Miller
From: Baruch Siach 
Date: Tue, 28 Feb 2017 10:39:48 +0200

> The email address of Jan Dumon bounces, and there is not relevant information
> in the linked website.
> 
> Signed-off-by: Baruch Siach 

This patch is fine and I will apply it later, however I will make a
general statement that we list USB networking drivers very
inconsistently.

Specifically, some list linux-usb as the mailing list, others list
netdev.  We should figure out which one (or both) is the best to
specify, and apply that rule consitently to the MAINTAINERS file.

Thanks.


Re: net/sctp: use-after-free in sctp_hash_transport

2017-02-28 Thread Xin Long
On Tue, Feb 28, 2017 at 11:35 PM, Dmitry Vyukov  wrote:
> On Mon, Feb 27, 2017 at 5:27 PM, Xin Long  wrote:
>> On Mon, Feb 27, 2017 at 11:45 PM, Andrey Konovalov
>>  wrote:
>>> Hi,
>>>
>>> I've got the following error report while fuzzing the kernel with syzkaller.
>>>
>>> On commit e5d56efc97f8240d0b5d66c03949382b6d7e5570 (Feb 26).
>>>
>>> A reproducer and .config are attached.
>>>
>>> ===
>>> [ ERR: suspicious RCU usage.  ]
>>> 4.10.0+ #54 Not tainted
>>> ---
>>> ./include/linux/rhashtable.h:602 suspicious rcu_dereference_check() usage!
>>>
>>> other info that might help us debug this:
>>>
>>>
>>> rcu_scheduler_active = 2, debug_locks = 0
>>> 1 lock held by a.out/4189:
>>>  #0:  (sk_lock-AF_INET6){+.+.+.}, at: []
>>> sctp_setsockopt+0x318/0x5f10
>>>
>>> stack backtrace:
>>> CPU: 1 PID: 4189 Comm: a.out Not tainted 4.10.0+ #54
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>> Call Trace:
>>>  __dump_stack lib/dump_stack.c:15
>>>  dump_stack+0x292/0x398 lib/dump_stack.c:51
>>>  lockdep_rcu_suspicious+0x139/0x180 kernel/locking/lockdep.c:4452
>>>  __rhashtable_lookup ./include/linux/rhashtable.h:602
>>>  rhltable_lookup ./include/linux/rhashtable.h:690
>>>  sctp_hash_transport+0x826/0xcc0 net/sctp/input.c:887
>>>  sctp_assoc_add_peer+0xd0b/0x1470 net/sctp/associola.c:716
>>>  __sctp_connect+0x26d/0xdb0 net/sctp/socket.c:1184
>>>  __sctp_setsockopt_connectx+0x197/0x200 net/sctp/socket.c:1338
>>>  sctp_setsockopt_connectx net/sctp/socket.c:1370
>>>  sctp_setsockopt+0x15fa/0x5f10 net/sctp/socket.c:3936
>>>  sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2725
>>>  SYSC_setsockopt net/socket.c:1786
>>>  SyS_setsockopt+0x270/0x3a0 net/socket.c:1765
>>>  entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:204
>>> RIP: 0033:0x7f3e27a55b79
>>> RSP: 002b:7f3e2296fd98 EFLAGS: 0206 ORIG_RAX: 0036
>>> RAX: ffda RBX: 7f3e229709c0 RCX: 7f3e27a55b79
>>> RDX: 006e RSI: 0084 RDI: 0003
>>> RBP: 7f3e27f21220 R08: 0010 R09: 
>>> R10: 20004000 R11: 0206 R12: 
>>> R13: 7f3e229709c0 R14: 7f3e2834c040 R15: 0003
>>> ==
>>> BUG: KASAN: use-after-free in sctp_hash_transport+0x855/0xcc0 at addr
>>> 8800671e1f8c
>>> Read of size 4 by task a.out/4189
>>> CPU: 1 PID: 4189 Comm: a.out Not tainted 4.10.0+ #54
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>> Call Trace:
>>>  __dump_stack lib/dump_stack.c:15
>>>  dump_stack+0x292/0x398 lib/dump_stack.c:51
>>>  kasan_object_err+0x1c/0x70 mm/kasan/report.c:162
>>>  print_address_description mm/kasan/report.c:200
>>>  kasan_report_error mm/kasan/report.c:289
>>>  kasan_report.part.1+0x20e/0x4e0 mm/kasan/report.c:311
>>>  kasan_report mm/kasan/report.c:331
>>>  __asan_report_load4_noabort+0x29/0x30 mm/kasan/report.c:331
>>>  rht_key_hashfn ./include/linux/rhashtable.h:254
>>>  __rhashtable_lookup ./include/linux/rhashtable.h:604
>>>  rhltable_lookup ./include/linux/rhashtable.h:690
>>>  sctp_hash_transport+0x855/0xcc0 net/sctp/input.c:887
>>>  sctp_assoc_add_peer+0xd0b/0x1470 net/sctp/associola.c:716
>>>  __sctp_connect+0x26d/0xdb0 net/sctp/socket.c:1184
>>>  __sctp_setsockopt_connectx+0x197/0x200 net/sctp/socket.c:1338
>>>  sctp_setsockopt_connectx net/sctp/socket.c:1370
>>>  sctp_setsockopt+0x15fa/0x5f10 net/sctp/socket.c:3936
>>>  sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2725
>>>  SYSC_setsockopt net/socket.c:1786
>>>  SyS_setsockopt+0x270/0x3a0 net/socket.c:1765
>>>  entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:204
>>> RIP: 0033:0x7f3e27a55b79
>>> RSP: 002b:7f3e2296fd98 EFLAGS: 0206 ORIG_RAX: 0036
>>> RAX: ffda RBX: 7f3e229709c0 RCX: 7f3e27a55b79
>>> RDX: 006e RSI: 0084 RDI: 0003
>>> RBP: 7f3e27f21220 R08: 0010 R09: 
>>> R10: 20004000 R11: 0206 R12: 
>>> R13: 7f3e229709c0 R14: 7f3e2834c040 R15: 0003
>>> Object at 8800671e1f80, in cache kmalloc-1024 size: 1024
>>> Allocated:
>>> PID = 1
>>> save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
>>>  save_stack+0x43/0xd0 mm/kasan/kasan.c:502
>>>  set_track mm/kasan/kasan.c:514
>>>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605
>>>  __kmalloc+0xa0/0x2d0 mm/slub.c:3745
>>>  kmalloc ./include/linux/slab.h:495
>>>  kzalloc ./include/linux/slab.h:663
>>>  bucket_table_alloc+0x618/0x930 lib/rhashtable.c:224
>>>  rhashtable_init+0x5f8/0xc60 lib/rhashtable.c:1006
>>>  rhltable_init+0x53/0xa0 lib/rhashtable.c:1037
>>>  sctp_transport_hashtable_init+0x1c/0x20 net/sctp/input.c:865
>>>  sctp_init+0x62c/0x88f net/sctp/protocol.c:1486
>>>  do_one_initcall+0xf3/0x390 init/main.c:788
>>>  do_initcall_lev

Re: net/sctp: use-after-free in sctp_hash_transport

2017-02-28 Thread Dmitry Vyukov
On Mon, Feb 27, 2017 at 5:27 PM, Xin Long  wrote:
> On Mon, Feb 27, 2017 at 11:45 PM, Andrey Konovalov
>  wrote:
>> Hi,
>>
>> I've got the following error report while fuzzing the kernel with syzkaller.
>>
>> On commit e5d56efc97f8240d0b5d66c03949382b6d7e5570 (Feb 26).
>>
>> A reproducer and .config are attached.
>>
>> ===
>> [ ERR: suspicious RCU usage.  ]
>> 4.10.0+ #54 Not tainted
>> ---
>> ./include/linux/rhashtable.h:602 suspicious rcu_dereference_check() usage!
>>
>> other info that might help us debug this:
>>
>>
>> rcu_scheduler_active = 2, debug_locks = 0
>> 1 lock held by a.out/4189:
>>  #0:  (sk_lock-AF_INET6){+.+.+.}, at: []
>> sctp_setsockopt+0x318/0x5f10
>>
>> stack backtrace:
>> CPU: 1 PID: 4189 Comm: a.out Not tainted 4.10.0+ #54
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:15
>>  dump_stack+0x292/0x398 lib/dump_stack.c:51
>>  lockdep_rcu_suspicious+0x139/0x180 kernel/locking/lockdep.c:4452
>>  __rhashtable_lookup ./include/linux/rhashtable.h:602
>>  rhltable_lookup ./include/linux/rhashtable.h:690
>>  sctp_hash_transport+0x826/0xcc0 net/sctp/input.c:887
>>  sctp_assoc_add_peer+0xd0b/0x1470 net/sctp/associola.c:716
>>  __sctp_connect+0x26d/0xdb0 net/sctp/socket.c:1184
>>  __sctp_setsockopt_connectx+0x197/0x200 net/sctp/socket.c:1338
>>  sctp_setsockopt_connectx net/sctp/socket.c:1370
>>  sctp_setsockopt+0x15fa/0x5f10 net/sctp/socket.c:3936
>>  sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2725
>>  SYSC_setsockopt net/socket.c:1786
>>  SyS_setsockopt+0x270/0x3a0 net/socket.c:1765
>>  entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:204
>> RIP: 0033:0x7f3e27a55b79
>> RSP: 002b:7f3e2296fd98 EFLAGS: 0206 ORIG_RAX: 0036
>> RAX: ffda RBX: 7f3e229709c0 RCX: 7f3e27a55b79
>> RDX: 006e RSI: 0084 RDI: 0003
>> RBP: 7f3e27f21220 R08: 0010 R09: 
>> R10: 20004000 R11: 0206 R12: 
>> R13: 7f3e229709c0 R14: 7f3e2834c040 R15: 0003
>> ==
>> BUG: KASAN: use-after-free in sctp_hash_transport+0x855/0xcc0 at addr
>> 8800671e1f8c
>> Read of size 4 by task a.out/4189
>> CPU: 1 PID: 4189 Comm: a.out Not tainted 4.10.0+ #54
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:15
>>  dump_stack+0x292/0x398 lib/dump_stack.c:51
>>  kasan_object_err+0x1c/0x70 mm/kasan/report.c:162
>>  print_address_description mm/kasan/report.c:200
>>  kasan_report_error mm/kasan/report.c:289
>>  kasan_report.part.1+0x20e/0x4e0 mm/kasan/report.c:311
>>  kasan_report mm/kasan/report.c:331
>>  __asan_report_load4_noabort+0x29/0x30 mm/kasan/report.c:331
>>  rht_key_hashfn ./include/linux/rhashtable.h:254
>>  __rhashtable_lookup ./include/linux/rhashtable.h:604
>>  rhltable_lookup ./include/linux/rhashtable.h:690
>>  sctp_hash_transport+0x855/0xcc0 net/sctp/input.c:887
>>  sctp_assoc_add_peer+0xd0b/0x1470 net/sctp/associola.c:716
>>  __sctp_connect+0x26d/0xdb0 net/sctp/socket.c:1184
>>  __sctp_setsockopt_connectx+0x197/0x200 net/sctp/socket.c:1338
>>  sctp_setsockopt_connectx net/sctp/socket.c:1370
>>  sctp_setsockopt+0x15fa/0x5f10 net/sctp/socket.c:3936
>>  sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2725
>>  SYSC_setsockopt net/socket.c:1786
>>  SyS_setsockopt+0x270/0x3a0 net/socket.c:1765
>>  entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:204
>> RIP: 0033:0x7f3e27a55b79
>> RSP: 002b:7f3e2296fd98 EFLAGS: 0206 ORIG_RAX: 0036
>> RAX: ffda RBX: 7f3e229709c0 RCX: 7f3e27a55b79
>> RDX: 006e RSI: 0084 RDI: 0003
>> RBP: 7f3e27f21220 R08: 0010 R09: 
>> R10: 20004000 R11: 0206 R12: 
>> R13: 7f3e229709c0 R14: 7f3e2834c040 R15: 0003
>> Object at 8800671e1f80, in cache kmalloc-1024 size: 1024
>> Allocated:
>> PID = 1
>> save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
>>  save_stack+0x43/0xd0 mm/kasan/kasan.c:502
>>  set_track mm/kasan/kasan.c:514
>>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605
>>  __kmalloc+0xa0/0x2d0 mm/slub.c:3745
>>  kmalloc ./include/linux/slab.h:495
>>  kzalloc ./include/linux/slab.h:663
>>  bucket_table_alloc+0x618/0x930 lib/rhashtable.c:224
>>  rhashtable_init+0x5f8/0xc60 lib/rhashtable.c:1006
>>  rhltable_init+0x53/0xa0 lib/rhashtable.c:1037
>>  sctp_transport_hashtable_init+0x1c/0x20 net/sctp/input.c:865
>>  sctp_init+0x62c/0x88f net/sctp/protocol.c:1486
>>  do_one_initcall+0xf3/0x390 init/main.c:788
>>  do_initcall_level init/main.c:854
>>  do_initcalls init/main.c:862
>>  do_basic_setup init/main.c:880
>>  kernel_init_freeable+0x5cc/0x6a6 init/main.c:1031
>>  kernel_init+0x1

Re: [patch] net/mlx4: && vs & typo

2017-02-28 Thread Bart Van Assche
On Tue, 2017-02-28 at 15:02 +0300, Dan Carpenter wrote:
> Bitwise & was obviously intended here.
> 
> Fixes: 745d8ae4622c ("net/mlx4: Spoofcheck and zero MAC can't coexist")
> Signed-off-by: Dan Carpenter 
> ---
> Applies to net.git.
> 
> diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
> index e965e5090d96..a858bcb6220b 100644
> --- a/include/linux/mlx4/driver.h
> +++ b/include/linux/mlx4/driver.h
> @@ -109,7 +109,7 @@ static inline void mlx4_u64_to_mac(u8 *addr, u64 mac)
>   int i;
>  
>   for (i = ETH_ALEN; i > 0; i--) {
> - addr[i - 1] = mac && 0xFF;
> + addr[i - 1] = mac & 0xFF;
>   mac >>= 8;
>   }
>  }

Is this the only place where such a loop occurs? Should a put_unaligned_be48()
function be introduced?

Bart.

Re: net/rds: use-after-free in inet_create

2017-02-28 Thread Sowmini Varadhan
On (02/28/17 15:22), Dmitry Vyukov wrote:
> 
> Hello,
> 
> I've got the following report while running syzkaller fuzzer on
> linux-next/8d01c069486aca75b8f6018a759215b0ed0c91f0. So far it
> happened only once. net was somehow deleted from underneath
> inet_create. I've noticed that rds uses sock_create_kern which does
> not take net reference. What is that that must keep net alive then?

The rds_connection (which is where the net pointer is being obtained from)
should keep the connection alive. Did you have the rds[_tcp] modules
loaded at the time of failure? Were there kernel tcp sockets to/from
the 16385 port? any hints on what else the test was doing (was it
running a userspace RDS application that triggered the kernel TCP
connection attempt in the first place)?

--Sowmini



Re: net/atm: vcc_sendmsg calls kmem_cache_alloc in non-blocking context

2017-02-28 Thread Eric Dumazet
On Tue, Feb 28, 2017 at 7:26 AM, Dmitry Vyukov  wrote:
> Hello,
>
> I've got the following WARNING while running syzkaller fuzzer:
>
> [ cut here ]
> WARNING: CPU: 0 PID: 9197 at kernel/sched/core.c:6149
> __might_sleep+0x149/0x1a0 kernel/sched/core.c:6144
> do not call blocking ops when !TASK_RUNNING; state=1 set at
> [] prepare_to_wait+0x182/0x530
> kernel/sched/wait.c:178
> Kernel panic - not syncing: panic_on_warn set ...
>
> CPU: 0 PID: 9197 Comm: syz-executor2 Not tainted 4.10.0+ #54
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:15 [inline]
>  dump_stack+0x292/0x398 lib/dump_stack.c:51
>  panic+0x1cb/0x3a9 kernel/panic.c:179
>  __warn+0x1c4/0x1e0 kernel/panic.c:540
>  warn_slowpath_fmt+0xc5/0x110 kernel/panic.c:563
>  __might_sleep+0x149/0x1a0 kernel/sched/core.c:6144
>  slab_pre_alloc_hook mm/slab.h:432 [inline]
>  slab_alloc_node mm/slub.c:2644 [inline]
>  kmem_cache_alloc_node+0x14a/0x280 mm/slub.c:2754
>  __alloc_skb+0x10f/0x770 net/core/skbuff.c:219
>  alloc_skb include/linux/skbuff.h:932 [inline]
>  alloc_tx net/atm/common.c:75 [inline]
>  vcc_sendmsg+0x5e8/0x1010 net/atm/common.c:609
>  sock_sendmsg_nosec net/socket.c:633 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:643
>  ___sys_sendmsg+0x9d2/0xae0 net/socket.c:1985
>  __sys_sendmsg+0x138/0x320 net/socket.c:2019
>  SYSC_sendmsg net/socket.c:2030 [inline]
>  SyS_sendmsg+0x2d/0x50 net/socket.c:2026
>  entry_SYSCALL_64_fastpath+0x1f/0xc2
> RIP: 0033:0x4458b9
> RSP: 002b:7f20dd37ab58 EFLAGS: 0286 ORIG_RAX: 002e
> RAX: ffda RBX: 0005 RCX: 004458b9
> RDX:  RSI: 2000 RDI: 0005
> RBP: 006e1af0 R08:  R09: 
> R10:  R11: 0286 R12: 00708000
> R13: 0005 R14: 2ff8 R15: 0008
>
> On commit 86292b33d4b79ee03e2f43ea0381ef85f077c760

Looks the typical pattern for which Cong sent various patches.

like "inet: fix sleeping inside inet_wait_for_connect()"


  1   2   >