Re: bpf pointer alignment validation

2017-05-09 Thread Alexei Starovoitov
On Tue, May 09, 2017 at 02:32:34PM -0400, David Miller wrote:
>  
> +static u32 calc_align(u32 imm)
> +{
> + u32 align = 1;
> +
> + if (!imm)
> + return 1U << 31;
> +
> + while (!(imm & 1)) {
> + imm >>= 1;
> + align <<= 1;
> + }
> + return align;
> +}

same question as in the previous reply.
Why not use something like:
static u32 calc_align(u32 n)
{
if (!n)
return 1U << 31;
return n - ((n - 1) & n);
}
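
For the record: n - ((n - 1) & n) isolates the lowest set bit of n (the
same value as n & -n in two's complement), which is exactly the
power-of-two alignment the loop version computes. A quick userspace
sketch (editorial, not from either patch) to check the equivalence:

#include <assert.h>
#include <stdint.h>

static uint32_t lowest_bit(uint32_t n)
{
	return n - ((n - 1) & n);	/* == n & -n */
}

int main(void)
{
	uint32_t n, align;

	/* powers of two map to themselves: (n - 1) & n == 0 */
	for (n = 1; n != 0; n <<= 1)
		assert(lowest_bit(n) == n);

	/* e.g. n = 12 (0b1100): (n - 1) & n == 0b1000, 12 - 8 == 4 */
	for (align = 1, n = 12; !(n & 1); n >>= 1)
		align <<= 1;
	assert(align == 4 && lowest_bit(12) == 4);
	return 0;
}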

> - if (log_level && do_print_state) {
> + if (log_level > 1 || (log_level && do_print_state)) {
>   verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
>   print_verifier_state(&env->cur_state);
>   do_print_state = false;

this needs to be tweaked like
if (log_level > 1)
verbose("%d:", insn_idx);
else
verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);

otherwise it prints prev_insn_idx, which is meaningful
only when processing exit and search pruning.
That's why below ...

> + .descr = "unknown shift",
> + .insns = {
> + LOAD_UNKNOWN(BPF_REG_3),
> + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
> + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
> + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
> + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
> + LOAD_UNKNOWN(BPF_REG_4),
> + BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 5),
> + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
> + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
> + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
> + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
> + BPF_MOV64_IMM(BPF_REG_0, 0),
> + BPF_EXIT_INSN(),
> + },
> + .prog_type = BPF_PROG_TYPE_SCHED_CLS,
> + .matches = {
> + "from 4 to 7: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=inv56 R10=fp",
> + "from 4 to 8: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=inv55,min_align=2 R10=fp",
> + "from 4 to 9: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=inv54,min_align=4 R10=fp",
> + "from 4 to 10: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=inv53,min_align=8 R10=fp",
> + "from 4 to 11: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=inv52,min_align=16 R10=fp",
> + "from 15 to 18: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=inv56 R10=fp",
> + "from 15 to 19: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=inv51,min_align=32 R10=fp",
> + "from 15 to 20: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=pkt_end R4=inv52,min_align=16 R10=fp",

... looks crazy here, since the program is linear and 'from 4' and 'from 15'
are completely non-obvious and will certainly change in the future - it just
happened that the search pruning heuristic kicked in at those points.
Hence doing verbose("%d:", insn_idx); is necessary to avoid the noise.

> + .prog_type = BPF_PROG_TYPE_SCHED_CLS,
> + .matches = {
> + /* Calculated offset in R4 has unknown value, but known
> +  * alignment of 4.
> +  */
> + "from 4 to 8: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=pkt_end R6=inv54,min_align=4 R10=fp",
> +
> + /* Offset is added to packet pointer, resulting in known
> +  * auxiliary alignment and offset.
> +  */
> + "from 4 to 11: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=pkt_end 
> R5=pkt(id=1,off=0,r=0),aux_off=14,aux_off_align=4 R6=inv54,min_align=4 
> R10=fp",
> +
> + /* At the time the word size load is performed from R5,
> +  * its total offset is NET_IP_ALIGN + reg->off (0) +
> +  * reg->aux_off (14) which is 16.  Then the variable
> +  * offset is considered using reg->aux_off_align which
> +  * is 4 and meets the load's requirements.
> +  */
> + "from 13 to 15: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=pkt_end 
> R4=pkt(id=1,off=4,r=4),aux_off=14,aux_off_align=4 
> R5=pkt(id=1,off=0,r=4),aux_off=14,aux_off_align=4 R6=inv54,min_align=4 
> R10=fp",
> +
> +
> + /* Variable offset is added to R5 packet pointer,
> +  * resulting in auxiliary alignment of 4.
> +  */
> + "from 13 to 18: R0=pkt(id=0,off=8,r=8) R1=ctx 
> R2=pkt(id=0,off=0,r=8) R3=pkt_end 

Re: DQL and TCQ_F_CAN_BYPASS destroy performance under virtualization (Was: "Re: net_sched strange in 4.11")

2017-05-09 Thread Anton Ivanov
[snip]

> Virtio-net net does not support BQL. Before commit ea7735d97ba9
> ("virtio-net: move free_old_xmit_skbs"), it's even impossible to
> support that since we don't have tx interrupt for each packet.  I
> haven't measured the impact of xmit_more, maybe I was wrong but I
> think it may help in some cases since it may improve the batching on
> host more or less.

Sorry, hit send too soon.

The impact of xmit_more depends on your transport.

If, for example, you are using sendmmsg on the outer side which can
consume the bulked data "as is", the impact is quite significant. If
your transport does not support bulking, the fact there was bulking
earlier in the chain has little impact.

There is some, but not a lot.
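
To make the sendmmsg case concrete, here is a hypothetical transport-side
sketch (names and batch size are made up): buffer frames while the chain
says more are coming, then flush the whole batch with a single syscall:

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define BULK_MAX 64

struct bulk_tx {
	struct mmsghdr msgs[BULK_MAX];
	struct iovec iovs[BULK_MAX];
	unsigned int count;
};

/* Queue one frame; flush when the batch is full or no more frames follow. */
static int bulk_xmit(int fd, struct bulk_tx *b, void *frame, size_t len,
		     int more)
{
	int sent;

	b->iovs[b->count].iov_base = frame;
	b->iovs[b->count].iov_len = len;
	b->msgs[b->count].msg_hdr = (struct msghdr) {
		.msg_iov = &b->iovs[b->count],
		.msg_iovlen = 1,
	};
	b->count++;

	if (more && b->count < BULK_MAX)
		return 0;			/* keep batching */

	sent = sendmmsg(fd, b->msgs, b->count, 0);	/* one syscall per batch */
	b->count = 0;
	return sent;
}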

[snip]

-- 
Anton R. Ivanov
Cambridgegreys Limited. Registered in England. Company Number 10273661



Re: DQL and TCQ_F_CAN_BYPASS destroy performance under virtualization (Was: "Re: net_sched strange in 4.11")

2017-05-09 Thread Anton Ivanov
On 10/05/17 03:18, Jason Wang wrote:
>
>
>> On 2017-05-09 23:11, Stefan Hajnoczi wrote:
>> On Tue, May 09, 2017 at 08:46:46AM +0100, Anton Ivanov wrote:
>>> I have figured it out. Two issues.
>>>
>>> 1) skb->xmit_more is hardly ever set under virtualization because
>>> the qdisc
>>> is usually bypassed because of TCQ_F_CAN_BYPASS. Once
>>> TCQ_F_CAN_BYPASS is
>>> set a virtual NIC driver is not likely see skb->xmit_more (this
>>> answers my
>>> "how does this work at all" question).
>>>
>>> 2) If that flag is turned off (I patched sched_generic to turn it
>>> off in
>>> pfifo_fast while testing), DQL keeps xmit_more from being set. If
>>> the driver
>>> is not DQL enabled xmit_more is never ever set. If the driver is DQL
>>> enabled
>>> the queue is adjusted to ensure xmit_more stops happening within
>>> 10-15 xmit
>>> cycles.
>>>
>>> That is plain *wrong* for virtual NICs - virtio, emulated NICs, etc.
>>> There,
>>> the BIG cost is telling the hypervisor that it needs to "kick" the
>>> packets.
>>> The cost of putting them into the vNIC buffers is negligible. You want
>>> xmit_more to happen - it makes between 50% and 300% (depending on vNIC
>>> design) difference. If there is no xmit_more the vNIC will immediately
>>> "kick" the hypervisor and try to signal that  the packet needs to move
>>> straight away (as for example in virtio_net).
>
> How do you measure the performance? TCP or just measure pps?

In this particular case - tcp from guest. I have a couple of other
benchmarks (forwarding, etc).

>
>>>
>>> In addition to that, the perceived line rate is proportional to this
>>> cost,
>>> so I am not sure that the current dql math holds. In fact, I think
>>> it does
>>> not - it is trying to adjust something which influences the
>>> perceived line
>>> rate.
>>>
>>> So - how do we turn BOTH bypass and DQL adjustment while under
>>> virtualization and set them to be "always qdisc" + "always xmit_more
>>> allowed"
>
> Virtio-net net does not support BQL. Before commit ea7735d97ba9
> ("virtio-net: move free_old_xmit_skbs"), it's even impossible to
> support that since we don't have tx interrupt for each packet.  I
> haven't measured the impact of xmit_more, maybe I was wrong but I
> think it may help in some cases since it may improve the batching on
> host more or less.

If you do not support BQL, you might as well look at the xmit_more part of
the kick code path (line 1127).

bool kick = !skb->xmit_more; effectively means kick = true;

xmit_more will never be set, so you will be kicking on each and every
packet: it is now set only as a result of BQL, and if BQL is not enabled you
never get it. Now, will the current dql code work correctly if you do not
have a defined line rate and completion interrupts? No idea. Probably not.
IMHO, instead of trying to fix it there should be a way for a device or
architecture to turn it off.
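
To illustrate the pattern (a sketch, not the actual virtio_net code; the
vnic_* names are hypothetical):

static netdev_tx_t vnic_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vnic_priv *priv = netdev_priv(dev);	/* hypothetical driver */
	/* without BQL or a qdisc in the path, xmit_more is always false here */
	bool kick = !skb->xmit_more;

	vnic_queue_frame(priv, skb);		/* cheap: copy into shared ring */
	if (kick)
		vnic_notify_hypervisor(priv);	/* expensive: vmexit per kick */

	return NETDEV_TX_OK;
}

With xmit_more honoured, the notification collapses to once per burst
instead of once per packet, which is where the 50-300% difference above
comes from.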

To be clear - I ran into this while working on my own drivers for UML; you
are cc-ed because you are likely to be one of the most affected.

A.

>
> Thanks
>
>>>
>>> A.
>>>
>>> P.S. Cc-ing virtio maintainer
>> CCing Michael Tsirkin and Jason Wang, who are the core virtio and
>> virtio-net maintainers.  (I maintain the vsock driver - it's unrelated
>> to this discussion.)
>>
>>> A.
>>>
>>>
>>> On 08/05/17 08:15, Anton Ivanov wrote:
 Hi all,

 I was revising some of my old work for UML to prepare it for
 submission
 and I noticed that skb->xmit_more does not seem to be set any more.

 I traced the issue as far as net/sched/sch_generic.c

 try_bulk_dequeue_skb() is never invoked (the drivers I am working
 on are
 dql enabled so that is not the problem).

 More interestingly, if I put a breakpoint and debug output into
 dequeue_skb() around line 147 - right before the bulk: tag that skb
 there is always NULL. ???

 Similarly, debug in pfifo_fast_dequeue shows only NULLs being
 dequeued.
 Again - ???

 First and foremost, I apologize for the silly question, but how can
 this
 work at all? I see the skbs showing up at the driver level, why are
 NULLs being returned at qdisc dequeue and where do the skbs at the
 driver level come from?

 Second, where should I look to fix it?

 A.

>>>
>>> -- 
>>> Anton R. Ivanov
>>>
>>> Cambridge Greys Limited, England company No 10273661
>>> http://www.cambridgegreys.com/
>>>
>
>


-- 
Anton R. Ivanov
Cambridgegreys Limited. Registered in England. Company Number 10273661



Re: [PATCH] net: fec: select queue depending on VLAN priority

2017-05-09 Thread Stefan Agner
On 2017-05-09 06:39, David Miller wrote:
> From: Stefan Agner 
> Date: Mon,  8 May 2017 22:37:08 -0700
> 
>> Since the addition of the multi queue code with commit 59d0f7465644
>> ("net: fec: init multi queue date structure") the queue selection
>> has been handelt by the default transmit queue selection
>> implementation which tries to evenly distribute the traffic across
>> all available queues. This selection presumes that the queues are
>> using an equal priority, however, the queues 1 and 2 are actually
>> of higher priority (the classification of the queues is enabled in
>> fec_enet_enable_ring).
>>
>> This can lead to net scheduler warnings and continuous TX ring
>> dumps when exercising the system with iperf.
>>
>> Use only queue 0 for all common traffic (no VLAN and P802.1p
>> priority 0 and 1) and route level 2-7 through queue 1 and 2.
>>
>> Signed-off-by: Fugang Duan 
>> Fixes: 59d0f7465644 ("net: fec: init multi queue date structure")
> 
> If the queues are used for prioritization, and it does not have
> multiple normal priority level queues, multiqueue is not what the
> driver should have implemented.

As Andy mentioned, there is also a round-robin mode. I'll try that.

What would be the proper way to use the prioritized queues?

--
Stefan


Re: [PATCH] libertas: Avoid reading past end of buffer

2017-05-09 Thread Joe Perches
On Tue, 2017-05-09 at 16:23 -0700, Kees Cook wrote:
> Using memcpy() from a string that is shorter than the length copied means
> the destination buffer is being filled with arbitrary data from the kernel
> rodata segment. Instead, use strncpy() which will fill the trailing bytes
> with zeros. Additionally adjust indentation to keep checkpatch.pl happy.
> 
> This was found with the future CONFIG_FORTIFY_SOURCE feature.
[]
> diff --git a/drivers/net/wireless/marvell/libertas/mesh.c 
> b/drivers/net/wireless/marvell/libertas/mesh.c
[]
> @@ -1177,9 +1177,9 @@ void lbs_mesh_ethtool_get_strings(struct net_device 
> *dev,
>   switch (stringset) {
>   case ETH_SS_STATS:
>   for (i = 0; i < MESH_STATS_NUM; i++) {
> - memcpy(s + i * ETH_GSTRING_LEN,
> - mesh_stat_strings[i],
> - ETH_GSTRING_LEN);
> + strncpy(s + i * ETH_GSTRING_LEN,
> + mesh_stat_strings[i],
> + ETH_GSTRING_LEN);
>   }

The better solution is to declare
mesh_stat_strings in the normal way:

---
 drivers/net/wireless/marvell/libertas/mesh.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/marvell/libertas/mesh.c 
b/drivers/net/wireless/marvell/libertas/mesh.c
index d0c881dd5846..a535e7f48d2d 100644
--- a/drivers/net/wireless/marvell/libertas/mesh.c
+++ b/drivers/net/wireless/marvell/libertas/mesh.c
@@ -1108,15 +1108,15 @@ void lbs_mesh_set_txpd(struct lbs_private *priv,
  * Ethtool related
  */
 
-static const char * const mesh_stat_strings[] = {
-   "drop_duplicate_bcast",
-   "drop_ttl_zero",
-   "drop_no_fwd_route",
-   "drop_no_buffers",
-   "fwded_unicast_cnt",
-   "fwded_bcast_cnt",
-   "drop_blind_table",
-   "tx_failed_cnt"
+static const char mesh_stat_strings[][ETH_GSTRING_LEN] = {
+   "drop_duplicate_bcast",
+   "drop_ttl_zero",
+   "drop_no_fwd_route",
+   "drop_no_buffers",
+   "fwded_unicast_cnt",
+   "fwded_bcast_cnt",
+   "drop_blind_table",
+   "tx_failed_cnt",
 };
 
 void lbs_mesh_ethtool_get_stats(struct net_device *dev,


Re: [PATCH] net: dsa: loop: Free resources if initialization is deferred

2017-05-09 Thread Julia Lawall


On Wed, 10 May 2017, Christophe JAILLET wrote:

> Free some devm'allocated memory in case of deferred driver initialization.
> This avoids wasting memory in such a case.

I really think it would be helpful to mention the special behavior of
-EPROBE_DEFER.  It doesn't take much space, and it could be helpful to
someone in the future.

julia

>
> Suggested-by: Joe Perches 
> Signed-off-by: Christophe JAILLET 
> ---
>  drivers/net/dsa/dsa_loop.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
> index a19e1781e9bb..557afb418320 100644
> --- a/drivers/net/dsa/dsa_loop.c
> +++ b/drivers/net/dsa/dsa_loop.c
> @@ -260,8 +260,11 @@ static int dsa_loop_drv_probe(struct mdio_device 
> *mdiodev)
>   return -ENOMEM;
>
>   ps->netdev = dev_get_by_name(&init_net, pdata->netdev);
> - if (!ps->netdev)
> + if (!ps->netdev) {
> + devm_kfree(&mdiodev->dev, ps);
> + devm_kfree(&mdiodev->dev, ds);
>   return -EPROBE_DEFER;
> + }
>
>   pdata->cd.netdev[DSA_LOOP_CPU_PORT] = &ps->netdev->dev;
>
> --
> 2.11.0
>


[PATCH] net: dsa: loop: Free resources if initialization is deferred

2017-05-09 Thread Christophe JAILLET
Free some devm'allocated memory in case of deferred driver initialization.
This avoids wasting memory in such a case.

Suggested-by: Joe Perches 
Signed-off-by: Christophe JAILLET 
---
 drivers/net/dsa/dsa_loop.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index a19e1781e9bb..557afb418320 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -260,8 +260,11 @@ static int dsa_loop_drv_probe(struct mdio_device *mdiodev)
return -ENOMEM;
 
ps->netdev = dev_get_by_name(&init_net, pdata->netdev);
-   if (!ps->netdev)
+   if (!ps->netdev) {
+   devm_kfree(&mdiodev->dev, ps);
+   devm_kfree(&mdiodev->dev, ds);
return -EPROBE_DEFER;
+   }
 
pdata->cd.netdev[DSA_LOOP_CPU_PORT] = &ps->netdev->dev;
 
-- 
2.11.0



[PATCH net-next V4 01/10] ptr_ring: batch ring zeroing

2017-05-09 Thread Jason Wang
From: "Michael S. Tsirkin" 

A known weakness in ptr_ring design is that it does not handle well the
situation when ring is almost full: as entries are consumed they are
immediately used again by the producer, so consumer and producer are
writing to a shared cache line.

To fix this, add batching to consume calls: as entries are
consumed do not write NULL into the ring until we get
a multiple (in current implementation 2x) of cache lines
away from the producer. At that point, write them all out.

We do the write out in the reverse order to keep
producer from sharing cache with consumer for as long
as possible.

Writeout also triggers when ring wraps around - there's
no special reason to do this but it helps keep the code
a bit simpler.

What should we do if getting away from the producer by 2 cache lines
would mean we are keeping the ring more than half empty?
Maybe we should reduce the batching in this case;
the current patch simply reduces the batching.
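
For reference, the batch size can be derived from the cache line size at
init time; a sketch of that calculation (mirroring the intent described
above - the exact clamp is an assumption):

	r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue));
	/* Batching too much on a small ring would keep the ring
	 * artificially empty; also __ptr_ring_discard_one() needs
	 * batch >= 1 for its logic to work.
	 */
	if (r->batch > r->size / 2 || !r->batch)
		r->batch = 1;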

Notes:
- it is no longer true that a call to consume guarantees
  that the following call to produce will succeed.
  No users seem to assume that.
- batching can also in theory reduce the signalling rate:
  users that would previously send interrupts to the producer
  to wake it up after consuming each entry would now only
  need to do this once in a batch.
  Doing this would be easy by returning a flag to the caller.
  No users seem to do signalling on consume yet so this was not
  implemented yet.

Tested with pktgen on tap with xdp1 in guest:

Before 2.10 Mpps
After  2.27 Mpps (+7.48%)

Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Jesper Dangaard Brouer 
Signed-off-by: Jason Wang 
---
 include/linux/ptr_ring.h | 63 +---
 1 file changed, 54 insertions(+), 9 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6c70444..6b2e0dd 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -34,11 +34,13 @@
 struct ptr_ring {
int producer ____cacheline_aligned_in_smp;
spinlock_t producer_lock;
-   int consumer ____cacheline_aligned_in_smp;
+   int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */
+   int consumer_tail; /* next entry to invalidate */
spinlock_t consumer_lock;
/* Shared consumer/producer data */
/* Read-only by both the producer and the consumer */
int size ____cacheline_aligned_in_smp; /* max entries in queue */
+   int batch; /* number of entries to consume in a batch */
void **queue;
 };
 
@@ -170,7 +172,7 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, 
void *ptr)
 static inline void *__ptr_ring_peek(struct ptr_ring *r)
 {
if (likely(r->size))
-   return r->queue[r->consumer];
+   return r->queue[r->consumer_head];
return NULL;
 }
 
@@ -231,9 +233,38 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r)
 /* Must only be called after __ptr_ring_peek returned !NULL */
 static inline void __ptr_ring_discard_one(struct ptr_ring *r)
 {
-   r->queue[r->consumer++] = NULL;
-   if (unlikely(r->consumer >= r->size))
-   r->consumer = 0;
+   /* Fundamentally, what we want to do is update consumer
+* index and zero out the entry so producer can reuse it.
+* Doing it naively at each consume would be as simple as:
+*   r->queue[r->consumer++] = NULL;
+*   if (unlikely(r->consumer >= r->size))
+*   r->consumer = 0;
+* but that is suboptimal when the ring is full as producer is writing
+* out new entries in the same cache line.  Defer these updates until a
+* batch of entries has been consumed.
+*/
+   int head = r->consumer_head++;
+
+   /* Once we have processed enough entries invalidate them in
+* the ring all at once so producer can reuse their space in the ring.
+* We also do this when we reach end of the ring - not mandatory
+* but helps keep the implementation simple.
+*/
+   if (unlikely(r->consumer_head - r->consumer_tail >= r->batch ||
+r->consumer_head >= r->size)) {
+   /* Zero out entries in the reverse order: this way we touch the
+* cache line that producer might currently be reading the last;
+* producer won't make progress and touch other cache lines
+* besides the first one until we write out all entries.
+*/
+   while (likely(head >= r->consumer_tail))
+   r->queue[head--] = NULL;
+   r->consumer_tail = r->consumer_head;
+   }
+   if (unlikely(r->consumer_head >= r->size)) {
+   r->consumer_head = 0;
+   r->consumer_tail = 0;
+   }
 }
 
 static inline void *__ptr_ring_consume(struct ptr_ring *r)
@@ -345,14 

[PATCH net-next V4 02/10] ptr_ring: add ptr_ring_unconsume

2017-05-09 Thread Jason Wang
From: "Michael S. Tsirkin" 

Applications that consume a batch of entries in one go
can benefit from the ability to return some of them back
into the ring.

Add an API for that - assuming there's space. If there's no space we
naturally can't do this and have to drop entries, but this implies the ring
is full so we'd likely drop some anyway.
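
A usage sketch (hypothetical caller: do_work() is assumed to return how
many entries it actually processed, and entries are assumed kmalloc'ed so
kfree() works as the destructor; uses the batched consume helper added
later in this series):

	void *batch[16];
	int n, used;

	n = ptr_ring_consume_batched(r, batch, 16);
	used = do_work(batch, n);	/* may stop before processing all n */

	/* return the unprocessed tail; destroy whatever no longer fits */
	ptr_ring_unconsume(r, batch + used, n - used, kfree);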

Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Jason Wang 
---
 include/linux/ptr_ring.h | 55 
 1 file changed, 55 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6b2e0dd..796b90f 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -403,6 +403,61 @@ static inline int ptr_ring_init(struct ptr_ring *r, int 
size, gfp_t gfp)
return 0;
 }
 
+/*
+ * Return entries into ring. Destroy entries that don't fit.
+ *
+ * Note: this is expected to be a rare slow path operation.
+ *
+ * Note: producer lock is nested within consumer lock, so if you
+ * resize you must make sure all uses nest correctly.
+ * In particular if you consume ring in interrupt or BH context, you must
+ * disable interrupts/BH when doing so.
+ */
+static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
+ void (*destroy)(void *))
+{
+   unsigned long flags;
+   int head;
+
+   spin_lock_irqsave(&r->consumer_lock, flags);
+   spin_lock(&r->producer_lock);
+
+   if (!r->size)
+   goto done;
+
+   /*
+* Clean out buffered entries (for simplicity). This way following code
+* can test entries for NULL and if not assume they are valid.
+*/
+   head = r->consumer_head - 1;
+   while (likely(head >= r->consumer_tail))
+   r->queue[head--] = NULL;
+   r->consumer_tail = r->consumer_head;
+
+   /*
+* Go over entries in batch, start moving head back and copy entries.
+* Stop when we run into previously unconsumed entries.
+*/
+   while (n) {
+   head = r->consumer_head - 1;
+   if (head < 0)
+   head = r->size - 1;
+   if (r->queue[head]) {
+   /* This batch entry will have to be destroyed. */
+   goto done;
+   }
+   r->queue[head] = batch[--n];
+   r->consumer_tail = r->consumer_head = head;
+   }
+
+done:
+   /* Destroy all entries left in the batch. */
+   while (n)
+   destroy(batch[--n]);
+   spin_unlock(&r->producer_lock);
+   spin_unlock_irqrestore(&r->consumer_lock, flags);
+}
+
 static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
   int size, gfp_t gfp,
   void (*destroy)(void *))
-- 
2.7.4



[PATCH net-next V4 03/10] skb_array: introduce skb_array_unconsume

2017-05-09 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 include/linux/skb_array.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index f4dfade..79850b6 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -156,6 +156,12 @@ static void __skb_array_destroy_skb(void *ptr)
kfree_skb(ptr);
 }
 
+static inline void skb_array_unconsume(struct skb_array *a,
+  struct sk_buff **skbs, int n)
+{
+   ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb);
+}
+
 static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 {
return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
-- 
2.7.4



[PATCH net-next V4 00/10] vhost_net batch dequeuing

2017-05-09 Thread Jason Wang
This series tries to implement rx batching for vhost-net. This is done
by batching the dequeuing from the skb_array exported by the underlying
socket and passing the skb back through msg_control to finish the
userspace copying. This is also a requirement for more batching
implementation on the rx path.

Tests show at most an 8.8% improvement on rx pps on top of batch zeroing.

Please review.

Thanks

Changes from V3:
- add batch zeroing patch to fix the build warnings

Changes from V2:
- rebase to net-next HEAD
- use unconsume helpers to put skb back on releasing
- introduce and use vhost_net internal buffer helpers
- renew performance numbers on top of batch zeroing

Changes from V1:
- switch to use for() in __ptr_ring_consume_batched()
- rename peek_head_len_batched() to fetch_skbs()
- use skb_array_consume_batched() instead of
  skb_array_consume_batched_bh() since no consumer runs in bh
- drop the lockless peeking patch since skb_array could be resized, so
  it's not safe to call lockless one

Jason Wang (8):
  skb_array: introduce skb_array_unconsume
  ptr_ring: introduce batch dequeuing
  skb_array: introduce batch dequeuing
  tun: export skb_array
  tap: export skb_array
  tun: support receiving skb through msg_control
  tap: support receiving skb from msg_control
  vhost_net: try batch dequing from skb array

Michael S. Tsirkin (2):
  ptr_ring: batch ring zeroing
  ptr_ring: add ptr_ring_unconsume

 drivers/net/tap.c |  25 ++-
 drivers/net/tun.c |  31 ++--
 drivers/vhost/net.c   | 117 +++--
 include/linux/if_tap.h|   5 ++
 include/linux/if_tun.h|   5 ++
 include/linux/ptr_ring.h  | 183 +++---
 include/linux/skb_array.h |  31 
 7 files changed, 370 insertions(+), 27 deletions(-)

-- 
2.7.4



[PATCH net-next V4 05/10] skb_array: introduce batch dequeuing

2017-05-09 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 include/linux/skb_array.h | 25 +
 1 file changed, 25 insertions(+)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 79850b6..35226cd 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -97,21 +97,46 @@ static inline struct sk_buff *skb_array_consume(struct 
skb_array *a)
return ptr_ring_consume(&a->ring);
 }
 
+static inline int skb_array_consume_batched(struct skb_array *a,
+   struct sk_buff **array, int n)
+{
+   return ptr_ring_consume_batched(&a->ring, (void **)array, n);
+}
+
 static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a)
 {
return ptr_ring_consume_irq(&a->ring);
 }
 
+static inline int skb_array_consume_batched_irq(struct skb_array *a,
+   struct sk_buff **array, int n)
+{
+   return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n);
+}
+
 static inline struct sk_buff *skb_array_consume_any(struct skb_array *a)
 {
return ptr_ring_consume_any(&a->ring);
 }
 
+static inline int skb_array_consume_batched_any(struct skb_array *a,
+   struct sk_buff **array, int n)
+{
+   return ptr_ring_consume_batched_any(&a->ring, (void **)array, n);
+}
+
+
 static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a)
 {
return ptr_ring_consume_bh(&a->ring);
 }
 
+static inline int skb_array_consume_batched_bh(struct skb_array *a,
+  struct sk_buff **array, int n)
+{
+   return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n);
+}
+
 static inline int __skb_array_len_with_tag(struct sk_buff *skb)
 {
if (likely(skb)) {
-- 
2.7.4



[PATCH net-next V4 04/10] ptr_ring: introduce batch dequeuing

2017-05-09 Thread Jason Wang
This patch introduces a batched version of consuming: a consumer can
dequeue more than one pointer from the ring at a time. We don't care
about reordering of reads here, so no compiler barrier is needed.

Signed-off-by: Jason Wang 
---
 include/linux/ptr_ring.h | 65 
 1 file changed, 65 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 796b90f..d8c97ec 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -278,6 +278,22 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
return ptr;
 }
 
+static inline int __ptr_ring_consume_batched(struct ptr_ring *r,
+void **array, int n)
+{
+   void *ptr;
+   int i;
+
+   for (i = 0; i < n; i++) {
+   ptr = __ptr_ring_consume(r);
+   if (!ptr)
+   break;
+   array[i] = ptr;
+   }
+
+   return i;
+}
+
 /*
  * Note: resize (below) nests producer lock within consumer lock, so if you
  * call this in interrupt or BH context, you must disable interrupts/BH when
@@ -328,6 +344,55 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
return ptr;
 }
 
+static inline int ptr_ring_consume_batched(struct ptr_ring *r,
+  void **array, int n)
+{
+   int ret;
+
+   spin_lock(&r->consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock(&r->consumer_lock);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r,
+  void **array, int n)
+{
+   int ret;
+
+   spin_lock_irq(&r->consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_irq(&r->consumer_lock);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_any(struct ptr_ring *r,
+  void **array, int n)
+{
+   unsigned long flags;
+   int ret;
+
+   spin_lock_irqsave(&r->consumer_lock, flags);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_irqrestore(&r->consumer_lock, flags);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
+ void **array, int n)
+{
+   int ret;
+
+   spin_lock_bh(&r->consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_bh(&r->consumer_lock);
+
+   return ret;
+}
+
 /* Cast to structure type and call a function without discarding from FIFO.
  * Function must return a value.
  * Callers must take consumer_lock.
-- 
2.7.4



[PATCH net-next V4 08/10] tun: support receiving skb through msg_control

2017-05-09 Thread Jason Wang
This patch makes tun_recvmsg() able to receive an skb from its caller
through msg_control. Vhost_net will be the first user.

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3cbfc5c..f8041f9c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1510,9 +1510,8 @@ static struct sk_buff *tun_ring_recv(struct tun_file 
*tfile, int noblock,
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
   struct iov_iter *to,
-  int noblock)
+  int noblock, struct sk_buff *skb)
 {
-   struct sk_buff *skb;
ssize_t ret;
int err;
 
@@ -1521,10 +1520,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, 
struct tun_file *tfile,
if (!iov_iter_count(to))
return 0;
 
-   /* Read frames from ring */
-   skb = tun_ring_recv(tfile, noblock, &err);
-   if (!skb)
-   return err;
+   if (!skb) {
+   /* Read frames from ring */
+   skb = tun_ring_recv(tfile, noblock, &err);
+   if (!skb)
+   return err;
+   }
 
ret = tun_put_user(tun, tfile, skb, to);
if (unlikely(ret < 0))
@@ -1544,7 +1545,7 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, 
struct iov_iter *to)
 
if (!tun)
return -EBADFD;
-   ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK);
+   ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
ret = min_t(ssize_t, ret, len);
if (ret > 0)
iocb->ki_pos = ret;
@@ -1646,7 +1647,8 @@ static int tun_recvmsg(struct socket *sock, struct msghdr 
*m, size_t total_len,
 SOL_PACKET, TUN_TX_TIMESTAMP);
goto out;
}
-   ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT);
+   ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT,
+ m->msg_control);
if (ret > (ssize_t)total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
-- 
2.7.4



[PATCH net-next V4 06/10] tun: export skb_array

2017-05-09 Thread Jason Wang
This patch exports skb_array through tun_get_skb_array(). Caller can
then manipulate skb array directly.

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c  | 13 +
 include/linux/if_tun.h |  5 +
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bbd707b..3cbfc5c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2626,6 +2626,19 @@ struct socket *tun_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
+struct skb_array *tun_get_skb_array(struct file *file)
+{
+   struct tun_file *tfile;
+
+   if (file->f_op != &tun_fops)
+   return ERR_PTR(-EINVAL);
+   tfile = file->private_data;
+   if (!tfile)
+   return ERR_PTR(-EBADFD);
+   return &tfile->tx_array;
+}
+EXPORT_SYMBOL_GPL(tun_get_skb_array);
+
 module_init(tun_init);
 module_exit(tun_cleanup);
 MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index ed6da2e..bf9bdf4 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,7 @@
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
+struct skb_array *tun_get_skb_array(struct file *file);
 #else
#include <linux/err.h>
#include <linux/errno.h>
@@ -28,5 +29,9 @@ static inline struct socket *tun_get_socket(struct file *f)
 {
return ERR_PTR(-EINVAL);
 }
+static inline struct skb_array *tun_get_skb_array(struct file *f)
+{
+   return ERR_PTR(-EINVAL);
+}
 #endif /* CONFIG_TUN */
 #endif /* __IF_TUN_H */
-- 
2.7.4



[PATCH net-next V4 07/10] tap: export skb_array

2017-05-09 Thread Jason Wang
This patch exports skb_array through tap_get_skb_array(). Caller can
then manipulate skb array directly.

Signed-off-by: Jason Wang 
---
 drivers/net/tap.c  | 13 +
 include/linux/if_tap.h |  5 +
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 4d4173d..abdaf86 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1193,6 +1193,19 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
+struct skb_array *tap_get_skb_array(struct file *file)
+{
+   struct tap_queue *q;
+
+   if (file->f_op != &tap_fops)
+   return ERR_PTR(-EINVAL);
+   q = file->private_data;
+   if (!q)
+   return ERR_PTR(-EBADFD);
+   return &q->skb_array;
+}
+EXPORT_SYMBOL_GPL(tap_get_skb_array);
+
 int tap_queue_resize(struct tap_dev *tap)
 {
struct net_device *dev = tap->dev;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 3482c3c..4837157 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -3,6 +3,7 @@
 
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
+struct skb_array *tap_get_skb_array(struct file *file);
 #else
#include <linux/err.h>
#include <linux/errno.h>
@@ -12,6 +13,10 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
return ERR_PTR(-EINVAL);
 }
+static inline struct skb_array *tap_get_skb_array(struct file *f)
+{
+   return ERR_PTR(-EINVAL);
+}
 #endif /* CONFIG_TAP */
 
 #include 
-- 
2.7.4



[PATCH net-next V4 09/10] tap: support receiving skb from msg_control

2017-05-09 Thread Jason Wang
This patch makes tap_recvmsg() able to receive an skb from its caller
through msg_control. Vhost_net will be the first user.

Signed-off-by: Jason Wang 
---
 drivers/net/tap.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index abdaf86..9af3239 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -824,15 +824,17 @@ static ssize_t tap_put_user(struct tap_queue *q,
 
 static ssize_t tap_do_read(struct tap_queue *q,
   struct iov_iter *to,
-  int noblock)
+  int noblock, struct sk_buff *skb)
 {
DEFINE_WAIT(wait);
-   struct sk_buff *skb;
ssize_t ret = 0;
 
if (!iov_iter_count(to))
return 0;
 
+   if (skb)
+   goto put;
+
while (1) {
if (!noblock)
prepare_to_wait(sk_sleep(&q->sk), &wait,
@@ -856,6 +858,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
if (!noblock)
finish_wait(sk_sleep(&q->sk), &wait);
 
+put:
if (skb) {
ret = tap_put_user(q, skb, to);
if (unlikely(ret < 0))
@@ -872,7 +875,7 @@ static ssize_t tap_read_iter(struct kiocb *iocb, struct 
iov_iter *to)
struct tap_queue *q = file->private_data;
ssize_t len = iov_iter_count(to), ret;
 
-   ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK);
+   ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL);
ret = min_t(ssize_t, ret, len);
if (ret > 0)
iocb->ki_pos = ret;
@@ -1155,7 +1158,8 @@ static int tap_recvmsg(struct socket *sock, struct msghdr 
*m,
int ret;
if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
return -EINVAL;
-   ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
+   ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT,
+ m->msg_control);
if (ret > total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
-- 
2.7.4



[PATCH net-next V4 10/10] vhost_net: try batch dequing from skb array

2017-05-09 Thread Jason Wang
We used to dequeue one skb during recvmsg() from skb_array; this could
be inefficient because of the bad cache utilization and the spinlock
touching for each packet. This patch tries to batch them by calling the
batch dequeuing helpers explicitly on the exported skb array and passing
the skb back through msg_control for the underlying socket to finish the
userspace copying.

Batch dequeuing is also a requirement for further batching improvements
on rx.

Tests were done by pktgen on tap with XDP1 in guest on top of batch
zeroing:

rx batch | pps

256      2.41Mpps (+6.16%)
128      2.48Mpps (+8.80%)
64       2.38Mpps (+3.96%) <- Default
16       2.31Mpps (+1.76%)
4        2.31Mpps (+1.76%)
1        2.30Mpps (+1.32%)
0        2.27Mpps (+7.48%)

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c | 117 +---
 1 file changed, 111 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9b51989..fbaecf3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -28,6 +28,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -85,6 +87,13 @@ struct vhost_net_ubuf_ref {
struct vhost_virtqueue *vq;
 };
 
+#define VHOST_RX_BATCH 64
+struct vhost_net_buf {
+   struct sk_buff *queue[VHOST_RX_BATCH];
+   int tail;
+   int head;
+};
+
 struct vhost_net_virtqueue {
struct vhost_virtqueue vq;
size_t vhost_hlen;
@@ -99,6 +108,8 @@ struct vhost_net_virtqueue {
/* Reference counting for outstanding ubufs.
 * Protected by vq mutex. Writers must also take device mutex. */
struct vhost_net_ubuf_ref *ubufs;
+   struct skb_array *rx_array;
+   struct vhost_net_buf rxq;
 };
 
 struct vhost_net {
@@ -117,6 +128,71 @@ struct vhost_net {
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
 
+static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
+{
+   if (rxq->tail != rxq->head)
+   return rxq->queue[rxq->head];
+   else
+   return NULL;
+}
+
+static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
+{
+   return rxq->tail - rxq->head;
+}
+
+static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
+{
+   return rxq->tail == rxq->head;
+}
+
+static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
+{
+   void *ret = vhost_net_buf_get_ptr(rxq);
+   ++rxq->head;
+   return ret;
+}
+
+static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   rxq->head = 0;
+   rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
+ VHOST_RX_BATCH);
+   return rxq->tail;
+}
+
+static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
+   skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
+   vhost_net_buf_get_size(rxq));
+   rxq->head = rxq->tail = 0;
+   }
+}
+
+static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   if (!vhost_net_buf_is_empty(rxq))
+   goto out;
+
+   if (!vhost_net_buf_produce(nvq))
+   return 0;
+
+out:
+   return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
+}
+
+static void vhost_net_buf_init(struct vhost_net_buf *rxq)
+{
+   rxq->head = rxq->tail = 0;
+}
+
 static void vhost_net_enable_zcopy(int vq)
 {
vhost_net_zcopy_mask |= 0x1 << vq;
@@ -201,6 +277,7 @@ static void vhost_net_vq_reset(struct vhost_net *n)
n->vqs[i].ubufs = NULL;
n->vqs[i].vhost_hlen = 0;
n->vqs[i].sock_hlen = 0;
+   vhost_net_buf_init(&n->vqs[i].rxq);
}
 
 }
@@ -503,15 +580,14 @@ static void handle_tx(struct vhost_net *net)
mutex_unlock(&vq->mutex);
 }
 
-static int peek_head_len(struct sock *sk)
+static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 {
-   struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;
 
-   if (sock->ops->peek_len)
-   return sock->ops->peek_len(sock);
+   if (rvq->rx_array)
+   return vhost_net_buf_peek(rvq);
 
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
@@ -537,10 +613,11 @@ static int sk_has_rx_data(struct sock *sk)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
+   struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned long uninitialized_var(endtime);
-   int len = peek_head_len(sk);
+   int len = peek_head_len(rvq, sk);
 

Re: [PATCH net 2/2] xdp: disallow use of native and generic hook at once

2017-05-09 Thread Jakub Kicinski
On Wed, 10 May 2017 03:31:31 +0200, Daniel Borkmann wrote:
> While working on the iproute2 generic XDP frontend, I noticed that
> as of right now it's possible to have native *and* generic XDP
> programs loaded both at the same time for the case when a driver
> supports native XDP.

Nice improvement!  A couple of absolute nitpicks below..

> The intended model for generic XDP from b5cdae3291f7 ("net: Generic
> XDP") is, however, that only one out of the two can be present at
> once which is also indicated as such in the XPD netlink dump part.
  ^^^
 XDP

> @@ -6851,6 +6851,32 @@ int dev_change_proto_down(struct net_device *dev, bool 
> proto_down)
>  }
>  EXPORT_SYMBOL(dev_change_proto_down);
>  
> +bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op)

Out of curiosity - the leading underscores refer to caller having to
hold rtnl?  I assume they are not needed in the function below because
it's static?

> +{
> + struct netdev_xdp xdp;
> +
> + memset(&xdp, 0, sizeof(xdp));
> + xdp.command = XDP_QUERY_PROG;

Probably personal preference, but seems like designated struct
initializer would do quite nicely here and save the memset :)
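
E.g. a minimal sketch of that suggestion:

	struct netdev_xdp xdp = {
		.command = XDP_QUERY_PROG,
	};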

> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index dda9f16..99320f0 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -1251,24 +1251,20 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct 
> net_device *dev)
>  {
>   struct nlattr *xdp;
>   u32 xdp_flags = 0;
> - u8 val = 0;
>   int err;
> + u8 val;
>  
>   xdp = nla_nest_start(skb, IFLA_XDP);
>   if (!xdp)
>   return -EMSGSIZE;
> +
>   if (rcu_access_pointer(dev->xdp_prog)) {
>   xdp_flags = XDP_FLAGS_SKB_MODE;
>   val = 1;
> - } else if (dev->netdev_ops->ndo_xdp) {
> - struct netdev_xdp xdp_op = {};
> -
> - xdp_op.command = XDP_QUERY_PROG;
> - err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
> - if (err)
> - goto err_cancel;
> - val = xdp_op.prog_attached;
> + } else {
> + val = dev_xdp_attached(dev);
>   }

Would it make sense to set xdp_flags to XDP_FLAGS_DRV_MODE here to keep
things symmetrical?  I know you are just preserving existing behaviour
but it may seem slightly arbitrary to a new comer to report one of the
very similarly named flags in the dump but not the other.
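
Something like this, perhaps (a sketch; keeps the flag unreported when
nothing is attached):

	} else {
		val = dev_xdp_attached(dev);
		if (val)
			xdp_flags = XDP_FLAGS_DRV_MODE;
	}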


RE: [PATCH] net: fec: select queue depending on VLAN priority

2017-05-09 Thread Andy Duan
From: David Miller  Sent: Tuesday, May 09, 2017 9:39 PM
>To: ste...@agner.ch
>Cc: Andy Duan ; and...@lunn.ch;
>feste...@gmail.com; netdev@vger.kernel.org; linux-
>ker...@vger.kernel.org
>Subject: Re: [PATCH] net: fec: select queue depending on VLAN priority
>
>From: Stefan Agner 
>Date: Mon,  8 May 2017 22:37:08 -0700
>
>> Since the addition of the multi queue code with commit 59d0f7465644
>> ("net: fec: init multi queue date structure") the queue selection has
>> been handled by the default transmit queue selection implementation
>> which tries to evenly distribute the traffic across all available
>> queues. This selection presumes that the queues are using an equal
>> priority, however, the queues 1 and 2 are actually of higher priority
>> (the classification of the queues is enabled in fec_enet_enable_ring).
>>
>> This can lead to net scheduler warnings and continuous TX ring dumps
>> when exercising the system with iperf.
>>
>> Use only queue 0 for all common traffic (no VLAN and P802.1p priority
>> 0 and 1) and route level 2-7 through queue 1 and 2.
>>
>> Signed-off-by: Fugang Duan 
>> Fixes: 59d0f7465644 ("net: fec: init multi queue date structure")
>
>If the queues are used for prioritization, and it does not have multiple normal
>priority level queues, multiqueue is not what the driver should have
>implemented.
Firstly, the HW supports multiple queues:
- Traffic-shaping bandwidth distribution supports credit-based and 
round-robin-based policies. Either policy can be combined with time-based 
shaping.
- AVB (Audio Video Bridging, IEEE 802.1Qav) features:
* Credit-based bandwidth distribution policy can be combined 
with time-based shaping
* AVB endpoint talker and listener support
* Support for arbitration between different priority traffic 
(for example, AVB class A, AVB class B, and non-AVB)
Round-robin-based policies:
	The three queues have equal priority: in the round-robin QoS
scheme, each queue is given an equal opportunity to transmit one frame. For
example, if queue n has a frame to transmit, the queue transmits its frame.
After queue n has transmitted its frame, or if queue n does not have a frame
to transmit, queue n+1 is then allowed to transmit its frame, and so on.

Credit-based policies:
The AVB credit based shaper acts independently, per class, to control 
the bandwidth distribution between normal traffic and time-sensitive traffic 
with respect to the total link bandwidth available.
Credit-based shaper combined with time-based shaping:
- priority: ClassA queue > ClassB queue > best effort
- ensure each queue gets the bandwidth the user set, based on
time-based shaping algorithms (the transmitter sends frames from the three
queues in turn according to those algorithms)
In a real AVB case, each stream can be independent and is fixed to its
related queue, so the driver level should implement .ndo_select_queue() to
steer each stream into its queue. That is what the patch does (see the
sketch below).

The current driver configures the three queues with credit-based policies
(AVB), so the patch seems fine for that implementation. Do you have any
suggestions?
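
To illustrate the mapping (a sketch, not necessarily the exact patch; the
helper name and the 0-1 / 2-4 / 5-7 priority split are assumptions):

static u16 fec_enet_vlan_pri_to_queue(struct sk_buff *skb)
{
	u16 prio;

	if (!skb_vlan_tag_present(skb))
		return 0;		/* untagged -> best-effort queue 0 */

	prio = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
	if (prio <= 1)
		return 0;		/* best effort */
	return prio <= 4 ? 1 : 2;	/* the two AVB queues */
}

wired up via .ndo_select_queue so that each stream stays on a fixed queue.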

Andy


[PATCH net-next V3 2/9] skb_array: introduce skb_array_unconsume

2017-05-09 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 include/linux/skb_array.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index f4dfade..79850b6 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -156,6 +156,12 @@ static void __skb_array_destroy_skb(void *ptr)
kfree_skb(ptr);
 }
 
+static inline void skb_array_unconsume(struct skb_array *a,
+  struct sk_buff **skbs, int n)
+{
+   ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb);
+}
+
 static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 {
return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
-- 
2.7.4



[PATCH net-next V3 1/9] ptr_ring: add ptr_ring_unconsume

2017-05-09 Thread Jason Wang
From: "Michael S. Tsirkin" 

Applications that consume a batch of entries in one go
can benefit from the ability to return some of them back
into the ring.

Add an API for that - assuming there's space. If there's no space we
naturally can't do this and have to drop entries, but this implies the ring
is full so we'd likely drop some anyway.

Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Jason Wang 
---
 include/linux/ptr_ring.h | 55 
 1 file changed, 55 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6c70444..5476f68 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -359,6 +359,61 @@ static inline int ptr_ring_init(struct ptr_ring *r, int 
size, gfp_t gfp)
return 0;
 }
 
+/*
+ * Return entries into ring. Destroy entries that don't fit.
+ *
+ * Note: this is expected to be a rare slow path operation.
+ *
+ * Note: producer lock is nested within consumer lock, so if you
+ * resize you must make sure all uses nest correctly.
+ * In particular if you consume ring in interrupt or BH context, you must
+ * disable interrupts/BH when doing so.
+ */
+static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
+ void (*destroy)(void *))
+{
+   unsigned long flags;
+   int head;
+
+   spin_lock_irqsave(&r->consumer_lock, flags);
+   spin_lock(&r->producer_lock);
+
+   if (!r->size)
+   goto done;
+
+   /*
+* Clean out buffered entries (for simplicity). This way following code
+* can test entries for NULL and if not assume they are valid.
+*/
+   head = r->consumer_head - 1;
+   while (likely(head >= r->consumer_tail))
+   r->queue[head--] = NULL;
+   r->consumer_tail = r->consumer_head;
+
+   /*
+* Go over entries in batch, start moving head back and copy entries.
+* Stop when we run into previously unconsumed entries.
+*/
+   while (n) {
+   head = r->consumer_head - 1;
+   if (head < 0)
+   head = r->size - 1;
+   if (r->queue[head]) {
+   /* This batch entry will have to be destroyed. */
+   goto done;
+   }
+   r->queue[head] = batch[--n];
+   r->consumer_tail = r->consumer_head = head;
+   }
+
+done:
+   /* Destroy all entries left in the batch. */
+   while (n)
+   destroy(batch[--n]);
+   spin_unlock(&r->producer_lock);
+   spin_unlock_irqrestore(&r->consumer_lock, flags);
+}
+
 static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
   int size, gfp_t gfp,
   void (*destroy)(void *))
-- 
2.7.4



[PATCH net-next V3 0/9] vhost_net rx batch dequeuing

2017-05-09 Thread Jason Wang
This series tries to implement rx batching for vhost-net. This is done
by batching the dequeuing from the skb_array exported by the underlying
socket and passing the skb back through msg_control to finish the
userspace copying. This is also a requirement for more batching
implementation on the rx path.

Tests show at most an 8.8% improvement on rx pps on top of batch zeroing.

Please review.

Thanks

Changes from V2:
- rebase to net-next HEAD
- use unconsume helpers to put skb back on releasing
- introduce and use vhost_net internal buffer helpers
- renew performance numbers on top of batch zeroing

Changes from V1:
- switch to use for() in __ptr_ring_consume_batched()
- rename peek_head_len_batched() to fetch_skbs()
- use skb_array_consume_batched() instead of
  skb_array_consume_batched_bh() since no consumer runs in bh
- drop the lockless peeking patch since skb_array could be resized, so
  it's not safe to call lockless one

Jason Wang (8):
  skb_array: introduce skb_array_unconsume
  ptr_ring: introduce batch dequeuing
  skb_array: introduce batch dequeuing
  tun: export skb_array
  tap: export skb_array
  tun: support receiving skb through msg_control
  tap: support receiving skb from msg_control
  vhost_net: try batch dequing from skb array

Michael S. Tsirkin (1):
  ptr_ring: add ptr_ring_unconsume

 drivers/net/tap.c |  25 --
 drivers/net/tun.c |  31 
 drivers/vhost/net.c   | 117 +---
 include/linux/if_tap.h|   5 ++
 include/linux/if_tun.h|   5 ++
 include/linux/ptr_ring.h  | 120 ++
 include/linux/skb_array.h |  31 
 7 files changed, 316 insertions(+), 18 deletions(-)

-- 
2.7.4



[PATCH net-next V3 3/9] ptr_ring: introduce batch dequeuing

2017-05-09 Thread Jason Wang
This patch introduces a batched version of consuming: a consumer can
dequeue more than one pointer from the ring at a time. We don't care
about reordering of reads here, so no compiler barrier is needed.

Signed-off-by: Jason Wang 
---
 include/linux/ptr_ring.h | 65 
 1 file changed, 65 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 5476f68..9db39e6 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -247,6 +247,22 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
return ptr;
 }
 
+static inline int __ptr_ring_consume_batched(struct ptr_ring *r,
+void **array, int n)
+{
+   void *ptr;
+   int i;
+
+   for (i = 0; i < n; i++) {
+   ptr = __ptr_ring_consume(r);
+   if (!ptr)
+   break;
+   array[i] = ptr;
+   }
+
+   return i;
+}
+
 /*
  * Note: resize (below) nests producer lock within consumer lock, so if you
  * call this in interrupt or BH context, you must disable interrupts/BH when
@@ -297,6 +313,55 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
return ptr;
 }
 
+static inline int ptr_ring_consume_batched(struct ptr_ring *r,
+  void **array, int n)
+{
+   int ret;
+
+   spin_lock(&r->consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock(&r->consumer_lock);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r,
+  void **array, int n)
+{
+   int ret;
+
+   spin_lock_irq(&r->consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_irq(&r->consumer_lock);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_any(struct ptr_ring *r,
+  void **array, int n)
+{
+   unsigned long flags;
+   int ret;
+
+   spin_lock_irqsave(&r->consumer_lock, flags);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_irqrestore(&r->consumer_lock, flags);
+
+   return ret;
+}
+
+static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
+ void **array, int n)
+{
+   int ret;
+
+   spin_lock_bh(&r->consumer_lock);
+   ret = __ptr_ring_consume_batched(r, array, n);
+   spin_unlock_bh(&r->consumer_lock);
+
+   return ret;
+}
+
 /* Cast to structure type and call a function without discarding from FIFO.
  * Function must return a value.
  * Callers must take consumer_lock.
-- 
2.7.4



[PATCH net-next V3 4/9] skb_array: introduce batch dequeuing

2017-05-09 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 include/linux/skb_array.h | 25 +
 1 file changed, 25 insertions(+)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 79850b6..35226cd 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -97,21 +97,46 @@ static inline struct sk_buff *skb_array_consume(struct 
skb_array *a)
return ptr_ring_consume(&a->ring);
 }
 
+static inline int skb_array_consume_batched(struct skb_array *a,
+   struct sk_buff **array, int n)
+{
+   return ptr_ring_consume_batched(&a->ring, (void **)array, n);
+}
+
 static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a)
 {
return ptr_ring_consume_irq(&a->ring);
 }
 
+static inline int skb_array_consume_batched_irq(struct skb_array *a,
+   struct sk_buff **array, int n)
+{
+   return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n);
+}
+
 static inline struct sk_buff *skb_array_consume_any(struct skb_array *a)
 {
return ptr_ring_consume_any(&a->ring);
 }
 
+static inline int skb_array_consume_batched_any(struct skb_array *a,
+   struct sk_buff **array, int n)
+{
+   return ptr_ring_consume_batched_any(&a->ring, (void **)array, n);
+}
+
+
 static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a)
 {
return ptr_ring_consume_bh(&a->ring);
 }
 
+static inline int skb_array_consume_batched_bh(struct skb_array *a,
+  struct sk_buff **array, int n)
+{
+   return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n);
+}
+
 static inline int __skb_array_len_with_tag(struct sk_buff *skb)
 {
if (likely(skb)) {
-- 
2.7.4



[PATCH net-next V3 6/9] tap: export skb_array

2017-05-09 Thread Jason Wang
This patch exports skb_array through tap_get_skb_array(). Caller can
then manipulate skb array directly.

Signed-off-by: Jason Wang 
---
 drivers/net/tap.c  | 13 +
 include/linux/if_tap.h |  5 +
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 4d4173d..abdaf86 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1193,6 +1193,19 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
+struct skb_array *tap_get_skb_array(struct file *file)
+{
+   struct tap_queue *q;
+
+   if (file->f_op != &tap_fops)
+   return ERR_PTR(-EINVAL);
+   q = file->private_data;
+   if (!q)
+   return ERR_PTR(-EBADFD);
+   return &q->skb_array;
+}
+EXPORT_SYMBOL_GPL(tap_get_skb_array);
+
 int tap_queue_resize(struct tap_dev *tap)
 {
struct net_device *dev = tap->dev;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 3482c3c..4837157 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -3,6 +3,7 @@
 
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
+struct skb_array *tap_get_skb_array(struct file *file);
 #else
#include <linux/err.h>
#include <linux/errno.h>
@@ -12,6 +13,10 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
return ERR_PTR(-EINVAL);
 }
+static inline struct skb_array *tap_get_skb_array(struct file *f)
+{
+   return ERR_PTR(-EINVAL);
+}
 #endif /* CONFIG_TAP */
 
 #include 
-- 
2.7.4



[PATCH net-next V3 5/9] tun: export skb_array

2017-05-09 Thread Jason Wang
This patch exports skb_array through tun_get_skb_array(). Caller can
then manipulate skb array directly.

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c  | 13 +
 include/linux/if_tun.h |  5 +
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bbd707b..3cbfc5c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2626,6 +2626,19 @@ struct socket *tun_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
+struct skb_array *tun_get_skb_array(struct file *file)
+{
+   struct tun_file *tfile;
+
+   if (file->f_op != &tun_fops)
+   return ERR_PTR(-EINVAL);
+   tfile = file->private_data;
+   if (!tfile)
+   return ERR_PTR(-EBADFD);
+   return &tfile->tx_array;
+}
+EXPORT_SYMBOL_GPL(tun_get_skb_array);
+
 module_init(tun_init);
 module_exit(tun_cleanup);
 MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index ed6da2e..bf9bdf4 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,7 @@
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
+struct skb_array *tun_get_skb_array(struct file *file);
 #else
 #include 
 #include 
@@ -28,5 +29,9 @@ static inline struct socket *tun_get_socket(struct file *f)
 {
return ERR_PTR(-EINVAL);
 }
+static inline struct skb_array *tun_get_skb_array(struct file *f)
+{
+   return ERR_PTR(-EINVAL);
+}
 #endif /* CONFIG_TUN */
 #endif /* __IF_TUN_H */
-- 
2.7.4



[PATCH net-next V3 7/9] tun: support receiving skb through msg_control

2017-05-09 Thread Jason Wang
This patch makes tun_recvmsg() able to receive an skb from its caller
through msg_control. Vhost_net will be the first user.

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3cbfc5c..f8041f9c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1510,9 +1510,8 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
   struct iov_iter *to,
-  int noblock)
+  int noblock, struct sk_buff *skb)
 {
-   struct sk_buff *skb;
ssize_t ret;
int err;
 
@@ -1521,10 +1520,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
if (!iov_iter_count(to))
return 0;
 
-   /* Read frames from ring */
-   skb = tun_ring_recv(tfile, noblock, &err);
-   if (!skb)
-   return err;
+   if (!skb) {
+   /* Read frames from ring */
+   skb = tun_ring_recv(tfile, noblock, &err);
+   if (!skb)
+   return err;
+   }
 
ret = tun_put_user(tun, tfile, skb, to);
if (unlikely(ret < 0))
@@ -1544,7 +1545,7 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
if (!tun)
return -EBADFD;
-   ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK);
+   ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
ret = min_t(ssize_t, ret, len);
if (ret > 0)
iocb->ki_pos = ret;
@@ -1646,7 +1647,8 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
 SOL_PACKET, TUN_TX_TIMESTAMP);
goto out;
}
-   ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT);
+   ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT,
+ m->msg_control);
if (ret > (ssize_t)total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
-- 
2.7.4



[PATCH net-next V3 9/9] vhost_net: try batch dequing from skb array

2017-05-09 Thread Jason Wang
We used to dequeue one skb at a time from the skb_array during
recvmsg(); this could be inefficient because of poor cache utilization
and the spinlock being touched for each packet. This patch batches them
by calling the batch dequeuing helpers explicitly on the exported skb
array, and passes each skb back through msg_control for the underlying
socket to finish the userspace copying.

Batch dequeuing is also a prerequisite for further batching improvements
on rx.
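
Concretely, the receive path dequeues a batch of skbs up front and hands
each one to the underlying socket through msg_control, along these lines
(a sketch based on the description above, not the literal patch; nvq,
sock and sock_len are assumed local variables):

	struct msghdr msg = { .msg_control = vhost_net_buf_consume(&nvq->rxq) };

	err = sock->ops->recvmsg(sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC);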

Tests were done by pktgen on tap with XDP1 in guest on top of batch
zeroing:

rx batch | pps

256 2.41Mpps (+6.16%)
128 2.48Mpps (+8.80%)
64 2.38Mpps (+3.96%) <- Default
16 2.31Mpps (+1.76%)
4  2.31Mpps (+1.76%)
1  2.30Mpps (+1.32%)
0  2.27Mpps (+7.48%)

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c | 117 +---
 1 file changed, 111 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9b51989..fbaecf3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -28,6 +28,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -85,6 +87,13 @@ struct vhost_net_ubuf_ref {
struct vhost_virtqueue *vq;
 };
 
+#define VHOST_RX_BATCH 64
+struct vhost_net_buf {
+   struct sk_buff *queue[VHOST_RX_BATCH];
+   int tail;
+   int head;
+};
+
 struct vhost_net_virtqueue {
struct vhost_virtqueue vq;
size_t vhost_hlen;
@@ -99,6 +108,8 @@ struct vhost_net_virtqueue {
/* Reference counting for outstanding ubufs.
 * Protected by vq mutex. Writers must also take device mutex. */
struct vhost_net_ubuf_ref *ubufs;
+   struct skb_array *rx_array;
+   struct vhost_net_buf rxq;
 };
 
 struct vhost_net {
@@ -117,6 +128,71 @@ struct vhost_net {
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
 
+static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
+{
+   if (rxq->tail != rxq->head)
+   return rxq->queue[rxq->head];
+   else
+   return NULL;
+}
+
+static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
+{
+   return rxq->tail - rxq->head;
+}
+
+static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
+{
+   return rxq->tail == rxq->head;
+}
+
+static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
+{
+   void *ret = vhost_net_buf_get_ptr(rxq);
+   ++rxq->head;
+   return ret;
+}
+
+static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   rxq->head = 0;
+   rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
+ VHOST_RX_BATCH);
+   return rxq->tail;
+}
+
+static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
+   skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
+   vhost_net_buf_get_size(rxq));
+   rxq->head = rxq->tail = 0;
+   }
+}
+
+static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   if (!vhost_net_buf_is_empty(rxq))
+   goto out;
+
+   if (!vhost_net_buf_produce(nvq))
+   return 0;
+
+out:
+   return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
+}
+
+static void vhost_net_buf_init(struct vhost_net_buf *rxq)
+{
+   rxq->head = rxq->tail = 0;
+}
+
 static void vhost_net_enable_zcopy(int vq)
 {
vhost_net_zcopy_mask |= 0x1 << vq;
@@ -201,6 +277,7 @@ static void vhost_net_vq_reset(struct vhost_net *n)
n->vqs[i].ubufs = NULL;
n->vqs[i].vhost_hlen = 0;
n->vqs[i].sock_hlen = 0;
+   vhost_net_buf_init(&n->vqs[i].rxq);
}
 
 }
@@ -503,15 +580,14 @@ static void handle_tx(struct vhost_net *net)
mutex_unlock(&vq->mutex);
 }
 
-static int peek_head_len(struct sock *sk)
+static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 {
-   struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;
 
-   if (sock->ops->peek_len)
-   return sock->ops->peek_len(sock);
+   if (rvq->rx_array)
+   return vhost_net_buf_peek(rvq);
 
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(>sk_receive_queue);
@@ -537,10 +613,11 @@ static int sk_has_rx_data(struct sock *sk)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
+   struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned long uninitialized_var(endtime);
-   int len = peek_head_len(sk);
+   int len = peek_head_len(rvq, sk);
 

[PATCH net-next V3 8/9] tap: support receiving skb from msg_control

2017-05-09 Thread Jason Wang
This patch makes tap_recvmsg() able to receive an skb from its caller
through msg_control. Vhost_net will be the first user.

Signed-off-by: Jason Wang 
---
 drivers/net/tap.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index abdaf86..9af3239 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -824,15 +824,17 @@ static ssize_t tap_put_user(struct tap_queue *q,
 
 static ssize_t tap_do_read(struct tap_queue *q,
   struct iov_iter *to,
-  int noblock)
+  int noblock, struct sk_buff *skb)
 {
DEFINE_WAIT(wait);
-   struct sk_buff *skb;
ssize_t ret = 0;
 
if (!iov_iter_count(to))
return 0;
 
+   if (skb)
+   goto put;
+
while (1) {
if (!noblock)
prepare_to_wait(sk_sleep(&q->sk), &wait,
@@ -856,6 +858,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
if (!noblock)
finish_wait(sk_sleep(&q->sk), &wait);
 
+put:
if (skb) {
ret = tap_put_user(q, skb, to);
if (unlikely(ret < 0))
@@ -872,7 +875,7 @@ static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct tap_queue *q = file->private_data;
ssize_t len = iov_iter_count(to), ret;
 
-   ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK);
+   ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL);
ret = min_t(ssize_t, ret, len);
if (ret > 0)
iocb->ki_pos = ret;
@@ -1155,7 +1158,8 @@ static int tap_recvmsg(struct socket *sock, struct msghdr *m,
int ret;
if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
return -EINVAL;
-   ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
+   ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT,
+ m->msg_control);
if (ret > total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
-- 
2.7.4



Re: DQL and TCQ_F_CAN_BYPASS destroy performance under virtualizaiton (Was: "Re: net_sched strange in 4.11")

2017-05-09 Thread Jason Wang



On 2017-05-09 23:11, Stefan Hajnoczi wrote:

On Tue, May 09, 2017 at 08:46:46AM +0100, Anton Ivanov wrote:

I have figured it out. Two issues.

1) skb->xmit_more is hardly ever set under virtualization because the qdisc
is usually bypassed because of TCQ_F_CAN_BYPASS. Once TCQ_F_CAN_BYPASS is
set, a virtual NIC driver is not likely to see skb->xmit_more (this answers my
"how does this work at all" question).

2) If that flag is turned off (I patched sched_generic to turn it off in
pfifo_fast while testing), DQL keeps xmit_more from being set. If the driver
is not DQL enabled xmit_more is never ever set. If the driver is DQL enabled
the queue is adjusted to ensure xmit_more stops happening within 10-15 xmit
cycles.

That is plain *wrong* for virtual NICs - virtio, emulated NICs, etc. There,
the BIG cost is telling the hypervisor that it needs to "kick" the packets.
The cost of putting them into the vNIC buffers is negligible. You want
xmit_more to happen - it makes between 50% and 300% (depending on vNIC
design) difference. If there is no xmit_more the vNIC will immediately
"kick" the hypervisor and try to signal that  the packet needs to move
straight away (as for example in virtio_net).
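
For reference, the pattern a vNIC driver relies on looks roughly like this
(sketch; vnic_enqueue() and vnic_kick() are hypothetical stand-ins for the
driver's descriptor-fill and hypervisor-notify paths):

	vnic_enqueue(q, skb);
	if (!skb->xmit_more || netif_xmit_stopped(txq))
		vnic_kick(q);	/* the expensive hypervisor exit */

With xmit_more set on all but the last skb of a burst, the kick happens
once per burst instead of once per packet.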


How do you measure the performance? TCP or just measure pps?



In addition to that, the perceived line rate is proportional to this cost,
so I am not sure that the current dql math holds. In fact, I think it does
not - it is trying to adjust something which influences the perceived line
rate.

So - how do we turn off BOTH bypass and DQL adjustment while under
virtualization and set them to be "always qdisc" + "always xmit_more
allowed"?


Virtio-net does not support BQL. Before commit ea7735d97ba9 
("virtio-net: move free_old_xmit_skbs"), it was even impossible to support 
that, since we didn't have a tx interrupt for each packet. I haven't 
measured the impact of xmit_more; maybe I was wrong, but I think it may 
help in some cases since it may improve the batching on the host more or less.


Thanks



A.

P.S. Cc-ing virtio maintainer

CCing Michael Tsirkin and Jason Wang, who are the core virtio and
virtio-net maintainers.  (I maintain the vsock driver - it's unrelated
to this discussion.)


A.


On 08/05/17 08:15, Anton Ivanov wrote:

Hi all,

I was revising some of my old work for UML to prepare it for submission
and I noticed that skb->xmit_more does not seem to be set any more.

I traced the issue as far as net/sched/sch_generic.c

try_bulk_dequeue_skb() is never invoked (the drivers I am working on are
dql enabled so that is not the problem).

More interestingly, if I put a breakpoint and debug output into
dequeue_skb() around line 147 - right before the bulk: tag - the skb
there is always NULL. ???

Similarly, debug in pfifo_fast_dequeue shows only NULLs being dequeued.
Again - ???

First and foremost, I apologize for the silly question, but how can this
work at all? I see the skbs showing up at the driver level, why are
NULLs being returned at qdisc dequeue and where do the skbs at the
driver level come from?

Second, where should I look to fix it?

A.



--
Anton R. Ivanov

Cambridge Greys Limited, England company No 10273661
http://www.cambridgegreys.com/





Re: [PATCH v2 net-next] bnxt: add dma mapping attributes

2017-05-09 Thread Michael Chan
On Tue, May 9, 2017 at 6:30 PM, Shannon Nelson
 wrote:
> On the SPARC platform we need to use the DMA_ATTR_WEAK_ORDERING attribute
> in our Rx path dma mapping in order to get the expected performance out
> of the receive path.  Adding it to the Tx path has little effect, so
> that's not a part of this patch.
>
> Signed-off-by: Shannon Nelson 
> Reviewed-by: Tushar Dave 
> Reviewed-by: Tom Saeger 
> ---
> v2: simplify by getting rid of the driver-specific ifdef and define

Acked-by: Michael Chan 


Re: [PATCH RFC v2] ptr_ring: add ptr_ring_unconsume

2017-05-09 Thread Jason Wang



On 2017-05-09 21:26, Michael S. Tsirkin wrote:

On Wed, Apr 26, 2017 at 05:09:42PM +0800, Jason Wang wrote:


On 2017-04-25 00:01, Michael S. Tsirkin wrote:

Applications that consume a batch of entries in one go
can benefit from ability to return some of them back
into the ring.

Add an API for that - assuming there's space. If there's no space
naturally can't do this and have to drop entries, but this implies ring
is full so we'd likely drop some anyway.

Signed-off-by: Michael S. Tsirkin 
---

Jason, if you add this and unconsume the outstanding packets
on backend disconnect, vhost close and reset, I think
we should apply your patch even if we don't yet know 100%
why it helps.

changes from v1:
- fix up coding style issues reported by Sergei Shtylyov


   include/linux/ptr_ring.h | 56 

   1 file changed, 56 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 783e7f5..902afc2 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -457,6 +457,62 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
return 0;
   }
+/*
+ * Return entries into ring. Destroy entries that don't fit.
+ *
+ * Note: this is expected to be a rare slow path operation.
+ *
+ * Note: producer lock is nested within consumer lock, so if you
+ * resize you must make sure all uses nest correctly.
+ * In particular if you consume ring in interrupt or BH context, you must
+ * disable interrupts/BH when doing so.
+ */
+static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
+ void (*destroy)(void *))
+{
+   unsigned long flags;
+   int head;
+
+   spin_lock_irqsave(&r->consumer_lock, flags);
+   spin_lock(&r->producer_lock);
+
+   if (!r->size)
+   goto done;
+
+   /*
+* Clean out buffered entries (for simplicity). This way following code
+* can test entries for NULL and if not assume they are valid.
+*/
+   head = r->consumer_head - 1;
+   while (likely(head >= r->consumer_tail))
+   r->queue[head--] = NULL;
+   r->consumer_tail = r->consumer_head;
+
+   /*
+* Go over entries in batch, start moving head back and copy entries.
+* Stop when we run into previously unconsumed entries.
+*/
+   while (n--) {
+   head = r->consumer_head - 1;
+   if (head < 0)
+   head = r->size - 1;
+   if (r->queue[head]) {
+   /* This batch entry will have to be destroyed. */
+   ++n;
+   goto done;
+   }
+   r->queue[head] = batch[n];
+   r->consumer_tail = r->consumer_head = head;

Looks like something is wrong here (bad page state reported); commenting out
the above while() loop solves the issue. But after staring at it for a while I
didn't find anything interesting, maybe you have some idea on this?

Thanks



+   }
+
+done:
+   /* Destroy all entries left in the batch. */
+   while (n--)
+   destroy(batch[n]);
+   spin_unlock(&r->producer_lock);
+   spin_unlock_irqrestore(&r->consumer_lock, flags);
+}
+
   static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
   int size, gfp_t gfp,
   void (*destroy)(void *))

What's our plan here? I can't delay the pull request much longer.



I'm waiting for net-next to be opened (since the series touches tun/tap).

Let me post a new version soon.

Thanks
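
For context, the intended caller-side use of ptr_ring_unconsume() is to
return the unprocessed tail of a previously consumed batch, roughly
(sketch; free_entry() is a hypothetical destructor for entries that no
longer fit):

	/* consumed n entries, managed to process only 'done' of them */
	ptr_ring_unconsume(r, batch + done, n - done, free_entry);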



[PATCH v2 net-next] bnxt: add dma mapping attributes

2017-05-09 Thread Shannon Nelson
On the SPARC platform we need to use the DMA_ATTR_WEAK_ORDERING attribute
in our Rx path dma mapping in order to get the expected performance out
of the receive path.  Adding it to the Tx path has little effect, so
that's not a part of this patch.

Signed-off-by: Shannon Nelson 
Reviewed-by: Tushar Dave 
Reviewed-by: Tom Saeger 
---
v2: simplify by getting rid of the driver-specific ifdef and define

 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   56 +---
 1 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f1e54b..687f852 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -582,7 +582,8 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
if (!page)
return NULL;
 
-   *mapping = dma_map_page(dev, page, 0, PAGE_SIZE, bp->rx_dir);
+   *mapping = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, bp->rx_dir,
+ DMA_ATTR_WEAK_ORDERING);
if (dma_mapping_error(dev, *mapping)) {
__free_page(page);
return NULL;
@@ -601,8 +602,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
if (!data)
return NULL;
 
-   *mapping = dma_map_single(&pdev->dev, data + bp->rx_dma_offset,
- bp->rx_buf_use_size, bp->rx_dir);
+   *mapping = dma_map_single_attrs(&pdev->dev, data + bp->rx_dma_offset,
+   bp->rx_buf_use_size, bp->rx_dir,
+   DMA_ATTR_WEAK_ORDERING);
 
if (dma_mapping_error(&pdev->dev, *mapping)) {
kfree(data);
@@ -705,8 +707,9 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp,
return -ENOMEM;
}
 
-   mapping = dma_map_page(&pdev->dev, page, offset, BNXT_RX_PAGE_SIZE,
-  PCI_DMA_FROMDEVICE);
+   mapping = dma_map_page_attrs(&pdev->dev, page, offset,
+BNXT_RX_PAGE_SIZE, PCI_DMA_FROMDEVICE,
+DMA_ATTR_WEAK_ORDERING);
if (dma_mapping_error(&pdev->dev, mapping)) {
__free_page(page);
return -EIO;
@@ -799,7 +802,8 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, u16 cp_cons,
return NULL;
}
dma_addr -= bp->rx_dma_offset;
-   dma_unmap_page(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir);
+   dma_unmap_page_attrs(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir,
+DMA_ATTR_WEAK_ORDERING);
 
if (unlikely(!payload))
payload = eth_get_headlen(data_ptr, len);
@@ -841,8 +845,8 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, u16 cp_cons,
}
 
skb = build_skb(data, 0);
-   dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
-bp->rx_dir);
+   dma_unmap_single_attrs(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
+  bp->rx_dir, DMA_ATTR_WEAK_ORDERING);
if (!skb) {
kfree(data);
return NULL;
@@ -909,8 +913,9 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, u16 cp_cons,
return NULL;
}
 
-   dma_unmap_page(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE,
-  PCI_DMA_FROMDEVICE);
+   dma_unmap_page_attrs(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE,
+PCI_DMA_FROMDEVICE,
+DMA_ATTR_WEAK_ORDERING);
 
skb->data_len += frag_len;
skb->len += frag_len;
@@ -1329,8 +1334,9 @@ static void bnxt_abort_tpa(struct bnxt *bp, struct bnxt_napi *bnapi,
tpa_info->mapping = new_mapping;
 
skb = build_skb(data, 0);
-   dma_unmap_single(&bp->pdev->dev, mapping, bp->rx_buf_use_size,
-bp->rx_dir);
+   dma_unmap_single_attrs(&bp->pdev->dev, mapping,
+  bp->rx_buf_use_size, bp->rx_dir,
+  DMA_ATTR_WEAK_ORDERING);
 
if (!skb) {
kfree(data);
@@ -1971,9 +1977,11 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
if (!data)
continue;
 
-   dma_unmap_single(&pdev->dev, tpa_info->mapping,
-bp->rx_buf_use_size,
-bp->rx_dir);
+   dma_unmap_single_attrs(&pdev->dev,
+  tpa_info->mapping,
+  

[PATCH net 0/2] Two generic xdp related follow-ups

2017-05-09 Thread Daniel Borkmann
Two follow-ups for the generic XDP API, would be great if
both could still be considered, since the XDP API is not
frozen yet. For details please see individual patches.

Thanks!

Daniel Borkmann (2):
  xdp: add flag to enforce driver mode
  xdp: disallow use of native and generic hook at once

 include/linux/netdevice.h  | 15 --
 include/uapi/linux/if_link.h   |  6 ++--
 net/core/dev.c | 57 +-
 net/core/rtnetlink.c   | 19 +++--
 samples/bpf/xdp1_user.c|  8 --
 samples/bpf/xdp_tx_iptunnel_user.c |  7 -
 6 files changed, 77 insertions(+), 35 deletions(-)

-- 
1.9.3



[PATCH net 1/2] xdp: add flag to enforce driver mode

2017-05-09 Thread Daniel Borkmann
After commit b5cdae3291f7 ("net: Generic XDP") we automatically fall
back to a generic XDP variant if the driver does not support native
XDP. Allow for an option where the user can specify that always the
native XDP variant should be selected and in case it's not supported
by a driver, just bail out.
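
With the xdp1 sample below, the new flag can be exercised like this
(illustrative invocations; ifindex 2 assumed):

	# ./xdp1 -N 2	# bails out (EOPNOTSUPP) if the driver lacks native XDP
	# ./xdp1 -S 2	# forces the generic/skb-mode variant
	# ./xdp1 2	# native when available, generic fallback otherwise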

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 include/uapi/linux/if_link.h   | 6 --
 net/core/dev.c | 2 ++
 net/core/rtnetlink.c   | 5 +
 samples/bpf/xdp1_user.c| 8 ++--
 samples/bpf/xdp_tx_iptunnel_user.c | 7 ++-
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8e56ac7..549ac8a 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -888,9 +888,11 @@ enum {
 /* XDP section */
 
 #define XDP_FLAGS_UPDATE_IF_NOEXIST(1U << 0)
-#define XDP_FLAGS_SKB_MODE (2U << 0)
+#define XDP_FLAGS_SKB_MODE (1U << 1)
+#define XDP_FLAGS_DRV_MODE (1U << 2)
 #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \
-XDP_FLAGS_SKB_MODE)
+XDP_FLAGS_SKB_MODE | \
+XDP_FLAGS_DRV_MODE)
 
 enum {
IFLA_XDP_UNSPEC,
diff --git a/net/core/dev.c b/net/core/dev.c
index d07aa5f..61443f0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6872,6 +6872,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
ASSERT_RTNL();
 
xdp_op = ops->ndo_xdp;
+   if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE))
+   return -EOPNOTSUPP;
if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
xdp_op = generic_xdp_install;
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bcb0f610..dda9f16 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2199,6 +2199,11 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+   if ((xdp_flags & XDP_FLAGS_SKB_MODE) &&
+   (xdp_flags & XDP_FLAGS_DRV_MODE)) {
+   err = -EINVAL;
+   goto errout;
+   }
}
 
if (xdp[IFLA_XDP_FD]) {
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index 378850c..17be9ea 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -62,13 +62,14 @@ static void usage(const char *prog)
fprintf(stderr,
"usage: %s [OPTS] IFINDEX\n\n"
"OPTS:\n"
-   "-Suse skb-mode\n",
+   "-Suse skb-mode\n"
+   "-Nenforce native mode\n",
prog);
 }
 
 int main(int argc, char **argv)
 {
-   const char *optstr = "S";
+   const char *optstr = "SN";
char filename[256];
int opt;
 
@@ -77,6 +78,9 @@ int main(int argc, char **argv)
case 'S':
xdp_flags |= XDP_FLAGS_SKB_MODE;
break;
+   case 'N':
+   xdp_flags |= XDP_FLAGS_DRV_MODE;
+   break;
default:
usage(basename(argv[0]));
return 1;
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index 92b8bde..631cdcc 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -79,6 +79,8 @@ static void usage(const char *cmd)
printf("-m  Used in sending the IP Tunneled pkt\n");
printf("-T  Default: 0 (forever)\n");
printf("-P  Default is TCP\n");
+   printf("-S use skb-mode\n");
+   printf("-N enforce native mode\n");
printf("-h Display this help\n");
 }
 
@@ -138,7 +140,7 @@ int main(int argc, char **argv)
 {
unsigned char opt_flags[256] = {};
unsigned int kill_after_s = 0;
-   const char *optstr = "i:a:p:s:d:m:T:P:Sh";
+   const char *optstr = "i:a:p:s:d:m:T:P:SNh";
int min_port = 0, max_port = 0;
struct iptnl_info tnl = {};
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
@@ -206,6 +208,9 @@ int main(int argc, char **argv)
case 'S':
xdp_flags |= XDP_FLAGS_SKB_MODE;
break;
+   case 'N':
+   xdp_flags |= XDP_FLAGS_DRV_MODE;
+   break;
default:
usage(argv[0]);
return 1;
-- 
1.9.3



[PATCH net 2/2] xdp: disallow use of native and generic hook at once

2017-05-09 Thread Daniel Borkmann
While working on the iproute2 generic XDP frontend, I noticed that
as of right now it's possible to have native *and* generic XDP
programs loaded both at the same time for the case when a driver
supports native XDP.

The intended model for generic XDP from b5cdae3291f7 ("net: Generic
XDP") is, however, that only one out of the two can be present at
once, which is also indicated as such in the XDP netlink dump part.
The main rationale for generic XDP is to ease accessibility (in
case a driver does not yet have XDP support) and to generically
provide a semantical model as an example for driver developers
wanting to add XDP support. The generic XDP option for an XDP
aware driver can still be useful for comparing and testing both
implementations.

However, it is not intended to have a second XDP processing stage
or layer with exactly the same functionality of the first native
stage. Only reason would be to have a partial fallback for future
XDP features that are not supported yet in the native implementation
and we probably also shouldn't strive for such fallback and instead
encourage native feature support in the first place. Given there's
currently no such fallback issue or use case, lets not go there
yet if we don't need to.

Therefore, change semantics for loading XDP and bail out if the
user tries to load a generic XDP program when a native one is
present and vice versa. Another alternative to bailing out would
be to handle the transition from one flavor to another gracefully,
but that would require to bring the device down, exchange both
types of programs, and bring it up again in order to avoid a tiny
window where a packet could hit both hooks. Given this complicates
the logic a bit more for just a debugging feature, I went with the
simpler variant.

Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 include/linux/netdevice.h | 15 +++--
 net/core/dev.c| 55 +++
 net/core/rtnetlink.c  | 14 +---
 3 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9c23bd2..abedec7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3296,11 +3296,22 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
- int fd, u32 flags);
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
    struct netdev_queue *txq, int *ret);
+
+typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp);
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, u32 flags);
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op);
+
+static inline bool dev_xdp_attached(struct net_device *dev)
+{
+   const struct net_device_ops *ops = dev->netdev_ops;
+
+   return ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp);
+}
+
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 61443f0..d25f847 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6851,6 +6851,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op)
+{
+   struct netdev_xdp xdp;
+
+   memset(&xdp, 0, sizeof(xdp));
+   xdp.command = XDP_QUERY_PROG;
+
+   /* Query must always succeed. */
+   WARN_ON(xdp_op(dev, &xdp) < 0);
+   return xdp.prog_attached;
+}
+
+static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op,
+  struct netlink_ext_ack *extack,
+  struct bpf_prog *prog)
+{
+   struct netdev_xdp xdp;
+
+   memset(&xdp, 0, sizeof(xdp));
+   xdp.command = XDP_SETUP_PROG;
+   xdp.extack = extack;
+   xdp.prog = prog;
+
+   return xdp_op(dev, &xdp);
+}
+
 /**
  * dev_change_xdp_fd - set or clear a bpf program for a device rx path
  * @dev: device
@@ -6863,43 +6889,34 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
  int fd, u32 flags)
 {
-   int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
-   struct netdev_xdp xdp;
+   xdp_op_t 

Re: [PATCH v1] ACPI: Switch to use generic UUID API

2017-05-09 Thread Zhang Rui
On Thu, 2017-05-04 at 12:21 +0300, Andy Shevchenko wrote:
> acpi_evaluate_dsm() and friends take a pointer to a raw buffer of 16
> bytes. Instead we convert them to use uuid_le type. At the same time we
> convert current users.
> 
> acpi_str_to_uuid() becomes useless after the conversion and it's safe to
> get rid of it.
> 
> The conversion fixes a potential bug in int340x_thermal as well since
> we have to use memcmp() on binary data.
> 
> Cc: Rafael J. Wysocki 
> Cc: Mika Westerberg 
> Cc: Borislav Petkov 
> Cc: Dan Williams 
> Cc: Amir Goldstein 
> Cc: Jarkko Sakkinen 
> Cc: Jani Nikula 
> Cc: Ben Skeggs 
> Cc: Benjamin Tissoires 
> Cc: Joerg Roedel 
> Cc: Adrian Hunter 
> Cc: Yisen Zhuang 
> Cc: Bjorn Helgaas 
> Cc: Zhang Rui 
> Cc: Felipe Balbi 
> Cc: Mathias Nyman 
> Cc: Heikki Krogerus 
> Cc: Liam Girdwood 
> Cc: Mark Brown 
> Signed-off-by: Andy Shevchenko 
> ---
> 
> diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c
> index 9413c4abf0b9..c0eb3bb19b23 100644
> --- a/drivers/thermal/int340x_thermal/int3400_thermal.c
> +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c
> @@ -23,7 +23,7 @@ enum int3400_thermal_uuid {
>   INT3400_THERMAL_MAXIMUM_UUID,
>  };
>  
> -static u8 *int3400_thermal_uuids[INT3400_THERMAL_MAXIMUM_UUID] = {
> +static const char *int3400_thermal_uuids[INT3400_THERMAL_MAXIMUM_UUID] = {
>   "42A441D6-AE6A-462b-A84B-4A8CE79027D3",
>   "3A95C389-E4B8-4629-A526-C52C88626BAE",
>   "97C68AE7-15FA-499c-B8C9-5DA81D606E0A",
> @@ -141,10 +141,10 @@ static int int3400_thermal_get_uuids(struct int3400_thermal_priv *priv)
>   }
>  
>   for (j = 0; j < INT3400_THERMAL_MAXIMUM_UUID; j++) {
> - u8 uuid[16];
> + uuid_le u;
>  
> - acpi_str_to_uuid(int3400_thermal_uuids[j], uuid);
> - if (!strncmp(uuid, objb->buffer.pointer, 16)) {
> + uuid_le_to_bin(int3400_thermal_uuids[j], &u);
> + if (!uuid_le_cmp(*(uuid_le *)objb->buffer.pointer, u)) {
>   priv->uuid_bitmap |= (1 << j);
>   break;
>   }
thanks for the fix.

Acked-by: Zhang Rui 

-rui


Re: [PATCH net-next] bnxt: add dma mapping attributes

2017-05-09 Thread Shannon Nelson

On 5/9/2017 5:49 PM, David Miller wrote:

From: Shannon Nelson 
Date: Tue,  9 May 2017 13:37:33 -0700


@@ -66,6 +66,12 @@
 MODULE_DESCRIPTION("Broadcom BCM573xx network driver");
 MODULE_VERSION(DRV_MODULE_VERSION);

+#ifdef CONFIG_SPARC
+#define BNXT_DMA_ATTRS  DMA_ATTR_WEAK_ORDERING
+#else
+#define BNXT_DMA_ATTRS 0
+#endif
+


I do not want this ifdef crap showing up in every driver just
to improve performance on one platform.

This needs to be generically done somehow in a way that
drivers get the correct setting automatically and without
ifdef checks.



Yes, these do get ugly, and as Michael pointed out this really is an 
advisory bit and should just be ignored by platforms that don't care. 
I'll just respin this using the DMA_ATTR_WEAK_ORDERING bit directly 
rather than obscuring it with a BNXT define.


sln


Re:Re: [PATCH net v2] driver: vrf: Fix one possible use-after-free issue

2017-05-09 Thread Gao Feng

At 2017-05-10 02:37:36, "David Miller"  wrote:
>From: gfree.w...@vip.163.com
>Date: Tue,  9 May 2017 18:27:33 +0800
>
>> @@ -989,6 +989,7 @@ static u32 vrf_fib_table(const struct net_device *dev)
>>  
>>  static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
>>  {
>> +kfree_skb(skb);
>>  return 0;
>>  }
>>  
>> @@ -998,7 +999,7 @@ static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
>>  {
>>  struct net *net = dev_net(dev);
>>  
>> -if (NF_HOOK(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) < 0)
>> +if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
>>  skb = NULL;/* kfree_skb(skb) handled by nf code */
>>  
>>  return skb;
>
>Indeed, this fixes the immediate problem with NF_STOLEN.
>
>Making NF_STOLEN fully functional is another story, we'd need to stack
>this all together properly:

Yes, this fix just makes sure the vrf doesn't panic because of a
use-after-free skb.
It doesn't work like a real NF_QUEUE when reinjecting the skb.

>
>static int __ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
>{
> ...
>}
>
>static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
>{
>   return l3mdev_ip_rcv(skb, __ip_rcv_finish);
>}
>...
>static inline
>struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb,
> int (*okfn)(struct net *, struct sock *, struct sk_buff *))
>{
>   return l3mdev_l3_rcv(skb, okfn, AF_INET);
>}
>
>etc. but that's going to really add a kink to the receive path,
>microbenchmark wise.

It is a solution to make NF_STOLEN fully functional, but I don't have an
environment to test the benchmark.

Best Regards
Feng




Re: [PATCH net-next] bnxt: add dma mapping attributes

2017-05-09 Thread David Miller
From: Shannon Nelson 
Date: Tue,  9 May 2017 13:37:33 -0700

> @@ -66,6 +66,12 @@
>  MODULE_DESCRIPTION("Broadcom BCM573xx network driver");
>  MODULE_VERSION(DRV_MODULE_VERSION);
>  
> +#ifdef CONFIG_SPARC
> +#define BNXT_DMA_ATTRS  DMA_ATTR_WEAK_ORDERING
> +#else
> +#define BNXT_DMA_ATTRS   0
> +#endif
> +

I do not want this ifdef crap showing up in every driver just
to improve performance on one platform.

This needs to be generically done somehow in a way that
drivers get the correct setting automatically and without
ifdef checks.


Re: [PATCH 0/2] net: Set maximum receive packet size on veth interfaces

2017-05-09 Thread Fredrik Markström
On Tue, May 9, 2017 at 7:48 PM, David Miller  wrote:
>
> From: Fredrik Markstrom 
> Date: Tue,  9 May 2017 14:44:36 +0200
>
> > Currently veth drops all packets larger than the mtu set on the
> > receiving end of the pair. This is inconsistent with most hardware
> > ethernet drivers.
>
> False.
>
> In fact, many pieces of ethernet hardware are not physically capable of
> sending even VLAN packets that are above the normal base ethernet MTU.
>

Maybe I was unclear: the veth implementation drops all packets larger than the
configured MTU (on the receiving interface).
Most ethernet drivers accept packets up to the ethernet MTU no matter the
configured MTU. As far as I can tell from the RFCs, that is ok.

A simpler solution would be to only drop packets larger than the ethernet MTU (like
most network drivers), but I guess that would break existing stuff
already out there.

The example below demonstrates what behaviour I'm referring to.

Example:
 8< --
# veth0 and veth1 are a veth pair and veth1 has been moved to a separate
network namespace.
% # NS A
% ip address add 1.2.3.4/24 dev veth0
% ip link set veth0 mtu 300 up

% # NS B
% ip address add 1.2.3.5/24 dev veth1
% ip link set veth1 mtu 1000 up
% ping -c 1 -W 1 -s 400 1.2.3.4
PING 1.2.3.4 (1.2.3.4) 400(428) bytes of data.

--- 1.2.3.4 ping statistics ---
1 packets transmitted, 0 received, 100% packet loss, time 0ms
 8< --

While it works fine in most setup:s with hw-interfaces:

 8< --
# Host A eth2 and Host B eth0 is on the same network.

# On HOST A
% ip address add 1.2.3.4/24 dev eth2
% ip link set eth2 mtu 300 up

% # HOST B
% ip address add 1.2.3.5/24 dev eth0
% ip link set eth0 mtu 1000 up
% ping -c 1 -W 1 -s 400 1.2.3.4
PING 1.2.3.4 (1.2.3.4) 400(428) bytes of data.
408 bytes from 1.2.3.4: icmp_seq=1 ttl=64 time=1.57 ms

--- 1.2.3.4 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 1.573/1.573/1.573/0.000 ms
 8< --

>
> It is therefore untenable to remove this restriction univerally like
> this.

With the explanation above, do you still think it's untenable?

/Fredrik


Re: [Patch net] ipv6/dccp: do not inherit ipv6_mc_list from parent

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 16:59 -0700, Cong Wang wrote:
> Like commit 657831ffc38e ("dccp/tcp: do not inherit mc_list from parent")
> we should clear ipv6_mc_list etc. for IPv6 sockets too.
> 
> Cc: Eric Dumazet 
> Signed-off-by: Cong Wang 
> ---

Thanks !

Acked-by: Eric Dumazet 




[PATCH] arp: honour gratuitous ARP _replies_

2017-05-09 Thread Ihar Hrachyshka
When arp_accept is 1, gratuitous ARPs are supposed to override matching
entries irrespective of whether they arrive during locktime. This was
implemented in commit 56022a8fdd87 ("ipv4: arp: update neighbour address
when a gratuitous arp is received and arp_accept is set")

There is a glitch in the patch though. RFC 2002, section 4.6, "ARP,
Proxy ARP, and Gratuitous ARP", defines gratuitous ARPs so that they can
be either of Request or Reply type. Those Reply gratuitous ARPs can be
triggered with standard tooling, for example, arping -A option does just
that.
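
For example (illustrative iputils invocation):

	# arping -A -c 3 -I eth0 192.0.2.10

broadcasts three gratuitous ARP replies announcing 192.0.2.10.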

This patch fixes the glitch, making both Request and Reply flavours of
gratuitous ARPs behave identically.

As per RFC, if gratuitous ARPs are of Reply type, their Target Hardware
Address field should also be set to the link-layer address to which this
cache entry should be updated. The field is present in ARP over Ethernet
but not in IEEE 1394. In this patch, I don't consider any broadcasted
ARP replies as gratuitous if the field is not present, to conform to the
standard. It's not clear whether there is such a thing for IEEE 1394 as
a gratuitous ARP reply; until it's cleared up, we will ignore such
broadcasts. Note that they will still update existing ARP cache entries,
assuming they arrive outside of the locktime interval.

Signed-off-by: Ihar Hrachyshka 
---
 net/ipv4/arp.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 0937b34..fb97b9c 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -653,6 +653,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
unsigned char *arp_ptr;
struct rtable *rt;
unsigned char *sha;
+   unsigned char *tha = NULL;
__be32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
@@ -724,6 +725,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
break;
 #endif
default:
+   tha = arp_ptr;
arp_ptr += dev->addr_len;
}
memcpy(&tip, arp_ptr, 4);
@@ -842,8 +844,20 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
   It is possible, that this option should be enabled for some
   devices (strip is candidate)
 */
-   is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
- addr_type == RTN_UNICAST;
+   is_garp = tip == sip && addr_type == RTN_UNICAST;
+
+   /* Unsolicited ARP _replies_ also require target hwaddr to be
+* the same as source.
+*/
+   if (is_garp && arp->ar_op == htons(ARPOP_REPLY))
+   is_garp =
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+   /* IPv4 over IEEE 1394 doesn't provide target
+* hardware address field in its ARP payload.
+*/
+   tha &&
+#endif
+   !memcmp(tha, sha, dev->addr_len);
 
if (!n &&
((arp->ar_op == htons(ARPOP_REPLY)  &&
-- 
2.9.3



[PATCH] neighbour: update neigh timestamps iff update is effective

2017-05-09 Thread Ihar Hrachyshka
It's a common practice to send gratuitous ARPs after moving an
IP address to another device to speed up healing of a service. To
fulfill service availability constraints, the timing of network peers
updating their caches to point to a new location of an IP address can be
particularly important.

Sometimes neigh_update calls will touch neither lladdr nor state; for
example if an update arrives in locktime interval. Then we effectively
ignore the update request, bailing out of touching the neigh entry,
except that we still bump its timestamps.

This may be a problem for updates arriving in quick succession. For
example, consider the following scenario:

A service is moved to another device with its IP address. The new device
sends three gratuitous ARP requests into the network with ~1 seconds
interval between them. Just before the first request arrives to one of
network peer nodes, its neigh entry for the IP address transitions from
STALE to DELAY.  This transition, among other things, updates
neigh->updated. Once the kernel receives the first gratuitous ARP, it
ignores it because its arrival time is inside the locktime interval. The
kernel still bumps neigh->updated. Then the second gratuitous ARP
request arrives, and it's also ignored because it's still in the (new)
locktime interval. Same happens for the third request. The node
eventually heals itself (after delay_first_probe_time seconds since the
initial transition to DELAY state), but it has just wasted some time and
required a new ARP request/reply round trip. This unfortunate behaviour
both puts more load on the network and reduces service
availability.

This patch changes neigh_update so that it bumps neigh->updated (as well
as neigh->confirmed) only once we are sure that either lladdr or entry
state will change. In the scenario described above, it means that the
second gratuitous ARP request will actually update the entry lladdr.

Ideally, we would update the neigh entry on the very first gratuitous
ARP request. The locktime mechanism is designed to ignore ARP updates in
a short timeframe after a previous ARP update was honoured by the kernel
layer. This would require tracking timestamps for state transitions
separately from timestamps when actual updates are received. This would
probably involve changes in neighbour struct. Therefore, the patch
doesn't tackle the issue of the first gratuitous ARP being ignored, leaving
it for a follow-up.

Signed-off-by: Ihar Hrachyshka 
---
 net/core/neighbour.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 58b0bcc..d274f81 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1132,10 +1132,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
lladdr = neigh->ha;
}
 
-   if (new & NUD_CONNECTED)
-   neigh->confirmed = jiffies;
-   neigh->updated = jiffies;
-
/* If entry was valid and address is not changed,
   do not change entry state, if new one is STALE.
 */
@@ -1157,6 +1153,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
}
}
 
+   /* Update timestamps only once we know we will make a change to the
+* neighbour entry. Otherwise we risk moving the locktime window with
+* noop updates and ignoring relevant ARP updates.
+*/
+   if (new != old || lladdr != neigh->ha) {
+   if (new & NUD_CONNECTED)
+   neigh->confirmed = jiffies;
+   neigh->updated = jiffies;
+   }
+
if (new != old) {
neigh_del_timer(neigh);
if (new & NUD_PROBE)
-- 
2.9.3



[Patch net] ipv6/dccp: do not inherit ipv6_mc_list from parent

2017-05-09 Thread Cong Wang
Like commit 657831ffc38e ("dccp/tcp: do not inherit mc_list from parent")
we should clear ipv6_mc_list etc. for IPv6 sockets too.

Cc: Eric Dumazet 
Signed-off-by: Cong Wang 
---
 net/dccp/ipv6.c | 6 ++
 net/ipv6/tcp_ipv6.c | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d9b6a4e..b6bbb71 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -426,6 +426,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
newsk->sk_backlog_rcv = dccp_v4_do_rcv;
newnp->pktoptions  = NULL;
newnp->opt = NULL;
+   newnp->ipv6_mc_list = NULL;
+   newnp->ipv6_ac_list = NULL;
+   newnp->ipv6_fl_list = NULL;
newnp->mcast_oif   = inet6_iif(skb);
newnp->mcast_hops  = ipv6_hdr(skb)->hop_limit;
 
@@ -490,6 +493,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
 
+   newnp->ipv6_mc_list = NULL;
+   newnp->ipv6_ac_list = NULL;
+   newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
newnp->opt= NULL;
newnp->mcast_oif  = inet6_iif(skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index aeb9497..df5a9ff 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1062,6 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
+   newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
newnp->pktoptions  = NULL;
@@ -1131,6 +1132,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
   First: no IPv4 options.
 */
newinet->inet_opt = NULL;
+   newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
 
-- 
2.5.5



Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 16:35 -0700, Cong Wang wrote:

> This statement is only used to ensure we pass the "dead == fi->fib_nhs"
> check right below the inner loop, it is fine to keep it without break since
> fi is not changed in the inner loop.
> 

So the dead++ above won't end up with (dead > fi->fib_nhs) ?





Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 16:35 -0700, Cong Wang wrote:

> All of them take RCU read lock, so, as I explained in the code comment,
> they all should be fine because of synchronize_net() on unregister path.
> Do you see anything otherwise?

They might take rcu lock, but compiler is still allowed to read
fi->fib_dev multiple times, and crashes might happen.

You will need to audit all code and fix it, using proper
rcu_dereference() or similar code ensuring compiler wont do stupid
things.

Like :

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1201409ba1dcb18ee028003b065410b87bf4a602..ab69517befbb5f300af785fbb20071a3d1086593 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2666,11 +2666,13 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 
seq_setwidth(seq, 127);
 
-   if (fi)
+   if (fi) {
+   struct net_device *dev = rcu_dereference(fi->fib_dev);
+
seq_printf(seq,
   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
   "%d\t%08X\t%d\t%u\t%u",
-  fi->fib_dev ? fi->fib_dev->name : "*",
+  dev ? dev->name : "*",
   prefix,
   fi->fib_nh->nh_gw, flags, 0, 0,
   fi->fib_priority,
@@ -2679,13 +2681,13 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
fi->fib_advmss + 40 : 0),
   fi->fib_window,
   fi->fib_rtt >> 3);
-   else
+   } else {
seq_printf(seq,
   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
   "%d\t%08X\t%d\t%u\t%u",
   prefix, 0, flags, 0, 0, 0,
   mask, 0, 0, 0);
-
+   }
seq_pad(seq, '\n');
}
 




Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Cong Wang
On Tue, May 9, 2017 at 4:09 PM, Eric Dumazet  wrote:
> On Tue, 2017-05-09 at 15:54 -0700, Eric Dumazet wrote:
>> On Tue, 2017-05-09 at 15:52 -0700, Eric Dumazet wrote:
>> > On Tue, 2017-05-09 at 15:07 -0700, Cong Wang wrote:
>> > > On Tue, May 9, 2017 at 1:56 PM, Cong Wang  
>> > > wrote:
>> > > > Wait... if we transfer dst->dev to loopback_dev because we don't
>> > > > want to block unregister path, then we might have a similar problem
>> > > > for rt->fi too, fib_info is still referenced by dst, so these nh_dev's 
>> > > > still
>> > > > hold the dev references...
>> > > >
>> > >
>> > > I finally come up with the attach patch... Do you mind to give it a try?
>> >
>> > I will, but this might be delayed by a few hours.
>> >
>> > In the mean time, it looks like you could try adding the following to
>> > your .config ;)
>> >
>> > CONFIG_IP_ROUTE_MULTIPATH=y
>> >
>> >
>>
>> +   /* This should be fine, we are on unregister
>> +* path so synchronize_net() already waits 
>> for
>> +* existing readers. We have to release the
>> +* dev here because dst could still hold this
>> +* fib_info via rt->fi, we can't wait for GC.
>> +*/
>> +   RCU_INIT_POINTER(nexthop_nh->nh_dev, NULL);
>> +   dev_put(dev);
>> dead = fi->fib_nhs;
>>
>> dead = fi->fib_nhs looks wrong if you remove the break; statement ?
>>
>> -   break;

This statement is only used to ensure we pass the "dead == fi->fib_nhs"
check right below the inner loop, it is fine to keep it without break since
fi is not changed in the inner loop.


>
> Also setting nexthop_nh->nh_dev to NULL looks quite dangerous
>
> We have plenty of sites doing :
>
> if (fi->fib_dev)
> x = fi->fib_dev->field
>
> fib_route_seq_show() is one example.
>

All of them take RCU read lock, so, as I explained in the code comment,
they all should be fine because of synchronize_net() on unregister path.
Do you see anything otherwise?


[PATCH] libertas: Avoid reading past end of buffer

2017-05-09 Thread Kees Cook
Using memcpy() from a string that is shorter than the length copied means
the destination buffer is being filled with arbitrary data from the kernel
rodata segment. Instead, use strncpy() which will fill the trailing bytes
with zeros. Additionally adjust indentation to keep checkpatch.pl happy.

This was found with the future CONFIG_FORTIFY_SOURCE feature.
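
A minimal illustration of the difference (not from the patch; recall that
ETH_GSTRING_LEN is 32):

	static const char *src = "abc";		/* 4 bytes including NUL */
	char dst[ETH_GSTRING_LEN];

	memcpy(dst, src, ETH_GSTRING_LEN);	/* reads 28 bytes past the string */
	strncpy(dst, src, ETH_GSTRING_LEN);	/* copies "abc", zero-fills the rest */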

Cc: Daniel Micay 
Signed-off-by: Kees Cook 
---
 drivers/net/wireless/marvell/libertas/mesh.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/marvell/libertas/mesh.c b/drivers/net/wireless/marvell/libertas/mesh.c
index d0c881dd5846..d0b1948ca242 100644
--- a/drivers/net/wireless/marvell/libertas/mesh.c
+++ b/drivers/net/wireless/marvell/libertas/mesh.c
@@ -1177,9 +1177,9 @@ void lbs_mesh_ethtool_get_strings(struct net_device *dev,
switch (stringset) {
case ETH_SS_STATS:
for (i = 0; i < MESH_STATS_NUM; i++) {
-   memcpy(s + i * ETH_GSTRING_LEN,
-   mesh_stat_strings[i],
-   ETH_GSTRING_LEN);
+   strncpy(s + i * ETH_GSTRING_LEN,
+   mesh_stat_strings[i],
+   ETH_GSTRING_LEN);
}
break;
}
-- 
2.7.4


-- 
Kees Cook
Pixel Security


Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 15:54 -0700, Eric Dumazet wrote:
> On Tue, 2017-05-09 at 15:52 -0700, Eric Dumazet wrote:
> > On Tue, 2017-05-09 at 15:07 -0700, Cong Wang wrote:
> > > On Tue, May 9, 2017 at 1:56 PM, Cong Wang  
> > > wrote:
> > > > Wait... if we transfer dst->dev to loopback_dev because we don't
> > > > want to block unregister path, then we might have a similar problem
> > > > for rt->fi too, fib_info is still referenced by dst, so these nh_dev's 
> > > > still
> > > > hold the dev references...
> > > >
> > > 
> > > I finally come up with the attach patch... Do you mind to give it a try?
> > 
> > I will, but this might be delayed by a few hours.
> > 
> > In the mean time, it looks like you could try adding the following to
> > your .config ;)
> > 
> > CONFIG_IP_ROUTE_MULTIPATH=y
> > 
> > 
> 
> +   /* This should be fine, we are on unregister
> +* path so synchronize_net() already waits for
> +* existing readers. We have to release the
> +* dev here because dst could still hold this
> +* fib_info via rt->fi, we can't wait for GC.
> +*/
> +   RCU_INIT_POINTER(nexthop_nh->nh_dev, NULL);
> +   dev_put(dev);
> dead = fi->fib_nhs;
> 
> dead = fi->fib_nhs looks wrong if you remove the break; statement ?
> 
> -   break;

Also setting nexthop_nh->nh_dev to NULL looks quite dangerous

We have plenty of sites doing :

if (fi->fib_dev)
x = fi->fib_dev->field

fib_route_seq_show() is one example.





Re: [PATCH] wcn36xx: Close SMD channel on device removal

2017-05-09 Thread Bjorn Andersson
On Mon 08 May 23:17 PDT 2017, Kalle Valo wrote:

> Bjorn Andersson  writes:
> 
> > The SMD channel is not the primary WCNSS channel and must explicitly be
> > closed as the device is removed, or the channel will already by open on
> > a subsequent probe call in e.g. the case of reloading the kernel module.
> >
> > This issue was introduced because I simplified the underlying SMD
> > implementation while the SMD adaptions of the driver sat on the mailing
> > list, but missed to update these patches. The patch does however only
> > apply back to the transition to rpmsg, hence the limited Fixes.
> >
> > Fixes: 5052de8deff5 ("soc: qcom: smd: Transition client drivers from smd to 
> > rpmsg")
> > Reported-by: Eyal Ilsar 
> > Signed-off-by: Bjorn Andersson 
> 
> As this is a regression I'll queue this to 4.12.
> 

Thanks.

> But if this is an older bug (didn't quite understand your description
> though) should there be a separate patch for stable releases?
> 

AFAICT this never worked, as it seems I did the rework in SMD while we
tried to figure out the dependency issues we had with moving to SMD. So
v4.9 through v4.11 have SMD support - with this bug.

How do I proceed, do you want me to write up a fix for stable@? Do I
send that out as an ordinary patch?

Regards,
Bjorn


Re: [PATCH net-next] bnxt: add dma mapping attributes

2017-05-09 Thread Shannon Nelson

On 5/9/2017 2:05 PM, Michael Chan wrote:

On Tue, May 9, 2017 at 1:37 PM, Shannon Nelson
 wrote:

On the SPARC platform we need to use the DMA_ATTR_WEAK_ORDERING attribute
in our Rx path dma mapping in order to get the expected performance out
of the receive path.  Adding it to the Tx path has little effect, so
that's not a part of this patch.

Signed-off-by: Shannon Nelson 
Reviewed-by: Tushar Dave 
Reviewed-by: Tom Saeger 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   61 ++--
 1 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f1e54b..771742c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -66,6 +66,12 @@
 MODULE_DESCRIPTION("Broadcom BCM573xx network driver");
 MODULE_VERSION(DRV_MODULE_VERSION);

+#ifdef CONFIG_SPARC
+#define BNXT_DMA_ATTRS  DMA_ATTR_WEAK_ORDERING
+#else
+#define BNXT_DMA_ATTRS 0
+#endif
+


I think we can use the same attribute for all architectures.
Architectures that don't implement weak ordering will ignore the
attribute.



In the long run, you are probably correct, and it would be simple enough 
to change this.  However, given the recent threads about the 
applicability of Relaxed Ordering and a couple of PCIe root complexes 
that have been found to have issues with Relaxed Ordering TLPs, I prefer 
to stay on the conservative side and set it up only for the platform I 
know.  As it stands, this patch won't change the currently working 
behavior on other platforms, but will help us out on the one we know can 
use the feature.


sln




Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 15:52 -0700, Eric Dumazet wrote:
> On Tue, 2017-05-09 at 15:07 -0700, Cong Wang wrote:
> > On Tue, May 9, 2017 at 1:56 PM, Cong Wang  wrote:
> > > Wait... if we transfer dst->dev to loopback_dev because we don't
> > > want to block unregister path, then we might have a similar problem
> > > for rt->fi too, fib_info is still referenced by dst, so these nh_dev's 
> > > still
> > > hold the dev references...
> > >
> > 
> > I finally come up with the attach patch... Do you mind to give it a try?
> 
> I will, but this might be delayed by a few hours.
> 
> In the mean time, it looks like you could try adding the following to
> your .config ;)
> 
> CONFIG_IP_ROUTE_MULTIPATH=y
> 
> 

+   /* This should be fine, we are on unregister
+* path so synchronize_net() already waits for
+* existing readers. We have to release the
+* dev here because dst could still hold this
+* fib_info via rt->fi, we can't wait for GC.
+*/
+   RCU_INIT_POINTER(nexthop_nh->nh_dev, NULL);
+   dev_put(dev);
dead = fi->fib_nhs;

dead = fi->fib_nhs looks wrong if you remove the break; statement ?

-   break;
}




Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 15:07 -0700, Cong Wang wrote:
> On Tue, May 9, 2017 at 1:56 PM, Cong Wang  wrote:
> > Wait... if we transfer dst->dev to loopback_dev because we don't
> > want to block unregister path, then we might have a similar problem
> > for rt->fi too, fib_info is still referenced by dst, so these nh_dev's still
> > hold the dev references...
> >
> 
> I finally come up with the attach patch... Do you mind to give it a try?

I will, but this might be delayed by a few hours.

In the mean time, it looks like you could try adding the following to
your .config ;)

CONFIG_IP_ROUTE_MULTIPATH=y





Re: [PATCH v2 net] dccp/tcp: do not inherit mc_list from parent

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 15:37 -0700, Cong Wang wrote:
> On Tue, May 9, 2017 at 6:29 AM, Eric Dumazet  wrote:
> > From: Eric Dumazet 
> >
> > syzkaller found a way to trigger double frees from ip_mc_drop_socket()
> >
> > It turns out that we leave a copy of the parent mc_list at accept() time,
> > which is very bad.
> >
> > Very similar to commit 8b485ce69876 ("tcp: do not inherit
> > fastopen_req from parent")
> >
> > Initial report from Pray3r, completed by Andrey.
> > Thanks a lot to them !
> >
> > Signed-off-by: Eric Dumazet 
> > Reported-by: Pray3r 
> > Reported-by: Andrey Konovalov 
> > Tested-by: Andrey Konovalov 
> > ---
> > v2: fix moved into inet_csk_clone_lock() to fix both DCCP and TCP
> >
> >  net/ipv4/inet_connection_sock.c |2 ++
> >  1 file changed, 2 insertions(+)
> >
> > diff --git a/net/ipv4/inet_connection_sock.c 
> > b/net/ipv4/inet_connection_sock.c
> > index 
> > 5e313c1ac94fc88eca5fe3a0e9e46e551e955ff0..1054d330bf9df3189a21dbb08e27c0e6ad136775
> >  100644
> > --- a/net/ipv4/inet_connection_sock.c
> > +++ b/net/ipv4/inet_connection_sock.c
> > @@ -794,6 +794,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
> > /* listeners have SOCK_RCU_FREE, not the children */
> > sock_reset_flag(newsk, SOCK_RCU_FREE);
> >
> > +   inet_sk(newsk)->mc_list = NULL;
> > +
> > newsk->sk_mark = inet_rsk(req)->ir_mark;
> > atomic64_set(&newsk->sk_cookie,
> >  atomic64_read(&inet_rsk(req)->ir_cookie));
> >
> 
> I think IPv6 needs this too?
> 
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index aeb9497..b3611d9 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1062,6 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(const
> struct sock *sk, struct sk_buff *
> newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
>  #endif
> 
> +   newnp->ipv6_mc_list = NULL;
> newnp->ipv6_ac_list = NULL;
> newnp->ipv6_fl_list = NULL;
> newnp->pktoptions  = NULL;

Good point, but it looks like your patch only covers IPv4-mapped sockets.

And DCCP would need fixes as well.
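
A sketch of what covering the remaining paths might look like; this is
illustrative only, and the exact clone sites in the native tcp_v6/dccp_v6
code would need checking:

    /* wherever a child socket is cloned from an IPv6 parent, never
     * inherit the multicast list (same idea as the v4 fix above):
     */
    newnp = inet6_sk(newsk);
    newnp->ipv6_mc_list = NULL;
    newnp->ipv6_ac_list = NULL;
    newnp->ipv6_fl_list = NULL;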





Re: [PATCH v2 net] dccp/tcp: do not inherit mc_list from parent

2017-05-09 Thread Cong Wang
On Tue, May 9, 2017 at 6:29 AM, Eric Dumazet  wrote:
> From: Eric Dumazet 
>
> syzkaller found a way to trigger double frees from ip_mc_drop_socket()
>
> It turns out that we leave a copy of the parent mc_list at accept() time,
> which is very bad.
>
> Very similar to commit 8b485ce69876 ("tcp: do not inherit
> fastopen_req from parent")
>
> Initial report from Pray3r, completed by Andrey.
> Thanks a lot to them !
>
> Signed-off-by: Eric Dumazet 
> Reported-by: Pray3r 
> Reported-by: Andrey Konovalov 
> Tested-by: Andrey Konovalov 
> ---
> v2: fix moved into inet_csk_clone_lock() to fix both DCCP and TCP
>
>  net/ipv4/inet_connection_sock.c |2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 
> 5e313c1ac94fc88eca5fe3a0e9e46e551e955ff0..1054d330bf9df3189a21dbb08e27c0e6ad136775
>  100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -794,6 +794,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
> /* listeners have SOCK_RCU_FREE, not the children */
> sock_reset_flag(newsk, SOCK_RCU_FREE);
>
> +   inet_sk(newsk)->mc_list = NULL;
> +
> newsk->sk_mark = inet_rsk(req)->ir_mark;
> atomic64_set(&newsk->sk_cookie,
>  atomic64_read(&inet_rsk(req)->ir_cookie));
>

I think IPv6 needs this too?

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index aeb9497..b3611d9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1062,6 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(const
struct sock *sk, struct sk_buff *
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif

+   newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
newnp->pktoptions  = NULL;


Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Cong Wang
On Tue, May 9, 2017 at 1:56 PM, Cong Wang  wrote:
> Wait... if we transfer dst->dev to loopback_dev because we don't
> want to block unregister path, then we might have a similar problem
> for rt->fi too, fib_info is still referenced by dst, so these nh_dev's still
> hold the dev references...
>

I finally came up with the attached patch... Do you mind giving it a try?
commit a431b4d969f617c5ef8711b6bf493199eecec759
Author: Cong Wang 
Date:   Tue May 9 14:35:10 2017 -0700

ipv4: restore rt->fi for reference counting, try #2

Signed-off-by: Cong Wang 

diff --git a/include/net/route.h b/include/net/route.h
index 2cc0e14..4335eb7 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -69,6 +69,7 @@ struct rtable {
 
struct list_headrt_uncached;
struct uncached_list*rt_uncached_list;
+   struct fib_info *fi; /* for refcnt to shared metrics */
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da449dd..d77c453 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1429,8 +1429,15 @@ int fib_sync_down_dev(struct net_device *dev, unsigned 
long event, bool force)
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
if (event == NETDEV_UNREGISTER &&
nexthop_nh->nh_dev == dev) {
+   /* This should be fine, we are on unregister
+* path so synchronize_net() already waits for
+* existing readers. We have to release the
+* dev here because dst could still hold this
+* fib_info via rt->fi, we can't wait for GC.
+*/
+   RCU_INIT_POINTER(nexthop_nh->nh_dev, NULL);
+   dev_put(dev);
dead = fi->fib_nhs;
-   break;
}
 #endif
} endfor_nexthops(fi)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 655d9ee..f647310 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1387,6 +1387,11 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 {
struct rtable *rt = (struct rtable *) dst;
 
+   if (rt->fi) {
+   fib_info_put(rt->fi);
+   rt->fi = NULL;
+   }
+
if (!list_empty(&rt->rt_uncached)) {
struct uncached_list *ul = rt->rt_uncached_list;
 
@@ -1424,6 +1429,16 @@ static bool rt_cache_valid(const struct rtable *rt)
!rt_is_expired(rt);
 }
 
+static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
+{
+   if (fi->fib_metrics != (u32 *)dst_default_metrics) {
+   fib_info_hold(fi);
+   rt->fi = fi;
+   }
+
+   dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+}
+
 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
   const struct fib_result *res,
   struct fib_nh_exception *fnhe,
@@ -1438,7 +1453,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 
daddr,
rt->rt_gateway = nh->nh_gw;
rt->rt_uses_gateway = 1;
}
-   dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+   rt_init_metrics(rt, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
 #endif
@@ -1490,6 +1505,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
rt->rt_gateway = 0;
rt->rt_uses_gateway = 0;
rt->rt_table_id = 0;
+   rt->fi = NULL;
INIT_LIST_HEAD(&rt->rt_uncached);
 
rt->dst.output = ip_output;


Re: [PATCH net-next] bnxt: add dma mapping attributes

2017-05-09 Thread Michael Chan
On Tue, May 9, 2017 at 1:37 PM, Shannon Nelson
 wrote:
> On the SPARC platform we need to use the DMA_ATTR_WEAK_ORDERING attribute
> in our Rx path dma mapping in order to get the expected performance out
> of the receive path.  Adding it to the Tx path has little effect, so
> that's not a part of this patch.
>
> Signed-off-by: Shannon Nelson 
> Reviewed-by: Tushar Dave 
> Reviewed-by: Tom Saeger 
> ---
>  drivers/net/ethernet/broadcom/bnxt/bnxt.c |   61 ++--
>  1 files changed, 39 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
> b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 1f1e54b..771742c 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -66,6 +66,12 @@
>  MODULE_DESCRIPTION("Broadcom BCM573xx network driver");
>  MODULE_VERSION(DRV_MODULE_VERSION);
>
> +#ifdef CONFIG_SPARC
> +#define BNXT_DMA_ATTRS  DMA_ATTR_WEAK_ORDERING
> +#else
> +#define BNXT_DMA_ATTRS 0
> +#endif
> +

I think we can use the same attribute for all architectures.
Architectures that don't implement weak ordering will ignore the
attribute.


Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Cong Wang
On Tue, May 9, 2017 at 9:56 AM, Eric Dumazet  wrote:
> On Tue, 2017-05-09 at 09:44 -0700, Cong Wang wrote:
>
>>
>> Eric, how did you produce it?
>> I guess it's because of nh_dev which is the only netdevice pointer inside
>> fib_info. Let me take a deeper look.
>>
>
> Nothing particular, I am using kexec to boot new kernels, and all my
> attempts with your patch included demonstrated the issue.


Interesting. So this happens on your eth0 teardown path, but
I don't see any problem in the related NETDEV_UNREGISTER
notifiers yet; we don't call dst_destroy() on this path and defer it
to GC instead, but that should not be delayed for so long?

Wait... if we transfer dst->dev to loopback_dev because we don't
want to block unregister path, then we might have a similar problem
for rt->fi too, fib_info is still referenced by dst, so these nh_dev's still
hold the dev references...

Something like this (just for proof of concept):

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da449dd..85cd614 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -205,7 +205,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
struct fib_info *fi = container_of(head, struct fib_info, rcu);

change_nexthops(fi) {
-   if (nexthop_nh->nh_dev)
+   if (nexthop_nh->nh_dev && !(nexthop_nh->nh_flags & RTNH_F_DEAD))
dev_put(nexthop_nh->nh_dev);
lwtstate_put(nexthop_nh->nh_lwtstate);
free_nh_exceptions(nexthop_nh);
@@ -1414,8 +1414,10 @@ int fib_sync_down_dev(struct net_device *dev,
unsigned long event, bool force)
else if (nexthop_nh->nh_dev == dev &&
 nexthop_nh->nh_scope != scope) {
switch (event) {
-   case NETDEV_DOWN:
case NETDEV_UNREGISTER:
+   dev_put(dev);
+   /* fall through */
+   case NETDEV_DOWN:
nexthop_nh->nh_flags |= RTNH_F_DEAD;
/* fall through */
case NETDEV_CHANGE:


[PATCH net-next] bnxt: add dma mapping attributes

2017-05-09 Thread Shannon Nelson
On the SPARC platform we need to use the DMA_ATTR_WEAK_ORDERING attribute
in our Rx path dma mapping in order to get the expected performance out
of the receive path.  Adding it to the Tx path has little effect, so
that's not a part of this patch.

Signed-off-by: Shannon Nelson 
Reviewed-by: Tushar Dave 
Reviewed-by: Tom Saeger 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   61 ++--
 1 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f1e54b..771742c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -66,6 +66,12 @@
 MODULE_DESCRIPTION("Broadcom BCM573xx network driver");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
+#ifdef CONFIG_SPARC
+#define BNXT_DMA_ATTRS  DMA_ATTR_WEAK_ORDERING
+#else
+#define BNXT_DMA_ATTRS 0
+#endif
+
 #define BNXT_RX_OFFSET (NET_SKB_PAD + NET_IP_ALIGN)
 #define BNXT_RX_DMA_OFFSET NET_SKB_PAD
 #define BNXT_RX_COPY_THRESH 256
@@ -582,7 +588,8 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi 
*bnapi, int nr_pkts)
if (!page)
return NULL;
 
-   *mapping = dma_map_page(dev, page, 0, PAGE_SIZE, bp->rx_dir);
+   *mapping = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, bp->rx_dir,
+ BNXT_DMA_ATTRS);
if (dma_mapping_error(dev, *mapping)) {
__free_page(page);
return NULL;
@@ -601,8 +608,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi 
*bnapi, int nr_pkts)
if (!data)
return NULL;
 
-   *mapping = dma_map_single(&pdev->dev, data + bp->rx_dma_offset,
- bp->rx_buf_use_size, bp->rx_dir);
+   *mapping = dma_map_single_attrs(&pdev->dev, data + bp->rx_dma_offset,
+   bp->rx_buf_use_size, bp->rx_dir,
+   BNXT_DMA_ATTRS);
 
if (dma_mapping_error(>dev, *mapping)) {
kfree(data);
@@ -705,8 +713,9 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp,
return -ENOMEM;
}
 
-   mapping = dma_map_page(&pdev->dev, page, offset, BNXT_RX_PAGE_SIZE,
-  PCI_DMA_FROMDEVICE);
+   mapping = dma_map_page_attrs(&pdev->dev, page, offset,
+BNXT_RX_PAGE_SIZE, PCI_DMA_FROMDEVICE,
+BNXT_DMA_ATTRS);
if (dma_mapping_error(>dev, mapping)) {
__free_page(page);
return -EIO;
@@ -799,7 +808,8 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, 
u16 cp_cons,
return NULL;
}
dma_addr -= bp->rx_dma_offset;
-   dma_unmap_page(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir);
+   dma_unmap_page_attrs(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir,
+BNXT_DMA_ATTRS);
 
if (unlikely(!payload))
payload = eth_get_headlen(data_ptr, len);
@@ -841,8 +851,8 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, 
u16 cp_cons,
}
 
skb = build_skb(data, 0);
-   dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
-bp->rx_dir);
+   dma_unmap_single_attrs(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
+  bp->rx_dir, BNXT_DMA_ATTRS);
if (!skb) {
kfree(data);
return NULL;
@@ -909,8 +919,8 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, 
u16 cp_cons,
return NULL;
}
 
-   dma_unmap_page(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE,
-  PCI_DMA_FROMDEVICE);
+   dma_unmap_page_attrs(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE,
+PCI_DMA_FROMDEVICE, BNXT_DMA_ATTRS);
 
skb->data_len += frag_len;
skb->len += frag_len;
@@ -1329,8 +1339,9 @@ static void bnxt_abort_tpa(struct bnxt *bp, struct 
bnxt_napi *bnapi,
tpa_info->mapping = new_mapping;
 
skb = build_skb(data, 0);
-   dma_unmap_single(&bp->pdev->dev, mapping, bp->rx_buf_use_size,
-bp->rx_dir);
+   dma_unmap_single_attrs(&bp->pdev->dev, mapping,
+  bp->rx_buf_use_size, bp->rx_dir,
+  BNXT_DMA_ATTRS);
 
if (!skb) {
kfree(data);
@@ -1971,9 +1982,11 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
if (!data)
continue;
 
-   dma_unmap_single(&pdev->dev, tpa_info->mapping,
-bp->rx_buf_use_size,
-

Re: arch: arm: bpf: Converting cBPF to eBPF for arm 32 bit

2017-05-09 Thread Daniel Borkmann

On 05/09/2017 10:12 PM, Shubham Bansal wrote:

Hi Daniel,

I just tried running test_bpf.ko module.

$ echo 2 >>  /proc/sys/net/core/bpf_jit_enable
$ insmod test_bpf.ko

test_bpf: #0 TAX
bpf_jit: flen=14 proglen=212 pass=2 image=7f15a83c from=insmod pid=730
JIT code: : f0 05 2d e9 40 d2 4d e2 00 40 a0 e3 0c 42 8d e5
JIT code: 0010: 08 42 8d e5 00 00 20 e0 01 10 21 e0 20 62 9d e5
JIT code: 0020: 20 72 9d e5 06 70 27 e0 20 72 8d e5 24 62 9d e5
JIT code: 0030: 24 72 9d e5 06 70 27 e0 24 72 8d e5 00 40 a0 e1
JIT code: 0040: 01 50 a0 e1 01 00 a0 e3 00 10 a0 e3 20 02 8d e5
JIT code: 0050: 24 12 8d e5 02 00 a0 e3 00 10 a0 e3 20 62 9d e5
JIT code: 0060: 06 00 80 e0 00 10 a0 e3 00 00 60 e2 00 10 a0 e3
JIT code: 0070: 20 02 8d e5 24 12 8d e5 54 40 90 e5 20 62 9d e5
JIT code: 0080: 06 00 80 e0 00 10 a0 e3 20 02 8d e5 24 12 8d e5
JIT code: 0090: 04 00 a0 e1 01 10 a0 e3 20 62 9d e5 06 10 81 e0
JIT code: 00a0: 01 20 a0 e3 04 32 8d e2 bc 68 0a e3 11 60 48 e3
JIT code: 00b0: 36 ff 2f e1 01 10 21 e0 00 00 50 e3 04 00 00 0a
JIT code: 00c0: 00 00 d0 e5 01 00 00 ea 40 d2 8d e2 f0 05 bd e8
JIT code: 00d0: 1e ff 2f e1
jited:1
Unhandled fault: page domain fault (0x01b) at 0x0051
pgd = 871d
[0051] *pgd=671b7831, *pte=, *ppte=
Internal error: : 1b [#1] SMP ARM
Modules linked in: test_bpf(+)
CPU: 0 PID: 730 Comm: insmod Not tainted 4.11.0+ #5
Hardware name: ARM-Versatile Express
task: 87023700 task.stack: 8718a000
PC is at 0x7f15a8b4
LR is at test_bpf_init+0x5bc/0x1000 [test_bpf]
pc : [<7f15a8b4>]lr : [<7f1575bc>]psr: 8013
sp : 8718bd7c  ip : 0015  fp : 7f005008
r10: 7f005094  r9 : 893ba020  r8 : 893ba000
r7 :   r6 : 0001  r5 :   r4 : 
r3 : 7f15a83c  r2 : 893ba020  r1 :   r0 : fffd
Flags: Nzcv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 671d0059  DAC: 0051
Process insmod (pid: 730, stack limit = 0x8718a210)
Stack: (0x8718bd7c to 0x8718c000)
bd60:
bd80: 2710 870db300 c302e7e8 7f004010 893ba000 7f005094  
bda0:    0001 0001  014000c0 00150628
bdc0: 7f0050ac 7f154840 1234 1234aaab c302e7e8 000f  893ba000
bde0: 000b 7f004010 87fd54a0 e000 7f157000  871b6fc0 0001
be00: 78e4905c 0024 7f154640 8010179c 80a06544 8718a000 0001 80a54980
be20: 80a3066c 0007 809685c0 80a54700 80a54700 07551000 80a54700 60070013
be40: 7f154640 801f3fc8 78e4905c 7f154640 0001 871b6fe4 7f154640 0001
be60: 871b6b00 0001 78e4905c 801eaa94 0001 871b6fe4 8718bf44 0001
be80: 871b6fe4 80196e4c 7f15464c 7fff 7f154640 80193f10 87127000 7f154640
bea0: 7f154688 80703800 7f154770 807037e4 8081b184 807bec60 807becc4 807bec6c
bec0: 7f15481c 8010c1b8 9360 76ed8028 0f60   
bee0:        
bf00:        3f80
bf20: 76f5cf88  93684f80 8718a000 00160fda 0051  801973b0
bf40: 87671a00 93501000 00183f80 93684760 93684574 936788e0 00155000 00155290
bf60:    1f64 0032 0033 001d 
bf80: 0017   00183f80 756e694c 0080 80107684 fffd
bfa0:  801074c0  00183f80 76dd9008 00183f80 00160fda 
bfc0:  00183f80 756e694c 0080 0001 7eabae2c 00172f8c 
bfe0: 7eabaae0 7eabaad0 0004017f 00013172 60070030 76dd9008  
[<7f1575bc>] (test_bpf_init [test_bpf]) from [<7f157000>]
(test_bpf_init+0x0/0x1000 [test_bpf])
[<7f157000>] (test_bpf_init [test_bpf]) from [<78e4905c>] (0x78e4905c)
Code: e260 e3a01000 e58d0220 e58d1224 (e5904054)
---[ end trace a36398923b914fe2 ]---
Segmentation fault

Why is it trying to execute TAX, which is a cBPF instruction?


The kernel translates this to eBPF internally (bpf_prepare_filter() ->
bpf_migrate_filter()); the JIT never sees cBPF directly.

Is your implementation still using the bpf_jit_compile() callback as
opposed to bpf_int_jit_compile()?!

Cheers,
Daniel


Re: arch: arm: bpf: Converting cBPF to eBPF for arm 32 bit

2017-05-09 Thread David Miller
From: Shubham Bansal 
Date: Wed, 10 May 2017 01:42:10 +0530

> Why is it trying to execute TAX, which is a cBPF instruction?

Because some of the tests are classic BPF programs which
get translated into eBPF ones and sent to the JIT for
compilation.
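
Tying this back to Daniel's question earlier in the thread: an eBPF JIT
hooks in via bpf_int_jit_compile() rather than the old cBPF
bpf_jit_compile() callback. A rough sketch of the expected shape -- the
jit_ctx type and build_body() helper are made-up names here, the real ones
are per-arch:

    struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
    {
            struct jit_ctx ctx = { .prog = prog };  /* hypothetical */

            /* compute offsets, then emit the image (arch specific) */
            if (build_body(&ctx))                   /* hypothetical */
                    return prog;    /* fall back to the interpreter */

            prog->bpf_func = (void *)ctx.image;
            prog->jited = 1;
            return prog;
    }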


[PATCH nf] xtables: zero padding in data_to_user

2017-05-09 Thread Willem de Bruijn
From: Willem de Bruijn 

When looking up an iptables rule, the iptables binary compares the
aligned match and target data (XT_ALIGN). In some cases this can
exceed the actual data size to include padding bytes.

Before commit f77bc5b23fb1 ("iptables: use match, target and data
copy_to_user helpers") the malloc()ed bytes were overwritten by the
kernel with kzalloced contents, zeroing the padding and making the
comparison succeed. After this patch, the kernel copies and clears
only data, leaving the padding bytes undefined.

Extend the clear operation from data size to aligned data size to
include the padding bytes, if any.
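
As a concrete illustration (sketch; sizes assumed for a 64-bit kernel,
where XT_ALIGN() rounds up to 8 bytes):

    int usersize = 6;               /* match data the kernel copies */
    int aligned  = XT_ALIGN(6);     /* 8: includes 2 padding bytes  */

    if (copy_to_user(dst, src, usersize))
            return -EFAULT;
    /* the fix: clear up to the aligned size, not just the size */
    if (usersize != aligned &&
        clear_user(dst + usersize, aligned - usersize))
            return -EFAULT;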

Padding bytes can be observed in both match and target, and the bug
triggered, by issuing a rule with match icmp and target ACCEPT:

  iptables -t mangle -A INPUT -i lo -p icmp --icmp-type 1 -j ACCEPT
  iptables -t mangle -D INPUT -i lo -p icmp --icmp-type 1 -j ACCEPT

Fixes: f77bc5b23fb1 ("iptables: use match, target and data copy_to_user 
helpers")
Reported-by: Paul Moore 
Reported-by: Richard Guy Briggs 
Signed-off-by: Willem de Bruijn 
---
 include/linux/netfilter/x_tables.h | 2 +-
 net/bridge/netfilter/ebtables.c| 9 ++---
 net/netfilter/x_tables.c   | 9 ++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h 
b/include/linux/netfilter/x_tables.h
index be378cf47fcc..b3044c2c62cb 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -294,7 +294,7 @@ int xt_match_to_user(const struct xt_entry_match *m,
 int xt_target_to_user(const struct xt_entry_target *t,
  struct xt_entry_target __user *u);
 int xt_data_to_user(void __user *dst, const void *src,
-   int usersize, int size);
+   int usersize, int size, int aligned_size);
 
 void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
 struct xt_counters_info *info, bool compat);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 9ec0c9f908fa..9c6e619f452b 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1373,7 +1373,8 @@ static inline int ebt_obj_to_user(char __user *um, const 
char *_name,
strlcpy(name, _name, sizeof(name));
if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
-   xt_data_to_user(um + entrysize, data, usersize, datasize))
+   xt_data_to_user(um + entrysize, data, usersize, datasize,
+   XT_ALIGN(datasize)))
return -EFAULT;
 
return 0;
@@ -1658,7 +1659,8 @@ static int compat_match_to_user(struct ebt_entry_match 
*m, void __user **dstptr,
if (match->compat_to_user(cm->data, m->data))
return -EFAULT;
} else {
-   if (xt_data_to_user(cm->data, m->data, match->usersize, msize))
+   if (xt_data_to_user(cm->data, m->data, match->usersize, msize,
+   COMPAT_XT_ALIGN(msize)))
return -EFAULT;
}
 
@@ -1687,7 +1689,8 @@ static int compat_target_to_user(struct ebt_entry_target 
*t,
if (target->compat_to_user(cm->data, t->data))
return -EFAULT;
} else {
-   if (xt_data_to_user(cm->data, t->data, target->usersize, tsize))
+   if (xt_data_to_user(cm->data, t->data, target->usersize, tsize,
+   COMPAT_XT_ALIGN(tsize)))
return -EFAULT;
}
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index f134d384852f..c64716a735b0 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -283,12 +283,13 @@ static int xt_obj_to_user(u16 __user *psize, u16 size,
   &U->u.user.revision, K->u.kernel.TYPE->revision)
 
 int xt_data_to_user(void __user *dst, const void *src,
-   int usersize, int size)
+   int usersize, int size, int aligned_size)
 {
usersize = usersize ? : size;
if (copy_to_user(dst, src, usersize))
return -EFAULT;
-   if (usersize != size && clear_user(dst + usersize, size - usersize))
+   if (usersize != aligned_size &&
+   clear_user(dst + usersize, aligned_size - usersize))
return -EFAULT;
 
return 0;
@@ -298,7 +299,9 @@ EXPORT_SYMBOL_GPL(xt_data_to_user);
 #define XT_DATA_TO_USER(U, K, TYPE, C_SIZE)\
xt_data_to_user(U->data, K->data,   \
K->u.kernel.TYPE->usersize, \
-   C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
+   C_SIZE ? : K->u.kernel.TYPE->TYPE##size,   

Re: arch: arm: bpf: Converting cBPF to eBPF for arm 32 bit

2017-05-09 Thread Shubham Bansal
Hi Daniel,

I just tried running test_bpf.ko module.

$ echo 2 >>  /proc/sys/net/core/bpf_jit_enable
$ insmod test_bpf.ko

test_bpf: #0 TAX
bpf_jit: flen=14 proglen=212 pass=2 image=7f15a83c from=insmod pid=730
JIT code: : f0 05 2d e9 40 d2 4d e2 00 40 a0 e3 0c 42 8d e5
JIT code: 0010: 08 42 8d e5 00 00 20 e0 01 10 21 e0 20 62 9d e5
JIT code: 0020: 20 72 9d e5 06 70 27 e0 20 72 8d e5 24 62 9d e5
JIT code: 0030: 24 72 9d e5 06 70 27 e0 24 72 8d e5 00 40 a0 e1
JIT code: 0040: 01 50 a0 e1 01 00 a0 e3 00 10 a0 e3 20 02 8d e5
JIT code: 0050: 24 12 8d e5 02 00 a0 e3 00 10 a0 e3 20 62 9d e5
JIT code: 0060: 06 00 80 e0 00 10 a0 e3 00 00 60 e2 00 10 a0 e3
JIT code: 0070: 20 02 8d e5 24 12 8d e5 54 40 90 e5 20 62 9d e5
JIT code: 0080: 06 00 80 e0 00 10 a0 e3 20 02 8d e5 24 12 8d e5
JIT code: 0090: 04 00 a0 e1 01 10 a0 e3 20 62 9d e5 06 10 81 e0
JIT code: 00a0: 01 20 a0 e3 04 32 8d e2 bc 68 0a e3 11 60 48 e3
JIT code: 00b0: 36 ff 2f e1 01 10 21 e0 00 00 50 e3 04 00 00 0a
JIT code: 00c0: 00 00 d0 e5 01 00 00 ea 40 d2 8d e2 f0 05 bd e8
JIT code: 00d0: 1e ff 2f e1
jited:1
Unhandled fault: page domain fault (0x01b) at 0x0051
pgd = 871d
[0051] *pgd=671b7831, *pte=, *ppte=
Internal error: : 1b [#1] SMP ARM
Modules linked in: test_bpf(+)
CPU: 0 PID: 730 Comm: insmod Not tainted 4.11.0+ #5
Hardware name: ARM-Versatile Express
task: 87023700 task.stack: 8718a000
PC is at 0x7f15a8b4
LR is at test_bpf_init+0x5bc/0x1000 [test_bpf]
pc : [<7f15a8b4>]lr : [<7f1575bc>]psr: 8013
sp : 8718bd7c  ip : 0015  fp : 7f005008
r10: 7f005094  r9 : 893ba020  r8 : 893ba000
r7 :   r6 : 0001  r5 :   r4 : 
r3 : 7f15a83c  r2 : 893ba020  r1 :   r0 : fffd
Flags: Nzcv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 671d0059  DAC: 0051
Process insmod (pid: 730, stack limit = 0x8718a210)
Stack: (0x8718bd7c to 0x8718c000)
bd60:
bd80: 2710 870db300 c302e7e8 7f004010 893ba000 7f005094  
bda0:    0001 0001  014000c0 00150628
bdc0: 7f0050ac 7f154840 1234 1234aaab c302e7e8 000f  893ba000
bde0: 000b 7f004010 87fd54a0 e000 7f157000  871b6fc0 0001
be00: 78e4905c 0024 7f154640 8010179c 80a06544 8718a000 0001 80a54980
be20: 80a3066c 0007 809685c0 80a54700 80a54700 07551000 80a54700 60070013
be40: 7f154640 801f3fc8 78e4905c 7f154640 0001 871b6fe4 7f154640 0001
be60: 871b6b00 0001 78e4905c 801eaa94 0001 871b6fe4 8718bf44 0001
be80: 871b6fe4 80196e4c 7f15464c 7fff 7f154640 80193f10 87127000 7f154640
bea0: 7f154688 80703800 7f154770 807037e4 8081b184 807bec60 807becc4 807bec6c
bec0: 7f15481c 8010c1b8 9360 76ed8028 0f60   
bee0:        
bf00:        3f80
bf20: 76f5cf88  93684f80 8718a000 00160fda 0051  801973b0
bf40: 87671a00 93501000 00183f80 93684760 93684574 936788e0 00155000 00155290
bf60:    1f64 0032 0033 001d 
bf80: 0017   00183f80 756e694c 0080 80107684 fffd
bfa0:  801074c0  00183f80 76dd9008 00183f80 00160fda 
bfc0:  00183f80 756e694c 0080 0001 7eabae2c 00172f8c 
bfe0: 7eabaae0 7eabaad0 0004017f 00013172 60070030 76dd9008  
[<7f1575bc>] (test_bpf_init [test_bpf]) from [<7f157000>]
(test_bpf_init+0x0/0x1000 [test_bpf])
[<7f157000>] (test_bpf_init [test_bpf]) from [<78e4905c>] (0x78e4905c)
Code: e260 e3a01000 e58d0220 e58d1224 (e5904054)
---[ end trace a36398923b914fe2 ]---
Segmentation fault

Why is it trying to execute TAX, which is a cBPF instruction?

Best,
Shubham Bansal


On Thu, Apr 6, 2017 at 6:21 PM, Daniel Borkmann  wrote:
> On 04/06/2017 01:05 PM, Shubham Bansal wrote:
>>
>> Gentle Reminder.
>
>
> Sorry for late reply.
>
>> Anybody can tell me how to test the JIT compiler ?
>
>
> There's lib/test_bpf.c, see Documentation/networking/filter.txt +1349
> for some more information. It basically contains various test cases that
> have the purpose to test the JIT with corner cases. If you see a useful
> test missing, please send a patch for it, so all other JITs can benefit
> from this as well. For extracting disassembly from a generated test case,
> check out bpf_jit_disasm (Documentation/networking/filter.txt +486).
>
> Thanks,
> Daniel


[GIT] Networking

2017-05-09 Thread David Miller

1) Fix multiqueue in stmmac driver on PCI, from Andy Shevchenko.

2) cdc_ncm doesn't actually fully zero out the padding area it
   allocates on TX, from Jim Baxter.

3) Don't leak map addresses in BPF verifier, from Daniel Borkmann.

4) If we randomize TCP timestamps, we have to do it everywhere
   including SYN cookies.  From Eric Dumazet.

5) Fix "ethtool -S" crash in aquantia driver, from Pavel Belous.

6) Fix allocation size for ntp filter bitmap in bnxt_en driver,
   from Dan Carpenter.

7) Add missing memory allocation return value check to DSA loop
   driver, from Christophe Jaillet.

8) Fix XDP leak on driver unload in qed driver, from Suddarsana Reddy
   Kalluru.

9) Don't inherit MC list from parent inet connection sockets,
   another syzkaller spotted gem.  Fix from Eric Dumazet.

Please pull, thanks a lot.

The following changes since commit af82455f7dbd9dc20244d80d033721b30d22c065:

  Merge tag 'char-misc-4.12-rc1' of 
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc (2017-05-04 
19:15:35 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to 657831ffc38e30092a2d5f03d385d710eb88b09a:

  dccp/tcp: do not inherit mc_list from parent (2017-05-09 15:17:49 -0400)


Andy Shevchenko (4):
  stmmac: pci: set default number of rx and tx queues
  stmmac: pci: TX and RX queue priority configuration
  stmmac: pci: RX queue routing configuration
  stmmac: pci: split out common_default_data() helper

Christophe Jaillet (1):
  net: dsa: loop: Check for memory allocation failure

Dan Carpenter (1):
  bnxt_en: allocate enough space for ->ntp_fltr_bmap

Daniel Borkmann (1):
  bpf: don't let ldimm64 leak map addresses on unprivileged

David S. Miller (5):
  Merge branch 'stmmac-pci-fix-crash-on-Intel-Galileo-Gen2'
  Merge tag 'mac80211-for-davem-2017-05-08' of 
git://git.kernel.org/.../jberg/mac80211
  Revert "ipv4: restore rt->fi for reference counting"
  Merge branch 'mlx4-misc-fixes'
  Merge branch 'qed-general-fixes'

Eric Dumazet (2):
  tcp: randomize timestamps on syncookies
  dccp/tcp: do not inherit mc_list from parent

Ganesh Goudar (1):
  cxgb4: avoid disabling FEC by default

Geliang Tang (2):
  net/hippi/rrunner: use memdup_user
  yam: use memdup_user

Grygorii Strashko (1):
  net: ethernet: ti: cpsw: adjust cpsw fifos depth for fullduplex flow 
control

Hangbin Liu (2):
  bonding: check nla_put_be32 return value
  vti: check nla_put_* return value

Jack Morgenstein (1):
  net/mlx4_core: Reduce harmless SRIOV error message to debug level

Jim Baxter (1):
  net: cdc_ncm: Fix TX zero padding

Johannes Berg (4):
  mac80211: properly remove RX_ENC_FLAG_40MHZ
  nl80211: correctly validate MU-MIMO groups
  mac80211: fix IBSS presp allocation size
  cfg80211: fix multi scheduled scan kernel-doc

Jon Mason (1):
  net: mdio-mux: bcm-iproc: call mdiobus_free() in error path

Kamal Heib (1):
  net/mlx4_en: Change the error print to debug print

Karim Eshapa (1):
  drivers: net: wimax: i2400m: i2400m-usb: Use time_after for time 
comparison

Kees Cook (4):
  bna: Avoid reading past end of buffer
  bna: ethtool: Avoid reading past end of buffer
  qlge: Avoid reading past end of buffer
  DECnet: Use container_of() for embedded struct

Luca Coelho (1):
  mac80211: bail out from prep_connection() if a reconfig is ongoing

Mintz, Yuval (3):
  qed: Fix VF removal sequence
  qed: Tell QM the number of tasks
  qede: Split PF/VF ndos.

Pavel Belous (1):
  aquantia: Fix "ethtool -S" crash when adapter down.

Rakesh Pandit (1):
  net: alx: handle pci_alloc_irq_vectors return correctly

Ram Amrani (1):
  qed: Correct doorbell configuration for !4Kb pages

Suddarsana Reddy Kalluru (1):
  qede: Fix XDP memory leak on unload

Talat Batheesh (1):
  net/mlx4_en: Avoid adding steering rules with invalid ring

Tobias Klauser (1):
  bridge: netlink: account for IFLA_BRPORT_{B, M}CAST_FLOOD size and policy

Vlad Yasevich (1):
  vlan: Keep NETIF_F_HW_CSUM similar to other software devices

WANG Cong (2):
  ipv4: restore rt->fi for reference counting
  ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf

Wei Wang (1):
  tcp: make congestion control optionally skip slow start after idle

 drivers/net/bonding/bond_netlink.c|  3 ++-
 drivers/net/dsa/dsa_loop.c|  3 +++
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c   |  6 --
 drivers/net/ethernet/atheros/alx/main.c   |  4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |  3 ++-
 drivers/net/ethernet/brocade/bna/bfa_ioc.c|  2 +-
 drivers/net/ethernet/brocade/bna/bnad_ethtool.c   |  4 ++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h

Re: [PATCH v2 net] dccp/tcp: do not inherit mc_list from parent

2017-05-09 Thread David Miller
From: Eric Dumazet 
Date: Tue, 09 May 2017 06:29:19 -0700

> From: Eric Dumazet 
> 
> syzkaller found a way to trigger double frees from ip_mc_drop_socket()
> 
> It turns out that we leave a copy of the parent mc_list at accept() time,
> which is very bad.
> 
> Very similar to commit 8b485ce69876 ("tcp: do not inherit
> fastopen_req from parent")
> 
> Initial report from Pray3r, completed by Andrey.
> Thanks a lot to them !
> 
> Signed-off-by: Eric Dumazet 
> Reported-by: Pray3r 
> Reported-by: Andrey Konovalov 
> Tested-by: Andrey Konovalov 
> ---
> v2: fix moved into inet_csk_clone_lock() to fix both DCCP and TCP

Applied and queued up for -stable, thanks Eric.


Re: Requirements for a shutdown function?

2017-05-09 Thread Florian Fainelli
On 05/09/2017 11:51 AM, Timur Tabi wrote:
> On 05/09/2017 01:46 PM, Florian Fainelli wrote:
>> A good test case for exercising a .shutdown() function is kexec'ing a
>> new kernel for instance.
> 
> I tried that.  I run iperf in one window while launching kexec in another.
> Even without a shutdown function, network traffic appears to halt on its own
> and the kexec succeeds.
> 
> Is it possible that the network stack detects a kexec and automatically
> stops all network devices?

No, why would it? However, the device driver model does call into your
driver's remove function, and that one already does the right job because
it unregisters the network device, and so on.

There is no strict requirement for implementing a .shutdown() function
AFAICT and it does not necessarily make sense to have one depending on
the bus type. For platform/MMIO devices, it hardly has any value, but on
e.g: PCI, it could be added as an additional step to perform a full
device shutdown.

> 
>> You should put your HW in a state where it won't be doing DMA, or have
>> any adverse side effects to the system, putting it in a low power state
>> is also a good approach.
> 
> My in-house driver stops the RX and TX queues.  I'm guessing that's good
> enough, but I don't have a failing test case to prove it.
> 

That's probably good enough, yes.
-- 
Florian


Re: Requirements for a shutdown function?

2017-05-09 Thread Timur Tabi
On 05/09/2017 01:46 PM, Florian Fainelli wrote:
> A good test case for exercising a .shutdown() function is kexec'ing a
> new kernel for instance.

I tried that.  I run iperf in one window while launching kexec in another.
Even without a shutdown function, network traffic appears to halt on its own
and the kexec succeeds.

Is it possible that the network stack detects a kexec and automatically
stops all network devices?

> You should put your HW in a state where it won't be doing DMA, or have
> any adverse side effects to the system, putting it in a low power state
> is also a good approach.

My in-house driver stops the RX and TX queues.  I'm guessing that's good
enough, but I don't have a failing test case to prove it.

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


Re: Requirements for a shutdown function?

2017-05-09 Thread Florian Fainelli
On 05/09/2017 09:58 AM, Timur Tabi wrote:
> I'm trying to add a platform_driver.shutdown function to my Ethernet driver
> (drivers/net/ethernet/qualcomm/emac/*), but I can't find any definitive
> information as to what a network driver shutdown callback is supposed to do.
>  I also don't know what testcase I should use to verify that my function is
> working.

A good test case for exercising a .shutdown() function is kexec'ing a
new kernel for instance.

> 
> I see only four instances of a platform_driver.shutdown function in
> drivers/net/ethernet:
> 
> $ git grep -A 20 -w platform_driver | grep '\.shutdown'
> apm/xgene-v2/main.c-  .shutdown = xge_shutdown,
> apm/xgene/xgene_enet_main.c-  .shutdown = xgene_enet_shutdown,
> marvell/mv643xx_eth.c-.shutdown   = mv643xx_eth_shutdown,
> marvell/pxa168_eth.c- .shutdown = pxa168_eth_shutdown,
> 
> (Other shutdown functions are for pci_driver.shutdown).
> 
> For the xgene drivers, the shutdown function just calls the 'remove'
> function.  Isn't that overkill?  Why bother with a shutdown function if it's
> just the same thing as removing the driver outright?

Yes, that appears unnecessary.

> 
> mv643xx_eth_shutdown() seems more reasonable.  All it does is halt the TX
> and RX queues.
> 
> pxa168_eth_shutdown() is a little more heavyweight: halts the queues, and
> stops the DMA and calls phy_stop().
> 
> Can anyone help me figure out what my driver really should do?

You should put your HW in a state where it won't be doing DMA, or have
any adverse side effects to the system, putting it in a low power state
is also a good approach.
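
A minimal .shutdown() along those lines might look like the sketch below;
the names are illustrative (it assumes drvdata is the net_device and that
the driver already has a helper to halt its MAC/DMA):

    static void emac_shutdown(struct platform_device *pdev)
    {
            struct net_device *ndev = platform_get_drvdata(pdev);

            if (netif_running(ndev)) {
                    netif_device_detach(ndev);  /* stop TX queues */
                    emac_mac_down(ndev);        /* hypothetical: halt DMA */
                    if (ndev->phydev)
                            phy_stop(ndev->phydev);
            }
    }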
-- 
Florian


RE: [PATCH net-next 9/9] ipvlan: introduce individual MAC addresses

2017-05-09 Thread Chiappero, Marco

> -Original Message-
> From: Dan Williams [mailto:d...@redhat.com]
> Sent: Monday, May 8, 2017 7:13 PM
> To: Chiappero, Marco ; Jiri Benc
> 
> Cc: netdev@vger.kernel.org; David S . Miller ; Kirsher,
> Jeffrey T ; Duyck, Alexander H
> ; Grandhi, Sainath
> ; Mahesh Bandewar 
> Subject: Re: [PATCH net-next 9/9] ipvlan: introduce individual MAC addresses
> 
> On Mon, 2017-05-08 at 15:29 +, Chiappero, Marco wrote:
> > > -Original Message-
> > > From: Jiri Benc [mailto:jb...@redhat.com]
> > > Sent: Thursday, May 4, 2017 5:44 PM
> > > To: Chiappero, Marco 
> > > Cc: Dan Williams ; netdev@vger.kernel.org; David S
> > > 
> > > And doing some kind of weird MAC NAT in ipvlan just to satisfy
> > > broken tools that can't cope with multiple interfaces with the same
> > > MAC address is wrong, too.
> >
> > Ipvlan has always had the MAC issue regardless; these tools simply
> > make it more apparent. And as I said already, whether they are broken
> 
> If we're talking about the slaves being unable to change when the parent MAC
> changes, everyone agrees that was a bug.
> 
> If we are talking about the "all slaves have the same MAC" then that's not an
> "issue", that's the way it's designed.  Whether that design is the best design
> possible is debatable, but that's the way it currently is.

I'm referring to the latter, the former is obvious. That "design" looks like a
temporary workaround to me; in fact, a couple of TODOs in the code lead me to
believe it is. They should be removed, at the very least.

> > is debatable (though I have yet to read a reasonable motivation). At the
> 
> Existing tools are already broken for bond slaves and a couple existing 
> drivers,
> see below.
> 
> Note that making the MACs unique would break DHCP functionality, because
> now the MAC address encoded in the 'chaddr' field of the DHCP packet would
> no longer correspond to the MAC address of the device that DHCP replies should
> be received on.  The userspace process writes the 'chaddr' field, and the
> userspace process would see the unique (and
> incorrect) MAC address.

Fair point. However, despite not fixing the issues with DHCP, it would be in
no way worse than it is now (even though I admit I don't much like the
workaround). BTW, I don't know about the ISC upstream version, but on
Debian/Ubuntu the -B flag is not available, which makes ipvlan+DHCP non-viable
on a very large number of deployments.

> > very least their expectation to have unique addresses on the same
> > broadcast domain is hardly arguable. Should ipvlan be considered special?
> > Again, questionable.
> 
> bond slaves
> drivers/net/ethernet/toshiba/ps3_gelic_net.c
> drivers/s390/net/qeth_l3_main.c
> 
> already all have the same MAC address for different netdevices.  Yeah, not 
> many
> people have PS3s anymore, but s390 qeth is fairly widely used.
>  Just pointing out that ipvlan is hardly the first device to have encountered 
> this
> issue, or to have solved it this way.  qeth does pretty much the same thing as
> ipvlan.
> 
> I'm not arguing that ipvlan is perfect.  I'm just arguing that in its current 
> form
> (eg, virtualized L2 device) making this change is incorrect.

Don't get me wrong, I understand and appreciate your lengthy reply, even though 
the fact that a poor solution is already included elsewhere doesn't make it any 
better.
However, regardless of the uniqueness topic, I don't feel it is completely
right to change the whole world upstairs to make it ipvlan-aware either,
unless there is a real and coherent differentiation (e.g. L3-only
interfaces). I'll drop considering ipvlan as an option for now.


Regards,
Marco

--
Intel Research and Development Ireland Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263


This e-mail and any attachments may contain confidential material for the sole
use of the intended recipient(s). Any review or distribution by others is
strictly prohibited. If you are not the intended recipient, please contact the
sender and delete all copies.


Re: [PATCH net v2] driver: vrf: Fix one possible use-after-free issue

2017-05-09 Thread David Miller
From: gfree.w...@vip.163.com
Date: Tue,  9 May 2017 18:27:33 +0800

> @@ -989,6 +989,7 @@ static u32 vrf_fib_table(const struct net_device *dev)
>  
>  static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff 
> *skb)
>  {
> + kfree_skb(skb);
>   return 0;
>  }
>  
> @@ -998,7 +999,7 @@ static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int 
> hook,
>  {
>   struct net *net = dev_net(dev);
>  
> - if (NF_HOOK(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) < 0)
> + if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
>   skb = NULL;/* kfree_skb(skb) handled by nf code */
>  
>   return skb;

Indeed, this fixes the immediate problem with NF_STOLEN.

Making NF_STOLEN fully functional is another story; we'd need to stack
this all together properly:

static int __ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff 
*skb)
{
 ...
}

static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return l3mdev_ip_rcv(skb, __ip_rcv_finish);
}
...
static inline
struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb,
  int (*okfn)(struct net *, struct sock *, struct 
sk_buff *))
{
return l3mdev_l3_rcv(skb, okfn, AF_INET);
}

etc. but that's going to really add a kink to the receive path,
microbenchmark wise.


Re: bpf pointer alignment validation

2017-05-09 Thread David Miller
From: Daniel Borkmann 
Date: Mon, 08 May 2017 12:49:25 +0200

> Could you also add test cases specifically to this for test_verifier
> in bpf selftests? I'm thinking of the cases when we have no pkt id
> and offset originated from reg->off (accumulated through const imm
> ops on reg) and insn->off, where we had i) no pkt id and ii) a
> specific pkt id (so we can probe for aux_off_align rejection as well).
> I believe we do have coverage to some extend in some of the tests
> (more on the map_value_adj though), but it would be good to keep
> tracking this specifically as well.

So I created a new test that uses the verifier, but in a new way.

The thing I wanted to do is match on verifier dump strings so that I
can test what the verifier thinks about the known alignment et al. of
various registers after the execution of certain instructions.

I accomplish this by doing two things:

1) If the log level of 2 or greater is given to the verifier, I dump
   the internal state to the log after every instruction.

2) A new BPF library helper called bpf_verify_program() is added which
   always passes the log to the BPF system call and uses a log level
   of 2.

Then in my "test_align" I have BPF programs as well as strings to
match against in the verifier dump.
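
Usage-wise the new helper is meant to be called like a normal program load,
but with a mandatory log buffer (sketch; the final signature may differ
from what lands in tools/lib/bpf):

    char log[65536];
    int fd;

    fd = bpf_verify_program(BPF_PROG_TYPE_SCHED_CLS, insns, insn_cnt,
                            "GPL", 0, log, sizeof(log));
    /* then strstr() the log for each expected verifier line */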

The whole thing is below, and writing the test cases certainly helped
me refine the implementation quite a bit.

I'll keep adding some more tests and hacking on this.  It just
occurred to me that it would be extremely valuable to be able to turn
on strict alignment requirements even on architectures that do not
need it.

That way anyone adding regressions to the alignment code, or hitting
new cases the code can't currently handle, would notice immediately
regardless of the type of system they run the test case on.

To be quite honest, strict alignment might not be a bad default either
to give helpful feedback to people writing eBPF programs.
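
One possible shape for such a knob is a flag on the verifier env that
forces the strict path even where the architecture tolerates unaligned
access (sketch; the flag name is made up here):

    static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
                                       const struct bpf_reg_state *reg,
                                       int off, int size)
    {
            if (!env->strict_alignment &&       /* hypothetical flag */
                IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                    return 0;
            /* ... the min_align/aux_off_align checks from the patch ... */
            return 0;
    }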

---
 include/linux/bpf_verifier.h |   3 +
 kernel/bpf/verifier.c| 104 -
 tools/lib/bpf/bpf.c  |  20 ++
 tools/lib/bpf/bpf.h  |   4 +
 tools/testing/selftests/bpf/Makefile |   4 +-
 tools/testing/selftests/bpf/test_align.c | 378 +++
 6 files changed, 500 insertions(+), 13 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_align.c

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5efb4db..7c6a519 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -40,6 +40,9 @@ struct bpf_reg_state {
 */
s64 min_value;
u64 max_value;
+   u32 min_align;
+   u32 aux_off;
+   u32 aux_off_align;
 };
 
 enum bpf_stack_slot_type {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c2ff608..2b1b06e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -241,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state 
*state)
if (reg->max_value != BPF_REGISTER_MAX_RANGE)
verbose(",max_value=%llu",
(unsigned long long)reg->max_value);
+   if (reg->min_align)
+   verbose(",min_align=%u", reg->min_align);
+   if (reg->aux_off)
+   verbose(",aux_off=%u", reg->aux_off);
+   if (reg->aux_off_align)
+   verbose(",aux_off_align=%u", reg->aux_off_align);
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -455,6 +461,9 @@ static void init_reg_state(struct bpf_reg_state *regs)
regs[i].imm = 0;
regs[i].min_value = BPF_REGISTER_MIN_RANGE;
regs[i].max_value = BPF_REGISTER_MAX_RANGE;
+   regs[i].min_align = 0;
+   regs[i].aux_off = 0;
+   regs[i].aux_off_align = 0;
}
 
/* frame pointer */
@@ -483,11 +492,17 @@ static void reset_reg_range_values(struct bpf_reg_state 
*regs, u32 regno)
regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
 }
 
+static void reset_reg_align(struct bpf_reg_state *regs, u32 regno)
+{
+   regs[regno].min_align = 0;
+}
+
 static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
 u32 regno)
 {
mark_reg_unknown_value(regs, regno);
reset_reg_range_values(regs, regno);
+   reset_reg_align(regs, regno);
 }
 
 enum reg_arg_type {
@@ -770,13 +785,24 @@ static bool is_pointer_value(struct bpf_verifier_env 
*env, int regno)
 static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
   int off, int size)
 {
-   if (reg->id && size != 1) {
-   verbose("Unknown alignment. Only byte-sized access allowed in 
packet access.\n");
-   return -EACCES;
+   int reg_off;
+
+

Re: mlx5 endpoint driver problem

2017-05-09 Thread Joao Pinto

Hi again Saeed,

At 6:44 PM on 5/9/2017, Saeed Mahameed wrote:
> On Tue, May 9, 2017 at 8:38 PM, Joao Pinto  wrote:
>> Hi Saeed,
>>
>> At 6:35 PM on 5/9/2017, Saeed Mahameed wrote:
>>> On Tue, May 9, 2017 at 7:25 PM, Joao Pinto  wrote:
 Hello,

 I am making tests with a Mellanox MLX5 Endpoint, and I am getting kernel 
 hangs
 when trying to enable the hca:

 mlx5_core :01:00.0: enabling device ( -> 0002)
 mlx5_core :01:00.0: Warning: couldn't set 64-bit PCI DMA mask
 mlx5_core :01:00.0: Warning: couldn't set 64-bit consistent PCI DMA 
 mask
 mlx5_core :01:00.0: firmware version: 16.19.21102
 INFO: task swapper:1 blocked for more than 10 seconds.
   Not tainted 4.11.0-BETAMSIX1 #51
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 swapper D0 1  0 0x

 Stack Trace:
   __switch_to+0x0/0x94
   __schedule+0x1da/0x8b0
   schedule+0x26/0x6c
   schedule_timeout+0x2da/0x380
   wait_for_completion+0x92/0x104
   mlx5_cmd_exec+0x70e/0xd60
   mlx5_load_one+0x1b4/0xad8
   init_one+0x404/0x600
   pci_device_probe+0x122/0x1f0
   really_probe+0x1ac/0x348
   __driver_attach+0xa8/0xd0
   bus_for_each_dev+0x3c/0x74
   bus_add_driver+0xc2/0x184
   driver_register+0x50/0xec
   init+0x40/0x60

 (...)

 Stack Trace:
   __switch_to+0x0/0x94
   __schedule+0x1da/0x8b0
   schedule+0x26/0x6c
   schedule_timeout+0x2da/0x380
   wait_for_completion+0x92/0x104
   mlx5_cmd_exec+0x70e/0xd60
   mlx5_load_one+0x1b4/0xad8
   init_one+0x404/0x600
   pci_device_probe+0x122/0x1f0
   really_probe+0x1ac/0x348
   __driver_attach+0xa8/0xd0
   bus_for_each_dev+0x3c/0x74
   bus_add_driver+0xc2/0x184
   driver_register+0x50/0xec
   init+0x40/0x60
 mlx5_core :01:00.0: wait_func:882:(pid 1): ENABLE_HCA(0x104) timeout. 
 Will
 cause a leak of a command resource
 mlx5_core :01:00.0: enable hca failed
 mlx5_core :01:00.0: mlx5_load_one failed with error code -110
 mlx5_core: probe of :01:00.0 failed with error -110

 Could you give me a clue of what might be happening?

>>>
>>> Hi Joao,
>>>
>>> Looks like FW is not responding, most likely due to the DMA mask
>>> setting warnings, which architecture is this ?
>>>
 Thanks,
 Joao
>>
>> I am working with a 32-bit ARC processor based board, connected to a 
>> prototyped
>> Gen4 PCI RC.
>>
> 
> Ok, I will consult with our PCI and FW experts and get back to you.
> 
> please note that the current mlx5 driver was never tested on 32-bit
> architecture and might not work properly for you.

I have new data for you. My colleague is using a Mellanox MT27800 Family
(ConnectX-5) with Firmware version 16.19.148, and it does not hang, but it
fails in the CPU mask allocation:

mlx5_core :01:00.0: enabling device ( -> 0002)
mlx5_core :01:00.0: Warning: couldn't set 64-bit PCI DMA mask
mlx5_core :01:00.0: Warning: couldn't set 64-bit consistent PCI DMA mask
mlx5_core :01:00.0: firmware version: 16.19.148
mlx5_core :01:00.0: Port module event: module 0, Cable unplugged
mlx5_core :01:00.0: mlx5_irq_set_affinity_hint:628:(pid 1):
irq_set_affinity_hint failed,irq 0x0032
mlx5_core :01:00.0: Failed to alloc affinity hint cpumask
mlx5_core :01:00.0: mlx5_load_one failed with error code -22
mlx5_core: probe of :01:00.0 failed with error -22

Mine is a Mellanox MT28800 Family (ConnectX-5) with Firmware Version 
16.19.21102.

Hope it gives more data for analysis.

Thanks,
Joao

> 
>> Thanks,
>> Joao
>>
>>
>>



Re: [PATCH 0/2] net: Set maximum receive packet size on veth interfaces

2017-05-09 Thread David Miller
From: Fredrik Markstrom 
Date: Tue,  9 May 2017 14:44:36 +0200

> Currently veth drops all packets larger then the mtu set on the
> receiving end of the pair. This is inconsistent with most hardware
> ethernet drivers.

False.

In fact, many pieces of ethernet hardware are not physically capable of
sending even VLAN packets that are above the normal base ethernet MTU.

It is therefore untenable to remove this restriction universally like
this.


Re: mlx5 endpoint driver problem

2017-05-09 Thread Saeed Mahameed
On Tue, May 9, 2017 at 8:38 PM, Joao Pinto  wrote:
> Hi Saeed,
>
> At 6:35 PM on 5/9/2017, Saeed Mahameed wrote:
>> On Tue, May 9, 2017 at 7:25 PM, Joao Pinto  wrote:
>>> Hello,
>>>
>>> I am making tests with a Mellanox MLX5 Endpoint, and I am getting kernel 
>>> hangs
>>> when trying to enable the hca:
>>>
>>> mlx5_core :01:00.0: enabling device ( -> 0002)
>>> mlx5_core :01:00.0: Warning: couldn't set 64-bit PCI DMA mask
>>> mlx5_core :01:00.0: Warning: couldn't set 64-bit consistent PCI DMA mask
>>> mlx5_core :01:00.0: firmware version: 16.19.21102
>>> INFO: task swapper:1 blocked for more than 10 seconds.
>>>   Not tainted 4.11.0-BETAMSIX1 #51
>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>> swapper D0 1  0 0x
>>>
>>> Stack Trace:
>>>   __switch_to+0x0/0x94
>>>   __schedule+0x1da/0x8b0
>>>   schedule+0x26/0x6c
>>>   schedule_timeout+0x2da/0x380
>>>   wait_for_completion+0x92/0x104
>>>   mlx5_cmd_exec+0x70e/0xd60
>>>   mlx5_load_one+0x1b4/0xad8
>>>   init_one+0x404/0x600
>>>   pci_device_probe+0x122/0x1f0
>>>   really_probe+0x1ac/0x348
>>>   __driver_attach+0xa8/0xd0
>>>   bus_for_each_dev+0x3c/0x74
>>>   bus_add_driver+0xc2/0x184
>>>   driver_register+0x50/0xec
>>>   init+0x40/0x60
>>>
>>> (...)
>>>
>>> Stack Trace:
>>>   __switch_to+0x0/0x94
>>>   __schedule+0x1da/0x8b0
>>>   schedule+0x26/0x6c
>>>   schedule_timeout+0x2da/0x380
>>>   wait_for_completion+0x92/0x104
>>>   mlx5_cmd_exec+0x70e/0xd60
>>>   mlx5_load_one+0x1b4/0xad8
>>>   init_one+0x404/0x600
>>>   pci_device_probe+0x122/0x1f0
>>>   really_probe+0x1ac/0x348
>>>   __driver_attach+0xa8/0xd0
>>>   bus_for_each_dev+0x3c/0x74
>>>   bus_add_driver+0xc2/0x184
>>>   driver_register+0x50/0xec
>>>   init+0x40/0x60
>>> mlx5_core :01:00.0: wait_func:882:(pid 1): ENABLE_HCA(0x104) timeout. 
>>> Will
>>> cause a leak of a command resource
>>> mlx5_core :01:00.0: enable hca failed
>>> mlx5_core :01:00.0: mlx5_load_one failed with error code -110
>>> mlx5_core: probe of :01:00.0 failed with error -110
>>>
>>> Could you give me a clue of what might be happening?
>>>
>>
>> Hi Joao,
>>
>> Looks like FW is not responding, most likely due to the DMA mask
>> setting warnings, which architecture is this ?
>>
>>> Thanks,
>>> Joao
>
> I am working with a 32-bit ARC processor based board, connected to a 
> prototyped
> Gen4 PCI RC.
>

Ok, I will consult with our PCI and FW experts and get back to you.

please note that the current mlx5 driver was never tested on 32-bit
architecture and might not work properly for you.

> Thanks,
> Joao
>
>
>


Re: mlx5 endpoint driver problem

2017-05-09 Thread Joao Pinto
Hi Saeed,

At 6:35 PM on 5/9/2017, Saeed Mahameed wrote:
> On Tue, May 9, 2017 at 7:25 PM, Joao Pinto  wrote:
>> Hello,
>>
>> I am making tests with a Mellanox MLX5 Endpoint, and I am getting kernel 
>> hangs
>> when trying to enable the hca:
>>
>> mlx5_core :01:00.0: enabling device ( -> 0002)
>> mlx5_core :01:00.0: Warning: couldn't set 64-bit PCI DMA mask
>> mlx5_core :01:00.0: Warning: couldn't set 64-bit consistent PCI DMA mask
>> mlx5_core :01:00.0: firmware version: 16.19.21102
>> INFO: task swapper:1 blocked for more than 10 seconds.
>>   Not tainted 4.11.0-BETAMSIX1 #51
>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> swapper D0 1  0 0x
>>
>> Stack Trace:
>>   __switch_to+0x0/0x94
>>   __schedule+0x1da/0x8b0
>>   schedule+0x26/0x6c
>>   schedule_timeout+0x2da/0x380
>>   wait_for_completion+0x92/0x104
>>   mlx5_cmd_exec+0x70e/0xd60
>>   mlx5_load_one+0x1b4/0xad8
>>   init_one+0x404/0x600
>>   pci_device_probe+0x122/0x1f0
>>   really_probe+0x1ac/0x348
>>   __driver_attach+0xa8/0xd0
>>   bus_for_each_dev+0x3c/0x74
>>   bus_add_driver+0xc2/0x184
>>   driver_register+0x50/0xec
>>   init+0x40/0x60
>>
>> (...)
>>
>> Stack Trace:
>>   __switch_to+0x0/0x94
>>   __schedule+0x1da/0x8b0
>>   schedule+0x26/0x6c
>>   schedule_timeout+0x2da/0x380
>>   wait_for_completion+0x92/0x104
>>   mlx5_cmd_exec+0x70e/0xd60
>>   mlx5_load_one+0x1b4/0xad8
>>   init_one+0x404/0x600
>>   pci_device_probe+0x122/0x1f0
>>   really_probe+0x1ac/0x348
>>   __driver_attach+0xa8/0xd0
>>   bus_for_each_dev+0x3c/0x74
>>   bus_add_driver+0xc2/0x184
>>   driver_register+0x50/0xec
>>   init+0x40/0x60
>> mlx5_core :01:00.0: wait_func:882:(pid 1): ENABLE_HCA(0x104) timeout. 
>> Will
>> cause a leak of a command resource
>> mlx5_core :01:00.0: enable hca failed
>> mlx5_core :01:00.0: mlx5_load_one failed with error code -110
>> mlx5_core: probe of :01:00.0 failed with error -110
>>
>> Could you give me a clue of what might be happening?
>>
> 
> Hi Joao,
> 
> Looks like FW is not responding, most likely due to the DMA mask
> setting warnings, which architecture is this ?
> 
>> Thanks,
>> Joao

I am working with a 32-bit ARC processor based board, connected to a prototyped
Gen4 PCI RC.

Thanks,
Joao





Re: mlx5 endpoint driver problem

2017-05-09 Thread Saeed Mahameed
On Tue, May 9, 2017 at 7:25 PM, Joao Pinto  wrote:
> Hello,
>
> I am making tests with a Mellanox MLX5 Endpoint, and I am getting kernel hangs
> when trying to enable the hca:
>
> mlx5_core :01:00.0: enabling device ( -> 0002)
> mlx5_core :01:00.0: Warning: couldn't set 64-bit PCI DMA mask
> mlx5_core :01:00.0: Warning: couldn't set 64-bit consistent PCI DMA mask
> mlx5_core :01:00.0: firmware version: 16.19.21102
> INFO: task swapper:1 blocked for more than 10 seconds.
>   Not tainted 4.11.0-BETAMSIX1 #51
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> swapper D0 1  0 0x
>
> Stack Trace:
>   __switch_to+0x0/0x94
>   __schedule+0x1da/0x8b0
>   schedule+0x26/0x6c
>   schedule_timeout+0x2da/0x380
>   wait_for_completion+0x92/0x104
>   mlx5_cmd_exec+0x70e/0xd60
>   mlx5_load_one+0x1b4/0xad8
>   init_one+0x404/0x600
>   pci_device_probe+0x122/0x1f0
>   really_probe+0x1ac/0x348
>   __driver_attach+0xa8/0xd0
>   bus_for_each_dev+0x3c/0x74
>   bus_add_driver+0xc2/0x184
>   driver_register+0x50/0xec
>   init+0x40/0x60
>
> (...)
>
> Stack Trace:
>   __switch_to+0x0/0x94
>   __schedule+0x1da/0x8b0
>   schedule+0x26/0x6c
>   schedule_timeout+0x2da/0x380
>   wait_for_completion+0x92/0x104
>   mlx5_cmd_exec+0x70e/0xd60
>   mlx5_load_one+0x1b4/0xad8
>   init_one+0x404/0x600
>   pci_device_probe+0x122/0x1f0
>   really_probe+0x1ac/0x348
>   __driver_attach+0xa8/0xd0
>   bus_for_each_dev+0x3c/0x74
>   bus_add_driver+0xc2/0x184
>   driver_register+0x50/0xec
>   init+0x40/0x60
> mlx5_core :01:00.0: wait_func:882:(pid 1): ENABLE_HCA(0x104) timeout. Will
> cause a leak of a command resource
> mlx5_core :01:00.0: enable hca failed
> mlx5_core :01:00.0: mlx5_load_one failed with error code -110
> mlx5_core: probe of :01:00.0 failed with error -110
>
> Could you give me a clue of what might be happening?
>

Hi Joao,

Looks like FW is not responding, most likely due to the DMA mask
setting warnings. Which architecture is this?

> Thanks,
> Joao
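
For reference, the usual probe-time idiom behind those DMA-mask warnings (a
generic kernel sketch, not the mlx5 code, though mlx5 does something similar
in its own DMA setup) is to retry with a 32-bit mask when the 64-bit one is
rejected:

        static int set_dma_masks(struct pci_dev *pdev)
        {
                int err;

                /* Prefer 64-bit DMA; fall back to 32-bit if unsupported. */
                err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
                if (err) {
                        dev_warn(&pdev->dev, "no 64-bit DMA, trying 32-bit\n");
                        err = dma_set_mask_and_coherent(&pdev->dev,
                                                        DMA_BIT_MASK(32));
                        if (err)
                                dev_err(&pdev->dev, "no usable DMA mask\n");
                }
                return err;
        }

If the fallback leaves the device unable to reach its command queues in host
memory, the firmware may never see the ENABLE_HCA command, which would be
consistent with the timeout above.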


Re: [PATCH net v2] driver: vrf: Fix one possible use-after-free issue

2017-05-09 Thread Florian Westphal
David Ahern  wrote:
> On 5/9/17 3:27 AM, gfree.w...@vip.163.com wrote:
> > diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
> > index ceda586..db88249 100644
> > --- a/drivers/net/vrf.c
> > +++ b/drivers/net/vrf.c
> > @@ -989,6 +989,7 @@ static u32 vrf_fib_table(const struct net_device *dev)
> >  
> >  static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff 
> > *skb)
> >  {
> > +   kfree_skb(skb);
> > return 0;
> >  }
> >  
> > @@ -998,7 +999,7 @@ static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned 
> > int hook,
> >  {
> > struct net *net = dev_net(dev);
> >  
> > -   if (NF_HOOK(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) < 0)
> > +   if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
> > skb = NULL;/* kfree_skb(skb) handled by nf code */
> >  
> > return skb;
> > 
> 
> I'm clearly misunderstanding something ...
> 
> With the current code:
> - nf_hook returns 1, NF_HOOK invokes vrf_rcv_finish as the okfn, it
> returns 0, skb passes on.
> 
> - nf_hook returns 0, vrf_rcv_finish has been called by the nf_hook tree,
> vrf_rcv_finish returns 0, skb passes on

Yes, that's a bug. The skb could have been queued to userspace, or
stolen by a hook.

It is illegal to use the skb after NF_HOOK() no matter the return value.
The okfn has to do further processing.

If nf_hook() is used instead, only a return value of 1 means the skb is
still valid.

In the < 0 case it was free'd, in the 0 case it's in an unknown state
(usually queued or free'd).

As for the patch, it avoids an skb leak on userspace reinject, but nfqueue
still won't work as no reinject is possible (vrf_rcv_finish is a sink that
doesn't do anything).

Hope this clarifies things.
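
To illustrate that contract, a minimal sketch (hypothetical code, not the vrf
driver; finish_fn stands in for whatever okfn does the follow-up work):

        static struct sk_buff *rcv_nfhook(u8 pf, unsigned int hook,
                                          struct net *net, struct sk_buff *skb,
                                          struct net_device *dev)
        {
                int ret = nf_hook(pf, hook, net, NULL, skb, dev, NULL,
                                  finish_fn);

                if (ret != 1)
                        return NULL;    /* freed, queued or stolen by a hook */
                return skb;             /* accepted synchronously, still ours */
        }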


Requirements for a shutdown function?

2017-05-09 Thread Timur Tabi
I'm trying to add a platform_driver.shutdown function to my Ethernet driver
(drivers/net/ethernet/qualcomm/emac/*), but I can't find any definitive
information as to what a network driver shutdown callback is supposed to do.
I also don't know what testcase I should use to verify that my function is
working.

I see only four instances of a platform_driver.shutdown function in
drivers/net/ethernet:

$ git grep -A 20 -w platform_driver | grep '\.shutdown'
apm/xgene-v2/main.c-.shutdown = xge_shutdown,
apm/xgene/xgene_enet_main.c-.shutdown = xgene_enet_shutdown,
marvell/mv643xx_eth.c-  .shutdown   = mv643xx_eth_shutdown,
marvell/pxa168_eth.c-   .shutdown = pxa168_eth_shutdown,

(Other shutdown functions are for pci_driver.shutdown).

For the xgene drivers, the shutdown function just calls the 'remove'
function.  Isn't that overkill?  Why bother with a shutdown function if it's
just the same thing as removing the driver outright?

mv643xx_eth_shutdown() seems more reasonable.  All it does is halt the TX
and RX queues.

pxa168_eth_shutdown() is a little more heavyweight: it halts the queues,
stops the DMA, and calls phy_stop().
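
A minimal sketch of that lighter-weight style (hypothetical, modeled on the
mv643xx/pxa168 approach, not the actual emac code):

        /* Hypothetical .shutdown: quiesce the NIC so DMA has stopped
         * before the machine reboots or kexecs into a new kernel.
         */
        static void example_eth_shutdown(struct platform_device *pdev)
        {
                struct net_device *ndev = platform_get_drvdata(pdev);

                if (ndev && netif_running(ndev)) {
                        netif_device_detach(ndev);      /* stop the TX path */
                        /* driver-specific: halt RX/TX DMA, phy_stop(), ... */
                }
        }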

Can anyone help me figure out what my driver really should do?

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 09:44 -0700, Cong Wang wrote:

> 
> Eric, how did you produce it?
> I guess it's because of nh_dev which is the only netdevice pointer inside
> fib_info. Let me take a deeper look.
> 

Nothing particular, I am using kexec to boot new kernels, and all my
attempts with your patch included demonstrated the issue.

eth0 is a bonding device, it might matter, I do not know.

We also have some tunnels, but unfortunately I can not provide a setup
that you could use on say a VM.

I can send you the .config if this can help

> >>
> >> I am assuming you are quite confident it is this change?
> >
> > At least, reverting the patch resolves the issue for me.
> >
> > Keeping fib (and their reference to netdev) is apparently too much,
> > we probably need to implement a refcount on the metrics themselves,
> > being stand alone objects.
> 
> I don't disagree, just that it may need to change too much code which
> goes beyond a stable candidate.

Well, your choice, but dealing with a full blown fib and its
dependencies looks fragile to me.
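
A sketch of what stand-alone refcounted metrics could look like (purely
illustrative, an assumption about the direction, not code from the tree):

        /* Metrics carrying their own refcount, so a dst can hold them
         * without pinning the whole fib_info (and its netdev).
         */
        struct dst_metrics {
                u32             metrics[RTAX_MAX];
                atomic_t        refcnt;
        };

        static void dst_metrics_put(struct dst_metrics *m)
        {
                if (atomic_dec_and_test(&m->refcnt))
                        kfree(m);
        }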





[PATCH net-next] net: stmmac: use correct pointer when printing normal descriptor ring

2017-05-09 Thread Niklas Cassel
From: Niklas Cassel 

Signed-off-by: Niklas Cassel 
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index cd8c60132390..a74c481401c4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3725,7 +3725,7 @@ static void sysfs_display_ring(void *head, int size, int 
extend_desc,
ep++;
} else {
seq_printf(seq, "%d [0x%x]: 0x%x 0x%x 0x%x 0x%x\n",
-  i, (unsigned int)virt_to_phys(ep),
+  i, (unsigned int)virt_to_phys(p),
   le32_to_cpu(p->des0), le32_to_cpu(p->des1),
   le32_to_cpu(p->des2), le32_to_cpu(p->des3));
p++;
-- 
2.11.0



Re: [PATCH net v2] driver: vrf: Fix one possible use-after-free issue

2017-05-09 Thread David Ahern
On 5/9/17 3:27 AM, gfree.w...@vip.163.com wrote:
> diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
> index ceda586..db88249 100644
> --- a/drivers/net/vrf.c
> +++ b/drivers/net/vrf.c
> @@ -989,6 +989,7 @@ static u32 vrf_fib_table(const struct net_device *dev)
>  
>  static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff 
> *skb)
>  {
> + kfree_skb(skb);
>   return 0;
>  }
>  
> @@ -998,7 +999,7 @@ static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int 
> hook,
>  {
>   struct net *net = dev_net(dev);
>  
> - if (NF_HOOK(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) < 0)
> + if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
>   skb = NULL;/* kfree_skb(skb) handled by nf code */
>  
>   return skb;
> 

I'm clearly misunderstanding something ...

With the current code:
- nf_hook returns 1, NF_HOOK invokes vrf_rcv_finish as the okfn, it
returns 0, skb passes on.

- nf_hook returns 0, vrf_rcv_finish has been called by the nf_hook tree,
vrf_rcv_finish returns 0, skb passes on

- nf_hook returns < 0,  vrf_rcv_finish is not called, skb is freed by
netfilter code, vrf_rcv_nfhook returns NULL

What am I missing?

With the above, if nf_hook returns 1, vrf_rcv_finish is not called.




Re: [Patch net] ipv4: restore rt->fi for reference counting

2017-05-09 Thread Cong Wang
On Mon, May 8, 2017 at 7:18 PM, Eric Dumazet  wrote:
> On Mon, 2017-05-08 at 21:22 -0400, David Miller wrote:
>> From: Eric Dumazet 
>> Date: Mon, 08 May 2017 17:01:20 -0700
>>
>> > On Mon, 2017-05-08 at 14:35 -0400, David Miller wrote:
>> >> From: Cong Wang 
>> >> Date: Thu,  4 May 2017 14:54:17 -0700
>> >>
>> >> > IPv4 dst could use fi->fib_metrics to store metrics but fib_info
>> >> > itself is refcnt'ed, so without taking a refcnt fi and
>> >> > fi->fib_metrics could be freed while dst metrics still points to
>> >> > it. This triggers use-after-free as reported by Andrey twice.
>> >> >
>> >> > This patch reverts commit 2860583fe840 ("ipv4: Kill rt->fi") to
>> >> > restore this reference counting. It is a quick fix for -net and
>> >> > -stable, for -net-next, as Eric suggested, we can consider doing
>> >> > reference counting for metrics itself instead of relying on fib_info.
>> >> >
>> >> > IPv6 is very different, it copies or steals the metrics from mx6_config
>> >> > in fib6_commit_metrics() so probably doesn't need a refcnt.
>> >> >
>> >> > Decnet has already done the refcnt'ing, see dn_fib_semantic_match().
>> >> >
>> >> > Fixes: 2860583fe840 ("ipv4: Kill rt->fi")
>> >> > Reported-by: Andrey Konovalov 
>> >> > Tested-by: Andrey Konovalov 
>> >> > Signed-off-by: Cong Wang 
>> >>
>> >> Applied and queued up for -stable, thanks.
>> >
>> > Although I now have on latest net tree these messages when I reboot my
>> > test machine.
>> >
>> > [  224.085873] unregister_netdevice: waiting for eth0 to become free. 
>> > Usage count = 43
>>
>> Strange, the refcounting looks quite OK in the patch you're quoting.
>> I looked over it a few times and cannot figure out a possible cause
>> there.


Eric, how did you produce it?
I guess it's because of nh_dev which is the only netdevice pointer inside
fib_info. Let me take a deeper look.

>>
>> I am assuming you are quite confident it is this change?
>
> At least, reverting the patch resolves the issue for me.
>
> Keeping fib (and their reference to netdev) is apparently too much,
> we probably need to implement a refcount on the metrics themselves,
> being stand alone objects.

I don't disagree, just that it may need to change too much code which
goes beyond a stable candidate.

Thanks for the bug report!


mlx5 endpoint driver problem

2017-05-09 Thread Joao Pinto
Hello,

I am making tests with a Mellanox MLX5 Endpoint, and I am getting kernel hangs
when trying to enable the hca:

mlx5_core :01:00.0: enabling device ( -> 0002)
mlx5_core :01:00.0: Warning: couldn't set 64-bit PCI DMA mask
mlx5_core :01:00.0: Warning: couldn't set 64-bit consistent PCI DMA mask
mlx5_core :01:00.0: firmware version: 16.19.21102
INFO: task swapper:1 blocked for more than 10 seconds.
  Not tainted 4.11.0-BETAMSIX1 #51
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
swapper D0 1  0 0x

Stack Trace:
  __switch_to+0x0/0x94
  __schedule+0x1da/0x8b0
  schedule+0x26/0x6c
  schedule_timeout+0x2da/0x380
  wait_for_completion+0x92/0x104
  mlx5_cmd_exec+0x70e/0xd60
  mlx5_load_one+0x1b4/0xad8
  init_one+0x404/0x600
  pci_device_probe+0x122/0x1f0
  really_probe+0x1ac/0x348
  __driver_attach+0xa8/0xd0
  bus_for_each_dev+0x3c/0x74
  bus_add_driver+0xc2/0x184
  driver_register+0x50/0xec
  init+0x40/0x60

(...)

Stack Trace:
  __switch_to+0x0/0x94
  __schedule+0x1da/0x8b0
  schedule+0x26/0x6c
  schedule_timeout+0x2da/0x380
  wait_for_completion+0x92/0x104
  mlx5_cmd_exec+0x70e/0xd60
  mlx5_load_one+0x1b4/0xad8
  init_one+0x404/0x600
  pci_device_probe+0x122/0x1f0
  really_probe+0x1ac/0x348
  __driver_attach+0xa8/0xd0
  bus_for_each_dev+0x3c/0x74
  bus_add_driver+0xc2/0x184
  driver_register+0x50/0xec
  init+0x40/0x60
mlx5_core :01:00.0: wait_func:882:(pid 1): ENABLE_HCA(0x104) timeout. Will
cause a leak of a command resource
mlx5_core :01:00.0: enable hca failed
mlx5_core :01:00.0: mlx5_load_one failed with error code -110
mlx5_core: probe of :01:00.0 failed with error -110

Could you give me a clue of what might be happening?

Thanks,
Joao


[PATCH] netxen_nic: set rcode to the return status from the call to netxen_issue_cmd

2017-05-09 Thread Colin King
From: Colin Ian King 

Currently rcode is being initialized to NX_RCODE_SUCCESS and later it
is checked to see if it is not NX_RCODE_SUCCESS, which is never true. It
appears that the assignment of rcode from the return of the call to
netxen_issue_cmd() was unintentionally dropped in an earlier fix, so add
it back in.

Detected by CoverityScan, CID#401900 ("Logically dead code")

Fixes: 2dcd5d95ad6b2 ("netxen_nic: fix cdrp race condition")
Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c 
b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c
index b8d5270359cd..e30676515529 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c
@@ -247,7 +247,7 @@ nx_fw_cmd_set_mtu(struct netxen_adapter *adapter, int mtu)
cmd.req.arg3 = 0;
 
if (recv_ctx->state == NX_HOST_CTX_STATE_ACTIVE)
-   netxen_issue_cmd(adapter, &cmd);
+   rcode = netxen_issue_cmd(adapter, &cmd);
 
if (rcode != NX_RCODE_SUCCESS)
return -EIO;
-- 
2.11.0



Re: [PATCH 0/2] net: Set maximum receive packet size on veth interfaces

2017-05-09 Thread Stephen Hemminger
On Tue,  9 May 2017 14:44:36 +0200
Fredrik Markstrom  wrote:

> Currently veth drops all packets larger than the MTU set on the receiving
> end of the pair. This is inconsistent with most hardware ethernet drivers.

There is no guarantee that packets larger than MTU + VLAN tag will be received
by hardware drivers. So why is this necessary for veth? What is your special
use case which makes this necessary?


openvswitch MTU patch needed in 4.10 stable

2017-05-09 Thread Stephen Hemminger
Could you queue the patch to stable?

Begin forwarded message:

Date: Tue, 09 May 2017 08:21:46 +
From: bugzilla-dae...@bugzilla.kernel.org
To: step...@networkplumber.org
Subject: [Bug 195695] New: openvswitch: Set internal device max mtu to 
ETH_MAX_MTU


https://bugzilla.kernel.org/show_bug.cgi?id=195695

            Bug ID: 195695
           Summary: openvswitch: Set internal device max mtu to ETH_MAX_MTU
           Product: Networking
           Version: 2.5
    Kernel Version: 4.10
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: normal
          Priority: P1
         Component: Other
          Assignee: step...@networkplumber.org
          Reporter: en...@basis-consulting.com
        Regression: No

Cannot use jumbo frames with openvswitch bridge in kernel 4.10. This is fixed
in kernel 4.11:

Commit 91572088e3fd ("net: use core MTU range checking in core net
infra") changed the openvswitch internal device to use the core net
infra for controlling the MTU range, but failed to actually set the
max_mtu as described in the commit message, which now defaults to
ETH_DATA_LEN.

This patch fixes this by setting max_mtu to ETH_MAX_MTU after
ether_setup() call.

Fixes: 91572088e3fd ("net: use core MTU range checking in core net infra")
Signed-off-by: Jarno Rajahalme 
Signed-off-by: David S. Miller 

Diffstat (limited to 'net/openvswitch/vport-internal_dev.c'):

 net/openvswitch/vport-internal_dev.c | 2 ++
 1 file changed, 2 insertions(+), 0 deletions(-)
diff --git a/net/openvswitch/vport-internal_dev.c
b/net/openvswitch/vport-internal_dev.c
index 09141a1..89193a6 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -149,6 +149,8 @@ static void do_setup(struct net_device *netdev)
 {
ether_setup(netdev);

+   netdev->max_mtu = ETH_MAX_MTU;
+
netdev->netdev_ops = &internal_dev_netdev_ops;

netdev->priv_flags &= ~IFF_TX_SKB_SHARING;

Br.
Endre Vaade

-- 
You are receiving this mail because:
You are the assignee for the bug.


Re: [PATCH RFC net-next 0/6] net: reducing memory footprint of network devices

2017-05-09 Thread David Ahern
On 5/9/17 2:50 AM, Nicolas Dichtel wrote:
> Your initial patch tried to make those interfaces transparent; this is not
> the case anymore here. It would probably be useful to be able to filter those
> interfaces in the kernel during a dump.

The earlier email was for hidden devices; the intent there is to hide
certain devices (e.g., switch control netdevs) from user dumps by default.

Adding an attribute at create time such as IFF_INVISIBLE for such
devices would be a follow on to this set - but leveraging the same
sysctl and sysfs bypasses.


Re: [PATCH net 0/5] qed*: General fixes

2017-05-09 Thread David Miller
From: Yuval Mintz 
Date: Tue, 9 May 2017 15:07:46 +0300

> This series contain several fixes for qed and qede.
> 
>  - #1 [and ~#5] relate to XDP cleanups
>  - #2 and #5 correct VF behavior
>  - #3 and #4 fix and add missing configurations needed for RoCE & storage
> 
> Dave,
> 
> Please consider applying the series to 'net'.

Series applied, thank you.


Re: [PATCH net 0/3] mlx4 misc fixes

2017-05-09 Thread David Miller
From: Tariq Toukan 
Date: Tue,  9 May 2017 14:45:21 +0300

> This patchset contains misc bug fixes from the team
> to the mlx4 Core and Eth drivers.
> 
> Series generated against net commit:
> 32f1bc0f3d26 Revert "ipv4: restore rt->fi for reference counting"

Series applied, thanks Tariq.



Re: [PATCH] net: dsa: loop: Check for memory allocation failure

2017-05-09 Thread Joe Perches
On Mon, 2017-05-08 at 17:35 -0700, Florian Fainelli wrote:
> On 05/08/2017 04:46 PM, Julia Lawall wrote:
> > On Mon, 8 May 2017, Joe Perches wrote:
> > > Each time -EPROBE_DEFER occurs, another set of calls to
> > > dsa_switch_alloc and dev_kzalloc also occurs.
> > > 
> > > Perhaps it'd be better to do:
> > > 
> > >   if (!ps->netdev) {
> > >           devm_kfree(&mdiodev->dev, ps);
> > >           devm_kfree(&mdiodev->dev, ds);
> > >           return -EPROBE_DEFER;
> > >   }
> > 
> > Is EPROBE_DEFER handled differently than other kinds of errors?
> 
> In the core device driver model, yes, EPROBE_DEFER is treated
> differently than other errors because it puts the driver on a retry queue.
> 
> EPROBE_DEFER is already a slow and exceptional path, and this is a
> mock-up driver, so I am not sure what value there is in trying to
> balance devm_kzalloc() with corresponding devm_kfree()...

Example code should be as correct as possible.
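
A sketch of the balanced pattern under discussion (example_probe,
example_priv and the example_find_netdev lookup are hypothetical; the
mdiodev naming is assumed from the driver's probe signature):

        static int example_probe(struct mdio_device *mdiodev)
        {
                struct dsa_switch *ds;
                struct example_priv *ps;

                ds = dsa_switch_alloc(&mdiodev->dev, DSA_MAX_PORTS);
                if (!ds)
                        return -ENOMEM;

                ps = devm_kzalloc(&mdiodev->dev, sizeof(*ps), GFP_KERNEL);
                if (!ps)
                        return -ENOMEM;

                if (!example_find_netdev(ps)) {
                        /* Probe will be retried; free the devm allocations
                         * now so repeated deferrals don't accumulate memory.
                         */
                        devm_kfree(&mdiodev->dev, ps);
                        devm_kfree(&mdiodev->dev, ds);
                        return -EPROBE_DEFER;
                }
                return 0;
        }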


Re: [PATCH net] ipv6: make sure dev is not NULL before call ip6_frag_reasm

2017-05-09 Thread Eric Dumazet
On Tue, 2017-05-09 at 21:40 +0800, Hangbin Liu wrote:

> 
> I saw we checked the dev in this function
> 
>   dev = skb->dev;
>   if (dev) {
>           fq->iif = dev->ifindex;
>           skb->dev = NULL;
>   }
> 
> and upper caller ipv6_frag_rcv()
> 
>   fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
>                skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
> 
> 
> Apologies that I did not do enough research to make sure whether skb->dev
> can be NULL or not. I will check soon and reply when I get
> confirmation.

If really having a NULL dev is possible, I would rather change things
this way, as your fix has side effects.

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 
e1da5b888cc4901711d573075f8ae4eada7f086e..6c0a2b74ba705cbe13b4e7522d958a9c3d395c29
 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -77,7 +77,7 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 static struct inet_frags ip6_frags;
 
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
- struct net_device *dev);
+ struct net_device *dev, struct inet6_dev *idev);
 
 /*
  * callers should be careful not to use the hash value outside the ipfrag_lock
@@ -209,6 +209,7 @@ fq_find(struct net *net, __be32 id, const struct in6_addr 
*src,
 static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
   struct frag_hdr *fhdr, int nhoff)
 {
+   struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
struct sk_buff *prev, *next;
struct net_device *dev;
int offset, end, fragsize;
@@ -223,8 +224,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct 
sk_buff *skb,
((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
 
if ((unsigned int)end > IPV6_MAXPLEN) {
-   __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-   IPSTATS_MIB_INHDRERRORS);
+   __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
  ((u8 *)>frag_off -
   skb_network_header(skb)));
@@ -258,8 +258,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct 
sk_buff *skb,
/* RFC2460 says always send parameter problem in
 * this case. -DaveM
 */
-   __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-   IPSTATS_MIB_INHDRERRORS);
+   __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
  offsetof(struct ipv6hdr, 
payload_len));
return -1;
@@ -354,7 +353,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct 
sk_buff *skb,
unsigned long orefdst = skb->_skb_refdst;
 
skb->_skb_refdst = 0UL;
-   res = ip6_frag_reasm(fq, prev, dev);
+   res = ip6_frag_reasm(fq, prev, dev, idev);
skb->_skb_refdst = orefdst;
return res;
}
@@ -365,8 +364,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct 
sk_buff *skb,
 discard_fq:
inet_frag_kill(&fq->q, &ip6_frags);
 err:
-   __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-   IPSTATS_MIB_REASMFAILS);
+   __IP6_INC_STATS(net, idev, IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
return -1;
 }
@@ -381,7 +379,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct 
sk_buff *skb,
  * the last and the first frames arrived and all the bits are here.
  */
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
- struct net_device *dev)
+ struct net_device *dev, struct inet6_dev *idev)
 {
struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
struct sk_buff *fp, *head = fq->q.fragments;
@@ -505,9 +503,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct 
sk_buff *prev,
skb_postpush_rcsum(head, skb_network_header(head),
   skb_network_header_len(head));
 
-   rcu_read_lock();
-   __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
-   rcu_read_unlock();
+   __IP6_INC_STATS(net, idev, IPSTATS_MIB_REASMOKS);
fq->q.fragments = NULL;
fq->q.fragments_tail = NULL;
return 1;




Re: DQL and TCQ_F_CAN_BYPASS destroy performance under virtualizaiton (Was: "Re: net_sched strange in 4.11")

2017-05-09 Thread Stefan Hajnoczi
On Tue, May 09, 2017 at 08:46:46AM +0100, Anton Ivanov wrote:
> I have figured it out. Two issues.
> 
> 1) skb->xmit_more is hardly ever set under virtualization because the qdisc
> is usually bypassed because of TCQ_F_CAN_BYPASS. Once TCQ_F_CAN_BYPASS is
> set a virtual NIC driver is not likely see skb->xmit_more (this answers my
> "how does this work at all" question).
> 
> 2) If that flag is turned off (I patched sched_generic to turn it off in
> pfifo_fast while testing), DQL keeps xmit_more from being set. If the driver
> is not DQL enabled xmit_more is never ever set. If the driver is DQL enabled
> the queue is adjusted to ensure xmit_more stops happening within 10-15 xmit
> cycles.
> 
> That is plain *wrong* for virtual NICs - virtio, emulated NICs, etc. There,
> the BIG cost is telling the hypervisor that it needs to "kick" the packets.
> The cost of putting them into the vNIC buffers is negligible. You want
> xmit_more to happen - it makes between 50% and 300% (depending on vNIC
> design) difference. If there is no xmit_more the vNIC will immediately
> "kick" the hypervisor and try to signal that  the packet needs to move
> straight away (as for example in virtio_net).
> 
> In addition to that, the perceived line rate is proportional to this cost,
> so I am not sure that the current dql math holds. In fact, I think it does
> not - it is trying to adjust something which influences the perceived line
> rate.
> 
> So - how do we turn off BOTH bypass and DQL adjustment while under
> virtualization and set them to be "always qdisc" + "always xmit_more
> allowed"?
> 
> A.
> 
> P.S. Cc-ing virtio maintainer

CCing Michael Tsirkin and Jason Wang, who are the core virtio and
virtio-net maintainers.  (I maintain the vsock driver - it's unrelated
to this discussion.)

> 
> A.
> 
> 
> On 08/05/17 08:15, Anton Ivanov wrote:
> > Hi all,
> > 
> > I was revising some of my old work for UML to prepare it for submission
> > and I noticed that skb->xmit_more does not seem to be set any more.
> > 
> > I traced the issue as far as net/sched/sch_generic.c
> > 
> > try_bulk_dequeue_skb() is never invoked (the drivers I am working on are
> > dql enabled so that is not the problem).
> > 
> > More interestingly, if I put a breakpoint and debug output into
> > dequeue_skb() around line 147 - right before the bulk: tag - the skb
> > there is always NULL. ???
> > 
> > Similarly, debug in pfifo_fast_dequeue shows only NULLs being dequeued.
> > Again - ???
> > 
> > First and foremost, I apologize for the silly question, but how can this
> > work at all? I see the skbs showing up at the driver level, why are
> > NULLs being returned at qdisc dequeue and where do the skbs at the
> > driver level come from?
> > 
> > Second, where should I look to fix it?
> > 
> > A.
> > 
> 
> 
> -- 
> Anton R. Ivanov
> 
> Cambridge Greys Limited, England company No 10273661
> http://www.cambridgegreys.com/
> 
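
For illustration, a minimal sketch of the batching Anton describes (a
hypothetical vNIC transmit path; the vnic_* helpers are made up, not from any
real driver). The expensive hypervisor "kick" is deferred while
skb->xmit_more says more packets follow:

        static netdev_tx_t vnic_start_xmit(struct sk_buff *skb,
                                           struct net_device *dev)
        {
                struct vnic_priv *priv = netdev_priv(dev);

                vnic_queue_skb(priv, skb);      /* cheap: write into the ring */

                if (!skb->xmit_more || vnic_ring_full(priv))
                        vnic_kick_hypervisor(priv); /* expensive: exit to host */

                return NETDEV_TX_OK;
        }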




Re: [PATCH 0/4] hamradio: Fine-tuning for nine function implementations

2017-05-09 Thread David Miller

You can feel free to continue submitting these changes, even though
people have asked you to back off, and even though there is little to
no value in this churn.

But I personally am not going to apply any of your changes...

Especially since you keep posting even though people are asking you to
not make these changes.

You can ignore feedback like that, and you are explicitly being
notified that as a result, we can feel free to ignore you _too_.

People who submit kernel changes in the way you do waste a lot of
people's valuable time which could be spent on fixing real bugs,
implementing new important features, adding new documentation to
improve the understanding of the kernel for everyone, etc.

But instead, that time is being invested to reviewing your extremely
low value patches, many of which are undesirable.

I will not stand for it as the networking maintainer and am going to
ignore everything you submit until your approach and attitude towards
kernel patch submission _fundamentally_ (not temporarily, or for one
specific set of patches) changes.

Thank you.


RE: sky2: Use seq_putc() in sky2_debug_show()

2017-05-09 Thread David Laight
From: Stephen Hemminger
> Sent: 09 May 2017 06:50
> On Mon, 8 May 2017 19:42:46 +0200
> SF Markus Elfring  wrote:
> 
> > > Which issue do you mean? I dont see any issue you fix here.
> >
> > Are the run time characteristics a bit nicer for the function seq_putc
> > in comparison to the function seq_puts for printing a single line break 
> > here?
> >
> > Regards,
> > Markus
> 
> I would put this in the "why bother" category. seq_puts is correct, and this
> is only diagnostic output, useful to a developer and disabled on most distro
> kernels.

Sometimes consistency is best.
Output everything with seq_printf(), using a format "%s" if necessary.
The performance really doesn't matter here at all.

It is also (probably) possible to get gcc to do the conversions - as it does 
for printf().
(A right PITA for embedded systems where only printf() exists.)

David
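
For illustration, the three seq helpers side by side (a trivial sketch, not
from the sky2 driver):

        static int demo_show(struct seq_file *seq, void *v)
        {
                seq_printf(seq, "%s\n", "general");     /* parses a format string */
                seq_puts(seq, "plain string\n");        /* no format parsing */
                seq_putc(seq, '\n');                    /* single character */
                return 0;
        }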



[PATCH 4/4] hamradio: Adjust four function calls together with a variable assignment

2017-05-09 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 9 May 2017 15:57:17 +0200

The script "checkpatch.pl" pointed information out like the following.

ERROR: do not use assignment in if condition

Thus fix affected source code places.

Signed-off-by: Markus Elfring 
---
 drivers/net/hamradio/bpqether.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index eaa0f2e8e561..5e234e0ca256 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -185,7 +185,8 @@ static int bpq_rcv(struct sk_buff *skb, struct net_device 
*dev, struct packet_ty
if (!net_eq(dev_net(dev), _net))
goto drop;
 
-   if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+   skb = skb_share_check(skb, GFP_ATOMIC);
+   if (!skb)
return NET_RX_DROP;
 
if (!pskb_may_pull(skb, sizeof(struct ethhdr)))
@@ -286,7 +287,8 @@ static netdev_tx_t bpq_xmit(struct sk_buff *skb, struct 
net_device *dev)
bpq = netdev_priv(dev);
 
orig_dev = dev;
-   if ((dev = bpq_get_ether_dev(dev)) == NULL) {
+   dev = bpq_get_ether_dev(dev);
+   if (!dev) {
orig_dev->stats.tx_dropped++;
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -565,12 +567,14 @@ static int bpq_device_event(struct notifier_block *this,
break;
 
case NETDEV_DOWN:   /* ethernet device closed -> close BPQ 
interface */
-   if ((dev = bpq_get_ax25_dev(dev)) != NULL)
+   dev = bpq_get_ax25_dev(dev);
+   if (dev)
dev_close(dev);
break;
 
case NETDEV_UNREGISTER: /* ethernet device removed -> free BPQ 
interface */
-   if ((dev = bpq_get_ax25_dev(dev)) != NULL)
+   dev = bpq_get_ax25_dev(dev);
+   if (dev)
bpq_free_device(dev);
break;
default:
-- 
2.12.2



Re: Marvell phy errata origins?

2017-05-09 Thread Andrew Lunn
> According to Marvell this was an erratum for the 88M1101, and should not be
> applied to any other PHY. So we should remove these lines, make a special
> aneg for the 88M1101, and then restore everything that doesn't need this
> back to the generic aneg.

Hi Daniel

Thanks for finding this out. Can you roll a patch?

   Andrew


[PATCH 3/4] hamradio: Use seq_puts() in bpq_seq_show()

2017-05-09 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 9 May 2017 15:45:09 +0200

A string which does not contain a data format specification does not need
seq_printf(). Thus use the corresponding function "seq_puts" instead.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring 
---
 drivers/net/hamradio/bpqether.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index f62e7f325cf9..eaa0f2e8e561 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -434,7 +434,7 @@ static int bpq_seq_show(struct seq_file *seq, void *v)
bpqdev->dest_addr);
 
if (is_multicast_ether_addr(bpqdev->acpt_addr))
-   seq_printf(seq, "*\n");
+   seq_puts(seq, "*\n");
else
seq_printf(seq, "%pM\n", bpqdev->acpt_addr);
 
-- 
2.12.2



[PATCH 2/4] hamradio: Adjust four function calls together with a variable assignment

2017-05-09 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 9 May 2017 15:15:16 +0200

The script "checkpatch.pl" pointed information out like the following.

ERROR: do not use assignment in if condition

Thus fix affected source code places.
Improve a size determination.

Signed-off-by: Markus Elfring 
---
 drivers/net/hamradio/yam.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index 542f1e511df1..c792b0f116a5 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -401,7 +401,8 @@ static unsigned char *add_mcs(unsigned char *bits, int 
bitrate,
}
 
/* Allocate a new mcs */
-   if ((p = kmalloc(sizeof(struct yam_mcs), GFP_KERNEL)) == NULL) {
+   p = kmalloc(sizeof(*p), GFP_KERNEL);
+   if (!p) {
release_firmware(fw);
return NULL;
}
@@ -549,7 +550,8 @@ static inline void yam_rx_flag(struct net_device *dev, 
struct yam_port *yp)
if ((yp->rx_crch & yp->rx_crcl) != 0xFF) {
/* Bad crc */
} else {
-   if (!(skb = dev_alloc_skb(pkt_len))) {
+   skb = dev_alloc_skb(pkt_len);
+   if (!skb) {
printk(KERN_WARNING "%s: memory squeeze, 
dropping packet\n", dev->name);
++dev->stats.rx_dropped;
} else {
@@ -670,7 +672,8 @@ static void yam_tx_byte(struct net_device *dev, struct 
yam_port *yp)
break;
case TX_HEAD:
if (--yp->tx_count <= 0) {
-   if (!(skb = skb_dequeue(&yp->send_queue))) {
+   skb = skb_dequeue(&yp->send_queue);
+   if (!skb) {
ptt_off(dev);
yp->tx_state = TX_OFF;
break;
@@ -879,7 +882,8 @@ static int yam_open(struct net_device *dev)
printk(KERN_ERR "%s: cannot 0x%lx busy\n", dev->name, 
dev->base_addr);
return -EACCES;
}
-   if ((u = yam_check_uart(dev->base_addr)) == c_uart_unknown) {
+   u = yam_check_uart(dev->base_addr);
+   if (u == c_uart_unknown) {
printk(KERN_ERR "%s: cannot find uart type\n", dev->name);
ret = -EIO;
goto out_release_base;
-- 
2.12.2


