Re: [PATCH 1/3] tuntap: rx batching

2016-11-15 Thread Jason Wang



On 2016年11月15日 11:41, Michael S. Tsirkin wrote:

On Tue, Nov 15, 2016 at 11:14:48AM +0800, Jason Wang wrote:

>
>
>On 2016年11月12日 00:20, Michael S. Tsirkin wrote:

> >On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:

> > >
> > >On 2016年11月11日 12:17, John Fastabend wrote:

> > > >On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:

> > > > > >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:

> > > > > > > >
> > > > > > > >On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

> > > > > > > > > >On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

> > > > > > > > > > > >Backlog were used for tuntap rx, but it can only process 
1 packet at
> > > > > > > > > > > >one time since it was scheduled during sendmsg() 
synchronously in
> > > > > > > > > > > >process context. This lead bad cache utilization so this 
patch tries
> > > > > > > > > > > >to do some batching before call rx NAPI. This is done 
through:
> > > > > > > > > > > >
> > > > > > > > > > > >- accept MSG_MORE as a hint from sendmsg() caller, if it 
was set,
> > > > > > > > > > > > batch the packet temporarily in a linked list and 
submit them all
> > > > > > > > > > > > once MSG_MORE were cleared.
> > > > > > > > > > > >- implement a tuntap specific NAPI handler for 
processing this kind of
> > > > > > > > > > > > possible batching. (This could be done by extending 
backlog to
> > > > > > > > > > > > support skb like, but using a tun specific one 
looks cleaner and
> > > > > > > > > > > > easier for future extension).
> > > > > > > > > > > >
> > > > > > > > > > > >Signed-off-by: Jason Wang

> > > > > > > > > >So why do we need an extra queue?

> > > > > > > >The idea was borrowed from backlog to allow some kind of bulking 
and avoid
> > > > > > > >spinlock on each dequeuing.
> > > > > > > >

> > > > > > > > > >This is not what hardware devices do.
> > > > > > > > > >How about adding the packet to queue unconditionally, 
deferring
> > > > > > > > > >signalling until we get sendmsg without MSG_MORE?

> > > > > > > >Then you need touch spinlock when dequeuing each packet.

> > > >Random thought, I have a cmpxchg ring I am using for the qdisc work that
> > > >could possibly replace the spinlock implementation. I haven't figured
> > > >out the resizing API yet because I did not need it but I assume it could
> > > >help here and let you dequeue multiple skbs in one operation.
> > > >
> > > >I can post the latest version if useful or an older version is
> > > >somewhere on patchworks as well.
> > > >
> > > >.John
> > > >
> > > >

> > >Look useful here, and I can compare the performance if you post.
> > >
> > >A question is can we extend the skb_array to support that?
> > >
> > >Thanks

> >I'd like to start with simple patch adding napi with one queue, then add
> >optimization patches on top.

>
>The point is tun is using backlog who uses two queues (process_queue and
>input_pkt_queue).
>
>How about something like:
>
>1) NAPI support with skb_array

I would start with just a write-queue linked list. It all runs on a single
CPU normally,


True for virt, but I'm not sure about the others. If we have multiple senders 
at the same time, the current code scales very well.



  so the nice reductions of cache line bounces due to skb
array should never materialize.

While we are at it, limiting the size of the queue might
be a good idea. Kind of like TUNSETSNDBUF but 1. actually
working where instead of tracking packets within net stack
we make sndbuf track the internal buffer


I get your point; I will start from a simple skb list.

Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-15 Thread Jason Wang



On 2016年11月15日 11:41, Michael S. Tsirkin wrote:

On Tue, Nov 15, 2016 at 11:14:48AM +0800, Jason Wang wrote:

>
>
>On 2016年11月12日 00:20, Michael S. Tsirkin wrote:

> >On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:

> > >
> > >On 2016年11月11日 12:17, John Fastabend wrote:

> > > >On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:

> > > > > >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:

> > > > > > > >
> > > > > > > >On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

> > > > > > > > > >On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

> > > > > > > > > > > >Backlog were used for tuntap rx, but it can only process 
1 packet at
> > > > > > > > > > > >one time since it was scheduled during sendmsg() 
synchronously in
> > > > > > > > > > > >process context. This lead bad cache utilization so this 
patch tries
> > > > > > > > > > > >to do some batching before call rx NAPI. This is done 
through:
> > > > > > > > > > > >
> > > > > > > > > > > >- accept MSG_MORE as a hint from sendmsg() caller, if it 
was set,
> > > > > > > > > > > > batch the packet temporarily in a linked list and 
submit them all
> > > > > > > > > > > > once MSG_MORE were cleared.
> > > > > > > > > > > >- implement a tuntap specific NAPI handler for 
processing this kind of
> > > > > > > > > > > > possible batching. (This could be done by extending 
backlog to
> > > > > > > > > > > > support skb like, but using a tun specific one 
looks cleaner and
> > > > > > > > > > > > easier for future extension).
> > > > > > > > > > > >
> > > > > > > > > > > >Signed-off-by: Jason Wang

> > > > > > > > > >So why do we need an extra queue?

> > > > > > > >The idea was borrowed from backlog to allow some kind of bulking 
and avoid
> > > > > > > >spinlock on each dequeuing.
> > > > > > > >

> > > > > > > > > >This is not what hardware devices do.
> > > > > > > > > >How about adding the packet to queue unconditionally, 
deferring
> > > > > > > > > >signalling until we get sendmsg without MSG_MORE?

> > > > > > > >Then you need touch spinlock when dequeuing each packet.

> > > >Random thought, I have a cmpxchg ring I am using for the qdisc work that
> > > >could possibly replace the spinlock implementation. I haven't figured
> > > >out the resizing API yet because I did not need it but I assume it could
> > > >help here and let you dequeue multiple skbs in one operation.
> > > >
> > > >I can post the latest version if useful or an older version is
> > > >somewhere on patchworks as well.
> > > >
> > > >.John
> > > >
> > > >

> > >Look useful here, and I can compare the performance if you post.
> > >
> > >A question is can we extend the skb_array to support that?
> > >
> > >Thanks

> >I'd like to start with simple patch adding napi with one queue, then add
> >optimization patches on top.

>
>The point is tun is using backlog who uses two queues (process_queue and
>input_pkt_queue).
>
>How about something like:
>
>1) NAPI support with skb_array

I would start with just a write-queue linked list. It all runs on a single
CPU normally,


True for virt, but I'm not sure about the others. If we have multiple senders 
at the same time, the current code scales very well.



  so the nice reductions of cache line bounces due to skb
array should never materialize.

While we are at it, limiting the size of the queue might
be a good idea. Kind of like TUNSETSNDBUF but 1. actually
working where instead of tracking packets within net stack
we make sndbuf track the internal buffer


I get your point; I will start from a simple skb list.

Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-14 Thread Michael S. Tsirkin
On Tue, Nov 15, 2016 at 11:14:48AM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
> > On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:
> > > 
> > > On 2016年11月11日 12:17, John Fastabend wrote:
> > > > On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> > > > > > On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> > > > > > > > 
> > > > > > > > On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > > > > > > > > > On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > > > > > > > > > > Backlog were used for tuntap rx, but it can only 
> > > > > > > > > > > > process 1 packet at
> > > > > > > > > > > > one time since it was scheduled during sendmsg() 
> > > > > > > > > > > > synchronously in
> > > > > > > > > > > > process context. This lead bad cache utilization so 
> > > > > > > > > > > > this patch tries
> > > > > > > > > > > > to do some batching before call rx NAPI. This is done 
> > > > > > > > > > > > through:
> > > > > > > > > > > > 
> > > > > > > > > > > > - accept MSG_MORE as a hint from sendmsg() caller, if 
> > > > > > > > > > > > it was set,
> > > > > > > > > > > > batch the packet temporarily in a linked list and 
> > > > > > > > > > > > submit them all
> > > > > > > > > > > > once MSG_MORE were cleared.
> > > > > > > > > > > > - implement a tuntap specific NAPI handler for 
> > > > > > > > > > > > processing this kind of
> > > > > > > > > > > > possible batching. (This could be done by extending 
> > > > > > > > > > > > backlog to
> > > > > > > > > > > > support skb like, but using a tun specific one 
> > > > > > > > > > > > looks cleaner and
> > > > > > > > > > > > easier for future extension).
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Jason Wang
> > > > > > > > > > So why do we need an extra queue?
> > > > > > > > The idea was borrowed from backlog to allow some kind of 
> > > > > > > > bulking and avoid
> > > > > > > > spinlock on each dequeuing.
> > > > > > > > 
> > > > > > > > > >This is not what hardware devices do.
> > > > > > > > > > How about adding the packet to queue unconditionally, 
> > > > > > > > > > deferring
> > > > > > > > > > signalling until we get sendmsg without MSG_MORE?
> > > > > > > > Then you need touch spinlock when dequeuing each packet.
> > > > Random thought, I have a cmpxchg ring I am using for the qdisc work that
> > > > could possibly replace the spinlock implementation. I haven't figured
> > > > out the resizing API yet because I did not need it but I assume it could
> > > > help here and let you dequeue multiple skbs in one operation.
> > > > 
> > > > I can post the latest version if useful or an older version is
> > > > somewhere on patchworks as well.
> > > > 
> > > > .John
> > > > 
> > > > 
> > > Look useful here, and I can compare the performance if you post.
> > > 
> > > A question is can we extend the skb_array to support that?
> > > 
> > > Thanks
> > I'd like to start with simple patch adding napi with one queue, then add
> > optimization patches on top.
> 
> The point is tun is using backlog who uses two queues (process_queue and
> input_pkt_queue).
> 
> How about something like:
> 
> 1) NAPI support with skb_array

I would start with just write queue linked list. It all runs on a single
CPU normally, so the nice reductions of cache line bounces due to skb
array should never materialize.

While we are at it, limiting the size of the queue might
be a good idea. Kind of like TUNSETSNDBUF but 1. actually
working where instead of tracking packets within net stack
we make sndbuf track the internal buffer


> 2) MSG_MORE support
> 3) other optimizations on top
> 
> ?
> 
> > 
> > One issue that comes to mind is that write queue limits
> > are byte based, they do not count packets unlike tun rx queue.
> 
> I'm not sure I get the issue, the write queue is not exported and only used
> for batching. We probably need an internal limit in tun to avoid an OOM
> attack from the guest.
> 
> Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-14 Thread Michael S. Tsirkin
On Tue, Nov 15, 2016 at 11:14:48AM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
> > On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:
> > > 
> > > On 2016年11月11日 12:17, John Fastabend wrote:
> > > > On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> > > > > > On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> > > > > > > > 
> > > > > > > > On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > > > > > > > > > On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > > > > > > > > > > Backlog were used for tuntap rx, but it can only 
> > > > > > > > > > > > process 1 packet at
> > > > > > > > > > > > one time since it was scheduled during sendmsg() 
> > > > > > > > > > > > synchronously in
> > > > > > > > > > > > process context. This lead bad cache utilization so 
> > > > > > > > > > > > this patch tries
> > > > > > > > > > > > to do some batching before call rx NAPI. This is done 
> > > > > > > > > > > > through:
> > > > > > > > > > > > 
> > > > > > > > > > > > - accept MSG_MORE as a hint from sendmsg() caller, if 
> > > > > > > > > > > > it was set,
> > > > > > > > > > > > batch the packet temporarily in a linked list and 
> > > > > > > > > > > > submit them all
> > > > > > > > > > > > once MSG_MORE were cleared.
> > > > > > > > > > > > - implement a tuntap specific NAPI handler for 
> > > > > > > > > > > > processing this kind of
> > > > > > > > > > > > possible batching. (This could be done by extending 
> > > > > > > > > > > > backlog to
> > > > > > > > > > > > support skb like, but using a tun specific one 
> > > > > > > > > > > > looks cleaner and
> > > > > > > > > > > > easier for future extension).
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Jason Wang
> > > > > > > > > > So why do we need an extra queue?
> > > > > > > > The idea was borrowed from backlog to allow some kind of 
> > > > > > > > bulking and avoid
> > > > > > > > spinlock on each dequeuing.
> > > > > > > > 
> > > > > > > > > >This is not what hardware devices do.
> > > > > > > > > > How about adding the packet to queue unconditionally, 
> > > > > > > > > > deferring
> > > > > > > > > > signalling until we get sendmsg without MSG_MORE?
> > > > > > > > Then you need touch spinlock when dequeuing each packet.
> > > > Random thought, I have a cmpxchg ring I am using for the qdisc work that
> > > > could possibly replace the spinlock implementation. I haven't figured
> > > > out the resizing API yet because I did not need it but I assume it could
> > > > help here and let you dequeue multiple skbs in one operation.
> > > > 
> > > > I can post the latest version if useful or an older version is
> > > > somewhere on patchworks as well.
> > > > 
> > > > .John
> > > > 
> > > > 
> > > Look useful here, and I can compare the performance if you post.
> > > 
> > > A question is can we extend the skb_array to support that?
> > > 
> > > Thanks
> > I'd like to start with simple patch adding napi with one queue, then add
> > optimization patches on top.
> 
> The point is tun is using backlog who uses two queues (process_queue and
> input_pkt_queue).
> 
> How about something like:
> 
> 1) NAPI support with skb_array

I would start with just write queue linked list. It all runs on a single
CPU normally, so the nice reductions of cache line bounces due to skb
array should never materialize.

While we are at it, limiting the size of the queue might
be a good idea. Kind of like TUNSETSNDBUF but 1. actually
working where instead of tracking packets within net stack
we make sndbuf track the internal buffer


> 2) MSG_MORE support
> 3) other optimizations on top
> 
> ?
> 
> > 
> > One issue that comes to mind is that write queue limits
> > are byte based, they do not count packets unlike tun rx queue.
> 
> I'm not sure I get the issue, the write queue is not exported and only used
> for batching. We probably need an internal limit in tun to avoid an OOM
> attack from the guest.
> 
> Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-14 Thread Jason Wang



On 2016年11月12日 00:20, Michael S. Tsirkin wrote:

On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:


On 2016年11月11日 12:17, John Fastabend wrote:

On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:

On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:


On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

Backlog were used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This lead bad cache utilization so this patch tries
to do some batching before call rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
batch the packet temporarily in a linked list and submit them all
once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
possible batching. (This could be done by extending backlog to
support skb like, but using a tun specific one looks cleaner and
easier for future extension).

Signed-off-by: Jason Wang

So why do we need an extra queue?

The idea was borrowed from backlog to allow some kind of bulking and avoid
spinlock on each dequeuing.


   This is not what hardware devices do.
How about adding the packet to queue unconditionally, deferring
signalling until we get sendmsg without MSG_MORE?

Then you need touch spinlock when dequeuing each packet.

Random thought, I have a cmpxchg ring I am using for the qdisc work that
could possibly replace the spinlock implementation. I haven't figured
out the resizing API yet because I did not need it but I assume it could
help here and let you dequeue multiple skbs in one operation.

I can post the latest version if useful or an older version is
somewhere on patchworks as well.

.John



Looks useful here, and I can compare the performance if you post.

A question is can we extend the skb_array to support that?

Thanks

I'd like to start with simple patch adding napi with one queue, then add
optimization patches on top.


The point is tun is using backlog who uses two queues (process_queue and 
input_pkt_queue).


How about something like:

1) NAPI support with skb_array
2) MSG_MORE support
3) other optimizations on top

?



One issue that comes to mind is that write queue limits
are byte based, they do not count packets unlike tun rx queue.


I'm not sure I get the issue; the write queue is not exported and only used 
for batching. We probably need an internal limit in tun to avoid an OOM 
attack from the guest.


Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-14 Thread Jason Wang



On 2016年11月12日 00:20, Michael S. Tsirkin wrote:

On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:


On 2016年11月11日 12:17, John Fastabend wrote:

On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:

On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:


On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

Backlog were used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This lead bad cache utilization so this patch tries
to do some batching before call rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
batch the packet temporarily in a linked list and submit them all
once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
possible batching. (This could be done by extending backlog to
support skb like, but using a tun specific one looks cleaner and
easier for future extension).

Signed-off-by: Jason Wang

So why do we need an extra queue?

The idea was borrowed from backlog to allow some kind of bulking and avoid
spinlock on each dequeuing.


   This is not what hardware devices do.
How about adding the packet to queue unconditionally, deferring
signalling until we get sendmsg without MSG_MORE?

Then you need touch spinlock when dequeuing each packet.

Random thought, I have a cmpxchg ring I am using for the qdisc work that
could possibly replace the spinlock implementation. I haven't figured
out the resizing API yet because I did not need it but I assume it could
help here and let you dequeue multiple skbs in one operation.

I can post the latest version if useful or an older version is
somewhere on patchworks as well.

.John



Looks useful here, and I can compare the performance if you post.

A question is can we extend the skb_array to support that?

Thanks

I'd like to start with simple patch adding napi with one queue, then add
optimization patches on top.


The point is tun is using backlog who uses two queues (process_queue and 
input_pkt_queue).


How about something like:

1) NAPI support with skb_array
2) MSG_MORE support
3) other optimizations on top

?



One issue that comes to mind is that write queue limits
are byte based, they do not count packets unlike tun rx queue.


I'm not sure I get the issue; the write queue is not exported and only used 
for batching. We probably need an internal limit in tun to avoid an OOM 
attack from the guest.


Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-11 Thread Michael S. Tsirkin
On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月11日 12:17, John Fastabend wrote:
> > On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> > > >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> > > > >>
> > > > >>
> > > > >>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > > > > >>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > > > > Backlog were used for tuntap rx, but it can only process 1 
> > > > > > packet at
> > > > > > one time since it was scheduled during sendmsg() synchronously 
> > > > > > in
> > > > > > process context. This lead bad cache utilization so this patch 
> > > > > > tries
> > > > > > to do some batching before call rx NAPI. This is done through:
> > > > > > 
> > > > > > - accept MSG_MORE as a hint from sendmsg() caller, if it was 
> > > > > > set,
> > > > > > batch the packet temporarily in a linked list and submit 
> > > > > >  them all
> > > > > > once MSG_MORE were cleared.
> > > > > > - implement a tuntap specific NAPI handler for processing this 
> > > > > > kind of
> > > > > > possible batching. (This could be done by extending backlog 
> > > > > >  to
> > > > > > support skb like, but using a tun specific one looks 
> > > > > >  cleaner and
> > > > > > easier for future extension).
> > > > > > 
> > > > > > Signed-off-by: Jason Wang
> > > > > >>>So why do we need an extra queue?
> > > > >>
> > > > >>The idea was borrowed from backlog to allow some kind of bulking and 
> > > > >>avoid
> > > > >>spinlock on each dequeuing.
> > > > >>
> > > > > >>>   This is not what hardware devices do.
> > > > > >>>How about adding the packet to queue unconditionally, deferring
> > > > > >>>signalling until we get sendmsg without MSG_MORE?
> > > > >>
> > > > >>Then you need touch spinlock when dequeuing each packet.
> > > >
> > Random thought, I have a cmpxchg ring I am using for the qdisc work that
> > could possibly replace the spinlock implementation. I haven't figured
> > out the resizing API yet because I did not need it but I assume it could
> > help here and let you dequeue multiple skbs in one operation.
> > 
> > I can post the latest version if useful or an older version is
> > somewhere on patchworks as well.
> > 
> > .John
> > 
> > 
> 
> Look useful here, and I can compare the performance if you post.
> 
> A question is can we extend the skb_array to support that?
> 
> Thanks

I'd like to start with simple patch adding napi with one queue, then add
optimization patches on top.

One issue that comes to mind is that write queue limits
are byte based, they do not count packets unlike tun rx queue.



-- 
MST


Re: [PATCH 1/3] tuntap: rx batching

2016-11-11 Thread Michael S. Tsirkin
On Fri, Nov 11, 2016 at 12:28:38PM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月11日 12:17, John Fastabend wrote:
> > On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> > > >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> > > > >>
> > > > >>
> > > > >>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > > > > >>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > > > > Backlog were used for tuntap rx, but it can only process 1 
> > > > > > packet at
> > > > > > one time since it was scheduled during sendmsg() synchronously 
> > > > > > in
> > > > > > process context. This lead bad cache utilization so this patch 
> > > > > > tries
> > > > > > to do some batching before call rx NAPI. This is done through:
> > > > > > 
> > > > > > - accept MSG_MORE as a hint from sendmsg() caller, if it was 
> > > > > > set,
> > > > > > batch the packet temporarily in a linked list and submit 
> > > > > >  them all
> > > > > > once MSG_MORE were cleared.
> > > > > > - implement a tuntap specific NAPI handler for processing this 
> > > > > > kind of
> > > > > > possible batching. (This could be done by extending backlog 
> > > > > >  to
> > > > > > support skb like, but using a tun specific one looks 
> > > > > >  cleaner and
> > > > > > easier for future extension).
> > > > > > 
> > > > > > Signed-off-by: Jason Wang
> > > > > >>>So why do we need an extra queue?
> > > > >>
> > > > >>The idea was borrowed from backlog to allow some kind of bulking and 
> > > > >>avoid
> > > > >>spinlock on each dequeuing.
> > > > >>
> > > > > >>>   This is not what hardware devices do.
> > > > > >>>How about adding the packet to queue unconditionally, deferring
> > > > > >>>signalling until we get sendmsg without MSG_MORE?
> > > > >>
> > > > >>Then you need touch spinlock when dequeuing each packet.
> > > >
> > Random thought, I have a cmpxchg ring I am using for the qdisc work that
> > could possibly replace the spinlock implementation. I haven't figured
> > out the resizing API yet because I did not need it but I assume it could
> > help here and let you dequeue multiple skbs in one operation.
> > 
> > I can post the latest version if useful or an older version is
> > somewhere on patchworks as well.
> > 
> > .John
> > 
> > 
> 
> Look useful here, and I can compare the performance if you post.
> 
> A question is can we extend the skb_array to support that?
> 
> Thanks

I'd like to start with simple patch adding napi with one queue, then add
optimization patches on top.

One issue that comes to mind is that write queue limits
are byte based, they do not count packets unlike tun rx queue.



-- 
MST


Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread John Fastabend
On 16-11-10 08:28 PM, Jason Wang wrote:
> 
> 
> On 2016年11月11日 12:17, John Fastabend wrote:
>> On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
>>> >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
 >>
 >>
 >>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> >>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>> Backlog were used for tuntap rx, but it can only process 1
>> packet at
>> one time since it was scheduled during sendmsg() synchronously in
>> process context. This lead bad cache utilization so this patch
>> tries
>> to do some batching before call rx NAPI. This is done through:
>> 
>> - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>> batch the packet temporarily in a linked list and submit
>> them all
>> once MSG_MORE were cleared.
>> - implement a tuntap specific NAPI handler for processing this
>> kind of
>> possible batching. (This could be done by extending
>> backlog to
>> support skb like, but using a tun specific one looks
>> cleaner and
>> easier for future extension).
>> 
>> Signed-off-by: Jason Wang
> >>>So why do we need an extra queue?
 >>
 >>The idea was borrowed from backlog to allow some kind of bulking
 and avoid
 >>spinlock on each dequeuing.
 >>
> >>>   This is not what hardware devices do.
> >>>How about adding the packet to queue unconditionally, deferring
> >>>signalling until we get sendmsg without MSG_MORE?
 >>
 >>Then you need touch spinlock when dequeuing each packet.
>>> >
>> Random thought, I have a cmpxchg ring I am using for the qdisc work that
>> could possibly replace the spinlock implementation. I haven't figured
>> out the resizing API yet because I did not need it but I assume it could
>> help here and let you dequeue multiple skbs in one operation.
>>
>> I can post the latest version if useful or an older version is
>> somewhere on patchworks as well.
>>
>> .John
>>
>>
> 
> Look useful here, and I can compare the performance if you post.
> 
> A question is can we extend the skb_array to support that?
> 
> Thanks
> 

Sent out two RFC patches with the implementation, the first has been
running on my system for some time the second for multiple packets is
only lightly tested and that was awhile back.

.John


Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread John Fastabend
On 16-11-10 08:28 PM, Jason Wang wrote:
> 
> 
> On 2016年11月11日 12:17, John Fastabend wrote:
>> On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
>>> >On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
 >>
 >>
 >>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> >>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
>> Backlog were used for tuntap rx, but it can only process 1
>> packet at
>> one time since it was scheduled during sendmsg() synchronously in
>> process context. This lead bad cache utilization so this patch
>> tries
>> to do some batching before call rx NAPI. This is done through:
>> 
>> - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>> batch the packet temporarily in a linked list and submit
>> them all
>> once MSG_MORE were cleared.
>> - implement a tuntap specific NAPI handler for processing this
>> kind of
>> possible batching. (This could be done by extending
>> backlog to
>> support skb like, but using a tun specific one looks
>> cleaner and
>> easier for future extension).
>> 
>> Signed-off-by: Jason Wang
> >>>So why do we need an extra queue?
 >>
 >>The idea was borrowed from backlog to allow some kind of bulking
 and avoid
 >>spinlock on each dequeuing.
 >>
> >>>   This is not what hardware devices do.
> >>>How about adding the packet to queue unconditionally, deferring
> >>>signalling until we get sendmsg without MSG_MORE?
 >>
 >>Then you need touch spinlock when dequeuing each packet.
>>> >
>> Random thought, I have a cmpxchg ring I am using for the qdisc work that
>> could possibly replace the spinlock implementation. I haven't figured
>> out the resizing API yet because I did not need it but I assume it could
>> help here and let you dequeue multiple skbs in one operation.
>>
>> I can post the latest version if useful or an older version is
>> somewhere on patchworks as well.
>>
>> .John
>>
>>
> 
> Look useful here, and I can compare the performance if you post.
> 
> A question is can we extend the skb_array to support that?
> 
> Thanks
> 

Sent out two RFC patches with the implementation, the first has been
running on my system for some time the second for multiple packets is
only lightly tested and that was awhile back.

.John


Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread Jason Wang



On 2016年11月11日 12:17, John Fastabend wrote:

On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:

>On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:

>>
>>
>>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

>>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

Backlog were used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This lead bad cache utilization so this patch tries
to do some batching before call rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
batch the packet temporarily in a linked list and submit them all
once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
possible batching. (This could be done by extending backlog to
support skb like, but using a tun specific one looks cleaner and
easier for future extension).

Signed-off-by: Jason Wang

>>>So why do we need an extra queue?

>>
>>The idea was borrowed from backlog to allow some kind of bulking and avoid
>>spinlock on each dequeuing.
>>

>>>   This is not what hardware devices do.
>>>How about adding the packet to queue unconditionally, deferring
>>>signalling until we get sendmsg without MSG_MORE?

>>
>>Then you need touch spinlock when dequeuing each packet.

>

Random thought, I have a cmpxchg ring I am using for the qdisc work that
could possibly replace the spinlock implementation. I haven't figured
out the resizing API yet because I did not need it but I assume it could
help here and let you dequeue multiple skbs in one operation.

I can post the latest version if useful or an older version is
somewhere on patchworks as well.

.John




Look useful here, and I can compare the performance if you post.

A question is can we extend the skb_array to support that?

Thanks



Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread Jason Wang



On 2016年11月11日 12:17, John Fastabend wrote:

On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:

>On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:

>>
>>
>>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

>>>On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

Backlog were used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This lead bad cache utilization so this patch tries
to do some batching before call rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
batch the packet temporarily in a linked list and submit them all
once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
possible batching. (This could be done by extending backlog to
support skb like, but using a tun specific one looks cleaner and
easier for future extension).

Signed-off-by: Jason Wang

>>>So why do we need an extra queue?

>>
>>The idea was borrowed from backlog to allow some kind of bulking and avoid
>>spinlock on each dequeuing.
>>

>>>   This is not what hardware devices do.
>>>How about adding the packet to queue unconditionally, deferring
>>>signalling until we get sendmsg without MSG_MORE?

>>
>>Then you need touch spinlock when dequeuing each packet.

>

Random thought, I have a cmpxchg ring I am using for the qdisc work that
could possibly replace the spinlock implementation. I haven't figured
out the resizing API yet because I did not need it but I assume it could
help here and let you dequeue multiple skbs in one operation.

I can post the latest version if useful or an older version is
somewhere on patchworks as well.

.John




Look useful here, and I can compare the performance if you post.

A question is can we extend the skb_array to support that?

Thanks



Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread John Fastabend
On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
>>
>>
>> On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
>>> On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
 Backlog were used for tuntap rx, but it can only process 1 packet at
 one time since it was scheduled during sendmsg() synchronously in
 process context. This lead bad cache utilization so this patch tries
 to do some batching before call rx NAPI. This is done through:

 - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
batch the packet temporarily in a linked list and submit them all
once MSG_MORE were cleared.
 - implement a tuntap specific NAPI handler for processing this kind of
possible batching. (This could be done by extending backlog to
support skb like, but using a tun specific one looks cleaner and
easier for future extension).

 Signed-off-by: Jason Wang 
>>> So why do we need an extra queue?
>>
>> The idea was borrowed from backlog to allow some kind of bulking and avoid
>> spinlock on each dequeuing.
>>
>>>   This is not what hardware devices do.
>>> How about adding the packet to queue unconditionally, deferring
>>> signalling until we get sendmsg without MSG_MORE?
>>
>> Then you need touch spinlock when dequeuing each packet.
> 

Random thought, I have a cmpxchg ring I am using for the qdisc work that
could possibly replace the spinlock implementation. I haven't figured
out the resizing API yet because I did not need it but I assume it could
help here and let you dequeue multiple skbs in one operation.

I can post the latest version if useful or an older version is
somewhere on patchworks as well.

.John


> It runs on the same CPU, right? Otherwise we should use skb_array...
> 
>>>
>>>
 ---
   drivers/net/tun.c | 71 
 ++-
   1 file changed, 65 insertions(+), 6 deletions(-)

>>
>> [...]
>>
rxhash = skb_get_hash(skb);
 -  netif_rx_ni(skb);
 +  skb_queue_tail(>socket.sk->sk_write_queue, skb);
 +
 +  if (!more) {
 +  local_bh_disable();
 +  napi_schedule(>napi);
 +  local_bh_enable();
>>> Why do we need to disable bh here? I thought napi_schedule can
>>> be called from any context.
>>
>> Yes, it's unnecessary. Will remove.
>>
>> Thanks



Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread John Fastabend
On 16-11-10 07:31 PM, Michael S. Tsirkin wrote:
> On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
>>
>>
>> On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
>>> On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
 Backlog were used for tuntap rx, but it can only process 1 packet at
 one time since it was scheduled during sendmsg() synchronously in
 process context. This lead bad cache utilization so this patch tries
 to do some batching before call rx NAPI. This is done through:

 - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
batch the packet temporarily in a linked list and submit them all
once MSG_MORE were cleared.
 - implement a tuntap specific NAPI handler for processing this kind of
possible batching. (This could be done by extending backlog to
support skb like, but using a tun specific one looks cleaner and
easier for future extension).

 Signed-off-by: Jason Wang 
>>> So why do we need an extra queue?
>>
>> The idea was borrowed from backlog to allow some kind of bulking and avoid
>> spinlock on each dequeuing.
>>
>>>   This is not what hardware devices do.
>>> How about adding the packet to queue unconditionally, deferring
>>> signalling until we get sendmsg without MSG_MORE?
>>
>> Then you need touch spinlock when dequeuing each packet.
> 

Random thought, I have a cmpxchg ring I am using for the qdisc work that
could possibly replace the spinlock implementation. I haven't figured
out the resizing API yet because I did not need it but I assume it could
help here and let you dequeue multiple skbs in one operation.

I can post the latest version if useful or an older version is
somewhere on patchworks as well.

.John


> It runs on the same CPU, right? Otherwise we should use skb_array...
> 
>>>
>>>
 ---
   drivers/net/tun.c | 71 
 ++-
   1 file changed, 65 insertions(+), 6 deletions(-)

>>
>> [...]
>>
rxhash = skb_get_hash(skb);
 -  netif_rx_ni(skb);
 +  skb_queue_tail(>socket.sk->sk_write_queue, skb);
 +
 +  if (!more) {
 +  local_bh_disable();
 +  napi_schedule(>napi);
 +  local_bh_enable();
>>> Why do we need to disable bh here? I thought napi_schedule can
>>> be called from any context.
>>
>> Yes, it's unnecessary. Will remove.
>>
>> Thanks



Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread Jason Wang



On 2016年11月11日 11:31, Michael S. Tsirkin wrote:

On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:

>
>
>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

> >On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

> > >Backlog were used for tuntap rx, but it can only process 1 packet at
> > >one time since it was scheduled during sendmsg() synchronously in
> > >process context. This lead bad cache utilization so this patch tries
> > >to do some batching before call rx NAPI. This is done through:
> > >
> > >- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
> > >batch the packet temporarily in a linked list and submit them all
> > >once MSG_MORE were cleared.
> > >- implement a tuntap specific NAPI handler for processing this kind of
> > >possible batching. (This could be done by extending backlog to
> > >support skb like, but using a tun specific one looks cleaner and
> > >easier for future extension).
> > >
> > >Signed-off-by: Jason Wang

> >So why do we need an extra queue?

>
>The idea was borrowed from backlog to allow some kind of bulking and avoid
>spinlock on each dequeuing.
>

> >   This is not what hardware devices do.
> >How about adding the packet to queue unconditionally, deferring
> >signalling until we get sendmsg without MSG_MORE?

>
>Then you need touch spinlock when dequeuing each packet.

It runs on the same CPU, right? Otherwise we should use skb_array...



There could be multiple senders technically. Will try skb_array and see 
if there's any difference.


Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread Jason Wang



On 2016年11月11日 11:31, Michael S. Tsirkin wrote:

On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:

>
>
>On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

> >On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

> > >Backlog were used for tuntap rx, but it can only process 1 packet at
> > >one time since it was scheduled during sendmsg() synchronously in
> > >process context. This lead bad cache utilization so this patch tries
> > >to do some batching before call rx NAPI. This is done through:
> > >
> > >- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
> > >batch the packet temporarily in a linked list and submit them all
> > >once MSG_MORE were cleared.
> > >- implement a tuntap specific NAPI handler for processing this kind of
> > >possible batching. (This could be done by extending backlog to
> > >support skb like, but using a tun specific one looks cleaner and
> > >easier for future extension).
> > >
> > >Signed-off-by: Jason Wang

> >So why do we need an extra queue?

>
>The idea was borrowed from backlog to allow some kind of bulking and avoid
>spinlock on each dequeuing.
>

> >   This is not what hardware devices do.
> >How about adding the packet to queue unconditionally, deferring
> >signalling until we get sendmsg without MSG_MORE?

>
>Then you need touch spinlock when dequeuing each packet.

It runs on the same CPU, right? Otherwise we should use skb_array...



There could be multiple senders technically. Will try skb_array and see 
if there's any difference.


Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread Michael S. Tsirkin
On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > Backlog were used for tuntap rx, but it can only process 1 packet at
> > > one time since it was scheduled during sendmsg() synchronously in
> > > process context. This lead bad cache utilization so this patch tries
> > > to do some batching before call rx NAPI. This is done through:
> > > 
> > > - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
> > >batch the packet temporarily in a linked list and submit them all
> > >once MSG_MORE were cleared.
> > > - implement a tuntap specific NAPI handler for processing this kind of
> > >possible batching. (This could be done by extending backlog to
> > >support skb like, but using a tun specific one looks cleaner and
> > >easier for future extension).
> > > 
> > > Signed-off-by: Jason Wang 
> > So why do we need an extra queue?
> 
> The idea was borrowed from backlog to allow some kind of bulking and avoid
> spinlock on each dequeuing.
> 
> >   This is not what hardware devices do.
> > How about adding the packet to queue unconditionally, deferring
> > signalling until we get sendmsg without MSG_MORE?
> 
> Then you need touch spinlock when dequeuing each packet.

It runs on the same CPU, right? Otherwise we should use skb_array...

> > 
> > 
> > > ---
> > >   drivers/net/tun.c | 71 
> > > ++-
> > >   1 file changed, 65 insertions(+), 6 deletions(-)
> > > 
> 
> [...]
> 
> > >   rxhash = skb_get_hash(skb);
> > > - netif_rx_ni(skb);
> > > + skb_queue_tail(>socket.sk->sk_write_queue, skb);
> > > +
> > > + if (!more) {
> > > + local_bh_disable();
> > > + napi_schedule(>napi);
> > > + local_bh_enable();
> > Why do we need to disable bh here? I thought napi_schedule can
> > be called from any context.
> 
> Yes, it's unnecessary. Will remove.
> 
> Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread Michael S. Tsirkin
On Fri, Nov 11, 2016 at 10:07:44AM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月10日 00:38, Michael S. Tsirkin wrote:
> > On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> > > Backlog were used for tuntap rx, but it can only process 1 packet at
> > > one time since it was scheduled during sendmsg() synchronously in
> > > process context. This lead bad cache utilization so this patch tries
> > > to do some batching before call rx NAPI. This is done through:
> > > 
> > > - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
> > >batch the packet temporarily in a linked list and submit them all
> > >once MSG_MORE were cleared.
> > > - implement a tuntap specific NAPI handler for processing this kind of
> > >possible batching. (This could be done by extending backlog to
> > >support skb like, but using a tun specific one looks cleaner and
> > >easier for future extension).
> > > 
> > > Signed-off-by: Jason Wang 
> > So why do we need an extra queue?
> 
> The idea was borrowed from backlog to allow some kind of bulking and avoid
> spinlock on each dequeuing.
> 
> >   This is not what hardware devices do.
> > How about adding the packet to queue unconditionally, deferring
> > signalling until we get sendmsg without MSG_MORE?
> 
> Then you need touch spinlock when dequeuing each packet.

It runs on the same CPU, right? Otherwise we should use skb_array...

> > 
> > 
> > > ---
> > >   drivers/net/tun.c | 71 
> > > ++-
> > >   1 file changed, 65 insertions(+), 6 deletions(-)
> > > 
> 
> [...]
> 
> > >   rxhash = skb_get_hash(skb);
> > > - netif_rx_ni(skb);
> > > + skb_queue_tail(>socket.sk->sk_write_queue, skb);
> > > +
> > > + if (!more) {
> > > + local_bh_disable();
> > > + napi_schedule(>napi);
> > > + local_bh_enable();
> > Why do we need to disable bh here? I thought napi_schedule can
> > be called from any context.
> 
> Yes, it's unnecessary. Will remove.
> 
> Thanks


Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread Jason Wang



On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

Backlog were used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This lead bad cache utilization so this patch tries
to do some batching before call rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
   batch the packet temporarily in a linked list and submit them all
   once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
   possible batching. (This could be done by extending backlog to
   support skb like, but using a tun specific one looks cleaner and
   easier for future extension).

Signed-off-by: Jason Wang 

So why do we need an extra queue?


The idea was borrowed from backlog to allow some kind of bulking and 
avoid spinlock on each dequeuing.



  This is not what hardware devices do.
How about adding the packet to queue unconditionally, deferring
signalling until we get sendmsg without MSG_MORE?


Then you need touch spinlock when dequeuing each packet.





---
  drivers/net/tun.c | 71 ++-
  1 file changed, 65 insertions(+), 6 deletions(-)



[...]


rxhash = skb_get_hash(skb);
-   netif_rx_ni(skb);
+   skb_queue_tail(&tfile->socket.sk->sk_write_queue, skb);
+
+   if (!more) {
+   local_bh_disable();
+   napi_schedule(&tfile->napi);
+   local_bh_enable();

Why do we need to disable bh here? I thought napi_schedule can
be called from any context.


Yes, it's unnecessary. Will remove.

Thanks



Re: [PATCH 1/3] tuntap: rx batching

2016-11-10 Thread Jason Wang



On 2016年11月10日 00:38, Michael S. Tsirkin wrote:

On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:

Backlog were used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This lead bad cache utilization so this patch tries
to do some batching before call rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
   batch the packet temporarily in a linked list and submit them all
   once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
   possible batching. (This could be done by extending backlog to
   support skb like, but using a tun specific one looks cleaner and
   easier for future extension).

Signed-off-by: Jason Wang 

So why do we need an extra queue?


The idea was borrowed from backlog to allow some kind of bulking and 
avoid spinlock on each dequeuing.



  This is not what hardware devices do.
How about adding the packet to queue unconditionally, deferring
signalling until we get sendmsg without MSG_MORE?


Then you need touch spinlock when dequeuing each packet.





---
  drivers/net/tun.c | 71 ++-
  1 file changed, 65 insertions(+), 6 deletions(-)



[...]


rxhash = skb_get_hash(skb);
-   netif_rx_ni(skb);
+   skb_queue_tail(>socket.sk->sk_write_queue, skb);
+
+   if (!more) {
+   local_bh_disable();
+   napi_schedule(>napi);
+   local_bh_enable();

Why do we need to disable bh here? I thought napi_schedule can
be called from any context.


Yes, it's unnecessary. Will remove.

Thanks



Re: [PATCH 1/3] tuntap: rx batching

2016-11-09 Thread Michael S. Tsirkin
On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> Backlog were used for tuntap rx, but it can only process 1 packet at
> one time since it was scheduled during sendmsg() synchronously in
> process context. This lead bad cache utilization so this patch tries
> to do some batching before call rx NAPI. This is done through:
> 
> - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>   batch the packet temporarily in a linked list and submit them all
>   once MSG_MORE were cleared.
> - implement a tuntap specific NAPI handler for processing this kind of
>   possible batching. (This could be done by extending backlog to
>   support skb like, but using a tun specific one looks cleaner and
>   easier for future extension).
> 
> Signed-off-by: Jason Wang 

So why do we need an extra queue? This is not what hardware devices do.
How about adding the packet to queue unconditionally, deferring
signalling until we get sendmsg without MSG_MORE?


> ---
>  drivers/net/tun.c | 71 
> ++-
>  1 file changed, 65 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 1588469..d40583b 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -74,6 +74,7 @@
>  #include 
>  
>  #include 
> +#include 
>  
>  /* Uncomment to enable debugging */
>  /* #define TUN_DEBUG 1 */
> @@ -169,6 +170,8 @@ struct tun_file {
>   struct list_head next;
>   struct tun_struct *detached;
>   struct skb_array tx_array;
> + struct napi_struct napi;
> + struct sk_buff_head process_queue;
>  };
>  
>  struct tun_flow_entry {
> @@ -522,6 +525,8 @@ static void tun_queue_purge(struct tun_file *tfile)
>   while ((skb = skb_array_consume(>tx_array)) != NULL)
>   kfree_skb(skb);
>  
> + skb_queue_purge(>sk.sk_write_queue);
> + skb_queue_purge(>process_queue);
>   skb_queue_purge(>sk.sk_error_queue);
>  }
>  
> @@ -532,6 +537,11 @@ static void __tun_detach(struct tun_file *tfile, bool 
> clean)
>  
>   tun = rtnl_dereference(tfile->tun);
>  
> + if (tun && clean) {
> + napi_disable(>napi);
> + netif_napi_del(>napi);
> + }
> +
>   if (tun && !tfile->detached) {
>   u16 index = tfile->queue_index;
>   BUG_ON(index >= tun->numqueues);
> @@ -587,6 +597,7 @@ static void tun_detach_all(struct net_device *dev)
>  
>   for (i = 0; i < n; i++) {
>   tfile = rtnl_dereference(tun->tfiles[i]);
> + napi_disable(>napi);
>   BUG_ON(!tfile);
>   tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
>   tfile->socket.sk->sk_data_ready(tfile->socket.sk);
> @@ -603,6 +614,7 @@ static void tun_detach_all(struct net_device *dev)
>   synchronize_net();
>   for (i = 0; i < n; i++) {
>   tfile = rtnl_dereference(tun->tfiles[i]);
> + netif_napi_del(>napi);
>   /* Drop read queue */
>   tun_queue_purge(tfile);
>   sock_put(>sk);
> @@ -618,6 +630,41 @@ static void tun_detach_all(struct net_device *dev)
>   module_put(THIS_MODULE);
>  }
>  
> +static int tun_poll(struct napi_struct *napi, int budget)
> +{
> + struct tun_file *tfile = container_of(napi, struct tun_file, napi);
> + struct sk_buff_head *input_queue =
> +>socket.sk->sk_write_queue;
> + struct sk_buff *skb;
> + unsigned int received = 0;
> +
> + while (1) {
> + while ((skb = __skb_dequeue(>process_queue))) {
> + netif_receive_skb(skb);
> + if (++received >= budget)
> + return received;
> + }
> +
> + spin_lock(_queue->lock);
> + if (skb_queue_empty(input_queue)) {
> + spin_unlock(_queue->lock);
> + break;
> + }
> + skb_queue_splice_tail_init(input_queue, >process_queue);
> + spin_unlock(_queue->lock);
> + }
> +
> + if (received < budget) {
> + napi_complete(napi);
> + if (skb_peek(>socket.sk->sk_write_queue) &&
> + unlikely(napi_schedule_prep(napi))) {
> + __napi_schedule(napi);
> + }
> + }
> +
> + return received;
> +}
> +
>  static int tun_attach(struct tun_struct *tun, struct file *file, bool 
> skip_filter)
>  {
>   struct tun_file *tfile = file->private_data;
> @@ -666,9 +713,11 @@ static int tun_attach(struct tun_struct *tun, struct 
> file *file, bool skip_filte
>  
>   if (tfile->detached)
>   tun_enable_queue(tfile);
> - else
> + else {
>   sock_hold(>sk);
> -
> + netif_napi_add(tun->dev, >napi, tun_poll, 64);
> + napi_enable(>napi);
> + }
>   tun_set_real_num_queues(tun);
>  
>   /* device is allowed to go away first, so no need to hold extra
> @@ 

Re: [PATCH 1/3] tuntap: rx batching

2016-11-09 Thread Michael S. Tsirkin
On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> Backlog were used for tuntap rx, but it can only process 1 packet at
> one time since it was scheduled during sendmsg() synchronously in
> process context. This lead bad cache utilization so this patch tries
> to do some batching before call rx NAPI. This is done through:
> 
> - accept MSG_MORE as a hint from sendmsg() caller, if it was set,
>   batch the packet temporarily in a linked list and submit them all
>   once MSG_MORE were cleared.
> - implement a tuntap specific NAPI handler for processing this kind of
>   possible batching. (This could be done by extending backlog to
>   support skb like, but using a tun specific one looks cleaner and
>   easier for future extension).
> 
> Signed-off-by: Jason Wang 

So why do we need an extra queue? This is not what hardware devices do.
How about adding the packet to queue unconditionally, deferring
signalling until we get sendmsg without MSG_MORE?


> ---
>  drivers/net/tun.c | 71 
> ++-
>  1 file changed, 65 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 1588469..d40583b 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -74,6 +74,7 @@
>  #include 
>  
>  #include 
> +#include 
>  
>  /* Uncomment to enable debugging */
>  /* #define TUN_DEBUG 1 */
> @@ -169,6 +170,8 @@ struct tun_file {
>   struct list_head next;
>   struct tun_struct *detached;
>   struct skb_array tx_array;
> + struct napi_struct napi;
> + struct sk_buff_head process_queue;
>  };
>  
>  struct tun_flow_entry {
> @@ -522,6 +525,8 @@ static void tun_queue_purge(struct tun_file *tfile)
>   while ((skb = skb_array_consume(>tx_array)) != NULL)
>   kfree_skb(skb);
>  
> + skb_queue_purge(>sk.sk_write_queue);
> + skb_queue_purge(>process_queue);
>   skb_queue_purge(>sk.sk_error_queue);
>  }
>  
> @@ -532,6 +537,11 @@ static void __tun_detach(struct tun_file *tfile, bool 
> clean)
>  
>   tun = rtnl_dereference(tfile->tun);
>  
> + if (tun && clean) {
> + napi_disable(>napi);
> + netif_napi_del(>napi);
> + }
> +
>   if (tun && !tfile->detached) {
>   u16 index = tfile->queue_index;
>   BUG_ON(index >= tun->numqueues);
> @@ -587,6 +597,7 @@ static void tun_detach_all(struct net_device *dev)
>  
>   for (i = 0; i < n; i++) {
>   tfile = rtnl_dereference(tun->tfiles[i]);
> + napi_disable(>napi);
>   BUG_ON(!tfile);
>   tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
>   tfile->socket.sk->sk_data_ready(tfile->socket.sk);
> @@ -603,6 +614,7 @@ static void tun_detach_all(struct net_device *dev)
>   synchronize_net();
>   for (i = 0; i < n; i++) {
>   tfile = rtnl_dereference(tun->tfiles[i]);
> + netif_napi_del(>napi);
>   /* Drop read queue */
>   tun_queue_purge(tfile);
>   sock_put(>sk);
> @@ -618,6 +630,41 @@ static void tun_detach_all(struct net_device *dev)
>   module_put(THIS_MODULE);
>  }
>  
> +static int tun_poll(struct napi_struct *napi, int budget)
> +{
> + struct tun_file *tfile = container_of(napi, struct tun_file, napi);
> + struct sk_buff_head *input_queue =
> +>socket.sk->sk_write_queue;
> + struct sk_buff *skb;
> + unsigned int received = 0;
> +
> + while (1) {
> + while ((skb = __skb_dequeue(>process_queue))) {
> + netif_receive_skb(skb);
> + if (++received >= budget)
> + return received;
> + }
> +
> + spin_lock(_queue->lock);
> + if (skb_queue_empty(input_queue)) {
> + spin_unlock(_queue->lock);
> + break;
> + }
> + skb_queue_splice_tail_init(input_queue, >process_queue);
> + spin_unlock(_queue->lock);
> + }
> +
> + if (received < budget) {
> + napi_complete(napi);
> + if (skb_peek(>socket.sk->sk_write_queue) &&
> + unlikely(napi_schedule_prep(napi))) {
> + __napi_schedule(napi);
> + }
> + }
> +
> + return received;
> +}
> +
>  static int tun_attach(struct tun_struct *tun, struct file *file, bool 
> skip_filter)
>  {
>   struct tun_file *tfile = file->private_data;
> @@ -666,9 +713,11 @@ static int tun_attach(struct tun_struct *tun, struct 
> file *file, bool skip_filte
>  
>   if (tfile->detached)
>   tun_enable_queue(tfile);
> - else
> + else {
>   sock_hold(>sk);
> -
> + netif_napi_add(tun->dev, >napi, tun_poll, 64);
> + napi_enable(>napi);
> + }
>   tun_set_real_num_queues(tun);
>  
>   /* device is allowed to go away first, so no need to hold extra
> @@ -1150,7 +1199,7 @@ 

[PATCH 1/3] tuntap: rx batching

2016-11-08 Thread Jason Wang
Backlog was used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This leads to bad cache utilization, so this patch tries
to do some batching before calling rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
  batch the packet temporarily in a linked list and submit them all
  once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
  possible batching. (This could be done by extending backlog to
  support skb like, but using a tun specific one looks cleaner and
  easier for future extension).

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 71 ++-
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1588469..d40583b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -74,6 +74,7 @@
 #include 
 
 #include 
+#include 
 
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
@@ -169,6 +170,8 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
+   struct napi_struct napi;
+   struct sk_buff_head process_queue;
 };
 
 struct tun_flow_entry {
@@ -522,6 +525,8 @@ static void tun_queue_purge(struct tun_file *tfile)
while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
kfree_skb(skb);
 
+   skb_queue_purge(&tfile->sk.sk_write_queue);
+   skb_queue_purge(&tfile->process_queue);
skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
@@ -532,6 +537,11 @@ static void __tun_detach(struct tun_file *tfile, bool 
clean)
 
tun = rtnl_dereference(tfile->tun);
 
+   if (tun && clean) {
+   napi_disable(&tfile->napi);
+   netif_napi_del(&tfile->napi);
+   }
+
if (tun && !tfile->detached) {
u16 index = tfile->queue_index;
BUG_ON(index >= tun->numqueues);
@@ -587,6 +597,7 @@ static void tun_detach_all(struct net_device *dev)
 
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
+   napi_disable(&tfile->napi);
BUG_ON(!tfile);
tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
@@ -603,6 +614,7 @@ static void tun_detach_all(struct net_device *dev)
synchronize_net();
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
+   netif_napi_del(&tfile->napi);
/* Drop read queue */
tun_queue_purge(tfile);
sock_put(&tfile->sk);
@@ -618,6 +630,41 @@ static void tun_detach_all(struct net_device *dev)
module_put(THIS_MODULE);
 }
 
+static int tun_poll(struct napi_struct *napi, int budget)
+{
+   struct tun_file *tfile = container_of(napi, struct tun_file, napi);
+   struct sk_buff_head *input_queue =
+  &tfile->socket.sk->sk_write_queue;
+   struct sk_buff *skb;
+   unsigned int received = 0;
+
+   while (1) {
+   while ((skb = __skb_dequeue(&tfile->process_queue))) {
+   netif_receive_skb(skb);
+   if (++received >= budget)
+   return received;
+   }
+
+   spin_lock(&input_queue->lock);
+   if (skb_queue_empty(input_queue)) {
+   spin_unlock(&input_queue->lock);
+   break;
+   }
+   skb_queue_splice_tail_init(input_queue, &tfile->process_queue);
+   spin_unlock(&input_queue->lock);
+   }
+
+   if (received < budget) {
+   napi_complete(napi);
+   if (skb_peek(&tfile->socket.sk->sk_write_queue) &&
+   unlikely(napi_schedule_prep(napi))) {
+   __napi_schedule(napi);
+   }
+   }
+
+   return received;
+}
+
 static int tun_attach(struct tun_struct *tun, struct file *file, bool 
skip_filter)
 {
struct tun_file *tfile = file->private_data;
@@ -666,9 +713,11 @@ static int tun_attach(struct tun_struct *tun, struct file 
*file, bool skip_filte
 
if (tfile->detached)
tun_enable_queue(tfile);
-   else
+   else {
sock_hold(&tfile->sk);
-
+   netif_napi_add(tun->dev, &tfile->napi, tun_poll, 64);
+   napi_enable(&tfile->napi);
+   }
tun_set_real_num_queues(tun);
 
/* device is allowed to go away first, so no need to hold extra
@@ -1150,7 +1199,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file 
*tfile,
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
-   int noblock)
+   int noblock, bool more)
 {
struct tun_pi 

[PATCH 1/3] tuntap: rx batching

2016-11-08 Thread Jason Wang
Backlog were used for tuntap rx, but it can only process 1 packet at
one time since it was scheduled during sendmsg() synchronously in
process context. This lead bad cache utilization so this patch tries
to do some batching before call rx NAPI. This is done through:

- accept MSG_MORE as a hint from sendmsg() caller, if it was set,
  batch the packet temporarily in a linked list and submit them all
  once MSG_MORE were cleared.
- implement a tuntap specific NAPI handler for processing this kind of
  possible batching. (This could be done by extending backlog to
  support skb like, but using a tun specific one looks cleaner and
  easier for future extension).

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 71 ++-
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1588469..d40583b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -74,6 +74,7 @@
 #include 
 
 #include 
+#include 
 
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
@@ -169,6 +170,8 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
+   struct napi_struct napi;
+   struct sk_buff_head process_queue;
 };
 
 struct tun_flow_entry {
@@ -522,6 +525,8 @@ static void tun_queue_purge(struct tun_file *tfile)
while ((skb = skb_array_consume(>tx_array)) != NULL)
kfree_skb(skb);
 
+   skb_queue_purge(>sk.sk_write_queue);
+   skb_queue_purge(>process_queue);
skb_queue_purge(>sk.sk_error_queue);
 }
 
@@ -532,6 +537,11 @@ static void __tun_detach(struct tun_file *tfile, bool 
clean)
 
tun = rtnl_dereference(tfile->tun);
 
+   if (tun && clean) {
+   napi_disable(>napi);
+   netif_napi_del(>napi);
+   }
+
if (tun && !tfile->detached) {
u16 index = tfile->queue_index;
BUG_ON(index >= tun->numqueues);
@@ -587,6 +597,7 @@ static void tun_detach_all(struct net_device *dev)
 
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
+   napi_disable(>napi);
BUG_ON(!tfile);
tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
@@ -603,6 +614,7 @@ static void tun_detach_all(struct net_device *dev)
synchronize_net();
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
+   netif_napi_del(>napi);
/* Drop read queue */
tun_queue_purge(tfile);
sock_put(>sk);
@@ -618,6 +630,41 @@ static void tun_detach_all(struct net_device *dev)
module_put(THIS_MODULE);
 }
 
+static int tun_poll(struct napi_struct *napi, int budget)
+{
+   struct tun_file *tfile = container_of(napi, struct tun_file, napi);
+   struct sk_buff_head *input_queue =
+  >socket.sk->sk_write_queue;
+   struct sk_buff *skb;
+   unsigned int received = 0;
+
+   while (1) {
+   while ((skb = __skb_dequeue(>process_queue))) {
+   netif_receive_skb(skb);
+   if (++received >= budget)
+   return received;
+   }
+
+   spin_lock(_queue->lock);
+   if (skb_queue_empty(input_queue)) {
+   spin_unlock(_queue->lock);
+   break;
+   }
+   skb_queue_splice_tail_init(input_queue, >process_queue);
+   spin_unlock(_queue->lock);
+   }
+
+   if (received < budget) {
+   napi_complete(napi);
+   if (skb_peek(>socket.sk->sk_write_queue) &&
+   unlikely(napi_schedule_prep(napi))) {
+   __napi_schedule(napi);
+   }
+   }
+
+   return received;
+}
+
 static int tun_attach(struct tun_struct *tun, struct file *file, bool 
skip_filter)
 {
struct tun_file *tfile = file->private_data;
@@ -666,9 +713,11 @@ static int tun_attach(struct tun_struct *tun, struct file 
*file, bool skip_filte
 
if (tfile->detached)
tun_enable_queue(tfile);
-   else
+   else {
sock_hold(>sk);
-
+   netif_napi_add(tun->dev, >napi, tun_poll, 64);
+   napi_enable(>napi);
+   }
tun_set_real_num_queues(tun);
 
/* device is allowed to go away first, so no need to hold extra
@@ -1150,7 +1199,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file 
*tfile,
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
-   int noblock)
+   int noblock, bool more)
 {
struct tun_pi pi = { 0,