Re: [RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes

2011-11-21 Thread Ben Hutchings
On Fri, 2011-11-11 at 18:34 +0530, Krishna Kumar wrote:
 Changes for multiqueue virtio_net driver.
[...]
 @@ -677,25 +730,35 @@ static struct rtnl_link_stats64 *virtnet
  {
   struct virtnet_info *vi = netdev_priv(dev);
   int cpu;
 - unsigned int start;
  
   for_each_possible_cpu(cpu) {
 - struct virtnet_stats __percpu *stats
 - = per_cpu_ptr(vi->stats, cpu);
 - u64 tpackets, tbytes, rpackets, rbytes;
 -
 - do {
 - start = u64_stats_fetch_begin(&stats->syncp);
 - tpackets = stats->tx_packets;
 - tbytes   = stats->tx_bytes;
 - rpackets = stats->rx_packets;
 - rbytes   = stats->rx_bytes;
 - } while (u64_stats_fetch_retry(&stats->syncp, start));
 -
 - tot->rx_packets += rpackets;
 - tot->tx_packets += tpackets;
 - tot->rx_bytes   += rbytes;
 - tot->tx_bytes   += tbytes;
 + int qpair;
 +
 + for (qpair = 0; qpair < vi->num_queue_pairs; qpair++) {
 + struct virtnet_send_stats __percpu *tx_stat;
 + struct virtnet_recv_stats __percpu *rx_stat;

While you're at it, you can drop the per-CPU stats and make them only
per-queue.  There is unlikely to be any benefit in maintaining them
per-CPU while receive and transmit processing is serialised per-queue.
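
For illustration, a minimal sketch of what per-queue-only counters could
look like (field and helper names here are made up for the example, not
taken from the patch):

        /* Sketch only: the counters live directly in the queue, with no
         * per-CPU allocation, since a given queue is serviced by at most
         * one CPU at a time. */
        struct receive_queue {
                struct virtqueue *vq;
                struct u64_stats_sync syncp;
                u64 rx_packets;
                u64 rx_bytes;
                /* ... */
        };

        static void update_rx_stats(struct receive_queue *rq, unsigned int len)
        {
                u64_stats_update_begin(&rq->syncp);
                rq->rx_packets++;
                rq->rx_bytes += len;
                u64_stats_update_end(&rq->syncp);
        }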

[...]
 +static int invoke_find_vqs(struct virtnet_info *vi)
 +{
 + vq_callback_t **callbacks;
 + struct virtqueue **vqs;
 + int ret = -ENOMEM;
 + int i, total_vqs;
 + char **names;
 +
 + /*
 +  * We expect 1 RX virtqueue followed by 1 TX virtqueue, followed
 +  * by the same 'vi->num_queue_pairs-1' more times, and optionally
 +  * one control virtqueue.
 +  */
 + total_vqs = vi->num_queue_pairs * 2 +
 + virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
 +
 + /* Allocate space for find_vqs parameters */
 + vqs = kmalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
 + callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
 + names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
 + if (!vqs || !callbacks || !names)
 + goto err;
 +
 + /* Allocate/initialize parameters for recv virtqueues */
 + for (i = 0; i < vi->num_queue_pairs * 2; i += 2) {
 + callbacks[i] = skb_recv_done;
 + names[i] = kasprintf(GFP_KERNEL, "input.%d", i / 2);
 + if (!names[i])
 + goto err;
 + }
 +
 + /* Allocate/initialize parameters for send virtqueues */
 + for (i = 1; i < vi->num_queue_pairs * 2; i += 2) {
 + callbacks[i] = skb_xmit_done;
 + names[i] = kasprintf(GFP_KERNEL, "output.%d", i / 2);
 + if (!names[i])
 + goto err;
 + }
[...]

The RX and TX interrupt names for a multiqueue device should follow the
formats <device>-rx-<index> and <device>-tx-<index>.
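
For example, a sketch of how the find_vqs names could be built to match
that pattern (error handling omitted; whether the device name is added
here or prefixed by the virtio core is left open, and the indexing
follows the patch's even-RX/odd-TX layout):

        /* Sketch only: request interrupt names of the form
         * <device>-rx-<index> and <device>-tx-<index>. */
        for (i = 0; i < vi->num_queue_pairs * 2; i += 2) {
                names[i]     = kasprintf(GFP_KERNEL, "%s-rx-%d",
                                         vi->dev->name, i / 2);
                names[i + 1] = kasprintf(GFP_KERNEL, "%s-tx-%d",
                                         vi->dev->name, i / 2);
        }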

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.



Re: [RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes

2011-11-21 Thread Ben Hutchings
On Fri, 2011-11-18 at 08:24 +0200, Sasha Levin wrote:
 On Fri, 2011-11-18 at 01:08 +, Ben Hutchings wrote:
  On Fri, 2011-11-11 at 18:34 +0530, Krishna Kumar wrote:
   Changes for multiqueue virtio_net driver.
  [...]
   @@ -677,25 +730,35 @@ static struct rtnl_link_stats64 *virtnet
{
 struct virtnet_info *vi = netdev_priv(dev);
 int cpu;
   - unsigned int start;

 for_each_possible_cpu(cpu) {
   - struct virtnet_stats __percpu *stats
   - = per_cpu_ptr(vi->stats, cpu);
   - u64 tpackets, tbytes, rpackets, rbytes;
   -
   - do {
   - start = u64_stats_fetch_begin(&stats->syncp);
   - tpackets = stats->tx_packets;
   - tbytes   = stats->tx_bytes;
   - rpackets = stats->rx_packets;
   - rbytes   = stats->rx_bytes;
   - } while (u64_stats_fetch_retry(&stats->syncp, start));
   -
   - tot->rx_packets += rpackets;
   - tot->tx_packets += tpackets;
   - tot->rx_bytes   += rbytes;
   - tot->tx_bytes   += tbytes;
   + int qpair;
   +
   + for (qpair = 0; qpair < vi->num_queue_pairs; qpair++) {
   + struct virtnet_send_stats __percpu *tx_stat;
   + struct virtnet_recv_stats __percpu *rx_stat;
  
  While you're at it, you can drop the per-CPU stats and make them only
  per-queue.  There is unlikely to be any benefit in maintaining them
  per-CPU while receive and transmit processing is serialised per-queue.
 
 It allows you to update stats without a lock.

But you'll already be holding a lock related to the queue.

 What's the benefit of having them per queue?

It should save some memory (and a little time when summing stats, though
that's unlikely to matter much).

The important thing is that splitting up stats per-CPU *and* per-queue
is a waste.
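
To put rough numbers on that (purely hypothetical figures): with 8 queue
pairs and 64 possible CPUs, keeping stats per-CPU *and* per-queue means
8 * 2 * 64 = 1024 copies of the counter block, versus just 16 copies if
they are kept per-queue only, even though each queue's counters are only
ever updated from one CPU at a time anyway.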

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.



Re: [RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes

2011-11-21 Thread Ben Hutchings
On Fri, 2011-11-18 at 18:18 +0200, Sasha Levin wrote:
 On Fri, 2011-11-18 at 15:40 +, Ben Hutchings wrote:
  On Fri, 2011-11-18 at 08:24 +0200, Sasha Levin wrote:
   On Fri, 2011-11-18 at 01:08 +, Ben Hutchings wrote:
On Fri, 2011-11-11 at 18:34 +0530, Krishna Kumar wrote:
 Changes for multiqueue virtio_net driver.
[...]
 @@ -677,25 +730,35 @@ static struct rtnl_link_stats64 *virtnet
  {
   struct virtnet_info *vi = netdev_priv(dev);
   int cpu;
 - unsigned int start;
  
   for_each_possible_cpu(cpu) {
 - struct virtnet_stats __percpu *stats
 - = per_cpu_ptr(vi->stats, cpu);
 - u64 tpackets, tbytes, rpackets, rbytes;
 -
 - do {
 - start = u64_stats_fetch_begin(&stats->syncp);
 - tpackets = stats->tx_packets;
 - tbytes   = stats->tx_bytes;
 - rpackets = stats->rx_packets;
 - rbytes   = stats->rx_bytes;
 - } while (u64_stats_fetch_retry(&stats->syncp, start));
 -
 - tot->rx_packets += rpackets;
 - tot->tx_packets += tpackets;
 - tot->rx_bytes   += rbytes;
 - tot->tx_bytes   += tbytes;
 + int qpair;
 +
 + for (qpair = 0; qpair < vi->num_queue_pairs; qpair++) {
 + struct virtnet_send_stats __percpu *tx_stat;
 + struct virtnet_recv_stats __percpu *rx_stat;

While you're at it, you can drop the per-CPU stats and make them only
per-queue.  There is unlikely to be any benefit in maintaining them
per-CPU while receive and transmit processing is serialised per-queue.
   
   It allows you to update stats without a lock.
  
  But you'll already be holding a lock related to the queue.
 
 Right, but now you're holding a queue lock only while playing with the
 queue; we don't hold it while processing the data, which is when we
 usually need to update stats.
[...]

The *stack* is holding the appropriate lock when calling the NAPI poll
function or ndo_start_xmit function.
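
As a sketch of what that means for the stats update (this assumes the
per-queue-only counters suggested earlier in the thread, with syncp,
tx_packets and tx_bytes fields added to send_queue; names are
illustrative, not from the patch):

        static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
        {
                struct virtnet_info *vi = netdev_priv(dev);
                struct send_queue *sq = vi->sq[skb_get_queue_mapping(skb)];

                /* The core has already taken this queue's tx lock before
                 * calling ndo_start_xmit, so plain per-queue counter
                 * updates are serialised without any extra locking. */
                u64_stats_update_begin(&sq->syncp);
                sq->tx_packets++;
                sq->tx_bytes += skb->len;
                u64_stats_update_end(&sq->syncp);

                /* ... hand the skb to sq->vq as the patch does ... */
                return NETDEV_TX_OK;
        }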

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.



Re: [RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes

2011-11-18 Thread Sasha Levin
On Fri, 2011-11-18 at 15:40 +, Ben Hutchings wrote:
 On Fri, 2011-11-18 at 08:24 +0200, Sasha Levin wrote:
  On Fri, 2011-11-18 at 01:08 +, Ben Hutchings wrote:
   On Fri, 2011-11-11 at 18:34 +0530, Krishna Kumar wrote:
Changes for multiqueue virtio_net driver.
   [...]
@@ -677,25 +730,35 @@ static struct rtnl_link_stats64 *virtnet
 {
struct virtnet_info *vi = netdev_priv(dev);
int cpu;
-   unsigned int start;
 
for_each_possible_cpu(cpu) {
-   struct virtnet_stats __percpu *stats
-   = per_cpu_ptr(vi->stats, cpu);
-   u64 tpackets, tbytes, rpackets, rbytes;
-
-   do {
-   start = u64_stats_fetch_begin(&stats->syncp);
-   tpackets = stats->tx_packets;
-   tbytes   = stats->tx_bytes;
-   rpackets = stats->rx_packets;
-   rbytes   = stats->rx_bytes;
-   } while (u64_stats_fetch_retry(&stats->syncp, start));
-
-   tot->rx_packets += rpackets;
-   tot->tx_packets += tpackets;
-   tot->rx_bytes   += rbytes;
-   tot->tx_bytes   += tbytes;
+   int qpair;
+
+   for (qpair = 0; qpair < vi->num_queue_pairs; qpair++) {
+   struct virtnet_send_stats __percpu *tx_stat;
+   struct virtnet_recv_stats __percpu *rx_stat;
   
   While you're at it, you can drop the per-CPU stats and make them only
   per-queue.  There is unlikely to be any benefit in maintaining them
   per-CPU while receive and transmit processing is serialised per-queue.
  
  It allows you to update stats without a lock.
 
 But you'll already be holding a lock related to the queue.

Right, but now you're holding a queue lock only while playing with the
queue; we don't hold it while processing the data, which is when we
usually need to update stats.

  What's the benefit of having them per queue?
 
 It should save some memory (and a little time when summing stats, though
 that's unlikely to matter much).
 
 The important thing is that splitting up stats per-CPU *and* per-queue
 is a waste.
 
 Ben.
 


-- 

Sasha.



Re: [RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes

2011-11-17 Thread Sasha Levin
On Fri, 2011-11-18 at 01:08 +, Ben Hutchings wrote:
 On Fri, 2011-11-11 at 18:34 +0530, Krishna Kumar wrote:
  Changes for multiqueue virtio_net driver.
 [...]
  @@ -677,25 +730,35 @@ static struct rtnl_link_stats64 *virtnet
   {
  struct virtnet_info *vi = netdev_priv(dev);
  int cpu;
  -   unsigned int start;
   
  for_each_possible_cpu(cpu) {
  -   struct virtnet_stats __percpu *stats
  -   = per_cpu_ptr(vi->stats, cpu);
  -   u64 tpackets, tbytes, rpackets, rbytes;
  -
  -   do {
  -   start = u64_stats_fetch_begin(&stats->syncp);
  -   tpackets = stats->tx_packets;
  -   tbytes   = stats->tx_bytes;
  -   rpackets = stats->rx_packets;
  -   rbytes   = stats->rx_bytes;
  -   } while (u64_stats_fetch_retry(&stats->syncp, start));
  -
  -   tot->rx_packets += rpackets;
  -   tot->tx_packets += tpackets;
  -   tot->rx_bytes   += rbytes;
  -   tot->tx_bytes   += tbytes;
  +   int qpair;
  +
  +   for (qpair = 0; qpair < vi->num_queue_pairs; qpair++) {
  +   struct virtnet_send_stats __percpu *tx_stat;
  +   struct virtnet_recv_stats __percpu *rx_stat;
 
 While you're at it, you can drop the per-CPU stats and make them only
 per-queue.  There is unlikely to be any benefit in maintaining them
 per-CPU while receive and transmit processing is serialised per-queue.

It allows you to update stats without a lock.

What's the benefit of having them per queue?

-- 

Sasha.



[RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes

2011-11-11 Thread Krishna Kumar
Changes for multiqueue virtio_net driver.

Signed-off-by: krkum...@in.ibm.com
---
 drivers/net/virtio_net.c   |  688 ---
 include/linux/virtio_net.h |2 
 2 files changed, 481 insertions(+), 209 deletions(-)

diff -ruNp org/drivers/net/virtio_net.c new/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c	2011-11-11 16:44:38.0 +0530
+++ new/drivers/net/virtio_net.c	2011-11-11 16:44:59.0 +0530
@@ -40,33 +40,42 @@ module_param(gso, bool, 0444);
 
 #define VIRTNET_SEND_COMMAND_SG_MAX    2
 
-struct virtnet_stats {
+struct virtnet_send_stats {
struct u64_stats_sync syncp;
u64 tx_bytes;
u64 tx_packets;
+};
 
+struct virtnet_recv_stats {
+   struct u64_stats_sync syncp;
u64 rx_bytes;
u64 rx_packets;
 };
 
-struct virtnet_info {
-   struct virtio_device *vdev;
-   struct virtqueue *rvq, *svq, *cvq;
-   struct net_device *dev;
-   struct napi_struct napi;
-   unsigned int status;
+/* Internal representation of a send virtqueue */
+struct send_queue {
+   /* Virtqueue associated with this send_queue */
+   struct virtqueue *vq;
 
-   /* Number of input buffers, and max we've ever had. */
-   unsigned int num, max;
+   /* TX: fragments + linear part + virtio header */
+   struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
-   /* I like... big packets and I cannot lie! */
-   bool big_packets;
+   /* Active tx statistics */
+   struct virtnet_send_stats __percpu *stats;
+};
 
-   /* Host will merge rx buffers for big packets (shake it! shake it!) */
-   bool mergeable_rx_bufs;
+/* Internal representation of a receive virtqueue */
+struct receive_queue {
+   /* Virtqueue associated with this receive_queue */
+   struct virtqueue *vq;
+
+   /* Back pointer to the virtnet_info */
+   struct virtnet_info *vi;
 
-   /* Active statistics */
-   struct virtnet_stats __percpu *stats;
+   struct napi_struct napi;
+
+   /* Number of input buffers, and max we've ever had. */
+   unsigned int num, max;
 
/* Work struct for refilling if we run low on memory. */
struct delayed_work refill;
@@ -74,9 +83,29 @@ struct virtnet_info {
/* Chain pages by the private ptr. */
struct page *pages;
 
-   /* fragments + linear part + virtio header */
-   struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+   /* RX: fragments + linear part + virtio header */
+   struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+   /* Active rx statistics */
+   struct virtnet_recv_stats __percpu *stats;
+};
+
+struct virtnet_info {
+   int num_queue_pairs;    /* # of RX/TX vq pairs */
+
+   struct send_queue **sq;
+   struct receive_queue **rq;
+   struct virtqueue *cvq;
+
+   struct virtio_device *vdev;
+   struct net_device *dev;
+   unsigned int status;
+
+   /* I like... big packets and I cannot lie! */
+   bool big_packets;
+
+   /* Host will merge rx buffers for big packets (shake it! shake it!) */
+   bool mergeable_rx_bufs;
 };
 
 struct skb_vnet_hdr {
@@ -106,22 +135,22 @@ static inline struct skb_vnet_hdr *skb_v
  * private is used to chain pages for big packets, put the whole
  * most recent used list in the beginning for reuse
  */
-static void give_pages(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct receive_queue *rq, struct page *page)
 {
struct page *end;
 
/* Find end of list, sew whole thing into vi->pages. */
for (end = page; end->private; end = (struct page *)end->private);
-   end->private = (unsigned long)vi->pages;
-   vi->pages = page;
+   end->private = (unsigned long)rq->pages;
+   rq->pages = page;
 }
 
-static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 {
-   struct page *p = vi->pages;
+   struct page *p = rq->pages;

if (p) {
-   vi->pages = (struct page *)p->private;
+   rq->pages = (struct page *)p->private;
/* clear private here, it is used to chain pages */
p->private = 0;
} else
@@ -129,15 +158,16 @@ static struct page *get_a_page(struct vi
return p;
 }
 
-static void skb_xmit_done(struct virtqueue *svq)
+static void skb_xmit_done(struct virtqueue *vq)
 {
-   struct virtnet_info *vi = svq->vdev->priv;
+   struct virtnet_info *vi = vq->vdev->priv;
+   int qnum = vq->queue_index / 2; /* RX/TX vqs are allocated in pairs */
 
/* Suppress further interrupts. */
-   virtqueue_disable_cb(svq);
+   virtqueue_disable_cb(vq);
 
/* We were probably waiting for more output buffers. */
-   netif_wake_queue(vi->dev);
+   netif_wake_subqueue(vi->dev, qnum);
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct