[RFC] [ver3 PATCH 2/6] virtio: Move 'num_queues' to virtqueue

2011-11-11 Thread Krishna Kumar
Move queue_index from virtio_net_config to virtqueue. This is
needed to figure out the queue number of the vq in the 'done'
handler of the device.

Signed-off-by: krkum...@in.ibm.com
---
 drivers/virtio/virtio_pci.c |   10 +++---
 include/linux/virtio.h  |1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

diff -ruNp org/drivers/virtio/virtio_pci.c new/drivers/virtio/virtio_pci.c
--- org/drivers/virtio/virtio_pci.c 2011-11-11 16:44:30.0 +0530
+++ new/drivers/virtio/virtio_pci.c 2011-11-11 16:44:45.0 +0530
@@ -75,9 +75,6 @@ struct virtio_pci_vq_info
/* the number of entries in the queue */
int num;
 
-   /* the index of the queue */
-   int queue_index;
-
/* the virtual address of the ring queue */
void *queue;
 
@@ -180,11 +177,10 @@ static void vp_reset(struct virtio_devic
 static void vp_notify(struct virtqueue *vq)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-   struct virtio_pci_vq_info *info = vq->priv;
 
/* we write the queue's selector into the notification register to
 * signal the other end */
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -380,7 +376,6 @@ static struct virtqueue *setup_vq(struct
if (!info)
return ERR_PTR(-ENOMEM);
 
-   info-queue_index = index;
info-num = num;
info-msix_vector = msix_vec;
 
@@ -403,6 +398,7 @@ static struct virtqueue *setup_vq(struct
goto out_activate_queue;
}
 
+   vq->queue_index = index;
vq->priv = info;
info->vq = vq;
 
@@ -445,7 +441,7 @@ static void vp_del_vq(struct virtqueue *
list_del(&info->node);
spin_unlock_irqrestore(&vp_dev->lock, flags);
 
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
if (vp_dev-msix_enabled) {
iowrite16(VIRTIO_MSI_NO_VECTOR,
diff -ruNp org/include/linux/virtio.h new/include/linux/virtio.h
--- org/include/linux/virtio.h  2011-11-11 16:44:30.0 +0530
+++ new/include/linux/virtio.h  2011-11-11 16:44:45.0 +0530
@@ -22,6 +22,7 @@ struct virtqueue {
void (*callback)(struct virtqueue *vq);
const char *name;
struct virtio_device *vdev;
+   int queue_index;/* the index of the queue */
void *priv;
 };
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC] [ver3 PATCH 1/6] virtio_net: Introduce VIRTIO_NET_F_MULTIQUEUE

2011-11-11 Thread Krishna Kumar
Introduce VIRTIO_NET_F_MULTIQUEUE. 

Signed-off-by: krkum...@in.ibm.com
---
 include/linux/virtio_net.h |1 +
 1 file changed, 1 insertion(+)

diff -ruNp org/include/linux/virtio_net.h new/include/linux/virtio_net.h
--- org/include/linux/virtio_net.h  2011-10-12 10:16:46.0 +0530
+++ new/include/linux/virtio_net.h  2011-11-11 16:44:34.0 +0530
@@ -49,6 +49,7 @@
 #define VIRTIO_NET_F_CTRL_RX   18  /* Control channel RX mode support */
 #define VIRTIO_NET_F_CTRL_VLAN 19  /* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20  /* Extra RX mode control support */
+#define VIRTIO_NET_F_MULTIQUEUE 21 /* Device supports multiple TXQ/RXQ */
 
 #define VIRTIO_NET_S_LINK_UP   1   /* Link is up */
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC] [ver3 PATCH 0/6] Implement multiqueue virtio-net

2011-11-11 Thread Krishna Kumar
This patch series resurrects the earlier multiple TX/RX queues
functionality for virtio_net, and addresses the issues pointed
out.  It also includes an API to share irq's, f.e.  amongst the
TX vqs. 

I plan to run TCP/UDP STREAM and RR tests for local-host and
local-remote, and send the results in the next couple of days.


patch #1: Introduce VIRTIO_NET_F_MULTIQUEUE
patch #2: Move 'num_queues' to virtqueue
patch #3: virtio_net driver changes
patch #4: vhost_net changes
patch #5: Implement find_vqs_irq()
patch #6: Convert virtio_net driver to use find_vqs_irq()


Changes from rev2:
Michael:
---
1. Added functions to handle setting RX/TX/CTRL vq's.
2. num_queue_pairs instead of numtxqs.
3. Experimental support for fewer irq's in find_vqs.

Rusty:
--
4. Cleaned up some existing while (1).
5. rvq/svq and rx_sg/tx_sg changed to vq and sg respectively.
6. Cleaned up some #if 1 code.


Issue when using patch5:
-

The new API is designed to minimize code duplication.  E.g.
vp_find_vqs() is implemented as:

static int vp_find_vqs(...)
{
return vp_find_vqs_irq(vdev, nvqs, vqs, callbacks, names, NULL);
}

In my testing, when multiple tx/rx is used with multiple netperf
sessions, all the device tx queues stops a few thousand times and
subsequently woken up by skb_xmit_done.  But after some 40K-50K
iterations of stop/wake, some of the txq's stop and no wake
interrupt comes. (modprobe -r followed by modprobe solves this, so
it is not a system hang).  At the time of the hang (#txqs=#rxqs=4):

# egrep "CPU|virtio0" /proc/interrupts | grep -v config
         CPU0    CPU1    CPU2    CPU3
41:     49057   49262   48828   49421   PCI-MSI-edge    virtio0-input.0
42:      5066    5213    5221    5109   PCI-MSI-edge    virtio0-output.0
43:     43380   43770   43007   43148   PCI-MSI-edge    virtio0-input.1
44:     41433   41727   42101   41175   PCI-MSI-edge    virtio0-input.2
45:     38465   37629   38468   38768   PCI-MSI-edge    virtio0-input.3

# tc -s qdisc show dev eth0
qdisc mq 0: root  
Sent 393196939897 bytes 271191624 pkt (dropped 59897,
overlimits 0 requeues 67156) backlog 25375720b 1601p
requeues 67156  

I am not sure if patch #5 is responsible for the hang.  Also, without
patch #5/patch #6, I changed vp_find_vqs() to:
static int vp_find_vqs(...)
{
return vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
  false, false);
}
No packets were getting TX'd with this change when #txqs > 1.  This is
with the MQ-only patch that doesn't touch drivers/virtio/ directory.

Also, the MQ patch works reasonably well with 2 vectors - with
use_msix=1 and per_vq_vectors=0 in vp_find_vqs().

Patch against net-next - please review.

Signed-off-by: krkum...@in.ibm.com
---

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC] [ver3 PATCH 4/6] vhost_net: vhost_net changes

2011-11-11 Thread Krishna Kumar
Changes for multiqueue vhost_net driver.

Signed-off-by: krkum...@in.ibm.com
---
 drivers/vhost/net.c   |  253 +---
 drivers/vhost/vhost.c |  225 ---
 drivers/vhost/vhost.h |   26 +++-
 3 files changed, 340 insertions(+), 164 deletions(-)

diff -ruNp org/drivers/vhost/net.c new/drivers/vhost/net.c
--- org/drivers/vhost/net.c 2011-11-11 16:44:56.0 +0530
+++ new/drivers/vhost/net.c 2011-11-11 16:45:11.0 +0530
@@ -41,12 +41,6 @@ MODULE_PARM_DESC(experimental_zcopytx, 
 #define VHOST_MAX_PEND 128
 #define VHOST_GOODCOPY_LEN 256
 
-enum {
-   VHOST_NET_VQ_RX = 0,
-   VHOST_NET_VQ_TX = 1,
-   VHOST_NET_VQ_MAX = 2,
-};
-
 enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
@@ -55,12 +49,13 @@ enum vhost_net_poll_state {
 
 struct vhost_net {
struct vhost_dev dev;
-   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
-   struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct vhost_virtqueue *vqs;
+   struct vhost_poll *poll;
+   struct socket **socks;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
-   enum vhost_net_poll_state tx_poll_state;
+   enum vhost_net_poll_state *tx_poll_state;
 };
 
 static bool vhost_sock_zcopy(struct socket *sock)
@@ -108,28 +103,28 @@ static void copy_iovec_hdr(const struct 
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_stop(struct vhost_net *net)
+static void tx_poll_stop(struct vhost_net *net, int qnum)
 {
-   if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
+   if (likely(net->tx_poll_state[qnum / 2] != VHOST_NET_POLL_STARTED))
return;
-   vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
-   net->tx_poll_state = VHOST_NET_POLL_STOPPED;
+   vhost_poll_stop(&net->poll[qnum]);
+   net->tx_poll_state[qnum / 2] = VHOST_NET_POLL_STOPPED;
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_start(struct vhost_net *net, struct socket *sock)
+static void tx_poll_start(struct vhost_net *net, struct socket *sock, int qnum)
 {
-   if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
+   if (unlikely(net->tx_poll_state[qnum / 2] != VHOST_NET_POLL_STOPPED))
return;
-   vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
-   net->tx_poll_state = VHOST_NET_POLL_STARTED;
+   vhost_poll_start(&net->poll[qnum], sock->file);
+   net->tx_poll_state[qnum / 2] = VHOST_NET_POLL_STARTED;
 }
 
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_tx(struct vhost_net *net)
+static void handle_tx(struct vhost_virtqueue *vq)
 {
-   struct vhost_virtqueue *vq = net-dev.vqs[VHOST_NET_VQ_TX];
+   struct vhost_net *net = container_of(vq-dev, struct vhost_net, dev);
unsigned out, in, s;
int head;
struct msghdr msg = {
@@ -155,7 +150,7 @@ static void handle_tx(struct vhost_net *
wmem = atomic_read(&sock->sk->sk_wmem_alloc);
if (wmem >= sock->sk->sk_sndbuf) {
mutex_lock(vq-mutex);
-   tx_poll_start(net, sock);
+   tx_poll_start(net, sock, vq-qnum);
mutex_unlock(vq-mutex);
return;
}
@@ -164,7 +159,7 @@ static void handle_tx(struct vhost_net *
vhost_disable_notify(net-dev, vq);
 
if (wmem < sock->sk->sk_sndbuf / 2)
-   tx_poll_stop(net);
+   tx_poll_stop(net, vq-qnum);
hdr_size = vq-vhost_hlen;
zcopy = vhost_sock_zcopy(sock);
 
@@ -186,7 +181,7 @@ static void handle_tx(struct vhost_net *
 
wmem = atomic_read(sock-sk-sk_wmem_alloc);
if (wmem = sock-sk-sk_sndbuf * 3 / 4) {
-   tx_poll_start(net, sock);
+   tx_poll_start(net, sock, vq-qnum);
set_bit(SOCK_ASYNC_NOSPACE, sock-flags);
break;
}
@@ -197,7 +192,7 @@ static void handle_tx(struct vhost_net *
(vq-upend_idx - vq-done_idx) :
(vq-upend_idx + UIO_MAXIOV - vq-done_idx);
if (unlikely(num_pends  VHOST_MAX_PEND)) {
-   tx_poll_start(net, sock);
+   tx_poll_start(net, sock, vq-qnum);
set_bit(SOCK_ASYNC_NOSPACE, sock-flags);
break;
}
@@ -257,7 +252,7 @@ static void handle_tx(struct vhost_net *
UIO_MAXIOV;
}
vhost_discard_vq_desc(vq, 1);
-   tx_poll_start(net, sock);
+   

[RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes

2011-11-11 Thread Krishna Kumar
Changes for multiqueue virtio_net driver.

Signed-off-by: krkum...@in.ibm.com
---
 drivers/net/virtio_net.c   |  688 ---
 include/linux/virtio_net.h |2 
 2 files changed, 481 insertions(+), 209 deletions(-)

diff -ruNp org/drivers/net/virtio_net.c new/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c2011-11-11 16:44:38.0 +0530
+++ new/drivers/net/virtio_net.c2011-11-11 16:44:59.0 +0530
@@ -40,33 +40,42 @@ module_param(gso, bool, 0444);
 
 #define VIRTNET_SEND_COMMAND_SG_MAX2
 
-struct virtnet_stats {
+struct virtnet_send_stats {
struct u64_stats_sync syncp;
u64 tx_bytes;
u64 tx_packets;
+};
 
+struct virtnet_recv_stats {
+   struct u64_stats_sync syncp;
u64 rx_bytes;
u64 rx_packets;
 };
 
-struct virtnet_info {
-   struct virtio_device *vdev;
-   struct virtqueue *rvq, *svq, *cvq;
-   struct net_device *dev;
-   struct napi_struct napi;
-   unsigned int status;
+/* Internal representation of a send virtqueue */
+struct send_queue {
+   /* Virtqueue associated with this send _queue */
+   struct virtqueue *vq;
 
-   /* Number of input buffers, and max we've ever had. */
-   unsigned int num, max;
+   /* TX: fragments + linear part + virtio header */
+   struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
-   /* I like... big packets and I cannot lie! */
-   bool big_packets;
+   /* Active tx statistics */
+   struct virtnet_send_stats __percpu *stats;
+};
 
-   /* Host will merge rx buffers for big packets (shake it! shake it!) */
-   bool mergeable_rx_bufs;
+/* Internal representation of a receive virtqueue */
+struct receive_queue {
+   /* Virtqueue associated with this receive_queue */
+   struct virtqueue *vq;
+
+   /* Back pointer to the virtnet_info */
+   struct virtnet_info *vi;
 
-   /* Active statistics */
-   struct virtnet_stats __percpu *stats;
+   struct napi_struct napi;
+
+   /* Number of input buffers, and max we've ever had. */
+   unsigned int num, max;
 
/* Work struct for refilling if we run low on memory. */
struct delayed_work refill;
@@ -74,9 +83,29 @@ struct virtnet_info {
/* Chain pages by the private ptr. */
struct page *pages;
 
-   /* fragments + linear part + virtio header */
-   struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+   /* RX: fragments + linear part + virtio header */
+   struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+   /* Active rx statistics */
+   struct virtnet_recv_stats __percpu *stats;
+};
+
+struct virtnet_info {
+   int num_queue_pairs;/* # of RX/TX vq pairs */
+
+   struct send_queue **sq;
+   struct receive_queue **rq;
+   struct virtqueue *cvq;
+
+   struct virtio_device *vdev;
+   struct net_device *dev;
+   unsigned int status;
+
+   /* I like... big packets and I cannot lie! */
+   bool big_packets;
+
+   /* Host will merge rx buffers for big packets (shake it! shake it!) */
+   bool mergeable_rx_bufs;
 };
 
 struct skb_vnet_hdr {
@@ -106,22 +135,22 @@ static inline struct skb_vnet_hdr *skb_v
  * private is used to chain pages for big packets, put the whole
  * most recent used list in the beginning for reuse
  */
-static void give_pages(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct receive_queue *rq, struct page *page)
 {
struct page *end;
 
/* Find end of list, sew whole thing into vi->pages. */
for (end = page; end->private; end = (struct page *)end->private);
-   end->private = (unsigned long)vi->pages;
-   vi->pages = page;
+   end->private = (unsigned long)rq->pages;
+   rq->pages = page;
 }
 
-static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 {
-   struct page *p = vi-pages;
+   struct page *p = rq-pages;
 
if (p) {
-   vi-pages = (struct page *)p-private;
+   rq-pages = (struct page *)p-private;
/* clear private here, it is used to chain pages */
p-private = 0;
} else
@@ -129,15 +158,16 @@ static struct page *get_a_page(struct vi
return p;
 }
 
-static void skb_xmit_done(struct virtqueue *svq)
+static void skb_xmit_done(struct virtqueue *vq)
 {
-   struct virtnet_info *vi = svq->vdev->priv;
+   struct virtnet_info *vi = vq->vdev->priv;
+   int qnum = vq->queue_index / 2; /* RX/TX vqs are allocated in pairs */
 
/* Suppress further interrupts. */
-   virtqueue_disable_cb(svq);
+   virtqueue_disable_cb(vq);
 
/* We were probably waiting for more output buffers. */
-   netif_wake_queue(vi->dev);
+   netif_wake_subqueue(vi->dev, qnum);
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct 

[RFC] [ver3 PATCH 6/6] virtio_net: Convert virtio_net driver to use find_vqs_irq

2011-11-11 Thread Krishna Kumar
Convert virtio_net driver to use find_vqs_irq(). The TX vq's
share a single irq, while the RX vq's have individual irq's.
The skb_xmit_done handler also checks if any work is required.

Signed-off-by: krkum...@in.ibm.com
---
 drivers/net/virtio_net.c |   29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff -ruNp org/drivers/net/virtio_net.c new/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c2011-11-11 16:45:17.0 +0530
+++ new/drivers/net/virtio_net.c2011-11-11 16:48:45.0 +0530
@@ -163,11 +163,13 @@ static void skb_xmit_done(struct virtque
struct virtnet_info *vi = vq->vdev->priv;
int qnum = vq->queue_index / 2; /* RX/TX vqs are allocated in pairs */
 
-   /* Suppress further interrupts. */
-   virtqueue_disable_cb(vq);
+   if (__netif_subqueue_stopped(vi->dev, qnum)) {
+   /* Suppress further interrupts. */
+   virtqueue_disable_cb(vq);
 
-   /* We were probably waiting for more output buffers. */
-   netif_wake_subqueue(vi->dev, qnum);
+   /* We were probably waiting for more output buffers. */
+   netif_wake_subqueue(vi->dev, qnum);
+   }
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
@@ -1120,6 +1122,7 @@ static void setup_cvq(struct virtnet_inf
 
 static int invoke_find_vqs(struct virtnet_info *vi)
 {
+   unsigned long *flags = NULL;
vq_callback_t **callbacks;
struct virtqueue **vqs;
int ret = -ENOMEM;
@@ -1141,6 +1144,14 @@ static int invoke_find_vqs(struct virtne
if (!vqs || !callbacks || !names)
goto err;
 
+   if (vi-num_queue_pairs  1) {
+   int num = (total_vqs + BITS_PER_LONG - 1) / BITS_PER_LONG;
+
+   flags = kzalloc(num * sizeof(*flags), GFP_KERNEL);
+   if (!flags)
+   goto err;
+   }
+
/* Allocate/initialize parameters for recv virtqueues */
for (i = 0; i  vi-num_queue_pairs * 2; i += 2) {
callbacks[i] = skb_recv_done;
@@ -1155,6 +1166,8 @@ static int invoke_find_vqs(struct virtne
names[i] = kasprintf(GFP_KERNEL, output.%d, i / 2);
if (!names[i])
goto err;
+   if (flags)
+   set_bit(i, flags);
}
 
/* Parameters for control virtqueue, if any */
@@ -1163,9 +1176,9 @@ static int invoke_find_vqs(struct virtne
names[i - 1] = control;
}
 
-   ret = vi-vdev-config-find_vqs(vi-vdev, total_vqs, vqs, callbacks,
-(const char **)names);
-
+   ret = vi-vdev-config-find_vqs_irq(vi-vdev, total_vqs, vqs,
+callbacks, (const char **)names,
+flags);
if (ret)
goto err;
 
@@ -1174,6 +1187,8 @@ static int invoke_find_vqs(struct virtne
setup_cvq(vi, vqs, vi-num_queue_pairs * 2);
 
 err:
+   kfree(flags);
+
if (ret  names)
for (i = 0; i  vi-num_queue_pairs * 2; i++)
kfree(names[i]);

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC] [ver3 PATCH 5/6] virtio: Implement find_vqs_irq()

2011-11-11 Thread Krishna Kumar
Implement find_vqs_irq() to reduce number of vectors. It can
be used to specify which vq's need their own irqs, and which
can share irqs with other vq's.

Signed-off-by: krkum...@in.ibm.com
---
 drivers/virtio/virtio_pci.c   |  108 
 include/linux/virtio_config.h |   14 
 2 files changed, 95 insertions(+), 27 deletions(-)

diff -ruNp org/drivers/virtio/virtio_pci.c new/drivers/virtio/virtio_pci.c
--- org/drivers/virtio/virtio_pci.c 2011-11-11 16:45:09.0 +0530
+++ new/drivers/virtio/virtio_pci.c 2011-11-11 16:54:35.0 +0530
@@ -40,7 +40,7 @@ struct virtio_pci_device
/* the IO mapping for the PCI config space */
void __iomem *ioaddr;
 
-   /* a list of queues so we can dispatch IRQs */
+   /* a list of queues which have registered to receive IRQs */
spinlock_t lock;
struct list_head virtqueues;
 
@@ -196,7 +196,7 @@ static irqreturn_t vp_config_changed(int
return IRQ_HANDLED;
 }
 
-/* Notify all virtqueues on an interrupt. */
+/* Notify all vq's on 'virtqueues' list on an interrupt. */
 static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
 {
struct virtio_pci_device *vp_dev = opaque;
@@ -358,7 +358,7 @@ static struct virtqueue *setup_vq(struct
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
struct virtio_pci_vq_info *info;
struct virtqueue *vq;
-   unsigned long flags, size;
+   unsigned long size;
u16 num;
int err;
 
@@ -378,6 +378,7 @@ static struct virtqueue *setup_vq(struct
 
info-num = num;
info-msix_vector = msix_vec;
+   INIT_LIST_HEAD(info-node);
 
size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
info-queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
@@ -411,14 +412,6 @@ static struct virtqueue *setup_vq(struct
}
}
 
-   if (callback) {
-   spin_lock_irqsave(vp_dev-lock, flags);
-   list_add(info-node, vp_dev-virtqueues);
-   spin_unlock_irqrestore(vp_dev-lock, flags);
-   } else {
-   INIT_LIST_HEAD(info-node);
-   }
-
return vq;
 
 out_assign:
@@ -472,7 +465,8 @@ static void vp_del_vqs(struct virtio_dev
if (vp_dev-per_vq_vectors 
info-msix_vector != VIRTIO_MSI_NO_VECTOR)
free_irq(vp_dev-msix_entries[info-msix_vector].vector,
-vq);
+list_empty(info-node) ?
+(void *)vq : (void *)vp_dev);
vp_del_vq(vq);
}
vp_dev-per_vq_vectors = false;
@@ -480,16 +474,37 @@ static void vp_del_vqs(struct virtio_dev
vp_free_vectors(vdev);
 }
 
+static void add_vq_to_list(struct virtqueue *vq,
+  struct virtio_pci_device *vp_dev,
+  vq_callback_t *cb)
+{
+   struct virtio_pci_vq_info *info = vq-priv;
+   unsigned long flags;
+
+   if (cb) {
+   spin_lock_irqsave(vp_dev-lock, flags);
+   list_add(info-node, vp_dev-virtqueues);
+   spin_unlock_irqrestore(vp_dev-lock, flags);
+   }
+}
+
+/* Return true if flags is NULL, or 'bit'# in flags is clear */
+static bool bit_clear(unsigned long *flags, int bit)
+{
+   return flags ? !test_bit(bit, flags) : true;
+}
+
 static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
  struct virtqueue *vqs[],
  vq_callback_t *callbacks[],
  const char *names[],
  bool use_msix,
- bool per_vq_vectors)
+ bool per_vq_vectors, unsigned long *flags)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
u16 msix_vec;
int i, err, nvectors, allocated_vectors;
+   int count = 0;  /* Count of vq's using shared irq's */
 
if (!use_msix) {
/* Old style: one normal interrupt for change and all vqs. */
@@ -500,9 +515,19 @@ static int vp_try_to_find_vqs(struct vir
if (per_vq_vectors) {
/* Best option: one for change interrupt, one per vq. */
nvectors = 1;
-   for (i = 0; i  nvqs; ++i)
-   if (callbacks[i])
+   for (i = 0; i  nvqs; ++i) {
+   bool alloc_irq = bit_clear(flags, i);
+
+   /*
+* We allocate a vector if cb is present,
+* AND (driver requested a vector OR this
+* is the first shared vector).
+*/
+   if (callbacks[i] 
+   (alloc_irq || ++count == 1))
   

Re: [RFC] [ver3 PATCH 0/6] Implement multiqueue virtio-net

2011-11-11 Thread Krishna Kumar
Sasha Levin levinsasha...@gmail.com wrote on 11/12/2011 03:32:04 AM:

 I'm seeing this BUG() sometimes when running it using a small patch I
 did for KVM tool:
 
 [1.281531] Call Trace:
 [1.281531]  [8138a0e5] ? free_rq_sq+0x2c/0xce
 [1.281531]  [8138bb63] ? virtnet_probe+0x81c/0x855
 [1.281531]  [8129c9e7] ? virtio_dev_probe+0xa7/0xc6
 [1.281531]  [8134d2c3] ? driver_probe_device+0xb2/0x142
 [1.281531]  [8134d3a2] ? __driver_attach+0x4f/0x6f
 [1.281531]  [8134d353] ? driver_probe_device+0x142/0x142
 [1.281531]  [8134c3ab] ? bus_for_each_dev+0x47/0x72
 [1.281531]  [8134c90d] ? bus_add_driver+0xa2/0x1e6
 [1.281531]  [81cc1b36] ? tun_init+0x89/0x89
 [1.281531]  [8134db59] ? driver_register+0x8d/0xf8
 [1.281531]  [81cc1b36] ? tun_init+0x89/0x89
 [1.281531]  [81c98ac1] ? do_one_initcall+0x78/0x130
 [1.281531]  [81c98c0e] ? kernel_init+0x95/0x113
 [1.281531]  [81658274] ? kernel_thread_helper+0x4/0x10
 [1.281531]  [81c98b79] ? do_one_initcall+0x130/0x130
 [1.281531]  [81658270] ? gs_change+0x13/0x13
 [1.281531] Code: c2 85 d2 48 0f 45 2d d1 39 ce 00 eb 22 65 8b 14 25
 90 cc 00 00 48 8b 05 f0 a6 bc 00 48 63 d2 4c 89 e7 48 03 3c d0 e8 83 dd
 00 00 
 [1.281531]  8b 68 10 44 89 e6 48 89 ef 2b 75 18 e8 e4 f1 ff ff 8b 05
 fd 
 [1.281531] RIP  [810b3ac7] free_percpu+0x9a/0x104
 [1.281531]  RSP 88001383fd50
 [1.281531] CR2: 0010
 [1.281531] ---[ end trace 68cbc23dfe2fe62a ]---
 
 I don't have time today to dig into it, sorry.

Thanks for the report.

free_rq_sq() was being called twice in the failure path. The second
call panic'd since it had freed the same pointers earlier.

1. free_rq_sq() was being called twice in the failure path.
   virtnet_setup_vqs() had already freed up rq/sq on error, and
   virtnet_probe() tried to do it again. Fix it in virtnet_probe
   by moving the call up.
2. Make free_rq_sq() re-entrant by setting freed pointers to NULL.
3. Remove free_stats() as it was being called only once.

Sasha, could you please try this patch on top of existing patches?

thanks!

Signed-off-by: krkum...@in.ibm.com
---
 drivers/net/virtio_net.c |   41 +++--
 1 file changed, 13 insertions(+), 28 deletions(-)

diff -ruNp n6/drivers/net/virtio_net.c n7/drivers/net/virtio_net.c
--- n6/drivers/net/virtio_net.c 2011-11-12 11:03:48.0 +0530
+++ n7/drivers/net/virtio_net.c 2011-11-12 10:39:28.0 +0530
@@ -782,23 +782,6 @@ static void virtnet_netpoll(struct net_d
 }
 #endif
 
-static void free_stats(struct virtnet_info *vi)
-{
-   int i;
-
-   for (i = 0; i  vi-num_queue_pairs; i++) {
-   if (vi-sq  vi-sq[i]) {
-   free_percpu(vi-sq[i]-stats);
-   vi-sq[i]-stats = NULL;
-   }
-
-   if (vi-rq  vi-rq[i]) {
-   free_percpu(vi-rq[i]-stats);
-   vi-rq[i]-stats = NULL;
-   }
-   }
-}
-
 static int virtnet_open(struct net_device *dev)
 {
struct virtnet_info *vi = netdev_priv(dev);
@@ -1054,19 +1037,22 @@ static void free_rq_sq(struct virtnet_in
 {
int i;
 
-   free_stats(vi);
-
-   if (vi-rq) {
-   for (i = 0; i  vi-num_queue_pairs; i++)
+   for (i = 0; i  vi-num_queue_pairs; i++) {
+   if (vi-rq  vi-rq[i]) {
+   free_percpu(vi-rq[i]-stats);
kfree(vi-rq[i]);
-   kfree(vi-rq);
-   }
+   vi-rq[i] = NULL;
+   }
 
-   if (vi-sq) {
-   for (i = 0; i  vi-num_queue_pairs; i++)
+   if (vi-sq  vi-sq[i]) {
+   free_percpu(vi-sq[i]-stats);
kfree(vi-sq[i]);
-   kfree(vi-sq);
+   vi-sq[i] = NULL;
+   }
}
+
+   kfree(vi-rq);
+   kfree(vi-sq);
 }
 
 static void free_unused_bufs(struct virtnet_info *vi)
@@ -1387,10 +1373,9 @@ free_vqs:
for (i = 0; i  num_queue_pairs; i++)
cancel_delayed_work_sync(vi-rq[i]-refill);
vdev-config-del_vqs(vdev);
-
-free_netdev:
free_rq_sq(vi);
 
+free_netdev:
free_netdev(dev);
return err;
 }

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/4] [RFC] virtio-net: Improve small packet performance

2011-05-04 Thread Krishna Kumar
Earlier approach to improving small packet performance went
along the lines of dropping packets when the txq is full to
avoid stop/start of the txq. Though performance improved
significantly (upto 3x) for a single thread, multiple netperf
sessions showed a regression of upto -17% (starting from 4
sessions).

This patch proposes a different approach with the following
changes:

A. virtio:
- Provide a API to get available number of slots.

B. virtio-net:
- Remove stop/start txq's and associated callback.
- Pre-calculate the number of slots needed to transmit
  the skb in xmit_skb and bail out early if enough space
  is not available. My testing shows that 2.5-3% of
  packets are benefited by using this API.
- Do not drop skbs but instead return TX_BUSY like other
  drivers.
- When returning EBUSY, set a per-txq variable to indicate
  to dev_queue_xmit() whether to restart xmits on this txq.

C. net/sched/sch_generic.c:
Since virtio-net now returns EBUSY, the skb is requeued to
gso_skb. This allows adding the addional check for restart
xmits in just the slow-path (the first re-queued packet
case of dequeue_skb, where it checks for gso_skb) before
deciding whether to call the driver or not.

Patch was also tested between two servers with Emulex OneConnect
10G cards to confirm there is no regression. Though the patch is
an attempt to improve only small packet performance, there was
improvement for 1K, 2K and also 16K both in BW and SD. Results
from Guest - Remote Host (BW in Mbps) for 1K and 16K I/O sizes:


I/O Size: 1K
#   BW1 BW2 (%) SD1 SD2 (%)

1   12263313 (170.2)6.6 1.9 (-71.2)
2   32237705 (139.0)18.07.1 (-60.5)
4   72238716 (20.6) 36.529.7 (-18.6)
8   86898693 (0)131.5   123.0 (-6.4)
16  80598285 (2.8)  578.3   506.2 (-12.4)
32  77587955 (2.5)  2281.4  2244.2 (-1.6)
64  75037895 (5.2)  9734.0  9424.4 (-3.1)
96  74967751 (3.4)  21980.9 20169.3 (-8.2)
128 73897741 (4.7)  40467.5 34995.5 (-13.5)

Summary:BW: 16.2%   SD: -10.2%


I/O Size: 16K
#   BW1 BW2 (%) SD1 SD2 (%)

1   66847019 (5.0)  1.1 1.1 (0)
2   76747196 (-6.2) 5.0 4.8 (-4.0)
4   73588032 (9.1)  21.320.4 (-4.2)
8   73938015 (8.4)  82.782.0 (-.8)
16  79588366 (5.1)  283.2   310.7 (9.7)
32  77928113 (4.1)  1257.5  1363.0 (8.3)
64  76738040 (4.7)  5723.1  5812.4 (1.5)
96  74627883 (5.6)  12731.8 12119.8 (-4.8)
128 73387800 (6.2)  21331.7 21094.7 (-1.1)

Summary:BW: 4.6%SD: -1.5%

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] [RFC] netdevice: Introduce per-txq xmit_restart

2011-05-04 Thread Krishna Kumar
Add a per-txq field that can (optionally) be set by participating
drivers to indicate when to restart tx.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 include/linux/netdevice.h |1 +
 1 file changed, 1 insertion(+)

diff -ruNp org/include/linux/netdevice.h new/include/linux/netdevice.h
--- org/include/linux/netdevice.h   2011-05-04 18:57:06.0 +0530
+++ new/include/linux/netdevice.h   2011-05-04 18:57:09.0 +0530
@@ -571,6 +571,7 @@ struct netdev_queue {
 * please use this field instead of dev-trans_start
 */
unsigned long   trans_start;
+   unsigned long   xmit_restart_jiffies; /* jiffies to restart */
 } cacheline_aligned_in_smp;
 
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/4] [RFC] virtio: Introduce new API to get free space

2011-05-04 Thread Krishna Kumar
Introduce virtqueue_get_capacity() to help bail out of transmit
path early. Also remove notification when we run out of space (I
am not sure if this should be under a feature bit).

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/virtio/virtio_ring.c |   13 -
 include/linux/virtio.h   |5 +
 2 files changed, 13 insertions(+), 5 deletions(-)

diff -ruNp org/include/linux/virtio.h new/include/linux/virtio.h
--- org/include/linux/virtio.h  2011-05-04 18:57:06.0 +0530
+++ new/include/linux/virtio.h  2011-05-04 18:57:09.0 +0530
@@ -27,6 +27,9 @@ struct virtqueue {
 
 /**
  * operations for virtqueue
+ * virtqueue_get_capacity: Get vq capacity
+ * vq: the struct virtqueue we're talking about.
+ * Returns remaining capacity of queue
  * virtqueue_add_buf: expose buffer to other end
  * vq: the struct virtqueue we're talking about.
  * sg: the description of the buffer(s).
@@ -62,6 +65,8 @@ struct virtqueue {
  * All operations can be called in any context.
  */
 
+int virtqueue_get_capacity(struct virtqueue *vq);
+
 int virtqueue_add_buf_gfp(struct virtqueue *vq,
  struct scatterlist sg[],
  unsigned int out_num,
diff -ruNp org/drivers/virtio/virtio_ring.c new/drivers/virtio/virtio_ring.c
--- org/drivers/virtio/virtio_ring.c2011-05-04 18:57:06.0 +0530
+++ new/drivers/virtio/virtio_ring.c2011-05-04 18:57:09.0 +0530
@@ -156,6 +156,14 @@ static int vring_add_indirect(struct vri
return head;
 }
 
+int virtqueue_get_capacity(struct virtqueue *_vq)
+{
+   struct vring_virtqueue *vq = to_vvq(_vq);
+
+   return vq->num_free;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_capacity);
+
 int virtqueue_add_buf_gfp(struct virtqueue *_vq,
  struct scatterlist sg[],
  unsigned int out,
@@ -185,11 +193,6 @@ int virtqueue_add_buf_gfp(struct virtque
if (vq->num_free < out + in) {
pr_debug("Can't add buf len %i - avail = %i\n",
 out + in, vq->num_free);
-   /* FIXME: for historical reasons, we force a notify here if
-* there are outgoing parts to the buffer.  Presumably the
-* host should service the ring ASAP. */
-   if (out)
-   vq->notify(&vq->vq);
END_USE(vq);
return -ENOSPC;
}
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/4] [RFC] sched: Changes to dequeue_skb

2011-05-04 Thread Krishna Kumar
Dequeue_skb has an additional check, for the first packet that
is requeued, to see if the device has requested xmits after a
interval. This is intended to not affect the fast xmit path, and
have minimal overhead to the slow path. Drivers setting the
restart time should not stop/start their tx queues, and hence
the frozen/stopped check can be avoided.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 net/sched/sch_generic.c |   23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff -ruNp org/net/sched/sch_generic.c new/net/sched/sch_generic.c
--- org/net/sched/sch_generic.c 2011-05-04 18:57:06.0 +0530
+++ new/net/sched/sch_generic.c 2011-05-04 18:57:09.0 +0530
@@ -50,17 +50,30 @@ static inline int dev_requeue_skb(struct
return 0;
 }
 
+/*
+ * This function can return a rare false positive for drivers setting
+ * xmit_restart_jiffies (e.g. virtio-net) when xmit_restart_jiffies is
+ * zero but the device may not be ready. That only leads to the skb
+ * being requeued again.
+ */
+static inline int can_restart_xmit(struct Qdisc *q, struct sk_buff *skb)
+{
+   struct net_device *dev = qdisc_dev(q);
+   struct netdev_queue *txq;
+
+   txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+   if (unlikely(txq->xmit_restart_jiffies))
+   return time_after_eq(jiffies, txq->xmit_restart_jiffies);
+   return !netif_tx_queue_frozen_or_stopped(txq);
+}
+
 static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 {
struct sk_buff *skb = q->gso_skb;
 
if (unlikely(skb)) {
-   struct net_device *dev = qdisc_dev(q);
-   struct netdev_queue *txq;
-
/* check the reason of requeuing without tx lock first */
-   txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
-   if (!netif_tx_queue_frozen_or_stopped(txq)) {
+   if (can_restart_xmit(q, skb)) {
q->gso_skb = NULL;
q->q.qlen--;
} else
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/4] [RFC] virtio-net: Changes to virtio-net driver

2011-05-04 Thread Krishna Kumar
Changes:

1. Remove xmit notification
2. free_old_xmit_skbs() frees up to a limit to reduce tx jitter.
3. xmit_skb() precalculates the number of slots and checks if
   that is available. It assumes that we are not using
   indirect descriptors at this time.
4. start_xmit() becomes a small routine that removes most error
   checks, does not drop packets but instead returns EBUSY if
   there is no space to transmit. It also sets when to restart
   xmits in future.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/net/virtio_net.c |   70 ++---
 1 file changed, 20 insertions(+), 50 deletions(-)

diff -ruNp org/drivers/net/virtio_net.c new/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c2011-05-04 18:57:06.0 +0530
+++ new/drivers/net/virtio_net.c2011-05-04 18:57:09.0 +0530
@@ -117,17 +117,6 @@ static struct page *get_a_page(struct vi
return p;
 }
 
-static void skb_xmit_done(struct virtqueue *svq)
-{
-   struct virtnet_info *vi = svq->vdev->priv;
-
-   /* Suppress further interrupts. */
-   virtqueue_disable_cb(svq);
-
-   /* We were probably waiting for more output buffers. */
-   netif_wake_queue(vi->dev);
-}
-
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
 unsigned int offset, unsigned int *len)
 {
@@ -509,19 +498,18 @@ again:
return received;
 }
 
-static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
+static inline void free_old_xmit_skbs(struct virtnet_info *vi)
 {
struct sk_buff *skb;
-   unsigned int len, tot_sgs = 0;
+   unsigned int count = 0, len;
 
-   while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
+   while (count++ < MAX_SKB_FRAGS+2 &&
+  (skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
pr_debug("Sent skb %p\n", skb);
vi->dev->stats.tx_bytes += skb->len;
vi->dev->stats.tx_packets++;
-   tot_sgs += skb_vnet_hdr(skb)->num_sg;
dev_kfree_skb_any(skb);
}
-   return tot_sgs;
 }
 
 static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
@@ -531,6 +519,12 @@ static int xmit_skb(struct virtnet_info 
 
pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
 
+   hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
+   if (unlikely(hdr->num_sg > virtqueue_get_capacity(vi->svq))) {
+   /* Don't rely on indirect descriptors when reaching capacity */
+   return -ENOSPC;
+   }
+
if (skb->ip_summed == CHECKSUM_PARTIAL) {
hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
hdr->hdr.csum_start = skb_checksum_start_offset(skb);
@@ -566,7 +560,6 @@ static int xmit_skb(struct virtnet_info 
else
sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr);
 
-   hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg,
0, skb);
 }
@@ -574,30 +567,21 @@ static int xmit_skb(struct virtnet_info 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
struct virtnet_info *vi = netdev_priv(dev);
-   int capacity;
 
/* Free up any pending old buffers before queueing new ones. */
free_old_xmit_skbs(vi);
 
/* Try to transmit */
-   capacity = xmit_skb(vi, skb);
+   if (unlikely(xmit_skb(vi, skb) < 0)) {
+   struct netdev_queue *txq;
 
-   /* This can happen with OOM and indirect buffers. */
-   if (unlikely(capacity < 0)) {
-   if (net_ratelimit()) {
-   if (likely(capacity == -ENOMEM)) {
-   dev_warn(&dev->dev,
-"TX queue failure: out of memory\n");
-   } else {
-   dev->stats.tx_fifo_errors++;
-   dev_warn(&dev->dev,
-"Unexpected TX queue failure: %d\n",
-capacity);
-   }
-   }
-   dev->stats.tx_dropped++;
-   kfree_skb(skb);
-   return NETDEV_TX_OK;
+   /*
+* Tell kernel to restart xmits after 1 jiffy to help the
+* host catch up.
+*/
+   txq = netdev_get_tx_queue(dev, 0);
+   txq->xmit_restart_jiffies = jiffies + 1;
+   return NETDEV_TX_BUSY;
}
virtqueue_kick(vi->svq);
 
@@ -605,20 +589,6 @@ static netdev_tx_t start_xmit(struct sk_
skb_orphan(skb);
nf_reset(skb);
 
-   /* Apparently nice girls don't return TX_BUSY; stop the queue
-* before it gets out of hand.  Naturally, this wastes entries. */
-   if (capacity < 2+MAX_SKB_FRAGS) {
-   netif_stop_queue(dev

Re: [PATCH 0/4] [RFC] virtio-net: Improve small packet performance

2011-05-04 Thread Krishna Kumar
Krishna Kumar2/India/IBM@IBMIN wrote on 05/04/2011 07:32:58 PM:

 [PATCH 0/4] [RFC] virtio-net: Improve small packet performance

I found having tabs in the table made the results a little
difficult to understand. Converting the same to spaces, hope
it is clear this time.


   I/O Size: 1K
# BW1  BW2 (%)SD1   SD2 (%)

1 1226 3313 (170.2)   6.6   1.9 (-71.2)
2 3223 7705 (139.0)   18.0  7.1 (-60.5)
4 7223 8716 (20.6)36.5  29.7 (-18.6)
8 8689 8693 (0)   131.5 123.0 (-6.4)
168059 8285 (2.8) 578.3 506.2 (-12.4)
327758 7955 (2.5) 2281.42244.2 (-1.6)
647503 7895 (5.2) 9734.09424.4 (-3.1)
967496 7751 (3.4) 21980.9   20169.3 (-8.2)
128   7389 7741 (4.7) 40467.5   34995.5 (-13.5)

Summary: BW: 16.2% SD: -10.2%


   I/O Size: 16K
# BW1  BW2 (%)SD1   SD2 (%)

1 6684 7019 (5.0) 1.1   1.1 (0)
2 7674 7196 (-6.2)5.0   4.8 (-4.0)
4 7358 8032 (9.1) 21.3  20.4 (-4.2)
8 7393 8015 (8.4) 82.7  82.0 (-.8)
167958 8366 (5.1) 283.2 310.7 (9.7)
327792 8113 (4.1) 1257.51363.0 (8.3)
647673 8040 (4.7) 5723.15812.4 (1.5)
967462 7883 (5.6) 12731.8   12119.8 (-4.8)
128   7338 7800 (6.2) 21331.7   21094.7 (-1.1)

Summary: BW: 4.6% SD: -1.5%
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/4] [RFC rev2] Implement multiqueue (RX TX) virtio-net

2011-04-05 Thread Krishna Kumar
This patchset implements both RX and TX MQ. Patch against virtio-net,
vhost and qemu are included.

Changes from rev1:
---
1. vqs are allocated as: rx/tx, rx/tx, rx/tx, etc. Lot of code in
   guest/host/qemu changes, but code becomes simpler.
2. vhost cache align of vhost_dev correctly.
3. virtio-net: cleanup properly on errors (eg detach buf for vq0 as 
   pointed out by Michael).
4. Minor changes:
- Fixed some typos.
- Changed vhost_get_thread_index to use MAX_VHOST_THREADS.
- Removed VIRTIO_MAX_TXQS.
- Changed capability to VIRTIO_NET_F_MULTIQUEUE.
- Modified numtxqs in virtnet_info to num_queue_pairs.
  virtnet_info still has numtxqs as it is more convenient.
- Moved code for VIRTIO_NET_F_CTRL_VLAN into probe function.
- Improve check for return value of virtio_config_val().
- Removed cache align directives in guest as it was redundant.
5. If we have a wrapper to init all vqs, pls add a wrapper to clean up
all vqs as well: I haven't done this as some errors are very
specific to the failure location (and what was initialized till
then). So only those errors are cleaned up using goto's like the
rest of the code. I can change in next version if you feel this is
still required.
6.  I think we should have free_unused_bufs that handles a single queue,
and call it in a loop: I haven't done this as I think the caller wants
all rx/tx queues to be cleaned up by calling this function.

TODO's:

1. Reduce vectors for find_vqs().
2. Make vhost changes minimal. For now, I have restricted the number of
   vhost threads to 4. This can be either made unrestricted; or if the
   userspace vhost works, it can be removed altogether.

Please review and provide feedback. I am travelling a bit in the next
few days but will respond at the earliest.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] [RFC rev2] Change virtqueue structure

2011-04-05 Thread Krishna Kumar
Move queue_index from virtio_pci_vq_info to virtqueue.  This
allows callback handlers to figure out the queue number for
the vq that needs attention.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/virtio/virtio_pci.c |   10 +++---
 include/linux/virtio.h  |1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

diff -ruNp org/include/linux/virtio.h new/include/linux/virtio.h
--- org/include/linux/virtio.h  2011-04-05 14:15:18.0 +0530
+++ new/include/linux/virtio.h  2011-04-05 14:15:18.0 +0530
@@ -22,6 +22,7 @@ struct virtqueue {
void (*callback)(struct virtqueue *vq);
const char *name;
struct virtio_device *vdev;
+   int queue_index;/* the index of the queue */
void *priv;
 };
 
diff -ruNp org/drivers/virtio/virtio_pci.c new/drivers/virtio/virtio_pci.c
--- org/drivers/virtio/virtio_pci.c 2011-04-05 14:15:18.0 +0530
+++ new/drivers/virtio/virtio_pci.c 2011-04-05 14:15:18.0 +0530
@@ -75,9 +75,6 @@ struct virtio_pci_vq_info
/* the number of entries in the queue */
int num;
 
-   /* the index of the queue */
-   int queue_index;
-
/* the virtual address of the ring queue */
void *queue;
 
@@ -180,11 +177,10 @@ static void vp_reset(struct virtio_devic
 static void vp_notify(struct virtqueue *vq)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-   struct virtio_pci_vq_info *info = vq->priv;
 
/* we write the queue's selector into the notification register to
 * signal the other end */
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -380,7 +376,6 @@ static struct virtqueue *setup_vq(struct
if (!info)
return ERR_PTR(-ENOMEM);
 
-   info-queue_index = index;
info-num = num;
info-msix_vector = msix_vec;
 
@@ -403,6 +398,7 @@ static struct virtqueue *setup_vq(struct
goto out_activate_queue;
}
 
+   vq-queue_index = index;
vq-priv = info;
info-vq = vq;
 
@@ -441,7 +437,7 @@ static void vp_del_vq(struct virtqueue *
list_del(info-node);
spin_unlock_irqrestore(vp_dev-lock, flags);
 
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
if (vp_dev-msix_enabled) {
iowrite16(VIRTIO_MSI_NO_VECTOR,
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/4] [RFC rev2] virtio-net changes

2011-04-05 Thread Krishna Kumar
Implement mq virtio-net driver. 

Though struct virtio_net_config changes, it works with the old
qemu since the last element is not accessed unless qemu sets
VIRTIO_NET_F_MULTIQUEUE.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/net/virtio_net.c   |  573 ---
 include/linux/virtio_net.h |3 
 2 files changed, 408 insertions(+), 168 deletions(-)

diff -ruNp org/include/linux/virtio_net.h new/include/linux/virtio_net.h
--- org/include/linux/virtio_net.h  2011-04-05 14:15:18.0 +0530
+++ new/include/linux/virtio_net.h  2011-04-05 14:15:18.0 +0530
@@ -26,6 +26,7 @@
 #define VIRTIO_NET_F_CTRL_RX   18  /* Control channel RX mode support */
 #define VIRTIO_NET_F_CTRL_VLAN 19  /* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20  /* Extra RX mode control support */
+#define VIRTIO_NET_F_MULTIQUEUE21  /* Device supports multiple 
TXQ/RXQ */
 
 #define VIRTIO_NET_S_LINK_UP   1   /* Link is up */
 
@@ -34,6 +35,8 @@ struct virtio_net_config {
__u8 mac[6];
/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
__u16 status;
+   /* total number of RX/TX queues */
+   __u16 num_queue_pairs;
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't
diff -ruNp org/drivers/net/virtio_net.c new/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c2011-04-05 20:30:23.0 +0530
+++ new/drivers/net/virtio_net.c2011-04-05 20:30:53.0 +0530
@@ -40,31 +40,53 @@ module_param(gso, bool, 0444);
 
 #define VIRTNET_SEND_COMMAND_SG_MAX2
 
-struct virtnet_info {
-   struct virtio_device *vdev;
-   struct virtqueue *rvq, *svq, *cvq;
-   struct net_device *dev;
+/* Internal representation of a send virtqueue */
+struct send_queue {
+   /* Virtqueue associated with this send _queue */
+   struct virtqueue *svq;
+
+   /* TX: fragments + linear part + virtio header */
+   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+};
+
+/* Internal representation of a receive virtqueue */
+struct receive_queue {
+   /* Virtqueue associated with this receive_queue */
+   struct virtqueue *rvq;
+
+   /* Back pointer to the virtnet_info */
+   struct virtnet_info *vi;
+
struct napi_struct napi;
-   unsigned int status;
 
/* Number of input buffers, and max we've ever had. */
unsigned int num, max;
 
-   /* I like... big packets and I cannot lie! */
-   bool big_packets;
-
-   /* Host will merge rx buffers for big packets (shake it! shake it!) */
-   bool mergeable_rx_bufs;
-
/* Work struct for refilling if we run low on memory. */
struct delayed_work refill;
 
/* Chain pages by the private ptr. */
struct page *pages;
 
-   /* fragments + linear part + virtio header */
+   /* RX: fragments + linear part + virtio header */
struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+};
+
+struct virtnet_info {
+   struct send_queue **sq;
+   struct receive_queue **rq;
+
+   int numtxqs; /* # of rxqs/txqs */
+   struct virtio_device *vdev;
+   struct virtqueue *cvq;
+   struct net_device *dev;
+   unsigned int status;
+
+   /* I like... big packets and I cannot lie! */
+   bool big_packets;
+
+   /* Host will merge rx buffers for big packets (shake it! shake it!) */
+   bool mergeable_rx_bufs;
 };
 
 struct skb_vnet_hdr {
@@ -94,22 +116,22 @@ static inline struct skb_vnet_hdr *skb_v
  * private is used to chain pages for big packets, put the whole
  * most recent used list in the beginning for reuse
  */
-static void give_pages(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct receive_queue *rq, struct page *page)
 {
struct page *end;
 
/* Find end of list, sew whole thing into vi-pages. */
for (end = page; end-private; end = (struct page *)end-private);
-   end-private = (unsigned long)vi-pages;
-   vi-pages = page;
+   end-private = (unsigned long)rq-pages;
+   rq-pages = page;
 }
 
-static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 {
-   struct page *p = vi-pages;
+   struct page *p = rq-pages;
 
if (p) {
-   vi-pages = (struct page *)p-private;
+   rq-pages = (struct page *)p-private;
/* clear private here, it is used to chain pages */
p-private = 0;
} else
@@ -120,12 +142,13 @@ static struct page *get_a_page(struct vi
 static void skb_xmit_done(struct virtqueue *svq)
 {
struct virtnet_info *vi = svq->vdev->priv;
+   int qnum = svq->queue_index / 2; /* RX/TX vqs are allocated in pairs */
 
/* Suppress further interrupts. */
virtqueue_disable_cb(svq

[PATCH 3/4] [RFC rev2] vhost changes

2011-04-05 Thread Krishna Kumar
vhost_net_open is changed to allocate a vhost_net and return.
The remaining initializations are delayed till SET_OWNER.
SET_OWNER is changed so that the argument is used to determine
how many txqs to use.  Unmodified qemu's will pass NULL, so
this is recognized and handled as numtxqs=1.

The number of vhost threads is = #txqs.  Threads handle more
than one txq when #txqs is more than MAX_VHOST_THREADS (4).
The same thread handles both RX and TX - tested with tap/bridge
so far (TBD: some changes are needed in macvtap to support the
same).

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/vhost/net.c   |  249 +---
 drivers/vhost/vhost.c |  221 ---
 drivers/vhost/vhost.h |   22 ++-
 3 files changed, 332 insertions(+), 160 deletions(-)

diff -ruNp org/drivers/vhost/net.c new/drivers/vhost/net.c
--- org/drivers/vhost/net.c 2011-04-05 14:15:18.0 +0530
+++ new/drivers/vhost/net.c 2011-04-05 20:15:32.0 +0530
@@ -32,12 +32,6 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x8
 
-enum {
-   VHOST_NET_VQ_RX = 0,
-   VHOST_NET_VQ_TX = 1,
-   VHOST_NET_VQ_MAX = 2,
-};
-
 enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
@@ -46,12 +40,13 @@ enum vhost_net_poll_state {
 
 struct vhost_net {
struct vhost_dev dev;
-   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
-   struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct vhost_virtqueue *vqs;
+   struct vhost_poll *poll;
+   struct socket **socks;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
-   enum vhost_net_poll_state tx_poll_state;
+   enum vhost_net_poll_state *tx_poll_state;
 };
 
 /* Pop first len bytes from iovec. Return number of segments used. */
@@ -93,28 +88,28 @@ static void copy_iovec_hdr(const struct 
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_stop(struct vhost_net *net)
+static void tx_poll_stop(struct vhost_net *net, int qnum)
 {
-   if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
+   if (likely(net->tx_poll_state[qnum / 2] != VHOST_NET_POLL_STARTED))
return;
-   vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
-   net->tx_poll_state = VHOST_NET_POLL_STOPPED;
+   vhost_poll_stop(&net->poll[qnum]);
+   net->tx_poll_state[qnum / 2] = VHOST_NET_POLL_STOPPED;
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_start(struct vhost_net *net, struct socket *sock)
+static void tx_poll_start(struct vhost_net *net, struct socket *sock, int qnum)
 {
-   if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
+   if (unlikely(net->tx_poll_state[qnum / 2] != VHOST_NET_POLL_STOPPED))
return;
-   vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
-   net->tx_poll_state = VHOST_NET_POLL_STARTED;
+   vhost_poll_start(&net->poll[qnum], sock->file);
+   net->tx_poll_state[qnum / 2] = VHOST_NET_POLL_STARTED;
 }
 
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_tx(struct vhost_net *net)
+static void handle_tx(struct vhost_virtqueue *vq)
 {
-   struct vhost_virtqueue *vq = net-dev.vqs[VHOST_NET_VQ_TX];
+   struct vhost_net *net = container_of(vq-dev, struct vhost_net, dev);
unsigned out, in, s;
int head;
struct msghdr msg = {
@@ -138,7 +133,7 @@ static void handle_tx(struct vhost_net *
wmem = atomic_read(sock-sk-sk_wmem_alloc);
if (wmem = sock-sk-sk_sndbuf) {
mutex_lock(vq-mutex);
-   tx_poll_start(net, sock);
+   tx_poll_start(net, sock, vq-qnum);
mutex_unlock(vq-mutex);
return;
}
@@ -147,7 +142,7 @@ static void handle_tx(struct vhost_net *
vhost_disable_notify(vq);
 
if (wmem  sock-sk-sk_sndbuf / 2)
-   tx_poll_stop(net);
+   tx_poll_stop(net, vq-qnum);
hdr_size = vq-vhost_hlen;
 
for (;;) {
@@ -162,7 +157,7 @@ static void handle_tx(struct vhost_net *
if (head == vq-num) {
wmem = atomic_read(sock-sk-sk_wmem_alloc);
if (wmem = sock-sk-sk_sndbuf * 3 / 4) {
-   tx_poll_start(net, sock);
+   tx_poll_start(net, sock, vq-qnum);
set_bit(SOCK_ASYNC_NOSPACE, sock-flags);
break;
}
@@ -192,7 +187,7 @@ static void handle_tx(struct vhost_net *
err = sock-ops-sendmsg(NULL, sock, msg, len);
if (unlikely(err  0)) {
vhost_discard_vq_desc(vq, 1);
-   tx_poll_start(net, sock

[PATCH 4/4] [RFC rev2] qemu changes

2011-04-05 Thread Krishna Kumar
diff -ruNp org/hw/vhost.c new/hw/vhost.c
--- org/hw/vhost.c  2011-04-05 14:15:18.0 +0530
+++ new/hw/vhost.c  2011-04-05 14:15:18.0 +0530
@@ -581,7 +581,7 @@ static void vhost_virtqueue_cleanup(stru
   0, virtio_queue_get_desc_size(vdev, idx));
 }
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd, bool force)
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, bool force, int numtxqs)
 {
 uint64_t features;
 int r;
@@ -593,11 +593,13 @@ int vhost_dev_init(struct vhost_dev *hde
 return -errno;
 }
 }
-r = ioctl(hdev-control, VHOST_SET_OWNER, NULL);
+r = ioctl(hdev-control, VHOST_SET_OWNER, numtxqs);
 if (r  0) {
 goto fail;
 }
 
+hdev-nvqs = numtxqs * 2;
+
 r = ioctl(hdev-control, VHOST_GET_FEATURES, features);
 if (r  0) {
 goto fail;
diff -ruNp org/hw/vhost.h new/hw/vhost.h
--- org/hw/vhost.h  2011-04-05 14:15:18.0 +0530
+++ new/hw/vhost.h  2011-04-05 14:15:18.0 +0530
@@ -41,7 +41,7 @@ struct vhost_dev {
 bool force;
 };
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd, bool force);
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, bool force, int numtxqs);
 void vhost_dev_cleanup(struct vhost_dev *hdev);
 bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev);
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
diff -ruNp org/hw/vhost_net.c new/hw/vhost_net.c
--- org/hw/vhost_net.c  2011-04-05 14:15:18.0 +0530
+++ new/hw/vhost_net.c  2011-04-05 20:27:01.0 +0530
@@ -36,8 +36,9 @@
 
 struct vhost_net {
 struct vhost_dev dev;
-struct vhost_virtqueue vqs[2];
-int backend;
+struct vhost_virtqueue *vqs;
+int nvqs;
+int *backend;
 VLANClientState *vc;
 };
 
@@ -70,11 +71,11 @@ void vhost_net_ack_features(struct vhost
 }
 }
 
-static int vhost_net_get_fd(VLANClientState *backend)
+static int vhost_net_get_fd(VLANClientState *backend, int index)
 {
 switch (backend-info-type) {
 case NET_CLIENT_TYPE_TAP:
-return tap_get_fd(backend);
+return tap_get_fd(backend, index);
 default:
 fprintf(stderr, vhost-net requires tap backend\n);
 return -EBADFD;
@@ -82,27 +83,36 @@ static int vhost_net_get_fd(VLANClientSt
 }
 
 struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd,
- bool force)
+ bool force, int numtxqs)
 {
-int r;
+int i, r;
 struct vhost_net *net = qemu_malloc(sizeof *net);
 if (!backend) {
 fprintf(stderr, vhost-net requires backend to be setup\n);
 goto fail;
 }
-r = vhost_net_get_fd(backend);
-if (r  0) {
-goto fail;
+
+net-backend = qemu_malloc(numtxqs * (sizeof *net-backend));
+for (i = 0; i  numtxqs; i++) {
+r = vhost_net_get_fd(backend, i);
+if (r  0) {
+goto fail;
+}
+net-backend[i] = r;
 }
+
 net-vc = backend;
 net-dev.backend_features = tap_has_vnet_hdr(backend) ? 0 :
 (1  VHOST_NET_F_VIRTIO_NET_HDR);
-net-backend = r;
 
-r = vhost_dev_init(net-dev, devfd, force);
+r = vhost_dev_init(net-dev, devfd, force, numtxqs);
 if (r  0) {
 goto fail;
 }
+
+net-nvqs = numtxqs * 2;
+net-vqs = qemu_malloc(net-nvqs * (sizeof *net-vqs));
+
 if (!tap_has_vnet_hdr_len(backend,
   sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
 net-dev.features = ~(1  VIRTIO_NET_F_MRG_RXBUF);
@@ -137,7 +147,6 @@ int vhost_net_start(struct vhost_net *ne
  sizeof(struct virtio_net_hdr_mrg_rxbuf));
 }
 
-net-dev.nvqs = 2;
 net-dev.vqs = net-vqs;
 r = vhost_dev_start(net-dev, dev);
 if (r  0) {
@@ -145,9 +154,9 @@ int vhost_net_start(struct vhost_net *ne
 }
 
 net-vc-info-poll(net-vc, false);
-qemu_set_fd_handler(net-backend, NULL, NULL, NULL);
-file.fd = net-backend;
 for (file.index = 0; file.index  net-dev.nvqs; ++file.index) {
+qemu_set_fd_handler(net-backend[file.index/2], NULL, NULL, NULL);
+file.fd = net-backend[(file.index / 2) % (net-dev.nvqs / 2)];
 r = ioctl(net-dev.control, VHOST_NET_SET_BACKEND, file);
 if (r  0) {
 r = -errno;
@@ -195,7 +204,7 @@ void vhost_net_cleanup(struct vhost_net 
 }
 #else
 struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd,
- bool force)
+ bool force, int numtxqs)
 {
 return NULL;
 }
diff -ruNp org/hw/vhost_net.h new/hw/vhost_net.h
--- org/hw/vhost_net.h  2011-04-05 14:15:18.0 +0530
+++ new/hw/vhost_net.h  2011-04-05 14:15:18.0 +0530
@@ -6,7 +6,8 @@
 struct vhost_net;
 typedef struct vhost_net VHostNetState;
 
-VHostNetState *vhost_net_init(VLANClientState *backend, int devfd, bool force);
+VHostNetState 

[PATCH 0/3] [RFC] Implement multiqueue (RX TX) virtio-net

2011-02-27 Thread Krishna Kumar
This patch series is a continuation of an earlier one that
implemented guest MQ TX functionality.  This new patchset
implements both RX and TX MQ.  Qemu changes are not being
included at this time solely to aid in easier review.
Compatibility testing with old/new combinations of qemu/guest
and vhost was done without any issues.

Some early TCP/UDP test results are at the bottom of this
post, I plan to submit more test results in the coming days.

Please review and provide feedback on what can improve.

Thanks!

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---


Test configuration:
  Host:  8 Intel Xeon, 8 GB memory
  Guest: 4 cpus, 2 GB memory

Each test case runs for 60 secs, results below are average over
two runs.  Bandwidth numbers are in gbps.  I have used default
netperf, and no testing/system tuning other than taskset each
vhost to 0xf (cpus 0-3).  Comparison is testing original kernel
vs new kernel with #txqs=8 (# refers to number of netperf
sessions).

___
TCP: Guest - Local Host (TCP_STREAM)
#BW1BW2 (%) SD1SD2 (%)   RSD1RSD2 (%)
___
17190   7170 (-.2)  0  0 (0)  3   4 (33.3)
28774   11235 (28.0)3  3 (0)  16  14 (-12.5)
49753   15195 (55.7)17 21 (23.5)  65  59 (-9.2)
810224  18265 (78.6)71 115 (61.9) 251 240 (-4.3)
16   10749  18123 (68.6)277456 (64.6) 985 925 (-6.0)
32   11133  17351 (55.8)1132   1947 (71.9)39353831 (-2.6)
64   11223  17115 (52.4)4682   7836 (67.3)15949   15373 (-3.6)
128  11269  16986 (50.7)19783  31505 (59.2)   66799   61759 (-7.5)
___
Summary:  BW: 37.6  SD: 61.2  RSD: -6.5


___
 TCP: Local Host - Guest (TCP_MAERTS)
#BW1BW2 (%)SD1SD2 (%)RSD1RSD2 (%)
___
111490  10870 (-5.3)   0  0 (0)  2   2 (0)
210612  10554 (-.5)2  3 (50.0)   12  12 (0)
410047  14320 (42.5)   13 16 (23.0)  53  53 (0)
89273   15182 (63.7)   56 84 (50.0)  228 233 (2.1)
16   9291   15853 (70.6)   235390 (65.9) 934 965 (3.3)
32   9382   15741 (67.7)   9691823 (88.1)38684037 (4.3)
64   9270   14585 (57.3)   3966   8836 (122.7)   15415   17818 (15.5)
128  8997   14678 (63.1)   17024  36649 (115.2)  64933   72677 (11.9)
___
SUM:  BW: 24.8  SD: 114.6  RSD: 12.1

__
UDP: Local Host - Guest (UDP_STREAM)
#  BW1  BW2 (%)SD1SD2 (%)
__
1  1723616585 (-3.7)1  1 (0)
2  1679522693 (35.1)5  6 (20.0)
4  1339021502 (60.5)37 36 (-2.7)
8  1326124361 (83.7)163175 (7.3)
16 1277223796 (86.3)692826 (19.3)
32 1283223880 (86.0)2812   2871 (2.0)
64 1277924293 (90.1)11299  11237 (-.5)
1281300624857 (91.1)44778  43884 (-1.9)
__
Summary:  BW: 37.1  SD: -1.2
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] [RFC] Change virtqueue structure

2011-02-27 Thread Krishna Kumar
Move queue_index from virtio_pci_vq_info to virtqueue.  This
allows callback handlers to figure out the queue number for
the vq that needs attention.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/virtio/virtio_pci.c |   10 +++---
 include/linux/virtio.h  |1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

diff -ruNp org/include/linux/virtio.h new/include/linux/virtio.h
--- org/include/linux/virtio.h  2010-10-11 10:20:22.0 +0530
+++ new/include/linux/virtio.h  2011-02-23 16:26:18.0 +0530
@@ -22,6 +22,7 @@ struct virtqueue {
void (*callback)(struct virtqueue *vq);
const char *name;
struct virtio_device *vdev;
+   int queue_index;/* the index of the queue */
void *priv;
 };
 
diff -ruNp org/drivers/virtio/virtio_pci.c new/drivers/virtio/virtio_pci.c
--- org/drivers/virtio/virtio_pci.c 2011-01-28 11:38:24.0 +0530
+++ new/drivers/virtio/virtio_pci.c 2011-02-25 10:11:22.0 +0530
@@ -75,9 +75,6 @@ struct virtio_pci_vq_info
/* the number of entries in the queue */
int num;
 
-   /* the index of the queue */
-   int queue_index;
-
/* the virtual address of the ring queue */
void *queue;
 
@@ -180,11 +177,10 @@ static void vp_reset(struct virtio_devic
 static void vp_notify(struct virtqueue *vq)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-   struct virtio_pci_vq_info *info = vq->priv;
 
/* we write the queue's selector into the notification register to
 * signal the other end */
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -380,7 +376,6 @@ static struct virtqueue *setup_vq(struct
if (!info)
return ERR_PTR(-ENOMEM);
 
-   info-queue_index = index;
info-num = num;
info-msix_vector = msix_vec;
 
@@ -403,6 +398,7 @@ static struct virtqueue *setup_vq(struct
goto out_activate_queue;
}
 
+   vq-queue_index = index;
vq-priv = info;
info-vq = vq;
 
@@ -441,7 +437,7 @@ static void vp_del_vq(struct virtqueue *
list_del(info-node);
spin_unlock_irqrestore(vp_dev-lock, flags);
 
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
if (vp_dev-msix_enabled) {
iowrite16(VIRTIO_MSI_NO_VECTOR,
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] [RFC] Changes for MQ virtio-net

2011-02-27 Thread Krishna Kumar
Implement mq virtio-net driver. 

Though struct virtio_net_config changes, it works with the old
qemu since the last element is not accessed unless qemu sets
VIRTIO_NET_F_NUMTXQS.  Patch also adds a macro for the maximum
number of TX vq's (VIRTIO_MAX_TXQS) that the user can specify.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/net/virtio_net.c   |  543 ---
 include/linux/virtio_net.h |6 
 2 files changed, 386 insertions(+), 163 deletions(-)

diff -ruNp org/include/linux/virtio_net.h new/include/linux/virtio_net.h
--- org/include/linux/virtio_net.h  2010-10-11 10:20:22.0 +0530
+++ new/include/linux/virtio_net.h  2011-02-25 16:24:15.0 +0530
@@ -7,6 +7,9 @@
 #include linux/virtio_config.h
 #include linux/if_ether.h
 
+/* Maximum number of individual RX/TX queues supported */
+#define VIRTIO_MAX_TXQS16
+
 /* The feature bitmap for virtio net */
 #define VIRTIO_NET_F_CSUM  0   /* Host handles pkts w/ partial csum */
 #define VIRTIO_NET_F_GUEST_CSUM1   /* Guest handles pkts w/ 
partial csum */
@@ -26,6 +29,7 @@
 #define VIRTIO_NET_F_CTRL_RX   18  /* Control channel RX mode support */
 #define VIRTIO_NET_F_CTRL_VLAN 19  /* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20  /* Extra RX mode control support */
+#define VIRTIO_NET_F_NUMTXQS   21  /* Device supports multiple TX queue */
 
 #define VIRTIO_NET_S_LINK_UP   1   /* Link is up */
 
@@ -34,6 +38,8 @@ struct virtio_net_config {
__u8 mac[6];
/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
__u16 status;
+   /* number of RX/TX queues */
+   __u16 numtxqs;
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't
diff -ruNp org/drivers/net/virtio_net.c new/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c2011-02-21 17:55:42.0 +0530
+++ new/drivers/net/virtio_net.c2011-02-25 16:23:41.0 +0530
@@ -40,31 +40,53 @@ module_param(gso, bool, 0444);
 
 #define VIRTNET_SEND_COMMAND_SG_MAX2
 
-struct virtnet_info {
-   struct virtio_device *vdev;
-   struct virtqueue *rvq, *svq, *cvq;
-   struct net_device *dev;
+/* Internal representation of a send virtqueue */
+struct send_queue {
+   struct virtqueue *svq;
+
+   /* TX: fragments + linear part + virtio header */
+   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+};
+
+/* Internal representation of a receive virtqueue */
+struct receive_queue {
+   /* Virtqueue associated with this receive_queue */
+   struct virtqueue *rvq;
+
+   /* Back pointer to the virtnet_info */
+   struct virtnet_info *vi;
+
struct napi_struct napi;
-   unsigned int status;
 
/* Number of input buffers, and max we've ever had. */
unsigned int num, max;
 
-   /* I like... big packets and I cannot lie! */
-   bool big_packets;
-
-   /* Host will merge rx buffers for big packets (shake it! shake it!) */
-   bool mergeable_rx_bufs;
-
/* Work struct for refilling if we run low on memory. */
struct delayed_work refill;
 
/* Chain pages by the private ptr. */
struct page *pages;
 
-   /* fragments + linear part + virtio header */
+   /* RX: fragments + linear part + virtio header */
struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+};
+
+struct virtnet_info {
+   struct send_queue **sq;
+   struct receive_queue **rq;
+
+   /* read-mostly variables */
+   int numtxqs cacheline_aligned_in_smp;   /* # of rxqs/txqs */
+   struct virtio_device *vdev;
+   struct virtqueue *cvq;
+   struct net_device *dev;
+   unsigned int status;
+
+   /* I like... big packets and I cannot lie! */
+   bool big_packets;
+
+   /* Host will merge rx buffers for big packets (shake it! shake it!) */
+   bool mergeable_rx_bufs;
 };
 
 struct skb_vnet_hdr {
@@ -94,22 +116,22 @@ static inline struct skb_vnet_hdr *skb_v
  * private is used to chain pages for big packets, put the whole
  * most recent used list in the beginning for reuse
  */
-static void give_pages(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct receive_queue *rq, struct page *page)
 {
struct page *end;
 
/* Find end of list, sew whole thing into vi->pages. */
for (end = page; end->private; end = (struct page *)end->private);
-   end->private = (unsigned long)vi->pages;
-   vi->pages = page;
+   end->private = (unsigned long)rq->pages;
+   rq->pages = page;
 }
 
-static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 {
-   struct page *p = vi->pages;
+   struct page *p = rq->pages;
 
if (p) {
-   vi-pages = (struct page *)p-private

[PATCH 3/3] [RFC] Changes for MQ vhost

2011-02-27 Thread Krishna Kumar
Changes for mq vhost.

vhost_net_open is changed to allocate a vhost_net and return.
The remaining initializations are delayed till SET_OWNER.
SET_OWNER is changed so that the argument is used to determine
how many txqs to use.  Unmodified qemu's will pass NULL, so
this is recognized and handled as numtxqs=1.

The number of vhost threads is <= #txqs.  Threads handle more
than one txq when #txqs is more than MAX_VHOST_THREADS (4).
The same thread handles both RX and TX - tested with tap/bridge
so far (TBD: needs some changes in macvtap driver to support
the same).

I had to convert space-tab in vhost_attach_cgroups* to avoid
checkpatch errors.
 
Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/vhost/net.c   |  295 ++--
 drivers/vhost/vhost.c |  225 +++---
 drivers/vhost/vhost.h |   39 -
 3 files changed, 378 insertions(+), 181 deletions(-)

diff -ruNp org/drivers/vhost/vhost.h new/drivers/vhost/vhost.h
--- org/drivers/vhost/vhost.h   2011-02-08 09:05:09.0 +0530
+++ new/drivers/vhost/vhost.h   2011-02-28 11:48:06.0 +0530
@@ -35,11 +35,11 @@ struct vhost_poll {
wait_queue_t  wait;
struct vhost_work work;
unsigned long mask;
-   struct vhost_dev *dev;
+   struct vhost_virtqueue*vq;  /* points back to vq */
 };
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-unsigned long mask, struct vhost_dev *dev);
+unsigned long mask, struct vhost_virtqueue *vq);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -108,8 +108,14 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
+   struct task_struct *worker; /* worker for this vq */
+   spinlock_t *work_lock;  /* points to a dev->work_lock[] entry */
+   struct list_head *work_list;/* points to a dev->work_list[] entry */
+   int qnum;   /* 0 for RX, 1 - n-1 for TX */
 };
 
+#define MAX_VHOST_THREADS  4
+
 struct vhost_dev {
/* Readers use RCU to access memory table pointer
 * log base pointer and features.
@@ -122,12 +128,33 @@ struct vhost_dev {
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
-   spinlock_t work_lock;
-   struct list_head work_list;
-   struct task_struct *worker;
+   spinlock_t *work_lock[MAX_VHOST_THREADS];
+   struct list_head *work_list[MAX_VHOST_THREADS];
 };
 
-long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
+/*
+ * Return maximum number of vhost threads needed to handle RX  TX.
+ * Upto MAX_VHOST_THREADS are started, and threads can be shared
+ * among different vq's if numtxqs > MAX_VHOST_THREADS.
+ */
+static inline int get_nvhosts(int nvqs)
+{
+   return min_t(int, nvqs / 2, MAX_VHOST_THREADS);
+}
+
+/*
+ * Get index of an existing thread that will handle this txq/rxq.
+ * The same thread handles both rx[index] and tx[index].
+ */
+static inline int vhost_get_thread_index(int index, int numtxqs, int nvhosts)
+{
+   return (index % numtxqs) % nvhosts;
+}
+
+int vhost_setup_vqs(struct vhost_dev *dev, int numtxqs);
+void vhost_free_vqs(struct vhost_dev *dev);
+long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs,
+   int nvhosts);
 long vhost_dev_check_owner(struct vhost_dev *);
 long vhost_dev_reset_owner(struct vhost_dev *);
 void vhost_dev_cleanup(struct vhost_dev *);
diff -ruNp org/drivers/vhost/net.c new/drivers/vhost/net.c
--- org/drivers/vhost/net.c 2011-02-08 09:05:09.0 +0530
+++ new/drivers/vhost/net.c 2011-02-28 11:48:53.0 +0530
@@ -32,12 +32,6 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x8
 
-enum {
-   VHOST_NET_VQ_RX = 0,
-   VHOST_NET_VQ_TX = 1,
-   VHOST_NET_VQ_MAX = 2,
-};
-
 enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
@@ -46,12 +40,13 @@ enum vhost_net_poll_state {
 
 struct vhost_net {
struct vhost_dev dev;
-   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
-   struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct vhost_virtqueue *vqs;
+   struct vhost_poll *poll;
+   struct socket **socks;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
-   enum vhost_net_poll_state tx_poll_state;
+   enum vhost_net_poll_state *tx_poll_state;
 };
 
 /* Pop first len bytes from iovec. Return number of segments used. */
@@ -91,28 +86,28 @@ static void copy_iovec_hdr(const struct 
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_stop(struct vhost_net *net

[v3 RFC PATCH 0/4] Implement multiqueue virtio-net

2010-10-20 Thread Krishna Kumar
  ___
#BW%   CPU%RCPU%SD% RSD%

17.62 -38.03   -26.26  -50.00   -33.33
228.95 20.4621.62   0   -7.14
484.05 60.7945.74  -2.43-12.42
886.43 79.5750.32   15.85   -3.10
16   88.63 99.4858.17   9.47-13.10
24   74.65 80.8741.99  -1.81-22.89
32   63.86 59.2123.58  -18.13   -36.37
40   64.79 60.5322.23  -15.77   -35.84
48   49.68 26.90   3.51   -36.40   -49.61
64   54.69 36.50   5.41   -26.59   -43.23
80   45.06 12.72   -13.25  -37.79   -52.08
96   40.21-3.16-24.53  -39.92   -52.97
128  36.33-33.19   -43.66  -5.68-20.49

BW: 49.3%,  CPU/RCPU: 15.5%,-8.2%,  SD/RSD: -22.2%,-37.0%


Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v3 RFC PATCH 1/4] Change virtqueue structure

2010-10-20 Thread Krishna Kumar
Move queue_index from virtio_pci_vq_info to virtqueue.  This
allows callback handlers to figure out the queue number for
the vq that needs attention.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com  
---
 drivers/virtio/virtio_pci.c |   10 +++---
 include/linux/virtio.h  |1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

diff -ruNp org/include/linux/virtio.h 
new.dynamic.optimize_vhost/include/linux/virtio.h
--- org/include/linux/virtio.h  2010-10-11 10:20:22.0 +0530
+++ new.dynamic.optimize_vhost/include/linux/virtio.h   2010-10-15 
13:25:42.0 +0530
@@ -22,6 +22,7 @@ struct virtqueue {
void (*callback)(struct virtqueue *vq);
const char *name;
struct virtio_device *vdev;
+   int queue_index;/* the index of the queue */
void *priv;
 };
 
diff -ruNp org/drivers/virtio/virtio_pci.c 
new.dynamic.optimize_vhost/drivers/virtio/virtio_pci.c
--- org/drivers/virtio/virtio_pci.c 2010-10-11 10:20:15.0 +0530
+++ new.dynamic.optimize_vhost/drivers/virtio/virtio_pci.c  2010-10-15 
13:25:42.0 +0530
@@ -75,9 +75,6 @@ struct virtio_pci_vq_info
/* the number of entries in the queue */
int num;
 
-   /* the index of the queue */
-   int queue_index;
-
/* the virtual address of the ring queue */
void *queue;
 
@@ -185,11 +182,10 @@ static void vp_reset(struct virtio_devic
 static void vp_notify(struct virtqueue *vq)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-   struct virtio_pci_vq_info *info = vq->priv;
 
/* we write the queue's selector into the notification register to
 * signal the other end */
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -385,7 +381,6 @@ static struct virtqueue *setup_vq(struct
if (!info)
return ERR_PTR(-ENOMEM);
 
-   info->queue_index = index;
info->num = num;
info->msix_vector = msix_vec;
 
@@ -408,6 +403,7 @@ static struct virtqueue *setup_vq(struct
goto out_activate_queue;
}
 
+   vq->queue_index = index;
vq->priv = info;
info->vq = vq;
 
@@ -446,7 +442,7 @@ static void vp_del_vq(struct virtqueue *
list_del(&info->node);
spin_unlock_irqrestore(&vp_dev->lock, flags);
 
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
if (vp_dev-msix_enabled) {
iowrite16(VIRTIO_MSI_NO_VECTOR,
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v3 RFC PATCH 2/4] Changes for virtio-net

2010-10-20 Thread Krishna Kumar
Implement mq virtio-net driver. 

Though struct virtio_net_config changes, it works with old
qemu's since the last element is not accessed, unless qemu
sets VIRTIO_NET_F_NUMTXQS.  Patch also adds a macro for the
maximum number of TX vq's (VIRTIO_MAX_SQ) that the user can
specify.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
--- 
 drivers/net/virtio_net.c   |  234 ++-
 include/linux/virtio_net.h |6 
 2 files changed, 185 insertions(+), 55 deletions(-)

diff -ruNp org/include/linux/virtio_net.h 
new.dynamic.optimize_vhost/include/linux/virtio_net.h
--- org/include/linux/virtio_net.h  2010-10-11 10:20:22.0 +0530
+++ new.dynamic.optimize_vhost/include/linux/virtio_net.h   2010-10-19 
13:24:38.0 +0530
@@ -7,6 +7,9 @@
 #include <linux/virtio_config.h>
 #include <linux/if_ether.h>
 
+/* Maximum number of TX queues supported */
+#define VIRTIO_MAX_SQ 32
+
 /* The feature bitmap for virtio net */
 #define VIRTIO_NET_F_CSUM  0   /* Host handles pkts w/ partial csum */
 #define VIRTIO_NET_F_GUEST_CSUM1   /* Guest handles pkts w/ 
partial csum */
@@ -26,6 +29,7 @@
 #define VIRTIO_NET_F_CTRL_RX   18  /* Control channel RX mode support */
 #define VIRTIO_NET_F_CTRL_VLAN 19  /* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20  /* Extra RX mode control support */
+#define VIRTIO_NET_F_NUMTXQS   21  /* Device supports multiple TX queue */
 
 #define VIRTIO_NET_S_LINK_UP   1   /* Link is up */
 
@@ -34,6 +38,8 @@ struct virtio_net_config {
__u8 mac[6];
/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
__u16 status;
+   /* number of transmit queues */
+   __u16 numtxqs;
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't
diff -ruNp org/drivers/net/virtio_net.c 
new.dynamic.optimize_vhost/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c2010-10-11 10:20:02.0 +0530
+++ new.dynamic.optimize_vhost/drivers/net/virtio_net.c 2010-10-19 
17:01:53.0 +0530
@@ -40,11 +40,24 @@ module_param(gso, bool, 0444);
 
 #define VIRTNET_SEND_COMMAND_SG_MAX2
 
+/* Our representation of a send virtqueue */
+struct send_queue {
+   struct virtqueue *svq;
+
+   /* TX: fragments + linear part + virtio header */
+   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+};
+
 struct virtnet_info {
+   struct send_queue **sq;
+   struct napi_struct napi cacheline_aligned_in_smp;
+
+   /* read-mostly variables */
+   int numtxqs cacheline_aligned_in_smp;
struct virtio_device *vdev;
-   struct virtqueue *rvq, *svq, *cvq;
+   struct virtqueue *rvq;
+   struct virtqueue *cvq;
struct net_device *dev;
-   struct napi_struct napi;
unsigned int status;
 
/* Number of input buffers, and max we've ever had. */
@@ -62,9 +75,8 @@ struct virtnet_info {
/* Chain pages by the private ptr. */
struct page *pages;
 
-   /* fragments + linear part + virtio header */
+   /* RX: fragments + linear part + virtio header */
struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
 };
 
 struct skb_vnet_hdr {
@@ -120,12 +132,13 @@ static struct page *get_a_page(struct vi
 static void skb_xmit_done(struct virtqueue *svq)
 {
struct virtnet_info *vi = svq->vdev->priv;
+   int qnum = svq->queue_index - 1;/* 0 is RX vq */
 
/* Suppress further interrupts. */
virtqueue_disable_cb(svq);
 
/* We were probably waiting for more output buffers. */
-   netif_wake_queue(vi->dev);
+   netif_wake_subqueue(vi->dev, qnum);
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
@@ -495,12 +508,13 @@ again:
return received;
 }
 
-static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
+static unsigned int free_old_xmit_skbs(struct virtnet_info *vi,
+  struct virtqueue *svq)
 {
struct sk_buff *skb;
unsigned int len, tot_sgs = 0;
 
-   while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
+   while ((skb = virtqueue_get_buf(svq, &len)) != NULL) {
pr_debug("Sent skb %p\n", skb);
vi->dev->stats.tx_bytes += skb->len;
vi->dev->stats.tx_packets++;
@@ -510,7 +524,8 @@ static unsigned int free_old_xmit_skbs(s
return tot_sgs;
 }
 
-static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
+static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb,
+   struct virtqueue *svq, struct scatterlist *tx_sg)
 {
struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
const unsigned char *dest = ((struct ethhdr *)skb-data)-h_dest;
@@ -548,12 +563,12 @@ static int xmit_skb(struct virtnet_info 
 
/* Encode metadata header at front. */
if (vi-mergeable_rx_bufs

[v3 RFC PATCH 3/4] Changes for vhost

2010-10-20 Thread Krishna Kumar
Changes for mq vhost.

vhost_net_open is changed to allocate a vhost_net and
return.  The remaining initializations are delayed till
SET_OWNER.  SET_OWNER is changed so that the argument
is used to determine how many txqs to use.  Unmodified
qemu's will pass NULL, so this is recognized and handled
as numtxqs=1.

Besides changing handle_tx to use 'vq', this patch also
changes handle_rx to take vq as parameter.  The mq RX
patch requires this change, but till then it is consistent
(and less confusing) to make the interfaces for handling
rx and tx similar.

vhost thread handling for RX and TX is as follows.  The
first vhost thread handles RX traffic, while the remaining
threads handles TX.  The number of threads is = #txqs, and
threads handle more than one txq when #txqs is more than
MAX_VHOST_THREADS (4).  When guest is started with 1 txqs
and there is only one stream of traffic from the guest,
that is recognized and handled such that vhost[0] processes
both RX and TX.  This can change dynamically.  vhost_poll
has a new element - find_vq(), which allows optimizing some
code for cases where numtxqs=1 or a packet on vhost[0]
needs processing.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/vhost/net.c   |  284 ++--
 drivers/vhost/vhost.c |  275 --
 drivers/vhost/vhost.h |   42 +
 3 files changed, 430 insertions(+), 171 deletions(-)

diff -ruNp org/drivers/vhost/vhost.h new/drivers/vhost/vhost.h
--- org/drivers/vhost/vhost.h   2010-10-11 10:21:14.0 +0530
+++ new/drivers/vhost/vhost.h   2010-10-20 14:11:23.0 +0530
@@ -35,11 +35,13 @@ struct vhost_poll {
wait_queue_t  wait;
struct vhost_work work;
unsigned long mask;
-   struct vhost_dev *dev;
+   struct vhost_virtqueue*(*find_vq)(struct vhost_poll *poll);
+   struct vhost_virtqueue*vq;  /* points back to vq */
 };
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-unsigned long mask, struct vhost_dev *dev);
+unsigned long mask, struct vhost_virtqueue *vq,
+int single_queue);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -108,6 +110,10 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
+   struct task_struct *worker; /* vhost for this vq, can be shared */
+   spinlock_t *work_lock;
+   struct list_head *work_list;
+   int qnum;   /* 0 for RX, 1 - n-1 for TX */
 };
 
 struct vhost_dev {
@@ -119,15 +125,39 @@ struct vhost_dev {
struct mutex mutex;
unsigned acked_features;
struct vhost_virtqueue *vqs;
+   unsigned long *jiffies;
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
-   spinlock_t work_lock;
-   struct list_head work_list;
-   struct task_struct *worker;
+   spinlock_t *work_lock;
+   struct list_head *work_list;
 };
 
-long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
+/*
+ * Define maximum number of TX threads, and use that to have a maximum
+ * number of vhost threads to handle RX  TX. First thread handles RX.
+ * If guest is started with #txqs=1, only one vhost thread is started.
+ * Else, upto MAX_VHOST_THREADS are started where th[0] handles RX and
+ * remaining handles TX. However, vhost_poll_queue has an optimization
+ * where th[0] is selected for both RX  TX if there is only one flow.
+ */
+#define MAX_TXQ_THREADS4
+#define MAX_VHOST_THREADS  (MAX_TXQ_THREADS + 1)
+
+static inline int get_nvhosts(int nvqs)
+{
+   int num_vhosts = nvqs - 1;
+
+   if (nvqs > 2)
+   num_vhosts = min_t(int, nvqs, MAX_VHOST_THREADS);
+
+   return num_vhosts;
+}
+
+int vhost_setup_vqs(struct vhost_dev *dev, int numtxqs);
+void vhost_free_vqs(struct vhost_dev *dev);
+long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs,
+   int nvhosts);
 long vhost_dev_check_owner(struct vhost_dev *);
 long vhost_dev_reset_owner(struct vhost_dev *);
 void vhost_dev_cleanup(struct vhost_dev *);
diff -ruNp org/drivers/vhost/net.c new/drivers/vhost/net.c
--- org/drivers/vhost/net.c 2010-10-11 10:21:14.0 +0530
+++ new/drivers/vhost/net.c 2010-10-20 14:20:10.0 +0530
@@ -33,12 +33,6 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x8
 
-enum {
-   VHOST_NET_VQ_RX = 0,
-   VHOST_NET_VQ_TX = 1,
-   VHOST_NET_VQ_MAX = 2,
-};
-
 enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
@@ -47,12 +41,13 @@ enum vhost_net_poll_state {
 
 struct vhost_net {
struct vhost_dev dev

[v3 RFC PATCH 4/4] qemu changes

2010-10-20 Thread Krishna Kumar
Changes in qemu to support mq TX.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
--- 
 hw/vhost.c  |7 --
 hw/vhost.h  |2 -
 hw/vhost_net.c  |   16 +
 hw/vhost_net.h  |2 -
 hw/virtio-net.c |   53 --
 hw/virtio-net.h |2 +
 hw/virtio-pci.c |2 +
 net.c   |   17 ++
 net.h   |1 
 net/tap.c   |   34 ++---
 10 files changed, 107 insertions(+), 29 deletions(-)

diff -ruNp org3/hw/vhost.c new3/hw/vhost.c
--- org3/hw/vhost.c 2010-10-19 19:38:11.0 +0530
+++ new3/hw/vhost.c 2010-10-20 12:44:21.0 +0530
@@ -580,7 +580,7 @@ static void vhost_virtqueue_cleanup(stru
   0, virtio_queue_get_desc_size(vdev, idx));
 }
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd)
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, int numtxqs)
 {
 uint64_t features;
 int r;
@@ -592,11 +592,14 @@ int vhost_dev_init(struct vhost_dev *hde
 return -errno;
 }
 }
-r = ioctl(hdev->control, VHOST_SET_OWNER, NULL);
+
+r = ioctl(hdev->control, VHOST_SET_OWNER, numtxqs);
+if (r < 0) {
+goto fail;
+}
 
+hdev->nvqs = numtxqs + 1;
+
+r = ioctl(hdev->control, VHOST_GET_FEATURES, &features);
+if (r < 0) {
+goto fail;
diff -ruNp org3/hw/vhost.h new3/hw/vhost.h
--- org3/hw/vhost.h 2010-07-01 11:42:09.0 +0530
+++ new3/hw/vhost.h 2010-10-20 12:47:10.0 +0530
@@ -40,7 +40,7 @@ struct vhost_dev {
 unsigned long long log_size;
 };
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd);
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, int numtxqs);
 void vhost_dev_cleanup(struct vhost_dev *hdev);
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
diff -ruNp org3/hw/vhost_net.c new3/hw/vhost_net.c
--- org3/hw/vhost_net.c 2010-09-28 10:07:31.0 +0530
+++ new3/hw/vhost_net.c 2010-10-19 19:46:52.0 +0530
@@ -36,7 +36,8 @@
 
 struct vhost_net {
 struct vhost_dev dev;
-struct vhost_virtqueue vqs[2];
+struct vhost_virtqueue *vqs;
+int nvqs;
 int backend;
 VLANClientState *vc;
 };
@@ -81,7 +82,8 @@ static int vhost_net_get_fd(VLANClientSt
 }
 }
 
-struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd,
+int numtxqs)
 {
 int r;
 struct vhost_net *net = qemu_malloc(sizeof *net);
@@ -98,10 +100,14 @@ struct vhost_net *vhost_net_init(VLANCli
 (1 << VHOST_NET_F_VIRTIO_NET_HDR);
 net->backend = r;
 
-r = vhost_dev_init(&net->dev, devfd);
+r = vhost_dev_init(&net->dev, devfd, numtxqs);
 if (r < 0) {
 goto fail;
 }
+
+net->nvqs = numtxqs + 1;
+net->vqs = qemu_malloc(net->nvqs * (sizeof *net->vqs));
+
 if (!tap_has_vnet_hdr_len(backend,
   sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
 net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
@@ -131,7 +137,6 @@ int vhost_net_start(struct vhost_net *ne
  sizeof(struct virtio_net_hdr_mrg_rxbuf));
 }
 
-net-dev.nvqs = 2;
 net-dev.vqs = net-vqs;
 r = vhost_dev_start(net-dev, dev);
 if (r  0) {
@@ -188,7 +193,8 @@ void vhost_net_cleanup(struct vhost_net 
 qemu_free(net);
 }
 #else
-struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd,
+int nvqs)
 {
return NULL;
 }
diff -ruNp org3/hw/vhost_net.h new3/hw/vhost_net.h
--- org3/hw/vhost_net.h 2010-07-01 11:42:09.0 +0530
+++ new3/hw/vhost_net.h 2010-10-19 19:46:52.0 +0530
@@ -6,7 +6,7 @@
 struct vhost_net;
 typedef struct vhost_net VHostNetState;
 
-VHostNetState *vhost_net_init(VLANClientState *backend, int devfd);
+VHostNetState *vhost_net_init(VLANClientState *backend, int devfd, int nvqs);
 
 int vhost_net_start(VHostNetState *net, VirtIODevice *dev);
 void vhost_net_stop(VHostNetState *net, VirtIODevice *dev);
diff -ruNp org3/hw/virtio-net.c new3/hw/virtio-net.c
--- org3/hw/virtio-net.c2010-10-19 19:38:11.0 +0530
+++ new3/hw/virtio-net.c2010-10-19 21:02:33.0 +0530
@@ -32,7 +32,7 @@ typedef struct VirtIONet
 uint8_t mac[ETH_ALEN];
 uint16_t status;
 VirtQueue *rx_vq;
-VirtQueue *tx_vq;
+VirtQueue **tx_vq;
 VirtQueue *ctrl_vq;
 NICState *nic;
 QEMUTimer *tx_timer;
@@ -65,6 +65,7 @@ typedef struct VirtIONet
 } mac_table;
 uint32_t *vlans;
 DeviceState *qdev;
+uint16_t numtxqs;
 } VirtIONet;
 
 /* TODO
@@ -82,6 +83,7 @@ static void virtio_net_get_config(VirtIO
 struct virtio_net_config netcfg;
 
 netcfg.status = n->status;
+netcfg.numtxqs = n->numtxqs;
 memcpy(netcfg.mac, n-mac, ETH_ALEN

[v2 RFC PATCH 0/4] Implement multiqueue virtio-net

2010-09-17 Thread Krishna Kumar
)  41263847  (-6.76)15953   12172  (-23.70)
64   23411  27661 (18.15)  72166035  (-16.36)   28146   19078  (-32.21)
80   23175  27141 (17.11)  11729   12454 (6.18) 44765   39750  (-11.20)
96   23337  26759 (14.66)  16745   15905 (-5.01)65099   50261  (-22.79)
128  22726  28339 (24.69)  30571   27893 (-8.76)118089  89994  (-23.79)

Summary:BW: 22.8%SD: -4.21%RSD: -21.06%

   UDP: BW vs SD/CPU
#  BW1  BW2 (%)  CPU1  CPU2 (%)  SD1SD2(%)
_
1  3652137415 (2.44)   61 61(0)  2  2  (0)
4  2858546903 (64.08)  397546   (37.53)  72 68 (-5.55)
8  2664944694 (67.71)  8511243  (46.06)  334339(1.49)
16 2590543385 (67.47)  1740   2631  (51.20)  1409   1572   (11.56)
32 2498040448 (61.92)  3502   5360  (53.05)  5881   6401   (8.84)
48 2743939451 (43.77)  5410   8324  (53.86)  12475  14855  (19.07)
64 2568239915 (55.42)  7165   10825 (51.08)  23404  25982  (11.01)
96 2620540190 (53.36)  10855  16283 (50.00)  52124  75014  (43.91)
1282574140252 (56.37)  14448  22186 (53.55)  133922 96843  (-27.68)

Summary:   BW: 50.4  CPU: 51.8  SD: -27.68
_
N#: Number of netperf sessions, 60 sec runs
BW1,SD1,RSD1: Bandwidth (sum across 2 runs in mbps), SD and Remote
  SD for original code
BW2,SD2,RSD2: Bandwidth (sum across 2 runs in mbps), SD and Remote
  SD for new code.
CPU1,CPU2,RCPU1,RCPU2: Similar to SD.

For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
for degradation for 1 stream case:
1. Without any tuning, BW falls -6.5%.
2. When vhosts on server were bound to CPU0, BW was as good
   as with original code.
3. When new code was started with numtxqs=1 (or mq=off, which
   is the default), there was no degradation.


   Next steps:
   ---
1. MQ RX patch is also complete - plan to submit once TX is OK (as
   well as after identifying bandwidth degradations for some test
   cases).
2. Cache-align data structures: I didn't see any BW/SD improvement
   after making the sq's (and similarly for vhost) cache-aligned
   statically:
struct virtnet_info {
...
struct send_queue sq[16] cacheline_aligned_in_smp;
...
};
3. Migration is not tested.

Review/feedback appreciated.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 RFC PATCH 1/4] Change virtqueue structure

2010-09-17 Thread Krishna Kumar
Move queue_index from virtio_pci_vq_info to virtqueue. This
allows callback handlers to figure out the queue number for
the vq that needs attention.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com 
---
 drivers/virtio/virtio_pci.c |   10 +++---
 include/linux/virtio.h  |1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

diff -ruNp org2/include/linux/virtio.h tx_only2/include/linux/virtio.h
--- org2/include/linux/virtio.h 2010-06-02 18:46:43.0 +0530
+++ tx_only2/include/linux/virtio.h 2010-09-16 15:24:01.0 +0530
@@ -22,6 +22,7 @@ struct virtqueue {
void (*callback)(struct virtqueue *vq);
const char *name;
struct virtio_device *vdev;
+   int queue_index;/* the index of the queue */
void *priv;
 };
 
diff -ruNp org2/drivers/virtio/virtio_pci.c tx_only2/drivers/virtio/virtio_pci.c
--- org2/drivers/virtio/virtio_pci.c2010-08-05 14:48:06.0 +0530
+++ tx_only2/drivers/virtio/virtio_pci.c2010-09-16 15:24:01.0 
+0530
@@ -75,9 +75,6 @@ struct virtio_pci_vq_info
/* the number of entries in the queue */
int num;
 
-   /* the index of the queue */
-   int queue_index;
-
/* the virtual address of the ring queue */
void *queue;
 
@@ -185,11 +182,10 @@ static void vp_reset(struct virtio_devic
 static void vp_notify(struct virtqueue *vq)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-   struct virtio_pci_vq_info *info = vq->priv;
 
/* we write the queue's selector into the notification register to
 * signal the other end */
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -385,7 +381,6 @@ static struct virtqueue *setup_vq(struct
if (!info)
return ERR_PTR(-ENOMEM);
 
-   info->queue_index = index;
info->num = num;
info->msix_vector = msix_vec;
 
@@ -408,6 +403,7 @@ static struct virtqueue *setup_vq(struct
goto out_activate_queue;
}
 
+   vq->queue_index = index;
vq->priv = info;
info->vq = vq;
 
@@ -446,7 +442,7 @@ static void vp_del_vq(struct virtqueue *
list_del(&info->node);
spin_unlock_irqrestore(&vp_dev->lock, flags);
 
-   iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+   iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
if (vp_dev-msix_enabled) {
iowrite16(VIRTIO_MSI_NO_VECTOR,
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 RFC PATCH 3/4] Changes for vhost

2010-09-17 Thread Krishna Kumar
Changes for mq vhost.

vhost_net_open is changed to allocate a vhost_net and
return.  The remaining initializations are delayed till
SET_OWNER. SET_OWNER is changed so that the argument
is used to figure out how many txqs to use.  Unmodified
qemu's will pass NULL, so this is recognized and handled
as numtxqs=1.
 
Besides changing handle_tx to use 'vq', this patch also
changes handle_rx to take vq as parameter.  The mq RX
patch requires this change, but till then it is consistent
(and less confusing) to make the interfaces for handling
rx and tx similar.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/vhost/net.c   |  273 ++--
 drivers/vhost/vhost.c |  186 +++
 drivers/vhost/vhost.h |   16 +-
 3 files changed, 324 insertions(+), 151 deletions(-)

diff -ruNp org2/drivers/vhost/vhost.h tx_only2/drivers/vhost/vhost.h
--- org2/drivers/vhost/vhost.h  2010-08-03 08:49:31.0 +0530
+++ tx_only2/drivers/vhost/vhost.h  2010-09-16 15:24:01.0 +0530
@@ -40,11 +40,11 @@ struct vhost_poll {
wait_queue_t  wait;
struct vhost_work work;
unsigned long mask;
-   struct vhost_dev *dev;
+   struct vhost_virtqueue*vq;  /* points back to vq */
 };
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-unsigned long mask, struct vhost_dev *dev);
+unsigned long mask, struct vhost_virtqueue *vq);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -110,6 +110,10 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log log[VHOST_NET_MAX_SG];
+   struct task_struct *worker; /* vhost for this vq, shared btwn RX/TX */
+   spinlock_t *work_lock;
+   struct list_head *work_list;
+   int qnum;   /* 0 for RX, 1 - n-1 for TX */
 };
 
 struct vhost_dev {
@@ -124,11 +128,13 @@ struct vhost_dev {
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
-   spinlock_t work_lock;
-   struct list_head work_list;
-   struct task_struct *worker;
+   spinlock_t *work_lock;
+   struct list_head *work_list;
 };
 
+int vhost_setup_vqs(struct vhost_dev *dev, int numtxqs);
+void vhost_free_vqs(struct vhost_dev *dev);
+int vhost_get_num_threads(int nvqs);
 long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
 long vhost_dev_check_owner(struct vhost_dev *);
 long vhost_dev_reset_owner(struct vhost_dev *);
diff -ruNp org2/drivers/vhost/net.c tx_only2/drivers/vhost/net.c
--- org2/drivers/vhost/net.c2010-08-05 14:48:06.0 +0530
+++ tx_only2/drivers/vhost/net.c2010-09-16 15:24:01.0 +0530
@@ -33,12 +33,6 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
-enum {
-   VHOST_NET_VQ_RX = 0,
-   VHOST_NET_VQ_TX = 1,
-   VHOST_NET_VQ_MAX = 2,
-};
-
 enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
@@ -47,12 +41,13 @@ enum vhost_net_poll_state {
 
 struct vhost_net {
struct vhost_dev dev;
-   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
-   struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct vhost_virtqueue *vqs;
+   struct vhost_poll *poll;
+   struct socket **socks;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
-   enum vhost_net_poll_state tx_poll_state;
+   enum vhost_net_poll_state *tx_poll_state;
 };
 
 /* Pop first len bytes from iovec. Return number of segments used. */
@@ -92,28 +87,28 @@ static void copy_iovec_hdr(const struct 
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_stop(struct vhost_net *net)
+static void tx_poll_stop(struct vhost_net *net, int qnum)
 {
-   if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
+   if (likely(net->tx_poll_state[qnum] != VHOST_NET_POLL_STARTED))
        return;
-   vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
-   net->tx_poll_state = VHOST_NET_POLL_STOPPED;
+   vhost_poll_stop(&net->poll[qnum]);
+   net->tx_poll_state[qnum] = VHOST_NET_POLL_STOPPED;
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_start(struct vhost_net *net, struct socket *sock)
+static void tx_poll_start(struct vhost_net *net, struct socket *sock, int qnum)
 {
-   if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
+   if (unlikely(net->tx_poll_state[qnum] != VHOST_NET_POLL_STOPPED))
        return;
-   vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
+   vhost_poll_start(&net->poll[qnum], sock->file);
+   net-tx_poll_state

[v2 RFC PATCH 2/4] Changes for virtio-net

2010-09-17 Thread Krishna Kumar
Implement mq virtio-net driver. 

Though struct virtio_net_config changes, it works with old
qemu's since the last element is not accessed, unless qemu
sets VIRTIO_NET_F_NUMTXQS.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/net/virtio_net.c   |  213 ++-
 include/linux/virtio_net.h |3 
 2 files changed, 163 insertions(+), 53 deletions(-)

diff -ruNp org2/include/linux/virtio_net.h tx_only2/include/linux/virtio_net.h
--- org2/include/linux/virtio_net.h 2010-02-10 13:20:27.0 +0530
+++ tx_only2/include/linux/virtio_net.h 2010-09-16 15:24:01.0 +0530
@@ -26,6 +26,7 @@
 #define VIRTIO_NET_F_CTRL_RX   18  /* Control channel RX mode support */
 #define VIRTIO_NET_F_CTRL_VLAN 19  /* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20  /* Extra RX mode control support */
+#define VIRTIO_NET_F_NUMTXQS   21  /* Device supports multiple TX queue */
 
 #define VIRTIO_NET_S_LINK_UP   1   /* Link is up */
 
@@ -34,6 +35,8 @@ struct virtio_net_config {
__u8 mac[6];
/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
__u16 status;
+   /* number of transmit queues */
+   __u16 numtxqs;
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't
diff -ruNp org2/drivers/net/virtio_net.c tx_only2/drivers/net/virtio_net.c
--- org2/drivers/net/virtio_net.c   2010-07-08 12:54:32.0 +0530
+++ tx_only2/drivers/net/virtio_net.c   2010-09-16 15:24:01.0 +0530
@@ -40,9 +40,20 @@ module_param(gso, bool, 0444);
 
 #define VIRTNET_SEND_COMMAND_SG_MAX2
 
+/* Our representation of a send virtqueue */
+struct send_queue {
+   struct virtqueue *svq;
+
+   /* TX: fragments + linear part + virtio header */
+   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+};
+
 struct virtnet_info {
struct virtio_device *vdev;
-   struct virtqueue *rvq, *svq, *cvq;
+   int numtxqs;/* Number of tx queues */
+   struct send_queue *sq;
+   struct virtqueue *rvq;
+   struct virtqueue *cvq;
struct net_device *dev;
struct napi_struct napi;
unsigned int status;
@@ -62,9 +73,8 @@ struct virtnet_info {
/* Chain pages by the private ptr. */
struct page *pages;
 
-   /* fragments + linear part + virtio header */
+   /* RX: fragments + linear part + virtio header */
struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-   struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
 };
 
 struct skb_vnet_hdr {
@@ -120,12 +130,13 @@ static struct page *get_a_page(struct vi
 static void skb_xmit_done(struct virtqueue *svq)
 {
 	struct virtnet_info *vi = svq->vdev->priv;
+	int qnum = svq->queue_index - 1;	/* 0 is RX vq */
 
/* Suppress further interrupts. */
virtqueue_disable_cb(svq);
 
/* We were probably waiting for more output buffers. */
-   netif_wake_queue(vi-dev);
+   netif_wake_subqueue(vi-dev, qnum);
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
@@ -495,12 +506,13 @@ again:
return received;
 }
 
-static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
+static unsigned int free_old_xmit_skbs(struct virtnet_info *vi,
+  struct virtqueue *svq)
 {
struct sk_buff *skb;
unsigned int len, tot_sgs = 0;
 
-   while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
+   while ((skb = virtqueue_get_buf(svq, &len)) != NULL) {
 		pr_debug("Sent skb %p\n", skb);
 		vi->dev->stats.tx_bytes += skb->len;
 		vi->dev->stats.tx_packets++;
@@ -510,7 +522,8 @@ static unsigned int free_old_xmit_skbs(s
return tot_sgs;
 }
 
-static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
+static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb,
+   struct virtqueue *svq, struct scatterlist *tx_sg)
 {
struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
const unsigned char *dest = ((struct ethhdr *)skb-data)-h_dest;
@@ -548,12 +561,12 @@ static int xmit_skb(struct virtnet_info 
 
/* Encode metadata header at front. */
if (vi-mergeable_rx_bufs)
-		sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr);
+		sg_set_buf(tx_sg, &hdr->mhdr, sizeof hdr->mhdr);
 	else
-		sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr);
+		sg_set_buf(tx_sg, &hdr->hdr, sizeof hdr->hdr);
 
-	hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
-	return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg,
+	hdr->num_sg = skb_to_sgvec(skb, tx_sg + 1, 0, skb->len) + 1;
+	return virtqueue_add_buf(svq, tx_sg, hdr->num_sg,
0, skb);
 }
 
@@ -561,31 +574,34 @@ static netdev_tx_t start_xmit(struct sk_
 {
struct virtnet_info *vi = netdev_priv(dev);
int capacity;
+   int

[v2 RFC PATCH 4/4] qemu changes

2010-09-17 Thread Krishna Kumar
Changes in qemu to support mq TX.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 hw/vhost.c  |8 ++-
 hw/vhost.h  |2 
 hw/vhost_net.c  |   16 +--
 hw/vhost_net.h  |2 
 hw/virtio-net.c |   97 ++
 hw/virtio-net.h |2 
 hw/virtio-pci.c |2 
 net.c   |   17 
 net.h   |1 
 net/tap.c   |   27 ++--
 10 files changed, 129 insertions(+), 45 deletions(-)

diff -ruNp org2/hw/vhost.c tx_only.rev2/hw/vhost.c
--- org2/hw/vhost.c 2010-08-09 09:51:58.0 +0530
+++ tx_only.rev2/hw/vhost.c 2010-09-16 16:23:56.0 +0530
@@ -599,23 +599,27 @@ static void vhost_virtqueue_cleanup(stru
   0, virtio_queue_get_desc_size(vdev, idx));
 }
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd)
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, int numtxqs)
 {
 uint64_t features;
 int r;
     if (devfd >= 0) {
         hdev->control = devfd;
+        hdev->nvqs = 2;
     } else {
         hdev->control = open("/dev/vhost-net", O_RDWR);
         if (hdev->control < 0) {
             return -errno;
         }
 }
-    r = ioctl(hdev->control, VHOST_SET_OWNER, NULL);
+
+    r = ioctl(hdev->control, VHOST_SET_OWNER, numtxqs);
     if (r < 0) {
         goto fail;
     }
 
+    hdev->nvqs = numtxqs + 1;
+
     r = ioctl(hdev->control, VHOST_GET_FEATURES, &features);
 if (r  0) {
 goto fail;
diff -ruNp org2/hw/vhost.h tx_only.rev2/hw/vhost.h
--- org2/hw/vhost.h 2010-07-01 11:42:09.0 +0530
+++ tx_only.rev2/hw/vhost.h 2010-09-16 16:23:56.0 +0530
@@ -40,7 +40,7 @@ struct vhost_dev {
 unsigned long long log_size;
 };
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd);
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, int nvqs);
 void vhost_dev_cleanup(struct vhost_dev *hdev);
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
diff -ruNp org2/hw/vhost_net.c tx_only.rev2/hw/vhost_net.c
--- org2/hw/vhost_net.c 2010-08-09 09:51:58.0 +0530
+++ tx_only.rev2/hw/vhost_net.c 2010-09-16 16:23:56.0 +0530
@@ -36,7 +36,8 @@
 
 struct vhost_net {
 struct vhost_dev dev;
-struct vhost_virtqueue vqs[2];
+struct vhost_virtqueue *vqs;
+int nvqs;
 int backend;
 VLANClientState *vc;
 };
@@ -76,7 +77,8 @@ static int vhost_net_get_fd(VLANClientSt
 }
 }
 
-struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd,
+int numtxqs)
 {
 int r;
 struct vhost_net *net = qemu_malloc(sizeof *net);
@@ -93,10 +95,14 @@ struct vhost_net *vhost_net_init(VLANCli
         (1 << VHOST_NET_F_VIRTIO_NET_HDR);
     net->backend = r;
 
-    r = vhost_dev_init(&net->dev, devfd);
+    r = vhost_dev_init(&net->dev, devfd, numtxqs);
     if (r < 0) {
         goto fail;
     }
+
+    net->nvqs = numtxqs + 1;
+    net->vqs = qemu_malloc(net->nvqs * (sizeof *net->vqs));
+
+
     if (~net->dev.features & net->dev.backend_features) {
         fprintf(stderr, "vhost lacks feature mask %" PRIu64 " for backend\n",
                 (uint64_t)(~net->dev.features & net->dev.backend_features));
@@ -118,7 +124,6 @@ int vhost_net_start(struct vhost_net *ne
 struct vhost_vring_file file = { };
 int r;
 
-net-dev.nvqs = 2;
 net-dev.vqs = net-vqs;
 r = vhost_dev_start(net-dev, dev);
 if (r  0) {
@@ -166,7 +171,8 @@ void vhost_net_cleanup(struct vhost_net 
 qemu_free(net);
 }
 #else
-struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd,
+int nvqs)
 {
return NULL;
 }
diff -ruNp org2/hw/vhost_net.h tx_only.rev2/hw/vhost_net.h
--- org2/hw/vhost_net.h 2010-07-01 11:42:09.0 +0530
+++ tx_only.rev2/hw/vhost_net.h 2010-09-16 16:23:56.0 +0530
@@ -6,7 +6,7 @@
 struct vhost_net;
 typedef struct vhost_net VHostNetState;
 
-VHostNetState *vhost_net_init(VLANClientState *backend, int devfd);
+VHostNetState *vhost_net_init(VLANClientState *backend, int devfd, int nvqs);
 
 int vhost_net_start(VHostNetState *net, VirtIODevice *dev);
 void vhost_net_stop(VHostNetState *net, VirtIODevice *dev);
diff -ruNp org2/hw/virtio-net.c tx_only.rev2/hw/virtio-net.c
--- org2/hw/virtio-net.c2010-07-19 12:41:28.0 +0530
+++ tx_only.rev2/hw/virtio-net.c2010-09-16 16:23:56.0 +0530
@@ -32,17 +32,17 @@ typedef struct VirtIONet
 uint8_t mac[ETH_ALEN];
 uint16_t status;
 VirtQueue *rx_vq;
-VirtQueue *tx_vq;
+VirtQueue **tx_vq;
 VirtQueue *ctrl_vq;
 NICState *nic;
-QEMUTimer *tx_timer;
-int tx_timer_active;
+QEMUTimer **tx_timer;
+int *tx_timer_active;
 uint32_t has_vnet_hdr;
 uint8_t has_ufo;
 struct {
 VirtQueueElement elem;
 ssize_t len

[PATCH] vhost: Fix host panic if ioctl called with wrong index

2010-05-24 Thread Krishna Kumar
From: Krishna Kumar krkum...@in.ibm.com

Missed a boundary value check in vhost_set_vring. The host panics if
idx == nvqs is used in ioctl commands in vhost_virtqueue_init.

Signed-off-by: Krishna Kumar krkum...@in.ibm.com
---
 drivers/vhost/vhost.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff -ruNp org/drivers/vhost/vhost.c new/drivers/vhost/vhost.c
--- org/drivers/vhost/vhost.c   2010-05-24 09:25:57.0 +0530
+++ new/drivers/vhost/vhost.c   2010-05-24 09:26:53.0 +0530
@@ -374,7 +374,7 @@ static long vhost_set_vring(struct vhost
 	r = get_user(idx, idxp);
 	if (r < 0)
 		return r;
-	if (idx > d->nvqs)
+	if (idx >= d->nvqs)
 		return -ENOBUFS;
 
 	vq = d->vqs + idx;
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html