Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-07 Thread Michael S. Tsirkin
On Wed, Apr 07, 2010 at 10:41:08AM +0800, Xin, Xiaohui wrote:
 Michael,
  
  Qemu needs a userspace write, is that a synchronous one or
 asynchronous one?
 
 It's a synchronous non-blocking write.
 Sorry, why the Qemu live migration needs the device have a userspace write?
 how does the write operation work? And why a read operation is not cared here?
 
 Thanks
 Xiaohui

Roughly, with ethernet bridges, moving a device from one location in
the network to another makes forwarding tables incorrect (or incomplete),
until outgoing traffic from the device causes these tables
to be updated. Since there's no guarantee that guest
will generate outgoing traffic, after migration qemu sends out several
dummy packets itself.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re:[PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-07 Thread xiaohui . xin
From: Xin Xiaohui xiaohui@intel.com

---

Michael,
Thanks a lot for the explanation. I have drafted a patch for the qemu write
after I looked into tun driver. Does it do in right way?

Thanks
Xiaohui

 drivers/vhost/mpassthru.c |   45 +
 1 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index e9449ac..1cde097 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -1065,6 +1065,49 @@ static unsigned int mp_chr_poll(struct file *file, 
poll_table * wait)
return mask;
 }
 
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+   unsigned long count, loff_t pos)
+{
+   struct file *file = iocb-ki_filp;
+   struct mp_struct *mp = mp_get(file-private_data);
+   struct sock *sk = mp-socket.sk;
+   struct sk_buff *skb;
+   int len, err;
+   ssize_t result;
+
+   if (!mp)
+   return -EBADFD;
+
+   /* currently, async is not supported */
+   if (!is_sync_kiocb(iocb))
+   return -EFAULT;
+
+   len = iov_length(iov, count);
+   skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+ file-f_flags  O_NONBLOCK, err);
+
+   if (!skb)
+   return -EFAULT;
+
+   skb_reserve(skb, NET_IP_ALIGN);
+   skb_put(skb, len);
+
+   if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+   kfree_skb(skb);
+   return -EFAULT;
+   }
+   skb_set_network_header(skb, ETH_HLEN);
+   skb-protocol = *((__be16 *)(skb-data) + ETH_ALEN);
+   skb-dev = mp-dev;
+
+   dev_queue_xmit(skb);
+   mp-dev-stats.tx_packets++;
+   mp-dev-stats.tx_bytes += len;
+
+   mp_put(mp);
+   return result;
+}
+
 static int mp_chr_close(struct inode *inode, struct file *file)
 {
struct mp_file *mfile = file-private_data;
@@ -1084,6 +1127,8 @@ static int mp_chr_close(struct inode *inode, struct file 
*file)
 static const struct file_operations mp_fops = {
.owner  = THIS_MODULE,
.llseek = no_llseek,
+   .write  = do_sync_write,
+   .aio_write = mp_chr_aio_write,
.poll   = mp_chr_poll,
.unlocked_ioctl = mp_chr_ioctl,
.open   = mp_chr_open,
-- 
1.5.4.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-07 Thread Michael S. Tsirkin
On Wed, Apr 07, 2010 at 05:00:39PM +0800, xiaohui@intel.com wrote:
 From: Xin Xiaohui xiaohui@intel.com
 
 ---
 
 Michael,
 Thanks a lot for the explanation. I have drafted a patch for the qemu write
 after I looked into tun driver. Does it do in right way?
 
 Thanks
 Xiaohui
 
  drivers/vhost/mpassthru.c |   45 
 +
  1 files changed, 45 insertions(+), 0 deletions(-)
 
 diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
 index e9449ac..1cde097 100644
 --- a/drivers/vhost/mpassthru.c
 +++ b/drivers/vhost/mpassthru.c
 @@ -1065,6 +1065,49 @@ static unsigned int mp_chr_poll(struct file *file, 
 poll_table * wait)
   return mask;
  }
  
 +static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
 + unsigned long count, loff_t pos)
 +{
 + struct file *file = iocb-ki_filp;
 + struct mp_struct *mp = mp_get(file-private_data);
 + struct sock *sk = mp-socket.sk;
 + struct sk_buff *skb;
 + int len, err;
 + ssize_t result;
 +
 + if (!mp)
 + return -EBADFD;
 +

Can this happen? When?

 + /* currently, async is not supported */
 + if (!is_sync_kiocb(iocb))
 + return -EFAULT;

Really necessary? I think do_sync_write handles all this.

 +
 + len = iov_length(iov, count);
 + skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
 +   file-f_flags  O_NONBLOCK, err);
 +
 + if (!skb)
 + return -EFAULT;

Surely not EFAULT. -EAGAIN?

 +
 + skb_reserve(skb, NET_IP_ALIGN);
 + skb_put(skb, len);
 +
 + if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
 + kfree_skb(skb);
 + return -EFAULT;
 + }
 + skb_set_network_header(skb, ETH_HLEN);

Is this really right or necessary? Also,
probably need to check that length is at least ETH_ALEN before
doing this.

 + skb-protocol = *((__be16 *)(skb-data) + ETH_ALEN);

eth_type_trans?

 + skb-dev = mp-dev;
 +
 + dev_queue_xmit(skb);
 + mp-dev-stats.tx_packets++;
 + mp-dev-stats.tx_bytes += len;

Doesn't the hard start xmit function for the device
increment the counters?

 +
 + mp_put(mp);
 + return result;
 +}
 +
  static int mp_chr_close(struct inode *inode, struct file *file)
  {
   struct mp_file *mfile = file-private_data;
 @@ -1084,6 +1127,8 @@ static int mp_chr_close(struct inode *inode, struct 
 file *file)
  static const struct file_operations mp_fops = {
   .owner  = THIS_MODULE,
   .llseek = no_llseek,
 + .write  = do_sync_write,
 + .aio_write = mp_chr_aio_write,
   .poll   = mp_chr_poll,
   .unlocked_ioctl = mp_chr_ioctl,
   .open   = mp_chr_open,
 -- 
 1.5.4.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-06 Thread Michael S. Tsirkin
On Tue, Apr 06, 2010 at 01:41:37PM +0800, Xin, Xiaohui wrote:
 Michael,
  
 For the DOS issue, I'm not sure how much the limit get_user_pages()
  can pin is reasonable, should we compute the bindwidth to make it?
 
 There's a ulimit for locked memory. Can we use this, decreasing
 the value for rlimit array? We can do this when backend is
 enabled and re-increment when backend is disabled.
 
 I have tried it with rlim[RLIMIT_MEMLOCK].rlim_cur, but I found
 the initial value for it is 0x1, after right shift PAGE_SHIFT,
 it's only 16 pages we can lock then, it seems too small, since the 
 guest virito-net driver may submit a lot requests one time.
 
 
 Thanks
 Xiaohui

Yes, that's the default, but system administrator can always increase
this value with ulimit if necessary.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-06 Thread Xin, Xiaohui
Michael,
 
 Qemu needs a userspace write, is that a synchronous one or
asynchronous one?

It's a synchronous non-blocking write.
Sorry, why the Qemu live migration needs the device have a userspace write?
how does the write operation work? And why a read operation is not cared here?

Thanks
Xiaohui
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-05 Thread Xin, Xiaohui
Michael,
 
For the DOS issue, I'm not sure how much the limit get_user_pages()
 can pin is reasonable, should we compute the bindwidth to make it?

There's a ulimit for locked memory. Can we use this, decreasing
the value for rlimit array? We can do this when backend is
enabled and re-increment when backend is disabled.

I have tried it with rlim[RLIMIT_MEMLOCK].rlim_cur, but I found
the initial value for it is 0x1, after right shift PAGE_SHIFT,
it's only 16 pages we can lock then, it seems too small, since the 
guest virito-net driver may submit a lot requests one time.


Thanks
Xiaohui

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re:[PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-01 Thread Xin Xiaohui
Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui xiaohui@intel.com
Signed-off-by: Zhao Yu yzha...@gmail.com
Sigend-off-by: Jeff Dike jd...@c2.user-mode-linux.org
---

Micheal,
Sorry, I did not resolve all your comments this time.
I did not move the device out of vhost directory because I
did not implement real asynchronous read/write operations
to mp device for now, We wish we can do this after the network
code checked in. 

For the DOS issue, I'm not sure how much the limit get_user_pages()
can pin is reasonable, should we compute the bindwidth to make it?

We use get_user_pages_fast() and use set_page_dirty_lock().
Remove read_rcu_lock()/unlock(), since the ctor pointer is
only changed by BIND/UNBIND ioctl, and during that time,
the NIC is always stoped, all outstanding requests are done,
so the ctor pointer cannot be raced into wrong condition.

Qemu needs a userspace write, is that a synchronous one or
asynchronous one?

Thanks
Xiaohui

 drivers/vhost/Kconfig |5 +
 drivers/vhost/Makefile|2 +
 drivers/vhost/mpassthru.c | 1162 +
 include/linux/mpassthru.h |   29 ++
 4 files changed, 1198 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..ee32a3b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,8 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config VHOST_PASSTHRU
+   tristate Zerocopy network driver (EXPERIMENTAL)
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..3f79c79 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..6e8fc4d
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1162 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAMEmpassthru
+#define DRV_DESCRIPTION Mediate passthru device driver
+#define DRV_COPYRIGHT   (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+
+#include linux/module.h
+#include linux/errno.h
+#include linux/kernel.h
+#include linux/major.h
+#include linux/slab.h
+#include linux/smp_lock.h
+#include linux/poll.h
+#include linux/fcntl.h
+#include linux/init.h
+#include linux/aio.h
+
+#include linux/skbuff.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/miscdevice.h
+#include linux/ethtool.h
+#include linux/rtnetlink.h
+#include linux/if.h
+#include linux/if_arp.h
+#include linux/if_ether.h
+#include linux/crc32.h
+#include linux/nsproxy.h
+#include linux/uaccess.h
+#include linux/virtio_net.h
+#include linux/mpassthru.h
+#include net/net_namespace.h
+#include net/netns/generic.h
+#include net/rtnetlink.h
+#include net/sock.h
+
+#include asm/system.h
+
+#include vhost.h
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp-debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES  64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct page_info {
+   void*ctrl;
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* send/recv in the user space buffers
+*/
+   int total;
+   int 

Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-01 Thread Michael S. Tsirkin
On Thu, Apr 01, 2010 at 05:27:18PM +0800, Xin Xiaohui wrote:
 Add a device to utilize the vhost-net backend driver for
 copy-less data transfer between guest FE and host NIC.
 It pins the guest user space to the host memory and
 provides proto_ops as sendmsg/recvmsg to vhost-net.
 
 Signed-off-by: Xin Xiaohui xiaohui@intel.com
 Signed-off-by: Zhao Yu yzha...@gmail.com
 Sigend-off-by: Jeff Dike jd...@c2.user-mode-linux.org
 ---
 
 Micheal,
 Sorry, I did not resolve all your comments this time.
 I did not move the device out of vhost directory because I
 did not implement real asynchronous read/write operations
 to mp device for now, We wish we can do this after the network
 code checked in. 

Well, placement of code is not such a major issue.
It's just that code under drivers/net gets more and better
review than drivers/vhost. I'll try to get Dave's opinion.

 
 For the DOS issue, I'm not sure how much the limit get_user_pages()
 can pin is reasonable, should we compute the bindwidth to make it?

There's a ulimit for locked memory. Can we use this, decreasing
the value for rlimit array? We can do this when backend is
enabled and re-increment when backend is disabled.

 We use get_user_pages_fast() and use set_page_dirty_lock().
 Remove read_rcu_lock()/unlock(), since the ctor pointer is
 only changed by BIND/UNBIND ioctl, and during that time,
 the NIC is always stoped, all outstanding requests are done,
 so the ctor pointer cannot be raced into wrong condition.
 
 Qemu needs a userspace write, is that a synchronous one or
 asynchronous one?

It's a synchronous non-blocking write.

 Thanks
 Xiaohui
 
  drivers/vhost/Kconfig |5 +
  drivers/vhost/Makefile|2 +
  drivers/vhost/mpassthru.c | 1162 
 +
  include/linux/mpassthru.h |   29 ++
  4 files changed, 1198 insertions(+), 0 deletions(-)
  create mode 100644 drivers/vhost/mpassthru.c
  create mode 100644 include/linux/mpassthru.h
 
 diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
 index 9f409f4..ee32a3b 100644
 --- a/drivers/vhost/Kconfig
 +++ b/drivers/vhost/Kconfig
 @@ -9,3 +9,8 @@ config VHOST_NET
 To compile this driver as a module, choose M here: the module will
 be called vhost_net.
  
 +config VHOST_PASSTHRU
 + tristate Zerocopy network driver (EXPERIMENTAL)
 + depends on VHOST_NET
 + ---help---
 +   zerocopy network I/O support
 diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
 index 72dd020..3f79c79 100644
 --- a/drivers/vhost/Makefile
 +++ b/drivers/vhost/Makefile
 @@ -1,2 +1,4 @@
  obj-$(CONFIG_VHOST_NET) += vhost_net.o
  vhost_net-y := vhost.o net.o
 +
 +obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
 diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
 new file mode 100644
 index 000..6e8fc4d
 --- /dev/null
 +++ b/drivers/vhost/mpassthru.c
 @@ -0,0 +1,1162 @@
 +/*
 + *  MPASSTHRU - Mediate passthrough device.
 + *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
 + *
 + *  This program is free software; you can redistribute it and/or modify
 + *  it under the terms of the GNU General Public License as published by
 + *  the Free Software Foundation; either version 2 of the License, or
 + *  (at your option) any later version.
 + *
 + *  This program is distributed in the hope that it will be useful,
 + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 + *  GNU General Public License for more details.
 + *
 + */
 +
 +#define DRV_NAMEmpassthru
 +#define DRV_DESCRIPTION Mediate passthru device driver
 +#define DRV_COPYRIGHT   (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
 +
 +#include linux/module.h
 +#include linux/errno.h
 +#include linux/kernel.h
 +#include linux/major.h
 +#include linux/slab.h
 +#include linux/smp_lock.h
 +#include linux/poll.h
 +#include linux/fcntl.h
 +#include linux/init.h
 +#include linux/aio.h
 +
 +#include linux/skbuff.h
 +#include linux/netdevice.h
 +#include linux/etherdevice.h
 +#include linux/miscdevice.h
 +#include linux/ethtool.h
 +#include linux/rtnetlink.h
 +#include linux/if.h
 +#include linux/if_arp.h
 +#include linux/if_ether.h
 +#include linux/crc32.h
 +#include linux/nsproxy.h
 +#include linux/uaccess.h
 +#include linux/virtio_net.h
 +#include linux/mpassthru.h
 +#include net/net_namespace.h
 +#include net/netns/generic.h
 +#include net/rtnetlink.h
 +#include net/sock.h
 +
 +#include asm/system.h
 +
 +#include vhost.h
 +
 +/* Uncomment to enable debugging */
 +/* #define MPASSTHRU_DEBUG 1 */
 +
 +#ifdef MPASSTHRU_DEBUG
 +static int debug;
 +
 +#define DBG  if (mp-debug) printk
 +#define DBG1 if (debug == 2) printk
 +#else
 +#define DBG(a...)
 +#define DBG1(a...)
 +#endif
 +
 +#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
 +#define COPY_HDR_LEN   (L1_CACHE_BYTES  64 ? 64 : L1_CACHE_BYTES)
 +
 +struct frag {
 + u16 offset;
 + u16 size;
 +};
 +
 +struct page_ctor {
 +   

[PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-02-10 Thread Xin Xiaohui
Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui xiaohui@intel.com
Signed-off-by: Zhao Yu yzha...@gmail.com
Sigend-off-by: Jeff Dike jd...@c2.user-mode-linux.org
---
 drivers/vhost/Kconfig  |5 +
 drivers/vhost/Makefile |2 +
 drivers/vhost/mpassthru.c  | 1178 
 include/linux/miscdevice.h |1 +
 include/linux/mpassthru.h  |   17 +
 5 files changed, 1203 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..ee32a3b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,8 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config VHOST_PASSTHRU
+   tristate Zerocopy network driver (EXPERIMENTAL)
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..3f79c79 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..d8d153f
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1178 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAMEmpassthru
+#define DRV_DESCRIPTION Mediate passthru device driver
+#define DRV_COPYRIGHT   (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+
+#include linux/module.h
+#include linux/errno.h
+#include linux/kernel.h
+#include linux/major.h
+#include linux/slab.h
+#include linux/smp_lock.h
+#include linux/poll.h
+#include linux/fcntl.h
+#include linux/init.h
+#include linux/skbuff.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/miscdevice.h
+#include linux/ethtool.h
+#include linux/rtnetlink.h
+#include linux/if.h
+#include linux/if_arp.h
+#include linux/if_ether.h
+#include linux/crc32.h
+#include linux/nsproxy.h
+#include linux/uaccess.h
+#include linux/virtio_net.h
+#include linux/mpassthru.h
+#include net/net_namespace.h
+#include net/netns/generic.h
+#include net/rtnetlink.h
+#include net/sock.h
+
+#include asm/system.h
+
+#include vhost.h
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp-debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES  64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   atomic_trefcnt;
+   struct kmem_cache   *cache;
+   struct net_device   *dev;
+   struct netdev_page_ctor ctor;
+   void*sendctrl;
+   void*recvctrl;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* send/recv in the user space buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to skb, to indicate
+* it's a user space allocated skb or kernel
+*/
+   struct skb_user_pageuser;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* It's meaningful for receive, means
+* the max length allowed
+*/
+   size_t  

Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-02-10 Thread Eric Dumazet
Le mercredi 10 février 2010 à 19:48 +0800, Xin Xiaohui a écrit :
 Add a device to utilize the vhost-net backend driver for
 copy-less data transfer between guest FE and host NIC.
 It pins the guest user space to the host memory and
 provides proto_ops as sendmsg/recvmsg to vhost-net.
 
 Signed-off-by: Xin Xiaohui xiaohui@intel.com
 Signed-off-by: Zhao Yu yzha...@gmail.com
 Sigend-off-by: Jeff Dike jd...@c2.user-mode-linux.org


 +static int page_ctor_attach(struct mp_struct *mp)
 +{
 + int rc;
 + struct page_ctor *ctor;
 + struct net_device *dev = mp-dev;
 +
 + rcu_read_lock();
 + if (rcu_dereference(mp-ctor)) {
 + rcu_read_unlock();
 + return -EBUSY;
 + }
 + rcu_read_unlock();

Strange read locking here, for an obvious writer role.
What do you really want to do ?
If writer are serialized by mp_mutex, you dont need this
recu_read_lock()/rcu_read_unlock() stuff.

 +
 + ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
 + if (!ctor)
 + return -ENOMEM;
 + rc = netdev_page_ctor_prep(dev, ctor-ctor);
 + if (rc)
 + goto fail;
 +
 + ctor-cache = kmem_cache_create(skb_page_info,
 + sizeof(struct page_info), 0,
 + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);


SLAB_PANIC here means : crash whole system in case of error.
This is not what you want in a driver.

 +
 + if (!ctor-cache)
 + goto cache_fail;
 +
 + INIT_LIST_HEAD(ctor-readq);
 + spin_lock_init(ctor-read_lock);
 +
 + ctor-w_len = 0;
 + ctor-r_len = 0;
 +
 + dev_hold(dev);
 + ctor-dev = dev;
 + ctor-ctor.ctor = page_ctor;
 + ctor-ctor.sock = mp-socket;
 + atomic_set(ctor-refcnt, 1);
 +
 + rc = netdev_page_ctor_attach(dev, ctor-ctor);
 + if (rc)
 + goto fail;
 +
 + /* locked by mp_mutex */
 + rcu_assign_pointer(mp-ctor, ctor);
 +
 + /* XXX:Need we do set_offload here ? */
 +
 + return 0;
 +
 +fail:
 + kmem_cache_destroy(ctor-cache);
 +cache_fail:
 + kfree(ctor);
 + dev_put(dev);
 +
 + return rc;
 +}
 +
 +
 +static inline void get_page_ctor(struct page_ctor *ctor)
 +{
 +   atomic_inc(ctor-refcnt);
 +}
 +
 +static inline void put_page_ctor(struct page_ctor *ctor)
 +{
 + if (atomic_dec_and_test(ctor-refcnt))
 + kfree(ctor);

Are you sure a RCU grace period is not needed before freeing ?


 +
 +static int page_ctor_detach(struct mp_struct *mp)
 +{
 + struct page_ctor *ctor;
 + struct page_info *info;
 + int i;
 +
 + rcu_read_lock();
 + ctor = rcu_dereference(mp-ctor);
 + rcu_read_unlock();

Strange locking again here


 +
 + if (!ctor)
 + return -ENODEV;
 +
 + while ((info = info_dequeue(ctor))) {
 + for (i = 0; i  info-pnum; i++)
 + if (info-pages[i])
 + put_page(info-pages[i]);
 + kmem_cache_free(ctor-cache, info);
 + }
 + kmem_cache_destroy(ctor-cache);
 + netdev_page_ctor_detach(ctor-dev);
 + dev_put(ctor-dev);
 +
 + /* locked by mp_mutex */
 + rcu_assign_pointer(mp-ctor, NULL);
 + synchronize_rcu();
 +
 + put_page_ctor(ctor);
 +
 + return 0;
 +}
 +
 +/* For small user space buffers transmit, we don't need to call
 + * get_user_pages().
 + */
 +static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
 + int total)
 +{
 + struct page_info *info = kmem_cache_alloc(ctor-cache, GFP_KERNEL);
kmem_cache_zalloc() ?
 +
 + if (!info)
 + return NULL;
 + memset(info, 0, sizeof(struct page_info));
 + memset(info-pages, 0, sizeof(info-pages));

redundant memset() whole structure already cleared one line above

 +
 + info-header = 0;
already cleared
 + info-total = total;
 + info-skb = NULL;
already cleared 
 
 + info-user.dtor = page_dtor;
 + info-ctor = ctor;
 + info-flags = INFO_WRITE;
 + info-pnum = 0;
already cleared 
 
 + return info;
 +}
 +
 +/* The main function to transform the guest user space address
 + * to host kernel address via get_user_pages(). Thus the hardware
 + * can do DMA directly to the user space address.
 + */
 +static struct page_info *alloc_page_info(struct page_ctor *ctor,
 + struct iovec *iov, int count, struct frag *frags,
 + int npages, int total)
 +{
 + int rc;
 + int i, j, n = 0;
 + int len;
 + unsigned long base;
 + struct page_info *info = kmem_cache_alloc(ctor-cache, GFP_KERNEL);
kmem_cache_zalloc() ? 
 
 +
 + if (!info)
 + return NULL;
 + memset(info, 0, sizeof(struct page_info));
kmem_cache_zalloc() ?
 + memset(info-pages, 0, sizeof(info-pages));
already cleared 
 
 +
 + down_read(current-mm-mmap_sem);
 + for (i = j = 0; i  count; i++) {
 + base = (unsigned long)iov[i].iov_base;
 + len = iov[i].iov_len;
 +
 + if (!len)
 +  

RE: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-02-10 Thread Xin, Xiaohui
Eric,
Thanks. I will look into that. But don't stop there. 
Please comments more. :-)

Thanks
Xiaohui

-Original Message-
From: Eric Dumazet [mailto:eric.duma...@gmail.com] 
Sent: Wednesday, February 10, 2010 11:18 PM
To: Xin, Xiaohui
Cc: net...@vger.kernel.org; kvm@vger.kernel.org; linux-ker...@vger.kernel.org; 
mi...@elte.hu; m...@redhat.com; jd...@c2.user-mode-linux.org; Zhao Yu
Subject: Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.

Le mercredi 10 février 2010 à 19:48 +0800, Xin Xiaohui a écrit :
 Add a device to utilize the vhost-net backend driver for
 copy-less data transfer between guest FE and host NIC.
 It pins the guest user space to the host memory and
 provides proto_ops as sendmsg/recvmsg to vhost-net.
 
 Signed-off-by: Xin Xiaohui xiaohui@intel.com
 Signed-off-by: Zhao Yu yzha...@gmail.com
 Sigend-off-by: Jeff Dike jd...@c2.user-mode-linux.org


 +static int page_ctor_attach(struct mp_struct *mp)
 +{
 + int rc;
 + struct page_ctor *ctor;
 + struct net_device *dev = mp-dev;
 +
 + rcu_read_lock();
 + if (rcu_dereference(mp-ctor)) {
 + rcu_read_unlock();
 + return -EBUSY;
 + }
 + rcu_read_unlock();

Strange read locking here, for an obvious writer role.
What do you really want to do ?
If writer are serialized by mp_mutex, you dont need this
recu_read_lock()/rcu_read_unlock() stuff.

 +
 + ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
 + if (!ctor)
 + return -ENOMEM;
 + rc = netdev_page_ctor_prep(dev, ctor-ctor);
 + if (rc)
 + goto fail;
 +
 + ctor-cache = kmem_cache_create(skb_page_info,
 + sizeof(struct page_info), 0,
 + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);


SLAB_PANIC here means : crash whole system in case of error.
This is not what you want in a driver.

 +
 + if (!ctor-cache)
 + goto cache_fail;
 +
 + INIT_LIST_HEAD(ctor-readq);
 + spin_lock_init(ctor-read_lock);
 +
 + ctor-w_len = 0;
 + ctor-r_len = 0;
 +
 + dev_hold(dev);
 + ctor-dev = dev;
 + ctor-ctor.ctor = page_ctor;
 + ctor-ctor.sock = mp-socket;
 + atomic_set(ctor-refcnt, 1);
 +
 + rc = netdev_page_ctor_attach(dev, ctor-ctor);
 + if (rc)
 + goto fail;
 +
 + /* locked by mp_mutex */
 + rcu_assign_pointer(mp-ctor, ctor);
 +
 + /* XXX:Need we do set_offload here ? */
 +
 + return 0;
 +
 +fail:
 + kmem_cache_destroy(ctor-cache);
 +cache_fail:
 + kfree(ctor);
 + dev_put(dev);
 +
 + return rc;
 +}
 +
 +
 +static inline void get_page_ctor(struct page_ctor *ctor)
 +{
 +   atomic_inc(ctor-refcnt);
 +}
 +
 +static inline void put_page_ctor(struct page_ctor *ctor)
 +{
 + if (atomic_dec_and_test(ctor-refcnt))
 + kfree(ctor);

Are you sure a RCU grace period is not needed before freeing ?


 +
 +static int page_ctor_detach(struct mp_struct *mp)
 +{
 + struct page_ctor *ctor;
 + struct page_info *info;
 + int i;
 +
 + rcu_read_lock();
 + ctor = rcu_dereference(mp-ctor);
 + rcu_read_unlock();

Strange locking again here


 +
 + if (!ctor)
 + return -ENODEV;
 +
 + while ((info = info_dequeue(ctor))) {
 + for (i = 0; i  info-pnum; i++)
 + if (info-pages[i])
 + put_page(info-pages[i]);
 + kmem_cache_free(ctor-cache, info);
 + }
 + kmem_cache_destroy(ctor-cache);
 + netdev_page_ctor_detach(ctor-dev);
 + dev_put(ctor-dev);
 +
 + /* locked by mp_mutex */
 + rcu_assign_pointer(mp-ctor, NULL);
 + synchronize_rcu();
 +
 + put_page_ctor(ctor);
 +
 + return 0;
 +}
 +
 +/* For small user space buffers transmit, we don't need to call
 + * get_user_pages().
 + */
 +static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
 + int total)
 +{
 + struct page_info *info = kmem_cache_alloc(ctor-cache, GFP_KERNEL);
kmem_cache_zalloc() ?
 +
 + if (!info)
 + return NULL;
 + memset(info, 0, sizeof(struct page_info));
 + memset(info-pages, 0, sizeof(info-pages));

redundant memset() whole structure already cleared one line above

 +
 + info-header = 0;
already cleared
 + info-total = total;
 + info-skb = NULL;
already cleared 
 
 + info-user.dtor = page_dtor;
 + info-ctor = ctor;
 + info-flags = INFO_WRITE;
 + info-pnum = 0;
already cleared 
 
 + return info;
 +}
 +
 +/* The main function to transform the guest user space address
 + * to host kernel address via get_user_pages(). Thus the hardware
 + * can do DMA directly to the user space address.
 + */
 +static struct page_info *alloc_page_info(struct page_ctor *ctor,
 + struct iovec *iov, int count, struct frag *frags,
 + int npages, int total)
 +{
 + int rc;
 + int i, j, n = 0;
 + int len;
 + unsigned long base;
 + struct page_info *info =