[PATCHv7 0/4] virtio_console: Add rproc_serial driver

2012-10-15 Thread sjur . brandeland
From: Sjur Brændeland sjur.brandel...@stericsson.com

This patch-set introduces a new virtio type rproc_serial for communicating
with remote processors over shared memory. The driver depends on the
remoteproc framework. As preparation for introducing rproc_serial
I've done a refactoring of the transmit buffer handling.

This patch-set is a rework of the patch-set from Sept 25th; hopefully all
review comments have been addressed.

The first patch is a bugfix and might be applicable for 3.7.

Thanks,
Sjur

Sjur Brændeland (4):
  virtio_console: Free buffer if splice fails
  virtio_console: Use kmalloc instead of kzalloc
  virtio_console: Merge struct buffer_token into struct port_buffer
  virtio_console: Add support for remoteproc serial

 drivers/char/virtio_console.c |  328 +
 include/linux/virtio_ids.h|1 +
 2 files changed, 234 insertions(+), 95 deletions(-)

-- 
1.7.5.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[PATCHv7 1/4] virtio_console: Free buffer if splice fails

2012-10-15 Thread sjur . brandeland
From: Sjur Brændeland sjur.brandel...@stericsson.com

Free the allocated scatter list if send_pages fails in function
port_splice_write.

Signed-off-by: Sjur Brændeland sjur.brandel...@stericsson.com
---
 drivers/char/virtio_console.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 8ab9c3d..c36b2f6 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -879,6 +879,8 @@ static ssize_t port_fops_splice_write(struct 
pipe_inode_info *pipe,
if (likely(ret  0))
ret = send_pages(port, sgl.sg, sgl.n, sgl.len, true);
 
+   if (unlikely(ret = 0))
+   kfree(sgl.sg);
return ret;
 }
 
-- 
1.7.5.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[PATCHv7 2/4] virtio_console: Use kmalloc instead of kzalloc

2012-10-15 Thread sjur . brandeland
From: Sjur Brændeland sjur.brandel...@stericsson.com

Avoid the more cpu expensive kzalloc when allocating buffers.
Originally kzalloc was intended for isolating the guest from
the host by not sending random guest data to the host. But device
isolation is not yet in place so kzalloc is not really needed.

Signed-off-by: Sjur Brændeland sjur.brandel...@stericsson.com
---
 drivers/char/virtio_console.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index c36b2f6..301d17e 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -349,7 +349,7 @@ static struct port_buffer *alloc_buf(size_t buf_size)
buf = kmalloc(sizeof(*buf), GFP_KERNEL);
if (!buf)
goto fail;
-   buf-buf = kzalloc(buf_size, GFP_KERNEL);
+   buf-buf = kmalloc(buf_size, GFP_KERNEL);
if (!buf-buf)
goto free_buf;
buf-len = 0;
-- 
1.7.5.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[PATCHv7 3/4] virtio_console: Merge struct buffer_token into struct port_buffer

2012-10-15 Thread sjur . brandeland
From: Sjur Brændeland sjur.brandel...@stericsson.com

Refactoring the splice functionality by unifying the approach for
sending scatter-lists and regular buffers. This simplifies
buffer handling and reduces code size. Splice will now allocate
a port_buffer and send_buf() and free_buf() can always be used
for any buffer.

Signed-off-by: Sjur Brændeland sjur.brandel...@stericsson.com
---
 drivers/char/virtio_console.c |  131 +
 1 files changed, 55 insertions(+), 76 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 301d17e..917cc830 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -111,6 +111,12 @@ struct port_buffer {
size_t len;
/* offset in the buf from which to consume data */
size_t offset;
+
+   /* If sgpages == 0 then buf is used */
+   unsigned int sgpages;
+
+   /* sg is used if spages  0. sg must be the last in is struct */
+   struct scatterlist sg[0];
 };
 
 /*
@@ -338,17 +344,39 @@ static inline bool use_multiport(struct ports_device 
*portdev)
 
 static void free_buf(struct port_buffer *buf)
 {
+   unsigned int i;
+
kfree(buf-buf);
+   for (i = 0; i  buf-sgpages; i++) {
+   struct page *page = sg_page(buf-sg[i]);
+   if (!page)
+   break;
+   put_page(page);
+   }
+
kfree(buf);
 }
 
-static struct port_buffer *alloc_buf(size_t buf_size)
+static struct port_buffer *alloc_buf(struct virtqueue *vq, size_t buf_size,
+int pages)
 {
struct port_buffer *buf;
 
-   buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+   /*
+* Allocate buffer and the sg list. The sg list array is allocated
+* directly after the port_buffer struct.
+*/
+   buf = kmalloc(sizeof(*buf) + sizeof(struct scatterlist) * pages,
+ GFP_KERNEL);
if (!buf)
goto fail;
+
+   buf-sgpages = pages;
+   if (pages  0) {
+   buf-buf = NULL;
+   return buf;
+   }
+
buf-buf = kmalloc(buf_size, GFP_KERNEL);
if (!buf-buf)
goto free_buf;
@@ -476,52 +504,26 @@ static ssize_t send_control_msg(struct port *port, 
unsigned int event,
return 0;
 }
 
-struct buffer_token {
-   union {
-   void *buf;
-   struct scatterlist *sg;
-   } u;
-   /* If sgpages == 0 then buf is used, else sg is used */
-   unsigned int sgpages;
-};
-
-static void reclaim_sg_pages(struct scatterlist *sg, unsigned int nrpages)
-{
-   int i;
-   struct page *page;
-
-   for (i = 0; i  nrpages; i++) {
-   page = sg_page(sg[i]);
-   if (!page)
-   break;
-   put_page(page);
-   }
-   kfree(sg);
-}
 
 /* Callers must take the port-outvq_lock */
 static void reclaim_consumed_buffers(struct port *port)
 {
-   struct buffer_token *tok;
+   struct port_buffer *buf;
unsigned int len;
 
if (!port-portdev) {
/* Device has been unplugged.  vqs are already gone. */
return;
}
-   while ((tok = virtqueue_get_buf(port-out_vq, len))) {
-   if (tok-sgpages)
-   reclaim_sg_pages(tok-u.sg, tok-sgpages);
-   else
-   kfree(tok-u.buf);
-   kfree(tok);
+   while ((buf = virtqueue_get_buf(port-out_vq, len))) {
+   free_buf(buf);
port-outvq_full = false;
}
 }
 
 static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
  int nents, size_t in_count,
- struct buffer_token *tok, bool nonblock)
+ void *data, bool nonblock)
 {
struct virtqueue *out_vq;
ssize_t ret;
@@ -534,7 +536,7 @@ static ssize_t __send_to_port(struct port *port, struct 
scatterlist *sg,
 
reclaim_consumed_buffers(port);
 
-   ret = virtqueue_add_buf(out_vq, sg, nents, 0, tok, GFP_ATOMIC);
+   ret = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
 
/* Tell Host to go! */
virtqueue_kick(out_vq);
@@ -572,37 +574,6 @@ done:
return in_count;
 }
 
-static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
-   bool nonblock)
-{
-   struct scatterlist sg[1];
-   struct buffer_token *tok;
-
-   tok = kmalloc(sizeof(*tok), GFP_ATOMIC);
-   if (!tok)
-   return -ENOMEM;
-   tok-sgpages = 0;
-   tok-u.buf = in_buf;
-
-   sg_init_one(sg, in_buf, in_count);
-
-   return __send_to_port(port, sg, 1, in_count, tok, nonblock);
-}
-
-static ssize_t send_pages(struct port *port, struct scatterlist *sg, int nents,
- size_t in_count, bool nonblock)
-{
-   struct 

[PATCHv7 4/4] virtio_console: Add support for remoteproc serial

2012-10-15 Thread sjur . brandeland
From: Sjur Brændeland sjur.brandel...@stericsson.com

Add a simple serial connection driver called
VIRTIO_ID_RPROC_SERIAL (11) for communicating with a
remote processor in an asymmetric multi-processing
configuration.

This implementation reuses the existing virtio_console
implementation, and adds support for DMA allocation
of data buffers and disables use of tty console and
the virtio control queue.

Signed-off-by: Sjur Brændeland sjur.brandel...@stericsson.com
---
 drivers/char/virtio_console.c |  201 -
 include/linux/virtio_ids.h|1 +
 2 files changed, 180 insertions(+), 22 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 917cc830..eeb9b35 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -37,8 +37,12 @@
 #include linux/wait.h
 #include linux/workqueue.h
 #include linux/module.h
+#include linux/dma-mapping.h
+#include linux/kconfig.h
 #include ../tty/hvc/hvc_console.h
 
+#define is_rproc_enabled IS_ENABLED(CONFIG_REMOTEPROC)
+
 /*
  * This is a global struct for storing common data for all the devices
  * this driver handles.
@@ -112,6 +116,15 @@ struct port_buffer {
/* offset in the buf from which to consume data */
size_t offset;
 
+   /* DMA address of buffer */
+   dma_addr_t dma;
+
+   /* Device we got DMA memory from */
+   struct device *dev;
+
+   /* List of pending dma buffers to free */
+   struct list_head list;
+
/* If sgpages == 0 then buf is used */
unsigned int sgpages;
 
@@ -331,6 +344,11 @@ static bool is_console_port(struct port *port)
return false;
 }
 
+static bool is_rproc_serial(const struct virtio_device *vdev)
+{
+   return is_rproc_enabled  vdev-id.device == VIRTIO_ID_RPROC_SERIAL;
+}
+
 static inline bool use_multiport(struct ports_device *portdev)
 {
/*
@@ -342,11 +360,13 @@ static inline bool use_multiport(struct ports_device 
*portdev)
return portdev-vdev-features[0]  (1  VIRTIO_CONSOLE_F_MULTIPORT);
 }
 
-static void free_buf(struct port_buffer *buf)
+static DEFINE_SPINLOCK(dma_bufs_lock);
+static LIST_HEAD(pending_free_dma_bufs);
+
+static void free_buf(struct port_buffer *buf, bool can_sleep)
 {
unsigned int i;
 
-   kfree(buf-buf);
for (i = 0; i  buf-sgpages; i++) {
struct page *page = sg_page(buf-sg[i]);
if (!page)
@@ -354,14 +374,58 @@ static void free_buf(struct port_buffer *buf)
put_page(page);
}
 
+   if (!buf-dev) {
+   kfree(buf-buf);
+   } else if (is_rproc_enabled) {
+   unsigned long flags;
+
+   /* dma_free_coherent requires interrupts to be enabled. */
+   if (!can_sleep) {
+   /* queue up dma-buffers to be freed later */
+   spin_lock_irqsave(dma_bufs_lock, flags);
+   list_add_tail(buf-list, pending_free_dma_bufs);
+   spin_unlock_irqrestore(dma_bufs_lock, flags);
+   return;
+   }
+   dma_free_coherent(buf-dev, buf-size, buf-buf, buf-dma);
+
+   /* Release device refcnt and allow it to be freed */
+   put_device(buf-dev);
+   }
+
kfree(buf);
 }
 
+static void reclaim_dma_bufs(void)
+{
+   unsigned long flags;
+   struct port_buffer *buf, *tmp;
+   LIST_HEAD(tmp_list);
+
+   if (list_empty(pending_free_dma_bufs))
+   return;
+
+   /* Create a copy of the pending_free_dma_bufs while holding the lock */
+   spin_lock_irqsave(dma_bufs_lock, flags);
+   list_cut_position(tmp_list, pending_free_dma_bufs,
+ pending_free_dma_bufs.prev);
+   spin_unlock_irqrestore(dma_bufs_lock, flags);
+
+   /* Release the dma buffers, without irqs enabled */
+   list_for_each_entry_safe(buf, tmp, tmp_list, list) {
+   list_del(buf-list);
+   free_buf(buf, true);
+   }
+}
+
 static struct port_buffer *alloc_buf(struct virtqueue *vq, size_t buf_size,
 int pages)
 {
struct port_buffer *buf;
 
+   if (is_rproc_serial(vq-vdev))
+   reclaim_dma_bufs();
+
/*
 * Allocate buffer and the sg list. The sg list array is allocated
 * directly after the port_buffer struct.
@@ -373,11 +437,34 @@ static struct port_buffer *alloc_buf(struct virtqueue 
*vq, size_t buf_size,
 
buf-sgpages = pages;
if (pages  0) {
+   buf-dev = NULL;
buf-buf = NULL;
return buf;
}
 
-   buf-buf = kmalloc(buf_size, GFP_KERNEL);
+   if (is_rproc_serial(vq-vdev)) {
+   /*
+* Allocate DMA memory from ancestor. When a virtio
+* device is created by remoteproc, the DMA memory is
+* associated with the grandparent 

[PATCH 1/1] vhost-blk: Add vhost-blk support v4

2012-10-15 Thread Asias He
vhost-blk is an in-kernel virtio-blk device accelerator.

Due to the lack of a proper in-kernel AIO interface, this version converts
the guest's I/O requests to bios and uses submit_bio() to submit I/O directly.
So this version only supports raw block devices as the guest's disk image,
e.g. /dev/sda, /dev/ram0. We can add file based image support to
vhost-blk once we have an in-kernel AIO interface. There is some work in
progress on an in-kernel AIO interface from Dave Kleikamp and Zach Brown:

   http://marc.info/?l=linux-fsdevel&m=133312234313122

Performance evaluation:
-
1) LKVM
Fio with libaio ioengine on Fusion IO device using kvm tool
IOPS   Before   After   Improvement
seq-read   107  121 +13.0%
seq-write  130  179 +37.6%
rnd-read   102  122 +19.6%
rnd-write  125  159 +27.0%

2) QEMU
Fio with libaio ioengine on Fusion IO device using QEMU
IOPS   Before   After   Improvement
seq-read   76   123 +61.8%
seq-write  139  173 +24.4%
rnd-read   73   120 +64.3%
rnd-write  75   156 +108.0%

Userspace bits:
-
1) LKVM
The latest vhost-blk userspace bits for kvm tool can be found here:
g...@github.com:asias/linux-kvm.git blk.vhost-blk

2) QEMU
The latest vhost-blk userspace prototype for QEMU can be found here:
g...@github.com:asias/qemu.git blk.vhost-blk

Changes in v4:
- Mark req-status as userspace pointer
- Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status()
- Add if (need_resched()) schedule() in blk thread
- Kill vhost_blk_stop_vq() and move it into vhost_blk_stop()
- Use vq_err() instead of pr_warn()
- Fail on unsupported request
- Add flush in vhost_blk_set_features()

Changes in v3:
- Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
- Check file passed by user is a raw block device file

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/Kconfig |   1 +
 drivers/vhost/Kconfig.blk |  10 +
 drivers/vhost/Makefile|   2 +
 drivers/vhost/blk.c   | 677 ++
 drivers/vhost/blk.h   |   8 +
 5 files changed, 698 insertions(+)
 create mode 100644 drivers/vhost/Kconfig.blk
 create mode 100644 drivers/vhost/blk.c
 create mode 100644 drivers/vhost/blk.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 202bba6..acd8038 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -11,4 +11,5 @@ config VHOST_NET
 
 if STAGING
 source drivers/vhost/Kconfig.tcm
+source drivers/vhost/Kconfig.blk
 endif
diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
new file mode 100644
index 000..ff8ab76
--- /dev/null
+++ b/drivers/vhost/Kconfig.blk
@@ -0,0 +1,10 @@
+config VHOST_BLK
+   tristate Host kernel accelerator for virtio blk (EXPERIMENTAL)
+   depends on BLOCK   EXPERIMENTAL  m
+   ---help---
+ This kernel module can be loaded in host kernel to accelerate
+ guest block with virtio_blk. Not to be confused with virtio_blk
+ module itself which needs to be loaded in guest kernel.
+
+ To compile this driver as a module, choose M here: the module will
+ be called vhost_blk.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index a27b053..1a8a4a5 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
 
 obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 000..5c2b790
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,677 @@
+/*
+ * Copyright (C) 2011 Taobao, Inc.
+ * Author: Liu Yuan tailai...@taobao.com
+ *
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Author: Asias He as...@redhat.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * virtio-blk server in host kernel.
+ */
+
+#include linux/miscdevice.h
+#include linux/module.h
+#include linux/vhost.h
+#include linux/virtio_blk.h
+#include linux/mutex.h
+#include linux/file.h
+#include linux/kthread.h
+#include linux/blkdev.h
+
+#include vhost.c
+#include vhost.h
+#include blk.h
+
+/* The block header is in the first and separate buffer. */
+#define BLK_HDR0
+
+static DEFINE_IDA(vhost_blk_index_ida);
+
+enum {
+   VHOST_BLK_VQ_REQ = 0,
+   VHOST_BLK_VQ_MAX = 1,
+};
+
+struct req_page_list {
+   struct page **pages;
+   int pages_nr;
+};
+
+struct vhost_blk_req {
+   struct llist_node llnode;
+   struct req_page_list *pl;
+   struct vhost_blk *blk;
+
+   struct iovec *iov;
+   int iov_nr;
+
+   struct bio **bio;
+   atomic_t bio_nr;
+
+   sector_t sector;
+   int write;
+   u16 head;
+   long len;
+
+   u8 __user *status;
+};
+
+struct vhost_blk {
+   struct task_struct *host_kick;
+   struct 

Re: memory corruption in HYPERVISOR_physdev_op()

2012-10-15 Thread Ian Campbell
On Fri, 2012-09-14 at 14:24 +0300, Dan Carpenter wrote:
 Hi Jeremy,

Jeremy doesn't work on Xen much any more. Adding Konrad and the
xen-devel@ list.

 My static analyzer complains about potential memory corruption in
 HYPERVISOR_physdev_op()
 
 arch/x86/include/asm/xen/hypercall.h
389  static inline int
390  HYPERVISOR_physdev_op(int cmd, void *arg)
391  {
392  int rc = _hypercall2(int, physdev_op, cmd, arg);
393  if (unlikely(rc == -ENOSYS)) {
394  struct physdev_op op;
395  op.cmd = cmd;
396  memcpy(op.u, arg, sizeof(op.u));
397  rc = _hypercall1(int, physdev_op_compat, op);
398  memcpy(arg, op.u, sizeof(op.u));
 ^
 Some of the arg buffers are not as large as sizeof(op.u) which is either
 12 or 16 depending on the size of longs in struct physdev_apic.

Nasty!

 
399  }
400  return rc;
401  }
 
 One example of this is in xen_initdom_restore_msi_irqs().
 
 arch/x86/pci/xen.c
337  struct physdev_pci_device restore_ext;
338  
339  restore_ext.seg = pci_domain_nr(dev-bus);
340  restore_ext.bus = dev-bus-number;
341  restore_ext.devfn = dev-devfn;
342  ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi_ext,
343  restore_ext);
 
 There are only 4 bytes here.
 
344  if (ret == -ENOSYS)
 ^^
 If we hit this condition, we have corrupted some memory.

I can see the memory corruption but how does it relate to ret ==
-ENOSYS?

 
345  pci_seg_supported = false;
 
 regards,
 dan carpenter
 ___
 Virtualization mailing list
 Virtualization@lists.linux-foundation.org
 https://lists.linuxfoundation.org/mailman/listinfo/virtualization
 

-- 
Ian Campbell
Current Noise: Therapy? - Femtex

Riffle West Virginia is so small that the Boy Scout had to double as the
town drunk.

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: potential integer overflow in xenbus_file_write()

2012-10-15 Thread Ian Campbell
On Thu, 2012-09-13 at 19:00 +0300, Dan Carpenter wrote:
 Hi,

Thanks Dan. I'm not sure anyone from Xen-land really monitors
virtualization@. Adding xen-devel and Konrad.

 
 I was reading some code and had a question in xenbus_file_write()
 
 drivers/xen/xenbus/xenbus_dev_frontend.c
461  if ((len + u-len)  sizeof(u-u.buffer)) {
  
 Can this addition overflow?

len is a size_t and u-len is an unsigned int, so I expect so.

   Should the test be something like:
 
   if (len  sizeof(u-u.buffer) || len + u-len  sizeof(u-u.buffer)) {

I think that would do it.

Ian.

462  /* On error, dump existing buffer */
463  u-len = 0;
464  rc = -EINVAL;
465  goto out;
466  }
467  
468  ret = copy_from_user(u-u.buffer + u-len, ubuf, len);
469  
 
 regards,
 dan carpenter
 ___
 Virtualization mailing list
 Virtualization@lists.linux-foundation.org
 https://lists.linuxfoundation.org/mailman/listinfo/virtualization
 


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [Xen-devel] memory corruption in HYPERVISOR_physdev_op()

2012-10-15 Thread Jan Beulich
 On 15.10.12 at 12:27, Ian Campbell ian.campb...@citrix.com wrote:
 On Fri, 2012-09-14 at 14:24 +0300, Dan Carpenter wrote:
 My static analyzer complains about potential memory corruption in
 HYPERVISOR_physdev_op()
 
 arch/x86/include/asm/xen/hypercall.h
389  static inline int
390  HYPERVISOR_physdev_op(int cmd, void *arg)
391  {
392  int rc = _hypercall2(int, physdev_op, cmd, arg);
393  if (unlikely(rc == -ENOSYS)) {
394  struct physdev_op op;
395  op.cmd = cmd;
396  memcpy(op.u, arg, sizeof(op.u));
397  rc = _hypercall1(int, physdev_op_compat, op);
398  memcpy(arg, op.u, sizeof(op.u));
 ^
 Some of the arg buffers are not as large as sizeof(op.u) which is either
 12 or 16 depending on the size of longs in struct physdev_apic.
 
 Nasty!

Wasn't it that pv-ops expects Xen 4.0.1 or newer anyway? If so,
what does this code exist for in the first place (it's framed by
#if CONFIG_XEN_COMPAT = 0x030002 in the Xenified kernel)?

399  }
400  return rc;
401  }
 
 One example of this is in xen_initdom_restore_msi_irqs().
 
 arch/x86/pci/xen.c
337  struct physdev_pci_device restore_ext;
338  
339  restore_ext.seg = pci_domain_nr(dev-bus);
340  restore_ext.bus = dev-bus-number;
341  restore_ext.devfn = dev-devfn;
342  ret = 
 HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi_ext,
343  restore_ext);
 
 There are only 4 bytes here.
 
344  if (ret == -ENOSYS)
 ^^
 If we hit this condition, we have corrupted some memory.
 
 I can see the memory corruption but how does it relate to ret ==
 -ENOSYS?

The (supposedly) corrupting code sits inside an

if (unlikely(rc == -ENOSYS)) {

Supposedly because as long as the argument passed to the
function is in memory accessed by the local CPU only and
doesn't overlap with storage used for rc (e.g. living in a
register), there's no corruption possible afaict - the second
memcpy() would just copy back what the first one obtained
from there.

Fixing this other than by removing the broken code would be
pretty hard I'm afraid (and I tend to leave the code untouched
altogether in the Xenified tree).

Jan

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [Xen-devel] memory corruption in HYPERVISOR_physdev_op()

2012-10-15 Thread Ian Campbell
On Mon, 2012-10-15 at 11:48 +0100, Jan Beulich wrote:
  On 15.10.12 at 12:27, Ian Campbell ian.campb...@citrix.com wrote:
  On Fri, 2012-09-14 at 14:24 +0300, Dan Carpenter wrote:
  My static analyzer complains about potential memory corruption in
  HYPERVISOR_physdev_op()
  
  arch/x86/include/asm/xen/hypercall.h
 389  static inline int
 390  HYPERVISOR_physdev_op(int cmd, void *arg)
 391  {
 392  int rc = _hypercall2(int, physdev_op, cmd, arg);
 393  if (unlikely(rc == -ENOSYS)) {
 394  struct physdev_op op;
 395  op.cmd = cmd;
 396  memcpy(op.u, arg, sizeof(op.u));
 397  rc = _hypercall1(int, physdev_op_compat, op);
 398  memcpy(arg, op.u, sizeof(op.u));
  ^
  Some of the arg buffers are not as large as sizeof(op.u) which is either
  12 or 16 depending on the size of longs in struct physdev_apic.
  
  Nasty!
 
 Wasn't it that pv-ops expects Xen 4.0.1 or newer anyway? If so,
 what does this code exist for in the first place (it's framed by
 #if CONFIG_XEN_COMPAT = 0x030002 in the Xenified kernel)?

I think the 4.0.1 or newer requirement is for dom0 only. I guess physdev
op is only used in dom0 though? Or does passthrough need it?

 
 399  }
 400  return rc;
 401  }
  
  One example of this is in xen_initdom_restore_msi_irqs().
  
  arch/x86/pci/xen.c
 337  struct physdev_pci_device restore_ext;
 338  
 339  restore_ext.seg = pci_domain_nr(dev-bus);
 340  restore_ext.bus = dev-bus-number;
 341  restore_ext.devfn = dev-devfn;
 342  ret = 
  HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi_ext,
 343  restore_ext);
  
  There are only 4 bytes here.
  
 344  if (ret == -ENOSYS)
  ^^
  If we hit this condition, we have corrupted some memory.
  
  I can see the memory corruption but how does it relate to ret ==
  -ENOSYS?
 
 The (supposedly) corrupting code site inside an
 
   if (unlikely(rc == -ENOSYS)) {

Ah, for some reason I assumed this was in the eventual caller, even
though it was staring me right in the face in the full quote.

 Supposedly because as long as the argument passed to the
 function is in memory accessed by the local CPU only and
 doesn't overlap with storage used for rc (e.g. living in a
 register), there's no corruption possible afaict - the second
 memcpy() would just copy back what the first one obtained
 from there.
 
 Fixing this other than by removing the broken code would be
 pretty hard I'm afraid (and I tend to leave the code untouched
 altogether in the Xenified tree).

Given that it is compat code, the list of subops which need to be supported
in this case is small and finite, so a simple lookup table or even a switch
stmt for the size might be an option.

Ian.

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [Xen-devel] memory corruption in HYPERVISOR_physdev_op()

2012-10-15 Thread Jan Beulich
 On 15.10.12 at 12:58, Ian Campbell ian.campb...@citrix.com wrote:
 On Mon, 2012-10-15 at 11:48 +0100, Jan Beulich wrote:
  On 15.10.12 at 12:27, Ian Campbell ian.campb...@citrix.com wrote:
  On Fri, 2012-09-14 at 14:24 +0300, Dan Carpenter wrote:
  My static analyzer complains about potential memory corruption in
  HYPERVISOR_physdev_op()
  
  arch/x86/include/asm/xen/hypercall.h
 389  static inline int
 390  HYPERVISOR_physdev_op(int cmd, void *arg)
 391  {
 392  int rc = _hypercall2(int, physdev_op, cmd, arg);
 393  if (unlikely(rc == -ENOSYS)) {
 394  struct physdev_op op;
 395  op.cmd = cmd;
 396  memcpy(op.u, arg, sizeof(op.u));
 397  rc = _hypercall1(int, physdev_op_compat, op);
 398  memcpy(arg, op.u, sizeof(op.u));
  ^
  Some of the arg buffers are not as large as sizeof(op.u) which is either
  12 or 16 depending on the size of longs in struct physdev_apic.
  
  Nasty!
 
 Wasn't it that pv-ops expects Xen 4.0.1 or newer anyway? If so,
 what does this code exist for in the first place (it's framed by
 #if CONFIG_XEN_COMPAT = 0x030002 in the Xenified kernel)?
 
 I think the 4.0.1 or newer requirement is for dom0 only. I guess physdev
 op is only used in dom0 though? Or does passthrough need it?

No, it's only platform_op that is Dom0-only.

  I can see the memory corruption but how does it relate to ret ==
  -ENOSYS?
 
 The (supposedly) corrupting code site inside an
 
  if (unlikely(rc == -ENOSYS)) {
 
 Ah, for some reason I assumed this was in the eventual caller, even
 though it was staring me right in the face in the full quote.

I think Dan's reference was to an eventual caller - it would see
the -ENOSYS, as the compat call wouldn't return anything else
than the modern one, and the modern one (to enter the code
in question) must have returned -ENOSYS.

 Fixing this other than by removing the broken code would be
 pretty hard I'm afraid (and I tend to leave the code untouched
 altogether in the Xenified tree).
 
 Given that it is compat code the list of subops which needs to supported
 in this case is small and finite so a simple lookup table or even switch
 stmt for the size might be an option.

Ugly, particularly for an inline function. But possible of course.

Jan

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] xen/xenbus: silence GCC warning

2012-10-15 Thread Konrad Rzeszutek Wilk
On Mon, Oct 15, 2012 at 12:03:09PM +0200, Paul Bolle wrote:
 Compiling xenbus_xs.o triggers this GCC warning:
 drivers/xen/xenbus/xenbus_xs.c:628:13: warning: function declaration 
 isn’t a prototype [-Wstrict-prototypes]
 
 Add the obvious and trivial fix.

I already got the fix for this in my tree. Thanks!

 
 While we're touching this function add some equally obvious and trivial
 whitespace fixes.
 
 Signed-off-by: Paul Bolle pebo...@tiscali.nl
 ---
 0) Triggered by compiling v3.7-rc1 using (basically) Fedora 17's current
 config. Compile tested only.
 
 1) Obligatory reference: https://lwn.net/Articles/487493/ .
 
  drivers/xen/xenbus/xenbus_xs.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)
 
 diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
 index 48220e1..7a2b0da 100644
 --- a/drivers/xen/xenbus/xenbus_xs.c
 +++ b/drivers/xen/xenbus/xenbus_xs.c
 @@ -619,13 +619,14 @@ static struct xenbus_watch *find_watch(const char 
 *token)
  
   return NULL;
  }
 +
  /*
   * Certain older XenBus toolstack cannot handle reading values that are
   * not populated. Some Xen 3.4 installation are incapable of doing this
   * so if we are running on anything older than 4 do not attempt to read
   * control/platform-feature-xs_reset_watches.
   */
 -static bool xen_strict_xenbus_quirk()
 +static bool xen_strict_xenbus_quirk(void)
  {
   uint32_t eax, ebx, ecx, edx, base;
  
 @@ -635,8 +636,8 @@ static bool xen_strict_xenbus_quirk()
   if ((eax  16)  4)
   return true;
   return false;
 -
  }
 +
  static void xs_reset_watches(void)
  {
   int err, supported = 0;
 -- 
 1.7.11.7
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[PATCH for-3.7] vhost: fix mergeable bufs on BE hosts

2012-10-15 Thread Michael S. Tsirkin
We copy head count to a 16 bit field,
this works by chance on LE but on BE
guest gets 0. Fix it up.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
Tested-by: Alexander Graf ag...@suse.de
Cc: sta...@kernel.org

---
 drivers/vhost/net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9ab6d47..2bb463c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -448,7 +448,8 @@ static void handle_rx(struct vhost_net *net)
.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
};
size_t total_len = 0;
-   int err, headcount, mergeable;
+   int err, mergeable;
+   s16 headcount;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
/* TODO: check that we are running from vhost_worker? */
-- 
MST
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH 00/10] VMCI for Linux upstreaming

2012-10-15 Thread George Zhang

* * *

In an effort to improve the out-of-the-box experience with Linux
kernels for VMware users, VMware is working on readying the Virtual
Machine Communication Interface (vmw_vmci) and VMCI Sockets
(vmw_vsock) kernel modules for inclusion in the Linux kernel. The
purpose of this post is to acquire feedback on the vmw_vmci kernel
module. The vmw_vsock kernel module will be presented in a later post.


* * *

VMCI allows virtual machines to communicate with host kernel modules
and the VMware hypervisors. User level applications both in a virtual
machine and on the host can use vmw_vmci through VMCI Sockets, a socket
address family designed to be compatible with UDP and TCP at the
interface level. Today, VMCI and VMCI Sockets are used by the VMware
shared folders (HGFS) and various VMware Tools components inside the
guest for zero-config, network-less access to VMware host services. In
addition to this, VMware's users are using VMCI Sockets for various
applications, where network access of the virtual machine is
restricted or non-existent. Examples of this are VMs communicating
with device proxies for proprietary hardware running as host
applications and automated testing of applications running within
virtual machines.

In a virtual machine, VMCI is exposed as a regular PCI device. The
primary communication mechanisms supported are a point-to-point
bidirectional transport based on a pair of memory-mapped queues, and
asynchronous notifications in the form of datagrams and
doorbells. These features are available to kernel level components
such as HGFS and VMCI Sockets through the VMCI kernel API. In addition
to this, the VMCI kernel API provides support for receiving events
related to the state of the VMCI communication channels, and the
virtual machine itself.

Outside the virtual machine, the host side support of the VMCI kernel
module makes the same VMCI kernel API available to VMCI endpoints on
the host. In addition to this, the host side manages each VMCI device
in a virtual machine through a context object. This context object
serves to identify the virtual machine for communication, and to track
the resource consumption of the given VMCI device. Both operations
related to communication between the virtual machine and the host
kernel, and those related to the management of the VMCI device state
in the host kernel, are invoked by the user level component of the
hypervisor through a set of ioctls on the VMCI device node.  To
provide seamless support for nested virtualization, where a virtual
machine may use both a VMCI PCI device to talk to its hypervisor, and
the VMCI host side support to run nested virtual machines, the VMCI
host and virtual machine support are combined in a single kernel
module.

For additional information about the use of VMCI and in particular
VMCI Sockets, please refer to the VMCI Socket Programming Guide
available at https://www.vmware.com/support/developer/vmci-sdk/.



---

George Zhang (10):
  VMCI: context implementation.
  VMCI: datagram implementation.
  VMCI: doorbell implementation.
  VMCI: device driver implementaton.
  VMCI: event handling implementation.
  VMCI: handle array implementation.
  VMCI: queue pairs implementation.
  VMCI: resource object implementation.
  VMCI: routing implementation.
  VMCI: Some header and config files.


 drivers/misc/Kconfig  |1 
 drivers/misc/Makefile |2 
 drivers/misc/vmw_vmci/Kconfig |   16 
 drivers/misc/vmw_vmci/Makefile|   41 
 drivers/misc/vmw_vmci/vmci_common_int.h   |   34 
 drivers/misc/vmw_vmci/vmci_context.c  | 1291 +++
 drivers/misc/vmw_vmci/vmci_context.h  |  177 +
 drivers/misc/vmw_vmci/vmci_datagram.c |  522 
 drivers/misc/vmw_vmci/vmci_datagram.h |   55 
 drivers/misc/vmw_vmci/vmci_doorbell.c |  674 +
 drivers/misc/vmw_vmci/vmci_doorbell.h |   53 
 drivers/misc/vmw_vmci/vmci_driver.c   | 2187 ++
 drivers/misc/vmw_vmci/vmci_driver.h   |   44 
 drivers/misc/vmw_vmci/vmci_event.c|  415 +++
 drivers/misc/vmw_vmci/vmci_event.h|   25 
 drivers/misc/vmw_vmci/vmci_handle_array.c |  162 +
 drivers/misc/vmw_vmci/vmci_handle_array.h |   46 
 drivers/misc/vmw_vmci/vmci_queue_pair.c   | 3556 +
 drivers/misc/vmw_vmci/vmci_queue_pair.h   |  191 ++
 drivers/misc/vmw_vmci/vmci_resource.c |  237 ++
 drivers/misc/vmw_vmci/vmci_resource.h |   59 
 drivers/misc/vmw_vmci/vmci_route.c|  237 ++
 drivers/misc/vmw_vmci/vmci_route.h|   30 
 include/linux/vmw_vmci_api.h  |   89 +
 include/linux/vmw_vmci_defs.h |  971 
 25 files changed, 5 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/Kconfig
 create mode 100644 drivers/misc/vmw_vmci/Makefile
 create mode 100644 drivers/misc/vmw_vmci/vmci_common_int.h
 create mode 100644 

[PATCH 02/10] VMCI: datagram implementation.

2012-10-15 Thread George Zhang
The VMCI datagram code implements datagrams to allow data to be sent between host and 
guest.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 drivers/misc/vmw_vmci/vmci_datagram.c |  522 +
 drivers/misc/vmw_vmci/vmci_datagram.h |   55 +++
 2 files changed, 577 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/vmci_datagram.c
 create mode 100644 drivers/misc/vmw_vmci/vmci_datagram.h

diff --git a/drivers/misc/vmw_vmci/vmci_datagram.c 
b/drivers/misc/vmw_vmci/vmci_datagram.c
new file mode 100644
index 000..ea9bb04
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_datagram.c
@@ -0,0 +1,522 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include linux/vmw_vmci_defs.h
+#include linux/vmw_vmci_api.h
+#include linux/module.h
+#include linux/sched.h
+#include linux/slab.h
+#include linux/bug.h
+
+#include vmci_common_int.h
+#include vmci_datagram.h
+#include vmci_resource.h
+#include vmci_context.h
+#include vmci_driver.h
+#include vmci_event.h
+#include vmci_route.h
+
+/*
+ * struct datagram_entry describes the datagram entity. It is used for datagram
+ * entities created only on the host.
+ */
+struct datagram_entry {
+   struct vmci_resource resource;
+   u32 flags;
+   bool run_delayed;
+   vmci_datagram_recv_cb recv_cb;
+   void *client_data;
+   u32 priv_flags;
+};
+
+struct delayed_datagram_info {
+   struct datagram_entry *entry;
+   struct vmci_datagram msg;
+   struct work_struct work;
+   bool in_dg_host_queue;
+};
+
+static atomic_t delayed_dg_host_queue_size;
+
+
+/*
+ * Create a datagram entry given a handle pointer.
+ */
+static int dg_create_handle(u32 resource_id,
+   u32 flags,
+   u32 priv_flags,
+   vmci_datagram_recv_cb recv_cb,
+   void *client_data, struct vmci_handle *out_handle)
+{
+   int result;
+   u32 context_id;
+   struct vmci_handle handle;
+   struct datagram_entry *entry;
+
+   ASSERT(recv_cb != NULL);
+   ASSERT(out_handle != NULL);
+   ASSERT(!(priv_flags  ~VMCI_PRIVILEGE_ALL_FLAGS));
+
+   if ((flags  VMCI_FLAG_WELLKNOWN_DG_HND) != 0)
+   return VMCI_ERROR_INVALID_ARGS;
+
+   if ((flags  VMCI_FLAG_ANYCID_DG_HND) != 0) {
+   context_id = VMCI_INVALID_ID;
+   } else {
+   context_id = vmci_get_context_id();
+   if (context_id == VMCI_INVALID_ID)
+   return VMCI_ERROR_NO_RESOURCES;
+   }
+
+   handle = vmci_make_handle(context_id, resource_id);
+
+   entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+   if (entry == NULL) {
+   pr_warn(Failed allocating memory for datagram entry.);
+   return VMCI_ERROR_NO_MEM;
+   }
+
+   entry-run_delayed = (flags  VMCI_FLAG_DG_DELAYED_CB) ? true : false;
+   entry-flags = flags;
+   entry-recv_cb = recv_cb;
+   entry-client_data = client_data;
+   entry-priv_flags = priv_flags;
+
+   /* Make datagram resource live. */
+   result = vmci_resource_add(entry-resource,
+  VMCI_RESOURCE_TYPE_DATAGRAM,
+  handle);
+   if (result != VMCI_SUCCESS) {
+   pr_warn(Failed to add new resource (handle=0x%x:0x%x), error: 
%d,
+   handle.context, handle.resource, result);
+   kfree(entry);
+   return result;
+   }
+
+   *out_handle = vmci_resource_handle(entry-resource);
+   return VMCI_SUCCESS;
+}
+
+int __init vmci_datagram_init(void)
+{
+   atomic_set(delayed_dg_host_queue_size, 0);
+   return VMCI_SUCCESS;
+}
+
+/*
+ * Internal utility function with the same purpose as
+ * vmci_datagram_get_priv_flags that also takes a context_id.
+ */
+static int vmci_datagram_get_priv_flags(u32 context_id,
+   struct vmci_handle handle,
+   u32 *priv_flags)
+{
+   ASSERT(priv_flags);
+   ASSERT(context_id != VMCI_INVALID_ID);
+
+   if (context_id == VMCI_HOST_CONTEXT_ID) {
+   struct datagram_entry *src_entry;
+   struct vmci_resource *resource;
+
+   resource = vmci_resource_by_handle(handle,
+  VMCI_RESOURCE_TYPE_DATAGRAM);
+   if (!resource)
+   return 

[PATCH 01/10] VMCI: context implementation.

2012-10-15 Thread George Zhang
VMCI Context code maintains state for vmci and allows the driver to
communicate with multiple VMs.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 drivers/misc/vmw_vmci/vmci_context.c | 1291 ++
 drivers/misc/vmw_vmci/vmci_context.h |  177 +
 2 files changed, 1468 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/vmci_context.c
 create mode 100644 drivers/misc/vmw_vmci/vmci_context.h

diff --git a/drivers/misc/vmw_vmci/vmci_context.c 
b/drivers/misc/vmw_vmci/vmci_context.c
new file mode 100644
index 000..c552dd3
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_context.c
@@ -0,0 +1,1291 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include linux/vmw_vmci_defs.h
+#include linux/vmw_vmci_api.h
+#include linux/highmem.h
+#include linux/kernel.h
+#include linux/module.h
+#include linux/sched.h
+#include linux/slab.h
+
+#include vmci_common_int.h
+#include vmci_queue_pair.h
+#include vmci_datagram.h
+#include vmci_doorbell.h
+#include vmci_context.h
+#include vmci_driver.h
+#include vmci_event.h
+
+/*
+ * List of current VMCI contexts.
+ */
+static struct {
+   struct list_head head;
+   spinlock_t lock; /* Spinlock for context list operations */
+} ctx_list;
+
+static void ctx_signal_notify(struct vmci_ctx *context)
+{
+   if (context-notify)
+   *context-notify = true;
+}
+
+static void ctx_clear_notify(struct vmci_ctx *context)
+{
+   if (context-notify)
+   *context-notify = false;
+}
+
+/*
+ * If nothing requires the attention of the guest, clears both
+ * notify flag and call.
+ */
+static void ctx_clear_notify_call(struct vmci_ctx *context)
+{
+   if (context-pending_datagrams == 0 
+   vmci_handle_arr_get_size(context-pending_doorbell_array) == 0)
+   ctx_clear_notify(context);
+}
+
+/*
+ * Sets the context's notify flag iff datagrams are pending for this
+ * context.  Called from vmci_setup_notify().
+ */
+void vmci_ctx_check_signal_notify(struct vmci_ctx *context)
+{
+   ASSERT(context);
+
+   spin_lock(context-lock);
+   if (context-pending_datagrams)
+   ctx_signal_notify(context);
+   spin_unlock(context-lock);
+}
+
+int __init vmci_ctx_init(void)
+{
+   INIT_LIST_HEAD(ctx_list.head);
+   spin_lock_init(ctx_list.lock);
+
+   return VMCI_SUCCESS;
+}
+
+/*
+ * Allocates and initializes a VMCI context.
+ */
+int vmci_ctx_init_ctx(u32 cid,
+ u32 priv_flags,
+ uintptr_t event_hnd,
+ int user_version,
+ const struct cred *cred,
+ struct vmci_ctx **out_context)
+{
+   struct vmci_ctx *context;
+   int result;
+
+   if (priv_flags  ~VMCI_PRIVILEGE_ALL_FLAGS) {
+   pr_devel(Invalid flag (flags=0x%x) for VMCI context.,
+priv_flags);
+   return VMCI_ERROR_INVALID_ARGS;
+   }
+
+   if (user_version == 0)
+   return VMCI_ERROR_INVALID_ARGS;
+
+   context = kzalloc(sizeof(*context), GFP_KERNEL);
+   if (context == NULL) {
+   pr_warn(Failed to allocate memory for VMCI context.);
+   return VMCI_ERROR_NO_MEM;
+   }
+
+   INIT_LIST_HEAD(context-list_item);
+   INIT_LIST_HEAD(context-datagram_queue);
+
+   context-user_version = user_version;
+
+   context-queue_pair_array = vmci_handle_arr_create(0);
+   if (!context-queue_pair_array) {
+   result = VMCI_ERROR_NO_MEM;
+   goto error;
+   }
+
+   context-doorbell_array = vmci_handle_arr_create(0);
+   if (!context-doorbell_array) {
+   result = VMCI_ERROR_NO_MEM;
+   goto error;
+   }
+
+   context-pending_doorbell_array = vmci_handle_arr_create(0);
+   if (!context-pending_doorbell_array) {
+   result = VMCI_ERROR_NO_MEM;
+   goto error;
+   }
+
+   INIT_LIST_HEAD(context-notifier_list);
+
+   spin_lock_init(context-lock);
+
+   kref_init(context-kref);
+
+   /* Inititialize host-specific VMCI context. */
+   init_waitqueue_head(context-host_context.wait_queue);
+
+   context-priv_flags = priv_flags;
+
+   if (cred)
+   context-cred = get_cred(cred);
+
+   context-notify = NULL;
+   context-notify_page = NULL;
+
+   /*
+* If we collide with an existing context we generate a new
+* and 

[PATCH 03/10] VMCI: doorbell implementation.

2012-10-15 Thread George Zhang
VMCI doorbell code allows for notifications between host and guest.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 drivers/misc/vmw_vmci/vmci_doorbell.c |  674 +
 drivers/misc/vmw_vmci/vmci_doorbell.h |   53 +++
 2 files changed, 727 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/vmci_doorbell.c
 create mode 100644 drivers/misc/vmw_vmci/vmci_doorbell.h

diff --git a/drivers/misc/vmw_vmci/vmci_doorbell.c 
b/drivers/misc/vmw_vmci/vmci_doorbell.c
new file mode 100644
index 000..0a8a6e5
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_doorbell.c
@@ -0,0 +1,674 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include linux/vmw_vmci_defs.h
+#include linux/vmw_vmci_api.h
+#include linux/completion.h
+#include linux/hash.h
+#include linux/kernel.h
+#include linux/list.h
+#include linux/module.h
+#include linux/sched.h
+#include linux/slab.h
+
+#include vmci_common_int.h
+#include vmci_datagram.h
+#include vmci_doorbell.h
+#include vmci_resource.h
+#include vmci_driver.h
+#include vmci_route.h
+
+
+#define VMCI_DOORBELL_INDEX_BITS   6
+#define VMCI_DOORBELL_INDEX_TABLE_SIZE (1  VMCI_DOORBELL_INDEX_BITS)
+#define VMCI_DOORBELL_HASH(_idx)   hash_32(_idx, VMCI_DOORBELL_INDEX_BITS)
+
+/*
+ * DoorbellEntry describes a doorbell notification handle allocated by the
+ * host.
+ */
+struct dbell_entry {
+   struct vmci_resource resource;
+   struct hlist_node node;
+   struct work_struct work;
+   vmci_callback notify_cb;
+   void *client_data;
+   u32 idx;
+   u32 priv_flags;
+   bool run_delayed;
+   atomic_t active;/* Only used by guest personality */
+};
+
+/* The VMCI index table keeps track of currently registered doorbells. */
+struct dbell_index_table {
+   spinlock_t lock;/* Index table lock */
+   struct hlist_head entries[VMCI_DOORBELL_INDEX_TABLE_SIZE];
+};
+
+static struct dbell_index_table vmci_doorbell_it = {
+   .lock = __SPIN_LOCK_UNLOCKED(vmci_doorbell_it.lock),
+};
+
+/*
+ * The max_notify_idx is one larger than the currently known bitmap index in
+ * use, and is used to determine how much of the bitmap needs to be scanned.
+ */
+static u32 max_notify_idx;
+
+/*
+ * The notify_idx_count is used for determining whether there are free entries
+ * within the bitmap (if notify_idx_count + 1  max_notify_idx).
+ */
+static u32 notify_idx_count;
+
+/*
+ * The last_notify_idx_reserved is used to track the last index handed out - in
+ * the case where multiple handles share a notification index, we hand out
+ * indexes round robin based on last_notify_idx_reserved.
+ */
+static u32 last_notify_idx_reserved;
+
+/* This is a one-entry cache used by the index allocation. */
+static u32 last_notify_idx_released = PAGE_SIZE;
+
+
+/*
+ * Utility function that retrieves the privilege flags associated
+ * with a given doorbell handle. For guest endpoints, the
+ * privileges are determined by the context ID, but for host
+ * endpoints privileges are associated with the complete
+ * handle. Hypervisor endpoints are not yet supported.
+ */
+int vmci_dbell_get_priv_flags(struct vmci_handle handle, u32 *priv_flags)
+{
+   if (priv_flags == NULL || handle.context == VMCI_INVALID_ID)
+   return VMCI_ERROR_INVALID_ARGS;
+
+   if (handle.context == VMCI_HOST_CONTEXT_ID) {
+   struct dbell_entry *entry;
+   struct vmci_resource *resource;
+
+   resource = vmci_resource_by_handle(handle,
+  VMCI_RESOURCE_TYPE_DOORBELL);
+   if (!resource)
+   return VMCI_ERROR_NOT_FOUND;
+
+   entry = container_of(resource, struct dbell_entry, resource);
+   *priv_flags = entry-priv_flags;
+   vmci_resource_put(resource);
+   } else if (handle.context == VMCI_HYPERVISOR_CONTEXT_ID) {
+   /*
+* Hypervisor endpoints for notifications are not
+* supported (yet).
+*/
+   return VMCI_ERROR_INVALID_ARGS;
+   } else {
+   *priv_flags = vmci_context_get_priv_flags(handle.context);
+   }
+
+   return VMCI_SUCCESS;
+}
+
+/*
+ * Find doorbell entry by bitmap index.
+ */
+static struct dbell_entry *dbell_index_table_find(u32 idx)
+{
+   u32 bucket = VMCI_DOORBELL_HASH(idx);
+   struct dbell_entry *dbell;
+   struct hlist_node 

[PATCH 05/10] VMCI: event handling implementation.

2012-10-15 Thread George Zhang
VMCI event code that manages event handlers and handles callbacks when specific 
events fire.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 drivers/misc/vmw_vmci/vmci_event.c |  415 
 drivers/misc/vmw_vmci/vmci_event.h |   25 ++
 2 files changed, 440 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/vmci_event.c
 create mode 100644 drivers/misc/vmw_vmci/vmci_event.h

diff --git a/drivers/misc/vmw_vmci/vmci_event.c 
b/drivers/misc/vmw_vmci/vmci_event.c
new file mode 100644
index 000..977550d
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_event.c
@@ -0,0 +1,415 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include linux/vmw_vmci_defs.h
+#include linux/vmw_vmci_api.h
+#include linux/completion.h
+#include linux/kref.h
+#include linux/list.h
+#include linux/module.h
+#include linux/sched.h
+#include linux/slab.h
+
+#include vmci_driver.h
+#include vmci_event.h
+
+#define EVENT_MAGIC 0xEABE
+#define VMCI_EVENT_MAX_ATTEMPTS 10
+
+struct vmci_subscription {
+   u32 id;
+   u32 event;
+   struct kref kref;
+   struct completion done; /* unregistered, ready to be freed */
+   vmci_event_cb callback;
+   void *callback_data;
+   struct list_head node;  /* on one of subscriber lists */
+   bool run_delayed;
+};
+
+static struct list_head subscriber_array[VMCI_EVENT_MAX];
+static DEFINE_MUTEX(subscriber_mutex);
+
+struct delayed_event_info {
+   struct work_struct work;
+   struct vmci_subscription *sub;
+   u8 event_payload[sizeof(struct vmci_event_data_max)];
+};
+
+struct event_ref {
+   struct vmci_subscription *sub;
+   struct list_head list_item;
+};
+
+int __init vmci_event_init(void)
+{
+   int i;
+
+   for (i = 0; i  VMCI_EVENT_MAX; i++)
+   INIT_LIST_HEAD(subscriber_array[i]);
+
+   return VMCI_SUCCESS;
+}
+
+void vmci_event_exit(void)
+{
+   int e;
+
+   /* We free all memory at exit. */
+   for (e = 0; e  VMCI_EVENT_MAX; e++) {
+   struct vmci_subscription *cur, *p2;
+   list_for_each_entry_safe(cur, p2, subscriber_array[e], node) {
+
+   /*
+* We should never get here because all events
+* should have been unregistered before we try
+* to unload the driver module.  Also, delayed
+* callbacks could still be firing so this
+* cleanup would not be safe.  Still it is
+* better to free the memory than not ... so
+* we leave this code in just in case
+*/
+   pr_warn(Unexpected free events occuring.);
+   kfree(cur);
+   }
+   }
+
+}
+
+/*
+ * Gets a reference to the given VMCISubscription.
+ */
+static struct vmci_subscription *event_get(struct vmci_subscription *entry)
+{
+   kref_get(entry-kref);
+
+   return entry;
+}
+
+static void event_signal_destroy(struct kref *kref)
+{
+   struct vmci_subscription *entry =
+   container_of(kref, struct vmci_subscription, kref);
+
+   complete(entry-done);
+}
+
+/*
+ * Releases the given VMCISubscription.
+ * Fires the destroy event if the reference count has gone to zero.
+ */
+static void event_release(struct vmci_subscription *entry)
+{
+   kref_put(entry-kref, event_signal_destroy);
+}
+
+/*
+ * Find entry. Assumes lock is held.
+ */
+static struct vmci_subscription *event_find(u32 sub_id)
+{
+   int e;
+
+   for (e = 0; e  VMCI_EVENT_MAX; e++) {
+   struct vmci_subscription *cur;
+   list_for_each_entry(cur, subscriber_array[e], node) {
+   if (cur-id == sub_id)
+   return cur;
+   }
+   }
+   return NULL;
+}
+
+/*
+ * Calls the specified callback in a delayed context.
+ */
+static void event_delayed_dispatch(struct work_struct *work)
+{
+   struct delayed_event_info *event_info =
+   container_of(work, struct delayed_event_info, work);
+   struct vmci_subscription *sub = event_info-sub;
+   struct vmci_event_data *ed;
+
+   BUG_ON(!sub);
+
+   ed = (struct vmci_event_data *)event_info-event_payload;
+
+   sub-callback(sub-id, ed, sub-callback_data);
+   event_release(sub);
+
+   kfree(event_info);
+}
+
+/*
+ 

[PATCH 06/10] VMCI: handle array implementation.

2012-10-15 Thread George Zhang
VMCI handle code adds support for dynamic arrays that will grow if they need to.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 drivers/misc/vmw_vmci/vmci_handle_array.c |  162 +
 drivers/misc/vmw_vmci/vmci_handle_array.h |   46 
 2 files changed, 208 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/vmci_handle_array.c
 create mode 100644 drivers/misc/vmw_vmci/vmci_handle_array.h

diff --git a/drivers/misc/vmw_vmci/vmci_handle_array.c 
b/drivers/misc/vmw_vmci/vmci_handle_array.c
new file mode 100644
index 000..c7db831
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_handle_array.c
@@ -0,0 +1,162 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include linux/slab.h
+#include vmci_handle_array.h
+
+static size_t handle_arr_calc_size(size_t capacity)
+{
+   return sizeof(struct vmci_handle_arr) +
+   capacity * sizeof(struct vmci_handle);
+}
+
+struct vmci_handle_arr *vmci_handle_arr_create(size_t capacity)
+{
+   struct vmci_handle_arr *array;
+
+   if (capacity == 0)
+   capacity = VMCI_HANDLE_ARRAY_DEFAULT_SIZE;
+
+   array = kmalloc(handle_arr_calc_size(capacity), GFP_ATOMIC);
+   if (!array)
+   return NULL;
+
+   array-capacity = capacity;
+   array-size = 0;
+
+   return array;
+}
+
+void vmci_handle_arr_destroy(struct vmci_handle_arr *array)
+{
+   kfree(array);
+}
+
+void vmci_handle_arr_append_entry(struct vmci_handle_arr **array_ptr,
+ struct vmci_handle handle)
+{
+   struct vmci_handle_arr *array;
+
+   BUG_ON(!array_ptr || !*array_ptr);
+   array = *array_ptr;
+
+   if (unlikely(array-size = array-capacity)) {
+   /* reallocate. */
+   struct vmci_handle_arr *new_array;
+   size_t new_capacity = array-capacity * VMCI_ARR_CAP_MULT;
+   size_t new_size = handle_arr_calc_size(new_capacity);
+
+   new_array = krealloc(array, new_size, GFP_ATOMIC);
+   if (!new_array)
+   return;
+
+   new_array-capacity = new_capacity;
+   *array_ptr = array = new_array;
+   }
+
+   array-entries[array-size] = handle;
+   array-size++;
+}
+
+/*
+ * Handle that was removed, VMCI_INVALID_HANDLE if entry not found.
+ */
+struct vmci_handle vmci_handle_arr_remove_entry(struct vmci_handle_arr *array,
+   struct vmci_handle entry_handle)
+{
+   struct vmci_handle handle = VMCI_INVALID_HANDLE;
+   size_t i;
+
+   BUG_ON(!array);
+
+   for (i = 0; i  array-size; i++) {
+   if (VMCI_HANDLE_EQUAL(array-entries[i], entry_handle)) {
+   handle = array-entries[i];
+   array-size--;
+   array-entries[i] = array-entries[array-size];
+   array-entries[array-size] = VMCI_INVALID_HANDLE;
+   break;
+   }
+   }
+
+   return handle;
+}
+
+/*
+ * Handle that was removed, VMCI_INVALID_HANDLE if array was empty.
+ */
+struct vmci_handle vmci_handle_arr_remove_tail(struct vmci_handle_arr *array)
+{
+   struct vmci_handle handle = VMCI_INVALID_HANDLE;
+
+   BUG_ON(!array);
+
+   if (array-size) {
+   array-size--;
+   handle = array-entries[array-size];
+   array-entries[array-size] = VMCI_INVALID_HANDLE;
+   }
+
+   return handle;
+}
+
+/*
+ * Handle at given index, VMCI_INVALID_HANDLE if invalid index.
+ */
+struct vmci_handle
+vmci_handle_arr_get_entry(const struct vmci_handle_arr *array, size_t index)
+{
+   BUG_ON(!array);
+
+   if (unlikely(index = array-size))
+   return VMCI_INVALID_HANDLE;
+
+   return array-entries[index];
+}
+
+size_t vmci_handle_arr_get_size(const struct vmci_handle_arr *array)
+{
+   BUG_ON(!array);
+
+   return array-size;
+}
+
+bool vmci_handle_arr_has_entry(const struct vmci_handle_arr *array,
+  struct vmci_handle entry_handle)
+{
+   size_t i;
+
+   BUG_ON(!array);
+
+   for (i = 0; i  array-size; i++)
+   if (VMCI_HANDLE_EQUAL(array-entries[i], entry_handle))
+   return true;
+
+   return false;
+}
+
+/*
+ * NULL if the array is empty. Otherwise, a pointer to the array
+ * of VMCI handles in the handle array.
+ */
+struct vmci_handle 

[PATCH 08/10] VMCI: resource object implementation.

2012-10-15 Thread George Zhang
VMCI resource tracks all used resources within the vmci code.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 drivers/misc/vmw_vmci/vmci_resource.c |  237 +
 drivers/misc/vmw_vmci/vmci_resource.h |   59 
 2 files changed, 296 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/vmci_resource.c
 create mode 100644 drivers/misc/vmw_vmci/vmci_resource.h

diff --git a/drivers/misc/vmw_vmci/vmci_resource.c 
b/drivers/misc/vmw_vmci/vmci_resource.c
new file mode 100644
index 000..a2f5fd0
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_resource.c
@@ -0,0 +1,237 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include linux/vmw_vmci_defs.h
+#include linux/hash.h
+#include linux/types.h
+#include linux/rculist.h
+
+#include vmci_common_int.h
+#include vmci_resource.h
+#include vmci_driver.h
+
+
+#define VMCI_RESOURCE_HASH_BITS 7
+#define VMCI_RESOURCE_HASH_BUCKETS  (1  VMCI_RESOURCE_HASH_BITS)
+
+struct vmci_hash_table {
+   spinlock_t lock;
+   struct hlist_head entries[VMCI_RESOURCE_HASH_BUCKETS];
+};
+
+static struct vmci_hash_table vmci_resource_table = {
+   .lock = __SPIN_LOCK_UNLOCKED(vmci_resource_table.lock),
+};
+
+static unsigned int vmci_resource_hash(struct vmci_handle handle)
+{
+   return hash_32(VMCI_HANDLE_TO_RESOURCE_ID(handle),
+  VMCI_RESOURCE_HASH_BITS);
+}
+
+/*
+ * Gets a resource (if one exists) matching given handle from the hash table.
+ */
+static struct vmci_resource *vmci_resource_lookup(struct vmci_handle handle)
+{
+   struct vmci_resource *r, *resource = NULL;
+   struct hlist_node *node;
+   unsigned int idx = vmci_resource_hash(handle);
+
+   BUG_ON(VMCI_HANDLE_EQUAL(handle, VMCI_INVALID_HANDLE));
+
+   rcu_read_lock();
+   hlist_for_each_entry_rcu(r, node,
+vmci_resource_table.entries[idx], node) {
+   u32 rid = VMCI_HANDLE_TO_RESOURCE_ID(r-handle);
+   u32 cid = VMCI_HANDLE_TO_CONTEXT_ID(r-handle);
+
+   if (rid == VMCI_HANDLE_TO_RESOURCE_ID(handle) 
+   (cid == VMCI_HANDLE_TO_CONTEXT_ID(handle) ||
+cid == VMCI_INVALID_ID)) {
+   resource = r;
+   break;
+   }
+   }
+   rcu_read_unlock();
+
+   return resource;
+}
+
+/*
+ * Find an unused resource ID and return it. The first
+ * VMCI_RESERVED_RESOURCE_ID_MAX are reserved so we start from
+ * its value + 1.
+ * Returns VMCI resource id on success, VMCI_INVALID_ID on failure.
+ */
+static u32 vmci_resource_find_id(u32 context_id)
+{
+   static u32 resource_id = VMCI_RESERVED_RESOURCE_ID_MAX + 1;
+   u32 old_rid = resource_id;
+   u32 current_rid;
+
+   /*
+* Generate a unique resource ID.  Keep on trying until we wrap around
+* in the RID space.
+*/
+   BUG_ON(old_rid = VMCI_RESERVED_RESOURCE_ID_MAX);
+
+   do {
+   struct vmci_handle handle;
+
+   current_rid = resource_id;
+   resource_id++;
+   if (unlikely(resource_id == VMCI_INVALID_ID)) {
+   /* Skip the reserved rids. */
+   resource_id = VMCI_RESERVED_RESOURCE_ID_MAX + 1;
+   }
+
+   handle = vmci_make_handle(context_id, current_rid);
+   if (!vmci_resource_lookup(handle))
+   return current_rid;
+   } while (resource_id != old_rid);
+
+   return VMCI_INVALID_ID;
+}
+
+
+int vmci_resource_add(struct vmci_resource *resource,
+ enum vmci_resource_type resource_type,
+ struct vmci_handle handle)
+
+{
+   unsigned int idx;
+   int result;
+
+   BUG_ON(!resource);
+
+   spin_lock(vmci_resource_table.lock);
+
+   if (handle.resource == VMCI_INVALID_ID) {
+   handle.resource = vmci_resource_find_id(handle.context);
+   if (handle.resource == VMCI_INVALID_ID) {
+   result = VMCI_ERROR_NO_HANDLE;
+   goto out;
+   }
+   } else if (vmci_resource_lookup(handle)) {
+   result = VMCI_ERROR_ALREADY_EXISTS;
+   goto out;
+   }
+
+   resource-handle = handle;
+   resource-type = resource_type;
+   INIT_HLIST_NODE(resource-node);
+   kref_init(resource-kref);
+   init_completion(resource-done);

[PATCH 09/10] VMCI: routing implementation.

2012-10-15 Thread George Zhang
VMCI routing code is responsible for routing between various hosts/guests
as well as routing in nested scenarios.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 drivers/misc/vmw_vmci/vmci_route.c |  237 
 drivers/misc/vmw_vmci/vmci_route.h |   30 +
 2 files changed, 267 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/vmci_route.c
 create mode 100644 drivers/misc/vmw_vmci/vmci_route.h

diff --git a/drivers/misc/vmw_vmci/vmci_route.c 
b/drivers/misc/vmw_vmci/vmci_route.c
new file mode 100644
index 000..8aa43ee
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_route.c
@@ -0,0 +1,237 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include linux/vmw_vmci_defs.h
+#include linux/vmw_vmci_api.h
+
+#include vmci_common_int.h
+#include vmci_context.h
+#include vmci_driver.h
+#include vmci_route.h
+
+/*
+ * Make a routing decision for the given source and destination handles.
+ * This will try to determine the route using the handles and the available
+ * devices.  Will set the source context if it is invalid.
+ */
+int vmci_route(struct vmci_handle *src,
+  const struct vmci_handle *dst,
+  bool from_guest,
+  enum vmci_route *route)
+{
+   bool has_host_device = vmci_host_code_active();
+   bool has_guest_device = vmci_guest_code_active();
+
+   ASSERT(src);
+   ASSERT(dst);
+   ASSERT(route);
+
+   *route = VMCI_ROUTE_NONE;
+
+   /*
+* from_guest is only ever set to true by
+* IOCTL_VMCI_DATAGRAM_SEND (or by the vmkernel equivalent),
+* which comes from the VMX, so we know it is coming from a
+* guest.
+*
+* To avoid inconsistencies, test these once.  We will test
+* them again when we do the actual send to ensure that we do
+* not touch a non-existent device.
+*/
+
+   /* Must have a valid destination context. */
+   if (VMCI_INVALID_ID == dst-context)
+   return VMCI_ERROR_INVALID_ARGS;
+
+   /* Anywhere to hypervisor. */
+   if (VMCI_HYPERVISOR_CONTEXT_ID == dst-context) {
+
+   /*
+* If this message already came from a guest then we
+* cannot send it to the hypervisor.  It must come
+* from a local client.
+*/
+   if (from_guest)
+   return VMCI_ERROR_DST_UNREACHABLE;
+
+   /*
+* We must be acting as a guest in order to send to
+* the hypervisor.
+*/
+   if (!has_guest_device)
+   return VMCI_ERROR_DEVICE_NOT_FOUND;
+
+   /* And we cannot send if the source is the host context. */
+   if (VMCI_HOST_CONTEXT_ID == src-context)
+   return VMCI_ERROR_INVALID_ARGS;
+
+   /*
+* If the client passed the ANON source handle then
+* respect it (both context and resource are invalid).
+* However, if they passed only an invalid context,
+* then they probably mean ANY, in which case we
+* should set the real context here before passing it
+* down.
+*/
+   if (VMCI_INVALID_ID == src-context 
+   VMCI_INVALID_ID != src-resource)
+   src-context = vmci_get_context_id();
+
+   /* Send from local client down to the hypervisor. */
+   *route = VMCI_ROUTE_AS_GUEST;
+   return VMCI_SUCCESS;
+   }
+
+   /* Anywhere to local client on host. */
+   if (VMCI_HOST_CONTEXT_ID == dst-context) {
+   /*
+* If it is not from a guest but we are acting as a
+* guest, then we need to send it down to the host.
+* Note that if we are also acting as a host then this
+* will prevent us from sending from local client to
+* local client, but we accept that restriction as a
+* way to remove any ambiguity from the host context.
+*/
+   if (src-context == VMCI_HYPERVISOR_CONTEXT_ID) {
+   /*
+* If the hypervisor is the source, this is
+* host local communication. The hypervisor
+* may send vmci event datagrams to the host
+

[PATCH 10/10] VMCI: Some header and config files.

2012-10-15 Thread George Zhang
The VMCI header and config patch adds all the files needed to build the VMCI
module with the Linux Makefile and Kconfig systems. It also adds the header
files used for building modules against the driver.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 drivers/misc/Kconfig|1 
 drivers/misc/Makefile   |2 
 drivers/misc/vmw_vmci/Kconfig   |   16 +
 drivers/misc/vmw_vmci/Makefile  |   41 +
 drivers/misc/vmw_vmci/vmci_common_int.h |   34 +
 include/linux/vmw_vmci_api.h|   89 +++
 include/linux/vmw_vmci_defs.h   |  971 +++
 7 files changed, 1154 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/Kconfig
 create mode 100644 drivers/misc/vmw_vmci/Makefile
 create mode 100644 drivers/misc/vmw_vmci/vmci_common_int.h
 create mode 100644 include/linux/vmw_vmci_api.h
 create mode 100644 include/linux/vmw_vmci_defs.h

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 2661f6e..fe38c7a 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -517,4 +517,5 @@ source drivers/misc/lis3lv02d/Kconfig
 source drivers/misc/carma/Kconfig
 source drivers/misc/altera-stapl/Kconfig
 source drivers/misc/mei/Kconfig
+source drivers/misc/vmw_vmci/Kconfig
 endmenu
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 456972f..21ed953 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -51,3 +51,5 @@ obj-y += carma/
 obj-$(CONFIG_USB_SWITCH_FSA9480) += fsa9480.o
 obj-$(CONFIG_ALTERA_STAPL) +=altera-stapl/
 obj-$(CONFIG_INTEL_MEI)+= mei/
+obj-$(CONFIG_MAX8997_MUIC) += max8997-muic.o
+obj-$(CONFIG_VMWARE_VMCI)  += vmw_vmci/
diff --git a/drivers/misc/vmw_vmci/Kconfig b/drivers/misc/vmw_vmci/Kconfig
new file mode 100644
index 000..55015e7
--- /dev/null
+++ b/drivers/misc/vmw_vmci/Kconfig
@@ -0,0 +1,16 @@
+#
+# VMware VMCI device
+#
+
+config VMWARE_VMCI
+	tristate "VMware VMCI Driver"
+	depends on X86
+	help
+	  This is VMware's Virtual Machine Communication Interface.  It enables
+	  high-speed communication between host and guest in a virtual
+	  environment via the VMCI virtual device.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called vmw_vmci.
diff --git a/drivers/misc/vmw_vmci/Makefile b/drivers/misc/vmw_vmci/Makefile
new file mode 100644
index 000..bcc3b6c
--- /dev/null
+++ b/drivers/misc/vmw_vmci/Makefile
@@ -0,0 +1,41 @@
+
+#
+# Linux driver for VMware's VMCI device.
+#
+# Copyright (C) 2007-2012, VMware, Inc. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; version 2 of the License and no later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+# NON INFRINGEMENT.  See the GNU General Public License for more
+# details.
+#
+# Maintained by: Andrew Stiegmann pv-driv...@vmware.com
+#
+
+
+#
+# Makefile for the VMware VMCI
+#
+
+obj-$(CONFIG_VMWARE_VMCI) += vmw_vmci.o
+
+vmw_vmci-y += vmci_context.o
+vmw_vmci-y += vmci_datagram.o
+vmw_vmci-y += vmci_doorbell.o
+vmw_vmci-y += vmci_driver.o
+vmw_vmci-y += vmci_event.o
+vmw_vmci-y += vmci_handle_array.o
+vmw_vmci-y += vmci_queue_pair.o
+vmw_vmci-y += vmci_resource.o
+vmw_vmci-y += vmci_route.o
+
+vmci:
+   $(MAKE) -C ../../.. SUBDIRS=$$PWD CONFIG_VMWARE_VMCI=m modules
+
+clean:
+   $(MAKE) -C ../../.. SUBDIRS=$$PWD CONFIG_VMWARE_VMCI=m clean
diff --git a/drivers/misc/vmw_vmci/vmci_common_int.h 
b/drivers/misc/vmw_vmci/vmci_common_int.h
new file mode 100644
index 000..77667ec
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_common_int.h
@@ -0,0 +1,34 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#ifndef _VMCI_COMMONINT_H_
+#define _VMCI_COMMONINT_H_
+
+#include linux/printk.h
+
+#define ASSERT(cond) BUG_ON(!(cond))
+
+#define PCI_VENDOR_ID_VMWARE   0x15AD
+#define PCI_DEVICE_ID_VMWARE_VMCI  0x0740
+#define VMCI_DRIVER_VERSION_STRING 

[PATCH 0/6] VSOCK for Linux upstreaming

2012-10-15 Thread George Zhang

* * *

In an effort to improve the out-of-the-box experience with Linux
kernels for VMware users, VMware is working on readying the Virtual
Machine Communication Interface (vmw_vmci) and VMCI Sockets (VSOCK)
(vmw_vsock) kernel modules for inclusion in the Linux kernel. The
purpose of this post is to gather feedback on the vmw_vsock kernel
module. The vmw_vmci kernel module was presented in an earlier post.


* * *

VMCI Sockets allows virtual machines to communicate with host kernel
modules and the VMware hypervisors. The VMCI Sockets kernel module
depends on the VMCI kernel module. User-level applications, both in
a virtual machine and on the host, can use vmw_vmci through the VMCI
Sockets API, which facilitates fast and efficient communication
between guest virtual machines and their host. VMCI Sockets provides a
socket address family designed to be compatible with UDP and TCP at the
interface level. Today, VMCI and VMCI Sockets are used by the VMware
shared folders (HGFS) and various VMware Tools components inside the
guest for zero-config, network-less access to VMware host services. In
addition to this, VMware's users are using VMCI Sockets for various
applications, where network access of the virtual machine is
restricted or non-existent. Examples of this are VMs communicating
with device proxies for proprietary hardware running as host
applications and automated testing of applications running within
virtual machines.

The VMware VMCI Sockets are similar to other socket types, like
Berkeley UNIX socket interface. The VMCI sockets module supports
both connection-oriented stream sockets like TCP, and connectionless
datagram sockets like UDP. The VSOCK protocol family is defined as
AF_VSOCK and the socket operations split for SOCK_DGRAM and
SOCK_STREAM.

For additional information about the use of VMCI and in particular
VMCI Sockets, please refer to the VMCI Socket Programming Guide
available at https://www.vmware.com/support/developer/vmci-sdk/.



---

George Zhang (6):
  VSOCK: vsock protocol implementation.
  VSOCK: vsock address implementation.
  VSOCK: notification implementation.
  VSOCK: statistics implementation.
  VSOCK: utility functions.
  VSOCK: header and config files.


 net/Kconfig |1 
 net/Makefile|1 
 net/vmw_vsock/Kconfig   |   14 
 net/vmw_vsock/Makefile  |8 
 net/vmw_vsock/af_vsock.c| 4259 +++
 net/vmw_vsock/af_vsock.h|  179 +
 net/vmw_vsock/notify.c  | 1041 +
 net/vmw_vsock/notify.h  |  130 +
 net/vmw_vsock/notify_qstate.c   |  670 ++
 net/vmw_vsock/stats.c   |   37 
 net/vmw_vsock/stats.h   |  222 ++
 net/vmw_vsock/util.c|  694 ++
 net/vmw_vsock/util.h|  331 +++
 net/vmw_vsock/vmci_sockets.h|  517 
 net/vmw_vsock/vmci_sockets_packet.h |  107 +
 net/vmw_vsock/vsock_addr.c  |  264 ++
 net/vmw_vsock/vsock_addr.h  |   40 
 net/vmw_vsock/vsock_common.h|  130 +
 net/vmw_vsock/vsock_packet.h|  131 +
 net/vmw_vsock/vsock_version.h   |   29 
 20 files changed, 8805 insertions(+), 0 deletions(-)
 create mode 100644 net/vmw_vsock/Kconfig
 create mode 100644 net/vmw_vsock/Makefile
 create mode 100644 net/vmw_vsock/af_vsock.c
 create mode 100644 net/vmw_vsock/af_vsock.h
 create mode 100644 net/vmw_vsock/notify.c
 create mode 100644 net/vmw_vsock/notify.h
 create mode 100644 net/vmw_vsock/notify_qstate.c
 create mode 100644 net/vmw_vsock/stats.c
 create mode 100644 net/vmw_vsock/stats.h
 create mode 100644 net/vmw_vsock/util.c
 create mode 100644 net/vmw_vsock/util.h
 create mode 100644 net/vmw_vsock/vmci_sockets.h
 create mode 100644 net/vmw_vsock/vmci_sockets_packet.h
 create mode 100644 net/vmw_vsock/vsock_addr.c
 create mode 100644 net/vmw_vsock/vsock_addr.h
 create mode 100644 net/vmw_vsock/vsock_common.h
 create mode 100644 net/vmw_vsock/vsock_packet.h
 create mode 100644 net/vmw_vsock/vsock_version.h

-- 
Signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH 2/6] VSOCK: vsock address implementation.

2012-10-15 Thread George Zhang
VSOCK linux address code implementation.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 net/vmw_vsock/vsock_addr.c |  264 
 net/vmw_vsock/vsock_addr.h |   40 +++
 2 files changed, 304 insertions(+), 0 deletions(-)
 create mode 100644 net/vmw_vsock/vsock_addr.c
 create mode 100644 net/vmw_vsock/vsock_addr.h

diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c
new file mode 100644
index 000..c93a174
--- /dev/null
+++ b/net/vmw_vsock/vsock_addr.c
@@ -0,0 +1,264 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+/*
+ * vsockAddr.c --
+ *
+ * VSockets address implementation.
+ */
+
+/*
+ * These includes come before vsockCommon.h to ensure that VMware's ASSERT
+ * macro is used instead of Linux's irda.h definition.
+ */
+#include linux/types.h
+#include linux/socket.h
+#include linux/stddef.h  /* for NULL */
+#include net/sock.h
+
+#include vsock_common.h
+
+/*
+ *
+ * vsock_addr_init --
+ *
+ * Initialize the given address with the given context id and port. This will
+ * clear the address, set the correct family, and add the given values.
+ *
+ * Results: None.
+ *
+ * Side effects: Overwrites *addr.  Hits BUG_ON() if the freshly built address
+ * does not pass vsock_addr_validate().
+ */
+
+void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port)
+{
+	/* Zero the whole structure first so svm_zero passes validation. */
+	memset(addr, 0, sizeof(*addr));
+
+	addr->svm_family = AF_VSOCK;
+	addr->svm_cid = cid;
+	addr->svm_port = port;
+
+	BUG_ON(vsock_addr_validate(addr) != 0);
+}
+
+/*
+ *
+ * vsock_addr_validate --
+ *
+ * Try to validate the given address.  The address must not be null, must
+ * have the AF_VSOCK address family, and its svm_zero[0] field must be zero.
+ *
+ * Results: 0 on success, -EFAULT if the address is null, -EAFNOSUPPORT if the
+ * address is of the wrong family, and -EINVAL if svm_zero[0] is not zero.
+ *
+ * Side effects: None.
+ */
+
+int vsock_addr_validate(const struct sockaddr_vm *addr)
+{
+	if (!addr)
+		return -EFAULT;
+
+	if (addr->svm_family != AF_VSOCK)
+		return -EAFNOSUPPORT;
+
+	if (addr->svm_zero[0] != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ *
+ * vsock_addr_bound --
+ *
+ * Determines whether the provided address is bound, i.e. whether its port
+ * is anything other than VMADDR_PORT_ANY.
+ *
+ * Results: true if the address structure is bound, false otherwise.
+ *
+ * Side effects: Hits BUG_ON() if addr is NULL.
+ */
+
+bool vsock_addr_bound(const struct sockaddr_vm *addr)
+{
+	BUG_ON(!addr);
+
+	return addr->svm_port != VMADDR_PORT_ANY;
+}
+
+/*
+ *
+ * vsock_addr_unbind --
+ *
+ * Reset the given address to the unbound state by re-initializing it with
+ * VMADDR_CID_ANY and VMADDR_PORT_ANY.
+ *
+ * Results: None.
+ *
+ * Side effects: Same as vsock_addr_init(), which does the actual work.
+ */
+
+void vsock_addr_unbind(struct sockaddr_vm *addr)
+{
+	vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+}
+
+/*
+ *
+ * vsock_addr_equals_addr --
+ *
+ * Determine if the given addresses are equal, i.e. both the context id and
+ * the port match.
+ *
+ * Results: true if the addresses are equal, false otherwise.
+ *
+ * Side effects: Hits BUG_ON() if either address fails vsock_addr_validate().
+ */
+
+bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
+			    const struct sockaddr_vm *other)
+{
+	BUG_ON(vsock_addr_validate(addr) != 0);
+	BUG_ON(vsock_addr_validate(other) != 0);
+
+	return addr->svm_cid == other->svm_cid &&
+	       addr->svm_port == other->svm_port;
+}
+
+/*
+ *
+ * vsock_addr_equals_addr_any --
+ *
+ * Determine if the given addresses are equal. Will accept either an exact
+ * match or one where the ports match and the cids either match or at least
+ * one of them is set to VMADDR_CID_ANY.
+ *
+ * Results: true if the addresses are equal, false otherwise.
+ *
+ * Side effects: Hits BUG_ON() if either address fails vsock_addr_validate().
+ */
+
+bool vsock_addr_equals_addr_any(const struct sockaddr_vm *addr,
+				const struct sockaddr_vm *other)
+{
+	BUG_ON(vsock_addr_validate(addr) != 0);
+	BUG_ON(vsock_addr_validate(other) != 0);
+
+	/* The port must always match; the cid may be wildcarded on either side. */
+	return (addr->svm_cid == VMADDR_CID_ANY ||
+		other->svm_cid == VMADDR_CID_ANY ||
+		addr->svm_cid == other->svm_cid) &&
+	       addr->svm_port == other->svm_port;
+}
+
+/*
+ *
+ * vsock_addr_equals_handle_port --
+ *
+ * Determines if the given address matches the given handle and port.
+ *
+ * Results: TRUE if the address matches the handle and port, FALSE otherwise.
+ *
+ * Side effects: None.
+ */
+
+bool vsock_addr_equals_handle_port(const struct sockaddr_vm *addr,
+  struct vmci_handle handle, u32 port)
+{
+   BUG_ON(vsock_addr_validate(addr) != 0);
+
+   return addr-svm_cid == 

[PATCH 3/6] VSOCK: notification implementation.

2012-10-15 Thread George Zhang
VSOCK control notifications for VMCI Stream Sockets protocol.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 net/vmw_vsock/notify.c | 1041 
 net/vmw_vsock/notify.h |  130 ++
 2 files changed, 1171 insertions(+), 0 deletions(-)
 create mode 100644 net/vmw_vsock/notify.c
 create mode 100644 net/vmw_vsock/notify.h

diff --git a/net/vmw_vsock/notify.c b/net/vmw_vsock/notify.c
new file mode 100644
index 000..03a0a1f
--- /dev/null
+++ b/net/vmw_vsock/notify.c
@@ -0,0 +1,1041 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2009-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+/*
+ * notify.c --
+ *
+ * Linux control notifications for the VMCI Stream Sockets protocol.
+ */
+
+#include linux/types.h
+
+#include linux/socket.h
+#include linux/stddef.h  /* for NULL */
+#include net/sock.h
+
+#include notify.h
+#include af_vsock.h
+
+/* Access a field of the socket's notify.pkt state; usable as an lvalue. */
+#define PKT_FIELD(vsk, field_name) \
+	((vsk)->notify.pkt.field_name)
+
+#define VSOCK_MAX_DGRAM_RESENDS   10
+
+/*
+ *
+ * vsock_vmci_notify_waiting_write --
+ *
+ * Determines if the conditions have been met to notify a waiting writer.
+ * Without VSOCK_OPTIMIZATION_WAITING_NOTIFY this always answers true.
+ *
+ * Results: true if a notification should be sent, false otherwise.
+ *
+ * Side effects: With VSOCK_OPTIMIZATION_FLOW_CONTROL, may shrink
+ * write_notify_window and toggles peer_waiting_write_detected.
+ */
+
+static bool vsock_vmci_notify_waiting_write(vsock_vmci_sock *vsk)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+	bool retval;
+	u64 notify_limit;
+
+	if (!PKT_FIELD(vsk, peer_waiting_write))
+		return false;
+
+#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
+	/*
+	 * When the sender blocks, we take that as a sign that the sender is
+	 * faster than the receiver. To reduce the transmit rate of the sender,
+	 * we delay the sending of the read notification by decreasing the
+	 * write_notify_window. The notification is delayed until the number of
+	 * bytes used in the queue drops below the write_notify_window.
+	 */
+
+	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
+		PKT_FIELD(vsk, peer_waiting_write_detected) = true;
+		if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
+			/* Cannot shrink by a full page; clamp to the minimum. */
+			PKT_FIELD(vsk, write_notify_window) =
+			    PKT_FIELD(vsk, write_notify_min_window);
+		} else {
+			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
+			if (PKT_FIELD(vsk, write_notify_window) <
+			    PKT_FIELD(vsk, write_notify_min_window))
+				PKT_FIELD(vsk, write_notify_window) =
+				    PKT_FIELD(vsk, write_notify_min_window);
+
+		}
+	}
+	notify_limit = vsk->consume_size - PKT_FIELD(vsk, write_notify_window);
+#else
+	notify_limit = 0;
+#endif
+
+	/*
+	 * For now we ignore the wait information and just see if the free
+	 * space exceeds the notify limit.  Note that improving this function
+	 * to be more intelligent will not require a protocol change and will
+	 * retain compatibility between endpoints with mixed versions of this
+	 * function.
+	 *
+	 * The notify_limit is used to delay notifications in the case where
+	 * flow control is enabled. Below the test is expressed in terms of
+	 * free space in the queue: if free_space > ConsumeSize -
+	 * write_notify_window then notify. An alternate way of expressing this
+	 * is to rewrite the expression to use the data ready in the receive
+	 * queue: if write_notify_window > bufferReady then notify, as
+	 * free_space == ConsumeSize - bufferReady.
+	 */
+	retval = vmci_qpair_consume_free_space(vsk->qpair) > notify_limit;
+#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
+	if (retval) {
+		/*
+		 * Once we notify the peer, we reset the detected flag so the
+		 * next wait will again cause a decrease in the window size.
+		 */
+
+		PKT_FIELD(vsk, peer_waiting_write_detected) = false;
+	}
+#endif
+	return retval;
+#else
+	return true;
+#endif
+}
+
+/*
+ *
+ * vsock_vmci_notify_waiting_read --
+ *
+ * Determines if the conditions have been met to notify a waiting reader.
+ *
+ * Results: true if a notification should be sent, false otherwise.
+ *
+ * Side effects: None.
+ */
+
+static bool vsock_vmci_notify_waiting_read(vsock_vmci_sock *vsk)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+

[PATCH 4/6] VSOCK: statistics implementation.

2012-10-15 Thread George Zhang
VSOCK stats for VMCI Stream Sockets protocol.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 net/vmw_vsock/stats.c |   37 
 net/vmw_vsock/stats.h |  222 +
 2 files changed, 259 insertions(+), 0 deletions(-)
 create mode 100644 net/vmw_vsock/stats.c
 create mode 100644 net/vmw_vsock/stats.h

diff --git a/net/vmw_vsock/stats.c b/net/vmw_vsock/stats.c
new file mode 100644
index 000..2d172d5
--- /dev/null
+++ b/net/vmw_vsock/stats.c
@@ -0,0 +1,37 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2009-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+/*
+ * stats.c --
+ *
+ * Linux stats for the VMCI Stream Sockets protocol.
+ */
+
+#include linux/types.h
+
+#include linux/socket.h
+#include linux/stddef.h  /* for NULL */
+#include net/sock.h
+
+#include af_vsock.h
+#include stats.h
+
+#ifdef VSOCK_GATHER_STATISTICS
+/* Control-packet counters, indexed by packet type (see VSOCK_STATS_CTLPKT_LOG). */
+u64 vsock_stats_ctl_pkt_count[VSOCK_PACKET_TYPE_MAX];
+/* Queue-level histograms handed to vsock_vmci_stats_update_queue_bucket_count(). */
+u64 vsock_stats_consume_queue_hist[VSOCK_NUM_QUEUE_LEVEL_BUCKETS];
+u64 vsock_stats_produce_queue_hist[VSOCK_NUM_QUEUE_LEVEL_BUCKETS];
+/* Running byte totals updated by VSOCK_STATS_STREAM_CONSUME/PRODUCE. */
+atomic64_t vsock_stats_consume_total;
+atomic64_t vsock_stats_produce_total;
+#endif
diff --git a/net/vmw_vsock/stats.h b/net/vmw_vsock/stats.h
new file mode 100644
index 000..ce6ec7e
--- /dev/null
+++ b/net/vmw_vsock/stats.h
@@ -0,0 +1,222 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2009-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+/*
+ * stats.h --
+ *
+ * Stats functions for Linux vsock module.
+ */
+
+#ifndef __STATS_H__
+#define __STATS_H__
+
+#include linux/types.h
+
+#include vsock_common.h
+#include vsock_packet.h
+
+/*
+ * Define VSOCK_GATHER_STATISTICS to turn on statistics gathering. Currently
+ * this consists of 3 types of stats: 1. The number of control datagram
+ * messages sent. 2. The level of queuepair fullness (in 10% buckets) whenever
+ * data is about to be enqueued or dequeued from the queuepair. 3. The total
+ * number of bytes enqueued/dequeued.
+ */
+
+#ifdef VSOCK_GATHER_STATISTICS
+
+#define VSOCK_NUM_QUEUE_LEVEL_BUCKETS 10
+extern u64 vsock_stats_ctl_pkt_count[VSOCK_PACKET_TYPE_MAX];
+extern u64 vsock_stats_consume_queue_hist[VSOCK_NUM_QUEUE_LEVEL_BUCKETS];
+extern u64 vsock_stats_produce_queue_hist[VSOCK_NUM_QUEUE_LEVEL_BUCKETS];
+extern atomic64_t vsock_stats_consume_total;
+extern atomic64_t vsock_stats_produce_total;
+
+/* Record the current fill level of the consume queue in its histogram. */
+#define VSOCK_STATS_STREAM_CONSUME_HIST(vsk) \
+	vsock_vmci_stats_update_queue_bucket_count((vsk)->qpair, \
+		(vsk)->consume_size, \
+		vmci_qpair_consume_buf_ready((vsk)->qpair), \
+		vsock_stats_consume_queue_hist)
+/* Record the current fill level of the produce queue in its histogram. */
+#define VSOCK_STATS_STREAM_PRODUCE_HIST(vsk) \
+	vsock_vmci_stats_update_queue_bucket_count((vsk)->qpair, \
+		(vsk)->produce_size, \
+		vmci_qpair_produce_buf_ready((vsk)->qpair), \
+		vsock_stats_produce_queue_hist)
+/* Bump the counter for one control packet of the given type. */
+#define VSOCK_STATS_CTLPKT_LOG(pkt_type) \
+	do { \
+		++vsock_stats_ctl_pkt_count[pkt_type]; \
+	} while (0)
+/*
+ * Add a byte count to the running totals.  atomic64_add() takes the value
+ * first and the atomic64_t pointer second.
+ */
+#define VSOCK_STATS_STREAM_CONSUME(bytes) \
+	atomic64_add((bytes), &vsock_stats_consume_total)
+#define VSOCK_STATS_STREAM_PRODUCE(bytes) \
+	atomic64_add((bytes), &vsock_stats_produce_total)
+#define VSOCK_STATS_CTLPKT_DUMP_ALL() vsock_vmci_stats_ctl_pkt_dump_all()
+#define VSOCK_STATS_HIST_DUMP_ALL()   vsock_vmci_stats_hist_dump_all()
+#define VSOCK_STATS_TOTALS_DUMP_ALL() vsock_vmci_stats_totals_dump_all()
+#define VSOCK_STATS_RESET()   vsock_vmci_stats_reset()
+
+/*
+ *
+ * vsock_vmci_stats_update_queue_bucket_count --
+ *
+ * Given a queue, determine how much data is enqueued and add that to the
+ * specified queue level statistic bucket.
+ *
+ * Results: None.
+ *
+ * Side effects: None.
+ 

[PATCH 5/6] VSOCK: utility functions.

2012-10-15 Thread George Zhang
VSOCK utility functions for Linux VSocket module.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 net/vmw_vsock/util.c |  694 ++
 net/vmw_vsock/util.h |  331 
 2 files changed, 1025 insertions(+), 0 deletions(-)
 create mode 100644 net/vmw_vsock/util.c
 create mode 100644 net/vmw_vsock/util.h

diff --git a/net/vmw_vsock/util.c b/net/vmw_vsock/util.c
new file mode 100644
index 000..036ca4e
--- /dev/null
+++ b/net/vmw_vsock/util.c
@@ -0,0 +1,694 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+/*
+ * util.c --
+ *
+ * Utility functions for Linux VSocket module.
+ */
+
+#include linux/types.h
+#include linux/list.h
+#include linux/socket.h
+#include linux/stddef.h  /* for NULL */
+#include net/sock.h
+
+#include af_vsock.h
+#include util.h
+
+/*
+ * Socket list tables.  NOTE(review): presumably hash buckets of bound and
+ * connected sockets; the purpose of the extra bind-table slot is not visible
+ * here -- confirm against util.h / af_vsock.c.
+ */
+struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
+struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
+
+/* NOTE(review): the name suggests this lock guards the tables above -- confirm. */
+DEFINE_SPINLOCK(vsock_table_lock);
+
+/*
+ *
+ * vsock_vmci_log_pkt --
+ *
+ * Logs the provided packet.
+ *
+ * Results: None.
+ *
+ * Side effects: None.
+ */
+
+void vsock_vmci_log_pkt(char const *function, u32 line, vsock_packet *pkt)
+{
+   char buf[256];
+   char *cur = buf;
+   int left = sizeof buf;
+   int written = 0;
+   char *type_strings[] = {
+   [VSOCK_PACKET_TYPE_INVALID] = INVALID,
+   [VSOCK_PACKET_TYPE_REQUEST] = REQUEST,
+   [VSOCK_PACKET_TYPE_NEGOTIATE] = NEGOTIATE,
+   [VSOCK_PACKET_TYPE_OFFER] = OFFER,
+   [VSOCK_PACKET_TYPE_ATTACH] = ATTACH,
+   [VSOCK_PACKET_TYPE_WROTE] = WROTE,
+   [VSOCK_PACKET_TYPE_READ] = READ,
+   [VSOCK_PACKET_TYPE_RST] = RST,
+   [VSOCK_PACKET_TYPE_SHUTDOWN] = SHUTDOWN,
+   [VSOCK_PACKET_TYPE_WAITING_WRITE] = WAITING_WRITE,
+   [VSOCK_PACKET_TYPE_WAITING_READ] = WAITING_READ,
+   [VSOCK_PACKET_TYPE_REQUEST2] = REQUEST2,
+   [VSOCK_PACKET_TYPE_NEGOTIATE2] = NEGOTIATE2,
+   };
+
+   written = snprintf(cur, left, PKT: %u:%u - %u:%u,
+  VMCI_HANDLE_TO_CONTEXT_ID(pkt-dg.src),
+  pkt-src_port,
+  VMCI_HANDLE_TO_CONTEXT_ID(pkt-dg.dst),
+  pkt-dst_port);
+   if (written = left)
+   goto error;
+
+   left -= written;
+   cur += written;
+
+   switch (pkt-type) {
+   case VSOCK_PACKET_TYPE_REQUEST:
+   case VSOCK_PACKET_TYPE_NEGOTIATE:
+   written = snprintf(cur, left, , %s, size = % FMT64 u,
+  type_strings[pkt-type], pkt-u.size);
+   break;
+
+   case VSOCK_PACKET_TYPE_OFFER:
+   case VSOCK_PACKET_TYPE_ATTACH:
+   written = snprintf(cur, left, , %s, handle = %u:%u,
+  type_strings[pkt-type],
+  VMCI_HANDLE_TO_CONTEXT_ID(pkt-u.handle),
+  VMCI_HANDLE_TO_RESOURCE_ID(pkt-u.handle));
+   break;
+
+   case VSOCK_PACKET_TYPE_WROTE:
+   case VSOCK_PACKET_TYPE_READ:
+   case VSOCK_PACKET_TYPE_RST:
+   written = snprintf(cur, left, , %s, type_strings[pkt-type]);
+   break;
+   case VSOCK_PACKET_TYPE_SHUTDOWN: {
+   bool recv;
+   bool send;
+
+   recv = pkt-u.mode  RCV_SHUTDOWN;
+   send = pkt-u.mode  SEND_SHUTDOWN;
+   written = snprintf(cur, left, , %s, mode = %c%c,
+  type_strings[pkt-type],
+  recv ? 'R' : ' ', send ? 'S' : ' ');
+   }
+   break;
+
+   case VSOCK_PACKET_TYPE_WAITING_WRITE:
+   case VSOCK_PACKET_TYPE_WAITING_READ:
+   written = snprintf(cur, left, , %s, generation = % FMT64 u, 
+  offset = % FMT64 u,
+  type_strings[pkt-type],
+  pkt-u.wait.generation, pkt-u.wait.offset);
+
+   break;
+
+   case VSOCK_PACKET_TYPE_REQUEST2:
+   case VSOCK_PACKET_TYPE_NEGOTIATE2:
+   written = snprintf(cur, left, , %s, size = % FMT64 u, 
+  proto = %u,
+  type_strings[pkt-type], pkt-u.size,
+  pkt-proto);
+ 

[PATCH 6/6] VSOCK: header and config files.

2012-10-15 Thread George Zhang
VSOCK header files, Makefiles and Kconfig systems for Linux VSocket module.


Signed-off-by: George Zhang georgezh...@vmware.com
---
 net/Kconfig |1 
 net/Makefile|1 
 net/vmw_vsock/Kconfig   |   14 +
 net/vmw_vsock/Makefile  |8 
 net/vmw_vsock/notify_qstate.c   |  670 +++
 net/vmw_vsock/vmci_sockets.h|  517 +++
 net/vmw_vsock/vmci_sockets_packet.h |  107 ++
 net/vmw_vsock/vsock_common.h|  130 +++
 net/vmw_vsock/vsock_packet.h|  131 +++
 net/vmw_vsock/vsock_version.h   |   29 ++
 10 files changed, 1608 insertions(+), 0 deletions(-)
 create mode 100644 net/vmw_vsock/Kconfig
 create mode 100644 net/vmw_vsock/Makefile
 create mode 100644 net/vmw_vsock/notify_qstate.c
 create mode 100644 net/vmw_vsock/vmci_sockets.h
 create mode 100644 net/vmw_vsock/vmci_sockets_packet.h
 create mode 100644 net/vmw_vsock/vsock_common.h
 create mode 100644 net/vmw_vsock/vsock_packet.h
 create mode 100644 net/vmw_vsock/vsock_version.h

diff --git a/net/Kconfig b/net/Kconfig
index 245831b..75b8d5e 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -216,6 +216,7 @@ source net/dcb/Kconfig
 source net/dns_resolver/Kconfig
 source net/batman-adv/Kconfig
 source net/openvswitch/Kconfig
+source net/vmw_vsock/Kconfig
 
 config RPS
boolean
diff --git a/net/Makefile b/net/Makefile
index 4f4ee08..cae59f4 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -70,3 +70,4 @@ obj-$(CONFIG_CEPH_LIB)+= ceph/
 obj-$(CONFIG_BATMAN_ADV)   += batman-adv/
 obj-$(CONFIG_NFC)  += nfc/
 obj-$(CONFIG_OPENVSWITCH)  += openvswitch/
+obj-$(CONFIG_VMWARE_VSOCK) += vmw_vsock/
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
new file mode 100644
index 000..95e2568
--- /dev/null
+++ b/net/vmw_vsock/Kconfig
@@ -0,0 +1,14 @@
+#
+# Vsock protocol
+#
+
+config VMWARE_VSOCK
+   tristate "Virtual Socket protocol"
+   depends on VMWARE_VMCI
+   help
+ Virtual Socket Protocol is a socket protocol similar to TCP/IP
+ allowing communication between Virtual Machines and VMware
+ hypervisor.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vsock. If unsure, say N.
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
new file mode 100644
index 000..8cb1e1c
--- /dev/null
+++ b/net/vmw_vsock/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_VMWARE_VSOCK) += vmw_vsock.o
+ccflags-y += -I$(src)/shared
+vmw_vsock-y += af_vsock.o
+vmw_vsock-y += notify.o
+vmw_vsock-y += notify_qstate.o
+vmw_vsock-y += stats.o
+vmw_vsock-y += util.o
+vmw_vsock-y += vsock_addr.o
diff --git a/net/vmw_vsock/notify_qstate.c b/net/vmw_vsock/notify_qstate.c
new file mode 100644
index 000..fafb76c
--- /dev/null
+++ b/net/vmw_vsock/notify_qstate.c
@@ -0,0 +1,670 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2009-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+/*
+ * notifyQState.c --
+ *
+ * Linux control notifications based on Queuepair state for the VMCI Stream
+ * Sockets protocol.
+ */
+
+#include <linux/types.h>
+
+#include <linux/socket.h>
+
+#include <linux/stddef.h>  /* for NULL */
+#include <net/sock.h>
+
+#include "notify.h"
+#include "af_vsock.h"
+
+#define PKT_FIELD(vsk, field_name) \
+   (vsk)->notify.pkt_q_state.field_name
+
+/*
+ *
+ * vsock_vmci_notify_waiting_write --
+ *
+ * Determines if the conditions have been met to notify a waiting writer.
+ *
+ * Results: true if a notification should be sent, false otherwise.
+ *
+ * Side effects: None.
+ */
+
+static bool vsock_vmci_notify_waiting_write(vsock_vmci_sock *vsk)
+{
+   bool retval;
+   u64 notify_limit;
+
+   if (!PKT_FIELD(vsk, peer_waiting_write))
+   return false;
+
+   /*
+* When the sender blocks, we take that as a sign that the sender is
+* faster than the receiver. To reduce the transmit rate of the sender,
+* we delay the sending of the read notification by decreasing the
+* write_notify_window. The notification is delayed until the number of
+* bytes used in the queue drops below the write_notify_window.
+*/
+
+   if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
+   PKT_FIELD(vsk, peer_waiting_write_detected) = true;
+   if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
+   PKT_FIELD(vsk,