[PATCH 1/2 v1] blkdrv: Add queue limits parameters for sg block drive

2012-08-21 Thread Cong Meng
This patch adds some queue limit parameters into block drive. And inits them
for sg block drive. Some interfaces are also added for accessing them.

Signed-off-by: Cong Meng m...@linux.vnet.ibm.com
---
 block/raw-posix.c |   58 +
 block_int.h   |4 +++
 blockdev.c|   15 +
 hw/block-common.h |3 ++
 4 files changed, 80 insertions(+), 0 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 0dce089..a086f89 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -53,6 +53,7 @@
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
+#include <dirent.h>
 #endif
 #ifdef CONFIG_FIEMAP
 #include <linux/fiemap.h>
@@ -829,6 +830,62 @@ static int hdev_probe_device(const char *filename)
 return 0;
 }
 
+static void read_queue_limit(char *path, const char *filename,
+                             unsigned int *val)
+{
+    FILE *f;
+    char *tail = path + strlen(path);
+
+    pstrcat(path, MAXPATHLEN, filename);
+    f = fopen(path, "r");
+    if (!f) {
+        goto out;
+    }
+
+    fscanf(f, "%u", val);
+    fclose(f);
+
+out:
+    *tail = 0;
+}
+
+static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
+{
+    DIR *ffs;
+    struct dirent *d;
+    char path[MAXPATHLEN];
+
+    snprintf(path, MAXPATHLEN,
+             "/sys/class/scsi_generic/sg%s/device/block/",
+             filename + strlen("/dev/sg"));
+
+    ffs = opendir(path);
+    if (!ffs) {
+        return;
+    }
+
+    for (;;) {
+        d = readdir(ffs);
+        if (!d) {
+            return;
+        }
+
+        if (strcmp(d->d_name, ".") == 0 || strcmp(d->d_name, "..") == 0) {
+            continue;
+        }
+
+        break;
+    }
+
+    closedir(ffs);
+
+    pstrcat(path, MAXPATHLEN, d->d_name);
+    pstrcat(path, MAXPATHLEN, "/queue/");
+
+    read_queue_limit(path, "max_sectors_kb", &bs->max_sectors);
+    read_queue_limit(path, "max_segments", &bs->max_segments);
+    read_queue_limit(path, "max_segment_size", &bs->max_segment_size);
+}
+
 static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
@@ -868,6 +925,7 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
         temp = realpath(filename, resolved_path);
         if (temp && strstart(temp, "/dev/sg", NULL)) {
             bs->sg = 1;
+            sg_get_queue_limits(bs, temp);
         }
     }
 #endif
diff --git a/block_int.h b/block_int.h
index d72317f..a9d07a2 100644
--- a/block_int.h
+++ b/block_int.h
@@ -333,6 +333,10 @@ struct BlockDriverState {
 
 /* long-running background operation */
 BlockJob *job;
+
+unsigned int max_sectors;
+unsigned int max_segments;
+unsigned int max_segment_size;
 };
 
 int get_tmp_filename(char *filename, int size);
diff --git a/blockdev.c b/blockdev.c
index 3d75015..e17edbf 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1191,3 +1191,18 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 bdrv_iterate(do_qmp_query_block_jobs_one, prev);
 return dummy.next;
 }
+
+unsigned int get_queue_max_sectors(BlockDriverState *bs)
+{
+    return (bs->file && bs->file->sg) ? bs->file->max_sectors : 0;
+}
+
+unsigned int get_queue_max_segments(BlockDriverState *bs)
+{
+    return (bs->file && bs->file->sg) ? bs->file->max_segments : 0;
+}
+
+unsigned int get_queue_max_segment_size(BlockDriverState *bs)
+{
+    return (bs->file && bs->file->sg) ? bs->file->max_segment_size : 0;
+}
diff --git a/hw/block-common.h b/hw/block-common.h
index bb808f7..d47fcd7 100644
--- a/hw/block-common.h
+++ b/hw/block-common.h
@@ -76,4 +76,7 @@ void hd_geometry_guess(BlockDriverState *bs,
int *ptrans);
 int hd_bios_chs_auto_trans(uint32_t cyls, uint32_t heads, uint32_t secs);
 
+unsigned int get_queue_max_sectors(BlockDriverState *bs);
+unsigned int get_queue_max_segments(BlockDriverState *bs);
+unsigned int get_queue_max_segment_size(BlockDriverState *bs);
 #endif
-- 
1.7.7.6

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH 2/2 v1] virtio-scsi: set per-LUN queue limits for sg devices

2012-08-21 Thread Cong Meng
Each virtio scsi HBA has global request queue limits. But the passthrough
LUNs (scsi-generic) that come from different host HBAs may have different
request queue limits. If the guest sends commands that exceed the host limits,
the commands will be rejected by the host HBA.

To address the issue, this patch responds to the newly added virtio control
queue request by returning the per-LUN queue limits.

Signed-off-by: Cong Meng m...@linux.vnet.ibm.com
---
 hw/virtio-scsi.c |   65 ++
 1 files changed, 65 insertions(+), 0 deletions(-)

diff --git a/hw/virtio-scsi.c b/hw/virtio-scsi.c
index c4a5b22..3c0bd99 100644
--- a/hw/virtio-scsi.c
+++ b/hw/virtio-scsi.c
@@ -28,6 +28,7 @@
 #define VIRTIO_SCSI_F_INOUT0
 #define VIRTIO_SCSI_F_HOTPLUG  1
 #define VIRTIO_SCSI_F_CHANGE   2
+#define VIRTIO_SCSI_F_LUN_QUERY3
 
 /* Response codes */
 #define VIRTIO_SCSI_S_OK   0
@@ -48,6 +49,7 @@
 #define VIRTIO_SCSI_T_TMF  0
 #define VIRTIO_SCSI_T_AN_QUERY 1
 #define VIRTIO_SCSI_T_AN_SUBSCRIBE 2
+#define VIRTIO_SCSI_T_LUN_QUERY3
 
 /* Valid TMF subtypes.  */
 #define VIRTIO_SCSI_T_TMF_ABORT_TASK   0
@@ -66,6 +68,11 @@
 #define VIRTIO_SCSI_T_ASYNC_NOTIFY 2
 #define VIRTIO_SCSI_T_PARAM_CHANGE 3
 
+/* LUN Query */
+#define VIRTIO_SCSI_T_LQ_MAX_SECTORS   0
+#define VIRTIO_SCSI_T_LQ_MAX_SEGMENTS  1
+#define VIRTIO_SCSI_T_LQ_MAX_SEGMENT_SIZE  2
+
 /* Reasons for transport reset event */
 #define VIRTIO_SCSI_EVT_RESET_HARD 0
 #define VIRTIO_SCSI_EVT_RESET_RESCAN   1
@@ -115,6 +122,18 @@ typedef struct {
 uint8_t response;
 } QEMU_PACKED VirtIOSCSICtrlANResp;
 
+/* LUN query */
+typedef struct {
+uint32_t type;
+uint8_t lun[8];
+uint32_t subtype;
+} QEMU_PACKED VirtIOSCSICtrlLQReq;
+
+typedef struct {
+uint32_t response;
+uint32_t value;
+} QEMU_PACKED VirtIOSCSICtrlLQResp;
+
 typedef struct {
 uint32_t event;
 uint8_t lun[8];
@@ -160,6 +179,7 @@ typedef struct VirtIOSCSIReq {
 VirtIOSCSICmdReq  *cmd;
 VirtIOSCSICtrlTMFReq  *tmf;
 VirtIOSCSICtrlANReq   *an;
+VirtIOSCSICtrlLQReq   *lq;
 } req;
 union {
 char  *buf;
@@ -167,6 +187,7 @@ typedef struct VirtIOSCSIReq {
 VirtIOSCSICtrlTMFResp *tmf;
 VirtIOSCSICtrlANResp  *an;
 VirtIOSCSIEvent   *event;
+VirtIOSCSICtrlLQResp  *lq;
 } resp;
 } VirtIOSCSIReq;
 
@@ -285,6 +306,43 @@ static void *virtio_scsi_load_request(QEMUFile *f, 
SCSIRequest *sreq)
 return req;
 }
 
+static void virtio_scsi_do_lun_query(VirtIOSCSI *s, VirtIOSCSIReq *req)
+{
+    SCSIDevice *d = virtio_scsi_device_find(s, req->req.lq->lun);
+
+    if (!d) {
+        goto fail;
+    }
+
+    switch (req->req.lq->subtype) {
+    case VIRTIO_SCSI_T_LQ_MAX_SECTORS:
+        req->resp.lq->value = get_queue_max_sectors(d->conf.bs);
+        if (!req->resp.lq->value) {
+            goto fail;
+        }
+        break;
+    case VIRTIO_SCSI_T_LQ_MAX_SEGMENTS:
+        req->resp.lq->value = get_queue_max_segments(d->conf.bs);
+        if (!req->resp.lq->value) {
+            goto fail;
+        }
+        break;
+    case VIRTIO_SCSI_T_LQ_MAX_SEGMENT_SIZE:
+        req->resp.lq->value = get_queue_max_segment_size(d->conf.bs);
+        if (!req->resp.lq->value) {
+            goto fail;
+        }
+        break;
+    default:
+        goto fail;
+    }
+
+    req->resp.lq->response = VIRTIO_SCSI_S_OK;
+    return;
+fail:
+    req->resp.lq->response = VIRTIO_SCSI_S_FAILURE;
+}
+
 static void virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
 {
 SCSIDevice *d = virtio_scsi_device_find(s, req-req.tmf-lun);
@@ -414,6 +472,12 @@ static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
         }
         req->resp.an->event_actual = 0;
         req->resp.an->response = VIRTIO_SCSI_S_OK;
+    } else if (req->req.lq->type == VIRTIO_SCSI_T_LUN_QUERY) {
+        if (out_size < sizeof(VirtIOSCSICtrlLQReq) ||
+            in_size < sizeof(VirtIOSCSICtrlLQResp)) {
+            virtio_scsi_bad_req();
+        }
+        virtio_scsi_do_lun_query(s, req);
     }
     virtio_scsi_complete_req(req);
 }
@@ -557,6 +621,7 @@ static uint32_t virtio_scsi_get_features(VirtIODevice *vdev,
 {
     requested_features |= (1UL << VIRTIO_SCSI_F_HOTPLUG);
     requested_features |= (1UL << VIRTIO_SCSI_F_CHANGE);
+    requested_features |= (1UL << VIRTIO_SCSI_F_LUN_QUERY);
     return requested_features;
 }
 
-- 
1.7.7.6

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v1] virtio-scsi: get and set the queue limits for sg device

2012-08-21 Thread Cong Meng
Each virtio scsi HBA has global request queue limits. But the passthrough
LUNs (scsi-generic) that come from different host HBAs may have different
request queue limits. If the guest sends commands that exceed the host limits,
the commands will be rejected by the host HBA.

This patch addresses this issue by getting the per-LUN queue limits via the
newly added virtio control request, then setting them properly.

Signed-off-by: Cong Meng m...@linux.vnet.ibm.com
---
 drivers/scsi/virtio_scsi.c  |  113 +--
 include/linux/virtio_scsi.h |   18 +++
 2 files changed, 116 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 173cb39..ec5066f 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -35,12 +35,14 @@ struct virtio_scsi_cmd {
struct virtio_scsi_cmd_req   cmd;
struct virtio_scsi_ctrl_tmf_req  tmf;
struct virtio_scsi_ctrl_an_req   an;
+   struct virtio_scsi_ctrl_lq_req   lq;
} req;
union {
struct virtio_scsi_cmd_resp  cmd;
struct virtio_scsi_ctrl_tmf_resp tmf;
struct virtio_scsi_ctrl_an_resp  an;
struct virtio_scsi_event evt;
+   struct virtio_scsi_ctrl_lq_resp  lq;
} resp;
 } cacheline_aligned_in_smp;
 
@@ -469,6 +471,46 @@ out:
return ret;
 }
 
+static u32 virtscsi_lun_query(struct scsi_device *sdev, u32 *value,
+			      u32 subtype)
+{
+	struct Scsi_Host *shost = sdev->host;
+	struct virtio_scsi *vscsi = shost_priv(shost);
+	DECLARE_COMPLETION_ONSTACK(comp);
+	struct virtio_scsi_cmd *cmd;
+	struct virtio_scsi_target_state *tgt = vscsi->tgt[sdev->id];
+	unsigned int ret = VIRTIO_SCSI_S_FAILURE;
+
+	cmd = mempool_alloc(virtscsi_cmd_pool, GFP_ATOMIC);
+	if (!cmd)
+		goto out;
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->comp = &comp;
+	cmd->req.lq = (struct virtio_scsi_ctrl_lq_req){
+		.type = VIRTIO_SCSI_T_LUN_QUERY,
+		.subtype = subtype,
+		.lun[0] = 1,
+		.lun[1] = sdev->id,
+		.lun[2] = (sdev->lun >> 8) | 0x40,
+		.lun[3] = sdev->lun & 0xff,
+	};
+
+	if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, sizeof cmd->req.lq,
+			      sizeof cmd->resp.lq, GFP_NOIO) < 0) {
+		goto out;
+	}
+
+	wait_for_completion(&comp);
+
+	ret = cmd->resp.lq.response;
+	if (ret == VIRTIO_SCSI_S_OK) {
+		*value = cmd->resp.lq.value;
+	}
+out:
+	mempool_free(cmd, virtscsi_cmd_pool);
+	return ret;
+}
+
 static int virtscsi_device_reset(struct scsi_cmnd *sc)
 {
struct virtio_scsi *vscsi = shost_priv(sc-device-host);
@@ -516,20 +558,6 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
return virtscsi_tmf(vscsi, cmd);
 }
 
-static struct scsi_host_template virtscsi_host_template = {
-   .module = THIS_MODULE,
-   .name = Virtio SCSI HBA,
-   .proc_name = virtio_scsi,
-   .queuecommand = virtscsi_queuecommand,
-   .this_id = -1,
-   .eh_abort_handler = virtscsi_abort,
-   .eh_device_reset_handler = virtscsi_device_reset,
-
-   .can_queue = 1024,
-   .dma_boundary = UINT_MAX,
-   .use_clustering = ENABLE_CLUSTERING,
-};
-
 #define virtscsi_config_get(vdev, fld) \
({ \
typeof(((struct virtio_scsi_config *)0)-fld) __val; \
@@ -547,6 +575,60 @@ static struct scsi_host_template virtscsi_host_template = {
  __val, sizeof(__val)); \
})
 
+static u32 virtscsi_max_sectors(struct scsi_device *sdev, u32 *value)
+{
+	return virtscsi_lun_query(sdev, value, VIRTIO_SCSI_T_LQ_MAX_SECTORS);
+}
+
+static u32 virtscsi_max_segments(struct scsi_device *sdev, u32 *value)
+{
+	return virtscsi_lun_query(sdev, value, VIRTIO_SCSI_T_LQ_MAX_SEGMENTS);
+}
+
+static u32 virtscsi_max_segment_size(struct scsi_device *sdev, u32 *value)
+{
+	return virtscsi_lun_query(sdev, value,
+				  VIRTIO_SCSI_T_LQ_MAX_SEGMENT_SIZE);
+}
+
+static int virtscsi_slave_alloc(struct scsi_device *sdev)
+{
+	struct Scsi_Host *shost = sdev->host;
+	struct virtio_scsi *vscsi = shost_priv(shost);
+	struct virtio_device *vdev = vscsi->vdev;
+	struct request_queue *q = sdev->request_queue;
+	unsigned int max_sectors, max_segments, max_segment_size;
+
+	if (!virtio_has_feature(vdev, VIRTIO_SCSI_F_LUN_QUERY))
+		goto out;
+
+	if (virtscsi_max_sectors(sdev, &max_sectors) ||
+	    virtscsi_max_segments(sdev, &max_segments) ||
+	    virtscsi_max_segment_size(sdev, &max_segment_size)) {
+		goto out;
+	}
+
+	blk_queue_max_hw_sectors(q, max_sectors);
+	blk_queue_max_segments(q, max_segments - 2);
+	blk_queue_max_segment_size(q, max_segment_size);

Re: [PATCH 1/2 v1] blkdrv: Add queue limits parameters for sg block drive

2012-08-21 Thread Paolo Bonzini
Il 21/08/2012 10:23, Cong Meng ha scritto:
 +static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
 +{
 +DIR *ffs;
 +struct dirent *d;
 +char path[MAXPATHLEN];
 +
 +snprintf(path, MAXPATHLEN,
 + /sys/class/scsi_generic/sg%s/device/block/,
 + filename + strlen(/dev/sg));
 +
 +ffs = opendir(path);
 +if (!ffs) {
 +return;
 +}
 +
 +for (;;) {
 +d = readdir(ffs);
 +if (!d) {
 +return;
 +}
 +
 +if (strcmp(d-d_name, .) == 0 || strcmp(d-d_name, ..) == 0) {
 +continue;
 +}
 +
 +break;
 +}
 +
 +closedir(ffs);
 +
 +pstrcat(path, MAXPATHLEN, d-d_name);
 +pstrcat(path, MAXPATHLEN, /queue/);
 +
 +read_queue_limit(path, max_sectors_kb, bs-max_sectors);
 +read_queue_limit(path, max_segments, bs-max_segments);
 +read_queue_limit(path, max_segment_size, bs-max_segment_size);
 +}

Using /sys/dev/block or /sys/dev/char seems easier, and lets you
retrieve the parameters for block devices too.

However, I'm worried of the consequences this has for migration.  You
could have the same physical disk accessed with two different HBAs, with
different limits.  So I don't know if this can really be solved at all.

Paolo
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v1] virtio-scsi: get and set the queue limits for sg device

2012-08-21 Thread Stefan Hajnoczi
On Tue, Aug 21, 2012 at 9:26 AM, Cong Meng m...@linux.vnet.ibm.com wrote:
 Each virtio scsi HBA has global request queue limits. But the passthrough
 LUNs (scsi-generic) come from different host HBAs may have different request
 queue limits. If the guest sends commands that exceed the host limits, the
 commands will be rejected by host HAB.

 This patch addresses this issue by getting the per-LUN queue limits via the 
 the
 newly added virtio control request, then setting them properly.

 Signed-off-by: Cong Meng m...@linux.vnet.ibm.com
 ---
  drivers/scsi/virtio_scsi.c  |  113 
 +--
  include/linux/virtio_scsi.h |   18 +++
  2 files changed, 116 insertions(+), 15 deletions(-)

 diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
 index 173cb39..ec5066f 100644
 --- a/drivers/scsi/virtio_scsi.c
 +++ b/drivers/scsi/virtio_scsi.c
 @@ -35,12 +35,14 @@ struct virtio_scsi_cmd {
 struct virtio_scsi_cmd_req   cmd;
 struct virtio_scsi_ctrl_tmf_req  tmf;
 struct virtio_scsi_ctrl_an_req   an;
 +   struct virtio_scsi_ctrl_lq_req   lq;
 } req;
 union {
 struct virtio_scsi_cmd_resp  cmd;
 struct virtio_scsi_ctrl_tmf_resp tmf;
 struct virtio_scsi_ctrl_an_resp  an;
 struct virtio_scsi_event evt;
 +   struct virtio_scsi_ctrl_lq_resp  lq;
 } resp;
  } cacheline_aligned_in_smp;

 @@ -469,6 +471,46 @@ out:
 return ret;
  }

 +static u32 virtscsi_lun_query(struct scsi_device *sdev, u32 *value, u32 
 subtype)
 +{
 +   struct Scsi_Host *shost = sdev-host;
 +   struct virtio_scsi *vscsi = shost_priv(shost);
 +   DECLARE_COMPLETION_ONSTACK(comp);
 +   struct virtio_scsi_cmd *cmd;
 +   struct virtio_scsi_target_state *tgt = vscsi-tgt[sdev-id];
 +   unsigned int ret = VIRTIO_SCSI_S_FAILURE;
 +
 +   cmd = mempool_alloc(virtscsi_cmd_pool, GFP_ATOMIC);
 +   if (!cmd)
 +   goto out;
 +
 +   memset(cmd, 0, sizeof(*cmd));
 +   cmd-comp = comp;
 +   cmd-req.lq = (struct virtio_scsi_ctrl_lq_req){
 +   .type = VIRTIO_SCSI_T_LUN_QUERY,
 +   .subtype = subtype,
 +   .lun[0] = 1,
 +   .lun[1] = sdev-id,
 +   .lun[2] = (sdev-lun  8) | 0x40,
 +   .lun[3] = sdev-lun  0xff,

The LUN addressing code has been duplicated several times now.  How
about replacing it with something like

static void virtio_scsi_set_lun(u8 *lun, struct scsi_device *sdev)
{
	lun[0] = 1;
	lun[1] = sdev->id;
	lun[2] = (sdev->lun >> 8) | 0x40;
	lun[3] = sdev->lun & 0xff;
	lun[4] = lun[5] = lun[6] = lun[7] = 0;
}
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/2 v1] blkdrv: Add queue limits parameters for sg block drive

2012-08-21 Thread Cong Meng



On Tue 21 Aug 2012 04:48:17 PM CST, Paolo Bonzini wrote:

Il 21/08/2012 10:23, Cong Meng ha scritto:

+static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
+{
+DIR *ffs;
+struct dirent *d;
+char path[MAXPATHLEN];
+
+snprintf(path, MAXPATHLEN,
+ /sys/class/scsi_generic/sg%s/device/block/,
+ filename + strlen(/dev/sg));
+
+ffs = opendir(path);
+if (!ffs) {
+return;
+}
+
+for (;;) {
+d = readdir(ffs);
+if (!d) {
+return;
+}
+
+if (strcmp(d-d_name, .) == 0 || strcmp(d-d_name, ..) == 0) {
+continue;
+}
+
+break;
+}
+
+closedir(ffs);
+
+pstrcat(path, MAXPATHLEN, d-d_name);
+pstrcat(path, MAXPATHLEN, /queue/);
+
+read_queue_limit(path, max_sectors_kb, bs-max_sectors);
+read_queue_limit(path, max_segments, bs-max_segments);
+read_queue_limit(path, max_segment_size, bs-max_segment_size);
+}


Using /sys/dev/block or /sys/dev/char seems easier, and lets you
retrieve the parameters for block devices too.

what do you mean with block devices?   Using /dev/sda instead of 
/dev/sg0?



However, I'm worried of the consequences this has for migration.  You
could have the same physical disk accessed with two different HBAs, with
different limits.  So I don't know if this can really be solved at all.

I know little about qemu migration now.  The pending scsi commands will 
be saved and

transfered to remote machine when starting migration?

Cong.


Paolo



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v1] virtio-scsi: get and set the queue limits for sg device

2012-08-21 Thread Cong Meng



On Tue 21 Aug 2012 04:53:59 PM CST, Stefan Hajnoczi wrote:

On Tue, Aug 21, 2012 at 9:26 AM, Cong Meng m...@linux.vnet.ibm.com wrote:

Each virtio scsi HBA has global request queue limits. But the passthrough
LUNs (scsi-generic) come from different host HBAs may have different request
queue limits. If the guest sends commands that exceed the host limits, the
commands will be rejected by host HAB.

This patch addresses this issue by getting the per-LUN queue limits via the the
newly added virtio control request, then setting them properly.

Signed-off-by: Cong Meng m...@linux.vnet.ibm.com
---
  drivers/scsi/virtio_scsi.c  |  113 +--
  include/linux/virtio_scsi.h |   18 +++
  2 files changed, 116 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 173cb39..ec5066f 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -35,12 +35,14 @@ struct virtio_scsi_cmd {
 struct virtio_scsi_cmd_req   cmd;
 struct virtio_scsi_ctrl_tmf_req  tmf;
 struct virtio_scsi_ctrl_an_req   an;
+   struct virtio_scsi_ctrl_lq_req   lq;
 } req;
 union {
 struct virtio_scsi_cmd_resp  cmd;
 struct virtio_scsi_ctrl_tmf_resp tmf;
 struct virtio_scsi_ctrl_an_resp  an;
 struct virtio_scsi_event evt;
+   struct virtio_scsi_ctrl_lq_resp  lq;
 } resp;
  } cacheline_aligned_in_smp;

@@ -469,6 +471,46 @@ out:
 return ret;
  }

+static u32 virtscsi_lun_query(struct scsi_device *sdev, u32 *value, u32 
subtype)
+{
+   struct Scsi_Host *shost = sdev-host;
+   struct virtio_scsi *vscsi = shost_priv(shost);
+   DECLARE_COMPLETION_ONSTACK(comp);
+   struct virtio_scsi_cmd *cmd;
+   struct virtio_scsi_target_state *tgt = vscsi-tgt[sdev-id];
+   unsigned int ret = VIRTIO_SCSI_S_FAILURE;
+
+   cmd = mempool_alloc(virtscsi_cmd_pool, GFP_ATOMIC);
+   if (!cmd)
+   goto out;
+
+   memset(cmd, 0, sizeof(*cmd));
+   cmd-comp = comp;
+   cmd-req.lq = (struct virtio_scsi_ctrl_lq_req){
+   .type = VIRTIO_SCSI_T_LUN_QUERY,
+   .subtype = subtype,
+   .lun[0] = 1,
+   .lun[1] = sdev-id,
+   .lun[2] = (sdev-lun  8) | 0x40,
+   .lun[3] = sdev-lun  0xff,


The LUN addressing code has been duplicated several times now.  How
about replacing it with something like


sure. I will include it.

Thanks.
Cong.



static void virtio_scsi_set_lun(u8 *lun, struct scsi_device *sdev)
{
 lun[0] = 1;
 lun[1] = sdev-id;
 lun[2] = (sdev-lun  8) | 0x40;
 lun[3] = sdev-lun  0xff;
 lun[4] = lun[5] = lun[6] = lun[7] = 0;
}



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/2 v1] blkdrv: Add queue limits parameters for sg block drive

2012-08-21 Thread Stefan Hajnoczi
On Tue, Aug 21, 2012 at 9:23 AM, Cong Meng m...@linux.vnet.ibm.com wrote:
 This patch adds some queue limit parameters into block drive. And inits them
 for sg block drive. Some interfaces are also added for accessing them.

 Signed-off-by: Cong Meng m...@linux.vnet.ibm.com
 ---
  block/raw-posix.c |   58 
 +
  block_int.h   |4 +++
  blockdev.c|   15 +
  hw/block-common.h |3 ++
  4 files changed, 80 insertions(+), 0 deletions(-)

 diff --git a/block/raw-posix.c b/block/raw-posix.c
 index 0dce089..a086f89 100644
 --- a/block/raw-posix.c
 +++ b/block/raw-posix.c
 @@ -53,6 +53,7 @@
  #include linux/cdrom.h
  #include linux/fd.h
  #include linux/fs.h
 +#include dirent.h
  #endif
  #ifdef CONFIG_FIEMAP
  #include linux/fiemap.h
 @@ -829,6 +830,62 @@ static int hdev_probe_device(const char *filename)
  return 0;
  }

 +static void read_queue_limit(char *path, const char *filename, unsigned int 
 *val)
 +{
 +FILE *f;
 +char *tail = path + strlen(path);
 +
 +pstrcat(path, MAXPATHLEN, filename);
 +f = fopen(path, r);
 +if (!f) {
 +goto out;
 +}
 +
 +fscanf(f, %u, val);
 +fclose(f);
 +
 +out:
 +*tail = 0;
 +}
 +
 +static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
 +{
 +DIR *ffs;
 +struct dirent *d;
 +char path[MAXPATHLEN];
 +
 +snprintf(path, MAXPATHLEN,
 + /sys/class/scsi_generic/sg%s/device/block/,
 + filename + strlen(/dev/sg));
 +
 +ffs = opendir(path);
 +if (!ffs) {
 +return;
 +}
 +
 +for (;;) {
 +d = readdir(ffs);
 +if (!d) {
 +return;

Leaks ffs

 +}
 +
 +if (strcmp(d-d_name, .) == 0 || strcmp(d-d_name, ..) == 0) {
 +continue;
 +}
 +
 +break;
 +}

Not sure where in the kernel the block/ sysfs directory is created for
the device.  I wasn't able to check that there is only ever 1
directory entry for a SCSI device's block/.  Any ideas whether it's
safe to grab the first directory entry?

 +
 +closedir(ffs);
 +
 +pstrcat(path, MAXPATHLEN, d-d_name);
 +pstrcat(path, MAXPATHLEN, /queue/);
 +
 +read_queue_limit(path, max_sectors_kb, bs-max_sectors);
 +read_queue_limit(path, max_segments, bs-max_segments);
 +read_queue_limit(path, max_segment_size, bs-max_segment_size);
 +}
 +
  static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
  {
  BDRVRawState *s = bs-opaque;
 @@ -868,6 +925,7 @@ static int hdev_open(BlockDriverState *bs, const char 
 *filename, int flags)
  temp = realpath(filename, resolved_path);
  if (temp  strstart(temp, /dev/sg, NULL)) {
  bs-sg = 1;
 +sg_get_queue_limits(bs, temp);
  }
  }
  #endif
 diff --git a/block_int.h b/block_int.h
 index d72317f..a9d07a2 100644
 --- a/block_int.h
 +++ b/block_int.h
 @@ -333,6 +333,10 @@ struct BlockDriverState {

  /* long-running background operation */
  BlockJob *job;
 +
 +unsigned int max_sectors;
 +unsigned int max_segments;
 +unsigned int max_segment_size;
  };

  int get_tmp_filename(char *filename, int size);
 diff --git a/blockdev.c b/blockdev.c
 index 3d75015..e17edbf 100644
 --- a/blockdev.c
 +++ b/blockdev.c
 @@ -1191,3 +1191,18 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
  bdrv_iterate(do_qmp_query_block_jobs_one, prev);
  return dummy.next;
  }
 +
 +unsigned int get_queue_max_sectors(BlockDriverState *bs)

These should be bdrv_get_queue_max_sectors(BlockDriverState *bs) and
should live in block.c.

 +{
 +return (bs-file  bs-file-sg) ? bs-file-max_sectors : 0;

The BlockDriver should be able to provide these values, we shouldn't
reach into bs-file.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/2 v1] blkdrv: Add queue limits parameters for sg block drive

2012-08-21 Thread Stefan Hajnoczi
On Tue, Aug 21, 2012 at 10:41 AM, Cong Meng m...@linux.vnet.ibm.com wrote:


 On Tue 21 Aug 2012 04:48:17 PM CST, Paolo Bonzini wrote:

 Il 21/08/2012 10:23, Cong Meng ha scritto:

 +static void sg_get_queue_limits(BlockDriverState *bs, const char
 *filename)
 +{
 +DIR *ffs;
 +struct dirent *d;
 +char path[MAXPATHLEN];
 +
 +snprintf(path, MAXPATHLEN,
 + /sys/class/scsi_generic/sg%s/device/block/,
 + filename + strlen(/dev/sg));
 +
 +ffs = opendir(path);
 +if (!ffs) {
 +return;
 +}
 +
 +for (;;) {
 +d = readdir(ffs);
 +if (!d) {
 +return;
 +}
 +
 +if (strcmp(d-d_name, .) == 0 || strcmp(d-d_name, ..) == 0)
 {
 +continue;
 +}
 +
 +break;
 +}
 +
 +closedir(ffs);
 +
 +pstrcat(path, MAXPATHLEN, d-d_name);
 +pstrcat(path, MAXPATHLEN, /queue/);
 +
 +read_queue_limit(path, max_sectors_kb, bs-max_sectors);
 +read_queue_limit(path, max_segments, bs-max_segments);
 +read_queue_limit(path, max_segment_size, bs-max_segment_size);
 +}


 Using /sys/dev/block or /sys/dev/char seems easier, and lets you
 retrieve the parameters for block devices too.

 what do you mean with block devices?   Using /dev/sda instead of
 /dev/sg0?


 However, I'm worried of the consequences this has for migration.  You
 could have the same physical disk accessed with two different HBAs, with
 different limits.  So I don't know if this can really be solved at all.

 I know little about qemu migration now.  The pending scsi commands will be
 saved and
 transfered to remote machine when starting migration?

Passthrough is already a migration blocker if both hosts do not have
access to the same LUNs.

When both hosts do have access to the same LUNs it's possible to
extract the block queue limits (using sysfs) and compare them.

Today you can start QEMU with different image files on both hosts.
Migration will appear to work but the disk image on the destination
host could be junk.  This is a similar case, I don't see a problem
except that there should be a safety check (maybe at the libvirt
level) to make this safe.

Stefan
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/2 v1] virtio-scsi: set per-LUN queue limits for sg devices

2012-08-21 Thread Stefan Hajnoczi
On Tue, Aug 21, 2012 at 9:23 AM, Cong Meng m...@linux.vnet.ibm.com wrote:
 @@ -557,6 +621,7 @@ static uint32_t virtio_scsi_get_features(VirtIODevice 
 *vdev,
  {
  requested_features |= (1UL  VIRTIO_SCSI_F_HOTPLUG);
  requested_features |= (1UL  VIRTIO_SCSI_F_CHANGE);
 +requested_features |= (1UL  VIRTIO_SCSI_F_LUN_QUERY);

This should be a QEMU 1.3 and later feature bit.  VMs running with -m
pc-1.2 or earlier should not see it by default, this ensure the device
ABI is stable.  For more info, see:
http://patchwork.ozlabs.org/patch/177924/

Stefan
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/2 v1] blkdrv: Add queue limits parameters for sg block drive

2012-08-21 Thread Paolo Bonzini
Il 21/08/2012 11:52, Stefan Hajnoczi ha scritto:
  Using /sys/dev/block or /sys/dev/char seems easier, and lets you
  retrieve the parameters for block devices too.
 
  what do you mean with block devices?   Using /dev/sda instead of
  /dev/sg0?

Yes.

  However, I'm worried of the consequences this has for migration.  You
  could have the same physical disk accessed with two different HBAs, with
  different limits.  So I don't know if this can really be solved at all.
 
  I know little about qemu migration now.  The pending scsi commands will be
  saved and
  transfered to remote machine when starting migration?
 
 Passthrough is already a migration blocker if both hosts do not have
 access to the same LUNs.

Yes, but requiring the exact same hardware may be too much.  I'm trying
to understand the problem better before committing to a threefold
spec/qemu/kernel change.

Cong, what is the limit that the host HBA enforces (and what is the
HBA)?  What commands see a problem?  Is it fixed by using scsi-block
instead of scsi-generic (if you can use scsi-block at all, i.e. it's not
a tape or similar device)?

With scsi-generic, QEMU uses a bounce buffer for non-I/O commands to a
SCSI passthrough device, so the only problem in that case should be the
maximum segment size.  This could change in the future, but max_segments
and max_sectors should not yet be a problem.

With scsi-block, QEMU will use read/write on the block device and the
host kernel will then obey the host HBA's block limits.  QEMU will still
use a bounce buffer for non-I/O commands to a scsi-block device, but the
payload is usually small for non-I/O commands.

Paolo

 When both hosts do have access to the same LUNs it's possible to
 extract the block queue limits (using sysfs) and compare them.
 
 Today you can start QEMU with different image files on both hosts.
 Migration will appear to work but the disk image on the destination
 host could be junk.  This is a similar case, I don't see a problem
 except that there should be a safety check (maybe at the libvirt
 level) to make this safe.

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: Re: [PATCH V2 0/6] virtio-trace: Support virtio-trace

2012-08-21 Thread Masami Hiramatsu
(2012/08/21 14:16), Amit Shah wrote:
 On (Thu) 09 Aug 2012 [21:30:29], Yoshihiro YUNOMAE wrote:
 Hi All,

 The following patch set provides a low-overhead system for collecting kernel
 tracing data of guests by a host in a virtualization environment.
 
 ACK this series.

Thank you!

 I ran it through the virtio-serial test suite, and there's no
 regression in existing functionality.
 
 I encourage you to check out the virtio-serial test suite[1] as well
 as the virtio-serial unit tests in the autotest[2] code, and
 contribute to them to add tests for the new functionality you've
 added.

Ah, OK we'll look into them for adding some tests for
splice support :)

 
 [1]
 http://fedorapeople.org/cgit/amitshah/public_git/test-virtserial.git/
 
 [2]
 https://github.com/autotest/autotest/blob/e91cd67b845b291622c8d079a8289c4b0cb1e6ae/client/tests/kvm/tests/virtio_console.py
 
 

Thank you again,

-- 
Masami HIRAMATSU
Software Platform Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hiramatsu...@hitachi.com


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v8 0/5] make balloon pages movable by compaction

2012-08-21 Thread Rafael Aquini
Memory fragmentation introduced by ballooning might reduce significantly
the number of 2MB contiguous memory blocks that can be used within a guest,
thus imposing performance penalties associated with the reduced number of
transparent huge pages that could be used by the guest workload.

This patch-set follows the main idea discussed at 2012 LSFMMS session:
Ballooning for transparent huge pages -- http://lwn.net/Articles/490114/
to introduce the required changes to the virtio_balloon driver, as well as
the changes to the core compaction & migration bits, in order to make those
subsystems aware of ballooned pages and allow memory balloon pages to become
movable within a guest, thus avoiding the aforementioned fragmentation issue.

Rafael Aquini (5):
  mm: introduce a common interface for balloon pages mobility
  mm: introduce compaction and migration for ballooned pages
  virtio_balloon: introduce migration primitives to balloon pages
  mm: introduce putback_movable_pages()
  mm: add vm event counters for balloon pages compaction

 drivers/virtio/virtio_balloon.c| 212 +++--
 include/linux/balloon_compaction.h | 113 
 include/linux/migrate.h|   2 +
 include/linux/pagemap.h|  18 
 include/linux/vm_event_item.h  |   8 +-
 mm/Kconfig |  15 +++
 mm/Makefile|   2 +-
 mm/balloon_compaction.c| 147 +
 mm/compaction.c|  51 +
 mm/migrate.c   |  52 -
 mm/page_alloc.c|   2 +-
 mm/vmstat.c|  10 +-
 12 files changed, 595 insertions(+), 37 deletions(-)
 create mode 100644 include/linux/balloon_compaction.h
 create mode 100644 mm/balloon_compaction.c


Change log:
v8:
 * introduce a common MM interface for balloon driver page compaction (Michael);
 * remove the global state preventing multiple balloon device support (Michael);
 * introduce RCU protection/synchronization to balloon page-mapping (Michael);
v7:
 * fix a potential page leak case at 'putback_balloon_page' (Mel);
 * adjust vm-events-counter patch and remove its drop-on-merge message (Rik);
 * add 'putback_movable_pages' to avoid hacks on 'putback_lru_pages' (Minchan);
v6:
 * rename 'is_balloon_page()' to 'movable_balloon_page()' (Rik);
v5:
 * address Andrew Morton's review comments on the patch series;
 * address a couple extra nitpick suggestions on PATCH 01 (Minchan);
v4: 
 * address Rusty Russell's review comments on PATCH 02;
 * re-base virtio_balloon patch on 9c378abc5c0c6fc8e3acf5968924d274503819b3;
V3: 
 * address reviewers nitpick suggestions on PATCH 01 (Mel, Minchan);
V2: 
 * address Mel Gorman's review comments on PATCH 01;


Preliminary test results:
(2 VCPU 2048mB RAM KVM guest running 3.6.0_rc2+ -- after a reboot)

* 64mB balloon:
[root@localhost ~]# awk '/compact/ {print}' /proc/vmstat
compact_blocks_moved 0
compact_pages_moved 0
compact_pagemigrate_failed 0
compact_stall 0
compact_fail 0
compact_success 0
compact_balloon_isolated 0
compact_balloon_migrated 0
compact_balloon_returned 0
compact_balloon_released 0
[root@localhost ~]# 
[root@localhost ~]# for i in $(seq 1 6); do echo 1  
/proc/sys/vm/compact_memory  done /dev/null 
[1]   Doneecho 1  /proc/sys/vm/compact_memory
[2]   Doneecho 1  /proc/sys/vm/compact_memory
[3]   Doneecho 1  /proc/sys/vm/compact_memory
[4]   Doneecho 1  /proc/sys/vm/compact_memory
[5]-  Doneecho 1  /proc/sys/vm/compact_memory
[6]+  Doneecho 1  /proc/sys/vm/compact_memory
[root@localhost ~]# 
[root@localhost ~]# awk '/compact/ {print}' /proc/vmstat
compact_blocks_moved 3108
compact_pages_moved 43169
compact_pagemigrate_failed 95
compact_stall 0
compact_fail 0
compact_success 0
compact_balloon_isolated 16384
compact_balloon_migrated 16384
compact_balloon_returned 0
compact_balloon_released 16384


* 128 mB balloon:
[root@localhost ~]# awk '/compact/ {print}' /proc/vmstat
compact_blocks_moved 0
compact_pages_moved 0
compact_pagemigrate_failed 0
compact_stall 0
compact_fail 0
compact_success 0
compact_balloon_isolated 0
compact_balloon_migrated 0
compact_balloon_returned 0
compact_balloon_released 0
[root@localhost ~]# 
[root@localhost ~]# for i in $(seq 1 6); do echo 1  
/proc/sys/vm/compact_memory  done /dev/null  
[1]   Doneecho 1  /proc/sys/vm/compact_memory
[2]   Doneecho 1  /proc/sys/vm/compact_memory
[3]   Doneecho 1  /proc/sys/vm/compact_memory
[4]   Doneecho 1  /proc/sys/vm/compact_memory
[5]-  Doneecho 1  /proc/sys/vm/compact_memory
[6]+  Doneecho 1  /proc/sys/vm/compact_memory
[root@localhost ~]# 
[root@localhost ~]# awk '/compact/ {print}' /proc/vmstat
compact_blocks_moved 3062
compact_pages_moved 49774

[PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Rafael Aquini
Memory fragmentation introduced by ballooning might reduce significantly
the number of 2MB contiguous memory blocks that can be used within a guest,
thus imposing performance penalties associated with the reduced number of
transparent huge pages that could be used by the guest workload.

This patch introduces a common interface to help a balloon driver on
making its page set movable to compaction, and thus allowing the system
to better leverage the compaction efforts on memory defragmentation.

Signed-off-by: Rafael Aquini aqu...@redhat.com
---
 include/linux/balloon_compaction.h | 113 +
 include/linux/pagemap.h|  18 +
 mm/Kconfig |  15 
 mm/Makefile|   2 +-
 mm/balloon_compaction.c| 145 +
 5 files changed, 292 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/balloon_compaction.h
 create mode 100644 mm/balloon_compaction.c

diff --git a/include/linux/balloon_compaction.h 
b/include/linux/balloon_compaction.h
new file mode 100644
index 000..5fbf036
--- /dev/null
+++ b/include/linux/balloon_compaction.h
@@ -0,0 +1,113 @@
+/*
+ * include/linux/balloon_compaction.h
+ *
+ * Common interface definitions for making balloon pages movable to compaction.
+ *
+ * Copyright (C) 2012, Red Hat, Inc.  Rafael Aquini aqu...@redhat.com
+ */
+#ifndef _LINUX_BALLOON_COMPACTION_H
+#define _LINUX_BALLOON_COMPACTION_H
+#ifdef __KERNEL__
+
+#include linux/rcupdate.h
+#include linux/pagemap.h
+#include linux/gfp.h
+
+#if defined(CONFIG_BALLOON_COMPACTION)
+extern bool isolate_balloon_page(struct page *);
+extern void putback_balloon_page(struct page *);
+
+static inline bool movable_balloon_page(struct page *page)
+{
+   struct address_space *mapping = NULL;
+   bool ret = false;
+   /*
+* Before dereferencing and testing mapping-flags, lets make sure
+* this is not a page that uses -mapping in a different way
+*/
+   if (!PageSlab(page)  !PageSwapCache(page) 
+   !PageAnon(page)  !page_mapped(page)) {
+   rcu_read_lock();
+   mapping = rcu_dereference(page-mapping);
+   if (mapping_balloon(mapping))
+   ret = true;
+   rcu_read_unlock();
+   }
+   return ret;
+}
+
+static inline gfp_t balloon_mapping_gfp_mask(void)
+{
+   return GFP_HIGHUSER_MOVABLE;
+}
+
+/*
+ * __page_balloon_device - return the balloon device owing the page.
+ *
+ * This shall only be used at driver callbacks under proper page_lock,
+ * to get access to the balloon device structure that owns @page.
+ */
+static inline void *__page_balloon_device(struct page *page)
+{
+   struct address_space *mapping;
+   mapping = rcu_access_pointer(page-mapping);
+   if (mapping)
+   mapping = mapping-assoc_mapping;
+   return (void *)mapping;
+}
+
+#define count_balloon_event(e) count_vm_event(e)
+/*
+ * DEFINE_BALLOON_MAPPING_AOPS - declare and instantiate a callback descriptor
+ *  to be used as balloon page-mapping-a_ops.
+ *
+ * @label : declaration identifier (var name)
+ * @migratepg : callback symbol name for performing the page migration step
+ * @isolatepg : callback symbol name for performing the page isolation step
+ * @putbackpg : callback symbol name for performing the page putback step
+ *
+ * address_space_operations utilized methods for ballooned pages:
+ *   .migratepage- used to perform balloon's page migration (as is)
+ *   .launder_page   - used to isolate a page from balloon's page list
+ *   .freepage   - used to reinsert an isolated page to balloon's page list
+ */
+#define DEFINE_BALLOON_MAPPING_AOPS(label, migratepg, isolatepg, putbackpg) \
+   const struct address_space_operations (label) = {   \
+   .migratepage  = (migratepg),\
+   .launder_page = (isolatepg),\
+   .freepage = (putbackpg),\
+   }
+
+#else
+static inline bool isolate_balloon_page(struct page *page) { return false; }
+static inline bool movable_balloon_page(struct page *page) { return false; }
+static inline void putback_balloon_page(struct page *page) { return; }
+
+static inline gfp_t balloon_mapping_gfp_mask(void)
+{
+   return GFP_HIGHUSER;
+}
+
+#define count_balloon_event(e) {}
+#define DEFINE_BALLOON_MAPPING_AOPS(label, migratepg, isolatepg, putbackpg) \
+   const struct address_space_operations *(label) = NULL
+
+#endif /* CONFIG_BALLOON_COMPACTION */
+
+extern struct address_space *alloc_balloon_mapping(void *balloon_device,
+   const struct address_space_operations *a_ops);
+extern void *page_balloon_device(struct page *page);
+
+static inline void assign_balloon_mapping(struct page *page,
+   

[PATCH v8 3/5] virtio_balloon: introduce migration primitives to balloon pages

2012-08-21 Thread Rafael Aquini
Memory fragmentation introduced by ballooning might reduce significantly
the number of 2MB contiguous memory blocks that can be used within a guest,
thus imposing performance penalties associated with the reduced number of
transparent huge pages that could be used by the guest workload.

Besides making balloon pages movable at allocation time and introducing
the necessary primitives to perform balloon page migration/compaction,
this patch also introduces the following locking scheme to provide the
proper synchronization and protection for struct virtio_balloon elements
against concurrent accesses due to parallel operations introduced by
memory compaction / page migration.
 - balloon_lock (mutex) : synchronizes the access demand to elements of
  struct virtio_balloon and its queue operations;
 - pages_lock (spinlock): special protection to balloon pages list against
  concurrent list handling operations;
 - virtio_balloon-pages list handling sync by RCU operations;

Signed-off-by: Rafael Aquini aqu...@redhat.com
---
 drivers/virtio/virtio_balloon.c | 210 +---
 1 file changed, 199 insertions(+), 11 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 0908e60..bda7bb0 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -27,6 +27,7 @@
 #include linux/delay.h
 #include linux/slab.h
 #include linux/module.h
+#include linux/balloon_compaction.h
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -35,6 +36,12 @@
  */
 #define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE  VIRTIO_BALLOON_PFN_SHIFT)
 
+/* flags used to hint compaction procedures about the balloon device status */
+enum balloon_status_flags {
+   BALLOON_REMOVAL = 0,/* balloon device is under removal steps */
+   BALLOON_OK, /* balloon device is up and running */
+};
+
 struct virtio_balloon
 {
struct virtio_device *vdev;
@@ -46,11 +53,24 @@ struct virtio_balloon
/* The thread servicing the balloon. */
struct task_struct *thread;
 
+   /* balloon special page-mapping */
+   struct address_space *mapping;
+
+   /* Synchronize access/update to this struct virtio_balloon elements */
+   struct mutex balloon_lock;
+
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
 
/* Number of balloon pages we've told the Host we're not using. */
unsigned int num_pages;
+
+   /* balloon device status flag */
+   unsigned short balloon_status;
+
+   /* Protect 'pages' list against concurrent handling */
+   spinlock_t pages_lock;
+
/*
 * The pages we've told the Host we're not using.
 * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
@@ -122,13 +142,17 @@ static void set_page_pfns(u32 pfns[], struct page *page)
 
 static void fill_balloon(struct virtio_balloon *vb, size_t num)
 {
+   /* Get the proper GFP alloc mask from vb-mapping flags */
+   gfp_t vb_gfp_mask = mapping_gfp_mask(vb-mapping);
+
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb-pfns));
 
+   mutex_lock(vb-balloon_lock);
for (vb-num_pfns = 0; vb-num_pfns  num;
 vb-num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-   struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY |
-   __GFP_NOMEMALLOC | __GFP_NOWARN);
+   struct page *page = alloc_page(vb_gfp_mask | __GFP_NORETRY |
+  __GFP_NOWARN | __GFP_NOMEMALLOC);
if (!page) {
if (printk_ratelimit())
dev_printk(KERN_INFO, vb-vdev-dev,
@@ -141,7 +165,10 @@ static void fill_balloon(struct virtio_balloon *vb, size_t 
num)
set_page_pfns(vb-pfns + vb-num_pfns, page);
vb-num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
totalram_pages--;
-   list_add(page-lru, vb-pages);
+   spin_lock(vb-pages_lock);
+   list_add_rcu(page-lru, vb-pages);
+   assign_balloon_mapping(page, vb-mapping);
+   spin_unlock(vb-pages_lock);
}
 
/* Didn't get any?  Oh well. */
@@ -149,6 +176,7 @@ static void fill_balloon(struct virtio_balloon *vb, size_t 
num)
return;
 
tell_host(vb, vb-inflate_vq);
+   mutex_unlock(vb-balloon_lock);
 }
 
 static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
@@ -169,21 +197,48 @@ static void leak_balloon(struct virtio_balloon *vb, 
size_t num)
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb-pfns));
 
+   mutex_lock(vb-balloon_lock);
for (vb-num_pfns = 0; vb-num_pfns  num;
 vb-num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-   

[PATCH v8 2/5] mm: introduce compaction and migration for ballooned pages

2012-08-21 Thread Rafael Aquini
Memory fragmentation introduced by ballooning might reduce significantly
the number of 2MB contiguous memory blocks that can be used within a guest,
thus imposing performance penalties associated with the reduced number of
transparent huge pages that could be used by the guest workload.

This patch introduces the helper functions as well as the necessary changes
to teach compaction and migration bits how to cope with pages which are
part of a guest memory balloon, in order to make them movable by memory
compaction procedures.

Signed-off-by: Rafael Aquini aqu...@redhat.com
---
 mm/compaction.c | 47 ---
 mm/migrate.c| 31 ++-
 2 files changed, 58 insertions(+), 20 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index e78cb96..ce43dc2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,6 +14,7 @@
 #include linux/backing-dev.h
 #include linux/sysctl.h
 #include linux/sysfs.h
+#include linux/balloon_compaction.h
 #include internal.h
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -312,32 +313,40 @@ isolate_migratepages_range(struct zone *zone, struct 
compact_control *cc,
continue;
}
 
-   if (!PageLRU(page))
-   continue;
-
/*
-* PageLRU is set, and lru_lock excludes isolation,
-* splitting and collapsing (collapsing has already
-* happened if PageLRU is set).
+* It is possible to migrate LRU pages and balloon pages.
+* Skip any other type of page.
 */
-   if (PageTransHuge(page)) {
-   low_pfn += (1  compound_order(page)) - 1;
-   continue;
-   }
+   if (PageLRU(page)) {
+   /*
+* PageLRU is set, and lru_lock excludes isolation,
+* splitting and collapsing (collapsing has already
+* happened if PageLRU is set).
+*/
+   if (PageTransHuge(page)) {
+   low_pfn += (1  compound_order(page)) - 1;
+   continue;
+   }
 
-   if (!cc-sync)
-   mode |= ISOLATE_ASYNC_MIGRATE;
+   if (!cc-sync)
+   mode |= ISOLATE_ASYNC_MIGRATE;
 
-   lruvec = mem_cgroup_page_lruvec(page, zone);
+   lruvec = mem_cgroup_page_lruvec(page, zone);
 
-   /* Try isolate the page */
-   if (__isolate_lru_page(page, mode) != 0)
-   continue;
+   /* Try isolate the page */
+   if (__isolate_lru_page(page, mode) != 0)
+   continue;
 
-   VM_BUG_ON(PageTransCompound(page));
+   VM_BUG_ON(PageTransCompound(page));
+
+   /* Successfully isolated */
+   del_page_from_lru_list(page, lruvec, page_lru(page));
+   } else if (unlikely(movable_balloon_page(page))) {
+   if (!isolate_balloon_page(page))
+   continue;
+   } else
+   continue;
 
-   /* Successfully isolated */
-   del_page_from_lru_list(page, lruvec, page_lru(page));
list_add(page-lru, migratelist);
cc-nr_migratepages++;
nr_isolated++;
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d7..6392da258 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,6 +35,7 @@
 #include linux/hugetlb.h
 #include linux/hugetlb_cgroup.h
 #include linux/gfp.h
+#include linux/balloon_compaction.h
 
 #include asm/tlbflush.h
 
@@ -79,7 +80,10 @@ void putback_lru_pages(struct list_head *l)
list_del(page-lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
-   putback_lru_page(page);
+   if (unlikely(movable_balloon_page(page)))
+   putback_balloon_page(page);
+   else
+   putback_lru_page(page);
}
 }
 
@@ -778,6 +782,17 @@ static int __unmap_and_move(struct page *page, struct page 
*newpage,
}
}
 
+   if (unlikely(movable_balloon_page(page))) {
+   /*
+* A ballooned page does not need any special attention from
+* physical to virtual reverse mapping procedures.
+* Skip any attempt to unmap PTEs or to remap swap cache,
+* in order to avoid burning cycles at rmap level.
+*/
+   remap_swapcache = 0;
+   goto skip_unmap;
+   }
+
/*
 * Corner case handling:
 * 1. 

[PATCH v8 4/5] mm: introduce putback_movable_pages()

2012-08-21 Thread Rafael Aquini
The PATCH mm: introduce compaction and migration for virtio ballooned pages
hacks around putback_lru_pages() in order to allow ballooned pages to be
re-inserted on balloon page list as if a ballooned page was like a LRU page.

As ballooned pages are not legitimate LRU pages, this patch introduces
putback_movable_pages() to properly cope with cases where the isolated
pageset contains ballooned pages and LRU pages, thus fixing the mentioned
inelegant hack around putback_lru_pages().

Signed-off-by: Rafael Aquini aqu...@redhat.com
---
 include/linux/migrate.h |  2 ++
 mm/compaction.c |  4 ++--
 mm/migrate.c| 20 
 mm/page_alloc.c |  2 +-
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ce7e667..ff103a1 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -10,6 +10,7 @@ typedef struct page *new_page_t(struct page *, unsigned long 
private, int **);
 #ifdef CONFIG_MIGRATION
 
 extern void putback_lru_pages(struct list_head *l);
+extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
@@ -33,6 +34,7 @@ extern int migrate_huge_page_move_mapping(struct 
address_space *mapping,
 #else
 
 static inline void putback_lru_pages(struct list_head *l) {}
+static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
enum migrate_mode mode) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
index ce43dc2..782ed32 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -759,9 +759,9 @@ static int compact_zone(struct zone *zone, struct 
compact_control *cc)
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
nr_remaining);
 
-   /* Release LRU pages not migrated */
+   /* Release isolated pages not migrated */
if (err) {
-   putback_lru_pages(cc-migratepages);
+   putback_movable_pages(cc-migratepages);
cc-nr_migratepages = 0;
if (err == -ENOMEM) {
ret = COMPACT_PARTIAL;
diff --git a/mm/migrate.c b/mm/migrate.c
index 6392da258..0bf2caf 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -80,6 +80,26 @@ void putback_lru_pages(struct list_head *l)
list_del(page-lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
+   putback_lru_page(page);
+   }
+}
+
+/*
+ * Put previously isolated pages back onto the appropriated lists
+ * from where they were once taken off for compaction/migration.
+ *
+ * This function shall be used instead of putback_lru_pages(),
+ * whenever the isolated pageset has been built by isolate_migratepages_range()
+ */
+void putback_movable_pages(struct list_head *l)
+{
+   struct page *page;
+   struct page *page2;
+
+   list_for_each_entry_safe(page, page2, l, lru) {
+   list_del(page-lru);
+   dec_zone_page_state(page, NR_ISOLATED_ANON +
+   page_is_file_cache(page));
if (unlikely(movable_balloon_page(page)))
putback_balloon_page(page);
else
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 009ac28..78b7663 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5669,7 +5669,7 @@ static int __alloc_contig_migrate_range(unsigned long 
start, unsigned long end)
0, false, MIGRATE_SYNC);
}
 
-   putback_lru_pages(cc.migratepages);
+   putback_movable_pages(cc.migratepages);
return ret  0 ? 0 : ret;
 }
 
-- 
1.7.11.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v8 5/5] mm: add vm event counters for balloon pages compaction

2012-08-21 Thread Rafael Aquini
This patch introduces a new set of vm event counters to keep track of
ballooned pages compaction activity.

Signed-off-by: Rafael Aquini aqu...@redhat.com
---
 drivers/virtio/virtio_balloon.c |  2 ++
 include/linux/vm_event_item.h   |  8 +++-
 mm/balloon_compaction.c |  6 --
 mm/migrate.c|  1 +
 mm/vmstat.c | 10 +-
 5 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index bda7bb0..c358ed3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -449,6 +449,8 @@ int virtballoon_migratepage(struct address_space *mapping,
set_page_pfns(vb-pfns, page);
tell_host(vb, vb-deflate_vq);
 
+   /* perform vm accountability on this successful page migration */
+   count_balloon_event(COMPACTBALLOONMIGRATED);
mutex_unlock(vb-balloon_lock);
return 0;
 }
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 57f7b10..6868aba 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -41,7 +41,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_COMPACTION
COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED,
COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
-#endif
+#ifdef CONFIG_BALLOON_COMPACTION
+   COMPACTBALLOONISOLATED, /* isolated from balloon pagelist */
+   COMPACTBALLOONMIGRATED, /* balloon page sucessfully migrated */
+   COMPACTBALLOONRETURNED, /* putback to pagelist, not-migrated */
+   COMPACTBALLOONRELEASED, /* old-page released after migration */
+#endif /* CONFIG_BALLOON_COMPACTION */
+#endif /* CONFIG_COMPACTION */
 #ifdef CONFIG_HUGETLB_PAGE
HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
 #endif
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index d79f13d..9186000 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -114,6 +114,7 @@ bool isolate_balloon_page(struct page *page)
(page_count(page) == 2)) {
if (__isolate_balloon_page(page)) {
unlock_page(page);
+   count_vm_event(COMPACTBALLOONISOLATED);
return true;
}
}
@@ -137,9 +138,10 @@ void putback_balloon_page(struct page *page)
 * concurrent isolation threads attempting to re-isolate it.
 */
lock_page(page);
-   if (movable_balloon_page(page))
+   if (movable_balloon_page(page)) {
__putback_balloon_page(page);
-
+   count_vm_event(COMPACTBALLOONRETURNED);
+   }
unlock_page(page);
 }
 #endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/migrate.c b/mm/migrate.c
index 0bf2caf..052e59a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -893,6 +893,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned 
long private,
page_is_file_cache(page));
put_page(page);
__free_page(page);
+   count_balloon_event(COMPACTBALLOONRELEASED);
return rc;
}
 out:
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df7a674..c7919c4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -768,7 +768,15 @@ const char * const vmstat_text[] = {
compact_stall,
compact_fail,
compact_success,
-#endif
+
+#ifdef CONFIG_BALLOON_COMPACTION
+   compact_balloon_isolated,
+   compact_balloon_migrated,
+   compact_balloon_returned,
+   compact_balloon_released,
+#endif /* CONFIG_BALLOON_COMPACTION */
+
+#endif /* CONFIG_COMPACTION */
 
 #ifdef CONFIG_HUGETLB_PAGE
htlb_buddy_alloc_success,
-- 
1.7.11.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 09:47:44AM -0300, Rafael Aquini wrote:
 Memory fragmentation introduced by ballooning might reduce significantly
 the number of 2MB contiguous memory blocks that can be used within a guest,
 thus imposing performance penalties associated with the reduced number of
 transparent huge pages that could be used by the guest workload.
 
 This patch introduces a common interface to help a balloon driver on
 making its page set movable to compaction, and thus allowing the system
 to better leverage the compaction efforts on memory defragmentation.
 
 Signed-off-by: Rafael Aquini aqu...@redhat.com
 ---
  include/linux/balloon_compaction.h | 113 +
  include/linux/pagemap.h|  18 +
  mm/Kconfig |  15 
  mm/Makefile|   2 +-
  mm/balloon_compaction.c| 145 
 +
  5 files changed, 292 insertions(+), 1 deletion(-)
  create mode 100644 include/linux/balloon_compaction.h
  create mode 100644 mm/balloon_compaction.c
 
 diff --git a/include/linux/balloon_compaction.h 
 b/include/linux/balloon_compaction.h
 new file mode 100644
 index 000..5fbf036
 --- /dev/null
 +++ b/include/linux/balloon_compaction.h
 @@ -0,0 +1,113 @@
 +/*
 + * include/linux/balloon_compaction.h
 + *
 + * Common interface definitions for making balloon pages movable to 
 compaction.
 + *
 + * Copyright (C) 2012, Red Hat, Inc.  Rafael Aquini aqu...@redhat.com
 + */
 +#ifndef _LINUX_BALLOON_COMPACTION_H
 +#define _LINUX_BALLOON_COMPACTION_H
 +#ifdef __KERNEL__
 +
 +#include linux/rcupdate.h
 +#include linux/pagemap.h
 +#include linux/gfp.h
 +
 +#if defined(CONFIG_BALLOON_COMPACTION)
 +extern bool isolate_balloon_page(struct page *);
 +extern void putback_balloon_page(struct page *);
 +
 +static inline bool movable_balloon_page(struct page *page)
 +{
 + struct address_space *mapping = NULL;
 + bool ret = false;
 + /*
 +  * Before dereferencing and testing mapping-flags, lets make sure
 +  * this is not a page that uses -mapping in a different way
 +  */
 + if (!PageSlab(page)  !PageSwapCache(page) 
 + !PageAnon(page)  !page_mapped(page)) {
 + rcu_read_lock();
 + mapping = rcu_dereference(page-mapping);
 + if (mapping_balloon(mapping))
 + ret = true;
 + rcu_read_unlock();

This looks suspicious: you drop rcu_read_unlock
so can't page switch from balloon to non balloon?

Even if correct, it's probably cleaner to just make caller
invoke this within an rcu critical section.
You will notice that all callers immediately re-enter
rcu critical section anyway.

Alternatively, I noted that accesses to page-mapping
seem protected by page lock bit.
If true we can rely on that instead of RCU, just need
assign_balloon_mapping to lock_page/unlock_page.

 + }
 + return ret;
 +}
 +
 +static inline gfp_t balloon_mapping_gfp_mask(void)
 +{
 + return GFP_HIGHUSER_MOVABLE;
 +}
 +
 +/*
 + * __page_balloon_device - return the balloon device owning the page.
 + *
 + * This shall only be used at driver callbacks under proper page_lock,
 + * to get access to the balloon device structure that owns @page.
 + */
 +static inline void *__page_balloon_device(struct page *page)
 +{
 + struct address_space *mapping;
 + mapping = rcu_access_pointer(page-mapping);
 + if (mapping)
 + mapping = mapping-assoc_mapping;
 + return (void *)mapping;
 +}
 +
 +#define count_balloon_event(e) count_vm_event(e)
 +/*
 + * DEFINE_BALLOON_MAPPING_AOPS - declare and instantiate a callback 
 descriptor
 + *to be used as balloon page-mapping-a_ops.
 + *
 + * @label : declaration identifier (var name)
 + * @migratepg : callback symbol name for performing the page migration step
 + * @isolatepg : callback symbol name for performing the page isolation step
 + * @putbackpg : callback symbol name for performing the page putback step
 + *
 + * address_space_operations utilized methods for ballooned pages:
 + *   .migratepage- used to perform balloon's page migration (as is)
 + *   .launder_page   - used to isolate a page from balloon's page list
 + *   .freepage   - used to reinsert an isolated page to balloon's page 
 list
 + */

It would be a good idea to document the assumptions here.
Looks like .launder_page and .freepage are called in rcu critical
section.
But migratepage isn't - why is that safe?

 +#define DEFINE_BALLOON_MAPPING_AOPS(label, migratepg, isolatepg, putbackpg) \
 + const struct address_space_operations (label) = {   \
 + .migratepage  = (migratepg),\
 + .launder_page = (isolatepg),\
 + .freepage = (putbackpg),\
 + }
 +
 +#else
 +static inline bool isolate_balloon_page(struct page *page) { return 

Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 04:52:23PM +0300, Michael S. Tsirkin wrote:
 On Tue, Aug 21, 2012 at 09:47:44AM -0300, Rafael Aquini wrote:
  Memory fragmentation introduced by ballooning might reduce significantly
  the number of 2MB contiguous memory blocks that can be used within a guest,
  thus imposing performance penalties associated with the reduced number of
  transparent huge pages that could be used by the guest workload.
  
  This patch introduces a common interface to help a balloon driver on
  making its page set movable to compaction, and thus allowing the system
  to better leverage the compaction efforts on memory defragmentation.
  
  Signed-off-by: Rafael Aquini aqu...@redhat.com
  ---
   include/linux/balloon_compaction.h | 113 +
   include/linux/pagemap.h|  18 +
   mm/Kconfig |  15 
   mm/Makefile|   2 +-
   mm/balloon_compaction.c| 145 
  +
   5 files changed, 292 insertions(+), 1 deletion(-)
   create mode 100644 include/linux/balloon_compaction.h
   create mode 100644 mm/balloon_compaction.c
  
  diff --git a/include/linux/balloon_compaction.h 
  b/include/linux/balloon_compaction.h
  new file mode 100644
  index 000..5fbf036
  --- /dev/null
  +++ b/include/linux/balloon_compaction.h
  @@ -0,0 +1,113 @@
  +/*
  + * include/linux/balloon_compaction.h
  + *
  + * Common interface definitions for making balloon pages movable to 
  compaction.
  + *
  + * Copyright (C) 2012, Red Hat, Inc.  Rafael Aquini aqu...@redhat.com
  + */
  +#ifndef _LINUX_BALLOON_COMPACTION_H
  +#define _LINUX_BALLOON_COMPACTION_H
  +#ifdef __KERNEL__
  +
  +#include linux/rcupdate.h
  +#include linux/pagemap.h
  +#include linux/gfp.h
  +
  +#if defined(CONFIG_BALLOON_COMPACTION)
  +extern bool isolate_balloon_page(struct page *);
  +extern void putback_balloon_page(struct page *);
  +
  +static inline bool movable_balloon_page(struct page *page)
  +{
  +   struct address_space *mapping = NULL;
  +   bool ret = false;
  +   /*
  +* Before dereferencing and testing mapping-flags, lets make sure
  +* this is not a page that uses -mapping in a different way
  +*/
  +   if (!PageSlab(page)  !PageSwapCache(page) 
  +   !PageAnon(page)  !page_mapped(page)) {
  +   rcu_read_lock();
  +   mapping = rcu_dereference(page-mapping);
  +   if (mapping_balloon(mapping))
  +   ret = true;
  +   rcu_read_unlock();
 
 This looks suspicious: you drop rcu_read_unlock
 so can't page switch from balloon to non balloon?
 
 Even if correct, it's probably cleaner to just make caller
 invoke this within an rcu critical section.
 You will notice that all callers immediately re-enter
 rcu critical section anyway.
 
 Alternatively, I noted that accesses to page-mapping
 seem protected by page lock bit.
 If true we can rely on that instead of RCU, just need
 assign_balloon_mapping to lock_page/unlock_page.

Actually not true in all cases: not in case
of putback.  Not sure whether code can be changed
to make it true.


  +   }
  +   return ret;
  +}
  +
  +static inline gfp_t balloon_mapping_gfp_mask(void)
  +{
  +   return GFP_HIGHUSER_MOVABLE;
  +}
  +
  +/*
  + * __page_balloon_device - return the balloon device owning the page.
  + *
  + * This shall only be used at driver callbacks under proper page_lock,
  + * to get access to the balloon device structure that owns @page.
  + */
  +static inline void *__page_balloon_device(struct page *page)
  +{
  +   struct address_space *mapping;
  +   mapping = rcu_access_pointer(page-mapping);
  +   if (mapping)
  +   mapping = mapping-assoc_mapping;
  +   return (void *)mapping;
  +}
  +
  +#define count_balloon_event(e) count_vm_event(e)
  +/*
  + * DEFINE_BALLOON_MAPPING_AOPS - declare and instantiate a callback 
  descriptor
  + *  to be used as balloon page-mapping-a_ops.
  + *
  + * @label : declaration identifier (var name)
  + * @migratepg : callback symbol name for performing the page migration step
  + * @isolatepg : callback symbol name for performing the page isolation step
  + * @putbackpg : callback symbol name for performing the page putback step
  + *
  + * address_space_operations utilized methods for ballooned pages:
  + *   .migratepage- used to perform balloon's page migration (as is)
  + *   .launder_page   - used to isolate a page from balloon's page list
  + *   .freepage   - used to reinsert an isolated page to balloon's page 
  list
  + */
 
 It would be a good idea to document the assumptions here.
 Looks like .launder_page and .freepage are called in rcu critical
 section.
 But migratepage isn't - why is that safe?
 
  +#define DEFINE_BALLOON_MAPPING_AOPS(label, migratepg, isolatepg, 
  putbackpg) \
  +   const struct address_space_operations (label) = {   \
  +   .migratepage  = (migratepg),

Re: [PATCH v8 3/5] virtio_balloon: introduce migration primitives to balloon pages

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 09:47:46AM -0300, Rafael Aquini wrote:
 Memory fragmentation introduced by ballooning might reduce significantly
 the number of 2MB contiguous memory blocks that can be used within a guest,
 thus imposing performance penalties associated with the reduced number of
 transparent huge pages that could be used by the guest workload.
 
 Besides making balloon pages movable at allocation time and introducing
 the necessary primitives to perform balloon page migration/compaction,
 this patch also introduces the following locking scheme to provide the
 proper synchronization and protection for struct virtio_balloon elements
 against concurrent accesses due to parallel operations introduced by
 memory compaction / page migration.
  - balloon_lock (mutex) : synchronizes the access demand to elements of
 struct virtio_balloon and its queue operations;
  - pages_lock (spinlock): special protection to balloon pages list against
 concurrent list handling operations;
 - virtio_balloon-pages list handling sync by RCU operations;
 
 Signed-off-by: Rafael Aquini aqu...@redhat.com
 ---
  drivers/virtio/virtio_balloon.c | 210 
 +---
  1 file changed, 199 insertions(+), 11 deletions(-)
 
 diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
 index 0908e60..bda7bb0 100644
 --- a/drivers/virtio/virtio_balloon.c
 +++ b/drivers/virtio/virtio_balloon.c
 @@ -27,6 +27,7 @@
  #include linux/delay.h
  #include linux/slab.h
  #include linux/module.h
 +#include linux/balloon_compaction.h
  
  /*
   * Balloon device works in 4K page units.  So each page is pointed to by
 @@ -35,6 +36,12 @@
   */
  #define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE  VIRTIO_BALLOON_PFN_SHIFT)
  
 +/* flags used to hint compaction procedures about the balloon device status 
 */
 +enum balloon_status_flags {
 + BALLOON_REMOVAL = 0,/* balloon device is under removal steps */
 + BALLOON_OK, /* balloon device is up and running */
 +};
 +
  struct virtio_balloon
  {
   struct virtio_device *vdev;
 @@ -46,11 +53,24 @@ struct virtio_balloon
   /* The thread servicing the balloon. */
   struct task_struct *thread;
  
 + /* balloon special page-mapping */
 + struct address_space *mapping;
 +
 + /* Synchronize access/update to this struct virtio_balloon elements */
 + struct mutex balloon_lock;
 +
   /* Waiting for host to ack the pages we released. */
   wait_queue_head_t acked;
  
   /* Number of balloon pages we've told the Host we're not using. */
   unsigned int num_pages;
 +
 + /* balloon device status flag */
 + unsigned short balloon_status;
 +
 + /* Protect 'pages' list against concurrent handling */
 + spinlock_t pages_lock;
 +
   /*
* The pages we've told the Host we're not using.
* Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
 @@ -122,13 +142,17 @@ static void set_page_pfns(u32 pfns[], struct page *page)
  
  static void fill_balloon(struct virtio_balloon *vb, size_t num)
  {
 + /* Get the proper GFP alloc mask from vb-mapping flags */
 + gfp_t vb_gfp_mask = mapping_gfp_mask(vb-mapping);
 +
   /* We can only do one array worth at a time. */
   num = min(num, ARRAY_SIZE(vb-pfns));
  
 + mutex_lock(vb-balloon_lock);
   for (vb-num_pfns = 0; vb-num_pfns  num;
vb-num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
 - struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY |
 - __GFP_NOMEMALLOC | __GFP_NOWARN);
 + struct page *page = alloc_page(vb_gfp_mask | __GFP_NORETRY |
 +__GFP_NOWARN | __GFP_NOMEMALLOC);
   if (!page) {
   if (printk_ratelimit())
   dev_printk(KERN_INFO, vb-vdev-dev,
 @@ -141,7 +165,10 @@ static void fill_balloon(struct virtio_balloon *vb, 
 size_t num)
   set_page_pfns(vb-pfns + vb-num_pfns, page);
   vb-num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
   totalram_pages--;
 - list_add(page-lru, vb-pages);
 + spin_lock(vb-pages_lock);
 + list_add_rcu(page-lru, vb-pages);
 + assign_balloon_mapping(page, vb-mapping);
 + spin_unlock(vb-pages_lock);
   }
  
   /* Didn't get any?  Oh well. */
 @@ -149,6 +176,7 @@ static void fill_balloon(struct virtio_balloon *vb, 
 size_t num)
   return;
  
   tell_host(vb, vb-inflate_vq);
 + mutex_unlock(vb-balloon_lock);
  }
  
  static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
 @@ -169,21 +197,48 @@ static void leak_balloon(struct virtio_balloon *vb, 
 size_t num)
   /* We can only do one array worth at a time. */
   num = min(num, ARRAY_SIZE(vb-pfns));
  
 + mutex_lock(vb-balloon_lock);
   for (vb-num_pfns = 0; 

Re: [PATCH v8 4/5] mm: introduce putback_movable_pages()

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 09:47:47AM -0300, Rafael Aquini wrote:
 The PATCH mm: introduce compaction and migration for virtio ballooned pages
 hacks around putback_lru_pages() in order to allow ballooned pages to be
 re-inserted on balloon page list as if a ballooned page was like a LRU page.
 
 As ballooned pages are not legitimate LRU pages, this patch introduces
 putback_movable_pages() to properly cope with cases where the isolated
 pageset contains ballooned pages and LRU pages, thus fixing the mentioned
 inelegant hack around putback_lru_pages().
 
 Signed-off-by: Rafael Aquini aqu...@redhat.com
 ---
  include/linux/migrate.h |  2 ++
  mm/compaction.c |  4 ++--
  mm/migrate.c| 20 
  mm/page_alloc.c |  2 +-
  4 files changed, 25 insertions(+), 3 deletions(-)
 
 diff --git a/include/linux/migrate.h b/include/linux/migrate.h
 index ce7e667..ff103a1 100644
 --- a/include/linux/migrate.h
 +++ b/include/linux/migrate.h
 @@ -10,6 +10,7 @@ typedef struct page *new_page_t(struct page *, unsigned 
 long private, int **);
  #ifdef CONFIG_MIGRATION
  
  extern void putback_lru_pages(struct list_head *l);
 +extern void putback_movable_pages(struct list_head *l);
  extern int migrate_page(struct address_space *,
   struct page *, struct page *, enum migrate_mode);
  extern int migrate_pages(struct list_head *l, new_page_t x,
 @@ -33,6 +34,7 @@ extern int migrate_huge_page_move_mapping(struct 
 address_space *mapping,
  #else
  
  static inline void putback_lru_pages(struct list_head *l) {}
 +static inline void putback_movable_pages(struct list_head *l) {}
  static inline int migrate_pages(struct list_head *l, new_page_t x,
   unsigned long private, bool offlining,
   enum migrate_mode mode) { return -ENOSYS; }
 diff --git a/mm/compaction.c b/mm/compaction.c
 index ce43dc2..782ed32 100644
 --- a/mm/compaction.c
 +++ b/mm/compaction.c
 @@ -759,9 +759,9 @@ static int compact_zone(struct zone *zone, struct 
 compact_control *cc)
   trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
   nr_remaining);
  
 - /* Release LRU pages not migrated */
 + /* Release isolated pages not migrated */
   if (err) {
 - putback_lru_pages(cc-migratepages);
 + putback_movable_pages(cc-migratepages);
   cc-nr_migratepages = 0;
   if (err == -ENOMEM) {
   ret = COMPACT_PARTIAL;
 diff --git a/mm/migrate.c b/mm/migrate.c
 index 6392da258..0bf2caf 100644
 --- a/mm/migrate.c
 +++ b/mm/migrate.c
 @@ -80,6 +80,26 @@ void putback_lru_pages(struct list_head *l)
   list_del(page-lru);
   dec_zone_page_state(page, NR_ISOLATED_ANON +
   page_is_file_cache(page));
 + putback_lru_page(page);
 + }
 +}
 +
 +/*
 + * Put previously isolated pages back onto the appropriated lists

Do you mean appropriate lists?

 + * from where they were once taken off for compaction/migration.
 + *
 + * This function shall be used instead of putback_lru_pages(),
 + * whenever the isolated pageset has been built by 
 isolate_migratepages_range()
 + */
 +void putback_movable_pages(struct list_head *l)
 +{
 + struct page *page;
 + struct page *page2;
 +
 + list_for_each_entry_safe(page, page2, l, lru) {
 + list_del(page-lru);
 + dec_zone_page_state(page, NR_ISOLATED_ANON +
 + page_is_file_cache(page));
   if (unlikely(movable_balloon_page(page)))
   putback_balloon_page(page);
   else
 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
 index 009ac28..78b7663 100644
 --- a/mm/page_alloc.c
 +++ b/mm/page_alloc.c
 @@ -5669,7 +5669,7 @@ static int __alloc_contig_migrate_range(unsigned long 
 start, unsigned long end)
   0, false, MIGRATE_SYNC);
   }
  
 - putback_lru_pages(cc.migratepages);
 + putback_movable_pages(cc.migratepages);
   return ret  0 ? 0 : ret;
  }
  
 -- 
 1.7.11.4
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 3/5] virtio_balloon: introduce migration primitives to balloon pages

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 09:47:46AM -0300, Rafael Aquini wrote:
 Memory fragmentation introduced by ballooning might reduce significantly
 the number of 2MB contiguous memory blocks that can be used within a guest,
 thus imposing performance penalties associated with the reduced number of
 transparent huge pages that could be used by the guest workload.
 
 Besides making balloon pages movable at allocation time and introducing
 the necessary primitives to perform balloon page migration/compaction,
 this patch also introduces the following locking scheme to provide the
 proper synchronization and protection for struct virtio_balloon elements
 against concurrent accesses due to parallel operations introduced by
 memory compaction / page migration.
  - balloon_lock (mutex) : synchronizes the access demand to elements of
 struct virtio_balloon and its queue operations;
  - pages_lock (spinlock): special protection to balloon pages list against
 concurrent list handling operations;
 - virtio_balloon-pages list handling sync by RCU operations;
 
 Signed-off-by: Rafael Aquini aqu...@redhat.com

Sorry about sending multiple comments to same patch.
I just thought of some more questions.

 ---
  drivers/virtio/virtio_balloon.c | 210 
 +---
  1 file changed, 199 insertions(+), 11 deletions(-)
 
 diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
 index 0908e60..bda7bb0 100644
 --- a/drivers/virtio/virtio_balloon.c
 +++ b/drivers/virtio/virtio_balloon.c
 @@ -27,6 +27,7 @@
  #include linux/delay.h
  #include linux/slab.h
  #include linux/module.h
 +#include linux/balloon_compaction.h
  
  /*
   * Balloon device works in 4K page units.  So each page is pointed to by
 @@ -35,6 +36,12 @@
   */
  #define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE  VIRTIO_BALLOON_PFN_SHIFT)
  
 +/* flags used to hint compaction procedures about the balloon device status 
 */
 +enum balloon_status_flags {
 + BALLOON_REMOVAL = 0,/* balloon device is under removal steps */
 + BALLOON_OK, /* balloon device is up and running */
 +};
 +
  struct virtio_balloon
  {
   struct virtio_device *vdev;
 @@ -46,11 +53,24 @@ struct virtio_balloon
   /* The thread servicing the balloon. */
   struct task_struct *thread;
  
 + /* balloon special page-mapping */
 + struct address_space *mapping;
 +
 + /* Synchronize access/update to this struct virtio_balloon elements */
 + struct mutex balloon_lock;
 +
   /* Waiting for host to ack the pages we released. */
   wait_queue_head_t acked;
  
   /* Number of balloon pages we've told the Host we're not using. */
   unsigned int num_pages;
 +
 + /* balloon device status flag */
 + unsigned short balloon_status;
 +
 + /* Protect 'pages' list against concurrent handling */
 + spinlock_t pages_lock;
 +
   /*
* The pages we've told the Host we're not using.
* Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
 @@ -122,13 +142,17 @@ static void set_page_pfns(u32 pfns[], struct page *page)
  
  static void fill_balloon(struct virtio_balloon *vb, size_t num)
  {
 + /* Get the proper GFP alloc mask from vb-mapping flags */
 + gfp_t vb_gfp_mask = mapping_gfp_mask(vb-mapping);
 +
   /* We can only do one array worth at a time. */
   num = min(num, ARRAY_SIZE(vb-pfns));
  
 + mutex_lock(vb-balloon_lock);
   for (vb-num_pfns = 0; vb-num_pfns  num;
vb-num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
 - struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY |
 - __GFP_NOMEMALLOC | __GFP_NOWARN);
 + struct page *page = alloc_page(vb_gfp_mask | __GFP_NORETRY |
 +__GFP_NOWARN | __GFP_NOMEMALLOC);
   if (!page) {
   if (printk_ratelimit())
   dev_printk(KERN_INFO, vb-vdev-dev,
 @@ -141,7 +165,10 @@ static void fill_balloon(struct virtio_balloon *vb, 
 size_t num)
   set_page_pfns(vb-pfns + vb-num_pfns, page);
   vb-num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
   totalram_pages--;
 - list_add(page-lru, vb-pages);
 + spin_lock(vb-pages_lock);
 + list_add_rcu(page-lru, vb-pages);
 + assign_balloon_mapping(page, vb-mapping);
 + spin_unlock(vb-pages_lock);
   }
  
   /* Didn't get any?  Oh well. */
 @@ -149,6 +176,7 @@ static void fill_balloon(struct virtio_balloon *vb, 
 size_t num)
   return;
  
   tell_host(vb, vb-inflate_vq);
 + mutex_unlock(vb-balloon_lock);
  }
  
  static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
 @@ -169,21 +197,48 @@ static void leak_balloon(struct virtio_balloon *vb, 
 size_t num)
   /* We can only do one array worth at a time. */
   num = min(num, 

Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 05:16:06PM +0200, Peter Zijlstra wrote:
 On Tue, 2012-08-21 at 16:52 +0300, Michael S. Tsirkin wrote:
   + rcu_read_lock();
   + mapping = rcu_dereference(page-mapping);
   + if (mapping_balloon(mapping))
   + ret = true;
   + rcu_read_unlock();
  
  This looks suspicious: you drop rcu_read_unlock
  so can't page switch from balloon to non balloon? 
 
 RCU read lock is a non-exclusive lock, it cannot avoid anything like
 that.

You are right, of course. So even keeping rcu_read_lock across both test
and operation won't be enough - you need to make this function return
the mapping and pass it to isolate_page/putback_page so that it is only
dereferenced once.

-- 
MST
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Peter Zijlstra
On Tue, 2012-08-21 at 16:52 +0300, Michael S. Tsirkin wrote:
  + rcu_read_lock();
  + mapping = rcu_dereference(page-mapping);
  + if (mapping_balloon(mapping))
  + ret = true;
  + rcu_read_unlock();
 
 This looks suspicious: you drop rcu_read_unlock
 so can't page switch from balloon to non balloon? 

RCU read lock is a non-exclusive lock, it cannot avoid anything like
that.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Peter Zijlstra
On Tue, 2012-08-21 at 09:47 -0300, Rafael Aquini wrote:
 +   mapping = rcu_access_pointer(page-mapping);
 +   if (mapping)
 +   mapping = mapping-assoc_mapping; 

The comment near rcu_access_pointer() explicitly says:

 * Return the value of the specified RCU-protected pointer, but omit the
 * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
 * when the value of this pointer is accessed, but the pointer is not
 * dereferenced,

Yet you dereference the pointer... smells like fail to me.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 3/5] virtio_balloon: introduce migration primitives to balloon pages

2012-08-21 Thread Peter Zijlstra
On Tue, 2012-08-21 at 17:40 +0300, Michael S. Tsirkin wrote:
  + spin_lock(vb-pages_lock);
  + page = list_first_or_null_rcu(vb-pages, struct page, lru);
 
 Why is list_first_or_null_rcu called outside
 RCU critical section here?

It looks like vb-pages_lock is the exclusive (or modification)
counterpart to the rcu-read-lock in this particular case, so it should
be fine.

Although for that same reason, it seems superfluous to use the RCU list
method since we're exclusive with list manipulations anyway.

  + if (!page) {
  + spin_unlock(vb-pages_lock);
  + break;
  + }
  + /*
  +  * It is safe now to drop page-mapping and delete this page
  +  * from balloon page list, since we are grabbing 'pages_lock'
  +  * which prevents 'virtballoon_isolatepage()' from acting.
  +  */
  + clear_balloon_mapping(page);
  + list_del_rcu(page-lru);
  + spin_unlock(vb-pages_lock); 
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Paul E. McKenney
On Tue, Aug 21, 2012 at 05:20:11PM +0200, Peter Zijlstra wrote:
 On Tue, 2012-08-21 at 09:47 -0300, Rafael Aquini wrote:
  +   mapping = rcu_access_pointer(page-mapping);
  +   if (mapping)
  +   mapping = mapping-assoc_mapping; 
 
 The comment near rcu_access_pointer() explicitly says:
 
  * Return the value of the specified RCU-protected pointer, but omit the
  * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
  * when the value of this pointer is accessed, but the pointer is not
  * dereferenced,
 
 Yet you dereference the pointer... smells like fail to me.

Indeed!

This will break DEC Alpha.  In addition, if -mapping can transition
from non-NULL to NULL, and if you used rcu_access_pointer() rather
than rcu_dereference() to avoid lockdep-RCU from yelling at you about
not either being in an RCU read-side critical section or holding an
update-side lock, you can see failures as follows:

1.  CPU 0 runs the above code, picks up mapping, and finds it non-NULL.

2.  CPU 0 is preempted or otherwise delayed.  (Keep in mind that
even disabling interrupts in a guest OS does not prevent the
host hypervisor from preempting!)

3.  Some other CPU NULLs page-mapping.  Because CPU 0 isn't doing
anything to prevent it, this other CPU frees the memory.

4.  CPU 0 resumes, and then accesses what is now the freelist.
Arbitrarily bad things start happening.

If you are in a read-side critical section, use rcu_dereference() instead
of rcu_access_pointer().  If you are holding an update-side lock, use
rcu_dereference_protected() and say what lock you are holding.  If you
are doing something else, please say what it is.

Thanx, Paul

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Rafael Aquini
On Tue, Aug 21, 2012 at 06:41:42PM +0300, Michael S. Tsirkin wrote:
 On Tue, Aug 21, 2012 at 05:16:06PM +0200, Peter Zijlstra wrote:
  On Tue, 2012-08-21 at 16:52 +0300, Michael S. Tsirkin wrote:
+ rcu_read_lock();
+ mapping = rcu_dereference(page-mapping);
+ if (mapping_balloon(mapping))
+ ret = true;
+ rcu_read_unlock();
   
   This looks suspicious: you drop rcu_read_unlock
   so can't page switch from balloon to non balloon? 
  
  RCU read lock is a non-exclusive lock, it cannot avoid anything like
  that.
 
 You are right, of course. So even keeping rcu_read_lock across both test
 and operation won't be enough - you need to make this function return
 the mapping and pass it to isolate_page/putback_page so that it is only
 dereferenced once.

No, I need to dereference page-mapping to check -mapping flags here, before
returning. Remember this function is used at MM's compaction/migration inner
circles to identify ballooned pages and decide what's the next step. This
function is doing the right thing, IMHO.

Also, looking at how compaction/migration work, we verify the only critical path
for this function is the page isolation step. The other steps (migration and
putback) perform their work on private lists previously isolated from a given
source.

So, we just need to make sure that the isolation part does not screw things up
by isolating pages that balloon driver is about to release. That's why there are
so many checkpoints down the page isolation path assuring we really are
isolating a balloon page. 


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Rafael Aquini
On Tue, Aug 21, 2012 at 04:52:23PM +0300, Michael S. Tsirkin wrote:
  + * address_space_operations utilized methods for ballooned pages:
  + *   .migratepage- used to perform balloon's page migration (as is)
  + *   .launder_page   - used to isolate a page from balloon's page list
  + *   .freepage   - used to reinsert an isolated page to balloon's page 
  list
  + */
 
 It would be a good idea to document the assumptions here.
 Looks like .launder_page and .freepage are called in rcu critical
 section.
 But migratepage isn't - why is that safe?
 

The migratepage callback for virtio_balloon can sleep, and IIUC we cannot sleep
within a RCU critical section. 

Also, The migratepage callback is called at inner migration's circle function
move_to_new_page(), and I don't think embedding it in a RCU critical section
would be a good idea, for the same understanding aforementioned.

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Rafael Aquini
On Tue, Aug 21, 2012 at 09:24:32AM -0700, Paul E. McKenney wrote:
 On Tue, Aug 21, 2012 at 05:20:11PM +0200, Peter Zijlstra wrote:
  On Tue, 2012-08-21 at 09:47 -0300, Rafael Aquini wrote:
   +   mapping = rcu_access_pointer(page-mapping);
   +   if (mapping)
   +   mapping = mapping-assoc_mapping; 
  
  The comment near rcu_access_pointer() explicitly says:
  
   * Return the value of the specified RCU-protected pointer, but omit the
   * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
   * when the value of this pointer is accessed, but the pointer is not
   * dereferenced,
  
  Yet you dereference the pointer... smells like fail to me.
 
 Indeed!
 
 This will break DEC Alpha.  In addition, if -mapping can transition
 from non-NULL to NULL, and if you used rcu_access_pointer() rather
 than rcu_dereference() to avoid lockdep-RCU from yelling at you about
 not either being in an RCU read-side critical section or holding an
 update-side lock, you can see failures as follows:
 
 1.CPU 0 runs the above code, picks up mapping, and finds it non-NULL.
 
 2.CPU 0 is preempted or otherwise delayed.  (Keep in mind that
   even disabling interrupts in a guest OS does not prevent the
   host hypervisor from preempting!)
 
 3.Some other CPU NULLs page-mapping.  Because CPU 0 isn't doing
   anything to prevent it, this other CPU frees the memory.
 
 4.CPU 0 resumes, and then accesses what is now the freelist.
   Arbitrarily bad things start happening.
 
 If you are in a read-side critical section, use rcu_dereference() instead
 of rcu_access_pointer().  If you are holding an update-side lock, use
 rcu_dereference_protected() and say what lock you are holding.  If you
 are doing something else, please say what it is.
 
   Thanx, Paul

Paul  Peter,

Thanks for looking into this stuff and providing me such valuable feedback, and
RCU usage crashcourse.

I believe rcu_dereference_protected() is what I want/need here, since this code
is always called for pages which we hold locked (PG_locked bit). So, it brings 
me
to ask you if the following usage looks sane enough to fix the well pointed 
issue,
or if it's another misuse of RCU API:

+   mapping = rcu_dereference_protected(page->mapping, PageLocked(page));
+   if (mapping)
+   mapping = mapping-assoc_mapping; 


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [Qemu-devel] [PATCH 1/2 v1] blkdrv: Add queue limits parameters for sg block drive

2012-08-21 Thread Blue Swirl
On Tue, Aug 21, 2012 at 8:23 AM, Cong Meng m...@linux.vnet.ibm.com wrote:
 This patch adds some queue limit parameters into block drive. And inits them
 for sg block drive. Some interfaces are also added for accessing them.

 Signed-off-by: Cong Meng m...@linux.vnet.ibm.com
 ---
  block/raw-posix.c |   58 
 +
  block_int.h   |4 +++
  blockdev.c|   15 +
  hw/block-common.h |3 ++
  4 files changed, 80 insertions(+), 0 deletions(-)

 diff --git a/block/raw-posix.c b/block/raw-posix.c
 index 0dce089..a086f89 100644
 --- a/block/raw-posix.c
 +++ b/block/raw-posix.c
 @@ -53,6 +53,7 @@
  #include linux/cdrom.h
  #include linux/fd.h
  #include linux/fs.h
 +#include dirent.h
  #endif
  #ifdef CONFIG_FIEMAP
  #include linux/fiemap.h
 @@ -829,6 +830,62 @@ static int hdev_probe_device(const char *filename)
  return 0;
  }

 +static void read_queue_limit(char *path, const char *filename, unsigned int 
 *val)
 +{
 +FILE *f;
 +char *tail = path + strlen(path);
 +
 +pstrcat(path, MAXPATHLEN, filename);
 +f = fopen(path, r);
 +if (!f) {
 +goto out;
 +}
 +
 +fscanf(f, %u, val);

Please handle errors, otherwise *val may be left to uninitialized state.

 +fclose(f);
 +
 +out:
 +*tail = 0;
 +}
 +
 +static void sg_get_queue_limits(BlockDriverState *bs, const char *filename)
 +{
 +DIR *ffs;
 +struct dirent *d;
 +char path[MAXPATHLEN];
 +
 +snprintf(path, MAXPATHLEN,
 + /sys/class/scsi_generic/sg%s/device/block/,
 + filename + strlen(/dev/sg));
 +

I'd init bs-max_sectors etc. here so they are not left uninitialized.

 +ffs = opendir(path);
 +if (!ffs) {
 +return;
 +}
 +
 +for (;;) {
 +d = readdir(ffs);
 +if (!d) {
 +return;
 +}
 +
 +if (strcmp(d-d_name, .) == 0 || strcmp(d-d_name, ..) == 0) {
 +continue;
 +}
 +
 +break;
 +}
 +
 +closedir(ffs);
 +
 +pstrcat(path, MAXPATHLEN, d-d_name);
 +pstrcat(path, MAXPATHLEN, /queue/);
 +
 +read_queue_limit(path, max_sectors_kb, bs-max_sectors);
 +read_queue_limit(path, max_segments, bs-max_segments);
 +read_queue_limit(path, max_segment_size, bs-max_segment_size);
 +}
 +
  static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
  {
  BDRVRawState *s = bs-opaque;
 @@ -868,6 +925,7 @@ static int hdev_open(BlockDriverState *bs, const char 
 *filename, int flags)
  temp = realpath(filename, resolved_path);
  if (temp  strstart(temp, /dev/sg, NULL)) {
  bs-sg = 1;
 +sg_get_queue_limits(bs, temp);
  }
  }
  #endif
 diff --git a/block_int.h b/block_int.h
 index d72317f..a9d07a2 100644
 --- a/block_int.h
 +++ b/block_int.h
 @@ -333,6 +333,10 @@ struct BlockDriverState {

  /* long-running background operation */
  BlockJob *job;
 +
 +unsigned int max_sectors;

With 32 bit ints and even with 4k sector size, the maximum disk size
would be 16TB which may be soon exceeded by disks on the market.
Please use 64 bit values, probably also for segment values below.

 +unsigned int max_segments;
 +unsigned int max_segment_size;
  };

  int get_tmp_filename(char *filename, int size);
 diff --git a/blockdev.c b/blockdev.c
 index 3d75015..e17edbf 100644
 --- a/blockdev.c
 +++ b/blockdev.c
 @@ -1191,3 +1191,18 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
  bdrv_iterate(do_qmp_query_block_jobs_one, prev);
  return dummy.next;
  }
 +
 +unsigned int get_queue_max_sectors(BlockDriverState *bs)
 +{
 +return (bs-file  bs-file-sg) ? bs-file-max_sectors : 0;
 +}
 +
 +unsigned int get_queue_max_segments(BlockDriverState *bs)
 +{
 +return (bs-file  bs-file-sg) ? bs-file-max_segments : 0;
 +}
 +
 +unsigned int get_queue_max_segment_size(BlockDriverState *bs)
 +{
 +return (bs-file  bs-file-sg) ? bs-file-max_segment_size : 0;
 +}
 diff --git a/hw/block-common.h b/hw/block-common.h
 index bb808f7..d47fcd7 100644
 --- a/hw/block-common.h
 +++ b/hw/block-common.h
 @@ -76,4 +76,7 @@ void hd_geometry_guess(BlockDriverState *bs,
 int *ptrans);
  int hd_bios_chs_auto_trans(uint32_t cyls, uint32_t heads, uint32_t secs);

 +unsigned int get_queue_max_sectors(BlockDriverState *bs);
 +unsigned int get_queue_max_segments(BlockDriverState *bs);
 +unsigned int get_queue_max_segment_size(BlockDriverState *bs);
  #endif
 --
 1.7.7.6


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 02:28:20PM -0300, Rafael Aquini wrote:
 On Tue, Aug 21, 2012 at 09:24:32AM -0700, Paul E. McKenney wrote:
  On Tue, Aug 21, 2012 at 05:20:11PM +0200, Peter Zijlstra wrote:
   On Tue, 2012-08-21 at 09:47 -0300, Rafael Aquini wrote:
+   mapping = rcu_access_pointer(page-mapping);
+   if (mapping)
+   mapping = mapping-assoc_mapping; 
   
   The comment near rcu_access_pointer() explicitly says:
   
* Return the value of the specified RCU-protected pointer, but omit the
* smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
* when the value of this pointer is accessed, but the pointer is not
* dereferenced,
   
   Yet you dereference the pointer... smells like fail to me.
  
  Indeed!
  
  This will break DEC Alpha.  In addition, if -mapping can transition
  from non-NULL to NULL, and if you used rcu_access_pointer() rather
  than rcu_dereference() to avoid lockdep-RCU from yelling at you about
  not either being in an RCU read-side critical section or holding an
  update-side lock, you can see failures as follows:
  
  1.  CPU 0 runs the above code, picks up mapping, and finds it non-NULL.
  
  2.  CPU 0 is preempted or otherwise delayed.  (Keep in mind that
  even disabling interrupts in a guest OS does not prevent the
  host hypervisor from preempting!)
  
  3.  Some other CPU NULLs page-mapping.  Because CPU 0 isn't doing
  anything to prevent it, this other CPU frees the memory.
  
  4.  CPU 0 resumes, and then accesses what is now the freelist.
  Arbitrarily bad things start happening.
  
  If you are in a read-side critical section, use rcu_dereference() instead
  of rcu_access_pointer().  If you are holding an update-side lock, use
  rcu_dereference_protected() and say what lock you are holding.  If you
  are doing something else, please say what it is.
  
  Thanx, Paul
 
 Paul  Peter,
 
 Thanks for looking into this stuff and providing me such valuable feedback, 
 and
 RCU usage crashcourse.
 
 I believe rcu_dereference_protected() is what I want/need here, since this 
 code
 is always called for pages which we hold locked (PG_locked bit).

It would only help if we locked the page while updating the mapping,
as far as I can see we don't.

 So, it brings me
 to ask you if the following usage looks sane enough to fix the well pointed 
 issue,
 or if it's another misuse of RCU API:
 
 +   mapping = rcu_dereference_protected(page->mapping, PageLocked(page));
 +   if (mapping)
 +   mapping = mapping-assoc_mapping; 
 
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 02:55:03PM -0300, Rafael Aquini wrote:
 On Tue, Aug 21, 2012 at 04:52:23PM +0300, Michael S. Tsirkin wrote:
   + * address_space_operations utilized methods for ballooned pages:
   + *   .migratepage- used to perform balloon's page migration (as is)
   + *   .launder_page   - used to isolate a page from balloon's page list
   + *   .freepage   - used to reinsert an isolated page to balloon's 
   page list
   + */
  
  It would be a good idea to document the assumptions here.
  Looks like .launder_page and .freepage are called in rcu critical
  section.
  But migratepage isn't - why is that safe?
  
 
 The migratepage callback for virtio_balloon can sleep, and IIUC we cannot 
 sleep
 within a RCU critical section. 
 
 Also, The migratepage callback is called at inner migration's circle function
 move_to_new_page(), and I don't think embedding it in a RCU critical section
 would be a good idea, for the same understanding aforementioned.

Yes but this means it is still exposed to the module unloading
races that RCU was supposed to fix.
So need to either rework that code so it won't sleep
or switch to some other synchronization.

-- 
MST
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Rafael Aquini
On Tue, Aug 21, 2012 at 10:13:30PM +0300, Michael S. Tsirkin wrote:
  
  I believe rcu_dereference_protected() is what I want/need here, since this 
  code
  is always called for pages which we hold locked (PG_locked bit).
 
 It would only help if we locked the page while updating the mapping,
 as far as I can see we don't.


But we can do it. In fact, by doing it (locking the page) we can easily avoid
the nasty race balloon_isolate_page / leak_balloon, in a much simpler way, IMHO.


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 02:42:52PM -0300, Rafael Aquini wrote:
 On Tue, Aug 21, 2012 at 06:41:42PM +0300, Michael S. Tsirkin wrote:
  On Tue, Aug 21, 2012 at 05:16:06PM +0200, Peter Zijlstra wrote:
   On Tue, 2012-08-21 at 16:52 +0300, Michael S. Tsirkin wrote:
 + rcu_read_lock();
 + mapping = rcu_dereference(page-mapping);
 + if (mapping_balloon(mapping))
 + ret = true;
 + rcu_read_unlock();

This looks suspicious: you drop rcu_read_unlock
so can't page switch from balloon to non balloon? 
   
   RCU read lock is a non-exclusive lock, it cannot avoid anything like
   that.
  
  You are right, of course. So even keeping rcu_read_lock across both test
  and operation won't be enough - you need to make this function return
  the mapping and pass it to isolate_page/putback_page so that it is only
  dereferenced once.
 
 No, I need to dereference page-mapping to check -mapping flags here, before
 returning. Remember this function is used at MM's compaction/migration inner
 circles to identify ballooned pages and decide what's the next step. This
 function is doing the right thing, IMHO.

Yes but the calling code is not doing the right thing.

What Peter pointed out here is that two calls to rcu dereference pointer
can return different values: rcu critical section is not a lock.
So the test for balloon page is not effective: it can change
after the fact.

To fix, get the pointer once and then pass the mapping
around.


 Also, looking at how compaction/migration work, we verify the only critical 
 path
 for this function is the page isolation step. The other steps (migration and
 putback) perform their work on private lists previously isolated from a given
 source.

I vaguely understand but it would be nice to document this properly.
The interaction between page-lru handling in balloon and in mm
is especially confusing.

 So, we just need to make sure that the isolation part does not screw things up
 by isolating pages that balloon driver is about to release. That's why there 
 are
 so many checkpoints down the page isolation path assuring we really are
 isolating a balloon page. 

Well, testing same thing multiple times is just confusing.  It is very
hard to make sure there are no races with so much complexity,
and the requirements from the balloon driver are unclear to me -
it very much looks like it is poking in mm internals.

-- 
MST
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 04:23:58PM -0300, Rafael Aquini wrote:
 On Tue, Aug 21, 2012 at 10:13:30PM +0300, Michael S. Tsirkin wrote:
   
   I believe rcu_dereference_protected() is what I want/need here, since 
   this code
   is always called for pages which we hold locked (PG_locked bit).
  
  It would only help if we locked the page while updating the mapping,
  as far as I can see we don't.
 
 
 But we can do it. In fact, by doing it (locking the page) we can easily avoid
 the nasty race balloon_isolate_page / leak_balloon, in a much simpler way, 
 IMHO.

Absolutely. Further, we should look hard at whether most RCU uses
in this patchset can be replaced with page lock.

-- 
MST
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Rafael Aquini
On Tue, Aug 21, 2012 at 10:16:12PM +0300, Michael S. Tsirkin wrote:
 On Tue, Aug 21, 2012 at 02:55:03PM -0300, Rafael Aquini wrote:
  On Tue, Aug 21, 2012 at 04:52:23PM +0300, Michael S. Tsirkin wrote:
+ * address_space_operations utilized methods for ballooned pages:
+ *   .migratepage- used to perform balloon's page migration (as is)
+ *   .launder_page   - used to isolate a page from balloon's page list
+ *   .freepage   - used to reinsert an isolated page to balloon's 
page list
+ */
   
   It would be a good idea to document the assumptions here.
   Looks like .launder_page and .freepage are called in rcu critical
   section.
   But migratepage isn't - why is that safe?
   
  
  The migratepage callback for virtio_balloon can sleep, and IIUC we cannot 
  sleep
  within a RCU critical section. 
  
  Also, The migratepage callback is called at inner migration's circle 
  function
  move_to_new_page(), and I don't think embedding it in a RCU critical section
  would be a good idea, for the same understanding aforementioned.
 
 Yes but this means it is still exposed to the module unloading
 races that RCU was supposed to fix.
 So need to either rework that code so it won't sleep
 or switch to some other synchronization.

Can you refactor tell_host() to not sleep? Or, can I get rid of calling it at
virtballoon_migratepage()? If 'no' is the answer for both questions, that's the
way that code has to remain, even if we find a way around to hack the
migratepage callback and have it embedded into a RCU crit section.

That's why I believe once the balloon driver is commanded to unload, we must
flag virtballoon_migratepage to skip its work. By doing this, the thread
performing memory compaction will have to recur to the 'putback' path which is
RCU protected. (IMHO).

As the module will not unload until it leaks all pages on its list, that unload
race you pointed before will be covered.

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Rafael Aquini
On Tue, Aug 21, 2012 at 10:30:31PM +0300, Michael S. Tsirkin wrote:
 On Tue, Aug 21, 2012 at 04:23:58PM -0300, Rafael Aquini wrote:
  On Tue, Aug 21, 2012 at 10:13:30PM +0300, Michael S. Tsirkin wrote:

I believe rcu_dereference_protected() is what I want/need here, since 
this code
is always called for pages which we hold locked (PG_locked bit).
   
   It would only help if we locked the page while updating the mapping,
   as far as I can see we don't.
  
  
  But we can do it. In fact, by doing it (locking the page) we can easily 
  avoid
  the nasty race balloon_isolate_page / leak_balloon, in a much simpler way, 
  IMHO.
 
 Absolutely. Further, we should look hard at whether most RCU uses
 in this patchset can be replaced with page lock.


Yeah, In fact, by testing/grabbing the page lock at leak_balloon() even the
module unload X migration / putback race seems to fade away, since migration
code holds the page locked all the way.

And that seems a quite easy task to be accomplished:


@@ -169,21 +197,61 @@ static void leak_balloon(struct virtio_balloon *vb, size_t
num)
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb-pfns));

+   mutex_lock(vb-balloon_lock);
for (vb-num_pfns = 0; vb-num_pfns  num;
 vb-num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
+   spin_lock(vb-pages_lock);
+   /*
+* 'virtballoon_isolatepage()' can drain vb-pages list
+* making us to stumble across a _temporarily_ empty list.
+*
+* Release the spinlock and resume from here in order to
+* give page migration a shot to refill vb-pages list.
+*/
+   if (unlikely(list_empty(vb-pages))) {
+   spin_unlock(vb-pages_lock);
+   break;
+   }
+
page = list_first_entry(vb-pages, struct page, lru);
+
+   /*
+* Grab the page lock to avoid racing against threads isolating
+* pages from vb-pages list (it's done under page lock).
+*
+* Failing to grab the page lock here means this page has been
+* selected for isolation already.
+*/
+   if (!trylock_page(page)) {
+   spin_unlock(vb-pages_lock);
+   break;
+   }
+
+   clear_balloon_mapping(page);
list_del(page-lru);
set_page_pfns(vb-pfns + vb-num_pfns, page);
vb-num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
+   unlock_page(page);
+   spin_unlock(vb-pages_lock);
}

.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[RFC-v3 0/5] vhost-scsi: Add support for host virtualized target

2012-08-21 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

Hi folks,

This is the third RFC for vhost-scsi patches against mainline QEMU v1.1
to support the upstream tcm_vhost host virtualized target driver now
available in v3.6-rc kernel code.  This series is based upon last week's
commit 346fe0c4c0b, and is aiming for a future QEMU v1.3 merge.

The patch series is available directly from:

   git://git.kernel.org/pub/scm/virt/kvm/nab/qemu-kvm.git vhost-scsi-merge-v3

This -v3 series contains further review changes based upon feedback from
MST, Paolo, and Blue.  It also contains the changes to function against
the changes in target-pending/master - headed for v3.6-rc3 code.

Changes from v3 - v2:

 - Move qdev_prop_vhost_scsi + DEFINE_PROP_VHOST_SCSI defs into vhost-scsi.[c,h]
   (reported by MST)
 - Add enum vhost_scsi_vq_list for VHostSCSI-vqs[] enumeration (reported by 
MST)
 - Add missing braces around single like if statement to following QEMU
   style (reported by Blue Swirl)
 - Change vhost_scsi_target-vhost_wwpn to char *, in order to drop casts to
   pstrcpy in vhost_scsi_start() + vhost_scsi_stop() (reported by Blue Swirl)
 - Change VHOST_SCSI_GET_ABI_VERSION to 'int' type (MST)
 - Add vhost-scsi.h include for DEFINE_PROP_VHOST_SCSI (mst + nab)
 - Move vhost-scsi related struct members ahead of *cmd_vqs[0] within
   VirtIOSCSI definition.  (paolo + nab)
 - Fix 4 byte alignment of vhost_scsi_target (MST)
 - Convert fprintf(stderr, ...) usage to - error_report() (reported by MST)
 - Do explicit memset of backend before calling VHOST_SCSI_CLEAR_ENDPOINT
   in vhost_scsi_stop() (reported by MST)
 - Add support for vhostfd passing in vhost_scsi_add() (reported by MST)
 - Move net_handle_fd_param() - monitor_handle_fd_param() for generic
   usage by net/ + vhost-scsi (reported by MST)
 - Change vhost_scsi_add() to use monitor_handle_fd_param() (reported by MST)

Changes from v1 - v2:

 - Expose ABI version via VHOST_SCSI_GET_ABI_VERSION + use Rev 0 as
   starting point for v3.6-rc code (Stefan + ALiguori + nab)
 - Fix upstream qemu conflict in hw/qdev-properties.c
 - Make GET_ABI_VERSION use int (nab + mst)
 - Drop unnecessary event-notifier changes (nab)
 - Fix vhost-scsi case lables in configure (reported by paolo)
 - Convert qdev_prop_vhost_scsi to use -get() + -set() following
   qdev_prop_netdev (reported by paolo)
 - Fix typo in qemu-options.hx definition of vhost-scsi (reported by paolo)
 - Squash virtio-scsi: use the vhost-scsi host device from stefan (nab)
 - Fix up virtio_scsi_properties[] conflict w/ upstream qemu (nab)
 - Drop usage of to_virtio_scsi() in virtio_scsi_set_status()
  (reported by paolo)
 - Use modern VirtIOSCSIConf define in virtio-scsi.h (reported by paolo)
 - Use s-conf-vhost_scsi instead of proxyconf-vhost_scsi in
   virtio_scsi_init() (reported by paolo)
 - Only register QEMU SCSI bus is vhost-scsi is not active (reported by paolo)
 - Fix incorrect VirtIOSCSI-cmd_vqs[0] definition (nab)

Thanks again to everyone who has been reviewing this series!

--nab

Nicholas Bellinger (2):
  monitor: Rename+move net_handle_fd_param - monitor_handle_fd_param
  virtio-scsi: Set max_target=0 during vhost-scsi operation

Stefan Hajnoczi (3):
  vhost: Pass device path to vhost_dev_init()
  vhost-scsi: add -vhost-scsi host device for use with tcm-vhost
  virtio-scsi: Add start/stop functionality for vhost-scsi

 configure|   10 +++
 hw/Makefile.objs |1 +
 hw/qdev-properties.c |   41 +++
 hw/vhost-scsi.c  |  190 ++
 hw/vhost-scsi.h  |   62 
 hw/vhost.c   |5 +-
 hw/vhost.h   |3 +-
 hw/vhost_net.c   |2 +-
 hw/virtio-pci.c  |2 +
 hw/virtio-scsi.c |   55 ++-
 hw/virtio-scsi.h |1 +
 monitor.c|   18 +
 monitor.h|1 +
 net.c|   18 -
 net.h|2 -
 net/socket.c |2 +-
 net/tap.c|4 +-
 qemu-common.h|1 +
 qemu-config.c|   19 +
 qemu-options.hx  |4 +
 vl.c |   18 +
 21 files changed, 431 insertions(+), 28 deletions(-)
 create mode 100644 hw/vhost-scsi.c
 create mode 100644 hw/vhost-scsi.h

-- 
1.7.2.5

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[RFC-v3 1/5] monitor: Rename+move net_handle_fd_param - monitor_handle_fd_param

2012-08-21 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This patch renames+moves the net_handle_fd_param() caller used to
obtain a file descriptor from either qemu_parse_fd() (the normal case)
or from monitor_get_fd() (migration case) into a generically prefixed
monitor_handle_fd_param() to be used by vhost-scsi code.

Also update net/[socket,tap].c consumers to use the new prefix.

Reported-by: Michael S. Tsirkin m...@redhat.com
Cc: Michael S. Tsirkin m...@redhat.com
Cc: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
Cc: Paolo Bonzini pbonz...@redhat.com
Cc: Anthony Liguori aligu...@us.ibm.com
Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 monitor.c|   18 ++
 monitor.h|1 +
 net.c|   18 --
 net.h|2 --
 net/socket.c |2 +-
 net/tap.c|4 ++--
 6 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/monitor.c b/monitor.c
index 49dccfe..0641efe 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2389,6 +2389,24 @@ int monitor_get_fd(Monitor *mon, const char *fdname)
 return -1;
 }
 
+int monitor_handle_fd_param(Monitor *mon, const char *fdname)
+{
+int fd;
+
+if (!qemu_isdigit(fdname[0])  mon) {
+
+fd = monitor_get_fd(mon, fdname);
+if (fd == -1) {
+error_report(No file descriptor named %s found, fdname);
+return -1;
+}
+} else {
+fd = qemu_parse_fd(fdname);
+}
+
+return fd;
+}
+
 /* mon_cmds and info_cmds would be sorted at runtime */
 static mon_cmd_t mon_cmds[] = {
 #include hmp-commands.h
diff --git a/monitor.h b/monitor.h
index 5f4de1b..d557e97 100644
--- a/monitor.h
+++ b/monitor.h
@@ -65,6 +65,7 @@ int monitor_read_block_device_key(Monitor *mon, const char 
*device,
   void *opaque);
 
 int monitor_get_fd(Monitor *mon, const char *fdname);
+int monitor_handle_fd_param(Monitor *mon, const char *fdname);
 
 void monitor_vprintf(Monitor *mon, const char *fmt, va_list ap)
 GCC_FMT_ATTR(2, 0);
diff --git a/net.c b/net.c
index 60043dd..e5d25d4 100644
--- a/net.c
+++ b/net.c
@@ -522,24 +522,6 @@ int qemu_find_nic_model(NICInfo *nd, const char * const 
*models,
 return -1;
 }
 
-int net_handle_fd_param(Monitor *mon, const char *param)
-{
-int fd;
-
-if (!qemu_isdigit(param[0])  mon) {
-
-fd = monitor_get_fd(mon, param);
-if (fd == -1) {
-error_report(No file descriptor named %s found, param);
-return -1;
-}
-} else {
-fd = qemu_parse_fd(param);
-}
-
-return fd;
-}
-
 static int net_init_nic(const NetClientOptions *opts, const char *name,
 NetClientState *peer)
 {
diff --git a/net.h b/net.h
index 2975056..04fda1d 100644
--- a/net.h
+++ b/net.h
@@ -168,8 +168,6 @@ int qmp_netdev_add(Monitor *mon, const QDict *qdict, 
QObject **ret);
 
 void qdev_set_nic_properties(DeviceState *dev, NICInfo *nd);
 
-int net_handle_fd_param(Monitor *mon, const char *param);
-
 #define POLYNOMIAL 0x04c11db6
 unsigned compute_mcast_idx(const uint8_t *ep);
 
diff --git a/net/socket.c b/net/socket.c
index c172c24..7c602e4 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -629,7 +629,7 @@ int net_init_socket(const NetClientOptions *opts, const 
char *name,
 if (sock-has_fd) {
 int fd;
 
-fd = net_handle_fd_param(cur_mon, sock-fd);
+fd = monitor_handle_fd_param(cur_mon, sock-fd);
 if (fd == -1 || !net_socket_fd_init(peer, socket, name, fd, 1)) {
 return -1;
 }
diff --git a/net/tap.c b/net/tap.c
index 1971525..a88ae8f 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -610,7 +610,7 @@ int net_init_tap(const NetClientOptions *opts, const char 
*name,
 return -1;
 }
 
-fd = net_handle_fd_param(cur_mon, tap-fd);
+fd = monitor_handle_fd_param(cur_mon, tap-fd);
 if (fd == -1) {
 return -1;
 }
@@ -686,7 +686,7 @@ int net_init_tap(const NetClientOptions *opts, const char 
*name,
 int vhostfd;
 
 if (tap-has_vhostfd) {
-vhostfd = net_handle_fd_param(cur_mon, tap-vhostfd);
+vhostfd = monitor_handle_fd_param(cur_mon, tap-vhostfd);
 if (vhostfd == -1) {
 return -1;
 }
-- 
1.7.2.5

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[RFC-v3 2/5] vhost: Pass device path to vhost_dev_init()

2012-08-21 Thread Nicholas A. Bellinger
From: Stefan Hajnoczi stefa...@linux.vnet.ibm.com

The path to /dev/vhost-net is currently hardcoded in vhost_dev_init().
This needs to be changed so that /dev/vhost-scsi can be used.  Pass in
the device path instead of hardcoding it.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
Cc: Paolo Bonzini pbonz...@redhat.com
Cc: Michael S. Tsirkin m...@redhat.com
Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 hw/vhost.c |5 +++--
 hw/vhost.h |3 ++-
 hw/vhost_net.c |2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/hw/vhost.c b/hw/vhost.c
index 0fd8da8..d0ce5aa 100644
--- a/hw/vhost.c
+++ b/hw/vhost.c
@@ -747,14 +747,15 @@ static void vhost_eventfd_del(MemoryListener *listener,
 {
 }
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd, bool force)
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath,
+   bool force)
 {
 uint64_t features;
 int r;
 if (devfd = 0) {
 hdev-control = devfd;
 } else {
-hdev-control = open(/dev/vhost-net, O_RDWR);
+hdev-control = open(devpath, O_RDWR);
 if (hdev-control  0) {
 return -errno;
 }
diff --git a/hw/vhost.h b/hw/vhost.h
index 80e64df..0c47229 100644
--- a/hw/vhost.h
+++ b/hw/vhost.h
@@ -44,7 +44,8 @@ struct vhost_dev {
 bool force;
 };
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd, bool force);
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath,
+   bool force);
 void vhost_dev_cleanup(struct vhost_dev *hdev);
 bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev);
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
diff --git a/hw/vhost_net.c b/hw/vhost_net.c
index ecaa22d..df2c4a3 100644
--- a/hw/vhost_net.c
+++ b/hw/vhost_net.c
@@ -109,7 +109,7 @@ struct vhost_net *vhost_net_init(NetClientState *backend, 
int devfd,
 (1 << VHOST_NET_F_VIRTIO_NET_HDR);
 net->backend = r;
 
-r = vhost_dev_init(&net->dev, devfd, force);
+r = vhost_dev_init(&net->dev, devfd, "/dev/vhost-net", force);
 if (r < 0) {
 goto fail;
 }
-- 
1.7.2.5

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[RFC-v3 3/5] vhost-scsi: add -vhost-scsi host device for use with tcm-vhost

2012-08-21 Thread Nicholas A. Bellinger
From: Stefan Hajnoczi stefa...@linux.vnet.ibm.com

This patch adds a new type of host device that drives the vhost_scsi
device.  The syntax to add vhost-scsi is:

  qemu -vhost-scsi id=vhost-scsi0,wwpn=...,tpgt=123

The virtio-scsi emulated device will make use of vhost-scsi to process
virtio-scsi requests inside the kernel and hand them to the in-kernel
SCSI target stack using the tcm_vhost fabric driver.

The tcm_vhost driver was merged into the upstream linux kernel for 3.6-rc2,
and the commit can be found here:

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commitdiff;h=057cbf49a1f08297

Changelog v2 -> v3:

- Move qdev_prop_vhost_scsi + DEFINE_PROP_VHOST_SCSI defs into vhost-scsi.[c,h]
  (reported by MST)
- Add enum vhost_scsi_vq_list for VHostSCSI->vqs[] enumeration (reported by MST)
- Add missing braces around single like if statement to following QEMU
  style (reported by Blue Swirl)
- Change vhost_scsi_target->vhost_wwpn to char *, in order to drop casts to
  pstrcpy in vhost_scsi_start() + vhost_scsi_stop() (reported by Blue Swirl)
- Change VHOST_SCSI_GET_ABI_VERSION to 'int' type (MST)
- Fix 4 byte alignment of vhost_scsi_target (MST)
- Convert fprintf(stderr, ...) usage to - error_report() (reported by MST)
- Do explict memset of backend before calling VHOST_SCSI_CLEAR_ENDPOINT
  in vhost_scsi_stop() (reported by MST)
- Add support for vhostfd passing in vhost_scsi_add() (reported by MST)
- Change vhost_scsi_add() to use monitor_handle_fd_param() (reported by MST)

Changelog v1 -> v2:

- Expose ABI version via VHOST_SCSI_GET_ABI_VERSION + use Rev 0 as
  starting point for v3.6-rc code (Stefan + ALiguori + nab)
- Fix upstream qemu conflict in hw/qdev-properties.c
- Make GET_ABI_VERSION use int (nab + mst)
- Fix vhost-scsi case lables in configure (reported by paolo)
- Convert qdev_prop_vhost_scsi to use -get() + -set() following
  qdev_prop_netdev (reported by paolo)
- Fix typo in qemu-options.hx definition of vhost-scsi (reported by paolo)

Changelog v0 -> v1:

- Add VHOST_SCSI_SET_ENDPOINT call (stefan)
- Enable vhost notifiers for multiple queues (Zhi)
- clear vhost-scsi endpoint on stopped (Zhi)
- Add CONFIG_VHOST_SCSI for QEMU build configure (nab)
- Rename vhost_vring_target -> vhost_scsi_target (mst + nab)
- Add support for VHOST_SCSI_GET_ABI_VERSION ioctl (aliguori + nab)

Cc: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
Cc: Zhi Yong Wu wu...@linux.vnet.ibm.com
Cc: Anthony Liguori aligu...@us.ibm.com
Cc: Paolo Bonzini pbonz...@redhat.com
Cc: Michael S. Tsirkin m...@redhat.com
Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 configure|   10 +++
 hw/Makefile.objs |1 +
 hw/qdev-properties.c |   41 +++
 hw/vhost-scsi.c  |  190 ++
 hw/vhost-scsi.h  |   62 
 qemu-common.h|1 +
 qemu-config.c|   19 +
 qemu-options.hx  |4 +
 vl.c |   18 +
 9 files changed, 346 insertions(+), 0 deletions(-)
 create mode 100644 hw/vhost-scsi.c
 create mode 100644 hw/vhost-scsi.h

diff --git a/configure b/configure
index f0dbc03..1f03202 100755
--- a/configure
+++ b/configure
@@ -168,6 +168,7 @@ libattr=
 xfs=
 
 vhost_net=no
+vhost_scsi=no
 kvm=no
 gprof=no
 debug_tcg=no
@@ -513,6 +514,7 @@ Haiku)
   usb=linux
   kvm=yes
   vhost_net=yes
+  vhost_scsi=yes
   if [ $cpu = i386 -o $cpu = x86_64 ] ; then
 audio_possible_drivers=$audio_possible_drivers fmod
   fi
@@ -818,6 +820,10 @@ for opt do
   ;;
   --enable-vhost-net) vhost_net=yes
   ;;
+  --disable-vhost-scsi) vhost_scsi=no
+  ;;
+  --enable-vhost-scsi) vhost_scsi=yes
+  ;;
   --disable-opengl) opengl=no
   ;;
   --enable-opengl) opengl=yes
@@ -3116,6 +3122,7 @@ echo posix_madvise $posix_madvise
 echo uuid support  $uuid
 echo libcap-ng support $cap_ng
 echo vhost-net support $vhost_net
+echo vhost-scsi support $vhost_scsi
 echo Trace backend $trace_backend
 echo Trace output file $trace_file-pid
 echo spice support $spice
@@ -3828,6 +3835,9 @@ case $target_arch2 in
  if test "$vhost_net" = "yes" ; then
echo "CONFIG_VHOST_NET=y" >> $config_target_mak
  fi
+  if test "$vhost_scsi" = "yes" ; then
+echo "CONFIG_VHOST_SCSI=y" >> $config_target_mak
+  fi
 fi
 esac
 case $target_arch2 in
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 3ba5dd0..6ab75ec 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -169,6 +169,7 @@ obj-$(CONFIG_VIRTIO) += virtio.o virtio-blk.o 
virtio-balloon.o virtio-net.o
 obj-$(CONFIG_VIRTIO) += virtio-serial-bus.o virtio-scsi.o
 obj-$(CONFIG_SOFTMMU) += vhost_net.o
 obj-$(CONFIG_VHOST_NET) += vhost.o
+obj-$(CONFIG_VHOST_SCSI) += vhost-scsi.o
 obj-$(CONFIG_REALLY_VIRTFS) += 9pfs/
 obj-$(CONFIG_NO_PCI) += pci-stub.o
 obj-$(CONFIG_VGA) += vga.o
diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
index 8aca0d4..8b505ca 100644
--- a/hw/qdev-properties.c
+++ b/hw/qdev-properties.c
@@ -4,6 +4,7 @@
 #include blockdev.h
 

[RFC-v3 4/5] virtio-scsi: Add start/stop functionality for vhost-scsi

2012-08-21 Thread Nicholas A. Bellinger
From: Stefan Hajnoczi stefa...@linux.vnet.ibm.com

This patch starts and stops vhost as the virtio device transitions
through its status phases.  Vhost can only be started once the guest
reports its driver has successfully initialized, which means the
virtqueues have been set up by the guest.

v3: - Add vhost-scsi.h include for DEFINE_PROP_VHOST_SCSI (mst + nab)
- Move vhost-scsi related struct members ahead of *cmd_vqs[0] within
  VirtIOSCSI definition.  (paolo + nab)

v2: - Squash virtio-scsi: use the vhost-scsi host device from stefan (nab)
- Fix up virtio_scsi_properties[] conflict w/ upstream qemu (nab)
- Drop usage of to_virtio_scsi() in virtio_scsi_set_status()
  (reported by paolo)
- Use modern VirtIOSCSIConf define in virtio-scsi.h (reported by paolo)
- Use s-conf-vhost_scsi instead of proxyconf-vhost_scsi in
  virtio_scsi_init() (reported by paolo)
- Only register QEMU SCSI bus if vhost-scsi is not active (reported
  by paolo)

Cc: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
Cc: Zhi Yong Wu wu...@linux.vnet.ibm.com
Cc: Michael S. Tsirkin m...@redhat.com
Cc: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 hw/virtio-pci.c  |2 ++
 hw/virtio-scsi.c |   49 +
 hw/virtio-scsi.h |1 +
 3 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 125eded..8ec7cf1 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -22,6 +22,7 @@
 #include "virtio-net.h"
 #include "virtio-serial.h"
 #include "virtio-scsi.h"
+#include "vhost-scsi.h"
 #include "pci.h"
 #include "qemu-error.h"
 #include "msi.h"
@@ -1036,6 +1037,7 @@ static void virtio_scsi_exit_pci(PCIDevice *pci_dev)
 }
 
 static Property virtio_scsi_properties[] = {
+DEFINE_PROP_VHOST_SCSI("vhost-scsi", VirtIOPCIProxy, scsi.vhost_scsi),
 DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, 
VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
 DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 
DEV_NVECTORS_UNSPECIFIED),
 DEFINE_VIRTIO_SCSI_PROPERTIES(VirtIOPCIProxy, host_features, scsi),
diff --git a/hw/virtio-scsi.c b/hw/virtio-scsi.c
index 5f737ac..edda097 100644
--- a/hw/virtio-scsi.c
+++ b/hw/virtio-scsi.c
@@ -13,9 +13,13 @@
  *
  */
 
+#include "qemu-common.h"
+#include "qemu-error.h"
+#include "vhost-scsi.h"
 #include "virtio-scsi.h"
 #include "hw/scsi.h"
 #include "hw/scsi-defs.h"
+#include "vhost.h"
 
 #define VIRTIO_SCSI_VQ_SIZE 128
 #define VIRTIO_SCSI_CDB_SIZE32
@@ -144,6 +148,10 @@ typedef struct {
 uint32_t cdb_size;
 int resetting;
 bool events_dropped;
+
+bool vhost_started;
+VHostSCSI *vhost_scsi;
+
 VirtQueue *ctrl_vq;
 VirtQueue *event_vq;
 VirtQueue *cmd_vqs[0];
@@ -699,6 +707,38 @@ static struct SCSIBusInfo virtio_scsi_scsi_info = {
 .load_request = virtio_scsi_load_request,
 };
 
+static bool virtio_scsi_started(VirtIOSCSI *s, uint8_t val)
+{
+return (val & VIRTIO_CONFIG_S_DRIVER_OK) && s->vdev.vm_running;
+}
+
+static void virtio_scsi_set_status(VirtIODevice *vdev, uint8_t val)
+{
+VirtIOSCSI *s = (VirtIOSCSI *)vdev;
+bool start = virtio_scsi_started(s, val);
+
+if (s->vhost_started == start) {
+return;
+}
+
+if (start) {
+int ret;
+
+ret = vhost_scsi_start(s->vhost_scsi, vdev);
+if (ret < 0) {
+error_report("virtio-scsi: unable to start vhost: %s\n",
+ strerror(-ret));
+
+/* There is no userspace virtio-scsi fallback so exit */
+exit(1);
+}
+} else {
+vhost_scsi_stop(s->vhost_scsi, vdev);
+}
+
+s->vhost_started = start;
+}
+
 VirtIODevice *virtio_scsi_init(DeviceState *dev, VirtIOSCSIConf *proxyconf)
 {
 VirtIOSCSI *s;
@@ -712,12 +752,17 @@ VirtIODevice *virtio_scsi_init(DeviceState *dev, 
VirtIOSCSIConf *proxyconf)
 
 s->qdev = dev;
 s->conf = proxyconf;
+s->vhost_started = false;
+s->vhost_scsi = s->conf->vhost_scsi;
 
 /* TODO set up vdev function pointers */
 s->vdev.get_config = virtio_scsi_get_config;
 s->vdev.set_config = virtio_scsi_set_config;
 s->vdev.get_features = virtio_scsi_get_features;
 s->vdev.reset = virtio_scsi_reset;
+if (s->vhost_scsi) {
+s->vdev.set_status = virtio_scsi_set_status;
+}
 
 s->ctrl_vq = virtio_add_queue(&s->vdev, VIRTIO_SCSI_VQ_SIZE,
virtio_scsi_handle_ctrl);
@@ -743,5 +788,9 @@ void virtio_scsi_exit(VirtIODevice *vdev)
 {
 VirtIOSCSI *s = (VirtIOSCSI *)vdev;
 unregister_savevm(s->qdev, "virtio-scsi", s);
+
+/* This will stop vhost backend if appropriate. */
+virtio_scsi_set_status(vdev, 0);
+
 virtio_cleanup(vdev);
 }
diff --git a/hw/virtio-scsi.h b/hw/virtio-scsi.h
index 4bc889d..74e9422 100644
--- a/hw/virtio-scsi.h
+++ b/hw/virtio-scsi.h
@@ -22,6 +22,7 @@
 #define VIRTIO_ID_SCSI  8
 
 struct VirtIOSCSIConf {
+VHostSCSI *vhost_scsi;
 uint32_t num_queues;
 

[RFC-v3 5/5] virtio-scsi: Set max_target=0 during vhost-scsi operation

2012-08-21 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This QEMU patch sets VirtIOSCSIConfig-max_target=0 for vhost-scsi operation
to restrict virtio-scsi LLD guest scanning to max_id=0 (a single target ID
instance) when connected to individual tcm_vhost endpoints.

This ensures that virtio-scsi LLD only attempts to scan target IDs up to
VIRTIO_SCSI_MAX_TARGET when connected via virtio-scsi-raw.

Cc: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
Cc: Zhi Yong Wu wu...@linux.vnet.ibm.com
Cc: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 hw/virtio-scsi.c |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/hw/virtio-scsi.c b/hw/virtio-scsi.c
index edda097..ab5ca72 100644
--- a/hw/virtio-scsi.c
+++ b/hw/virtio-scsi.c
@@ -546,7 +546,11 @@ static void virtio_scsi_get_config(VirtIODevice *vdev,
 stl_raw(&scsiconf->sense_size, s->sense_size);
 stl_raw(&scsiconf->cdb_size, s->cdb_size);
 stl_raw(&scsiconf->max_channel, VIRTIO_SCSI_MAX_CHANNEL);
-stl_raw(&scsiconf->max_target, VIRTIO_SCSI_MAX_TARGET);
+if (s->vhost_scsi) {
+stl_raw(&scsiconf->max_target, 0);
+} else {
+stl_raw(&scsiconf->max_target, VIRTIO_SCSI_MAX_TARGET);
+}
 stl_raw(&scsiconf->max_lun, VIRTIO_SCSI_MAX_LUN);
 }
 
-- 
1.7.2.5

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v7 2/4] virtio_balloon: introduce migration primitives to balloon pages

2012-08-21 Thread Rusty Russell
On Wed, 15 Aug 2012 14:28:51 +0300, Michael S. Tsirkin m...@redhat.com 
wrote:
 On Wed, Aug 15, 2012 at 12:16:51PM +0100, Mel Gorman wrote:
  I was thinking of exactly that page->mapping == balloon_mapping check. As I
  do not know how many active balloon drivers there might be I cannot guess
  in advance how much of a scalability problem it will be.
 
 Not at all sure multiple drivers are worth supporting, but multiple
 *devices* is I think worth supporting, if for no other reason than that
 they can work today. For that, we need a device pointer which Rafael
 wants to put into the mapping, this means multiple balloon mappings.

Rafael, please make sure that the balloon driver fails on the second and
subsequent balloon devices.

Michael, we only allow multiple balloon devices because it fell out of
the implementation.  If it causes us even the slightest issue, we should
not support it.  It's not a sensible setup.

Cheers,
Rusty.

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 04:34:39PM -0300, Rafael Aquini wrote:
 On Tue, Aug 21, 2012 at 10:16:12PM +0300, Michael S. Tsirkin wrote:
  On Tue, Aug 21, 2012 at 02:55:03PM -0300, Rafael Aquini wrote:
   On Tue, Aug 21, 2012 at 04:52:23PM +0300, Michael S. Tsirkin wrote:
 + * address_space_operations utilized methods for ballooned pages:
 + *   .migratepage- used to perform balloon's page migration (as 
 is)
 + *   .launder_page   - used to isolate a page from balloon's page 
 list
 + *   .freepage   - used to reinsert an isolated page to 
 balloon's page list
 + */

It would be a good idea to document the assumptions here.
Looks like .launder_page and .freepage are called in rcu critical
section.
But migratepage isn't - why is that safe?

   
   The migratepage callback for virtio_balloon can sleep, and IIUC we cannot 
   sleep
   within a RCU critical section. 
   
   Also, The migratepage callback is called at inner migration's circle 
   function
   move_to_new_page(), and I don't think embedding it in a RCU critical 
   section
   would be a good idea, for the same understanding aforementioned.
  
  Yes but this means it is still exposed to the module unloading
  races that RCU was supposed to fix.
  So need to either rework that code so it won't sleep
  or switch to some other synchronization.
 
 Can you refactor tell_host() to not sleep? Or, can I get rid of calling it at
 virtballoon_migratepage()? If 'no' is the answer for both questions, that's 
 the
 way that code has to remain, even if we find a way around to hack the
 migratepage callback and have it embedded into a RCU crit section.
 
 That's why I believe once the balloon driver is commanded to unload, we must
 flag virtballoon_migratepage to skip it's work. By doing this, the thread
 performing memory compaction will have to recur to the 'putback' path which is
 RCU protected. (IMHO).
 
 As the module will not uload utill it leaks all pages on its list, that unload
 race you pointed before will be covered.


It can not be: nothing callback does can prevent it from
running after module unload: you must have some synchronization
in the calling code.

-- 
MST
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v7 2/4] virtio_balloon: introduce migration primitives to balloon pages

2012-08-21 Thread Michael S. Tsirkin
On Tue, Aug 21, 2012 at 03:01:37PM +0930, Rusty Russell wrote:
 On Wed, 15 Aug 2012 14:28:51 +0300, Michael S. Tsirkin m...@redhat.com 
 wrote:
  On Wed, Aug 15, 2012 at 12:16:51PM +0100, Mel Gorman wrote:
   I was thinking of exactly that page->mapping == balloon_mapping check. As 
   I
   do not know how many active balloon drivers there might be I cannot guess
   in advance how much of a scalability problem it will be.
  
  Not at all sure multiple drivers are worth supporting, but multiple
  *devices* is I think worth supporting, if for no other reason than that
  they can work today. For that, we need a device pointer which Rafael
  wants to put into the mapping, this means multiple balloon mappings.
 
 Rafael, please make sure that the balloon driver fails on the second and
 subsequent balloon devices.
 
 Michael, we only allow multiple balloon devices because it fell out of
 the implementation.  If it causes us even the slightest issue, we should
 not support it.  It's not a sensible setup.
 
 Cheers,
 Rusty.

Looks like latest revision does it using a flag which seems cleaner,
so I think the point is moot.

-- 
MST
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v8 1/5] mm: introduce a common interface for balloon pages mobility

2012-08-21 Thread Rafael Aquini
On Wed, Aug 22, 2012 at 03:07:41AM +0300, Michael S. Tsirkin wrote:
 On Tue, Aug 21, 2012 at 05:45:56PM -0300, Rafael Aquini wrote:
  On Tue, Aug 21, 2012 at 10:30:31PM +0300, Michael S. Tsirkin wrote:
   On Tue, Aug 21, 2012 at 04:23:58PM -0300, Rafael Aquini wrote:
On Tue, Aug 21, 2012 at 10:13:30PM +0300, Michael S. Tsirkin wrote:
  
  I believe rcu_dereference_protected() is what I want/need here, 
  since this code
  is always called for pages which we hold locked (PG_locked bit).
 
 It would only help if we locked the page while updating the mapping,
 as far as I can see we don't.


But we can do it. In fact, by doing it (locking the page) we can easily 
avoid
the nasty race balloon_isolate_page / leak_balloon, in a much simpler 
way, IMHO.
   
   Absolutely. Further, we should look hard at whether most RCU uses
   in this patchset can be replaced with page lock.
  
  
  Yeah, In fact, by testing/grabbing the page lock at leak_balloon() even the
  module unload X migration / putback race seems to fade away, since migration
  code holds the page locked all the way.
  And that seems a quite easy task to be accomplished:
  
  
  @@ -169,21 +197,61 @@ static void leak_balloon(struct virtio_balloon *vb, 
  size_t
  num)
  /* We can only do one array worth at a time. */
  num = min(num, ARRAY_SIZE(vb-pfns));
  
  +   mutex_lock(&vb->balloon_lock);
  for (vb->num_pfns = 0; vb->num_pfns < num;
   vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
  +   spin_lock(&vb->pages_lock);
  +   /*
  +* 'virtballoon_isolatepage()' can drain vb->pages list
  +* making us to stumble across a _temporarily_ empty list.
 
 This still worries me. If this happens we do not
 lock the page so module can go away?
 if not need to document why.

The module won't unload unless it leaks all its pages. If we hit that test that
worries you, leak_balloon() will get back to its caller -- remove_common(), and
it will kept looping at:

/* There might be pages left in the balloon: free them. */
while (vb->num_pages)
leak_balloon(vb, vb->num_pages);

This is true because we do not mess with vb->num_pages while isolating/migrating
balloon pages, so the module will only unload when all isolated pages get back
to vb->pages list and leak_balloon() reaps them appropriately. As we will be
doing isolation/migration/putback steps under 'page lock' that race is gone.

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization