[Qemu-block] [PATCH V2 0/4] ide: avoid main-loop hang on CDROM/NFS failure

2015-10-12 Thread Peter Lieven
This series aims at avoiding a hanging main loop if a vserver has a
CDROM image mounted from an NFS share and that NFS share goes down.
The typical situation is that users mount a CDROM ISO to install something
and then forget to eject that CDROM afterwards.
As a consequence this mounted CD can bring down the
whole vserver if the backend NFS share is unreachable. This is bad
especially if the CDROM itself is not needed anymore at this point.

This series aims at fixing 2 blocking I/O operations that would
hang if the NFS server is unavailable:
 - ATAPI PIO read requests used sync calls to blk_read, convert
   them to an async variant where possible.
 - If a busmaster DMA request is cancelled all requests are drained.
   Convert the drain to an async request canceling.

v1->v2: - fix offset for 2352 byte sector size [Kevin]
- use a sync request if we continue an elementary transfer.
  As John pointed out, we would otherwise enter a race condition between
  the next IDE command and the async transfer. This is still not
  optimal, but it fixes the NFS-down problems for all cases where
  the NFS server goes down while there is no PIO CD activity.
  Of course, it could still happen during a PIO transfer, but I
  expect that to be the less likely case.
  I spent some effort trying to read more sectors at once and
  avoiding continuation of elementary transfers, but whatever I
  came up with broke migration between different
  Qemu versions. I have a quite hackish patch that works and
  should survive migration, but I am not happy with it. So I
  would like to start with this version as it is a big improvement
  already.
- Dropped Patch 5 because it is upstream meanwhile.

Peter Lieven (4):
  ide/atapi: make PIO read requests async
  ide/atapi: blk_aio_readv may return NULL
  ide: add support for cancelable read requests
  ide/atapi: enable cancelable requests

 hw/ide/atapi.c| 99 +--
 hw/ide/core.c | 55 +++
 hw/ide/internal.h | 16 +
 hw/ide/pci.c  | 42 +++
 4 files changed, 188 insertions(+), 24 deletions(-)

-- 
1.9.1




[Qemu-block] [PATCH 4/4] ide/atapi: enable cancelable requests

2015-10-12 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 hw/ide/atapi.c | 4 ++--
 hw/ide/core.c  | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index e0cf066..8d38b1d 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -187,7 +187,7 @@ static int cd_read_sector(IDEState *s, int lba, void *buf)
 printf("cd_read_sector: lba=%d\n", lba);
 #endif
 
-if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+if (ide_readv_cancelable(s, (int64_t)lba << 2, &s->qiov, 4,
   cd_read_sector_cb, s) == NULL) {
 return -EIO;
 }
@@ -426,7 +426,7 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
 s->bus->dma->iov.iov_len = n * 4 * 512;
 qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
 
-s->bus->dma->aiocb = blk_aio_readv(s->blk, (int64_t)s->lba << 2,
+s->bus->dma->aiocb = ide_readv_cancelable(s, (int64_t)s->lba << 2,
&s->bus->dma->qiov, n * 4,
ide_atapi_cmd_read_dma_cb, s);
 if (s->bus->dma->aiocb == NULL) {
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 24547ce..5c7a346 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2330,6 +2330,7 @@ int ide_init_drive(IDEState *s, BlockBackend *blk, IDEDriveKind kind,
 if (kind == IDE_CD) {
 blk_set_dev_ops(blk, &ide_cd_block_ops, s);
 blk_set_guest_block_size(blk, 2048);
+s->requests_cancelable = true;
 } else {
 if (!blk_is_inserted(s->blk)) {
 error_report("Device needs media, but drive is empty");
-- 
1.9.1




[Qemu-block] [PATCH 3/4] ide: add support for cancelable read requests

2015-10-12 Thread Peter Lieven
This patch adds a new aio readv compatible function which copies
all data through a bounce buffer. The benefit is that these requests
can be flagged as canceled to avoid guest memory corruption when
a canceled request is completed by the backend at a later stage.

If an IDE protocol wants to use this function it has to pipe
all read requests through ide_readv_cancelable and it may then
enable requests_cancelable in the IDEState.

If this flag is enabled we can avoid the blocking blk_drain_all
in case of a BMDMA reset.

Currently only read operations are cancelable, thus we can only
use this logic for read-only devices.

Signed-off-by: Peter Lieven 
---
 hw/ide/core.c | 54 ++
 hw/ide/internal.h | 16 
 hw/ide/pci.c  | 42 --
 3 files changed, 98 insertions(+), 14 deletions(-)
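
As a rough illustration only (not part of the patch itself), the BMDMA reset
path could flag in-flight reads roughly like this instead of draining them.
The helper name is made up; it only relies on the IDECancelableRequest list
and the canceled flag introduced in this series:

/* Hypothetical sketch, not part of this patch: mark all in-flight
 * cancelable reads as canceled instead of draining them. */
static void ide_cancel_cancelable_requests(IDEState *s)
{
    IDECancelableRequest *req;

    QLIST_FOREACH(req, &s->cancelable_requests, list) {
        req->canceled = true;
    }
}

The bounce buffer and the request are then freed as usual in
ide_readv_cancelable_cb(), which skips the copy into guest memory once
canceled is set.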

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 317406d..24547ce 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -561,6 +561,59 @@ static bool ide_sect_range_ok(IDEState *s,
 return true;
 }
 
+static void ide_readv_cancelable_cb(void *opaque, int ret)
+{
+IDECancelableRequest *req = opaque;
+if (!req->canceled) {
+if (!ret) {
+qemu_iovec_from_buf(req->org_qiov, 0, req->buf, req->org_qiov->size);
+}
+req->org_cb(req->org_opaque, ret);
+}
+QLIST_REMOVE(req, list);
+qemu_vfree(req->buf);
+qemu_iovec_destroy(&req->qiov);
+g_free(req);
+}
+
+#define MAX_CANCELABLE_REQS 16
+
+BlockAIOCB *ide_readv_cancelable(IDEState *s, int64_t sector_num,
+ QEMUIOVector *iov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+BlockAIOCB *aioreq;
+IDECancelableRequest *req;
+int c = 0;
+
+QLIST_FOREACH(req, &s->cancelable_requests, list) {
+c++;
+}
+if (c > MAX_CANCELABLE_REQS) {
+return NULL;
+}
+
+req = g_new0(IDECancelableRequest, 1);
+qemu_iovec_init(&req->qiov, 1);
+req->buf = qemu_blockalign(blk_bs(s->blk), iov->size);
+qemu_iovec_add(&req->qiov, req->buf, iov->size);
+req->org_qiov = iov;
+req->org_cb = cb;
+req->org_opaque = opaque;
+
+aioreq = blk_aio_readv(s->blk, sector_num, &req->qiov, nb_sectors,
+   ide_readv_cancelable_cb, req);
+if (aioreq == NULL) {
+qemu_vfree(req->buf);
+qemu_iovec_destroy(&req->qiov);
+g_free(req);
+} else {
+QLIST_INSERT_HEAD(&s->cancelable_requests, req, list);
+}
+
+return aioreq;
+}
+
 static void ide_sector_read(IDEState *s);
 
 static void ide_sector_read_cb(void *opaque, int ret)
@@ -805,6 +858,7 @@ void ide_start_dma(IDEState *s, BlockCompletionFunc *cb)
 s->bus->retry_unit = s->unit;
 s->bus->retry_sector_num = ide_get_sector(s);
 s->bus->retry_nsector = s->nsector;
+s->bus->s = s;
 if (s->bus->dma->ops->start_dma) {
 s->bus->dma->ops->start_dma(s->bus->dma, s, cb);
 }
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index 05e93ff..ad188c2 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -343,6 +343,16 @@ enum ide_dma_cmd {
 #define ide_cmd_is_read(s) \
((s)->dma_cmd == IDE_DMA_READ)
 
+typedef struct IDECancelableRequest {
+QLIST_ENTRY(IDECancelableRequest) list;
+QEMUIOVector qiov;
+uint8_t *buf;
+QEMUIOVector *org_qiov;
+BlockCompletionFunc *org_cb;
+void *org_opaque;
+bool canceled;
+} IDECancelableRequest;
+
 /* NOTE: IDEState represents in fact one drive */
 struct IDEState {
 IDEBus *bus;
@@ -396,6 +406,8 @@ struct IDEState {
 BlockAIOCB *pio_aiocb;
 struct iovec iov;
 QEMUIOVector qiov;
+QLIST_HEAD(, IDECancelableRequest) cancelable_requests;
+bool requests_cancelable;
 /* ATA DMA state */
 int32_t io_buffer_offset;
 int32_t io_buffer_size;
@@ -468,6 +480,7 @@ struct IDEBus {
 uint8_t retry_unit;
 int64_t retry_sector_num;
 uint32_t retry_nsector;
+IDEState *s;
 };
 
 #define TYPE_IDE_DEVICE "ide-device"
@@ -572,6 +585,9 @@ void ide_set_inactive(IDEState *s, bool more);
 BlockAIOCB *ide_issue_trim(BlockBackend *blk,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *ide_readv_cancelable(IDEState *s, int64_t sector_num,
+ QEMUIOVector *iov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
 
 /* hw/ide/atapi.c */
 void ide_atapi_cmd(IDEState *s);
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index d31ff88..5587183 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -240,21 +240,35 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
 /* Ignore writes to S

[Qemu-block] [PATCH 2/4] ide/atapi: blk_aio_readv may return NULL

2015-10-12 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 hw/ide/atapi.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 2271ea2..e0cf066 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -429,6 +429,10 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
 s->bus->dma->aiocb = blk_aio_readv(s->blk, (int64_t)s->lba << 2,
&s->bus->dma->qiov, n * 4,
ide_atapi_cmd_read_dma_cb, s);
+if (s->bus->dma->aiocb == NULL) {
+ide_atapi_io_error(s, -EIO);
+goto eot;
+}
 return;
 
 eot:
-- 
1.9.1




[Qemu-block] [PATCH 1/4] ide/atapi: make PIO read requests async

2015-10-12 Thread Peter Lieven
PIO read requests on the ATAPI interface used to be sync blk requests.
This has two significant drawbacks. First, the main loop hangs until an
I/O request is completed, and secondly, if the I/O request does not
complete (e.g. due to unresponsive storage) Qemu hangs completely.

Signed-off-by: Peter Lieven 
---
 hw/ide/atapi.c | 93 --
 1 file changed, 84 insertions(+), 9 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 747f466..2271ea2 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -105,11 +105,16 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
 memset(buf, 0, 288);
 }
 
-static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
+static int
+cd_read_sector_sync(IDEState *s, int lba, uint8_t *buf)
 {
 int ret;
 
-switch(sector_size) {
+#ifdef DEBUG_IDE_ATAPI
+printf("cd_read_sector_sync: lba=%d\n", lba);
+#endif
+
+switch (s->cd_sector_size) {
 case 2048:
 block_acct_start(blk_get_stats(s->blk), &s->acct,
  4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
@@ -129,9 +134,71 @@ static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
 ret = -EIO;
 break;
 }
+
+if (!ret) {
+s->lba++;
+s->io_buffer_index = 0;
+}
+
 return ret;
 }
 
+static void cd_read_sector_cb(void *opaque, int ret)
+{
+IDEState *s = opaque;
+
+block_acct_done(blk_get_stats(s->blk), &s->acct);
+
+#ifdef DEBUG_IDE_ATAPI
+printf("cd_read_sector_cb: lba=%d ret=%d\n", s->lba, ret);
+#endif
+
+if (ret < 0) {
+ide_atapi_io_error(s, ret);
+return;
+}
+
+if (s->cd_sector_size == 2352) {
+cd_data_to_raw(s->io_buffer, s->lba);
+}
+
+s->lba++;
+s->io_buffer_index = 0;
+s->status &= ~BUSY_STAT;
+
+ide_atapi_cmd_reply_end(s);
+}
+
+static int cd_read_sector(IDEState *s, int lba, void *buf)
+{
+if (s->cd_sector_size != 2048 && s->cd_sector_size != 2352) {
+return -EINVAL;
+}
+
+s->iov.iov_base = buf;
+if (s->cd_sector_size == 2352) {
+buf += 16;
+}
+
+s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+qemu_iovec_init_external(&s->qiov, &s->iov, 1);
+
+#ifdef DEBUG_IDE_ATAPI
+printf("cd_read_sector: lba=%d\n", lba);
+#endif
+
+if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+  cd_read_sector_cb, s) == NULL) {
+return -EIO;
+}
+
+block_acct_start(blk_get_stats(s->blk), &s->acct,
+ 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
+
+s->status |= BUSY_STAT;
+return 0;
+}
+
 void ide_atapi_cmd_ok(IDEState *s)
 {
 s->error = 0;
@@ -182,18 +249,27 @@ void ide_atapi_cmd_reply_end(IDEState *s)
 ide_atapi_cmd_ok(s);
 ide_set_irq(s->bus);
 #ifdef DEBUG_IDE_ATAPI
-printf("status=0x%x\n", s->status);
+printf("end of transfer, status=0x%x\n", s->status);
 #endif
 } else {
 /* see if a new sector must be read */
 if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) {
-ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size);
-if (ret < 0) {
-ide_atapi_io_error(s, ret);
+if (!s->elementary_transfer_size) {
+ret = cd_read_sector(s, s->lba, s->io_buffer);
+if (ret < 0) {
+ide_atapi_io_error(s, ret);
+}
 return;
+} else {
+/* rebuffering within an elementary transfer is
+ * only possible with a sync request because we
+ * end up with a race condition otherwise */
+ret = cd_read_sector_sync(s, s->lba, s->io_buffer);
+if (ret < 0) {
+ide_atapi_io_error(s, ret);
+return;
+}
 }
-s->lba++;
-s->io_buffer_index = 0;
 }
 if (s->elementary_transfer_size > 0) {
 /* there are some data left to transmit in this elementary
@@ -275,7 +351,6 @@ static void ide_atapi_cmd_read_pio(IDEState *s, int lba, int nb_sectors,
 s->io_buffer_index = sector_size;
 s->cd_sector_size = sector_size;
 
-s->status = READY_STAT | SEEK_STAT;
 ide_atapi_cmd_reply_end(s);
 }
 
-- 
1.9.1




Re: [Qemu-block] [Qemu-devel] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-10-09 Thread Peter Lieven
On 09.10.2015 at 10:21, Kevin Wolf wrote:
> Am 08.10.2015 um 18:44 hat John Snow geschrieben:
>> On 10/08/2015 08:06 AM, Peter Lieven wrote:
>>> Hi all,
>>>
>>> short summary from my side. The whole thing seems to get complicated,
>>> let me explain why:
>>>
>>> 1) During review I found that the code in ide_atapi_cmd_reply_end can't
>>> work correctly if the
>>> byte_count_limit is not a divider or a multiple of cd_sector_size. The
>>> reason is that as soon
>>> as we load the next sector we start at io_buffer offset 0 overwriting
>>> whatever is left in there
>>> for transfer. We also reset the io_buffer_index to 0 which means if we
>>> continue with the
>>> elementary transfer we always transfer a whole sector (of corrupt data)
>>> regardless if we
>>> are allowed to transfer that much data. Before we consider fixing this I
>>> wonder if it
>>> is legal at all to have an unaligned byte_count_limit. It obviously has
>>> never caused trouble in
>>> practice so maybe its not happening in real life.
>>>
>> I had overlooked that part. Good catch. I do suspect that in practice
>> nobody will be asking for bizarre values.
>>
>> There's no rule against an unaligned byte_count_limit as far as I have
>> read, but suspect nobody would have a reason to use it in practice.
> If we know that our code is technically wrong, "nobody uses it" is not a
> good reason not to fix it. Because sooner or later someone will use it.

I found that I had just misinterpreted the code. I think it's correct
although it's not nice.

>
>>> 2) I found that whatever cool optimization I put in to buffer multiple
>>> sectors at once I end
>>> up with code that breaks migration because older versions would either
>>> not fill the io_buffer
>>> as expected or we introduce variables that older versions do not
>>> understand. This will
>>> lead to problems if we migrate in the middle of a transfer.
>>>
>> Ech. This sounds like a bit of a problem. I'll need to think about this
>> one...
> Sounds like a textbook example for subsections to me?

I wonder if we need this at all. It's just PIO. However, Windows and Linux
fall back to PIO if I disconnect the NFS server for some time. With the
latest version of my patch series this works fine now and even works
fine after restoring NFS connectivity. This was not working properly
because of the glitch that John found.

I have an optimization in mind that might work and would eliminate the need
for the sync requests while keeping migration possible.
I hope to be able to complete this by Monday and send out a V2 then.

If someone wants to have a look at what I currently have (especially
the cancelable requests part) it's here:
https://github.com/plieven/qemu/commits/atapi_async_V2

Thanks,
Peter



Re: [Qemu-block] [Qemu-devel] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-10-08 Thread Peter Lieven

Hi all,

short summary from my side. The whole thing seems to get complicated, let me 
explain why:

1) During review I found that the code in ide_atapi_cmd_reply_end can't work
correctly if the byte_count_limit is not a divisor or a multiple of
cd_sector_size. The reason is that as soon as we load the next sector we
start at io_buffer offset 0, overwriting whatever is left in there for
transfer. We also reset the io_buffer_index to 0, which means that if we
continue with the elementary transfer we always transfer a whole sector
(of corrupt data) regardless of whether we are allowed to transfer that
much data. Before we consider fixing this I wonder if it is legal at all
to have an unaligned byte_count_limit. It obviously has never caused
trouble in practice, so maybe it's not happening in real life.

2) I found that with whatever cool optimization I put in to buffer multiple
sectors at once, I end up with code that breaks migration, because older
versions would either not fill the io_buffer as expected or we would
introduce variables that older versions do not understand. This will lead
to problems if we migrate in the middle of a transfer.

3) My current plan to get this patch to a useful state would be to use my
initial patch and just change the code to use a sync request if we need to
buffer additional sectors in an elementary transfer. I found that in
real-world operating systems the byte_count_limit seems to be equal to the
cd_sector_size. After all it's just a PIO transfer; an operating system
will likely switch to DMA as soon as the kernel is loaded.

Thanks,
Peter




Re: [Qemu-block] [Qemu-devel] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-10-07 Thread Peter Lieven
On 07.10.2015 at 18:42, John Snow wrote:
>
> On 10/06/2015 04:46 AM, Peter Lieven wrote:
>> Am 05.10.2015 um 23:15 schrieb John Snow:
>>> On 09/21/2015 08:25 AM, Peter Lieven wrote:
>>>> PIO read requests on the ATAPI interface used to be sync blk requests.
>>>> This has to siginificant drawbacks. First the main loop hangs util an
>>>> I/O request is completed and secondly if the I/O request does not
>>>> complete (e.g. due to an unresponsive storage) Qemu hangs completely.
>>>>
>>>> Signed-off-by: Peter Lieven 
>>>> ---
>>>>   hw/ide/atapi.c | 69
>>>> --
>>>>   1 file changed, 43 insertions(+), 26 deletions(-)
>>>>
>>>> diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
>>>> index 747f466..9257e1c 100644
>>>> --- a/hw/ide/atapi.c
>>>> +++ b/hw/ide/atapi.c
>>>> @@ -105,31 +105,51 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
>>>>   memset(buf, 0, 288);
>>>>   }
>>>>   -static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int
>>>> sector_size)
>>>> +static void cd_read_sector_cb(void *opaque, int ret)
>>>>   {
>>>> -int ret;
>>>> +IDEState *s = opaque;
>>>>   -switch(sector_size) {
>>>> -case 2048:
>>>> -block_acct_start(blk_get_stats(s->blk), &s->acct,
>>>> - 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
>>>> -ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
>>>> -block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>> -break;
>>>> -case 2352:
>>>> -block_acct_start(blk_get_stats(s->blk), &s->acct,
>>>> - 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
>>>> -ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
>>>> -block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>> -if (ret < 0)
>>>> -return ret;
>>>> -cd_data_to_raw(buf, lba);
>>>> -break;
>>>> -default:
>>>> -ret = -EIO;
>>>> -break;
>>>> +block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>> +
>>>> +if (ret < 0) {
>>>> +ide_atapi_io_error(s, ret);
>>>> +return;
>>>> +}
>>>> +
>>>> +if (s->cd_sector_size == 2352) {
>>>> +cd_data_to_raw(s->io_buffer, s->lba);
>>>>   }
>>>> -return ret;
>>>> +
>>>> +s->lba++;
>>>> +s->io_buffer_index = 0;
>>>> +s->status &= ~BUSY_STAT;
>>>> +
>>>> +ide_atapi_cmd_reply_end(s);
>>>> +}
>>>> +
>>>> +static int cd_read_sector(IDEState *s, int lba, void *buf, int
>>>> sector_size)
>>>> +{
>>>> +if (sector_size != 2048 && sector_size != 2352) {
>>>> +return -EINVAL;
>>>> +}
>>>> +
>>>> +s->iov.iov_base = buf;
>>>> +if (sector_size == 2352) {
>>>> +buf += 4;
>>>> +}
>>>> +
>>>> +s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
>>>> +qemu_iovec_init_external(&s->qiov, &s->iov, 1);
>>>> +
>>>> +if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
>>>> +  cd_read_sector_cb, s) == NULL) {
>>>> +return -EIO;
>>>> +}
>>>> +
>>>> +block_acct_start(blk_get_stats(s->blk), &s->acct,
>>>> + 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
>>>> +s->status |= BUSY_STAT;
>>>> +return 0;
>>>>   }
>>>>   
>>> We discussed this off-list a bit, but for upstream synchronization:
>>>
>>> Unfortunately, I believe making cd_read_sector here non-blocking makes
>>> ide_atapi_cmd_reply_end non-blocking, and as a result makes calls to
>>> s->end_transfer_func() nonblocking, which functions like ide_data_readw
>>> are not prepared to cope with.
>>>
>>> My suggestion is to buffer an entire DRQ block of data at once
>>> (byte_count_limit) to avoid the problem.
>&g

Re: [Qemu-block] [Qemu-devel] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-10-06 Thread Peter Lieven
On 06.10.2015 at 19:56, John Snow wrote:
>
> On 10/06/2015 01:12 PM, Peter Lieven wrote:
>>> Am 06.10.2015 um 19:07 schrieb John Snow :
>>>
>>>
>>>
>>>> On 10/06/2015 05:20 AM, Peter Lieven wrote:
>>>>> Am 06.10.2015 um 10:57 schrieb Kevin Wolf:
>>>>> Am 05.10.2015 um 23:15 hat John Snow geschrieben:
>>>>>> On 09/21/2015 08:25 AM, Peter Lieven wrote:
>>>>>>> PIO read requests on the ATAPI interface used to be sync blk requests.
>>>>>>> This has to siginificant drawbacks. First the main loop hangs util an
>>>>>>> I/O request is completed and secondly if the I/O request does not
>>>>>>> complete (e.g. due to an unresponsive storage) Qemu hangs completely.
>>>>>>>
>>>>>>> Signed-off-by: Peter Lieven 
>>>>>>> ---
>>>>>>>  hw/ide/atapi.c | 69
>>>>>>> --
>>>>>>>  1 file changed, 43 insertions(+), 26 deletions(-)
>>>>>>>
>>>>>>> diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
>>>>>>> index 747f466..9257e1c 100644
>>>>>>> --- a/hw/ide/atapi.c
>>>>>>> +++ b/hw/ide/atapi.c
>>>>>>> @@ -105,31 +105,51 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
>>>>>>>  memset(buf, 0, 288);
>>>>>>>  }
>>>>>>>  -static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int
>>>>>>> sector_size)
>>>>>>> +static void cd_read_sector_cb(void *opaque, int ret)
>>>>>>>  {
>>>>>>> -int ret;
>>>>>>> +IDEState *s = opaque;
>>>>>>>  -switch(sector_size) {
>>>>>>> -case 2048:
>>>>>>> -block_acct_start(blk_get_stats(s->blk), &s->acct,
>>>>>>> - 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
>>>>>>> -ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
>>>>>>> -block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>>>>> -break;
>>>>>>> -case 2352:
>>>>>>> -block_acct_start(blk_get_stats(s->blk), &s->acct,
>>>>>>> - 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
>>>>>>> -ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
>>>>>>> -block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>>>>> -if (ret < 0)
>>>>>>> -return ret;
>>>>>>> -cd_data_to_raw(buf, lba);
>>>>>>> -break;
>>>>>>> -default:
>>>>>>> -ret = -EIO;
>>>>>>> -break;
>>>>>>> +block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>>>>> +
>>>>>>> +if (ret < 0) {
>>>>>>> +ide_atapi_io_error(s, ret);
>>>>>>> +return;
>>>>>>> +}
>>>>>>> +
>>>>>>> +if (s->cd_sector_size == 2352) {
>>>>>>> +cd_data_to_raw(s->io_buffer, s->lba);
>>>>>>>  }
>>>>>>> -return ret;
>>>>>>> +
>>>>>>> +s->lba++;
>>>>>>> +s->io_buffer_index = 0;
>>>>>>> +s->status &= ~BUSY_STAT;
>>>>>>> +
>>>>>>> +ide_atapi_cmd_reply_end(s);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int cd_read_sector(IDEState *s, int lba, void *buf, int
>>>>>>> sector_size)
>>>>>>> +{
>>>>>>> +if (sector_size != 2048 && sector_size != 2352) {
>>>>>>> +return -EINVAL;
>>>>>>> +}
>>>>>>> +
>>>>>>> +s->iov.iov_base = buf;
>>>>>>> +if (sector_size == 2352) {
>>>>>>> +buf += 4;
>>>>>>> +}
>>>>> This doesn't look quite right, buf is never read after this.
>>>>>
>>>>> Also, why +=

Re: [Qemu-block] [Qemu-devel] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-10-06 Thread Peter Lieven

> Am 06.10.2015 um 19:07 schrieb John Snow :
> 
> 
> 
>> On 10/06/2015 05:20 AM, Peter Lieven wrote:
>>> Am 06.10.2015 um 10:57 schrieb Kevin Wolf:
>>> Am 05.10.2015 um 23:15 hat John Snow geschrieben:
>>>> 
>>>> On 09/21/2015 08:25 AM, Peter Lieven wrote:
>>>>> PIO read requests on the ATAPI interface used to be sync blk requests.
>>>>> This has to siginificant drawbacks. First the main loop hangs util an
>>>>> I/O request is completed and secondly if the I/O request does not
>>>>> complete (e.g. due to an unresponsive storage) Qemu hangs completely.
>>>>> 
>>>>> Signed-off-by: Peter Lieven 
>>>>> ---
>>>>>  hw/ide/atapi.c | 69
>>>>> --
>>>>>  1 file changed, 43 insertions(+), 26 deletions(-)
>>>>> 
>>>>> diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
>>>>> index 747f466..9257e1c 100644
>>>>> --- a/hw/ide/atapi.c
>>>>> +++ b/hw/ide/atapi.c
>>>>> @@ -105,31 +105,51 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
>>>>>  memset(buf, 0, 288);
>>>>>  }
>>>>>  -static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int
>>>>> sector_size)
>>>>> +static void cd_read_sector_cb(void *opaque, int ret)
>>>>>  {
>>>>> -int ret;
>>>>> +IDEState *s = opaque;
>>>>>  -switch(sector_size) {
>>>>> -case 2048:
>>>>> -block_acct_start(blk_get_stats(s->blk), &s->acct,
>>>>> - 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
>>>>> -ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
>>>>> -block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>>> -break;
>>>>> -case 2352:
>>>>> -block_acct_start(blk_get_stats(s->blk), &s->acct,
>>>>> - 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
>>>>> -ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
>>>>> -block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>>> -if (ret < 0)
>>>>> -return ret;
>>>>> -cd_data_to_raw(buf, lba);
>>>>> -break;
>>>>> -default:
>>>>> -ret = -EIO;
>>>>> -break;
>>>>> +block_acct_done(blk_get_stats(s->blk), &s->acct);
>>>>> +
>>>>> +if (ret < 0) {
>>>>> +ide_atapi_io_error(s, ret);
>>>>> +return;
>>>>> +}
>>>>> +
>>>>> +if (s->cd_sector_size == 2352) {
>>>>> +cd_data_to_raw(s->io_buffer, s->lba);
>>>>>  }
>>>>> -return ret;
>>>>> +
>>>>> +s->lba++;
>>>>> +s->io_buffer_index = 0;
>>>>> +s->status &= ~BUSY_STAT;
>>>>> +
>>>>> +ide_atapi_cmd_reply_end(s);
>>>>> +}
>>>>> +
>>>>> +static int cd_read_sector(IDEState *s, int lba, void *buf, int
>>>>> sector_size)
>>>>> +{
>>>>> +if (sector_size != 2048 && sector_size != 2352) {
>>>>> +return -EINVAL;
>>>>> +}
>>>>> +
>>>>> +s->iov.iov_base = buf;
>>>>> +if (sector_size == 2352) {
>>>>> +buf += 4;
>>>>> +}
>>> This doesn't look quite right, buf is never read after this.
>>> 
>>> Also, why +=4 when it was originally buf + 16?
>> 
>> You are right. I mixed that up.
>> 
>>> 
>>>>> +
>>>>> +s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
>>>>> +qemu_iovec_init_external(&s->qiov, &s->iov, 1);
>>>>> +
>>>>> +if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
>>>>> +  cd_read_sector_cb, s) == NULL) {
>>>>> +return -EIO;
>>>>> +}
>>>>> +
>>>>> +block_acct_start(blk_get_stats(s->blk), &s->acct,
>>>>> + 4 * BDRV_SECTOR_SIZE, 

Re: [Qemu-block] [Qemu-devel] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-10-06 Thread Peter Lieven

On 06.10.2015 at 10:46, Peter Lieven wrote:

On 05.10.2015 at 23:15, John Snow wrote:


On 09/21/2015 08:25 AM, Peter Lieven wrote:

PIO read requests on the ATAPI interface used to be sync blk requests.
This has two significant drawbacks. First the main loop hangs until an
I/O request is completed and secondly if the I/O request does not
complete (e.g. due to an unresponsive storage) Qemu hangs completely.

Signed-off-by: Peter Lieven 
---
  hw/ide/atapi.c | 69 --
  1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 747f466..9257e1c 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -105,31 +105,51 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
  memset(buf, 0, 288);
  }
  -static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int 
sector_size)
+static void cd_read_sector_cb(void *opaque, int ret)
  {
-int ret;
+IDEState *s = opaque;
  -switch(sector_size) {
-case 2048:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-break;
-case 2352:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-if (ret < 0)
-return ret;
-cd_data_to_raw(buf, lba);
-break;
-default:
-ret = -EIO;
-break;
+block_acct_done(blk_get_stats(s->blk), &s->acct);
+
+if (ret < 0) {
+ide_atapi_io_error(s, ret);
+return;
+}
+
+if (s->cd_sector_size == 2352) {
+cd_data_to_raw(s->io_buffer, s->lba);
  }
-return ret;
+
+s->lba++;
+s->io_buffer_index = 0;
+s->status &= ~BUSY_STAT;
+
+ide_atapi_cmd_reply_end(s);
+}
+
+static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
+{
+if (sector_size != 2048 && sector_size != 2352) {
+return -EINVAL;
+}
+
+s->iov.iov_base = buf;
+if (sector_size == 2352) {
+buf += 4;
+}
+
+s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+qemu_iovec_init_external(&s->qiov, &s->iov, 1);
+
+if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+  cd_read_sector_cb, s) == NULL) {
+return -EIO;
+}
+
+block_acct_start(blk_get_stats(s->blk), &s->acct,
+ 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
+s->status |= BUSY_STAT;
+return 0;
  }

We discussed this off-list a bit, but for upstream synchronization:

Unfortunately, I believe making cd_read_sector here non-blocking makes
ide_atapi_cmd_reply_end non-blocking, and as a result makes calls to
s->end_transfer_func() nonblocking, which functions like ide_data_readw
are not prepared to cope with.

My suggestion is to buffer an entire DRQ block of data at once
(byte_count_limit) to avoid the problem.


Hi John,

first of all thank you for the detailed analysis.

Is the following what you have in mind? For PIO reads of more than one sector
it is a great improvement for the NFS backend:

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index ab45495..ec2ba89 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -117,37 +117,40 @@ static void cd_read_sector_cb(void *opaque, int ret)
 }

 if (s->cd_sector_size == 2352) {
-cd_data_to_raw(s->io_buffer, s->lba);
+int nb_sectors = s->packet_transfer_size / 2352;
+while (nb_sectors--) {
+memmove(s->io_buffer + nb_sectors * 2352 + 4,
+s->io_buffer + nb_sectors * 2048, 2048);
+cd_data_to_raw(s->io_buffer + nb_sectors * 2352,
+   s->lba + nb_sectors);
+}
 }

-s->lba++;
+s->lba = -1;
 s->io_buffer_index = 0;
 s->status &= ~BUSY_STAT;

 ide_atapi_cmd_reply_end(s);
 }

-static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
+static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size, int nb_sectors)
 {
 if (sector_size != 2048 && sector_size != 2352) {
 return -EINVAL;
 }

 s->iov.iov_base = buf;
-if (sector_size == 2352) {
-buf += 4;
-}
-
-s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+s->iov.iov_len = nb_sectors * 2048;
 qemu_iovec_init_external(&s->qiov, &s->iov, 1);

-if (ide_readv_cancelable(s, (int64_t)lba << 2, &s->qiov, 4,
-  cd_read_sector_cb, s) == NULL) {
+if (ide_readv_cancelable(s, (int64_t)lba << 

Re: [Qemu-block] [Qemu-devel] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-10-06 Thread Peter Lieven

On 06.10.2015 at 10:57, Kevin Wolf wrote:

On 05.10.2015 at 23:15, John Snow wrote:


On 09/21/2015 08:25 AM, Peter Lieven wrote:

PIO read requests on the ATAPI interface used to be sync blk requests.
This has two significant drawbacks. First the main loop hangs until an
I/O request is completed and secondly if the I/O request does not
complete (e.g. due to an unresponsive storage) Qemu hangs completely.

Signed-off-by: Peter Lieven 
---
  hw/ide/atapi.c | 69 --
  1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 747f466..9257e1c 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -105,31 +105,51 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
  memset(buf, 0, 288);
  }
  
-static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)

+static void cd_read_sector_cb(void *opaque, int ret)
  {
-int ret;
+IDEState *s = opaque;
  
-switch(sector_size) {

-case 2048:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-break;
-case 2352:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-if (ret < 0)
-return ret;
-cd_data_to_raw(buf, lba);
-break;
-default:
-ret = -EIO;
-break;
+block_acct_done(blk_get_stats(s->blk), &s->acct);
+
+if (ret < 0) {
+ide_atapi_io_error(s, ret);
+return;
+}
+
+if (s->cd_sector_size == 2352) {
+cd_data_to_raw(s->io_buffer, s->lba);
  }
-return ret;
+
+s->lba++;
+s->io_buffer_index = 0;
+s->status &= ~BUSY_STAT;
+
+ide_atapi_cmd_reply_end(s);
+}
+
+static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
+{
+if (sector_size != 2048 && sector_size != 2352) {
+return -EINVAL;
+}
+
+s->iov.iov_base = buf;
+if (sector_size == 2352) {
+buf += 4;
+}

This doesn't look quite right, buf is never read after this.

Also, why +=4 when it was originally buf + 16?


You are right. I mixed that up.




+
+s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+qemu_iovec_init_external(&s->qiov, &s->iov, 1);
+
+if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+  cd_read_sector_cb, s) == NULL) {
+return -EIO;
+}
+
+block_acct_start(blk_get_stats(s->blk), &s->acct,
+ 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
+s->status |= BUSY_STAT;
+return 0;
  }
  

We discussed this off-list a bit, but for upstream synchronization:

Unfortunately, I believe making cd_read_sector here non-blocking makes
ide_atapi_cmd_reply_end non-blocking, and as a result makes calls to
s->end_transfer_func() nonblocking, which functions like ide_data_readw
are not prepared to cope with.

I don't think that's a problem as long as BSY is set while the
asynchronous command is running and DRQ is cleared. The latter will
protect ide_data_readw(). ide_sector_read() does essentially the same
thing.


I was thinking the same. Without the BSY it's not working at all.
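
As an illustration only (a made-up helper, not the actual ide_data_readw()
in QEMU), the guard discussed above boils down to something like this;
IDEState and DRQ_STAT are the definitions from hw/ide/internal.h:

/* Hypothetical sketch, not the real implementation: while an async
 * read is pending (BSY set, DRQ clear) a PIO data port read must not
 * touch io_buffer, so it just returns a dummy value. */
static uint32_t ide_data_readw_sketch(IDEState *s)
{
    if (!(s->status & DRQ_STAT)) {
        return 0xffff;    /* transfer not ready, nothing valid to return */
    }
    /* ...the normal 16-bit copy from s->io_buffer would follow here... */
    return 0;
}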



Or maybe I'm just missing what you're trying to say.


My suggestion is to buffer an entire DRQ block of data at once
(byte_count_limit) to avoid the problem.

No matter whether there is a problem or not, buffering more data at once
(and therefore doing less requests) is better for performance anyway.


It's possible to do only one read in the backend and read the whole
request into the IO buffer. I'll send a follow-up.

Do you maybe have a pointer to the test tool that John mentioned?

Peter




Re: [Qemu-block] [Qemu-devel] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-10-06 Thread Peter Lieven

On 05.10.2015 at 23:15, John Snow wrote:


On 09/21/2015 08:25 AM, Peter Lieven wrote:

PIO read requests on the ATAPI interface used to be sync blk requests.
This has two significant drawbacks. First the main loop hangs until an
I/O request is completed and secondly if the I/O request does not
complete (e.g. due to an unresponsive storage) Qemu hangs completely.

Signed-off-by: Peter Lieven 
---
  hw/ide/atapi.c | 69 --
  1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 747f466..9257e1c 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -105,31 +105,51 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
  memset(buf, 0, 288);
  }
  
-static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)

+static void cd_read_sector_cb(void *opaque, int ret)
  {
-int ret;
+IDEState *s = opaque;
  
-switch(sector_size) {

-case 2048:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-break;
-case 2352:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-if (ret < 0)
-return ret;
-cd_data_to_raw(buf, lba);
-break;
-default:
-ret = -EIO;
-break;
+block_acct_done(blk_get_stats(s->blk), &s->acct);
+
+if (ret < 0) {
+ide_atapi_io_error(s, ret);
+return;
+}
+
+if (s->cd_sector_size == 2352) {
+cd_data_to_raw(s->io_buffer, s->lba);
  }
-return ret;
+
+s->lba++;
+s->io_buffer_index = 0;
+s->status &= ~BUSY_STAT;
+
+ide_atapi_cmd_reply_end(s);
+}
+
+static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
+{
+if (sector_size != 2048 && sector_size != 2352) {
+return -EINVAL;
+}
+
+s->iov.iov_base = buf;
+if (sector_size == 2352) {
+buf += 4;
+}
+
+s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+qemu_iovec_init_external(&s->qiov, &s->iov, 1);
+
+if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+  cd_read_sector_cb, s) == NULL) {
+return -EIO;
+}
+
+block_acct_start(blk_get_stats(s->blk), &s->acct,
+ 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
+s->status |= BUSY_STAT;
+return 0;
  }
  

We discussed this off-list a bit, but for upstream synchronization:

Unfortunately, I believe making cd_read_sector here non-blocking makes
ide_atapi_cmd_reply_end non-blocking, and as a result makes calls to
s->end_transfer_func() nonblocking, which functions like ide_data_readw
are not prepared to cope with.

My suggestion is to buffer an entire DRQ block of data at once
(byte_count_limit) to avoid the problem.


Hi John,

first of all thank you for the detailed analysis.

Is the following what you have in mind? For PIO reads of more than one sector
it is a great improvement for the NFS backend:

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index ab45495..ec2ba89 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -117,37 +117,40 @@ static void cd_read_sector_cb(void *opaque, int ret)
 }

 if (s->cd_sector_size == 2352) {
-cd_data_to_raw(s->io_buffer, s->lba);
+int nb_sectors = s->packet_transfer_size / 2352;
+while (nb_sectors--) {
+memmove(s->io_buffer + nb_sectors * 2352 + 4,
+s->io_buffer + nb_sectors * 2048, 2048);
+cd_data_to_raw(s->io_buffer + nb_sectors * 2352,
+   s->lba + nb_sectors);
+}
 }

-s->lba++;
+s->lba = -1;
 s->io_buffer_index = 0;
 s->status &= ~BUSY_STAT;

 ide_atapi_cmd_reply_end(s);
 }

-static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
+static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size, int nb_sectors)
 {
 if (sector_size != 2048 && sector_size != 2352) {
 return -EINVAL;
 }

 s->iov.iov_base = buf;
-if (sector_size == 2352) {
-buf += 4;
-}
-
-s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+s->iov.iov_len = nb_sectors * 2048;
 qemu_iovec_init_external(&s->qiov, &s->iov, 1);

-if (ide_readv_cancelable(s, (int64_t)lba << 2, &s->qiov, 4,
-  cd_read_sector_cb, s) == NULL) {
+if (ide_readv_cancelable(s, (int64_t)lba << 2,
+ 

Re: [Qemu-block] [PATCH] block/nfs: add support for setting debug level

2015-09-21 Thread Peter Lieven

On 25.06.2015 at 15:18, Stefan Hajnoczi wrote:

On Tue, Jun 23, 2015 at 10:12:15AM +0200, Peter Lieven wrote:

upcoming libnfs versions will support logging debug messages. Add
support for it in qemu through an URL parameter.

Signed-off-by: Peter Lieven 
---
  block/nfs.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index ca9e24e..f7388a3 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -329,6 +329,10 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
  } else if (!strcmp(qp->p[i].name, "readahead")) {
  nfs_set_readahead(client->context, val);
  #endif
+#ifdef LIBNFS_FEATURE_DEBUG
+} else if (!strcmp(qp->p[i].name, "debug")) {
+nfs_set_debug(client->context, val);
+#endif
  } else {
  error_setg(errp, "Unknown NFS parameter name: %s",
 qp->p[i].name);

Untrusted users may be able to set these options since they are encoded
in the URI.  I'm imagining a hosting or cloud scenario like OpenStack.

A verbose debug level spams stderr and could consume a lot of disk
space.

(The uid and gid options are probably okay since the NFS server cannot
trust the uid/gid coming from QEMU anyway.)

I think we can merge this patch for QEMU 2.4 but I'd like to have a
discussion about the security risk of encoding libnfs options in the
URI.

CCed Eric Blake in case libvirt is affected.

Has anyone thought about this and what are the rules?


As I haven't had time to work further on the best way to add options for NFS
(and other protocols), would it be feasible to allow passing debug as a URL
parameter, but cap the maximum debug level to limit a possible security
impact (flooding the logs)?

If a higher debug level is needed it can be set via device-specific options
as soon as there is a common scheme for them.
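
To make that concrete, a rough sketch of what such a cap could look like
(the helper name and the maximum level are made-up assumptions, not an
existing option or patch):

/* Hypothetical helper: clamp a debug level coming from an untrusted
 * URL parameter so a verbose level cannot flood stderr/logs. */
#define NFS_MAX_URL_DEBUG_LEVEL 2   /* assumed cap */

static int nfs_clamp_url_debug_level(long long val)
{
    if (val < 0) {
        return 0;
    }
    if (val > NFS_MAX_URL_DEBUG_LEVEL) {
        return NFS_MAX_URL_DEBUG_LEVEL;
    }
    return (int)val;
}

The call site in nfs_client_open() would then pass
nfs_clamp_url_debug_level(val) to nfs_set_debug() instead of the raw value.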

Peter




Re: [Qemu-block] [Qemu-devel] [PATCH 0/5] ide: avoid main-loop hang on CDROM/NFS failure

2015-09-21 Thread Peter Lieven


> Am 21.09.2015 um 22:58 schrieb John Snow :
> 
> 
> 
>> On 09/21/2015 08:25 AM, Peter Lieven wrote:
>> This series aims at avoiding a hanging main-loop if a vserver has a
>> CDROM image mounted from a NFS share and that NFS share goes down.
>> Typical situation is that users mount an CDROM ISO to install something
>> and then forget to eject that CDROM afterwards.
>> As a consequence this mounted CD is able to bring down the
>> whole vserver if the backend NFS share is unreachable. This is bad
>> especially if the CDROM itself is not needed anymore at this point.
>> 
>> This series aims at fixing 3 blocking I/O operations that would
>> hang if the NFS server is unavailable:
>> - ATAPI PIO read requests used sync calls to blk_read, convert
>>   them to an async variant.
>> - If a busmaster DMA request is cancelled all requests are drained.
>>   Convert the drain to an async request canceling.
>> - query-block in the HMP or QMP hangs because it indirectly calls
>>   bdrv_get_allocated_file_size.
>> 
>> Note that Patch 5 is only included for completeness.
>> 
>> Peter
>> 
>> Peter Lieven (5):
>>  ide/atapi: make PIO read requests async
>>  ide/atapi: blk_aio_readv may return NULL
>>  ide: add support for cancelable read requests
>>  ide/atapi: enable cancelable requests
>>  block/nfs: cache allocated filesize for read-only files
>> 
>> block/nfs.c   | 36 ++
>> hw/ide/atapi.c| 75 
>> +++
>> hw/ide/core.c | 55 
>> hw/ide/internal.h | 16 
>> hw/ide/pci.c  | 42 ---
>> 5 files changed, 183 insertions(+), 41 deletions(-)
> 
> I assume this supersedes both:
> 
> [Qemu-devel] [PATCH 0/2] ide/atapi: partially avoid deadlock if the
> storage backend is dead
> 
> and
> 
> [Qemu-devel] [PATCH] ide/atapi: make PIO read requests async
> 
> right?

Yes, the first patch was wrong, as Stefan pointed out, and the second is the
same version as previously on the list.

Peter 

> 
> --js



[Qemu-block] [PATCH 5/5] block/nfs: cache allocated filesize for read-only files

2015-09-21 Thread Peter Lieven
If the file is read-only it is not expected to grow, so
we save the blocking call to nfs_fstat_async and use
the value saved at connection time. Also important:
the monitor (and thus the main loop) will not hang
if block device info is queried and the NFS share
is unresponsive.

Signed-off-by: Peter Lieven 
Reviewed-by: Max Reitz 
Reviewed-by: Jeff Cody 
---
 block/nfs.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index c026ff6..5ffd19f 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -43,6 +43,7 @@ typedef struct NFSClient {
 int events;
 bool has_zero_init;
 AioContext *aio_context;
+blkcnt_t st_blocks;
 } NFSClient;
 
 typedef struct NFSRPC {
@@ -374,6 +375,7 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
 }
 
 ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
+client->st_blocks = st.st_blocks;
 client->has_zero_init = S_ISREG(st.st_mode);
 goto out;
 fail:
@@ -464,6 +466,11 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
 NFSRPC task = {0};
 struct stat st;
 
+if (bdrv_is_read_only(bs) &&
+!(bs->open_flags & BDRV_O_NOCACHE)) {
+return client->st_blocks * 512;
+}
+
 task.st = &st;
 if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
 &task) != 0) {
@@ -484,6 +491,34 @@ static int nfs_file_truncate(BlockDriverState *bs, int64_t offset)
 return nfs_ftruncate(client->context, client->fh, offset);
 }
 
+/* Note that this will not re-establish a connection with the NFS server
+ * - it is effectively a NOP.  */
+static int nfs_reopen_prepare(BDRVReopenState *state,
+  BlockReopenQueue *queue, Error **errp)
+{
+NFSClient *client = state->bs->opaque;
+struct stat st;
+int ret = 0;
+
+if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) {
+error_setg(errp, "Cannot open a read-only mount as read-write");
+return -EACCES;
+}
+
+/* Update cache for read-only reopens */
+if (!(state->flags & BDRV_O_RDWR)) {
+ret = nfs_fstat(client->context, client->fh, &st);
+if (ret < 0) {
+error_setg(errp, "Failed to fstat file: %s",
+   nfs_get_error(client->context));
+return ret;
+}
+client->st_blocks = st.st_blocks;
+}
+
+return 0;
+}
+
 static BlockDriver bdrv_nfs = {
 .format_name= "nfs",
 .protocol_name  = "nfs",
@@ -499,6 +534,7 @@ static BlockDriver bdrv_nfs = {
 .bdrv_file_open = nfs_file_open,
 .bdrv_close = nfs_file_close,
 .bdrv_create= nfs_file_create,
+.bdrv_reopen_prepare= nfs_reopen_prepare,
 
 .bdrv_co_readv  = nfs_co_readv,
 .bdrv_co_writev = nfs_co_writev,
-- 
1.9.1




[Qemu-block] [PATCH 3/5] ide: add support for cancelable read requests

2015-09-21 Thread Peter Lieven
This patch adds a new aio readv compatible function which copies
all data through a bounce buffer. The benefit is that these requests
can be flagged as canceled to avoid guest memory corruption when
a canceled request is completed by the backend at a later stage.

If an IDE protocol wants to use this function it has to pipe
all read requests through ide_readv_cancelable and it may then
enable requests_cancelable in the IDEState.

If this flag is enabled we can avoid the blocking blk_drain_all
in case of a BMDMA reset.

Currently only read operations are cancelable, thus we can only
use this logic for read-only devices.

Signed-off-by: Peter Lieven 
---
 hw/ide/core.c | 54 ++
 hw/ide/internal.h | 16 
 hw/ide/pci.c  | 42 --
 3 files changed, 98 insertions(+), 14 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 317406d..24547ce 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -561,6 +561,59 @@ static bool ide_sect_range_ok(IDEState *s,
 return true;
 }
 
+static void ide_readv_cancelable_cb(void *opaque, int ret)
+{
+IDECancelableRequest *req = opaque;
+if (!req->canceled) {
+if (!ret) {
+qemu_iovec_from_buf(req->org_qiov, 0, req->buf, req->org_qiov->size);
+}
+req->org_cb(req->org_opaque, ret);
+}
+QLIST_REMOVE(req, list);
+qemu_vfree(req->buf);
+qemu_iovec_destroy(&req->qiov);
+g_free(req);
+}
+
+#define MAX_CANCELABLE_REQS 16
+
+BlockAIOCB *ide_readv_cancelable(IDEState *s, int64_t sector_num,
+ QEMUIOVector *iov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+BlockAIOCB *aioreq;
+IDECancelableRequest *req;
+int c = 0;
+
+QLIST_FOREACH(req, &s->cancelable_requests, list) {
+c++;
+}
+if (c > MAX_CANCELABLE_REQS) {
+return NULL;
+}
+
+req = g_new0(IDECancelableRequest, 1);
+qemu_iovec_init(&req->qiov, 1);
+req->buf = qemu_blockalign(blk_bs(s->blk), iov->size);
+qemu_iovec_add(&req->qiov, req->buf, iov->size);
+req->org_qiov = iov;
+req->org_cb = cb;
+req->org_opaque = opaque;
+
+aioreq = blk_aio_readv(s->blk, sector_num, &req->qiov, nb_sectors,
+   ide_readv_cancelable_cb, req);
+if (aioreq == NULL) {
+qemu_vfree(req->buf);
+qemu_iovec_destroy(&req->qiov);
+g_free(req);
+} else {
+QLIST_INSERT_HEAD(&s->cancelable_requests, req, list);
+}
+
+return aioreq;
+}
+
 static void ide_sector_read(IDEState *s);
 
 static void ide_sector_read_cb(void *opaque, int ret)
@@ -805,6 +858,7 @@ void ide_start_dma(IDEState *s, BlockCompletionFunc *cb)
 s->bus->retry_unit = s->unit;
 s->bus->retry_sector_num = ide_get_sector(s);
 s->bus->retry_nsector = s->nsector;
+s->bus->s = s;
 if (s->bus->dma->ops->start_dma) {
 s->bus->dma->ops->start_dma(s->bus->dma, s, cb);
 }
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index 05e93ff..ad188c2 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -343,6 +343,16 @@ enum ide_dma_cmd {
 #define ide_cmd_is_read(s) \
((s)->dma_cmd == IDE_DMA_READ)
 
+typedef struct IDECancelableRequest {
+QLIST_ENTRY(IDECancelableRequest) list;
+QEMUIOVector qiov;
+uint8_t *buf;
+QEMUIOVector *org_qiov;
+BlockCompletionFunc *org_cb;
+void *org_opaque;
+bool canceled;
+} IDECancelableRequest;
+
 /* NOTE: IDEState represents in fact one drive */
 struct IDEState {
 IDEBus *bus;
@@ -396,6 +406,8 @@ struct IDEState {
 BlockAIOCB *pio_aiocb;
 struct iovec iov;
 QEMUIOVector qiov;
+QLIST_HEAD(, IDECancelableRequest) cancelable_requests;
+bool requests_cancelable;
 /* ATA DMA state */
 int32_t io_buffer_offset;
 int32_t io_buffer_size;
@@ -468,6 +480,7 @@ struct IDEBus {
 uint8_t retry_unit;
 int64_t retry_sector_num;
 uint32_t retry_nsector;
+IDEState *s;
 };
 
 #define TYPE_IDE_DEVICE "ide-device"
@@ -572,6 +585,9 @@ void ide_set_inactive(IDEState *s, bool more);
 BlockAIOCB *ide_issue_trim(BlockBackend *blk,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *ide_readv_cancelable(IDEState *s, int64_t sector_num,
+ QEMUIOVector *iov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
 
 /* hw/ide/atapi.c */
 void ide_atapi_cmd(IDEState *s);
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index d31ff88..5587183 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -240,21 +240,35 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
 /* Ignore writes to S

[Qemu-block] [PATCH 4/5] ide/atapi: enable cancelable requests

2015-09-21 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 hw/ide/atapi.c | 4 ++--
 hw/ide/core.c  | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index b209ed9..ab45495 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -141,7 +141,7 @@ static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
 s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
 qemu_iovec_init_external(&s->qiov, &s->iov, 1);
 
-if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+if (ide_readv_cancelable(s, (int64_t)lba << 2, &s->qiov, 4,
   cd_read_sector_cb, s) == NULL) {
 return -EIO;
 }
@@ -368,7 +368,7 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
 s->bus->dma->iov.iov_len = n * 4 * 512;
 qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
 
-s->bus->dma->aiocb = blk_aio_readv(s->blk, (int64_t)s->lba << 2,
+s->bus->dma->aiocb = ide_readv_cancelable(s, (int64_t)s->lba << 2,
&s->bus->dma->qiov, n * 4,
ide_atapi_cmd_read_dma_cb, s);
 if (s->bus->dma->aiocb == NULL) {
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 24547ce..5c7a346 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2330,6 +2330,7 @@ int ide_init_drive(IDEState *s, BlockBackend *blk, IDEDriveKind kind,
 if (kind == IDE_CD) {
 blk_set_dev_ops(blk, &ide_cd_block_ops, s);
 blk_set_guest_block_size(blk, 2048);
+s->requests_cancelable = true;
 } else {
 if (!blk_is_inserted(s->blk)) {
 error_report("Device needs media, but drive is empty");
-- 
1.9.1




[Qemu-block] [PATCH 2/5] ide/atapi: blk_aio_readv may return NULL

2015-09-21 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 hw/ide/atapi.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 9257e1c..b209ed9 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -371,6 +371,10 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
 s->bus->dma->aiocb = blk_aio_readv(s->blk, (int64_t)s->lba << 2,
&s->bus->dma->qiov, n * 4,
ide_atapi_cmd_read_dma_cb, s);
+if (s->bus->dma->aiocb == NULL) {
+ide_atapi_io_error(s, -EIO);
+goto eot;
+}
 return;
 
 eot:
-- 
1.9.1




[Qemu-block] [PATCH 1/5] ide/atapi: make PIO read requests async

2015-09-21 Thread Peter Lieven
PIO read requests on the ATAPI interface used to be sync blk requests.
This has two significant drawbacks. First, the main loop hangs until an
I/O request is completed, and secondly, if the I/O request does not
complete (e.g. due to unresponsive storage) Qemu hangs completely.

Signed-off-by: Peter Lieven 
---
 hw/ide/atapi.c | 69 --
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 747f466..9257e1c 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -105,31 +105,51 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
 memset(buf, 0, 288);
 }
 
-static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
+static void cd_read_sector_cb(void *opaque, int ret)
 {
-int ret;
+IDEState *s = opaque;
 
-switch(sector_size) {
-case 2048:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-break;
-case 2352:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-if (ret < 0)
-return ret;
-cd_data_to_raw(buf, lba);
-break;
-default:
-ret = -EIO;
-break;
+block_acct_done(blk_get_stats(s->blk), &s->acct);
+
+if (ret < 0) {
+ide_atapi_io_error(s, ret);
+return;
+}
+
+if (s->cd_sector_size == 2352) {
+cd_data_to_raw(s->io_buffer, s->lba);
 }
-return ret;
+
+s->lba++;
+s->io_buffer_index = 0;
+s->status &= ~BUSY_STAT;
+
+ide_atapi_cmd_reply_end(s);
+}
+
+static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
+{
+if (sector_size != 2048 && sector_size != 2352) {
+return -EINVAL;
+}
+
+s->iov.iov_base = buf;
+if (sector_size == 2352) {
+buf += 4;
+}
+
+s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+qemu_iovec_init_external(&s->qiov, &s->iov, 1);
+
+if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+  cd_read_sector_cb, s) == NULL) {
+return -EIO;
+}
+
+block_acct_start(blk_get_stats(s->blk), &s->acct,
+ 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
+s->status |= BUSY_STAT;
+return 0;
 }
 
 void ide_atapi_cmd_ok(IDEState *s)
@@ -190,10 +210,8 @@ void ide_atapi_cmd_reply_end(IDEState *s)
 ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size);
 if (ret < 0) {
 ide_atapi_io_error(s, ret);
-return;
 }
-s->lba++;
-s->io_buffer_index = 0;
+return;
 }
 if (s->elementary_transfer_size > 0) {
 /* there are some data left to transmit in this elementary
@@ -275,7 +293,6 @@ static void ide_atapi_cmd_read_pio(IDEState *s, int lba, int nb_sectors,
 s->io_buffer_index = sector_size;
 s->cd_sector_size = sector_size;
 
-s->status = READY_STAT | SEEK_STAT;
 ide_atapi_cmd_reply_end(s);
 }
 
-- 
1.9.1




[Qemu-block] [PATCH 0/5] ide: avoid main-loop hang on CDROM/NFS failure

2015-09-21 Thread Peter Lieven
This series aims at avoiding a hanging main loop if a vserver has a
CDROM image mounted from an NFS share and that NFS share goes down.
The typical situation is that users mount a CDROM ISO to install something
and then forget to eject that CDROM afterwards.
As a consequence this mounted CD can bring down the
whole vserver if the backend NFS share is unreachable. This is bad
especially if the CDROM itself is not needed anymore at this point.

This series aims at fixing 3 blocking I/O operations that would
hang if the NFS server is unavailable:
 - ATAPI PIO read requests used sync calls to blk_read, convert
   them to an async variant.
 - If a busmaster DMA request is cancelled all requests are drained.
   Convert the drain to an async request canceling.
 - query-block in the HMP or QMP hangs because it indirectly calls
   bdrv_get_allocated_file_size.

Note that Patch 5 is only included for completeness.

Peter

Peter Lieven (5):
  ide/atapi: make PIO read requests async
  ide/atapi: blk_aio_readv may return NULL
  ide: add support for cancelable read requests
  ide/atapi: enable cancelable requests
  block/nfs: cache allocated filesize for read-only files

 block/nfs.c   | 36 ++
 hw/ide/atapi.c| 75 +++
 hw/ide/core.c | 55 
 hw/ide/internal.h | 16 
 hw/ide/pci.c  | 42 ---
 5 files changed, 183 insertions(+), 41 deletions(-)

-- 
1.9.1




Re: [Qemu-block] [Qemu-devel] [PATCH] iscsi: Add chap and "initiator-name" etc as per drive options

2015-09-13 Thread Peter Lieven


> Am 14.09.2015 um 08:38 schrieb Fam Zheng :
> 
>> On Fri, 09/11 08:27, ronnie sahlberg wrote:
>>> On Fri, Sep 11, 2015 at 8:20 AM, Eric Blake  wrote:
 On 09/11/2015 12:00 AM, Fam Zheng wrote:
 Previously we use "-iscsi id=target-iqn,user=foo,password=bar,..." to
 specify iscsi connection parameters, unfortunately it doesn't work with
 qemu-img.
 
 This patch adds per drive options to iscsi driver so that at least
 qemu-img can use the "json:{...}" filename magic.
 
 Signed-off-by: Fam Zheng 
 ---
 block/iscsi.c | 83 
 +--
 1 file changed, 64 insertions(+), 19 deletions(-)
>>> 
>>> It would be nice to also add a matching BlockdevOptionsIscsi to
>>> qapi/block-core.json, to allow setting these structured options from
>>> QMP.  Separate patch is fine, but we need to do the work for ALL of the
>>> remaining block devices eventually, and now that you are structuring the
>>> command line is a good time to think about it.
>>> 
>>> 
 static void iscsi_nop_timed_event(void *opaque)
 @@ -1229,6 +1253,27 @@ static QemuOptsList runtime_opts = {
 .name = "filename",
 .type = QEMU_OPT_STRING,
 .help = "URL to the iscsi image",
 +},{
 +.name = "user",
 +.type = QEMU_OPT_STRING,
 +.help = "username for CHAP authentication to target",
 +},{
 +.name = "password",
 +.type = QEMU_OPT_STRING,
 +.help = "password for CHAP authentication to target",
 +},{
>>> 
>>> Also, this requires passing the password in the command line. We
>>> _really_ need to solve the problem of allowing the password to be passed
>>> via a fd or other QMP command, rather than on the command line.
>> 
>> 
>> Passing via command line is evil. It should still be possible to pass
>> all this via a config file to qemu :
>> 
>> """
>> ...
>> Howto use a configuration file to set iSCSI configuration options:
>> @example
>> cat >iscsi.conf <<EOF
>> [iscsi "iqn.target.name"]
>>  user = "me"
>>  password = "my password"
>>  initiator-name = "iqn.qemu.test:my-initiator"
>>  header-digest = "CRC32C"
>> EOF
>> 
>> qemu-system-i386 -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \
>>-readconfig iscsi.conf
>> @end example
>> ...
>> """
> 
> I agree passing password with clear text command line is bad, but -readconfig
> doesn't work for qemu-img and qemu-io.  Any idea how to make that work?

You can pass the secrets via environment variables (see the libiscsi README).

Peter



Re: [Qemu-block] [PATCH 2/2] ide/atapi: partially avoid deadlock if the storage backend is dead

2015-09-07 Thread Peter Lieven
Am 07.09.2015 um 15:43 schrieb Stefan Hajnoczi:
> On Sun, Sep 06, 2015 at 11:24:10AM +0200, Peter Lieven wrote:
>>> Taking a step back, what are the semantics of writing !(val &
>>> BM_CMD_START)?  Is the device guaranteed to cancel/complete requests
>>> during the register write?
>> I have to check that. John, do you have an idea?
>>
>> Stefan, or do you have a better approach for getting rid of the 
>> bdrv_drain_all
>> in this piece of code?
> What's needed is an asynchronous approach that honors the semantics of
> the Start/Stop bit.
>
> From "Programming Interface for Bus Master IDE Controller", Revision 1.0:
>
> "Start/Stop Bus Master: Writing a '1' to this bit enables bus master
> operation of the controller.  Bus master operation begins when this bit
> is detected changing from a zero to a one. The controller will transfer
> data between the IDE device and memory only when this bit is set.
> Master operation can be halted by writing a '0' to this bit. All state
> information is lost when a '0' is written; Master mode operation cannot
> be stopped and then resumed. If this bit is reset while bus master
> operation is still active (i.e., the Bus Master IDE Active bit of the
> Bus Master IDE Status register for that IDE channel is set) and the
> drive has not yet finished its data transfer (The Interupt bit in the
> Bus Master IDE Status register for that IDE channel is not set), the bus
> master command is said to be aborted and data transfered from the drive
> may be discarded before being written to system memory. This bit is
> intended to be reset after the data transfer is completed, as indicated
> by either the Bus Master IDE Active bit or the Interrupt bit of the Bus
> Master IDE Status register for that IDE channel being set, or both."
>
> bdrv_aio_cancel() can be used to nudge the request to cancel.  The
> completion function is still called at some later point - and it could
> take an arbitrary amount of time before the request finishes.
>
> From the IDE emulation perspective, there are two requirements:
>
> 1. After the Start bit has been cleared, no IDE state changes should
>occur due to the pending request.  No interrupts should be raised and
>no registers should change when the request completes
>(success/error/cancelled).  Also, the guest should be able to set the
>bit again and submit a new request.

I think my approach should fulfil this requirement already. The callback
into the IDE layer is set to NULL.

>
> 2. No guest memory should be touched by a running request after the bit
>is cleared.  QEMU needs to go through bounce buffers in all cases
>instead of doing zero copy (mapping the PRDT).
>
>Using bounce buffers should be doable but you need to check the IDE
>code paths (PIO, DMA, and ATAPI) reachable from hw/ide/pci.c.
>There's also a denial-of-service scenario where the guest repeatedly
>submits an I/O requests and then clears the Start/Stop bit.  That
>would cause QEMU to build up many pending I/O requests and bounce
>buffers.  That needs to be handled, for example by punishing the
>guest by failing news I/O if there are more than 16 cancelled
>requests pending.

This sounds doable. I will look into this.

Peter
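
As a rough standalone sketch of the bookkeeping suggested above (not the
actual QEMU code; the threshold of 16 comes from the suggestion, everything
else here is an assumption for illustration), failing new I/O while too many
cancelled requests are still pending could look like this:

#include <errno.h>
#include <stdio.h>

#define MAX_PENDING_CANCELLED 16

typedef struct {
    /* cancelled requests whose completion callback has not fired yet */
    int pending_cancelled;
} DMAState;

/* guest cleared BM_CMD_START while a request was in flight */
static void cancel_request(DMAState *s)
{
    s->pending_cancelled++;
}

/* completion path of a previously cancelled request */
static void cancelled_request_completed(DMAState *s)
{
    s->pending_cancelled--;
}

/* called before submitting new I/O on behalf of the guest */
static int may_submit_io(DMAState *s)
{
    if (s->pending_cancelled > MAX_PENDING_CANCELLED) {
        return -EBUSY; /* fail new I/O instead of piling up bounce buffers */
    }
    return 0;
}

int main(void)
{
    DMAState s = { 0 };
    int i;

    for (i = 0; i < 20; i++) {
        cancel_request(&s);
    }
    printf("new I/O allowed: %s\n", may_submit_io(&s) == 0 ? "yes" : "no");
    cancelled_request_completed(&s);
    return 0;
}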




Re: [Qemu-block] [PATCH 2/2] ide/atapi: partially avoid deadlock if the storage backend is dead

2015-09-06 Thread Peter Lieven
Am 03.09.2015 um 18:59 schrieb Stefan Hajnoczi:
> On Thu, Aug 20, 2015 at 10:14:08AM +0200, Peter Lieven wrote:
>> the blk_drain_all() that is executed if the guest issues a DMA cancel
>> leads to a stuck main loop if the storage backend (e.g. a NFS share)
>> is unresponsive.
>>
>> This scenario is a common case for CDROM images mounted from an
>> NFS share. In this case a broken NFS server can take down the
>> whole VM even if the mounted CDROM is not used and was just not
>> unmounted after usage.
>>
>> This approach avoids the blk_drain_all for read-only media and
>> cancelles the AIO locally and makes the callback a NOP if the
>> original request is completed after the NFS share is responsive
>> again.
>>
>> Signed-off-by: Peter Lieven 
>> ---
>>  hw/ide/pci.c | 32 ++--
>>  1 file changed, 18 insertions(+), 14 deletions(-)
>>
>> diff --git a/hw/ide/pci.c b/hw/ide/pci.c
>> index d31ff88..a8b4175 100644
>> --- a/hw/ide/pci.c
>> +++ b/hw/ide/pci.c
>> @@ -240,21 +240,25 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
>>  /* Ignore writes to SSBM if it keeps the old value */
>>  if ((val & BM_CMD_START) != (bm->cmd & BM_CMD_START)) {
>>  if (!(val & BM_CMD_START)) {
>> -/*
>> - * We can't cancel Scatter Gather DMA in the middle of the
>> - * operation or a partial (not full) DMA transfer would reach
>> - * the storage so we wait for completion instead (we beahve
>> - * like if the DMA was completed by the time the guest trying
>> - * to cancel dma with bmdma_cmd_writeb with BM_CMD_START not
>> - * set).
>> - *
>> - * In the future we'll be able to safely cancel the I/O if the
>> - * whole DMA operation will be submitted to disk with a single
>> - * aio operation with preadv/pwritev.
>> - */
>>  if (bm->bus->dma->aiocb) {
>> -blk_drain_all();
>> -assert(bm->bus->dma->aiocb == NULL);
>> +if (!bdrv_is_read_only(bm->bus->dma->aiocb->bs)) {
>> +/* We can't cancel Scatter Gather DMA in the middle of 
>> the
>> + * operation or a partial (not full) DMA transfer would
>> + * reach the storage so we wait for completion instead
>> + * (we beahve like if the DMA was completed by the time 
>> the
>> + * guest trying to cancel dma with bmdma_cmd_writeb with
>> + * BM_CMD_START not set). */
>> +blk_drain_all();
>> +assert(bm->bus->dma->aiocb == NULL);
>> +} else {
>> +/* On a read-only device (e.g. CDROM) we can't cause 
>> incon-
>> + * sistencies and thus cancel the AIOCB locally and 
>> avoid
>> + * to be called back later if the original request is
>> + * completed. */
>> +BlockAIOCB *aiocb = bm->bus->dma->aiocb;
>> +aiocb->cb(aiocb->opaque, -ECANCELED);
>> +aiocb->cb = NULL;
> I'm concerned that this isn't safe.
>
> What happens if the request does complete (e.g. will guest RAM be
> modified by the read operation)?

I am afraid you are right. The callback of the storage driver will
likely overwrite the memory. This could be solved for storage drivers
that don't do zero copy if it were possible to notify them about
the cancellation.

>
> What happens if a new request is started and then old NOPed request
> completes?

That should work because the callback of the NOPed request is never
fired.

>
> Taking a step back, what are the semantics of writing !(val &
> BM_CMD_START)?  Is the device guaranteed to cancel/complete requests
> during the register write?

I have to check that. John, do you have an idea?

Stefan, or do you have a better approach for getting rid of the bdrv_drain_all
in this piece of code?

Peter
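
A standalone sketch (illustrative only, not one of the posted patches) of the
bounce-buffer idea behind the overwrite concern above: the backend writes into
a private buffer, and the data only reaches its final destination if the
request was not cancelled in the meantime, so a late completion cannot touch
guest memory.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    bool cancelled;
    unsigned char *bounce;  /* private buffer the backend writes into */
    size_t len;
    unsigned char *guest;   /* final destination ("guest memory") */
} Request;

/* completion callback: only touch guest memory if the request is still wanted */
static void request_complete(Request *req)
{
    if (!req->cancelled) {
        memcpy(req->guest, req->bounce, req->len);
    }
    free(req->bounce);
}

int main(void)
{
    unsigned char guest[8] = { 0 };
    Request req = {
        .cancelled = false,
        .bounce = malloc(sizeof(guest)),
        .len = sizeof(guest),
        .guest = guest,
    };

    if (!req.bounce) {
        return 1;
    }
    memset(req.bounce, 0xab, req.len);  /* pretend the backend filled it */
    req.cancelled = true;               /* guest aborted the DMA meanwhile */
    request_complete(&req);

    printf("guest[0] = 0x%02x (untouched because the request was cancelled)\n",
           guest[0]);
    return 0;
}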




[Qemu-block] [PATCH] ide/atapi: make PIO read requests async

2015-09-01 Thread Peter Lieven
PIO read requests on the ATAPI interface used to be sync blk requests.
This has two significant drawbacks. First, the main loop hangs until an
I/O request is completed and, secondly, if the I/O request does not
complete (e.g. due to unresponsive storage) Qemu hangs completely.

Signed-off-by: Peter Lieven 
---
 hw/ide/atapi.c | 69 --
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 79dd167..95a5697 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -105,31 +105,51 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
 memset(buf, 0, 288);
 }
 
-static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
+static void cd_read_sector_cb(void *opaque, int ret)
 {
-int ret;
+IDEState *s = opaque;
 
-switch(sector_size) {
-case 2048:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-break;
-case 2352:
-block_acct_start(blk_get_stats(s->blk), &s->acct,
- 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
-block_acct_done(blk_get_stats(s->blk), &s->acct);
-if (ret < 0)
-return ret;
-cd_data_to_raw(buf, lba);
-break;
-default:
-ret = -EIO;
-break;
+block_acct_done(blk_get_stats(s->blk), &s->acct);
+
+if (ret < 0) {
+ide_atapi_io_error(s, ret);
+return;
+}
+
+if (s->cd_sector_size == 2352) {
+cd_data_to_raw(s->io_buffer, s->lba);
 }
-return ret;
+
+s->lba++;
+s->io_buffer_index = 0;
+s->status &= ~BUSY_STAT;
+
+ide_atapi_cmd_reply_end(s);
+}
+
+static int cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
+{
+if (sector_size != 2048 && sector_size != 2352) {
+return -EINVAL;
+}
+
+s->iov.iov_base = buf;
+if (sector_size == 2352) {
+buf += 4;
+}
+
+s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+qemu_iovec_init_external(&s->qiov, &s->iov, 1);
+
+if (blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+  cd_read_sector_cb, s) == NULL) {
+return -EIO;
+}
+
+block_acct_start(blk_get_stats(s->blk), &s->acct,
+ 4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
+s->status |= BUSY_STAT;
+return 0;
 }
 
 void ide_atapi_cmd_ok(IDEState *s)
@@ -190,10 +210,8 @@ void ide_atapi_cmd_reply_end(IDEState *s)
 ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size);
 if (ret < 0) {
 ide_atapi_io_error(s, ret);
-return;
 }
-s->lba++;
-s->io_buffer_index = 0;
+return;
 }
 if (s->elementary_transfer_size > 0) {
 /* there are some data left to transmit in this elementary
@@ -275,7 +293,6 @@ static void ide_atapi_cmd_read_pio(IDEState *s, int lba, 
int nb_sectors,
 s->io_buffer_index = sector_size;
 s->cd_sector_size = sector_size;
 
-s->status = READY_STAT | SEEK_STAT;
 ide_atapi_cmd_reply_end(s);
 }
 
-- 
1.9.1




Re: [Qemu-block] [PATCH] block/iscsi: validate block size returned from target

2015-08-31 Thread Peter Lieven

Am 14.08.2015 um 13:33 schrieb Peter Lieven:

It has been reported that at least tgtd returns a block size of 0
for LUN 0. To avoid running into divide by zero later on and protect
against other problematic block sizes validate the block size right
at connection time.

Cc: qemu-sta...@nongnu.org
Reported-by: Andrey Korolyov 
Signed-off-by: Peter Lieven 
---
  block/iscsi.c | 4 
  dtc   | 2 +-
  2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 5002916..fac3a7a 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1214,6 +1214,10 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, 
Error **errp)
  
  if (task == NULL || task->status != SCSI_STATUS_GOOD) {

  error_setg(errp, "iSCSI: failed to send readcapacity10 command.");
+} else if (!iscsilun->block_size ||
+   iscsilun->block_size % BDRV_SECTOR_SIZE) {
+error_setg(errp, "iSCSI: the target returned an invalid "
+   "block size of %d.", iscsilun->block_size);
  }
  if (task) {
  scsi_free_scsi_task(task);


Ping



Re: [Qemu-block] [Qemu-devel] [PATCH 1/2] block/io: allow AIOCB without callback

2015-08-31 Thread Peter Lieven

Am 21.08.2015 um 08:12 schrieb Eric Blake:

On 08/20/2015 01:14 AM, Peter Lieven wrote:

If the backend storage is unresponsive and we cancel a request due to
a timeout we cannot immediately destroy the AIOCB because the storage
might complete the original request laster if it is responsive again.

s/laster/later/


For this purpose allow to set the callback to NULL and ignore it in
this case.

Signed-off-by: Peter Lieven 
---

I'll leave the technical review to others, I'm just pointing out grammar.




I have been using this one for quite some time now. It seems a good step toward
solving the deadlock problem. The issue is that we still need to make the ATAPI
calls async. The OS only spends 2-3 minutes on DMA cancelling and then issues
reads again, so we still deadlock in the end.

Peter




Re: [Qemu-block] coroutine pool memory usage

2015-08-27 Thread Peter Lieven

Am 27.08.2015 um 17:23 schrieb Paolo Bonzini:

i was debugging increased memory footprint of qemu over the past time and
found that the coroutine pool heap usage can grow up to 70MB by just booting
an Ubuntu Live CD. And those 70MB are never freed.

Is this expected? Wouldn't it make sense to asynchronically throw some
coroutines (or at least their stack) away if there is no I/O?

Yes, perhaps that can be added.  But is it RSS that increases, or is it just
wasted address space?


Sorry, false alarm. I had a debugger running and in this case all memory
seems to be allocated. RSS is fine. I tracked the actual stack space that
is used to around 1400 bytes per coroutine. So no problem.

Peter




[Qemu-block] [PATCH V3] block/nfs: cache allocated filesize for read-only files

2015-08-27 Thread Peter Lieven
If the file is read-only it is not expected to grow, so we can
save the blocking call to nfs_fstat_async and use
the value saved at connection time. Also important: the
monitor (and thus the main loop) will not hang
if block device info is queried and the NFS share
is unresponsive.

Signed-off-by: Peter Lieven 
---
v1->v2: update cache on reopen_prepare [Max]
v2->v3: use cache only if cache.direct=off [Max]

 block/nfs.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index 02eb4e4..887a98e 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -43,6 +43,7 @@ typedef struct NFSClient {
 int events;
 bool has_zero_init;
 AioContext *aio_context;
+blkcnt_t st_blocks;
 } NFSClient;
 
 typedef struct NFSRPC {
@@ -374,6 +375,7 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
 }
 
 ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
+client->st_blocks = st.st_blocks;
 client->has_zero_init = S_ISREG(st.st_mode);
 goto out;
 fail:
@@ -464,6 +466,11 @@ static int64_t 
nfs_get_allocated_file_size(BlockDriverState *bs)
 NFSRPC task = {0};
 struct stat st;
 
+if (bdrv_is_read_only(bs) &&
+!(bs->open_flags & BDRV_O_NOCACHE)) {
+return client->st_blocks * 512;
+}
+
 task.st = &st;
 if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
 &task) != 0) {
@@ -484,6 +491,34 @@ static int nfs_file_truncate(BlockDriverState *bs, int64_t 
offset)
 return nfs_ftruncate(client->context, client->fh, offset);
 }
 
+/* Note that this will not re-establish a connection with the NFS server
+ * - it is effectively a NOP.  */
+static int nfs_reopen_prepare(BDRVReopenState *state,
+  BlockReopenQueue *queue, Error **errp)
+{
+NFSClient *client = state->bs->opaque;
+struct stat st;
+int ret = 0;
+
+if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) {
+error_setg(errp, "Cannot open a read-only mount as read-write");
+return -EACCES;
+}
+
+/* Update cache for read-only reopens */
+if (!(state->flags & BDRV_O_RDWR)) {
+ret = nfs_fstat(client->context, client->fh, &st);
+if (ret < 0) {
+error_setg(errp, "Failed to fstat file: %s",
+   nfs_get_error(client->context));
+return ret;
+}
+client->st_blocks = st.st_blocks;
+}
+
+return 0;
+}
+
 static BlockDriver bdrv_nfs = {
 .format_name= "nfs",
 .protocol_name  = "nfs",
@@ -499,6 +534,7 @@ static BlockDriver bdrv_nfs = {
 .bdrv_file_open = nfs_file_open,
 .bdrv_close = nfs_file_close,
 .bdrv_create= nfs_file_create,
+.bdrv_reopen_prepare= nfs_reopen_prepare,
 
 .bdrv_co_readv  = nfs_co_readv,
 .bdrv_co_writev = nfs_co_writev,
-- 
1.9.1




[Qemu-block] coroutine pool memory usage

2015-08-27 Thread Peter Lieven

Hi,

I have been debugging the increased memory footprint of qemu for a while now
and found that the coroutine pool heap usage can grow up to 70MB by just
booting an Ubuntu Live CD. And those 70MB are never freed.

Is this expected? Wouldn't it make sense to asynchronously throw away some
coroutines (or at least their stacks) if there is no I/O?

Does anyone have a pointer to benchmarks of coroutine performance for NAS
(iSCSI / NFS) with and without the freelist? I would think that it only has a
significant impact for local (SSD) storage?

Thanks,
Peter






Re: [Qemu-block] [PATCHv2] block/nfs: cache allocated filesize for read-only files

2015-08-26 Thread Peter Lieven
Am 26.08.2015 um 17:31 schrieb Jeff Cody:
> On Mon, Aug 24, 2015 at 10:13:16PM +0200, Max Reitz wrote:
>> On 24.08.2015 21:34, Peter Lieven wrote:
>>> Am 24.08.2015 um 20:39 schrieb Max Reitz:
>>>> On 24.08.2015 10:06, Peter Lieven wrote:
>>>>> If the file is readonly its not expected to grow so
>>>>> save the blocking call to nfs_fstat_async and use
>>>>> the value saved at connection time. Also important
>>>>> the monitor (and thus the main loop) will not hang
>>>>> if block device info is queried and the NFS share
>>>>> is unresponsive.
>>>>>
>>>>> Signed-off-by: Peter Lieven 
>>>>> ---
>>>>> v1->v2: update cache on reopen_prepare [Max]
>>>>>
>>>>>  block/nfs.c | 35 +++
>>>>>  1 file changed, 35 insertions(+)
>>>> Reviewed-by: Max Reitz 
>>>>
>>>> I hope you're ready for the "Stale actual-size value with
>>>> cache=direct,read-only=on,format=raw files on NFS" reports. :-)
>>> actually a good point, maybe the cache should only be used if
>>>
>>> !(bs->open_flags & BDRV_O_NOCACHE)
>> Good enough a point to fix it? ;-)
>>
>> Max
>>
> It seems more inline with expected behavior, to add the cache checking
> in before using the size cache.  Would you be opposed to a v3 with
> this check added in?

Of course, will send it tomorrow.

>
> One other concern I have is similar to a concern Max raised earlier -
> about an external program modifying the raw image, while QEMU has it
> opened r/o.  In particular, I wonder about an NFS server making an
> image either sparse / non-sparse.  If it was exported read-only, it
> may be a valid assumption that this could be done safely, as it would
> not change the reported file size or contents, just the allocated size
> on disk.

This might be a use case. But if I allow caching, the allocated filesize
might not always be correct. This is even the case on an NFS share mounted
through the kernel, where some attributes are cached for some time.

Anyway, would it hurt here if the actual filesize was too small?
In fact it has been incorrect ever since libnfs support was added :-)

Peter




Re: [Qemu-block] [PATCHv2] block/nfs: cache allocated filesize for read-only files

2015-08-24 Thread Peter Lieven
Am 24.08.2015 um 20:39 schrieb Max Reitz:
> On 24.08.2015 10:06, Peter Lieven wrote:
>> If the file is readonly its not expected to grow so
>> save the blocking call to nfs_fstat_async and use
>> the value saved at connection time. Also important
>> the monitor (and thus the main loop) will not hang
>> if block device info is queried and the NFS share
>> is unresponsive.
>>
>> Signed-off-by: Peter Lieven 
>> ---
>> v1->v2: update cache on reopen_prepare [Max]
>>
>>  block/nfs.c | 35 +++
>>  1 file changed, 35 insertions(+)
> Reviewed-by: Max Reitz 
>
> I hope you're ready for the "Stale actual-size value with
> cache=direct,read-only=on,format=raw files on NFS" reports. :-)
actually a good point, maybe the cache should only be used if

!(bs->open_flags & BDRV_O_NOCACHE)

for my cdrom stuff this is still ok.

Peter





[Qemu-block] [PATCHv2] block/nfs: cache allocated filesize for read-only files

2015-08-24 Thread Peter Lieven
If the file is read-only it is not expected to grow, so we can
save the blocking call to nfs_fstat_async and use
the value saved at connection time. Also important: the
monitor (and thus the main loop) will not hang
if block device info is queried and the NFS share
is unresponsive.

Signed-off-by: Peter Lieven 
---
v1->v2: update cache on reopen_prepare [Max]

 block/nfs.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index 02eb4e4..a52e9d5 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -43,6 +43,7 @@ typedef struct NFSClient {
 int events;
 bool has_zero_init;
 AioContext *aio_context;
+blkcnt_t st_blocks;
 } NFSClient;
 
 typedef struct NFSRPC {
@@ -374,6 +375,7 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
 }
 
 ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
+client->st_blocks = st.st_blocks;
 client->has_zero_init = S_ISREG(st.st_mode);
 goto out;
 fail:
@@ -464,6 +466,10 @@ static int64_t 
nfs_get_allocated_file_size(BlockDriverState *bs)
 NFSRPC task = {0};
 struct stat st;
 
+if (bdrv_is_read_only(bs)) {
+return client->st_blocks * 512;
+}
+
 task.st = &st;
 if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
 &task) != 0) {
@@ -484,6 +490,34 @@ static int nfs_file_truncate(BlockDriverState *bs, int64_t 
offset)
 return nfs_ftruncate(client->context, client->fh, offset);
 }
 
+/* Note that this will not re-establish a connection with the NFS server
+ * - it is effectively a NOP.  */
+static int nfs_reopen_prepare(BDRVReopenState *state,
+  BlockReopenQueue *queue, Error **errp)
+{
+NFSClient *client = state->bs->opaque;
+struct stat st;
+int ret = 0;
+
+if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) {
+error_setg(errp, "Cannot open a read-only mount as read-write");
+return -EACCES;
+}
+
+/* Update cache for read-only reopens */
+if (!(state->flags & BDRV_O_RDWR)) {
+ret = nfs_fstat(client->context, client->fh, &st);
+if (ret < 0) {
+error_setg(errp, "Failed to fstat file: %s",
+   nfs_get_error(client->context));
+return ret;
+}
+client->st_blocks = st.st_blocks;
+}
+
+return 0;
+}
+
 static BlockDriver bdrv_nfs = {
 .format_name= "nfs",
 .protocol_name  = "nfs",
@@ -499,6 +533,7 @@ static BlockDriver bdrv_nfs = {
 .bdrv_file_open = nfs_file_open,
 .bdrv_close = nfs_file_close,
 .bdrv_create= nfs_file_create,
+.bdrv_reopen_prepare= nfs_reopen_prepare,
 
 .bdrv_co_readv  = nfs_co_readv,
 .bdrv_co_writev = nfs_co_writev,
-- 
1.9.1




Re: [Qemu-block] [Qemu-devel] [PATCH] block/nfs: cache allocated filesize for read-only files

2015-08-21 Thread Peter Lieven
Am 21.08.2015 um 18:46 schrieb Max Reitz:
> On 2015-08-21 at 00:49, Peter Lieven wrote:
>> If the file is readonly its not expected to grow so
>> save the blocking call to nfs_fstat_async and use
>> the value saved at connection time. Also important
>> the monitor (and thus the main loop) will not hang
>> if block device info is queried and the NFS share
>> is unresponsive.
>>
>> Signed-off-by: Peter Lieven 
>> ---
>>   block/nfs.c | 6 ++
>>   1 file changed, 6 insertions(+)
>
> First, I don't like the idea of this patch very much, but since I've never 
> used qemu's native NFS client, it's not up to me to decide whether it's worth 
> it.

I am trying to solve the problem that a stale NFS server with a CDROM ISO on
it can hang Qemu's main loop. One of the things that happens is that you query
"info block" in HMP or "query-block" via QMP, which indirectly calls
bdrv_get_allocated_file_size and bang, Qemu hangs. Also, I don't know if it's
worth issuing an RPC call for each execution of "info block".


>
> When it comes to breaking this, what comes to mind first is some external 
> program opening the image read-write outside of qemu and writing to it. Maybe 
> that's a case we generally don't want, but maybe that's something some people 
> do on purpose, knowing what they're doing (with raw images), you never know.

I would consider this bad behaviour. However, the allocated file size shouldn't
matter for raw images. If you resize the image externally you have to call
bdrv_truncate anyway to make Qemu aware of that change.

>
> Other than that, there's reopening. As far as I'm aware, qemu can reopen a 
> R/W image read-only, and if that happens, st_blocks may be stale.

That's a valid point. But it can be solved by implementing .bdrv_reopen_prepare
and updating st_blocks there.

Thanks for your thoughts,
Peter



[Qemu-block] [PATCH] block/nfs: cache allocated filesize for read-only files

2015-08-21 Thread Peter Lieven
If the file is read-only it is not expected to grow, so we can
save the blocking call to nfs_fstat_async and use
the value saved at connection time. Also important: the
monitor (and thus the main loop) will not hang
if block device info is queried and the NFS share
is unresponsive.

Signed-off-by: Peter Lieven 
---
 block/nfs.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index 02eb4e4..dc9ed21 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -43,6 +43,7 @@ typedef struct NFSClient {
 int events;
 bool has_zero_init;
 AioContext *aio_context;
+blkcnt_t st_blocks;
 } NFSClient;
 
 typedef struct NFSRPC {
@@ -374,6 +375,7 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
 }
 
 ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
+client->st_blocks = st.st_blocks;
 client->has_zero_init = S_ISREG(st.st_mode);
 goto out;
 fail:
@@ -464,6 +466,10 @@ static int64_t 
nfs_get_allocated_file_size(BlockDriverState *bs)
 NFSRPC task = {0};
 struct stat st;
 
+if (bdrv_is_read_only(bs)) {
+return client->st_blocks * 512;
+}
+
 task.st = &st;
 if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
 &task) != 0) {
-- 
1.9.1




[Qemu-block] [PATCH] block/nfs: fix calculation of allocated file size

2015-08-20 Thread Peter Lieven
st.st_blocks is always counted in 512-byte units. Do not
use st.st_blksize as the multiplier, which may be larger.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Peter Lieven 
---
 block/nfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nfs.c b/block/nfs.c
index c026ff6..02eb4e4 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -475,7 +475,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState 
*bs)
 aio_poll(client->aio_context, true);
 }
 
-return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize);
+return (task.ret < 0 ? task.ret : st.st_blocks * 512);
 }
 
 static int nfs_file_truncate(BlockDriverState *bs, int64_t offset)
-- 
1.9.1
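
A small standalone program (illustrative only, not QEMU code) makes the
distinction above visible: on Linux, st_blocks counts 512-byte units
regardless of what st_blksize reports as the preferred I/O size.

#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
    struct stat st;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    if (stat(argv[1], &st) != 0) {
        perror("stat");
        return 1;
    }
    /* st_blocks counts 512-byte units, independent of the preferred I/O size */
    printf("allocated size: %lld bytes (st_blocks * 512)\n",
           (long long)st.st_blocks * 512);
    printf("st_blksize:     %ld (preferred I/O size, not a multiplier)\n",
           (long)st.st_blksize);
    return 0;
}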




[Qemu-block] [PATCH 1/2] block/io: allow AIOCB without callback

2015-08-20 Thread Peter Lieven
If the backend storage is unresponsive and we cancel a request due to
a timeout we cannot immediately destroy the AIOCB because the storage
might complete the original request laster if it is responsive again.
For this purpose allow to set the callback to NULL and ignore it in
this case.

Signed-off-by: Peter Lieven 
---
 block/io.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index d4bc83b..e628581 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2007,7 +2007,9 @@ static void bdrv_aio_bh_cb(void *opaque)
 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
 }
 qemu_vfree(acb->bounce);
-acb->common.cb(acb->common.opaque, acb->ret);
+if (acb->common.cb) {
+acb->common.cb(acb->common.opaque, acb->ret);
+}
 qemu_bh_delete(acb->bh);
 acb->bh = NULL;
 qemu_aio_unref(acb);
@@ -2075,7 +2077,9 @@ static const AIOCBInfo bdrv_em_co_aiocb_info = {
 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 {
 if (!acb->need_bh) {
-acb->common.cb(acb->common.opaque, acb->req.error);
+if (acb->common.cb) {
+acb->common.cb(acb->common.opaque, acb->req.error);
+}
 qemu_aio_unref(acb);
 }
 }
-- 
1.9.1




[Qemu-block] [PATCH 2/2] ide/atapi: partially avoid deadlock if the storage backend is dead

2015-08-20 Thread Peter Lieven
the blk_drain_all() that is executed if the guest issues a DMA cancel
leads to a stuck main loop if the storage backend (e.g. an NFS share)
is unresponsive.

This scenario is a common case for CDROM images mounted from an
NFS share. In this case a broken NFS server can take down the
whole VM even if the mounted CDROM is not used and was just not
unmounted after usage.

This approach avoids the blk_drain_all for read-only media and
cancels the AIO locally and makes the callback a NOP if the
original request is completed after the NFS share is responsive
again.

Signed-off-by: Peter Lieven 
---
 hw/ide/pci.c | 32 ++--
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index d31ff88..a8b4175 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -240,21 +240,25 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
 /* Ignore writes to SSBM if it keeps the old value */
 if ((val & BM_CMD_START) != (bm->cmd & BM_CMD_START)) {
 if (!(val & BM_CMD_START)) {
-/*
- * We can't cancel Scatter Gather DMA in the middle of the
- * operation or a partial (not full) DMA transfer would reach
- * the storage so we wait for completion instead (we beahve
- * like if the DMA was completed by the time the guest trying
- * to cancel dma with bmdma_cmd_writeb with BM_CMD_START not
- * set).
- *
- * In the future we'll be able to safely cancel the I/O if the
- * whole DMA operation will be submitted to disk with a single
- * aio operation with preadv/pwritev.
- */
 if (bm->bus->dma->aiocb) {
-blk_drain_all();
-assert(bm->bus->dma->aiocb == NULL);
+if (!bdrv_is_read_only(bm->bus->dma->aiocb->bs)) {
+/* We can't cancel Scatter Gather DMA in the middle of the
+ * operation or a partial (not full) DMA transfer would
+ * reach the storage so we wait for completion instead
+ * (we beahve like if the DMA was completed by the time the
+ * guest trying to cancel dma with bmdma_cmd_writeb with
+ * BM_CMD_START not set). */
+blk_drain_all();
+assert(bm->bus->dma->aiocb == NULL);
+} else {
+/* On a read-only device (e.g. CDROM) we can't cause incon-
+ * sistencies and thus cancel the AIOCB locally and avoid
+ * to be called back later if the original request is
+ * completed. */
+BlockAIOCB *aiocb = bm->bus->dma->aiocb;
+aiocb->cb(aiocb->opaque, -ECANCELED);
+aiocb->cb = NULL;
+}
 }
 bm->status &= ~BM_STATUS_DMAING;
 } else {
-- 
1.9.1




[Qemu-block] [PATCH 0/2] ide/atapi: partially avoid deadlock if the storage backend is dead

2015-08-20 Thread Peter Lieven
the blk_drain_all() that is executed if the guest issues a DMA cancel
leads to a stuck main loop if the storage backend (e.g. an NFS share)
is unresponsive.

This scenario is a common case for CDROM images mounted from an
NFS share. In this case a broken NFS server can take down the
whole VM even if the mounted CDROM is not used and was just not
unmounted after usage.

This approach avoids the blk_drain_all for read-only media and
cancels the AIO locally and makes the callback a NOP if the
original request is completed after the NFS share is responsive
again.

Peter Lieven (2):
  block/io: allow AIOCB without callback
  ide/atapi: partially avoid deadlock if the storage backend is dead

 block/io.c   |  8 ++--
 hw/ide/pci.c | 32 ++--
 2 files changed, 24 insertions(+), 16 deletions(-)

-- 
1.9.1




Re: [Qemu-block] [Qemu-devel] RFC cdrom in own thread?

2015-08-15 Thread Peter Lieven
Am 14.08.2015 um 16:45 schrieb Peter Lieven:
> Am 14.08.2015 um 16:08 schrieb Kevin Wolf:
>> Am 14.08.2015 um 15:43 hat Peter Lieven geschrieben:
>>> Am 22.06.2015 um 23:54 schrieb John Snow:
>>>> On 06/22/2015 09:09 AM, Peter Lieven wrote:
>>>>> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
>>>>>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven  wrote:
>>>>>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>>>>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven  wrote:
>>>>>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>>>>>>> Thread 2 (Thread 0x75550700 (LWP 2636)):
>>>>>>>>>>>>> #0  0x75d87aa3 in ppoll () from
>>>>>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>>>>>>> No symbol table info available.
>>>>>>>>>>>>> #1  0x55955d91 in qemu_poll_ns (fds=0x563889c0,
>>>>>>>>>>>>> nfds=3,
>>>>>>>>>>>>>   timeout=4999424576) at qemu-timer.c:326
>>>>>>>>>>>>>   ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>>>>>>   tvsec = 4
>>>>>>>>>>>>> #2  0x55956feb in aio_poll (ctx=0x563528e0,
>>>>>>>>>>>>> blocking=true)
>>>>>>>>>>>>>   at aio-posix.c:231
>>>>>>>>>>>>>   node = 0x0
>>>>>>>>>>>>>   was_dispatching = false
>>>>>>>>>>>>>   ret = 1
>>>>>>>>>>>>>   progress = false
>>>>>>>>>>>>> #3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0,
>>>>>>>>>>>>> offset=4292007936,
>>>>>>>>>>>>>   qiov=0x7554f760, is_write=false, flags=0) at
>>>>>>>>>>>>> block.c:2699
>>>>>>>>>>>>>   aio_context = 0x563528e0
>>>>>>>>>>>>>   co = 0x563888a0
>>>>>>>>>>>>>   rwco = {bs = 0x5637eae0, offset = 4292007936,
>>>>>>>>>>>>> qiov = 0x7554f760, is_write = false, ret =
>>>>>>>>>>>>> 2147483647,
>>>>>>>>>>>>> flags = 0}
>>>>>>>>>>>>> #4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0,
>>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4, is_write=false,
>>>>>>>>>>>>> flags=0)
>>>>>>>>>>>>>   at block.c:2722
>>>>>>>>>>>>>   qiov = {iov = 0x7554f780, niov = 1, nalloc = -1,
>>>>>>>>>>>>> size =
>>>>>>>>>>>>> 2048}
>>>>>>>>>>>>>   iov = {iov_base = 0x744cc800, iov_len = 2048}
>>>>>>>>>>>>> #5  0x5594b008 in bdrv_read (bs=0x5637eae0,
>>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>>>>>>> No locals.
>>>>>>>>>>>>> #6  0x5599acef in blk_read (blk=0x56376820,
>>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4) at
>>>>>>>>>>>>> block/block-backend.c:404
>>>>>>>>>>>>> No locals.
>>>>>>>>>>>>> #7  0x55833ed2 in cd_read_sector (s=0x56408f88,
>>>>>>>>>>>>> lba=2095707,
>>>

Re: [Qemu-block] [Qemu-devel] RFC cdrom in own thread?

2015-08-14 Thread Peter Lieven
Am 14.08.2015 um 16:08 schrieb Kevin Wolf:
> Am 14.08.2015 um 15:43 hat Peter Lieven geschrieben:
>> Am 22.06.2015 um 23:54 schrieb John Snow:
>>> On 06/22/2015 09:09 AM, Peter Lieven wrote:
>>>> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
>>>>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven  wrote:
>>>>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>>>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven  wrote:
>>>>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>>>>>> Thread 2 (Thread 0x75550700 (LWP 2636)):
>>>>>>>>>>>> #0  0x75d87aa3 in ppoll () from
>>>>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>>>>>> No symbol table info available.
>>>>>>>>>>>> #1  0x55955d91 in qemu_poll_ns (fds=0x563889c0,
>>>>>>>>>>>> nfds=3,
>>>>>>>>>>>>   timeout=4999424576) at qemu-timer.c:326
>>>>>>>>>>>>   ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>>>>>   tvsec = 4
>>>>>>>>>>>> #2  0x55956feb in aio_poll (ctx=0x563528e0,
>>>>>>>>>>>> blocking=true)
>>>>>>>>>>>>   at aio-posix.c:231
>>>>>>>>>>>>   node = 0x0
>>>>>>>>>>>>   was_dispatching = false
>>>>>>>>>>>>   ret = 1
>>>>>>>>>>>>   progress = false
>>>>>>>>>>>> #3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0,
>>>>>>>>>>>> offset=4292007936,
>>>>>>>>>>>>   qiov=0x7554f760, is_write=false, flags=0) at
>>>>>>>>>>>> block.c:2699
>>>>>>>>>>>>   aio_context = 0x563528e0
>>>>>>>>>>>>   co = 0x563888a0
>>>>>>>>>>>>   rwco = {bs = 0x5637eae0, offset = 4292007936,
>>>>>>>>>>>> qiov = 0x7554f760, is_write = false, ret =
>>>>>>>>>>>> 2147483647,
>>>>>>>>>>>> flags = 0}
>>>>>>>>>>>> #4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4, is_write=false,
>>>>>>>>>>>> flags=0)
>>>>>>>>>>>>   at block.c:2722
>>>>>>>>>>>>   qiov = {iov = 0x7554f780, niov = 1, nalloc = -1,
>>>>>>>>>>>> size =
>>>>>>>>>>>> 2048}
>>>>>>>>>>>>   iov = {iov_base = 0x744cc800, iov_len = 2048}
>>>>>>>>>>>> #5  0x5594b008 in bdrv_read (bs=0x5637eae0,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>>>>>> No locals.
>>>>>>>>>>>> #6  0x5599acef in blk_read (blk=0x56376820,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4) at
>>>>>>>>>>>> block/block-backend.c:404
>>>>>>>>>>>> No locals.
>>>>>>>>>>>> #7  0x55833ed2 in cd_read_sector (s=0x56408f88,
>>>>>>>>>>>> lba=2095707,
>>>>>>>>>>>>   buf=0x744cc800 "(", sector_size=2048) at
>>>>>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>>>>>   ret = 32767
>>>>>>>>>&

Re: [Qemu-block] [Qemu-devel] RFC cdrom in own thread?

2015-08-14 Thread Peter Lieven
Am 22.06.2015 um 23:54 schrieb John Snow:
>
> On 06/22/2015 09:09 AM, Peter Lieven wrote:
>> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
>>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven  wrote:
>>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven  wrote:
>>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>>>> Thread 2 (Thread 0x75550700 (LWP 2636)):
>>>>>>>>>> #0  0x75d87aa3 in ppoll () from
>>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>>>> No symbol table info available.
>>>>>>>>>> #1  0x55955d91 in qemu_poll_ns (fds=0x563889c0,
>>>>>>>>>> nfds=3,
>>>>>>>>>>   timeout=4999424576) at qemu-timer.c:326
>>>>>>>>>>   ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>>>   tvsec = 4
>>>>>>>>>> #2  0x55956feb in aio_poll (ctx=0x563528e0,
>>>>>>>>>> blocking=true)
>>>>>>>>>>   at aio-posix.c:231
>>>>>>>>>>   node = 0x0
>>>>>>>>>>   was_dispatching = false
>>>>>>>>>>   ret = 1
>>>>>>>>>>   progress = false
>>>>>>>>>> #3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0,
>>>>>>>>>> offset=4292007936,
>>>>>>>>>>   qiov=0x7554f760, is_write=false, flags=0) at
>>>>>>>>>> block.c:2699
>>>>>>>>>>   aio_context = 0x563528e0
>>>>>>>>>>   co = 0x563888a0
>>>>>>>>>>   rwco = {bs = 0x5637eae0, offset = 4292007936,
>>>>>>>>>> qiov = 0x7554f760, is_write = false, ret =
>>>>>>>>>> 2147483647,
>>>>>>>>>> flags = 0}
>>>>>>>>>> #4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4, is_write=false,
>>>>>>>>>> flags=0)
>>>>>>>>>>   at block.c:2722
>>>>>>>>>>   qiov = {iov = 0x7554f780, niov = 1, nalloc = -1,
>>>>>>>>>> size =
>>>>>>>>>> 2048}
>>>>>>>>>>   iov = {iov_base = 0x744cc800, iov_len = 2048}
>>>>>>>>>> #5  0x5594b008 in bdrv_read (bs=0x5637eae0,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>>>> No locals.
>>>>>>>>>> #6  0x5599acef in blk_read (blk=0x56376820,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>   buf=0x744cc800 "(", nb_sectors=4) at
>>>>>>>>>> block/block-backend.c:404
>>>>>>>>>> No locals.
>>>>>>>>>> #7  0x55833ed2 in cd_read_sector (s=0x56408f88,
>>>>>>>>>> lba=2095707,
>>>>>>>>>>   buf=0x744cc800 "(", sector_size=2048) at
>>>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>>>   ret = 32767
>>>>>>>>> Here is the problem: The ATAPI emulation uses synchronous
>>>>>>>>> blk_read()
>>>>>>>>> instead of the AIO or coroutine interfaces. This means that it
>>>>>>>>> keeps
>>>>>>>>> polling for request completion while it holds the BQL until the
>>>>>>>>> request
>>>>>>>>> is completed.
>>>>>>>> I will look at this.
>>>>>> I need some further help. My way to "emula

[Qemu-block] [PATCH] block/iscsi: validate block size returned from target

2015-08-14 Thread Peter Lieven
It has been reported that at least tgtd returns a block size of 0
for LUN 0. To avoid running into divide by zero later on and protect
against other problematic block sizes validate the block size right
at connection time.

Cc: qemu-sta...@nongnu.org
Reported-by: Andrey Korolyov 
Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 4 
 dtc   | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 5002916..fac3a7a 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1214,6 +1214,10 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, 
Error **errp)
 
 if (task == NULL || task->status != SCSI_STATUS_GOOD) {
 error_setg(errp, "iSCSI: failed to send readcapacity10 command.");
+} else if (!iscsilun->block_size ||
+   iscsilun->block_size % BDRV_SECTOR_SIZE) {
+error_setg(errp, "iSCSI: the target returned an invalid "
+   "block size of %d.", iscsilun->block_size);
 }
 if (task) {
 scsi_free_scsi_task(task);



Re: [Qemu-block] [RESEND] RFC changing zlib windowBits for QCOW2

2015-07-30 Thread Peter Lieven

Am 27.07.2015 um 16:30 schrieb Stefan Hajnoczi:

On Mon, Jul 27, 2015 at 2:58 PM, Peter Lieven  wrote:

I was experimenting a little with optimized zlib versions (like zlib-ng) to
optimize the speed of compressed qcow2 generation.

I found out that chaning the windowBits from -12 to -15 alone increases
compression speed by about 15% while lowering
the compressed size by about 6%. As a test case I used a Debian 8.1 disk
image.

What is your opinion in chaning the windowBits? The cost is a higher memory
usage. The usage changes from 272k -> 384k for
compression and from 4k -> 32k for decompression.

Kevin is the qcow2 maintainer, but judging purely by the numbers you
posted it seems attractive to make this change.

Does it have any impact on the file format?  Right now -12 is
hard-coded in QEMU, so we need to make sure that existing files and
programs that can access qcow2 files continue to work with both
old/new images.  If compatibility is not possible then you'll have to
investigate feature bits.


The decompression code also takes a windowBits parameter and the documentation
says that decoding a -15 image with a -12 setting should fail. It doesn't, but
that might depend on the zlib implementation.

The other way around works.

Peter
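
For reference, a minimal standalone sketch of where the parameter goes
(synthetic test data here, not the Debian image the numbers above were
measured on, and not the qcow2 code path itself):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

static long deflate_size(const unsigned char *in, size_t in_len, int window_bits)
{
    static unsigned char out[1 << 20];
    z_stream strm;
    long n;

    memset(&strm, 0, sizeof(strm));
    /* negative windowBits selects raw deflate (no zlib header), as qcow2 uses */
    if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                     window_bits, 9, Z_DEFAULT_STRATEGY) != Z_OK) {
        return -1;
    }
    strm.next_in = (unsigned char *)in;
    strm.avail_in = in_len;
    strm.next_out = out;
    strm.avail_out = sizeof(out);
    if (deflate(&strm, Z_FINISH) != Z_STREAM_END) {
        deflateEnd(&strm);
        return -1;
    }
    n = strm.total_out;
    deflateEnd(&strm);
    return n;
}

int main(void)
{
    static unsigned char buf[256 * 1024];
    size_t i;

    for (i = 0; i < sizeof(buf); i++) {
        buf[i] = (unsigned char)(i % 251); /* mildly compressible test data */
    }
    printf("windowBits -12: %ld bytes\n", deflate_size(buf, sizeof(buf), -12));
    printf("windowBits -15: %ld bytes\n", deflate_size(buf, sizeof(buf), -15));
    return 0;
}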




[Qemu-block] [RESEND] RFC changing zlib windowBits for QCOW2

2015-07-27 Thread Peter Lieven

Hi,

I was experimenting a little with optimized zlib versions (like zlib-ng) to 
optimize the speed of compressed qcow2 generation.

I found out that changing the windowBits from -12 to -15 alone increases 
compression speed by about 15% while lowering
the compressed size by about 6%. As a test case I used a Debian 8.1 disk image.

What is your opinion on changing the windowBits? The cost is higher memory usage. 
The usage changes from 272k -> 384k for
compression and from 4k -> 32k for decompression.

Peter




[Qemu-block] RFC changing zlib windowBits for QCOW2

2015-07-13 Thread Peter Lieven

Hi,

I was experimenting a little with optimized zlib versions (like zlib-ng) to 
optimize the speed of compressed qcow2 generation.

I found out that changing the windowBits from -12 to -15 alone increases 
compression speed by about 15% while lowering
the compressed size by about 6%. As a test case I used a Debian 8.1 disk image.

What is your opinion on changing the windowBits? The cost is higher memory usage. 
The usage changes from 272k -> 384k for
compression and from 4k -> 32k for decompression.

Peter




Re: [Qemu-block] [PATCH for-2.5 1/4] block: add BlockLimits.max_iov field

2015-07-08 Thread Peter Lieven

Am 08.07.2015 um 17:30 schrieb Stefan Hajnoczi:

The maximum number of struct iovec elements depends on the
BlockDriverState.  The raw-posix protocol has a maximum of IOV_MAX but
others could have different values.

Instead of assuming raw-posix and hardcoding IOV_MAX in several places,
put the limit into BlockLimits.

Cc: Peter Lieven 
Suggested-by: Kevin Wolf 
Signed-off-by: Stefan Hajnoczi 
---
Peter Lieven: I think the SCSI LUN level does not have a maximum
scatter-gather segments constraint.  That is probably only at the HBA
level.  CCed you anyway in case you think block/iscsi.c should set the
max_iov field.


libiscsi will send the iovec array straight to readv and writev to
read/write from the TCP socket. So we need IOV_MAX here as well.



Kevin: The default is now INT_MAX.  This means non-raw-posix users will
now be able to merge more requests than before.  They were limited to
IOV_MAX previously.  This could expose limits in other BlockDrivers
which we weren't aware of...


Why raise the default to INT_MAX and not leave it at IOV_MAX?
Is there any case where we expect that many iovectors coming in?

Peter
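
The readv/writev limit mentioned above can be demonstrated with a standalone
test (illustrative only): the kernel rejects a vector with more than IOV_MAX
entries with EINVAL, which is why a driver that hands its iovec array straight
to the socket has to advertise exactly that limit.

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef IOV_MAX
#define IOV_MAX 1024 /* Linux UIO_MAXIOV */
#endif

int main(void)
{
    int nvec = IOV_MAX + 1;
    struct iovec *iov = calloc(nvec, sizeof(*iov));
    char byte = 0;
    int i, fd;
    ssize_t n;

    if (!iov) {
        return 1;
    }
    for (i = 0; i < nvec; i++) {
        iov[i].iov_base = &byte;
        iov[i].iov_len = 1;
    }
    fd = open("/dev/null", O_WRONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* /dev/null accepts any write, so a failure here is due to the iovcnt */
    n = writev(fd, iov, nvec);
    if (n < 0) {
        printf("writev with %d segments failed: %s\n", nvec, strerror(errno));
    } else {
        printf("writev with %d segments wrote %zd bytes\n", nvec, n);
    }
    close(fd);
    free(iov);
    return 0;
}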




[Qemu-block] [PATCH V3] block/nfs: add support for setting debug level

2015-07-06 Thread Peter Lieven
upcoming libnfs versions will support logging debug messages. Add
support for it in qemu through a per-drive option.

Examples:
 qemu -drive if=virtio,file=nfs://...,file.debug=2
 qemu-img create -o debug=2 nfs://... 10G

Signed-off-by: Peter Lieven 
---
v2->v3: use a per-drive option instead of a global one. [Stefan]
v1->v2: reworked patch to accept the debug level as a cmdline
parameter instead of an URI parameter [Stefan]

 block/nfs.c  | 28 
 qapi/block-core.json | 20 
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/block/nfs.c b/block/nfs.c
index c026ff6..72a4247 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -233,6 +233,11 @@ static QemuOptsList runtime_opts = {
 .type = QEMU_OPT_STRING,
 .help = "URL to the NFS file",
 },
+{
+.name = "debug",
+.type = QEMU_OPT_NUMBER,
+.help = "Set libnfs debug level (default 0 = no debug)",
+},
 { /* end of list */ }
 },
 };
@@ -277,9 +282,9 @@ static void nfs_file_close(BlockDriverState *bs)
 }
 
 static int64_t nfs_client_open(NFSClient *client, const char *filename,
-   int flags, Error **errp)
+   int flags, QemuOpts *opts, Error **errp)
 {
-int ret = -EINVAL, i;
+int ret = -EINVAL, i, debug;
 struct stat st;
 URI *uri;
 QueryParams *qp = NULL;
@@ -343,6 +348,16 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
 }
 }
 
+debug = qemu_opt_get_number(opts, "debug", 0);
+if (debug) {
+#ifdef LIBNFS_FEATURE_DEBUG
+nfs_set_debug(client->context, debug);
+#else
+error_report("NFS Warning: The linked version of libnfs does"
+ " not support setting debug levels");
+#endif
+}
+
 ret = nfs_mount(client->context, uri->server, uri->path);
 if (ret < 0) {
 error_setg(errp, "Failed to mount nfs share: %s",
@@ -405,7 +420,7 @@ static int nfs_file_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 ret = nfs_client_open(client, qemu_opt_get(opts, "filename"),
   (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
-  errp);
+  opts, errp);
 if (ret < 0) {
 goto out;
 }
@@ -425,6 +440,11 @@ static QemuOptsList nfs_create_opts = {
 .type = QEMU_OPT_SIZE,
 .help = "Virtual disk size"
 },
+{
+.name = "debug",
+.type = QEMU_OPT_NUMBER,
+.help = "Set libnfs debug level (default 0 = no debug)",
+},
 { /* end of list */ }
 }
 };
@@ -441,7 +461,7 @@ static int nfs_file_create(const char *url, QemuOpts *opts, 
Error **errp)
 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
   BDRV_SECTOR_SIZE);
 
-ret = nfs_client_open(client, url, O_CREAT, errp);
+ret = nfs_client_open(client, url, O_CREAT, opts, errp);
 if (ret < 0) {
 goto out;
 }
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 7b2efb8..f43a1b1 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1381,9 +1381,9 @@
 { 'enum': 'BlockdevDriver',
   'data': [ 'archipelago', 'blkdebug', 'blkverify', 'bochs', 'cloop',
 'dmg', 'file', 'ftp', 'ftps', 'host_cdrom', 'host_device',
-'host_floppy', 'http', 'https', 'null-aio', 'null-co', 'parallels',
-'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'tftp', 'vdi', 'vhdx',
-'vmdk', 'vpc', 'vvfat' ] }
+'host_floppy', 'http', 'https', 'nfs', 'null-aio', 'null-co',
+'parallels', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'tftp',
+'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
 
 ##
 # @BlockdevOptionsBase
@@ -1635,6 +1635,18 @@
 '*vport': 'int',
 '*segment': 'str' } }
 
+##
+# @BlockdevOptionsNFS
+#
+# Driver specific block device options for NFS.
+#
+# @debug:   #optional set libnfs debug level (default: 0 = disabled)
+#
+# Since: 2.4
+##
+{ 'struct': 'BlockdevOptionsNFS',
+  'base': 'BlockdevOptionsFile',
+  'data': { '*debug': 'int' } }
 
 ##
 # @BlkdebugEvent
@@ -1816,7 +1828,7 @@
   'https':  'BlockdevOptionsFile',
 # TODO iscsi: Wait for structured options
 # TODO nbd: Should take InetSocketAddress for 'host'?
-# TODO nfs: Wait for structured options
+  'nfs':'BlockdevOptionsNFS',
   'null-aio':   'BlockdevOptionsNull',
   'null-co':'BlockdevOptionsNull',
   'parallels':  'BlockdevOptionsGenericFormat',
-- 
1.9.1




[Qemu-block] [PATCH] block/nfs: limit maximum readahead size to 1MB

2015-06-26 Thread Peter Lieven
a malicious caller could otherwise specify a very
large value via the URI and force libnfs to allocate
a large amount of memory for the readahead buffer.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Peter Lieven 
---
 block/nfs.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index 43d48ae..4fb1937 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -35,6 +35,8 @@
 #include "sysemu/sysemu.h"
 #include 
 
+#define QEMU_NFS_MAX_READAHEAD_SIZE 1048576
+
 typedef struct NFSClient {
 struct nfs_context *context;
 struct nfsfh *fh;
@@ -351,6 +353,11 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
 nfs_set_tcp_syncnt(client->context, val);
 #ifdef LIBNFS_FEATURE_READAHEAD
 } else if (!strcmp(qp->p[i].name, "readahead")) {
+if (val > QEMU_NFS_MAX_READAHEAD_SIZE) {
+error_report("NFS Warning: Truncating NFS readahead"
+ " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE);
+val = QEMU_NFS_MAX_READAHEAD_SIZE;
+}
 nfs_set_readahead(client->context, val);
 #endif
 } else {
-- 
1.7.9.5




[Qemu-block] [PATCH V2] block/nfs: add support for setting debug level

2015-06-26 Thread Peter Lieven
upcoming libnfs versions will support logging debug messages. Add
support for it in qemu through a cmdline parameter.

Example
 qemu -nfs debug=99 -cdrom nfs://...

Signed-off-by: Peter Lieven 
---
v1->v2: reworked patch to accept the debug level as a cmdline
parameter instead of an URI parameter [Stefan]

 block/nfs.c |   40 
 qemu-options.hx |   21 +
 vl.c|8 
 3 files changed, 69 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index ca9e24e..43d48ae 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -274,6 +274,30 @@ static void nfs_file_close(BlockDriverState *bs)
 nfs_client_close(client);
 }
 
+static void nfs_parse_options(NFSClient *client)
+{
+QemuOptsList *list;
+QemuOpts *opts;
+const char *debug;
+
+list = qemu_find_opts("nfs");
+if (list) {
+opts = QTAILQ_FIRST(&list->head);
+if (opts) {
+debug = qemu_opt_get(opts, "debug");
+if (debug) {
+#ifdef LIBNFS_FEATURE_DEBUG
+nfs_set_debug(client->context, atoi(debug));
+#else
+error_report("NFS Warning: The linked version of libnfs does"
+ " not support setting debug levels");
+#endif
+}
+}
+}
+}
+
+
 static int64_t nfs_client_open(NFSClient *client, const char *filename,
int flags, Error **errp)
 {
@@ -336,6 +360,8 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
 }
 }
 
+nfs_parse_options(client);
+
 ret = nfs_mount(client->context, uri->server, uri->path);
 if (ret < 0) {
 error_setg(errp, "Failed to mount nfs share: %s",
@@ -501,9 +527,23 @@ static BlockDriver bdrv_nfs = {
 .bdrv_attach_aio_context= nfs_attach_aio_context,
 };
 
+static QemuOptsList qemu_nfs_opts = {
+.name = "nfs",
+.head = QTAILQ_HEAD_INITIALIZER(qemu_nfs_opts.head),
+.desc = {
+{
+.name = "debug",
+.type = QEMU_OPT_NUMBER,
+.help = "Set libnfs debug level (default 0 = no debug)",
+},
+{ /* end of list */ }
+},
+};
+
 static void nfs_block_init(void)
 {
 bdrv_register(&bdrv_nfs);
+qemu_add_opts(&qemu_nfs_opts);
 }
 
 block_init(nfs_block_init);
diff --git a/qemu-options.hx b/qemu-options.hx
index 389cf64..63c60e7 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2329,6 +2329,26 @@ STEXI
 iSCSI parameters such as username and password can also be specified via
 a configuration file. See qemu-doc for more information and examples.
 
+@item NFS
+NFS support allows QEMU to access NFS shares directly and use as
+images for the guest storage. Both disk and cdrom images are supported.
+
+Syntax for specifying NFS shares is
+``nfs:''
+
+Example (setting deubg level to 2 and use ISO from NFS share as CDROM):
+@example
+qemu-system-i386 -nfs debug=2 -cdrom nfs://127.0.0.1/isos/cdimage.iso
+@end example
+
+NFS support is an optional feature of QEMU and only available when
+compiled and linked against libnfs.
+ETEXI
+DEF("nfs", HAS_ARG, QEMU_OPTION_nfs,
+"-nfs   [debug=debug]\n"
+"NFS connection parameters\n", QEMU_ARCH_ALL)
+STEXI
+
 @item NBD
 QEMU supports NBD (Network Block Devices) both using TCP protocol as well
 as Unix Domain Sockets.
@@ -2480,6 +2500,7 @@ ETEXI
 STEXI
 @end table
 ETEXI
+DEFHEADING()
 
 DEFHEADING(Bluetooth(R) options:)
 STEXI
diff --git a/vl.c b/vl.c
index 9542095..4317572 100644
--- a/vl.c
+++ b/vl.c
@@ -3141,6 +3141,14 @@ int main(int argc, char **argv, char **envp)
 }
 break;
 #endif
+#ifdef CONFIG_LIBNFS
+case QEMU_OPTION_nfs:
+opts = qemu_opts_parse(qemu_find_opts("nfs"), optarg, 0);
+if (!opts) {
+exit(1);
+}
+break;
+#endif
 #ifdef CONFIG_SLIRP
 case QEMU_OPTION_tftp:
 legacy_tftp_prefix = optarg;
-- 
1.7.9.5




[Qemu-block] [PATCH V2] block/iscsi: restore compatiblity with libiscsi 1.9.0

2015-06-26 Thread Peter Lieven
RHEL7 and others are stuck with libiscsi 1.9.0 since there
unfortunately was an ABI breakage after that release.

Signed-off-by: Peter Lieven 
---
v1->v2: - do not use reserved macro names [Paolo]
- change text in qemu-options.hx to "libiscsi 1.15.0 or greater"
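
The first change is needed because C reserves identifiers that begin with an
underscore followed by an uppercase letter for the implementation, so the v1
names landed in the reserved namespace. A minimal illustration (value as in
the patch below):

/* v1 (problematic): a leading underscore plus uppercase letter is reserved */
#define _SCSI_STATUS_TIMEOUT     0x0f02

/* v2: a plain QEMU_ prefix stays out of the reserved namespace */
#define QEMU_SCSI_STATUS_TIMEOUT 0x0f02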

 block/iscsi.c   |   32 +++-
 configure   |6 +++---
 qemu-options.hx |3 ++-
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index f19a56a..fd4a66e 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -168,6 +168,19 @@ static inline unsigned exp_random(double mean)
 return -mean * log((double)rand() / RAND_MAX);
 }
 
+/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced
+ * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION
+ * macro was introduced in 1.11.0. So use the API_VERSION macro as
+ * a hint that the macros are defined and define them ourselves
+ * otherwise to keep the required libiscsi version at 1.9.0 */
+#if !defined(LIBISCSI_API_VERSION)
+#define QEMU_SCSI_STATUS_TASK_SET_FULL  0x28
+#define QEMU_SCSI_STATUS_TIMEOUT0x0f02
+#else
+#define QEMU_SCSI_STATUS_TASK_SET_FULL  SCSI_STATUS_TASK_SET_FULL
+#define QEMU_SCSI_STATUS_TIMEOUTSCSI_STATUS_TIMEOUT
+#endif
+
 static void
 iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
 void *command_data, void *opaque)
@@ -188,11 +201,12 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int 
status,
 iTask->do_retry = 1;
 goto out;
 }
-if (status == SCSI_STATUS_BUSY || status == SCSI_STATUS_TIMEOUT ||
-status == SCSI_STATUS_TASK_SET_FULL) {
+if (status == SCSI_STATUS_BUSY ||
+status == QEMU_SCSI_STATUS_TIMEOUT ||
+status == QEMU_SCSI_STATUS_TASK_SET_FULL) {
 unsigned retry_time =
 exp_random(iscsi_retry_times[iTask->retries - 1]);
-if (status == SCSI_STATUS_TIMEOUT) {
+if (status == QEMU_SCSI_STATUS_TIMEOUT) {
 /* make sure the request is rescheduled AFTER the
  * reconnect is initiated */
 retry_time = EVENT_INTERVAL * 2;
@@ -1358,7 +1372,7 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
 QemuOpts *opts;
 Error *local_err = NULL;
 const char *filename;
-int i, ret = 0;
+int i, ret = 0, timeout = 0;
 
 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
 qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -1428,7 +1442,15 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto out;
 }
 
-iscsi_set_timeout(iscsi, parse_timeout(iscsi_url->target));
+/* timeout handling is broken in libiscsi before 1.15.0 */
+timeout = parse_timeout(iscsi_url->target);
+#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621
+iscsi_set_timeout(iscsi, timeout);
+#else
+if (timeout) {
+error_report("iSCSI: ignoring timeout value for libiscsi <1.15.0");
+}
+#endif
 
 if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 
0) {
 error_setg(errp, "iSCSI: Failed to connect to LUN : %s",
diff --git a/configure b/configure
index 2ed9db2..222694f 100755
--- a/configure
+++ b/configure
@@ -3660,15 +3660,15 @@ if compile_prog "" "" ; then
 fi
 
 ##
-# Do we have libiscsi >= 1.10.0
+# Do we have libiscsi >= 1.9.0
 if test "$libiscsi" != "no" ; then
-  if $pkg_config --atleast-version=1.10.0 libiscsi; then
+  if $pkg_config --atleast-version=1.9.0 libiscsi; then
 libiscsi="yes"
 libiscsi_cflags=$($pkg_config --cflags libiscsi)
 libiscsi_libs=$($pkg_config --libs libiscsi)
   else
 if test "$libiscsi" = "yes" ; then
-  feature_not_found "libiscsi" "Install libiscsi >= 1.10.0"
+  feature_not_found "libiscsi" "Install libiscsi >= 1.9.0"
 fi
 libiscsi="no"
   fi
diff --git a/qemu-options.hx b/qemu-options.hx
index ca37481..389cf64 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2293,7 +2293,8 @@ line or a configuration file.
 
 Since version Qemu 2.4 it is possible to specify a iSCSI request timeout to 
detect
 stalled requests and force a reestablishment of the session. The timeout
-is specified in seconds. The default is 0 which means no timeout.
+is specified in seconds. The default is 0 which means no timeout. Libiscsi
+1.15.0 or greater is required for this feature.
 
 Example (without authentication):
 @example
-- 
1.7.9.5




[Qemu-block] [PATCH] block/iscsi: restore compatibility with libiscsi 1.9.0

2015-06-26 Thread Peter Lieven
RHEL7 and others are stuck with libiscsi 1.9.0 since there
unfortunately was an ABI breakage after that release.

Signed-off-by: Peter Lieven 
---
 block/iscsi.c   |   31 ++-
 configure   |6 +++---
 qemu-options.hx |3 ++-
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index f19a56a..551d583 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -168,6 +168,19 @@ static inline unsigned exp_random(double mean)
 return -mean * log((double)rand() / RAND_MAX);
 }
 
+/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced
+ * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION
+ * macro was introduced in 1.11.0. So use the API_VERSION macro as
+ * a hint that the macros are defined and define them ourselves
+ * otherwise to keep the required libiscsi version at 1.9.0 */
+#if !defined(LIBISCSI_API_VERSION)
+#define _SCSI_STATUS_TASK_SET_FULL  0x28
+#define _SCSI_STATUS_TIMEOUT0x0f02
+#else
+#define _SCSI_STATUS_TASK_SET_FULL  SCSI_STATUS_TASK_SET_FULL
+#define _SCSI_STATUS_TIMEOUTSCSI_STATUS_TIMEOUT
+#endif
+
 static void
 iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
 void *command_data, void *opaque)
@@ -188,11 +201,11 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int 
status,
 iTask->do_retry = 1;
 goto out;
 }
-if (status == SCSI_STATUS_BUSY || status == SCSI_STATUS_TIMEOUT ||
-status == SCSI_STATUS_TASK_SET_FULL) {
+if (status == SCSI_STATUS_BUSY || status == _SCSI_STATUS_TIMEOUT ||
+status == _SCSI_STATUS_TASK_SET_FULL) {
 unsigned retry_time =
 exp_random(iscsi_retry_times[iTask->retries - 1]);
-if (status == SCSI_STATUS_TIMEOUT) {
+if (status == _SCSI_STATUS_TIMEOUT) {
 /* make sure the request is rescheduled AFTER the
  * reconnect is initiated */
 retry_time = EVENT_INTERVAL * 2;
@@ -1358,7 +1371,7 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
 QemuOpts *opts;
 Error *local_err = NULL;
 const char *filename;
-int i, ret = 0;
+int i, ret = 0, timeout = 0;
 
 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
 qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -1428,7 +1441,15 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto out;
 }
 
-iscsi_set_timeout(iscsi, parse_timeout(iscsi_url->target));
+/* timeout handling is broken in libiscsi before 1.15.0 */
+timeout = parse_timeout(iscsi_url->target);
+#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621
+iscsi_set_timeout(iscsi, timeout);
+#else
+if (timeout) {
+error_report("iSCSI: ignoring timeout value for libiscsi <1.15.0");
+}
+#endif
 
 if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 
0) {
 error_setg(errp, "iSCSI: Failed to connect to LUN : %s",
diff --git a/configure b/configure
index 2ed9db2..222694f 100755
--- a/configure
+++ b/configure
@@ -3660,15 +3660,15 @@ if compile_prog "" "" ; then
 fi
 
 ##
-# Do we have libiscsi >= 1.10.0
+# Do we have libiscsi >= 1.9.0
 if test "$libiscsi" != "no" ; then
-  if $pkg_config --atleast-version=1.10.0 libiscsi; then
+  if $pkg_config --atleast-version=1.9.0 libiscsi; then
 libiscsi="yes"
 libiscsi_cflags=$($pkg_config --cflags libiscsi)
 libiscsi_libs=$($pkg_config --libs libiscsi)
   else
 if test "$libiscsi" = "yes" ; then
-  feature_not_found "libiscsi" "Install libiscsi >= 1.10.0"
+  feature_not_found "libiscsi" "Install libiscsi >= 1.9.0"
 fi
 libiscsi="no"
   fi
diff --git a/qemu-options.hx b/qemu-options.hx
index ca37481..2d23330 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2293,7 +2293,8 @@ line or a configuration file.
 
 Since version Qemu 2.4 it is possible to specify a iSCSI request timeout to 
detect
 stalled requests and force a reestablishment of the session. The timeout
-is specified in seconds. The default is 0 which means no timeout.
+is specified in seconds. The default is 0 which means no timeout. Libiscsi
+1.15.0 is required for this feature.
 
 Example (without authentication):
 @example
-- 
1.7.9.5




Re: [Qemu-block] [PATCH] block/nfs: add support for setting debug level

2015-06-26 Thread Peter Lieven
Am 26.06.2015 um 11:14 schrieb Stefan Hajnoczi:
> On Thu, Jun 25, 2015 at 03:26:46PM +0200, Peter Lieven wrote:
>> Am 25.06.2015 um 15:18 schrieb Stefan Hajnoczi:
>>> On Tue, Jun 23, 2015 at 10:12:15AM +0200, Peter Lieven wrote:
>>>> upcoming libnfs versions will support logging debug messages. Add
>>>> support for it in qemu through an URL parameter.
>>>>
>>>> Signed-off-by: Peter Lieven 
>>>> ---
>>>>  block/nfs.c | 4 
>>>>  1 file changed, 4 insertions(+)
>>>>
>>>> diff --git a/block/nfs.c b/block/nfs.c
>>>> index ca9e24e..f7388a3 100644
>>>> --- a/block/nfs.c
>>>> +++ b/block/nfs.c
>>>> @@ -329,6 +329,10 @@ static int64_t nfs_client_open(NFSClient *client, 
>>>> const char *filename,
>>>>  } else if (!strcmp(qp->p[i].name, "readahead")) {
>>>>  nfs_set_readahead(client->context, val);
>>>>  #endif
>>>> +#ifdef LIBNFS_FEATURE_DEBUG
>>>> +} else if (!strcmp(qp->p[i].name, "debug")) {
>>>> +nfs_set_debug(client->context, val);
>>>> +#endif
>>>>  } else {
>>>>  error_setg(errp, "Unknown NFS parameter name: %s",
>>>> qp->p[i].name);
>>> Untrusted users may be able to set these options since they are encoded
>>> in the URI.  I'm imagining a hosting or cloud scenario like OpenStack.
>>>
>>> A verbose debug level spams stderr and could consume a lot of disk
>>> space.
>>>
>>> (The uid and gid options are probably okay since the NFS server cannot
>>> trust the uid/gid coming from QEMU anyway.)
>>>
>>> I think we can merge this patch for QEMU 2.4 but I'd like to have a
>>> discussion about the security risk of encoding libnfs options in the
>>> URI.
>>>
>>> CCed Eric Blake in case libvirt is affected.
>>>
>>> Has anyone thought about this and what are the rules?
>> Good point. In general I think there should be some kind of sanitization of 
>> the parameters
>> before they are passed on to Qemu. In our use case the user cannot pass any 
>> kind of URIs himself,
>> but this might be different in other backends. The readahead value is as 
>> dangerous as well
>> if not sanitized.
>>
>> I had a discussion with Ronnie in the past if we should encode parameters in 
>> the URI or via environment
>> like it is done in libiscsi. If I remember correctly we came up with the URI 
>> parameters for better usability,
>> but hadn't attack scenarios in mind.
>>
>> I am also open to only allow uncritical parameters in the URI and pass 
>> others via a new -nfs cmdline option.
>> Or limit max readahead and max debug level settable via URI.
> I'd feel safer if the option was in runtime_opts instead.  Then the
> management tool has to pass them explicitly and the end user cannot
> influence them via the URI.
>
> If an option is needed both at open and create time, then it must also
> be parsed from nfs_file_create() opts.

Ok, I will send a patch that follows this approach. And also a second one to
limit the readahead size to a reasonable value.
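
Roughly, moving the debug option into the block driver's own option list could
look like the sketch below (untested; the "filename" entry and the exact place
where the option is read are assumptions, only nfs_set_debug() and the QemuOpts
helpers are taken from the patches in this thread):

static QemuOptsList runtime_opts = {
    .name = "nfs",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "URL to the NFS file",
        },
        {
            .name = "debug",
            .type = QEMU_OPT_NUMBER,
            .help = "Set libnfs debug level (default 0 = no debug)",
        },
        { /* end of list */ },
    },
};

/* ...and once the options have been absorbed in the open path (and, per
 * Stefan's note, also in nfs_file_create()): */
#ifdef LIBNFS_FEATURE_DEBUG
    nfs_set_debug(client->context, qemu_opt_get_number(opts, "debug", 0));
#endif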

Peter



Re: [Qemu-block] [PATCH] block/iscsi: add support for request timeouts

2015-06-25 Thread Peter Lieven
Am 25.06.2015 um 23:08 schrieb Paolo Bonzini:
>
> On 16/06/2015 13:45, Peter Lieven wrote:
>> libiscsi starting with 1.15 will properly support timeout of iscsi
>> commands. The default will remain no timeout, but this can
>> be changed via cmdline parameters, e.g.:
>>
>> qemu -iscsi timeout=30 -drive file=iscsi://...
>>
>> If a timeout occurs a reconnect is scheduled and the timed out command
>> will be requeued for processing after a successful reconnect.
>>
>> The required API call iscsi_set_timeout is present since libiscsi
>> 1.10 which was released in October 2013. However, due to some bugs
>> in the libiscsi code the use is not recommended before version 1.15.
> If so, QEMU should not allow it if libiscsi is older than 1.15.

Not accept a timeout parameter or ignore it and print a warning?

>
>> Please note that this patch bumps the libiscsi requirement to 1.10
>> to have all function and macros defined.
> This is not acceptable, unfortunately.  I explained this two months ago
> (https://lists.gnu.org/archive/html/qemu-devel/2015-04/msg01847.html)
> and it is still true.

Sorry, I missed that. Can you verify if 1.15.0 has a soname that
makes it possible to jump to 1.15.0 at some point in the future?

>
> libiscsi keeps breaking ABI compatibility and for a while did not even
> bump the soname when they do.  This makes it completely impossible for
> distros to upgrade to a newer libiscsi, and RHEL7 is thus stuck with 1.9.
>
> Yes, it is 2 years old.  It doesn't matter.  If libiscsi upstream only
> _tried_ to preserve ABI compatibility, they wouldn't be in this
> situation.  And I know that it is not even trying, because it broke
> again sometime between 1.11 and 1.14 for a totally trivial reason:
>
> --- a/iscsi/iscsi.h
> +++ b/iscsi/iscsi.h
> @@ -91,6 +136,8 @@ struct iscsi_url {
> char target[MAX_STRING_SIZE + 1];
> char user[MAX_STRING_SIZE + 1];
> char passwd[MAX_STRING_SIZE + 1];
> +   char target_user[MAX_STRING_SIZE + 1];
> +   char target_passwd[MAX_STRING_SIZE + 1];
> int lun;
> struct iscsi_context *iscsi;
>  };
>
>
> This is the only change between these releases that breaks the ABI, but
> it is already one too much. :(
>
> (Also, the parsing of URLs into iscsi_url doesn't even try to obey the
> RFCs...).
>
>> The patch fixes also a
>> off-by-one error in the NOP timeout calculation which was fixed
>> while touching these code parts.
> Can you please separate this part anyway?

Sure.

I will send a v2 that is compatible with 1.9.0 and enables the timeout
stuff only for libiscsi >= 1.15.0

Peter



Re: [Qemu-block] [PATCH] block/nfs: add support for setting debug level

2015-06-25 Thread Peter Lieven

Am 25.06.2015 um 15:18 schrieb Stefan Hajnoczi:

On Tue, Jun 23, 2015 at 10:12:15AM +0200, Peter Lieven wrote:

upcoming libnfs versions will support logging debug messages. Add
support for it in qemu through an URL parameter.

Signed-off-by: Peter Lieven 
---
  block/nfs.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index ca9e24e..f7388a3 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -329,6 +329,10 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
  } else if (!strcmp(qp->p[i].name, "readahead")) {
  nfs_set_readahead(client->context, val);
  #endif
+#ifdef LIBNFS_FEATURE_DEBUG
+} else if (!strcmp(qp->p[i].name, "debug")) {
+nfs_set_debug(client->context, val);
+#endif
  } else {
  error_setg(errp, "Unknown NFS parameter name: %s",
 qp->p[i].name);

Untrusted users may be able to set these options since they are encoded
in the URI.  I'm imagining a hosting or cloud scenario like OpenStack.

A verbose debug level spams stderr and could consume a lot of disk
space.

(The uid and gid options are probably okay since the NFS server cannot
trust the uid/gid coming from QEMU anyway.)

I think we can merge this patch for QEMU 2.4 but I'd like to have a
discussion about the security risk of encoding libnfs options in the
URI.

CCed Eric Blake in case libvirt is affected.

Has anyone thought about this and what are the rules?


Good point. In general I think there should be some kind of sanitization of the
parameters before they are passed on to Qemu. In our use case the user cannot
pass any kind of URIs himself, but this might be different in other backends.
The readahead value is just as dangerous if not sanitized.

I had a discussion with Ronnie in the past about whether we should encode
parameters in the URI or via the environment, as it is done in libiscsi. If I
remember correctly we came up with the URI parameters for better usability,
but didn't have attack scenarios in mind.

I am also open to allowing only uncritical parameters in the URI and passing
others via a new -nfs cmdline option, or to limiting the maximum readahead and
debug level settable via the URI.
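
For the latter, a cap inside the existing URI parsing loop would be enough,
along these lines (sketch only; QEMU_NFS_MAX_DEBUG_LEVEL is a made-up name and
value for illustration, the rest mirrors the hunk from the debug patch):

#define QEMU_NFS_MAX_DEBUG_LEVEL 4

#ifdef LIBNFS_FEATURE_DEBUG
        } else if (!strcmp(qp->p[i].name, "debug")) {
            if (val > QEMU_NFS_MAX_DEBUG_LEVEL) {
                error_report("NFS Warning: Limiting debug level to %d",
                             QEMU_NFS_MAX_DEBUG_LEVEL);
                val = QEMU_NFS_MAX_DEBUG_LEVEL;
            }
            nfs_set_debug(client->context, val);
#endif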

Peter



Re: [Qemu-block] [PATCH] block/iscsi: add support for request timeouts

2015-06-24 Thread Peter Lieven

Am 23.06.2015 um 01:03 schrieb ronnie sahlberg:

LGTM

It is good to finally have timeouts that work in libiscsi,  and a consumer that 
can use and benefit from it.


Paolo, Kevin, Stefan, do you think this is something for 2.4?

Peter



[Qemu-block] [PATCH] block/nfs: add support for setting debug level

2015-06-23 Thread Peter Lieven
upcoming libnfs versions will support logging debug messages. Add
support for it in qemu through a URL parameter.

Signed-off-by: Peter Lieven 
---
 block/nfs.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index ca9e24e..f7388a3 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -329,6 +329,10 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
 } else if (!strcmp(qp->p[i].name, "readahead")) {
 nfs_set_readahead(client->context, val);
 #endif
+#ifdef LIBNFS_FEATURE_DEBUG
+} else if (!strcmp(qp->p[i].name, "debug")) {
+nfs_set_debug(client->context, val);
+#endif
 } else {
 error_setg(errp, "Unknown NFS parameter name: %s",
qp->p[i].name);
-- 
1.9.1




Re: [Qemu-block] [Qemu-devel] RFC cdrom in own thread?

2015-06-22 Thread Peter Lieven

Am 22.06.2015 um 23:54 schrieb John Snow:


On 06/22/2015 09:09 AM, Peter Lieven wrote:

Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:

On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven  wrote:

Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:

On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven  wrote:

Am 18.06.2015 um 10:42 schrieb Kevin Wolf:

Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:

Am 18.06.2015 um 09:45 schrieb Kevin Wolf:

Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:

Thread 2 (Thread 0x75550700 (LWP 2636)):
#0  0x75d87aa3 in ppoll () from
/lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x55955d91 in qemu_poll_ns (fds=0x563889c0,
nfds=3,
   timeout=4999424576) at qemu-timer.c:326
   ts = {tv_sec = 4, tv_nsec = 999424576}
   tvsec = 4
#2  0x55956feb in aio_poll (ctx=0x563528e0,
blocking=true)
   at aio-posix.c:231
   node = 0x0
   was_dispatching = false
   ret = 1
   progress = false
#3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0,
offset=4292007936,
   qiov=0x7554f760, is_write=false, flags=0) at
block.c:2699
   aio_context = 0x563528e0
   co = 0x563888a0
   rwco = {bs = 0x5637eae0, offset = 4292007936,
 qiov = 0x7554f760, is_write = false, ret =
2147483647,
flags = 0}
#4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0,
sector_num=8382828,
   buf=0x744cc800 "(", nb_sectors=4, is_write=false,
flags=0)
   at block.c:2722
   qiov = {iov = 0x7554f780, niov = 1, nalloc = -1,
size =
2048}
   iov = {iov_base = 0x744cc800, iov_len = 2048}
#5  0x5594b008 in bdrv_read (bs=0x5637eae0,
sector_num=8382828,
   buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
No locals.
#6  0x5599acef in blk_read (blk=0x56376820,
sector_num=8382828,
   buf=0x744cc800 "(", nb_sectors=4) at
block/block-backend.c:404
No locals.
#7  0x55833ed2 in cd_read_sector (s=0x56408f88,
lba=2095707,
   buf=0x744cc800 "(", sector_size=2048) at
hw/ide/atapi.c:116
   ret = 32767

Here is the problem: The ATAPI emulation uses synchronous
blk_read()
instead of the AIO or coroutine interfaces. This means that it
keeps
polling for request completion while it holds the BQL until the
request
is completed.

I will look at this.

I need some further help. My way to "emulate" a hung NFS Server is to
block it in the Firewall. Currently I face the problem that I
cannot mount
a CD Iso via libnfs (nfs://) without hanging Qemu (i previously
tried with
a kernel NFS mount). It reads a few sectors and then stalls (maybe
another
bug):

(gdb) thread apply all bt full

Thread 3 (Thread 0x70c21700 (LWP 29710)):
#0  qemu_cond_broadcast (cond=cond@entry=0x56259940) at
util/qemu-thread-posix.c:120
  err = 
  __func__ = "qemu_cond_broadcast"
#1  0x55911164 in rfifolock_unlock
(r=r@entry=0x56259910) at
util/rfifolock.c:75
  __PRETTY_FUNCTION__ = "rfifolock_unlock"
#2  0x55875921 in aio_context_release
(ctx=ctx@entry=0x562598b0)
at async.c:329
No locals.
#3  0x5588434c in aio_poll (ctx=ctx@entry=0x562598b0,
blocking=blocking@entry=true) at aio-posix.c:272
  node = 
  was_dispatching = false
  i = 
  ret = 
  progress = false
  timeout = 611734526
  __PRETTY_FUNCTION__ = "aio_poll"
#4  0x558bc43d in bdrv_prwv_co (bs=bs@entry=0x5627c0f0,
offset=offset@entry=7038976, qiov=qiov@entry=0x70c208f0,
is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
block/io.c:552
  aio_context = 0x562598b0
  co = 
  rwco = {bs = 0x5627c0f0, offset = 7038976, qiov =
0x70c208f0, is_write = false, ret = 2147483647, flags =
(unknown: 0)}
#5  0x558bc533 in bdrv_rw_co (bs=0x5627c0f0,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
  flags=flags@entry=(unknown: 0)) at block/io.c:575
  qiov = {iov = 0x70c208e0, niov = 1, nalloc = -1, size
= 2048}
  iov = {iov_base = 0x57874800, iov_len = 2048}
#6  0x558bc593 in bdrv_read (bs=,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4) at block/io.c:583
No locals.
#7  0x558af75d in blk_read (blk=,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
  ret = 
#8  0x557abb88 in cd_read_sector (sector_size=,
buf=0x57874800 "(", lba=3437, s=0x5760db70) at
hw/ide/atapi.c:116
  ret = 
#9  ide_atapi_cmd_reply_end (s=0x5760db70) at hw/ide/atapi.c:190
  byt

Re: [Qemu-block] RFC cdrom in own thread?

2015-06-22 Thread Peter Lieven

Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:

On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven  wrote:

Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:

On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven  wrote:

Am 18.06.2015 um 10:42 schrieb Kevin Wolf:

Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:

Am 18.06.2015 um 09:45 schrieb Kevin Wolf:

Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:

Thread 2 (Thread 0x75550700 (LWP 2636)):
#0  0x75d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x55955d91 in qemu_poll_ns (fds=0x563889c0, nfds=3,
  timeout=4999424576) at qemu-timer.c:326
  ts = {tv_sec = 4, tv_nsec = 999424576}
  tvsec = 4
#2  0x55956feb in aio_poll (ctx=0x563528e0, blocking=true)
  at aio-posix.c:231
  node = 0x0
  was_dispatching = false
  ret = 1
  progress = false
#3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0,
offset=4292007936,
  qiov=0x7554f760, is_write=false, flags=0) at block.c:2699
  aio_context = 0x563528e0
  co = 0x563888a0
  rwco = {bs = 0x5637eae0, offset = 4292007936,
qiov = 0x7554f760, is_write = false, ret = 2147483647,
flags = 0}
#4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0,
sector_num=8382828,
  buf=0x744cc800 "(", nb_sectors=4, is_write=false, flags=0)
  at block.c:2722
  qiov = {iov = 0x7554f780, niov = 1, nalloc = -1, size =
2048}
  iov = {iov_base = 0x744cc800, iov_len = 2048}
#5  0x5594b008 in bdrv_read (bs=0x5637eae0,
sector_num=8382828,
  buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
No locals.
#6  0x5599acef in blk_read (blk=0x56376820,
sector_num=8382828,
  buf=0x744cc800 "(", nb_sectors=4) at block/block-backend.c:404
No locals.
#7  0x55833ed2 in cd_read_sector (s=0x56408f88,
lba=2095707,
  buf=0x744cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
  ret = 32767

Here is the problem: The ATAPI emulation uses synchronous blk_read()
instead of the AIO or coroutine interfaces. This means that it keeps
polling for request completion while it holds the BQL until the request
is completed.

I will look at this.

I need some further help. My way to "emulate" a hung NFS Server is to
block it in the Firewall. Currently I face the problem that I cannot mount
a CD Iso via libnfs (nfs://) without hanging Qemu (i previously tried with
a kernel NFS mount). It reads a few sectors and then stalls (maybe another
bug):

(gdb) thread apply all bt full

Thread 3 (Thread 0x70c21700 (LWP 29710)):
#0  qemu_cond_broadcast (cond=cond@entry=0x56259940) at
util/qemu-thread-posix.c:120
 err = 
 __func__ = "qemu_cond_broadcast"
#1  0x55911164 in rfifolock_unlock (r=r@entry=0x56259910) at
util/rfifolock.c:75
 __PRETTY_FUNCTION__ = "rfifolock_unlock"
#2  0x55875921 in aio_context_release (ctx=ctx@entry=0x562598b0)
at async.c:329
No locals.
#3  0x5588434c in aio_poll (ctx=ctx@entry=0x562598b0,
blocking=blocking@entry=true) at aio-posix.c:272
 node = 
 was_dispatching = false
 i = 
 ret = 
 progress = false
 timeout = 611734526
 __PRETTY_FUNCTION__ = "aio_poll"
#4  0x558bc43d in bdrv_prwv_co (bs=bs@entry=0x5627c0f0,
offset=offset@entry=7038976, qiov=qiov@entry=0x70c208f0,
is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
block/io.c:552
 aio_context = 0x562598b0
 co = 
 rwco = {bs = 0x5627c0f0, offset = 7038976, qiov =
0x70c208f0, is_write = false, ret = 2147483647, flags = (unknown: 0)}
#5  0x558bc533 in bdrv_rw_co (bs=0x5627c0f0,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
 flags=flags@entry=(unknown: 0)) at block/io.c:575
 qiov = {iov = 0x70c208e0, niov = 1, nalloc = -1, size = 2048}
 iov = {iov_base = 0x57874800, iov_len = 2048}
#6  0x558bc593 in bdrv_read (bs=,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4) at block/io.c:583
No locals.
#7  0x558af75d in blk_read (blk=,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
 ret = 
#8  0x557abb88 in cd_read_sector (sector_size=,
buf=0x57874800 "(", lba=3437, s=0x5760db70) at hw/ide/atapi.c:116
 ret = 
#9  ide_atapi_cmd_reply_end (s=0x5760db70) at hw/ide/atapi.c:190
 byte_count_limit = 
 size = 
 ret = 2

This is still the same scenario Kevin explained.

The ATAPI CD-ROM emulation code i

Re: [Qemu-block] RFC cdrom in own thread?

2015-06-19 Thread Peter Lieven
Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven  wrote:
>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>> Thread 2 (Thread 0x75550700 (LWP 2636)):
>>>>>> #0  0x75d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
>>>>>> No symbol table info available.
>>>>>> #1  0x55955d91 in qemu_poll_ns (fds=0x563889c0, nfds=3,
>>>>>>  timeout=4999424576) at qemu-timer.c:326
>>>>>>  ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>  tvsec = 4
>>>>>> #2  0x55956feb in aio_poll (ctx=0x563528e0, blocking=true)
>>>>>>  at aio-posix.c:231
>>>>>>  node = 0x0
>>>>>>  was_dispatching = false
>>>>>>  ret = 1
>>>>>>  progress = false
>>>>>> #3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0,
>>>>>> offset=4292007936,
>>>>>>  qiov=0x7554f760, is_write=false, flags=0) at block.c:2699
>>>>>>  aio_context = 0x563528e0
>>>>>>  co = 0x563888a0
>>>>>>  rwco = {bs = 0x5637eae0, offset = 4292007936,
>>>>>>qiov = 0x7554f760, is_write = false, ret = 2147483647,
>>>>>> flags = 0}
>>>>>> #4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0,
>>>>>> sector_num=8382828,
>>>>>>  buf=0x744cc800 "(", nb_sectors=4, is_write=false, flags=0)
>>>>>>  at block.c:2722
>>>>>>  qiov = {iov = 0x7554f780, niov = 1, nalloc = -1, size =
>>>>>> 2048}
>>>>>>  iov = {iov_base = 0x744cc800, iov_len = 2048}
>>>>>> #5  0x5594b008 in bdrv_read (bs=0x5637eae0,
>>>>>> sector_num=8382828,
>>>>>>  buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
>>>>>> No locals.
>>>>>> #6  0x5599acef in blk_read (blk=0x56376820,
>>>>>> sector_num=8382828,
>>>>>>  buf=0x744cc800 "(", nb_sectors=4) at block/block-backend.c:404
>>>>>> No locals.
>>>>>> #7  0x55833ed2 in cd_read_sector (s=0x56408f88,
>>>>>> lba=2095707,
>>>>>>  buf=0x744cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
>>>>>>  ret = 32767
>>>>> Here is the problem: The ATAPI emulation uses synchronous blk_read()
>>>>> instead of the AIO or coroutine interfaces. This means that it keeps
>>>>> polling for request completion while it holds the BQL until the request
>>>>> is completed.
>>>> I will look at this.
>>
>> I need some further help. My way to "emulate" a hung NFS Server is to
>> block it in the Firewall. Currently I face the problem that I cannot mount
>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously tried with
>> a kernel NFS mount). It reads a few sectors and then stalls (maybe another
>> bug):
>>
>> (gdb) thread apply all bt full
>>
>> Thread 3 (Thread 0x70c21700 (LWP 29710)):
>> #0  qemu_cond_broadcast (cond=cond@entry=0x56259940) at
>> util/qemu-thread-posix.c:120
>> err = 
>> __func__ = "qemu_cond_broadcast"
>> #1  0x55911164 in rfifolock_unlock (r=r@entry=0x56259910) at
>> util/rfifolock.c:75
>> __PRETTY_FUNCTION__ = "rfifolock_unlock"
>> #2  0x55875921 in aio_context_release (ctx=ctx@entry=0x562598b0)
>> at async.c:329
>> No locals.
>> #3  0x5588434c in aio_poll (ctx=ctx@entry=0x562598b0,
>> blocking=blocking@entry=true) at aio-posix.c:272
>> node = 
>> was_dispatching = false
>> i = 
>> ret = 
>> progress = false
>> timeout = 611734526
>> __PRETTY_FUNCTION__ = "aio_poll"
>> #4  0x558bc43d in bdrv_prwv_co (bs=bs@entry=0x5627c0f0,
>> offset=offset@entry=7038976, qiov=qiov@entry=0x70c208f0,
>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
>> block/io.c:552
>>

Re: [Qemu-block] RFC cdrom in own thread?

2015-06-18 Thread Peter Lieven

Am 18.06.2015 um 10:42 schrieb Kevin Wolf:

Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:

Am 18.06.2015 um 09:45 schrieb Kevin Wolf:

Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:

Thread 2 (Thread 0x75550700 (LWP 2636)):
#0  0x75d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x55955d91 in qemu_poll_ns (fds=0x563889c0, nfds=3,
 timeout=4999424576) at qemu-timer.c:326
 ts = {tv_sec = 4, tv_nsec = 999424576}
 tvsec = 4
#2  0x55956feb in aio_poll (ctx=0x563528e0, blocking=true)
 at aio-posix.c:231
 node = 0x0
 was_dispatching = false
 ret = 1
 progress = false
#3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0, offset=4292007936,
 qiov=0x7554f760, is_write=false, flags=0) at block.c:2699
 aio_context = 0x563528e0
 co = 0x563888a0
 rwco = {bs = 0x5637eae0, offset = 4292007936,
   qiov = 0x7554f760, is_write = false, ret = 2147483647, flags = 0}
#4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4, is_write=false, flags=0)
 at block.c:2722
 qiov = {iov = 0x7554f780, niov = 1, nalloc = -1, size = 2048}
 iov = {iov_base = 0x744cc800, iov_len = 2048}
#5  0x5594b008 in bdrv_read (bs=0x5637eae0, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
No locals.
#6  0x5599acef in blk_read (blk=0x56376820, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4) at block/block-backend.c:404
No locals.
#7  0x55833ed2 in cd_read_sector (s=0x56408f88, lba=2095707,
 buf=0x744cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
 ret = 32767

Here is the problem: The ATAPI emulation uses synchronous blk_read()
instead of the AIO or coroutine interfaces. This means that it keeps
polling for request completion while it holds the BQL until the request
is completed.

I will look at this.


We can (and should) fix that, otherwise the VCPUs is blocked while we're
reading from the image, even without a hang. It doesn't fully fix your
problem, though, as bdrv_drain_all() and friends still exist.

Any idea which commands actually call bdrv_drain_alll?

At least 'stop' and all commands changing the BDS graph (block jobs,
snapshots, commit, etc.). For a full list, I would have to inspect each
command in the code.

The guest can even trigger bdrv_drain_all() by stopping a running DMA
operation.


Unfortunately, exactly this is happening...
Is there any way to avoid the bdrv_drain_all() in bmdma_cmd_writeb?

Peter



Re: [Qemu-block] RFC cdrom in own thread?

2015-06-18 Thread Peter Lieven

Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:

On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven  wrote:

Am 18.06.2015 um 10:42 schrieb Kevin Wolf:

Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:

Am 18.06.2015 um 09:45 schrieb Kevin Wolf:

Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:

Thread 2 (Thread 0x75550700 (LWP 2636)):
#0  0x75d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x55955d91 in qemu_poll_ns (fds=0x563889c0, nfds=3,
  timeout=4999424576) at qemu-timer.c:326
  ts = {tv_sec = 4, tv_nsec = 999424576}
  tvsec = 4
#2  0x55956feb in aio_poll (ctx=0x563528e0, blocking=true)
  at aio-posix.c:231
  node = 0x0
  was_dispatching = false
  ret = 1
  progress = false
#3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0,
offset=4292007936,
  qiov=0x7554f760, is_write=false, flags=0) at block.c:2699
  aio_context = 0x563528e0
  co = 0x563888a0
  rwco = {bs = 0x5637eae0, offset = 4292007936,
qiov = 0x7554f760, is_write = false, ret = 2147483647,
flags = 0}
#4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0,
sector_num=8382828,
  buf=0x744cc800 "(", nb_sectors=4, is_write=false, flags=0)
  at block.c:2722
  qiov = {iov = 0x7554f780, niov = 1, nalloc = -1, size =
2048}
  iov = {iov_base = 0x744cc800, iov_len = 2048}
#5  0x5594b008 in bdrv_read (bs=0x5637eae0,
sector_num=8382828,
  buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
No locals.
#6  0x5599acef in blk_read (blk=0x56376820,
sector_num=8382828,
  buf=0x744cc800 "(", nb_sectors=4) at block/block-backend.c:404
No locals.
#7  0x55833ed2 in cd_read_sector (s=0x56408f88,
lba=2095707,
  buf=0x744cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
  ret = 32767

Here is the problem: The ATAPI emulation uses synchronous blk_read()
instead of the AIO or coroutine interfaces. This means that it keeps
polling for request completion while it holds the BQL until the request
is completed.

I will look at this.


I need some further help. My way to "emulate" a hung NFS Server is to
block it in the Firewall. Currently I face the problem that I cannot mount
a CD Iso via libnfs (nfs://) without hanging Qemu (i previously tried with
a kernel NFS mount). It reads a few sectors and then stalls (maybe another
bug):

(gdb) thread apply all bt full

Thread 3 (Thread 0x70c21700 (LWP 29710)):
#0  qemu_cond_broadcast (cond=cond@entry=0x56259940) at
util/qemu-thread-posix.c:120
 err = 
 __func__ = "qemu_cond_broadcast"
#1  0x55911164 in rfifolock_unlock (r=r@entry=0x56259910) at
util/rfifolock.c:75
 __PRETTY_FUNCTION__ = "rfifolock_unlock"
#2  0x55875921 in aio_context_release (ctx=ctx@entry=0x562598b0)
at async.c:329
No locals.
#3  0x5588434c in aio_poll (ctx=ctx@entry=0x562598b0,
blocking=blocking@entry=true) at aio-posix.c:272
 node = 
 was_dispatching = false
 i = 
 ret = 
 progress = false
 timeout = 611734526
 __PRETTY_FUNCTION__ = "aio_poll"
#4  0x558bc43d in bdrv_prwv_co (bs=bs@entry=0x5627c0f0,
offset=offset@entry=7038976, qiov=qiov@entry=0x70c208f0,
is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
block/io.c:552
 aio_context = 0x562598b0
 co = 
 rwco = {bs = 0x5627c0f0, offset = 7038976, qiov =
0x70c208f0, is_write = false, ret = 2147483647, flags = (unknown: 0)}
#5  0x558bc533 in bdrv_rw_co (bs=0x5627c0f0,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
 flags=flags@entry=(unknown: 0)) at block/io.c:575
 qiov = {iov = 0x70c208e0, niov = 1, nalloc = -1, size = 2048}
 iov = {iov_base = 0x57874800, iov_len = 2048}
#6  0x558bc593 in bdrv_read (bs=,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4) at block/io.c:583
No locals.
#7  0x558af75d in blk_read (blk=,
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(",
nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
 ret = 
#8  0x557abb88 in cd_read_sector (sector_size=,
buf=0x57874800 "(", lba=3437, s=0x5760db70) at hw/ide/atapi.c:116
 ret = 
#9  ide_atapi_cmd_reply_end (s=0x5760db70) at hw/ide/atapi.c:190
 byte_count_limit = 
 size = 
 ret = 2

This is still the same scenario Kevin explained.

The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
function holds the QEMU global mutex while waiting for the I

Re: [Qemu-block] RFC cdrom in own thread?

2015-06-18 Thread Peter Lieven

Am 18.06.2015 um 10:42 schrieb Kevin Wolf:

Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:

Am 18.06.2015 um 09:45 schrieb Kevin Wolf:

Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:

Thread 2 (Thread 0x75550700 (LWP 2636)):
#0  0x75d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x55955d91 in qemu_poll_ns (fds=0x563889c0, nfds=3,
 timeout=4999424576) at qemu-timer.c:326
 ts = {tv_sec = 4, tv_nsec = 999424576}
 tvsec = 4
#2  0x55956feb in aio_poll (ctx=0x563528e0, blocking=true)
 at aio-posix.c:231
 node = 0x0
 was_dispatching = false
 ret = 1
 progress = false
#3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0, offset=4292007936,
 qiov=0x7554f760, is_write=false, flags=0) at block.c:2699
 aio_context = 0x563528e0
 co = 0x563888a0
 rwco = {bs = 0x5637eae0, offset = 4292007936,
   qiov = 0x7554f760, is_write = false, ret = 2147483647, flags = 0}
#4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4, is_write=false, flags=0)
 at block.c:2722
 qiov = {iov = 0x7554f780, niov = 1, nalloc = -1, size = 2048}
 iov = {iov_base = 0x744cc800, iov_len = 2048}
#5  0x5594b008 in bdrv_read (bs=0x5637eae0, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
No locals.
#6  0x5599acef in blk_read (blk=0x56376820, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4) at block/block-backend.c:404
No locals.
#7  0x55833ed2 in cd_read_sector (s=0x56408f88, lba=2095707,
 buf=0x744cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
 ret = 32767

Here is the problem: The ATAPI emulation uses synchronous blk_read()
instead of the AIO or coroutine interfaces. This means that it keeps
polling for request completion while it holds the BQL until the request
is completed.

I will look at this.


I need some further help. My way to "emulate" a hung NFS server is to
block it in the firewall. Currently I face the problem that I cannot mount
a CD ISO via libnfs (nfs://) without hanging Qemu (I previously tried with
a kernel NFS mount). It reads a few sectors and then stalls (maybe another bug):

(gdb) thread apply all bt full

Thread 3 (Thread 0x70c21700 (LWP 29710)):
#0  qemu_cond_broadcast (cond=cond@entry=0x56259940) at 
util/qemu-thread-posix.c:120
err = 
__func__ = "qemu_cond_broadcast"
#1  0x55911164 in rfifolock_unlock (r=r@entry=0x56259910) at 
util/rfifolock.c:75
__PRETTY_FUNCTION__ = "rfifolock_unlock"
#2  0x55875921 in aio_context_release (ctx=ctx@entry=0x562598b0) at 
async.c:329
No locals.
#3  0x5588434c in aio_poll (ctx=ctx@entry=0x562598b0, 
blocking=blocking@entry=true) at aio-posix.c:272
node = 
was_dispatching = false
i = 
ret = 
progress = false
timeout = 611734526
__PRETTY_FUNCTION__ = "aio_poll"
#4  0x558bc43d in bdrv_prwv_co (bs=bs@entry=0x5627c0f0, 
offset=offset@entry=7038976, qiov=qiov@entry=0x70c208f0, 
is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at block/io.c:552
aio_context = 0x562598b0
co = 
rwco = {bs = 0x5627c0f0, offset = 7038976, qiov = 0x70c208f0, 
is_write = false, ret = 2147483647, flags = (unknown: 0)}
#5  0x558bc533 in bdrv_rw_co (bs=0x5627c0f0, 
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(", 
nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
flags=flags@entry=(unknown: 0)) at block/io.c:575
qiov = {iov = 0x70c208e0, niov = 1, nalloc = -1, size = 2048}
iov = {iov_base = 0x57874800, iov_len = 2048}
#6  0x558bc593 in bdrv_read (bs=, 
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(", 
nb_sectors=nb_sectors@entry=4) at block/io.c:583
No locals.
#7  0x558af75d in blk_read (blk=, 
sector_num=sector_num@entry=13748, buf=buf@entry=0x57874800 "(", 
nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
ret = 
#8  0x557abb88 in cd_read_sector (sector_size=, buf=0x57874800 
"(", lba=3437, s=0x5760db70) at hw/ide/atapi.c:116
ret = 
#9  ide_atapi_cmd_reply_end (s=0x5760db70) at hw/ide/atapi.c:190
byte_count_limit = 
size = 
ret = 2
#10 0x556398a6 in memory_region_write_accessor (mr=0x577f85d0, addr=, value=0x70c20a68, size=2, shift=, mask=, 
attrs=...)
at /home/lieven/git/qemu/memory.c:459
tmp = 
#11 0x5563956b in access_with_adjusted_size (addr=addr@entry=0, 
value=value@entry=0x70c20a68, size=size@entry=2

Re: [Qemu-block] RFC cdrom in own thread?

2015-06-18 Thread Peter Lieven

Am 18.06.2015 um 09:45 schrieb Kevin Wolf:

Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:

Thread 2 (Thread 0x75550700 (LWP 2636)):
#0  0x75d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x55955d91 in qemu_poll_ns (fds=0x563889c0, nfds=3,
 timeout=4999424576) at qemu-timer.c:326
 ts = {tv_sec = 4, tv_nsec = 999424576}
 tvsec = 4
#2  0x55956feb in aio_poll (ctx=0x563528e0, blocking=true)
 at aio-posix.c:231
 node = 0x0
 was_dispatching = false
 ret = 1
 progress = false
#3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0, offset=4292007936,
 qiov=0x7554f760, is_write=false, flags=0) at block.c:2699
 aio_context = 0x563528e0
 co = 0x563888a0
 rwco = {bs = 0x5637eae0, offset = 4292007936,
   qiov = 0x7554f760, is_write = false, ret = 2147483647, flags = 0}
#4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4, is_write=false, flags=0)
 at block.c:2722
 qiov = {iov = 0x7554f780, niov = 1, nalloc = -1, size = 2048}
 iov = {iov_base = 0x744cc800, iov_len = 2048}
#5  0x5594b008 in bdrv_read (bs=0x5637eae0, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4) at block.c:2730
No locals.
#6  0x5599acef in blk_read (blk=0x56376820, sector_num=8382828,
 buf=0x744cc800 "(", nb_sectors=4) at block/block-backend.c:404
No locals.
#7  0x55833ed2 in cd_read_sector (s=0x56408f88, lba=2095707,
 buf=0x744cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
 ret = 32767

Here is the problem: The ATAPI emulation uses synchronous blk_read()
instead of the AIO or coroutine interfaces. This means that it keeps
polling for request completion while it holds the BQL until the request
is completed.


I will look at this.
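
For reference, the direction would be to replace the blocking call with the
AIO interface, roughly like the sketch below (untested; the helper names are
made up, only blk_read()/blk_aio_readv() are the real BlockBackend calls, and
QEMU's internal headers such as "sysemu/block-backend.h" are assumed):

/* Synchronous pattern, as in the backtrace above: the caller (and with it
 * the global mutex) is blocked in aio_poll() until the request completes. */
static int cd_read_sector_sync(BlockBackend *blk, int lba, uint8_t *buf)
{
    return blk_read(blk, (int64_t)lba << 2, buf, 4);   /* 4 x 512 = 2048 */
}

/* Asynchronous pattern: submit the request and return immediately; the
 * completion callback runs from the event loop later, so nothing spins
 * while holding the lock. */
static void cd_read_sector_done(void *opaque, int ret)
{
    /* continue the ATAPI state machine here, or signal the error */
}

static int cd_read_sector_async(BlockBackend *blk, int lba,
                                QEMUIOVector *qiov, void *opaque)
{
    if (blk_aio_readv(blk, (int64_t)lba << 2, qiov, 4,
                      cd_read_sector_done, opaque) == NULL) {
        return -EIO;
    }
    return 0;
}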



We can (and should) fix that, otherwise the VCPUs is blocked while we're
reading from the image, even without a hang. It doesn't fully fix your
problem, though, as bdrv_drain_all() and friends still exist.


Any idea which commands actually call bdrv_drain_all()?

Peter



Re: [Qemu-block] RFC cdrom in own thread?

2015-06-18 Thread Peter Lieven

Am 18.06.2015 um 09:03 schrieb Peter Lieven:

Am 18.06.2015 um 08:59 schrieb Paolo Bonzini:


On 18/06/2015 08:39, Peter Lieven wrote:

It seems like the mainloop is waiting here:

#0  0x7606c89c in __lll_lock_wait ()
from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#1  0x76068065 in _L_lock_858 ()
from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#2  0x76067eba in pthread_mutex_lock ()
from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#3  0x559f2557 in qemu_mutex_lock (mutex=0x55ed6d40)
 at util/qemu-thread-posix.c:76
 err = 0
 __func__ = "qemu_mutex_lock"
#4  0x556306ef in qemu_mutex_lock_iothread ()
 at /usr/src/qemu-2.2.0/cpus.c:1123
No locals.

This means the VCPU is busy with some synchronous activity---maybe a
bdrv_aio_cancel?


Here is what the other threads are doing (dropped VNC thread):


Sorry, something got messed up while copying the buffer. Here is the correct
output:

(gdb) thread apply all bt full

Thread 4 (Thread 0x7fffee9ff700 (LWP 2640)):
#0  0x76069d84 in pthread_cond_wait@@GLIBC_2.3.2 ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#1  0x559f27ae in qemu_cond_wait (cond=0x563beed0,
mutex=0x563bef00) at util/qemu-thread-posix.c:135
err = 0
__func__ = "qemu_cond_wait"
#2  0x5593f12e in vnc_worker_thread_loop (queue=0x563beed0)
at ui/vnc-jobs.c:222
job = 0x5637bbd0
entry = 0x0
tmp = 0x0
vs = {csock = -1, dirty = {{0, 0, 0} },
  lossy_rect = 0x563ecd10, vd = 0x74465010, need_update = 0,
  force_update = 0, has_dirty = 0, features = 195, absolute = 0,
  last_x = 0, last_y = 0, last_bmask = 0, client_width = 0,
  client_height = 0, share_mode = 0, vnc_encoding = 5, major = 0,
  minor = 0, auth = 0, challenge = '\000' ,
  info = 0x0, output = {capacity = 6257, offset = 1348,
buffer = 0x7fffe4000d10 ""}, input = {capacity = 0, offset = 0,
buffer = 0x0},
  write_pixels = 0x55925d57 , client_pf = {
bits_per_pixel = 32 ' ', bytes_per_pixel = 4 '\004',
depth = 24 '\030', rmask = 16711680, gmask = 65280, bmask = 255,
amask = 0, rshift = 16 '\020', gshift = 8 '\b', bshift = 0 '\000',
ashift = 24 '\030', rmax = 255 '\377', gmax = 255 '\377',
bmax = 255 '\377', amax = 0 '\000', rbits = 8 '\b',
gbits = 8 '\b', bbits = 8 '\b', abits = 0 '\000'},
  client_format = 0, client_be = false, audio_cap = 0x0, as = {
freq = 0, nchannels = 0, fmt = AUD_FMT_U8, endianness = 0},
  read_handler = 0, read_handler_expect = 0,
  modifiers_state = '\000' , led = 0x0,
  abort = false, initialized = false, output_mutex = {lock = {
  __data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0,
__kind = 0, __spins = 0, __list = {__prev = 0x0,
  __next = 0x0}}, __size = '\000' ,
  __align = 0}}, bh = 0x0, jobs_buffer = {capacity = 0,
offset = 0, buffer = 0x0}, tight = {type = 0,
quality = 255 '\377', compression = 9 '\t', pixel24 = 0 '\000',
tight = {capacity = 0, offset = 0, buffer = 0x0}, tmp = {
  capacity = 0, offset = 0, buffer = 0x0}, zlib = {capacity = 0,
  offset = 0, buffer = 0x0}, gradient = {capacity = 0, offset = 0,
  buffer = 0x0}, levels = {0, 0, 0, 0}, stream = {{next_in = 0x0,
avail_in = 0, total_in = 0, next_out = 0x0, avail_out = 0,
total_out = 0, msg = 0x0, state = 0x0, zalloc = 0, zfree = 0,
opaque = 0x0, data_type = 0, adler = 0, reserved = 0}, {
next_in = 0x0, avail_in = 0, total_in = 0, next_out = 0x0,
avail_out = 0, total_out = 0, msg = 0x0, state = 0x0,
zalloc = 0, zfree = 0, opaque = 0x0, data_type = 0, adler = 0,
reserved = 0}, {next_in = 0x0, avail_in = 0, total_in = 0,
next_out = 0x0, avail_out = 0, total_out = 0, msg = 0x0,
state = 0x0, zalloc = 0, zfree = 0, opaque = 0x0,
data_type = 0, adler = 0, reserved = 0}, {next_in = 0x0,
avail_in = 0, total_in = 0, next_out = 0x0, avail_out = 0,
total_out = 0, msg = 0x0, state = 0x0, zalloc = 0, zfree = 0,
opaque = 0x0, data_type = 0, adler = 0, reserved = 0}}},
  zlib = {zlib = {capacity = 0, offset = 0, buffer = 0x0}, tmp = {
  capacity = 0, offset = 0, buffer = 0x0}, stream = {

Re: [Qemu-block] RFC cdrom in own thread?

2015-06-18 Thread Peter Lieven

Am 18.06.2015 um 08:59 schrieb Paolo Bonzini:


On 18/06/2015 08:39, Peter Lieven wrote:

It seems like the mainloop is waiting here:

#0  0x7606c89c in __lll_lock_wait ()
from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#1  0x76068065 in _L_lock_858 ()
from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#2  0x76067eba in pthread_mutex_lock ()
from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#3  0x559f2557 in qemu_mutex_lock (mutex=0x55ed6d40)
 at util/qemu-thread-posix.c:76
 err = 0
 __func__ = "qemu_mutex_lock"
#4  0x556306ef in qemu_mutex_lock_iothread ()
 at /usr/src/qemu-2.2.0/cpus.c:1123
No locals.

This means the VCPU is busy with some synchronous activity---maybe a
bdrv_aio_cancel?


Here is what the other threads are doing (dropped VNC thread):

Thread 3 (Thread 0x74d4f700 (LWP 2637)):
#0  0x7606c89c in __lll_lock_wait ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#1  0x76068065 in _L_lock_858 ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#2  0x76067eba in pthread_mutex_lock ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#3  0x559f2557 in qemu_mutex_lock (mutex=0x55ed6d40)
at util/qemu-thread-posix.c:76
err = 0
__func__ = "qemu_mutex_lock"
#4  0x556306ef in qemu_mutex_lock_iothread ()
at /usr/src/qemu-2.2.0/cpus.c:1123
No locals.
#5  0x5564b9ac in kvm_cpu_exec (cpu=0x563cb870)
at /usr/src/qemu-2.2.0/kvm-all.c:1770
run = 0x77ee2000
ret = 65536
run_ret = -4
#6  0x556301dc in qemu_kvm_cpu_thread_fn (arg=0x563cb870)
at /usr/src/qemu-2.2.0/cpus.c:953
cpu = 0x563cb870
r = 65536
#5  0x75d9338d in clone () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#6  0x in ?? ()
No symbol table info available.

Thread 3 (Thread 0x74d4f700 (LWP 2637)):
#0  0x7606c89c in __lll_lock_wait ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#1  0x76068065 in _L_lock_858 ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#2  0x76067eba in pthread_mutex_lock ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#3  0x559f2557 in qemu_mutex_lock (mutex=0x55ed6d40)
at util/qemu-thread-posix.c:76
err = 0
__func__ = "qemu_mutex_lock"
#4  0x556306ef in qemu_mutex_lock_iothread ()
at /usr/src/qemu-2.2.0/cpus.c:1123
No locals.
#5  0x5564b9ac in kvm_cpu_exec (cpu=0x563cb870)
at /usr/src/qemu-2.2.0/kvm-all.c:1770
run = 0x77ee2000
ret = 65536
run_ret = -4
#6  0x556301dc in qemu_kvm_cpu_thread_fn (arg=0x563cb870)
at /usr/src/qemu-2.2.0/cpus.c:953
cpu = 0x563cb870
r = 65536
#7  0x76065e9a in start_thread ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#8  0x75d9338d in clone () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#9  0x in ?? ()
No symbol table info available.

Thread 2 (Thread 0x75550700 (LWP 2636)):
#0  0x75d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x55955d91 in qemu_poll_ns (fds=0x563889c0, nfds=3,
timeout=4999424576) at qemu-timer.c:326
ts = {tv_sec = 4, tv_nsec = 999424576}
tvsec = 4
#2  0x55956feb in aio_poll (ctx=0x563528e0, blocking=true)
at aio-posix.c:231
node = 0x0
was_dispatching = false
ret = 1
progress = false
#3  0x5594aeed in bdrv_prwv_co (bs=0x5637eae0, offset=4292007936,
qiov=0x7554f760, is_write=false, flags=0) at block.c:2699
aio_context = 0x563528e0
co = 0x563888a0
rwco = {bs = 0x5637eae0, offset = 4292007936,
  qiov = 0x7554f760, is_write = false, ret = 2147483647, flags = 0}
#4  0x5594afa9 in bdrv_rw_co (bs=0x5637eae0, sector_num=8382828,
buf=0x744cc800 "(", nb_sectors=4, is_write=false, flags=0)
at block.c:2722
qiov = {iov = 0x7554f780, niov = 1, nalloc = -1, size = 2048}
iov = {iov_base = 0x744cc800, iov_len = 2048}
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#8  0x75d9338d in clone () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#9  0x in ?? ()
No symbol table info available.

Thread 2 (Thread 0x75550700 (LWP 2636)):
#0  0x75d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
No symbol table info available.
#1  0x55955d91 in 

Re: [Qemu-block] RFC cdrom in own thread?

2015-06-17 Thread Peter Lieven

Am 17.06.2015 um 10:35 schrieb Kevin Wolf:

Am 16.06.2015 um 17:34 hat Stefan Hajnoczi geschrieben:

On Tue, Jun 16, 2015 at 3:44 PM, Peter Lieven  wrote:

I wonder how difficult it would be to have the IDE CDROM run in its own
thread?
We usually have ISOs mounted on an NFS share as CDROM. Problem: If the NFS
Share
goes down, it takes down monitor, qmp, vnc etc. with it.

Maybe its already possible to do this via cmdline args?

Any ideas, comments?

If QEMU hangs in the read/write/flush/discard code path due to NFS
downtime it is a bug.

QEMU is expected to hang in open/reopen because those are performed in
a blocking fashion.

Which of these cases applies to what you are seeing?  Maybe it can be fixed.

Don't forget bdrv_drain_all(), which is called a lot by the monitor. So
no matter what you do (and this includes moving to a thread as in a
hypothetical "ATAPI dataplane"), you end up with a hang sooner or later.


It seems like the mainloop is waiting here:

#0  0x7606c89c in __lll_lock_wait ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#1  0x76068065 in _L_lock_858 ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#2  0x76067eba in pthread_mutex_lock ()
   from /lib/x86_64-linux-gnu/libpthread.so.0
No symbol table info available.
#3  0x559f2557 in qemu_mutex_lock (mutex=0x55ed6d40)
at util/qemu-thread-posix.c:76
err = 0
__func__ = "qemu_mutex_lock"
#4  0x556306ef in qemu_mutex_lock_iothread ()
at /usr/src/qemu-2.2.0/cpus.c:1123
No locals.
#5  0x55954a87 in os_host_main_loop_wait (timeout=79413589)
at main-loop.c:242
ret = 1
spin_counter = 0
#6  0x55954b5f in main_loop_wait (nonblocking=0) at main-loop.c:494
ret = 15
timeout = 4294967295
timeout_ns = 79413589
#7  0x5575e702 in main_loop () at vl.c:1882
nonblocking = false
last_io = 1

Peter




Re: [Qemu-block] RFC cdrom in own thread?

2015-06-17 Thread Peter Lieven

Am 17.06.2015 um 10:35 schrieb Kevin Wolf:

Am 16.06.2015 um 17:34 hat Stefan Hajnoczi geschrieben:

On Tue, Jun 16, 2015 at 3:44 PM, Peter Lieven  wrote:

I wonder how difficult it would be to have the IDE CDROM run in its own
thread?
We usually have ISOs mounted on an NFS share as CDROM. Problem: If the NFS
Share
goes down, it takes down monitor, qmp, vnc etc. with it.

Maybe its already possible to do this via cmdline args?

Any ideas, comments?

If QEMU hangs in the read/write/flush/discard code path due to NFS
downtime it is a bug.

QEMU is expected to hang in open/reopen because those are performed in
a blocking fashion.

Which of these cases applies to what you are seeing?  Maybe it can be fixed.

Don't forget bdrv_drain_all(), which is called a lot by the monitor. So
no matter what you do (and this includes moving to a thread as in a
hypothetical "ATAPI dataplane"), you end up with a hang sooner or later.


I will have a look at where qemu hangs. The problem exists with an NFS share
mounted by the kernel and also with libnfs, so it might be a bdrv_drain_all().
I regularly query info block and info blockstats. Do these commands always
call bdrv_drain_all()?

Peter




[Qemu-block] RFC cdrom in own thread?

2015-06-16 Thread Peter Lieven

Hi,

I wonder how difficult it would be to have the IDE CDROM run in its own thread?
We usually have ISOs mounted on an NFS share as CDROM. Problem: If the NFS share
goes down, it takes down monitor, qmp, vnc etc. with it.

Maybe it's already possible to do this via cmdline args?

Any ideas, comments?

Thanks,
Peter




[Qemu-block] [PATCH] block/iscsi: add support for request timeouts

2015-06-16 Thread Peter Lieven
libiscsi starting with 1.15 will properly support timeout of iscsi
commands. The default will remain no timeout, but this can
be changed via cmdline parameters, e.g.:

qemu -iscsi timeout=30 -drive file=iscsi://...

If a timeout occurs a reconnect is scheduled and the timed out command
will be requeued for processing after a successful reconnect.

The required API call iscsi_set_timeout is present since libiscsi
1.10 which was released in October 2013. However, due to some bugs
in the libiscsi code the use is not recommended before version 1.15.

Please note that this patch bumps the libiscsi requirement to 1.10
to have all functions and macros defined. The patch also fixes an
off-by-one error in the NOP timeout calculation that was noticed
while touching these code parts.

Signed-off-by: Peter Lieven 
---
 block/iscsi.c   | 87 ++---
 configure   |  6 ++--
 qemu-options.hx |  4 +++
 3 files changed, 72 insertions(+), 25 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 14e97a6..f19a56a 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -69,6 +69,7 @@ typedef struct IscsiLun {
 bool dpofua;
 bool has_write_same;
 bool force_next_flush;
+bool request_timed_out;
 } IscsiLun;
 
 typedef struct IscsiTask {
@@ -99,7 +100,8 @@ typedef struct IscsiAIOCB {
 #endif
 } IscsiAIOCB;
 
-#define EVENT_INTERVAL 250
+/* libiscsi uses time_t so it's enough to process events every second */
+#define EVENT_INTERVAL 1000
 #define NOP_INTERVAL 5000
 #define MAX_NOP_FAILURES 3
 #define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times)
@@ -186,13 +188,18 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int 
status,
 iTask->do_retry = 1;
 goto out;
 }
-/* status 0x28 is SCSI_TASK_SET_FULL. It was first introduced
- * in libiscsi 1.10.0. Hardcode this value here to avoid
- * the need to bump the libiscsi requirement to 1.10.0 */
-if (status == SCSI_STATUS_BUSY || status == 0x28) {
+if (status == SCSI_STATUS_BUSY || status == SCSI_STATUS_TIMEOUT ||
+status == SCSI_STATUS_TASK_SET_FULL) {
 unsigned retry_time =
 exp_random(iscsi_retry_times[iTask->retries - 1]);
-error_report("iSCSI Busy/TaskSetFull (retry #%u in %u ms): %s",
+if (status == SCSI_STATUS_TIMEOUT) {
+/* make sure the request is rescheduled AFTER the
+ * reconnect is initiated */
+retry_time = EVENT_INTERVAL * 2;
+iTask->iscsilun->request_timed_out = true;
+}
+error_report("iSCSI Busy/TaskSetFull/TimeOut"
+ " (retry #%u in %u ms): %s",
  iTask->retries, retry_time,
  iscsi_get_error(iscsi));
 aio_timer_init(iTask->iscsilun->aio_context,
@@ -276,20 +283,26 @@ iscsi_set_events(IscsiLun *iscsilun)
iscsilun);
 iscsilun->events = ev;
 }
-
-/* newer versions of libiscsi may return zero events. In this
- * case start a timer to ensure we are able to return to service
- * once this situation changes. */
-if (!ev) {
-timer_mod(iscsilun->event_timer,
-  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL);
-}
 }
 
-static void iscsi_timed_set_events(void *opaque)
+static void iscsi_timed_check_events(void *opaque)
 {
 IscsiLun *iscsilun = opaque;
+
+/* check for timed out requests */
+iscsi_service(iscsilun->iscsi, 0);
+
+if (iscsilun->request_timed_out) {
+iscsilun->request_timed_out = false;
+iscsi_reconnect(iscsilun->iscsi);
+}
+
+/* newer versions of libiscsi may return zero events. Ensure we are able
+ * to return to service once this situation changes. */
 iscsi_set_events(iscsilun);
+
+timer_mod(iscsilun->event_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL);
 }
 
 static void
@@ -1096,16 +1109,37 @@ static char *parse_initiator_name(const char *target)
 return iscsi_name;
 }
 
+static int parse_timeout(const char *target)
+{
+QemuOptsList *list;
+QemuOpts *opts;
+const char *timeout;
+
+list = qemu_find_opts("iscsi");
+if (list) {
+opts = qemu_opts_find(list, target);
+if (!opts) {
+opts = QTAILQ_FIRST(&list->head);
+}
+if (opts) {
+timeout = qemu_opt_get(opts, "timeout");
+if (timeout) {
+return atoi(timeout);
+}
+}
+}
+
+return 0;
+}
+
 static void iscsi_nop_timed_event(void *opaque)
 {
 IscsiLun *iscsilun = opaque;
 
-if (iscsi_get_nops_in_flight(iscsilun->iscsi) > MAX_NOP_FAILURES)

Re: [Qemu-block] RFC block/iscsi command timeout

2015-06-02 Thread Peter Lieven

Am 26.05.2015 um 12:21 schrieb Paolo Bonzini:


On 26/05/2015 12:06, Kevin Wolf wrote:

Am 26.05.2015 um 11:44 hat Paolo Bonzini geschrieben:


On 26/05/2015 11:37, Kevin Wolf wrote:

If we run into a timeout we theoretically have the following options:
  - reconnect
  - retry
  - error

I would reconnect as Ronnie proposed.

Just trying to reconnect indefinitely might not be the best option.
Consider the situation where you're inside a bdrv_drain_all(), which
blocks qemu completely. Trying to reconnect once or twice is probably
fine, but if that doesn't work, eventually you want to return an error
so that qemu is unstuck.

Whenever the topic of timeout is brought about, I'm worried that
introducing timeouts (and doing anything except reconnecting) is the
same as NFS's soft option, which can actually cause data corruption.
So, why would it be safe?

How would it cause data corruption for qemu, i.e. which of the block
layer assumptions would be broken?

Reordering of operations.  Say you have:

  guest -> QEMU    write A to sector 1
  QEMU -> NFS      write A to sector 1
  QEMU -> guest    write A to sector 1 timed out
  guest -> QEMU    write B to sector 1

At this point the two outstanding writes are for the same sector and
carry different payloads, so it's undefined which one wins.

  QEMU -> NFS      write B to sector 1
  NFS -> QEMU      write B to sector 1 completed
  QEMU -> guest    write B to sector 1 completed
  NFS -> QEMU      write A to sector 1 completed
                   (QEMU doesn't report this to the guest)

The guest thinks it has written B, but it's possible that the storage
has written A.
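The same sequence as a tiny self-contained simulation (toy code, not QEMU):

#include <stdio.h>

int main(void)
{
    char storage = '?';
    char guest_believes = '?';

    /* write A is submitted and times out from the guest's point of view,
     * but the request is still in flight towards the storage */
    char in_flight_A = 'A';

    /* the guest retries with new data B; B completes normally */
    storage = 'B';
    guest_believes = 'B';

    /* the old request finally reaches the storage and overwrites B */
    storage = in_flight_A;

    printf("guest believes: %c, storage has: %c\n", guest_believes, storage);
    return 0;
}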


So you would go for infinite reconnecting? We can still SIGKILL it then anyway.

As said before, my idea would be a default of 5000 ms for all sync calls and
no timeout for all async calls coming from the block layer.

A user-settable timeout can optionally be specified via cmdline options
to define a timeout for both sync and async calls.

Peter



Re: [Qemu-block] RFC block/iscsi command timeout

2015-05-26 Thread Peter Lieven

Am 26.05.2015 um 11:01 schrieb Kevin Wolf:

Am 22.05.2015 um 09:30 hat Peter Lieven geschrieben:

Hi,

the next libiscsi version will allow defining command timeouts. At least for sync
commands this has in fact been possible since sometime in 2013. By default this
timeout is disabled.

I would like to at least define a timeout for the sync commands (login, 
readcapacity, inquiry, logout etc)
of about 5000msec.

You should probably make the timeout an integer option rather than a
boolean one. Then we don't need to discuss whether 5000 msec are
appropriate...


The change to the iSCSI driver of qemu would be minimal for that. The timeout
handling is done in the sync event loop of libiscsi.

My question is whether it would be a good idea to have a timeout (fairly high of
course) for all other commands as well?

...and what "fairly high" actually means.


Following your idea to introduce an option I would do the following:

Default:
 sync commands: 5000 msec (this should be far, far more than enough for login, logout,
etc.). These are all calls in iscsi_open / iscsi_close, not called during normal
operation.
 async commands: no timeout

Specifying a timeout will then set the timeout for sync and async commands, with
the option to specify 0 to disable timeouts entirely.
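As a toy sketch of this proposed policy (helper name made up, not the actual qemu/libiscsi code); -1 stands for "option not given":

#include <stdio.h>

#define ISCSI_SYNC_TIMEOUT_MS_DEFAULT 5000

static int effective_timeout_ms(int user_timeout_ms, int is_sync_call)
{
    if (user_timeout_ms < 0) {
        /* option not given: bounded wait only for the sync setup calls */
        return is_sync_call ? ISCSI_SYNC_TIMEOUT_MS_DEFAULT : 0;
    }
    /* option given: applies to sync and async alike, 0 disables timeouts */
    return user_timeout_ms;
}

int main(void)
{
    printf("default, sync   : %d ms\n", effective_timeout_ms(-1, 1));
    printf("default, async  : %d ms\n", effective_timeout_ms(-1, 0));
    printf("timeout=30s set : %d ms\n", effective_timeout_ms(30000, 0));
    printf("timeout=0 set   : %d ms (disabled)\n", effective_timeout_ms(0, 1));
    return 0;
}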

If we run into a timeout we theoretically have the following options:
 - reconnect
 - retry
 - error

I would reconnect as Ronnie proposed.

Peter




Re: [Qemu-block] RFC block/iscsi command timeout

2015-05-26 Thread Peter Lieven

Am 22.05.2015 um 14:22 schrieb ronnie sahlberg:



On Fri, May 22, 2015 at 12:30 AM, Peter Lieven <p...@kamp.de> wrote:

Hi,

the next libiscsi version will allow defining command timeouts. At least for
sync commands this has in fact been possible since sometime in 2013. By default
this timeout is disabled.


Just to mention that this does not require any API changes or a need to update
the library version requirements.
It just requires that the application call iscsi_service(iscsi, 0) at
regular intervals to trigger the timeout scans.

On older versions of libiscsi, this will be a no-op, so the situation is unchanged
and timeouts may not work. On newer versions of libiscsi this will actually
process the timeout scanning and cause the timeouts to actually happen.



I would like to at least define a timeout for the sync commands (login, 
readcapacity, inquiry, logout etc)
of about 5000msec. The change to the iSCSI driver of qemu would be minimal
for that. The timeout
handling is done in the sync event loop of libiscsi.


Is the idea that when a timeout occurs, the callback will eventually trigger an 
attempt to tear down and reconnect the session?

I think a lot of/most Linux kernels (guests) have a default timeout such that they
will perform a full bus reset once a task has hung for 60 seconds.

Thus a timeout of <<60 seconds will give you a lot of time to make several
attempts to bounce the session to the target and try to recover before it hits the
guest.


The problem with a timeout for *ALL* commands is that we might execute a 
command (even from the guest) which may take a very long time.

So I think the safe approach would be to set timeouts only for the sync
commands first.

Peter



Re: [Qemu-block] [Qemu-stable] [PATCH v5 3/8] mirror: Do zero write on target if sectors not allocated

2015-05-25 Thread Peter Lieven

Am 26.05.2015 um 05:36 schrieb Fam Zheng:

If guest discards a source cluster, mirroring with bdrv_aio_readv is overkill.
Some protocols do zero upon discard, in which case it's best to use
bdrv_aio_write_zeroes; otherwise, bdrv_aio_discard will be enough.

Signed-off-by: Fam Zheng 
---
  block/mirror.c | 20 ++--
  1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 85995b2..ba33254 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -164,6 +164,8 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
  int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
  uint64_t delay_ns = 0;
  MirrorOp *op;
+int pnum;
+int64_t ret;
  
  s->sector_num = hbitmap_iter_next(&s->hbi);

  if (s->sector_num < 0) {
@@ -290,8 +292,22 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
  s->in_flight++;
  s->sectors_in_flight += nb_sectors;
  trace_mirror_one_iteration(s, sector_num, nb_sectors);
-bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
-   mirror_read_complete, op);
+
+ret = bdrv_get_block_status_above(source, NULL, sector_num,
+  nb_sectors, &pnum);
+if (ret < 0 || pnum < nb_sectors ||
+(ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
+bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+   mirror_read_complete, op);
+} else if (ret & BDRV_BLOCK_ZERO) {
+bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
+  s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
+  mirror_write_complete, op);
+} else {
+assert(!(ret & BDRV_BLOCK_DATA));
+bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
+ mirror_write_complete, op);
+}


I wonder what happens if on the destination the discard is a NOP, which is legal
(at least in SCSI).
In this case we might end up having different contents at source and
destination. Or is this
not a problem?

Peter



[Qemu-block] RFC block/iscsi command timeout

2015-05-22 Thread Peter Lieven
Hi,

the next libiscsi version will allow defining command timeouts. At least for sync
commands this has in fact been possible since sometime in 2013. By default this
timeout is disabled.

I would like to at least define a timeout for the sync commands (login, 
readcapacity, inquiry, logout etc)
of about 5000msec. The change to the iSCSI driver of qemu would be minimal for
that. The timeout
handling is done in the sync event loop of libiscsi.

My question is whether it would be a good idea to have a timeout (fairly high of
course) for all other commands as well?

Peter




Re: [Qemu-block] qemu exit with pending I/O

2015-04-27 Thread Peter Lieven

Am 27.04.2015 um 14:11 schrieb Paolo Bonzini:


On 27/04/2015 13:51, Peter Lieven wrote:

The reason to do this is that while I/O is pending you do not know if
the I/O has been completed or not.

But if I consider a raw image, quitting without a clean shutdown can
also leave an inconsistent state?

But you know that all I/O submitted to disk has been handled.

The bdrv_drain_all() avoids that some leftover I/O mysteriously pops up
while another VM, or another instance of this VM, is using the image.
That's much worse.


Can this happen with a raw image used by a single VM?

Would it be an option to have a quit command with, let's call it, a "noflush" flag?

I'm considering the case that my storage has died. In the iSCSI case all vServers
will end up in the now-async reconnect loop. In such a case I might need to
terminate the vServers and want to restart them later from a backup. It would be
interesting to be able to terminate a vServer with a command via hmp/qmp without
the need to send a SIGKILL from outside.

Peter




Re: [Qemu-block] qemu exit with pending I/O

2015-04-27 Thread Peter Lieven

Am 27.04.2015 um 12:47 schrieb Paolo Bonzini:


On 16/04/2015 10:46, Peter Lieven wrote:


I just ran tests with the new libiscsi branch that allows async
reconnects. I found that
I cannot quit qemu while it is waiting for a reconnect to the storage.
E.g.
I start qemu and then shut down the storage. Qemu will then loop forever in
bdrv_flush() and/or bdrv_drain_all().

It would be nice to be able to shut down qemu via normal quit and not have
to kill it in such a case.

The reason to do this is that while I/O is pending you do not know if
the I/O has been completed or not.


But if I consider a raw image, quitting without a clean shutdown can
also leave an inconsistent state?

Peter



It's then possible that the I/O is completed after some writes from
another invocation of the VM, causing data corruption.  For the same
reason the NBD driver does not do timeouts.

Basically, bdrv_drain_all() is there to emulate nfs=hard.  You always
have SIGKILL, so it's not exactly the same, but leaving it out would
have the same problems as nfs=soft.

Paolo



--

Mit freundlichen Grüßen

Peter Lieven

...

  KAMP Netzwerkdienste GmbH
  Vestische Str. 89-91 | 46117 Oberhausen
  Tel: +49 (0) 208.89 402-50 | Fax: +49 (0) 208.89 402-40
  p...@kamp.de | http://www.kamp.de

  Geschäftsführer: Heiner Lante | Michael Lante
  Amtsgericht Duisburg | HRB Nr. 12154
  USt-Id-Nr.: DE 120607556

...




Re: [Qemu-block] [PATCH for-2.4 07/10] block/iscsi: bump libiscsi requirement to 1.10.0

2015-04-17 Thread Peter Lieven
Am 16.04.2015 um 15:20 schrieb Paolo Bonzini:
>
> On 16/04/2015 14:58, Peter Lieven wrote:
>>> On 16/04/2015 14:18, Peter Lieven wrote:
>>>> We need this to support SCSI_STATUS_TASK_SET_FULL.
>>> Any reason apart from the missing constant?
>> No, but I wanted to avoid starting checking for constants that were
>> added shortly after this.
>> You can't check with #ifdef for a constant in an enum.
> But you can #define it if libiscsi version is <1.10.

There is no macro to check for that. I took the easy way
and hardcoded the value. It's an official standard so there is
no chance it will change.
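A tiny self-contained illustration of the enum/#ifdef problem mentioned above (toy code, not from the patch); the guard is always taken because the preprocessor cannot see enum members, which is why hardcoding the standardized value is the pragmatic way out:

#include <stdio.h>

enum scsi_status { SCSI_STATUS_GOOD = 0, SCSI_STATUS_TASK_SET_FULL = 0x28 };

#ifndef SCSI_STATUS_TASK_SET_FULL       /* always true: enums are invisible here */
#define SCSI_STATUS_TASK_SET_FULL 0x28  /* value is fixed by the SCSI standard   */
#endif

int main(void)
{
    printf("TASK SET FULL = 0x%x\n", SCSI_STATUS_TASK_SET_FULL);
    return 0;
}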

Peter



Re: [Qemu-block] [PATCH for-2.4 05/10] block/iscsi: optimize WRITE10/16 if cache.writeback is not set

2015-04-17 Thread Peter Lieven
Am 16.04.2015 um 15:17 schrieb Paolo Bonzini:
>
> On 16/04/2015 15:02, Peter Lieven wrote:
>>> Also, I think it is iscsi_co_generic_cb that should set
>>> force_next_flush, so that it is only set on failure.  Not really for the
>>> optimization value, but because it's clearer.
>> I don't get what you mean by it should only be "set on failure".
>> My approach would be to add a flag to the iTask indicating that the command
>> requires force_flush to be set after successful completion. Is that
>> what you mean?
> Set on success.  Lack of sleep.

I have sent a v2 following your suggestions.

Peter




[Qemu-block] [PATCH for-2.4 V2 2/9] block/iscsi: change all iscsilun properties from uint8_t to bool

2015-04-16 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index be8af46..6cf7e99 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -57,9 +57,6 @@ typedef struct IscsiLun {
 int events;
 QEMUTimer *nop_timer;
 QEMUTimer *event_timer;
-uint8_t lbpme;
-uint8_t lbprz;
-uint8_t has_write_same;
 struct scsi_inquiry_logical_block_provisioning lbp;
 struct scsi_inquiry_block_limits bl;
 unsigned char *zeroblock;
@@ -67,6 +64,9 @@ typedef struct IscsiLun {
 int cluster_sectors;
 bool use_16_for_rw;
 bool write_protected;
+bool lbpme;
+bool lbprz;
+bool has_write_same;
 } IscsiLun;
 
 typedef struct IscsiTask {
@@ -460,7 +460,7 @@ static int64_t coroutine_fn 
iscsi_co_get_block_status(BlockDriverState *bs,
 *pnum = nb_sectors;
 
 /* LUN does not support logical block provisioning */
-if (iscsilun->lbpme == 0) {
+if (!iscsilun->lbpme) {
 goto out;
 }
 
@@ -1121,8 +1121,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, 
Error **errp)
 } else {
 iscsilun->block_size = rc16->block_length;
 iscsilun->num_blocks = rc16->returned_lba + 1;
-iscsilun->lbpme = rc16->lbpme;
-iscsilun->lbprz = rc16->lbprz;
+iscsilun->lbpme = !!rc16->lbpme;
+iscsilun->lbprz = !!rc16->lbprz;
 iscsilun->use_16_for_rw = (rc16->returned_lba > 
0x);
 }
 }
@@ -1655,7 +1655,7 @@ out:
 static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
 IscsiLun *iscsilun = bs->opaque;
-bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz;
+bdi->unallocated_blocks_are_zero = iscsilun->lbprz;
 bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws;
 bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE;
 return 0;
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 V2 8/9] block/iscsi: bump year in copyright notice

2015-04-16 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 328907b..8364f97 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@
  * QEMU Block driver for iSCSI images
  *
  * Copyright (c) 2010-2011 Ronnie Sahlberg 
- * Copyright (c) 2012-2014 Peter Lieven 
+ * Copyright (c) 2012-2015 Peter Lieven 
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to 
deal
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 V2 6/9] block/iscsi: increase retry count

2015-04-16 Thread Peter Lieven
The idea is that a command in a BUSY condition is retried for up to
approx. 60 seconds before it is failed. This should
be far higher than any command timeout in the guest.

Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 600..5999f74 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -103,7 +103,7 @@ typedef struct IscsiAIOCB {
 #define NOP_INTERVAL 5000
 #define MAX_NOP_FAILURES 3
 #define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times)
-static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048};
+static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768};
 
 /* this threshold is a trade-off knob to choose between
  * the potential additional overhead of an extra GET_LBA_STATUS request
-- 
1.9.1
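Back-of-the-envelope check of the commit message (toy program; assumes each entry of iscsi_retry_times roughly approximates the backoff in milliseconds drawn by exp_random()):

#include <stdio.h>

int main(void)
{
    const unsigned retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768};
    unsigned total = 0;

    for (unsigned i = 0; i < sizeof(retry_times) / sizeof(retry_times[0]); i++) {
        total += retry_times[i];
    }
    printf("cumulative backoff: %u ms\n", total);   /* 43688 ms, roughly 44 s */
    return 0;
}

Together with the runtime of the retried commands themselves this lands in the region of a minute, which is presumably where the "approx. 60 seconds" above comes from.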




[Qemu-block] [PATCH for-2.4 V2 7/9] block/iscsi: handle SCSI_STATUS_TASK_SET_FULL

2015-04-16 Thread Peter Lieven
a target may issue a SCSI_STATUS_TASK_SET_FULL status
if there is more than one "BUSY" command queued already.

Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 5999f74..328907b 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -186,10 +186,13 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int 
status,
 iTask->do_retry = 1;
 goto out;
 }
-if (status == SCSI_STATUS_BUSY) {
+/* status 0x28 is SCSI_TASK_SET_FULL. It was first introduced
+ * in libiscsi 1.10.0. Hardcode this value here to avoid
+ * the need to bump the libiscsi requirement to 1.10.0 */
+if (status == SCSI_STATUS_BUSY || status == 0x28) {
 unsigned retry_time =
 exp_random(iscsi_retry_times[iTask->retries - 1]);
-error_report("iSCSI Busy (retry #%u in %u ms): %s",
+error_report("iSCSI Busy/TaskSetFull (retry #%u in %u ms): %s",
  iTask->retries, retry_time,
  iscsi_get_error(iscsi));
 aio_timer_init(iTask->iscsilun->aio_context,
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 V2 9/9] block/iscsi: use the allocationmap also if cache.direct=on

2015-04-16 Thread Peter Lieven
the allocationmap is only a hint. The driver always
double-checks that blocks marked unallocated in the cache are
still unallocated before taking the fast path and returning zeroes.
So using the allocationmap is migration safe and can
also be enabled with cache.direct=on.
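A toy model of the hint-only behaviour described above (not the driver code; the helper names are made up): a block the map claims is unallocated is always re-checked with the target before zeroes are returned:

#include <stdbool.h>
#include <stdio.h>

#define NUM_BLOCKS 8

static bool hint_allocated[NUM_BLOCKS];    /* the allocation map (hint only) */
static bool target_allocated[NUM_BLOCKS];  /* what the target really has     */

/* stands in for a GET LBA STATUS round trip */
static bool target_is_allocated(int block)
{
    return target_allocated[block];
}

static const char *read_block(int block)
{
    if (!hint_allocated[block] && !target_is_allocated(block)) {
        return "return zeroes (fast path, no READ sent)";
    }
    return "send READ10/16 to the target";
}

int main(void)
{
    target_allocated[3] = true;  /* e.g. written by another host after migration */
    /* the stale hint still says unallocated, but the re-check prevents
     * returning wrong zeroes */
    printf("block 3: %s\n", read_block(3));
    printf("block 5: %s\n", read_block(5));
    return 0;
}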

Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 8364f97..8fca1d3 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1499,7 +1499,7 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
 iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 
1024) {
 iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
  iscsilun->block_size) >> BDRV_SECTOR_BITS;
-if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
+if (iscsilun->lbprz) {
 iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
 if (iscsilun->allocationmap == NULL) {
 ret = -ENOMEM;
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 V2 5/9] block/iscsi: optimize WRITE10/16 if cache.writeback is not set

2015-04-16 Thread Peter Lieven
SCSI allows telling the target to not return from a write command
until the data is written to the disk. Use this so-called FUA
bit, if it is supported, to optimize WRITE commands when writeback is
not allowed.

In this case qemu always issues a WRITE followed by a FLUSH. This
is 2 round trip times. If we set the FUA bit we can skip the
following FLUSH.
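As a toy model (not the driver code) of the round-trip saving described above:

#include <stdbool.h>
#include <stdio.h>

static int round_trips_for_durable_write(bool dpofua, bool enable_write_cache)
{
    bool fua = dpofua && !enable_write_cache;   /* same condition as in the patch */
    return fua ? 1 /* WRITE with FUA */ : 2 /* WRITE + FLUSH */;
}

int main(void)
{
    printf("no FUA support  : %d round trips\n", round_trips_for_durable_write(false, false));
    printf("FUA, no wb cache: %d round trips\n", round_trips_for_durable_write(true, false));
    return 0;
}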

Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 237faa1..600 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -68,6 +68,7 @@ typedef struct IscsiLun {
 bool lbprz;
 bool dpofua;
 bool has_write_same;
+bool force_next_flush;
 } IscsiLun;
 
 typedef struct IscsiTask {
@@ -80,6 +81,7 @@ typedef struct IscsiTask {
 QEMUBH *bh;
 IscsiLun *iscsilun;
 QEMUTimer retry_timer;
+bool force_next_flush;
 } IscsiTask;
 
 typedef struct IscsiAIOCB {
@@ -200,6 +202,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
 }
 }
 error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
+} else {
+iTask->iscsilun->force_next_flush |= iTask->force_next_flush;
 }
 
 out:
@@ -370,6 +374,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState 
*bs,
 struct IscsiTask iTask;
 uint64_t lba;
 uint32_t num_sectors;
+int fua;
 
 if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
 return -EINVAL;
@@ -385,15 +390,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState 
*bs,
 num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
 iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
+fua = iscsilun->dpofua && !bs->enable_write_cache;
+iTask.force_next_flush = !fua;
 if (iscsilun->use_16_for_rw) {
 iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
 NULL, num_sectors * 
iscsilun->block_size,
-iscsilun->block_size, 0, 0, 0, 0, 0,
+iscsilun->block_size, 0, 0, fua, 0, 0,
 iscsi_co_generic_cb, &iTask);
 } else {
 iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba,
 NULL, num_sectors * 
iscsilun->block_size,
-iscsilun->block_size, 0, 0, 0, 0, 0,
+iscsilun->block_size, 0, 0, fua, 0, 0,
 iscsi_co_generic_cb, &iTask);
 }
 if (iTask.task == NULL) {
@@ -621,8 +628,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState 
*bs)
 return 0;
 }
 
-iscsi_co_init_iscsitask(iscsilun, &iTask);
+if (!iscsilun->force_next_flush) {
+return 0;
+}
+iscsilun->force_next_flush = false;
 
+iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
 if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0,
   0, iscsi_co_generic_cb, &iTask) == NULL) 
{
@@ -918,6 +929,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, 
int64_t sector_num,
 }
 
 iscsi_co_init_iscsitask(iscsilun, &iTask);
+iTask.force_next_flush = true;
 retry:
 if (use_16_for_ws) {
 iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, 
lba,
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 V2 3/9] block/iscsi: rename iscsi_write_protected and let it return void

2015-04-16 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 6cf7e99..221c9fc 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1253,11 +1253,11 @@ static void iscsi_attach_aio_context(BlockDriverState 
*bs,
   iscsi_timed_set_events, iscsilun);
 }
 
-static bool iscsi_is_write_protected(IscsiLun *iscsilun)
+static void iscsi_modesense_sync(IscsiLun *iscsilun)
 {
 struct scsi_task *task;
 struct scsi_mode_sense *ms = NULL;
-bool wrprotected = false;
+iscsilun->write_protected = false;
 
 task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun,
  1, SCSI_MODESENSE_PC_CURRENT,
@@ -1278,13 +1278,12 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun)
  iscsi_get_error(iscsilun->iscsi));
 goto out;
 }
-wrprotected = ms->device_specific_parameter & 0x80;
+iscsilun->write_protected = ms->device_specific_parameter & 0x80;
 
 out:
 if (task) {
 scsi_free_scsi_task(task);
 }
-return wrprotected;
 }
 
 /*
@@ -1403,7 +1402,8 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
 scsi_free_scsi_task(task);
 task = NULL;
 
-iscsilun->write_protected = iscsi_is_write_protected(iscsilun);
+iscsi_modesense_sync(iscsilun);
+
 /* Check the write protect flag of the LUN if we want to write */
 if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
 iscsilun->write_protected) {
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 V2 1/9] block/iscsi: do not forget to logout from target

2015-04-16 Thread Peter Lieven
We actually were always impolitely dropping the connection and
not cleanly logging out.

CC: qemu-sta...@nongnu.org
Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index ba33290..be8af46 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1501,6 +1501,9 @@ out:
 
 if (ret) {
 if (iscsi != NULL) {
+if (iscsi_is_logged_in(iscsi)) {
+iscsi_logout_sync(iscsi);
+}
 iscsi_destroy_context(iscsi);
 }
 memset(iscsilun, 0, sizeof(IscsiLun));
@@ -1514,6 +1517,9 @@ static void iscsi_close(BlockDriverState *bs)
 struct iscsi_context *iscsi = iscsilun->iscsi;
 
 iscsi_detach_aio_context(bs);
+if (iscsi_is_logged_in(iscsi)) {
+iscsi_logout_sync(iscsi);
+}
 iscsi_destroy_context(iscsi);
 g_free(iscsilun->zeroblock);
 g_free(iscsilun->allocationmap);
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 V2 4/9] block/iscsi: store DPOFUA bit from the modesense command

2015-04-16 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index 221c9fc..237faa1 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -66,6 +66,7 @@ typedef struct IscsiLun {
 bool write_protected;
 bool lbpme;
 bool lbprz;
+bool dpofua;
 bool has_write_same;
 } IscsiLun;
 
@@ -1258,6 +1259,7 @@ static void iscsi_modesense_sync(IscsiLun *iscsilun)
 struct scsi_task *task;
 struct scsi_mode_sense *ms = NULL;
 iscsilun->write_protected = false;
+iscsilun->dpofua = false;
 
 task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun,
  1, SCSI_MODESENSE_PC_CURRENT,
@@ -1279,6 +1281,7 @@ static void iscsi_modesense_sync(IscsiLun *iscsilun)
 goto out;
 }
 iscsilun->write_protected = ms->device_specific_parameter & 0x80;
+iscsilun->dpofua  = ms->device_specific_parameter & 0x10;
 
 out:
 if (task) {
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 V2 0/9] various improvements for the iSCSI driver

2015-04-16 Thread Peter Lieven
v1->v2: - removed the requirement for libiscsi 1.10.0 [Paolo]
- reworked to force_next_flush logic [Paolo]

Peter Lieven (9):
  block/iscsi: do not forget to logout from target
  block/iscsi: change all iscsilun properties from uint8_t to bool
  block/iscsi: rename iscsi_write_protected and let it return void
  block/iscsi: store DPOFUA bit from the modesense command
  block/iscsi: optimize WRITE10/16 if cache.writeback is not set
  block/iscsi: increase retry count
  block/iscsi: handle SCSI_STATUS_TASK_SET_FULL
  block/iscsi: bump year in copyright notice
  block/iscsi: use the allocationmap also if cache.direct=on

 block/iscsi.c | 64 ---
 1 file changed, 44 insertions(+), 20 deletions(-)

-- 
1.9.1




Re: [Qemu-block] [PATCH for-2.4 05/10] block/iscsi: optimize WRITE10/16 if cache.writeback is not set

2015-04-16 Thread Peter Lieven

Am 16.04.2015 um 14:42 schrieb Paolo Bonzini:


On 16/04/2015 14:18, Peter Lieven wrote:

SCSI allows telling the target to not return from a write command
until the data is written to the disk. Use this so-called FUA
bit, if it is supported, to optimize WRITE commands when writeback is
not allowed.

In this case qemu always issues a WRITE followed by a FLUSH. This
is 2 round trip times. If we set the FUA bit we can skip the
following FLUSH.

Signed-off-by: Peter Lieven 
---
  block/iscsi.c | 15 +--
  1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 237faa1..7fb04d7 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -68,6 +68,7 @@ typedef struct IscsiLun {
  bool lbprz;
  bool dpofua;
  bool has_write_same;
+bool force_next_flush;
  } IscsiLun;
  
  typedef struct IscsiTask {

@@ -370,6 +371,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState 
*bs,
  struct IscsiTask iTask;
  uint64_t lba;
  uint32_t num_sectors;
+int fua = iscsilun->dpofua && !bs->enable_write_cache;
  
  if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {

  return -EINVAL;
@@ -388,12 +390,12 @@ retry:
  if (iscsilun->use_16_for_rw) {
  iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
  NULL, num_sectors * 
iscsilun->block_size,
-iscsilun->block_size, 0, 0, 0, 0, 0,
+iscsilun->block_size, 0, 0, fua, 0, 0,
  iscsi_co_generic_cb, &iTask);
  } else {
  iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba,
  NULL, num_sectors * 
iscsilun->block_size,
-iscsilun->block_size, 0, 0, 0, 0, 0,
+iscsilun->block_size, 0, 0, fua, 0, 0,
  iscsi_co_generic_cb, &iTask);
  }
  if (iTask.task == NULL) {
@@ -621,6 +623,11 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState 
*bs)
  return 0;
  }
  
+if (iscsilun->dpofua && !bs->enable_write_cache &&

+!iscsilun->force_next_flush) {
+return 0;
+}
+
  iscsi_co_init_iscsitask(iscsilun, &iTask);
  
  retry:

@@ -648,6 +655,8 @@ retry:
  return -EIO;
  }
  
+iscsilun->force_next_flush = false;

You still need a flush if you do WRITE SAME, WRITE+FUA, WRITE+FUA.
Also, since bs->enable_write_cache can be toggled arbitrarily, I would
prefer to set force_next_flush on all non-FUA writes, including those
done with bs->enable_write_cache.


Good idea. So we can avoid flushes if there was never a write before.




  return 0;
  }
  
@@ -969,6 +978,8 @@ retry:

  iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
  }
  
+iscsilun->force_next_flush = true;

Also, I think it is iscsi_co_generic_cb that should set
force_next_flush, so that it is only set on failure.  Not really for the
optimization value, but because it's clearer.


I don't get what you mean by it should only be "set on failure".
My approach would be to add a flag to the iTask indicating that the command
requires force_flush to be set after successful completion. Is that
what you mean?



I think that if you do these changes, iscsi_co_flush can just check "if
(!iscsilun->force_next_flush)".

But still---nice approach. :)


Thanks :-)

Peter
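A toy model of the force_next_flush bookkeeping discussed here (not the driver code; the v2 series does roughly this in iscsi_co_generic_cb and iscsi_co_flush): any successfully completed non-FUA write arms the flag, and a flush is only sent while the flag is set:

#include <stdbool.h>
#include <stdio.h>

static bool force_next_flush;

static void write_completed(bool fua)
{
    if (!fua) {
        force_next_flush = true;   /* data may still sit in the target's cache */
    }
}

static const char *flush(void)
{
    if (!force_next_flush) {
        return "skipped (nothing to flush)";
    }
    force_next_flush = false;
    return "SYNCHRONIZE CACHE sent";
}

int main(void)
{
    write_completed(true);   /* FUA write */
    printf("flush after FUA write    : %s\n", flush());
    write_completed(false);  /* non-FUA write */
    printf("flush after non-FUA write: %s\n", flush());
    printf("second flush right after : %s\n", flush());
    return 0;
}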



Re: [Qemu-block] [PATCH for-2.4 07/10] block/iscsi: bump libiscsi requirement to 1.10.0

2015-04-16 Thread Peter Lieven

Am 16.04.2015 um 14:33 schrieb Paolo Bonzini:


On 16/04/2015 14:18, Peter Lieven wrote:

We need this to support SCSI_STATUS_TASK_SET_FULL.

Any reason apart from the missing constant?


No, but I wanted to avoid having to start checking for constants that were
added shortly after this.
You can't check with #ifdef for a constant in an enum.
Libiscsi 1.10 was released in September 2013.

Peter




Paolo


Signed-off-by: Peter Lieven 
---
  configure | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 6969f6f..f73b8d0 100755
--- a/configure
+++ b/configure
@@ -3630,15 +3630,15 @@ if compile_prog "" "" ; then
  fi
  
  ##

-# Do we have libiscsi >= 1.9.0
+# Do we have libiscsi >= 1.10.0
  if test "$libiscsi" != "no" ; then
-  if $pkg_config --atleast-version=1.9.0 libiscsi; then
+  if $pkg_config --atleast-version=1.10.0 libiscsi; then
  libiscsi="yes"
  libiscsi_cflags=$($pkg_config --cflags libiscsi)
  libiscsi_libs=$($pkg_config --libs libiscsi)
else
  if test "$libiscsi" = "yes" ; then
-  feature_not_found "libiscsi" "Install libiscsi >= 1.9.0"
+  feature_not_found "libiscsi" "Install libiscsi >= 1.10.0"
  fi
  libiscsi="no"
fi




--

Mit freundlichen Grüßen

Peter Lieven

...

  KAMP Netzwerkdienste GmbH
  Vestische Str. 89-91 | 46117 Oberhausen
  Tel: +49 (0) 208.89 402-50 | Fax: +49 (0) 208.89 402-40
  p...@kamp.de | http://www.kamp.de

  Geschäftsführer: Heiner Lante | Michael Lante
  Amtsgericht Duisburg | HRB Nr. 12154
  USt-Id-Nr.: DE 120607556

...




[Qemu-block] [PATCH for-2.4 09/10] block/iscsi: bump year in copyright notice

2015-04-16 Thread Peter Lieven
Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 3d0ffeb..04c1309 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@
  * QEMU Block driver for iSCSI images
  *
  * Copyright (c) 2010-2011 Ronnie Sahlberg 
- * Copyright (c) 2012-2014 Peter Lieven 
+ * Copyright (c) 2012-2015 Peter Lieven 
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to 
deal
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 00/10] various improvements for the iSCSI driver

2015-04-16 Thread Peter Lieven
Peter Lieven (10):
  block/iscsi: do not forget to logout from target
  block/iscsi: change all iscsilun properties from uint8_t to bool
  block/iscsi: rename iscsi_write_protected and let it return void
  block/iscsi: store DPOFUA bit from the modesense command
  block/iscsi: optimize WRITE10/16 if cache.writeback is not set
  block/iscsi: increase retry count
  block/iscsi: bump libiscsi requirement to 1.10.0
  block/iscsi: handle SCSI_STATUS_TASK_SET_FULL
  block/iscsi: bump year in copyright notice
  block/iscsi: use the allocationmap also if cache.direct=on

 block/iscsi.c | 58 +++---
 configure |  6 +++---
 2 files changed, 42 insertions(+), 22 deletions(-)

-- 
1.9.1




[Qemu-block] [PATCH for-2.4 07/10] block/iscsi: bump libiscsi requirement to 1.10.0

2015-04-16 Thread Peter Lieven
We need this to support SCSI_STATUS_TASK_SET_FULL.

Signed-off-by: Peter Lieven 
---
 configure | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 6969f6f..f73b8d0 100755
--- a/configure
+++ b/configure
@@ -3630,15 +3630,15 @@ if compile_prog "" "" ; then
 fi
 
 ##
-# Do we have libiscsi >= 1.9.0
+# Do we have libiscsi >= 1.10.0
 if test "$libiscsi" != "no" ; then
-  if $pkg_config --atleast-version=1.9.0 libiscsi; then
+  if $pkg_config --atleast-version=1.10.0 libiscsi; then
 libiscsi="yes"
 libiscsi_cflags=$($pkg_config --cflags libiscsi)
 libiscsi_libs=$($pkg_config --libs libiscsi)
   else
 if test "$libiscsi" = "yes" ; then
-  feature_not_found "libiscsi" "Install libiscsi >= 1.9.0"
+  feature_not_found "libiscsi" "Install libiscsi >= 1.10.0"
 fi
 libiscsi="no"
   fi
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 08/10] block/iscsi: handle SCSI_STATUS_TASK_SET_FULL

2015-04-16 Thread Peter Lieven
a target may issue a SCSI_STATUS_TASK_SET_FULL status
if there is more than one "BUSY" command queued already.

Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index a4902ea..3d0ffeb 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -185,10 +185,10 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int 
status,
 iTask->do_retry = 1;
 goto out;
 }
-if (status == SCSI_STATUS_BUSY) {
+if (status == SCSI_STATUS_BUSY || status == SCSI_STATUS_TASK_SET_FULL) {
 unsigned retry_time =
 exp_random(iscsi_retry_times[iTask->retries - 1]);
-error_report("iSCSI Busy (retry #%u in %u ms): %s",
+error_report("iSCSI Busy/TaskSetFull (retry #%u in %u ms): %s",
  iTask->retries, retry_time,
  iscsi_get_error(iscsi));
 aio_timer_init(iTask->iscsilun->aio_context,
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 10/10] block/iscsi: use the allocationmap also if cache.direct=on

2015-04-16 Thread Peter Lieven
the allocationmap is only a hint. The driver always
double-checks that blocks marked unallocated in the cache are
still unallocated before taking the fast path and returning zeroes.
So using the allocationmap is migration safe and can
also be enabled with cache.direct=on.

Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 04c1309..0737354 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1495,7 +1495,7 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
 iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 
1024) {
 iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
  iscsilun->block_size) >> BDRV_SECTOR_BITS;
-if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
+if (iscsilun->lbprz) {
 iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
 if (iscsilun->allocationmap == NULL) {
 ret = -ENOMEM;
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 06/10] block/iscsi: increase retry count

2015-04-16 Thread Peter Lieven
The idea is that a command in a BUSY condition is retried for up to
approx. 60 seconds before it is failed. This should
be far higher than any command timeout in the guest.

Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 7fb04d7..a4902ea 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -102,7 +102,7 @@ typedef struct IscsiAIOCB {
 #define NOP_INTERVAL 5000
 #define MAX_NOP_FAILURES 3
 #define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times)
-static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048};
+static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768};
 
 /* this threshold is a trade-off knob to choose between
  * the potential additional overhead of an extra GET_LBA_STATUS request
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 01/10] block/iscsi: do not forget to logout from target

2015-04-16 Thread Peter Lieven
We actually were always impolitely dropping the connection and
not cleanly logging out.

CC: qemu-sta...@nongnu.org
Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index ba33290..be8af46 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1501,6 +1501,9 @@ out:
 
 if (ret) {
 if (iscsi != NULL) {
+if (iscsi_is_logged_in(iscsi)) {
+iscsi_logout_sync(iscsi);
+}
 iscsi_destroy_context(iscsi);
 }
 memset(iscsilun, 0, sizeof(IscsiLun));
@@ -1514,6 +1517,9 @@ static void iscsi_close(BlockDriverState *bs)
 struct iscsi_context *iscsi = iscsilun->iscsi;
 
 iscsi_detach_aio_context(bs);
+if (iscsi_is_logged_in(iscsi)) {
+iscsi_logout_sync(iscsi);
+}
 iscsi_destroy_context(iscsi);
 g_free(iscsilun->zeroblock);
 g_free(iscsilun->allocationmap);
-- 
1.9.1




[Qemu-block] [PATCH for-2.4 05/10] block/iscsi: optimize WRITE10/16 if cache.writeback is not set

2015-04-16 Thread Peter Lieven
SCSI allows telling the target to not return from a write command
until the data is written to the disk. Use this so-called FUA
bit, if it is supported, to optimize WRITE commands when writeback is
not allowed.

In this case qemu always issues a WRITE followed by a FLUSH. This
is 2 round trip times. If we set the FUA bit we can skip the
following FLUSH.

Signed-off-by: Peter Lieven 
---
 block/iscsi.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 237faa1..7fb04d7 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -68,6 +68,7 @@ typedef struct IscsiLun {
 bool lbprz;
 bool dpofua;
 bool has_write_same;
+bool force_next_flush;
 } IscsiLun;
 
 typedef struct IscsiTask {
@@ -370,6 +371,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState 
*bs,
 struct IscsiTask iTask;
 uint64_t lba;
 uint32_t num_sectors;
+int fua = iscsilun->dpofua && !bs->enable_write_cache;
 
 if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
 return -EINVAL;
@@ -388,12 +390,12 @@ retry:
 if (iscsilun->use_16_for_rw) {
 iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
 NULL, num_sectors * 
iscsilun->block_size,
-iscsilun->block_size, 0, 0, 0, 0, 0,
+iscsilun->block_size, 0, 0, fua, 0, 0,
 iscsi_co_generic_cb, &iTask);
 } else {
 iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba,
 NULL, num_sectors * 
iscsilun->block_size,
-iscsilun->block_size, 0, 0, 0, 0, 0,
+iscsilun->block_size, 0, 0, fua, 0, 0,
 iscsi_co_generic_cb, &iTask);
 }
 if (iTask.task == NULL) {
@@ -621,6 +623,11 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState 
*bs)
 return 0;
 }
 
+if (iscsilun->dpofua && !bs->enable_write_cache &&
+!iscsilun->force_next_flush) {
+return 0;
+}
+
 iscsi_co_init_iscsitask(iscsilun, &iTask);
 
 retry:
@@ -648,6 +655,8 @@ retry:
 return -EIO;
 }
 
+iscsilun->force_next_flush = false;
+
 return 0;
 }
 
@@ -969,6 +978,8 @@ retry:
 iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
 }
 
+iscsilun->force_next_flush = true;
+
 return 0;
 }
 
-- 
1.9.1



