* Pavel Butsykin (pbutsy...@virtuozzo.com) wrote: > This feature frees the migrated memory on the source during postcopy-ram > migration. In the second step of postcopy-ram migration when the source vm > is put on pause we can free unnecessary memory. It will allow, in particular, > to start relaxing the memory stress on the source host in a load-balancing > scenario. > > Signed-off-by: Pavel Butsykin <pbutsy...@virtuozzo.com>
Hi Pavel, Firstly a higher-level thing, can we use a different word than 'discard' because I already use 'discard' in postcopy to mean the request from the source to the destination to discard pages that are redirtied. I suggest 'release-ram' to just pick a different word that means the same thing. Also, see patchew's build error it spotted. > --- > include/migration/migration.h | 1 + > include/migration/qemu-file.h | 3 ++- > migration/migration.c | 9 +++++++ > migration/qemu-file.c | 59 > ++++++++++++++++++++++++++++++++++++++----- > migration/ram.c | 24 +++++++++++++++++- > qapi-schema.json | 5 +++- > 6 files changed, 91 insertions(+), 10 deletions(-) > > diff --git a/include/migration/migration.h b/include/migration/migration.h > index c309d23370..d7bd404365 100644 > --- a/include/migration/migration.h > +++ b/include/migration/migration.h > @@ -294,6 +294,7 @@ void migrate_add_blocker(Error *reason); > */ > void migrate_del_blocker(Error *reason); > > +bool migrate_discard_ram(void); > bool migrate_postcopy_ram(void); > bool migrate_zero_blocks(void); > > diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h > index abedd466c9..0cd648a733 100644 > --- a/include/migration/qemu-file.h > +++ b/include/migration/qemu-file.h > @@ -132,7 +132,8 @@ void qemu_put_byte(QEMUFile *f, int v); > * put_buffer without copying the buffer. > * The buffer should be available till it is sent asynchronously. 
> */ > -void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size); > +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, > + bool may_free); > bool qemu_file_mode_is_not_valid(const char *mode); > bool qemu_file_is_writable(QEMUFile *f); > > diff --git a/migration/migration.c b/migration/migration.c > index f498ab84f2..391db6f28b 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1251,6 +1251,15 @@ void qmp_migrate_set_downtime(double value, Error > **errp) > qmp_migrate_set_parameters(&p, errp); > } > > +bool migrate_discard_ram(void) > +{ > + MigrationState *s; > + > + s = migrate_get_current(); > + > + return s->enabled_capabilities[MIGRATION_CAPABILITY_DISCARD_RAM]; > +} > + > bool migrate_postcopy_ram(void) > { > MigrationState *s; > diff --git a/migration/qemu-file.c b/migration/qemu-file.c > index e9fae31158..f85a0ecd9e 100644 > --- a/migration/qemu-file.c > +++ b/migration/qemu-file.c > @@ -30,6 +30,7 @@ > #include "qemu/coroutine.h" > #include "migration/migration.h" > #include "migration/qemu-file.h" > +#include "sysemu/sysemu.h" > #include "trace.h" > > #define IO_BUF_SIZE 32768 > @@ -49,6 +50,7 @@ struct QEMUFile { > int buf_size; /* 0 when writing */ > uint8_t buf[IO_BUF_SIZE]; > > + DECLARE_BITMAP(may_free, MAX_IOV_SIZE); > struct iovec iov[MAX_IOV_SIZE]; > unsigned int iovcnt; > > @@ -132,6 +134,40 @@ bool qemu_file_is_writable(QEMUFile *f) > return f->ops->writev_buffer; > } > > +static void qemu_iovec_discard_ram(QEMUFile *f) > +{ > + struct iovec iov; > + unsigned long idx; > + > + if (!migrate_discard_ram() || !runstate_check(RUN_STATE_FINISH_MIGRATE)) > { > + return; > + } Can we split this out into a separate function please; so qemu_iovec_discard_ram always does it, and then you have something that only calls it if enabled. 
> + idx = find_next_bit(f->may_free, f->iovcnt, 0); > + if (idx >= f->iovcnt) { > + return; > + } > + iov = f->iov[idx]; > + > + while ((idx = find_next_bit(f->may_free, f->iovcnt, idx + 1)) < > f->iovcnt) { > + /* check for adjacent buffer and coalesce them */ > + if (iov.iov_base + iov.iov_len == f->iov[idx].iov_base) { > + iov.iov_len += f->iov[idx].iov_len; > + continue; > + } > + if (qemu_madvise(iov.iov_base, iov.iov_len, QEMU_MADV_DONTNEED) < 0) > { > + error_report("migrate: madvise DONTNEED failed %p %ld: %s", > + iov.iov_base, iov.iov_len, strerror(errno)); > + } > + iov = f->iov[idx]; Can you add some comments in here please; it took me a while to understand it; I think what you're doing is that the madvise in the loop is called for the last iov within a continuous range and then you fall through to deal with the last one. Also, see my 'postcopy: enhance ram_discard_range for hugepages' - these madvise's get a bit more complex with hugepage. > + } > + if (qemu_madvise(iov.iov_base, iov.iov_len, QEMU_MADV_DONTNEED) < 0) { > + error_report("migrate: madvise DONTNEED failed %p %ld: %s", > + iov.iov_base, iov.iov_len, strerror(errno)); > + } > + memset(f->may_free, 0, sizeof(f->may_free)); > +} > + > /** > * Flushes QEMUFile buffer > * > @@ -151,6 +187,8 @@ void qemu_fflush(QEMUFile *f) > if (f->iovcnt > 0) { > expect = iov_size(f->iov, f->iovcnt); > ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos); > + > + qemu_iovec_discard_ram(f); > } > > if (ret >= 0) { > @@ -304,13 +342,19 @@ int qemu_fclose(QEMUFile *f) > return ret; > } > > -static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size) > +static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size, > + bool may_free) > { > /* check for adjacent buffer and coalesce them */ > if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base + > - f->iov[f->iovcnt - 1].iov_len) { > + f->iov[f->iovcnt - 1].iov_len && > + may_free == test_bit(f->iovcnt - 1, f->may_free)) > + { 
> f->iov[f->iovcnt - 1].iov_len += size; > } else { > + if (may_free) { > + set_bit(f->iovcnt, f->may_free); > + } > f->iov[f->iovcnt].iov_base = (uint8_t *)buf; > f->iov[f->iovcnt++].iov_len = size; > } > @@ -320,14 +364,15 @@ static void add_to_iovec(QEMUFile *f, const uint8_t > *buf, size_t size) > } > } > > -void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size) > +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, > + bool may_free) > { > if (f->last_error) { > return; > } > > f->bytes_xfer += size; > - add_to_iovec(f, buf, size); > + add_to_iovec(f, buf, size, may_free); > } > > void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) > @@ -345,7 +390,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, > size_t size) > } > memcpy(f->buf + f->buf_index, buf, l); > f->bytes_xfer += l; > - add_to_iovec(f, f->buf + f->buf_index, l); > + add_to_iovec(f, f->buf + f->buf_index, l, false); > f->buf_index += l; > if (f->buf_index == IO_BUF_SIZE) { > qemu_fflush(f); > @@ -366,7 +411,7 @@ void qemu_put_byte(QEMUFile *f, int v) > > f->buf[f->buf_index] = v; > f->bytes_xfer++; > - add_to_iovec(f, f->buf + f->buf_index, 1); > + add_to_iovec(f, f->buf + f->buf_index, 1, false); > f->buf_index++; > if (f->buf_index == IO_BUF_SIZE) { > qemu_fflush(f); > @@ -647,7 +692,7 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const > uint8_t *p, size_t size, > } > qemu_put_be32(f, blen); > if (f->ops->writev_buffer) { > - add_to_iovec(f, f->buf + f->buf_index, blen); > + add_to_iovec(f, f->buf + f->buf_index, blen, false); > } > f->buf_index += blen; > if (f->buf_index == IO_BUF_SIZE) { > diff --git a/migration/ram.c b/migration/ram.c > index a1c8089010..b0322a0b5c 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -27,6 +27,7 @@ > */ > #include "qemu/osdep.h" > #include "qemu-common.h" > +#include "sysemu/sysemu.h" > #include "cpu.h" > #include <zlib.h> > #include "qapi-event.h" > @@ -713,6 +714,18 @@ static int 
save_zero_page(QEMUFile *f, RAMBlock *block, > ram_addr_t offset, > return pages; > } > > +static void ram_discard_page(uint8_t *addr, int pages) > +{ > + if (!migrate_discard_ram() || !runstate_check(RUN_STATE_FINISH_MIGRATE)) > { > + return; > + } > + > + if (qemu_madvise(addr, pages << TARGET_PAGE_BITS, QEMU_MADV_DONTNEED) < > 0) { > + error_report("migrate: madvise DONTNEED failed %p %d: %s", > + addr, pages << TARGET_PAGE_BITS, strerror(errno)); > + } > +} Would it make sense to pick up my 'Fold postcopy_ram_discard_range into ram_discard_range' patch and then just call that wrapped with your test? Again, this is another reason to change the word 'discard'. > /** > * ram_save_page: Send the given page to the stream > * > @@ -772,6 +785,7 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus > *pss, > * page would be stale > */ > xbzrle_cache_zero_page(current_addr); > + ram_discard_page(p, pages); > } else if (!ram_bulk_stage && > !migration_in_postcopy(migrate_get_current()) && > migrate_use_xbzrle()) { > @@ -791,9 +805,11 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus > *pss, > *bytes_transferred += save_page_header(f, block, > offset | RAM_SAVE_FLAG_PAGE); > if (send_async) { > - qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); > + qemu_put_buffer_async( > + f, p, TARGET_PAGE_SIZE, migrate_discard_ram()); > } else { > qemu_put_buffer(f, p, TARGET_PAGE_SIZE); > + ram_discard_page(p, 1); Does this case ever happen? Isn't the only time we hit non-async if xbzrle is enabled, and then 'p' can be a pointer into the xbzrle cache? (and we don't support xbzrle with postcopy). 
> } > *bytes_transferred += TARGET_PAGE_SIZE; > pages = 1; > @@ -821,6 +837,7 @@ static int do_compress_ram_page(QEMUFile *f, RAMBlock > *block, > error_report("compressed data failed!"); > } else { > bytes_sent += blen; > + ram_discard_page(p, 1); > } > > return bytes_sent; > @@ -959,12 +976,17 @@ static int ram_save_compressed_page(QEMUFile *f, > PageSearchStatus *pss, > error_report("compressed data failed!"); > } > } > + if (pages > 0) { > + ram_discard_page(p, pages); > + } > } else { > offset |= RAM_SAVE_FLAG_CONTINUE; > pages = save_zero_page(f, block, offset, p, bytes_transferred); > if (pages == -1) { > pages = compress_page_with_multi_thread(f, block, offset, > bytes_transferred); > + } else { > + ram_discard_page(p, pages); > } > } Not that we support compressed pages on postcopy yet (destination code needs writing). > } > diff --git a/qapi-schema.json b/qapi-schema.json > index ce20f16757..f02b434765 100644 > --- a/qapi-schema.json > +++ b/qapi-schema.json > @@ -588,11 +588,14 @@ > # side, this process is called COarse-Grain LOck Stepping (COLO) for > # Non-stop Service. (since 2.8) > # > +# @discard-ram: if enabled, qemu will free the migrated ram pages on the > source > +# during postcopy-ram migration. (since 2.9) > +# > # Since: 1.2 > ## > { 'enum': 'MigrationCapability', > 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', > - 'compress', 'events', 'postcopy-ram', 'x-colo'] } > + 'compress', 'events', 'postcopy-ram', 'x-colo', 'discard-ram'] } Interestingly, it's not directly tied to postcopy, but I guess postcopy is the only time you get the bulk of pages sent in FINISH_MIGRATE mode; if enabled it would trigger for the final stage of a precopy migration. I suppose we have to worry about a case like: postcopy is enabled discard-ram is enabled migration starts migration goes into FINISH_MIGRATE very quickly in precopy discard-ram throws source pages away. 
migration fails (some screwup as it starts the destination);
management tells the source to carry on (after all, it never actually entered postcopy)
-> the source is now using some emptied RAM.

The fix for that would be to tie it to postcopy.

Dave

> 
> ##
> # @MigrationCapabilityStatus:
> -- 
> 2.11.0
> 
> 

-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK