On Thursday, March 5, 2020 11:29:42 AM -03 David Hildenbrand wrote: > Let's implement ram_block_resized(), allowing resizeable mappings. > > For resizeable mappings, we reserve $max_size IOVA address space, but only > map $size of it. When resizing, unmap the old part and remap the new > part. We'll need e.g., new ioctl to do this atomically (e.g., to resize > while the guest is running). > > Right now, we only resize RAM blocks during incoming migration (when > syncing RAM block sizes during the precopy phase) or after guest > resets when building acpi tables. Any future user of resizeable RAM has to > be aware that vfio has to be treated with care. > > Reviewed-by: Peter Xu <pet...@redhat.com> > Cc: Richard Henderson <r...@twiddle.net> > Cc: Paolo Bonzini <pbonz...@redhat.com> > Cc: Eduardo Habkost <ehabk...@redhat.com> > Cc: Marcel Apfelbaum <marcel.apfelb...@gmail.com> > Cc: Alex Williamson <alex.william...@redhat.com> > Cc: Stefan Hajnoczi <stefa...@redhat.com> > Cc: "Dr. David Alan Gilbert" <dgilb...@redhat.com> > Signed-off-by: David Hildenbrand <da...@redhat.com> > ---
Acked-by: Murilo Opsfelder Araujo <muri...@linux.ibm.com> > util/trace-events | 7 ++-- > util/vfio-helpers.c | 95 +++++++++++++++++++++++++++++++++++++++------ > 2 files changed, 88 insertions(+), 14 deletions(-) > > diff --git a/util/trace-events b/util/trace-events > index 83b6639018..a4d39eca5e 100644 > --- a/util/trace-events > +++ b/util/trace-events > @@ -74,10 +74,11 @@ qemu_mutex_unlock(void *mutex, const char *file, const > int line) "released mutex > > # vfio-helpers.c > qemu_vfio_dma_reset_temporary(void *s) "s %p" > -qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size > 0x%zx" -qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p > host %p size 0x%zx" +qemu_vfio_ram_block_added(void *s, void *p, size_t > size, size_t max_size) "s %p host %p size 0x%zx max_size 0x%zx" > +qemu_vfio_ram_block_removed(void *s, void *p, size_t size, size_t > max_size) "s %p host %p size 0x%zx max_size 0x%zx" > +qemu_vfio_ram_block_resized(void *s, void *p, size_t old_size, size_t > new_sizze) "s %p host %p old_size 0x%zx new_size 0x%zx" > qemu_vfio_find_mapping(void *s, void *p) "s %p host %p" > -qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t > iova) "s %p host %p size %zu index %d iova 0x%"PRIx64 > +qemu_vfio_new_mapping(void *s, void *host, size_t size, size_t max_size, > int index, uint64_t iova) "s %p host %p size %zu max_size %zu index %d iova > 0x%"PRIx64 qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t > iova) "s %p host %p size %zu iova 0x%"PRIx64 qemu_vfio_dma_map(void *s, > void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size > %zu temporary %d iova %p" qemu_vfio_dma_unmap(void *s, void *host) "s %p > host %p" > diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c > index f0c77f0d69..789faf38bd 100644 > --- a/util/vfio-helpers.c > +++ b/util/vfio-helpers.c > @@ -36,6 +36,7 @@ typedef struct { > /* Page aligned addr. */ > void *host; > size_t size; > + size_t max_size; > uint64_t iova; > } IOVAMapping; > > @@ -372,14 +373,20 @@ fail_container: > return ret; > } > > +static int qemu_vfio_dma_map_resizeable(QEMUVFIOState *s, void *host, > + size_t size, size_t max_size, > + bool temporary, uint64_t *iova); > +static void qemu_vfio_dma_map_resize(QEMUVFIOState *s, void *host, > + size_t old_size, size_t new_size); > + > static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host, > size_t size, size_t max_size) > { > QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); > int ret; > > - trace_qemu_vfio_ram_block_added(s, host, max_size); > - ret = qemu_vfio_dma_map(s, host, max_size, false, NULL); > + trace_qemu_vfio_ram_block_added(s, host, size, max_size); > + ret = qemu_vfio_dma_map_resizeable(s, host, size, max_size, false, > NULL); if (ret) { > error_report("qemu_vfio_dma_map(%p, %zu) failed: %s", host, > max_size, strerror(-ret)); > @@ -391,16 +398,28 @@ static void > qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host, { > QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); > if (host) { > - trace_qemu_vfio_ram_block_removed(s, host, max_size); > + trace_qemu_vfio_ram_block_removed(s, host, size, max_size); > qemu_vfio_dma_unmap(s, host); > } > } > > +static void qemu_vfio_ram_block_resized(RAMBlockNotifier *n, void *host, > + size_t old_size, size_t new_size) > +{ > + QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); > + > + if (host) { > + trace_qemu_vfio_ram_block_resized(s, host, old_size, new_size); > + qemu_vfio_dma_map_resize(s, host, old_size, new_size); > + } > +} > + > static void qemu_vfio_open_common(QEMUVFIOState *s) > { > qemu_mutex_init(&s->lock); > s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added; > s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed; > + s->ram_notifier.ram_block_resized = qemu_vfio_ram_block_resized; > s->low_water_mark = QEMU_VFIO_IOVA_MIN; > s->high_water_mark = QEMU_VFIO_IOVA_MAX; > ram_block_notifier_add(&s->ram_notifier); > @@ -495,16 +514,23 @@ static IOVAMapping > *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host, */ > static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s, > void *host, size_t size, > - int index, uint64_t iova) > + size_t max_size, int index, > + uint64_t iova) > { > + const IOVAMapping m = { > + .host = host, > + .size = size, > + .max_size = max_size, > + .iova = iova, > + }; > int shift; > - IOVAMapping m = {.host = host, .size = size, .iova = iova}; > IOVAMapping *insert; > > assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size)); > + assert(size <= max_size); > assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size)); > assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size)); > - trace_qemu_vfio_new_mapping(s, host, size, index, iova); > + trace_qemu_vfio_new_mapping(s, host, size, max_size, index, iova); > > assert(index >= 0); > s->nr_mappings++; > @@ -597,9 +623,14 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) > * the result in @iova if not NULL. The caller need to make sure the area is > * aligned to page size, and mustn't overlap with existing mapping areas > (split * mapping status within this area is not allowed). > + * > + * If size < max_size, a region of max_size in IOVA address is reserved, > such + * that the mapping can later be resized. Resizeable mappings are > only allowed + * for !temporary mappings. > */ > -int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, > - bool temporary, uint64_t *iova) > +static int qemu_vfio_dma_map_resizeable(QEMUVFIOState *s, void *host, > + size_t size, size_t max_size, > + bool temporary, uint64_t *iova) > { > int ret = 0; > int index; > @@ -608,19 +639,24 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, > size_t size, > > assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size)); > assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size)); > + assert(QEMU_IS_ALIGNED(max_size, qemu_real_host_page_size)); > + assert(size == max_size || !temporary); > + assert(size <= max_size); > + > trace_qemu_vfio_dma_map(s, host, size, temporary, iova); > qemu_mutex_lock(&s->lock); > mapping = qemu_vfio_find_mapping(s, host, &index); > if (mapping) { > iova0 = mapping->iova + ((uint8_t *)host - (uint8_t > *)mapping->host); } else { > - if (s->high_water_mark - s->low_water_mark + 1 < size) { > + if (s->high_water_mark - s->low_water_mark + 1 < max_size) { > ret = -ENOMEM; > goto out; > } > if (!temporary) { > iova0 = s->low_water_mark; > - mapping = qemu_vfio_add_mapping(s, host, size, index + 1, > iova0); + mapping = qemu_vfio_add_mapping(s, host, size, > max_size, index + 1, + iova0); > if (!mapping) { > ret = -ENOMEM; > goto out; > @@ -631,7 +667,7 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, > size_t size, qemu_vfio_remove_mapping(s, mapping); > goto out; > } > - s->low_water_mark += size; > + s->low_water_mark += max_size; > qemu_vfio_dump_mappings(s); > } else { > iova0 = s->high_water_mark - size; > @@ -650,6 +686,12 @@ out: > return ret; > } > > +int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, > + bool temporary, uint64_t *iova) > +{ > + return qemu_vfio_dma_map_resizeable(s, host, size, size, temporary, > iova); +} > + > /* Reset the high watermark and free all "temporary" mappings. */ > int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s) > { > @@ -694,6 +736,37 @@ out: > qemu_mutex_unlock(&s->lock); > } > > +static void qemu_vfio_dma_map_resize(QEMUVFIOState *s, void *host, > + size_t old_size, size_t new_size) > +{ > + IOVAMapping *m; > + int index = 0; > + > + qemu_mutex_lock(&s->lock); > + m = qemu_vfio_find_mapping(s, host, &index); > + if (!m) { > + return; > + } > + assert(m->size == old_size); > + assert(new_size <= m->max_size); > + > + /* > + * For now, we must unmap the whole mapped range first and remap with > + * the new size. The reason is that VFIO_IOMMU_UNMAP_DMA might fail > + * when partially unmapping previous mappings. Although we could add > + * new mappings to extend the old range, we won't able to always > + * shrink. The side effect is that it's never safe to resize during VM > + * execution and we'll e.g., need a new IOCTL to make this work. > + */ > + qemu_vfio_undo_mapping(s, m->iova, m->size); > + qemu_vfio_do_mapping(s, host, m->iova, new_size); > + > + m->size = new_size; > + assert(qemu_vfio_verify_mappings(s)); > + > + qemu_mutex_unlock(&s->lock); > +} > + > static void qemu_vfio_reset(QEMUVFIOState *s) > { > ioctl(s->device, VFIO_DEVICE_RESET); -- Murilo