On 06/03/2023 18:15, Alex Williamson wrote: > On Sat, 4 Mar 2023 01:43:37 +0000 > Joao Martins <joao.m.mart...@oracle.com> wrote: > >> According to the device DMA logging uAPI, IOVA ranges to be logged by >> the device must be provided all at once upon DMA logging start. >> >> As preparation for the following patches which will add device dirty >> page tracking, keep a record of all DMA mapped IOVA ranges so later they >> can be used for DMA logging start. >> >> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked. >> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping. > > Commit log is outdated for this version. > I will remove the paragraph. I can't mention vIOMMU usage blocks migration as I effectively do that later in the series.
>> Signed-off-by: Avihai Horon <avih...@nvidia.com> >> Signed-off-by: Joao Martins <joao.m.mart...@oracle.com> >> --- >> hw/vfio/common.c | 84 +++++++++++++++++++++++++++++++++++ >> hw/vfio/trace-events | 1 + >> include/hw/vfio/vfio-common.h | 11 +++++ >> 3 files changed, 96 insertions(+) >> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index ed908e303dbb..d84e5fd86bb4 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -44,6 +44,7 @@ >> #include "migration/blocker.h" >> #include "migration/qemu-file.h" >> #include "sysemu/tpm.h" >> +#include "qemu/iova-tree.h" > > Unnecessary > True, I had it removed for v4 as Avihai pointed that out to me too offlist. Same for the other one down below. >> >> VFIOGroupList vfio_group_list = >> QLIST_HEAD_INITIALIZER(vfio_group_list); >> @@ -1313,11 +1314,94 @@ static int >> vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) >> return ret; >> } >> >> +/* >> + * Called for the dirty tracking memory listener to calculate the iova/end >> + * for a given memory region section. The checks here, replicate the logic >> + * in vfio_listener_region_{add,del}() used for the same purpose. And thus >> + * both listener should be kept in sync. >> + */ >> +static bool vfio_get_section_iova_range(VFIOContainer *container, >> + MemoryRegionSection *section, >> + hwaddr *out_iova, hwaddr *out_end) >> +{ >> + Int128 llend; >> + hwaddr iova; >> + >> + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); >> + llend = int128_make64(section->offset_within_address_space); >> + llend = int128_add(llend, section->size); >> + llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); >> + >> + if (int128_ge(int128_make64(iova), llend)) { >> + return false; >> + } >> + >> + *out_iova = iova; >> + *out_end = int128_get64(llend) - 1; >> + return true; >> +} > > Not sure why this isn't turned into a helper here to avoid the issue > noted in the comment. The reason I didn't directly reused, as the calculation happens in different places in the existing listener. But I noticed now that I am confusing with llsize (in the old checks I have now removed). @end is not used in the check that preceeds it, so I am moving this calculation into a helper. Presumably I have a new preceeding patch where I have this vfio_get_section_iova_range() added. and this patch just uses it. > Also why do both of the existing listener > implementations resolve the end address as: > > int128_get64(int128_sub(llend, int128_one())); > > While here we use: > > int128_get64(llend) - 1; > > We're already out of sync. > True >> + >> +static void vfio_dirty_tracking_update(MemoryListener *listener, >> + MemoryRegionSection *section) >> +{ >> + VFIOContainer *container = container_of(listener, VFIOContainer, >> + tracking_listener); >> + VFIODirtyTrackingRange *range = &container->tracking_range; >> + hwaddr max32 = (1ULL << 32) - 1ULL; > > UINT32_MAX > /me nods >> + hwaddr iova, end; >> + >> + if (!vfio_listener_valid_section(section) || >> + !vfio_get_section_iova_range(container, section, &iova, &end)) { >> + return; >> + } >> + >> + WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) { >> + if (iova < max32 && end <= max32) { >> + if (range->min32 > iova) { >> + range->min32 = iova; >> + } >> + if (range->max32 < end) { >> + range->max32 = end; >> + } >> + trace_vfio_device_dirty_tracking_update(iova, end, >> + range->min32, range->max32); >> + } else { >> + if (!range->min64 || range->min64 > iova) { >> + range->min64 = iova; >> + } > > I think this improperly handles a range starting at zero, min64 should > be UINT64_MAX initially. For example, if we have ranges 0-8GB and > 12-16GB, this effectively ignores the first range. Likewise I think > range->min32 has a similar problem, it's initialized to zero, it will > never be updated to match a non-zero lowest range. It needs to be > initialized to UINT32_MAX. > Yes, let me switch to that. I'll place the min64/min32 to UINT64_MAX/UINT32_MAX in the place where we initialize the state for the dma tracking listener. > A comment describing the purpose of the 32/64 split tracking would be > useful too. > Yes, definitely e.g. /* * The address space passed to the dirty tracker is reduced to two ranges: one * for 32-bit DMA ranges, and another one for 64-bit DMA ranges. The underlying * reports of dirty will query a sub-interval of each of these ranges. The * purpose of the dual range handling is to handle known cases of big holes in * the address space, like the x86 AMD 1T hole. The alternative would be an * IOVATree but that has a much bigger runtime overhead and unnecessary * complexity. */ >> + if (range->max64 < end) { >> + range->max64 = end; >> + } >> + trace_vfio_device_dirty_tracking_update(iova, end, >> + range->min64, range->max64); >> + } >> + } >> + return; >> +} >> + >> +static const MemoryListener vfio_dirty_tracking_listener = { >> + .name = "vfio-tracking", >> + .region_add = vfio_dirty_tracking_update, >> +}; >> + >> +static void vfio_dirty_tracking_init(VFIOContainer *container) >> +{ >> + memset(&container->tracking_range, 0, >> sizeof(container->tracking_range)); >> + qemu_mutex_init(&container->tracking_mutex); > > As noted in other thread, this mutex seems unnecessary. > Already deleted it for v4. > The listener needs to be embedded in an object, but it doesn't need to > be the container. Couldn't we create: > > typedef struct VFIODirtyRanges { > VFIOContainer *container; > VFIODirtyTrackingRange ranges; > MemoryListener listener; > } VFIODirectRanges; > > For use here? Caller could provide VFIODirtyTrackingRange pointer for > the resulting ranges, which then gets passed to > vfio_device_feature_dma_logging_start_create() Oh, that would be so much cleaner, yes. Will switch to that. > >> + container->tracking_listener = vfio_dirty_tracking_listener; >> + memory_listener_register(&container->tracking_listener, >> + container->space->as); >> + memory_listener_unregister(&container->tracking_listener); > > It's sufficiently subtle that the listener callback is synchronous and > we're done with it here for a comment. > Will add a comment e.g.: /* * The memory listener is synchronous, and used to calculate the range to dirty * tracking. Unregister it after we are done as we are not interesting in any * follow-up updates. */ >> + qemu_mutex_destroy(&container->tracking_mutex); >> +} >> + >> static void vfio_listener_log_global_start(MemoryListener *listener) >> { >> VFIOContainer *container = container_of(listener, VFIOContainer, >> listener); >> int ret; >> >> + vfio_dirty_tracking_init(container); >> + >> ret = vfio_set_dirty_page_tracking(container, true); >> if (ret) { >> vfio_set_migration_error(ret); >> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >> index 669d9fe07cd9..d97a6de17921 100644 >> --- a/hw/vfio/trace-events >> +++ b/hw/vfio/trace-events >> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t >> iova, uint64_t offset_wi >> vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, >> uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" >> size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" >> vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING >> region_del 0x%"PRIx64" - 0x%"PRIx64 >> vfio_listener_region_del(uint64_t start, uint64_t end) "region_del >> 0x%"PRIx64" - 0x%"PRIx64 >> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t >> min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" >> - 0x%"PRIx64"]" >> vfio_disconnect_container(int fd) "close container->fd=%d" >> vfio_put_group(int fd) "close group->fd=%d" >> vfio_get_device(const char * name, unsigned int flags, unsigned int >> num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: >> %u" >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> index 87524c64a443..96791add2719 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -23,6 +23,7 @@ >> >> #include "exec/memory.h" >> #include "qemu/queue.h" >> +#include "qemu/iova-tree.h" > > Unused. > Had it removed already preemptively >> #include "qemu/notify.h" >> #include "ui/console.h" >> #include "hw/display/ramfb.h" >> @@ -68,6 +69,13 @@ typedef struct VFIOMigration { >> size_t data_buffer_size; >> } VFIOMigration; >> >> +typedef struct VFIODirtyTrackingRange { >> + hwaddr min32; >> + hwaddr max32; >> + hwaddr min64; >> + hwaddr max64; >> +} VFIODirtyTrackingRange; >> + >> typedef struct VFIOAddressSpace { >> AddressSpace *as; >> QLIST_HEAD(, VFIOContainer) containers; >> @@ -89,6 +97,9 @@ typedef struct VFIOContainer { >> uint64_t max_dirty_bitmap_size; >> unsigned long pgsizes; >> unsigned int dma_max_mappings; >> + VFIODirtyTrackingRange tracking_range; >> + QemuMutex tracking_mutex; >> + MemoryListener tracking_listener; > > Let's make this unnecessary too. Thanks, > Got it.