On 07/03/2023 02:57, Alex Williamson wrote: > On Tue, 7 Mar 2023 02:02:52 +0000 > Joao Martins <joao.m.mart...@oracle.com> wrote: > >> According to the device DMA logging uAPI, IOVA ranges to be logged by >> the device must be provided all at once upon DMA logging start. >> >> As preparation for the following patches which will add device dirty >> page tracking, keep a record of all DMA mapped IOVA ranges so later they >> can be used for DMA logging start. >> >> Signed-off-by: Avihai Horon <avih...@nvidia.com> >> Signed-off-by: Joao Martins <joao.m.mart...@oracle.com> >> --- >> hw/vfio/common.c | 76 +++++++++++++++++++++++++++++++++++ >> hw/vfio/trace-events | 1 + >> include/hw/vfio/vfio-common.h | 13 ++++++ >> 3 files changed, 90 insertions(+) >> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 3a6491dbc523..a9b1fc999121 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -1334,11 +1334,87 @@ static int >> vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) >> return ret; >> } >> >> +static void vfio_dirty_tracking_update(MemoryListener *listener, >> + MemoryRegionSection *section) >> +{ >> + VFIODirtyRanges *dirty = container_of(listener, VFIODirtyRanges, >> listener); >> + VFIODirtyTrackingRange *range = &dirty->ranges; >> + hwaddr max32 = UINT32_MAX - 1ULL; > > The -1 is wrong here, UINT32_MAX is (2^32 - 1) > Ugh, what a distraction.
The reason it worked in my tests is because there's a whole at the boundary, being off by one wouldn't change the end range. >> + hwaddr iova, end; >> + >> + if (!vfio_listener_valid_section(section) || >> + !vfio_get_section_iova_range(dirty->container, section, >> + &iova, &end, NULL)) { >> + return; >> + } >> + >> + /* >> + * The address space passed to the dirty tracker is reduced to two >> ranges: >> + * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges. >> + * The underlying reports of dirty will query a sub-interval of each of >> + * these ranges. >> + * >> + * The purpose of the dual range handling is to handle known cases of >> big >> + * holes in the address space, like the x86 AMD 1T hole. The alternative >> + * would be an IOVATree but that has a much bigger runtime overhead and >> + * unnecessary complexity. >> + */ >> + if (iova < max32 && end <= max32) { > > Nit, the first test is redundant, iova is necessarily less than end. > I'll delete it. >> + if (range->min32 > iova) { >> + range->min32 = iova; >> + } >> + if (range->max32 < end) { >> + range->max32 = end; >> + } >> + trace_vfio_device_dirty_tracking_update(iova, end, >> + range->min32, range->max32); >> + } else { >> + if (!range->min64 || range->min64 > iova) { > > The first test should be removed, we're initializing min64 to a > non-zero value now, so if it's zero it's been set and we can't > de-prioritize that set value. > Distraction again, I am sure I removed all. and the test is pretty useless as this will never be true. >> + range->min64 = iova; >> + } >> + if (range->max64 < end) { >> + range->max64 = end; >> + } >> + trace_vfio_device_dirty_tracking_update(iova, end, >> + range->min64, range->max64); >> + } >> + >> + return; >> +} >> + >> +static const MemoryListener vfio_dirty_tracking_listener = { >> + .name = "vfio-tracking", >> + .region_add = vfio_dirty_tracking_update, >> +}; >> + >> +static void vfio_dirty_tracking_init(VFIOContainer *container, >> + VFIODirtyRanges *dirty) >> +{ >> + memset(dirty, 0, sizeof(*dirty)); >> + dirty->ranges.min32 = UINT32_MAX; >> + dirty->ranges.min64 = UINT64_MAX; >> + dirty->listener = vfio_dirty_tracking_listener; >> + dirty->container = container; >> + > > I was actually thinking the caller would just pass > VFIODirtyTrackingRange and VFIODirtyRanges would be allocated on the > stack here, perhaps both are defined private to this file, but this > works and we can refine later if we so decide. Thanks, > OK, I see what you mean. Since I have to respin v5, I'll fix this. > >> + memory_listener_register(&dirty->listener, >> + container->space->as); >> + >> + /* >> + * The memory listener is synchronous, and used to calculate the range >> + * to dirty tracking. Unregister it after we are done as we are not >> + * interested in any follow-up updates. >> + */ >> + memory_listener_unregister(&dirty->listener); >> +} >> + >> static void vfio_listener_log_global_start(MemoryListener *listener) >> { >> VFIOContainer *container = container_of(listener, VFIOContainer, >> listener); >> + VFIODirtyRanges dirty; >> int ret; >> >> + vfio_dirty_tracking_init(container, &dirty); >> + >> ret = vfio_set_dirty_page_tracking(container, true); >> if (ret) { >> vfio_set_migration_error(ret); >> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >> index 669d9fe07cd9..d97a6de17921 100644 >> --- a/hw/vfio/trace-events >> +++ b/hw/vfio/trace-events >> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t >> iova, uint64_t offset_wi >> vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, >> uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" >> size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" >> vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING >> region_del 0x%"PRIx64" - 0x%"PRIx64 >> vfio_listener_region_del(uint64_t start, uint64_t end) "region_del >> 0x%"PRIx64" - 0x%"PRIx64 >> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t >> min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" >> - 0x%"PRIx64"]" >> vfio_disconnect_container(int fd) "close container->fd=%d" >> vfio_put_group(int fd) "close group->fd=%d" >> vfio_get_device(const char * name, unsigned int flags, unsigned int >> num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: >> %u" >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> index 87524c64a443..0f84136cceb5 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -96,6 +96,19 @@ typedef struct VFIOContainer { >> QLIST_ENTRY(VFIOContainer) next; >> } VFIOContainer; >> >> +typedef struct VFIODirtyTrackingRange { >> + hwaddr min32; >> + hwaddr max32; >> + hwaddr min64; >> + hwaddr max64; >> +} VFIODirtyTrackingRange; >> + >> +typedef struct VFIODirtyRanges { >> + VFIOContainer *container; >> + VFIODirtyTrackingRange ranges; >> + MemoryListener listener; >> +} VFIODirtyRanges; >> + >> typedef struct VFIOGuestIOMMU { >> VFIOContainer *container; >> IOMMUMemoryRegion *iommu_mr; >