Hi Shameer,

On 23/01/18 13:16, Shameerali Kolothum Thodi wrote:
> Hi Eric,
> 
>> -----Original Message-----
>> From: Auger Eric [mailto:eric.au...@redhat.com]
>> Sent: Tuesday, January 23, 2018 8:32 AM
>> To: Alex Williamson <alex.william...@redhat.com>; Shameerali Kolothum
>> Thodi <shameerali.kolothum.th...@huawei.com>
>> Cc: pmo...@linux.vnet.ibm.com; k...@vger.kernel.org; linux-
>> ker...@vger.kernel.org; Linuxarm <linux...@huawei.com>; John Garry
>> <john.ga...@huawei.com>; xuwei (O) <xuw...@huawei.com>
>> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and 
>> update
>> iova list
>>
>> Hi Shameer,
>>
>> On 18/01/18 01:04, Alex Williamson wrote:
>>> On Fri, 12 Jan 2018 16:45:28 +0000
>>> Shameer Kolothum <shameerali.kolothum.th...@huawei.com> wrote:
>>>
>>>> This retrieves the reserved regions associated with dev group and
>>>> checks for conflicts with any existing dma mappings. Also update
>>>> the iova list excluding the reserved regions.
>>>>
>>>> Signed-off-by: Shameer Kolothum
>> <shameerali.kolothum.th...@huawei.com>
>>>> ---
>>>>  drivers/vfio/vfio_iommu_type1.c | 161
>> +++++++++++++++++++++++++++++++++++++++-
>>>>  1 file changed, 159 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
>> b/drivers/vfio/vfio_iommu_type1.c
>>>> index 11cbd49..7609070 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -28,6 +28,7 @@
>>>>  #include <linux/device.h>
>>>>  #include <linux/fs.h>
>>>>  #include <linux/iommu.h>
>>>> +#include <linux/list_sort.h>
>>>>  #include <linux/module.h>
>>>>  #include <linux/mm.h>
>>>>  #include <linux/rbtree.h>
>>>> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct
>> iommu_group *group, phys_addr_t *base)
>>>>    return ret;
>>>>  }
>>>>
>>>
>>> /* list_sort helper */
>>>
>>>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct 
>>>> list_head *b)
>>>> +{
>>>> +  struct iommu_resv_region *ra, *rb;
>>>> +
>>>> +  ra = container_of(a, struct iommu_resv_region, list);
>>>> +  rb = container_of(b, struct iommu_resv_region, list);
>>>> +
>>>> +  if (ra->start < rb->start)
>>>> +          return -1;
>>>> +  if (ra->start > rb->start)
>>>> +          return 1;
>>>> +  return 0;
>>>> +}
>>>> +
>>>>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>>>>                            struct list_head *head)
>>>>  {
>>>> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct
>> vfio_iommu *iommu,
>>>>  }
>>>>
>>>>  /*
>>>> + * Check reserved region conflicts with existing dma mappings
>>>> + */
>>>> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
>>>> +                          struct list_head *resv_regions)
>>>> +{
>>>> +  struct iommu_resv_region *region;
>>>> +
>>>> +  /* Check for conflict with existing dma mappings */
>>>> +  list_for_each_entry(region, resv_regions, list) {
>>>> +          if (vfio_find_dma_overlap(iommu, region->start,
>>>> +                              region->start + region->length - 1))
>>>> +                  return -EINVAL;
>>>> +  }
>>>> +
>>>> +  return 0;
>>>> +}
>>>
>>> This basically does the same test as vfio_iommu_valid_aperture but
>>> properly names it a conflict test.  Please be consistent.  Should this
>>> also return bool, "conflict" is a yes/no answer.
>>>
>>>> +
>>>> +/*
>>>>   * Adjust the iommu aperture window if new aperture is a valid one
>>>>   */
>>>>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>>>> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct
>> vfio_iommu *iommu,
>>>>    return 0;
>>>>  }
>>>>
>>>> +/*
>>>> + * Check and update iova region list in case a reserved region
>>>> + * overlaps the iommu iova range
>>>> + */
>>>> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
>>>> +                                  struct list_head *resv_regions)
>>>
>>> "resv_region" in previous function, just "resv" here, use consistent
>>> names.  Also, what are we adjusting.  Maybe "exclude" is a better term.
>>>
>>>> +{
>>>> +  struct iommu_resv_region *resv;
>>>> +  struct list_head *iova = &iommu->iova_list;
>>>> +  struct vfio_iova *n, *next;
>>>> +
>>>> +  list_for_each_entry(resv, resv_regions, list) {
>>>> +          phys_addr_t start, end;
>>>> +
>>>> +          start = resv->start;
>>>> +          end = resv->start + resv->length - 1;
>>>> +
>>>> +          list_for_each_entry_safe(n, next, iova, list) {
>>>> +                  phys_addr_t a, b;
>>>> +                  int ret = 0;
>>>> +
>>>> +                  a = n->start;
>>>> +                  b = n->end;
>>>
>>> 'a' and 'b' variables actually make this incredibly confusing.  Use
>>> better variable names or just drop them entirely, it's much easier to
>>> follow as n->start & n->end.
>>>
>>>> +                  /* No overlap */
>>>> +                  if ((start > b) || (end < a))
>>>> +                          continue;
>>>> +                  /* Split the current node and create holes */
>>>> +                  if (start > a)
>>>> +                          ret = vfio_insert_iova(a, start - 1, &n->list);
>>>> +                  if (!ret && end < b)
>>>> +                          ret = vfio_insert_iova(end + 1, b, &n->list);
>>>> +                  if (ret)
>>>> +                          return ret;
>>>> +
>>>> +                  list_del(&n->list);
>>>
>>> This is trickier than it appears and deserves some explanation.  AIUI,
>>> we're actually inserting duplicate entries for the remainder at the
>>> start of the range and then at the end of the range (and the order is
>>> important here because we're inserting each before the current node),
>>> and then we delete the current node.  So the iova_list is kept sorted
>>> through this process, though temporarily includes some bogus, unordered
>>> sub-sets.
>>>
>>>> +                  kfree(n);
>>>> +          }
>>>> +  }
>>>> +
>>>> +  if (list_empty(iova))
>>>> +          return -EINVAL;
>>>> +
>>>> +  return 0;
>>>> +}
>>>> +
>>>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>>>                                     struct iommu_group *iommu_group)
>>>>  {
>>>> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>    bool resv_msi, msi_remap;
>>>>    phys_addr_t resv_msi_base;
>>>>    struct iommu_domain_geometry geo;
>>>> +  struct list_head group_resv_regions;
>>>> +  struct iommu_resv_region *resv, *resv_next;
>>>>
>>>>    mutex_lock(&iommu->lock);
>>>>
>>>> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>    if (ret)
>>>>            goto out_detach;
>>>>
>>>> +  INIT_LIST_HEAD(&group_resv_regions);
>>>> +  iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
>>>> +  list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
>> iommu_get_group_resv_regions returns a sorted list (see
>> iommu_insert_resv_regions kerneldoc comment). You can have overlapping
>> regions of different types though.
> 
> Hmm..I am not sure. It looks like it is sorted only if the regions are of 
> same type.
> 
> "* The new element is sorted by address with respect to the other
>  * regions of the same type."
> 
> So hypothetically if there are two groups with regions like,
> 
> Group 1.
>  Start       size            type  
>   0x0000   0x1000        1
>   0x2000   0x1000        1
>   0x5000   0x1000        1
> 
> Group 2
>   Start       size              type
>    0x2000  0x4000           2
>    0x7000   0x1000          1
> 
> Then the  iommu_get_group_resv_regions() will return,
> 
> 0x0000   0x1000        1
> 0x2000   0x1000        1
> 0x5000   0x1000        1
> 0x2000  0x4000         2
> 0x7000   0x1000        1  

Hum yes, I remember now, sorry. It was made on purpose to avoid to
display interleaved resv region types in
/sys/kernel/iommu_groups/reserved_regions. I think it gives a better
user experience.

Thanks

Eric
> 
> But honestly I am not sure the above is a valid scenario or not. I am
> happy to remove the sorting if such a case will never happen.
> 
> Please let me know.
> 
> Thanks,
> Shameer
> 
>> Eric
>>>> +
>>>> +  ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
>>>> +  if (ret)
>>>> +          goto out_detach;
>>>> +
>>>>    resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>>>>
>>>>    INIT_LIST_HEAD(&domain->group_list);
>>>> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>                d->prot == domain->prot) {
>>>>                    iommu_detach_group(domain->domain,
>> iommu_group);
>>>>                    if (!iommu_attach_group(d->domain, iommu_group)) {
>>>> +                          ret = vfio_iommu_iova_resv_adjust(iommu,
>>>> +
>>      &group_resv_regions);
>>>> +                          if (!ret)
>>>> +                                  goto out_domain;
>>>
>>> The above function is not without side effects if it fails, it's
>>> altered the iova_list.  It needs to be valid for the remaining domains
>>> if we're going to continue.
>>>
>>>> +
>>>>                            list_add(&group->next, &d->group_list);
>>>>                            iommu_domain_free(domain->domain);
>>>>                            kfree(domain);
>>>> -                          mutex_unlock(&iommu->lock);
>>>> -                          return 0;
>>>> +                          goto done;
>>>>                    }
>>>>
>>>>                    ret = iommu_attach_group(domain->domain,
>> iommu_group);
>>>> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>    if (ret)
>>>>            goto out_detach;
>>>>
>>>> +  ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
>>>> +  if (ret)
>>>> +          goto out_detach;
>>>
>>> Can't we process the reserved regions once before we get here rather
>>> than have two separate call points that do the same thing?  In order to
>>> roll back from errors above, it seems like we need to copy iova_list
>>> and work on the copy, installing it and deleting the original only on
>>> success.
>>>
>>>> +
>>>>    list_add(&domain->next, &iommu->domain_list);
>>>>
>>>> +done:
>>>> +  list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>>>> +          kfree(resv);
>>>>    mutex_unlock(&iommu->lock);
>>>>
>>>>    return 0;
>>>> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void
>> *iommu_data,
>>>>    iommu_detach_group(domain->domain, iommu_group);
>>>>  out_domain:
>>>>    iommu_domain_free(domain->domain);
>>>> +  list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>>>> +          kfree(resv);
>>>>  out_free:
>>>>    kfree(domain);
>>>>    kfree(group);
>>>> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct
>> vfio_iommu *iommu)
>>>>    node->end = end;
>>>>  }
>>>>
>>>> +/*
>>>> + * Called when a group is detached. The reserved regions for that
>>>> + * group can be part of valid iova now. But since reserved regions
>>>> + * may be duplicated among groups, populate the iova valid regions
>>>> +   list again.
>>>> + */
>>>> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
>>>> +{
>>>> +  struct vfio_domain *d;
>>>> +  struct vfio_group *g;
>>>> +  struct vfio_iova *node, *tmp;
>>>> +  struct iommu_resv_region *resv, *resv_next;
>>>> +  struct list_head resv_regions;
>>>> +  phys_addr_t start, end;
>>>> +
>>>> +  INIT_LIST_HEAD(&resv_regions);
>>>> +
>>>> +  list_for_each_entry(d, &iommu->domain_list, next) {
>>>> +          list_for_each_entry(g, &d->group_list, next)
>>>> +                  iommu_get_group_resv_regions(g->iommu_group,
>>>> +                                                   &resv_regions);
>>>> +  }
>>>> +
>>>> +  if (list_empty(&resv_regions))
>>>> +          return;
>>>> +
>>>> +  list_sort(NULL, &resv_regions, vfio_resv_cmp);
>>>> +
>>>> +  node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
>>>> +  start = node->start;
>>>> +  node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
>>>> +  end = node->end;
>>>
>>> list_sort() only sorts based on ->start, we added reserved regions for
>>> all our groups to one list, we potentially have multiple entries with
>>> the same ->start.  How can we be sure that the last one in the list
>>> actually has the largest ->end value?
>>>
>>>> +
>>>> +  /* purge the iova list and create new one */
>>>> +  list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
>>>> +          list_del(&node->list);
>>>> +          kfree(node);
>>>> +  }
>>>> +
>>>> +  if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
>>>> +          pr_warn("%s: Failed to update iova aperture. VFIO DMA map
>> request may fail\n",
>>>> +                  __func__);
>>>
>>> Map requests "will" fail.  Is this the right error strategy?  Detaching
>>> a group cannot fail.  Aren't we better off leaving the iova_list we had
>>> in place?  If we cannot expand the iova aperture when a group is
>>> removed, a user can continue unscathed.
>>>
>>>> +          goto done;
>>>> +  }
>>>> +
>>>> +  /* adjust the iova with current reserved regions */
>>>> +  if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
>>>> +          pr_warn("%s: Failed to update iova list with reserve regions.
>> VFIO DMA map request may fail\n",
>>>> +                  __func__);
>>>
>>> Same.
>>>
>>>> +done:
>>>> +  list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
>>>> +          kfree(resv);
>>>> +}
>>>> +
>>>>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>>>>                                      struct iommu_group *iommu_group)
>>>>  {
>>>> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void
>> *iommu_data,
>>>>            break;
>>>>    }
>>>>
>>>> +  vfio_iommu_iova_resv_refresh(iommu);
>>>> +
>>>>  detach_group_done:
>>>>    mutex_unlock(&iommu->lock);
>>>>  }
>>>

Reply via email to