On 2021-11-18 at 1:53 a.m., Alistair Popple wrote:
> On Tuesday, 16 November 2021 6:30:18 AM AEDT Alex Sierra wrote:
>> Device memory that is cache coherent from device and CPU point of view.
>> This is used on platforms that have an advanced system bus (like CAPI
>> or CXL). Any page of a process can be migrated to such memory. However,
>> no one should be allowed to pin such memory so that it can always be
>> evicted.
>>
>> Signed-off-by: Alex Sierra <alex.sie...@amd.com>
>> ---
>>  include/linux/memremap.h |  8 ++++++++
>>  include/linux/mm.h       |  9 +++++++++
>>  mm/memcontrol.c          |  6 +++---
>>  mm/memory-failure.c      |  6 +++++-
>>  mm/memremap.c            |  5 ++++-
>>  mm/migrate.c             | 21 +++++++++++++--------
>>  6 files changed, 42 insertions(+), 13 deletions(-)
>>
>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>> index c0e9d35889e8..ff4d398edf35 100644
>> --- a/include/linux/memremap.h
>> +++ b/include/linux/memremap.h
>> @@ -39,6 +39,13 @@ struct vmem_altmap {
>>   * A more complete discussion of unaddressable memory may be found in
>>   * include/linux/hmm.h and Documentation/vm/hmm.rst.
>>   *
>> + * MEMORY_DEVICE_COHERENT:
>> + * Device memory that is cache coherent from device and CPU point of view. This
>> + * is used on platforms that have an advanced system bus (like CAPI or CXL). A
>> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
>> + * type. Any page of a process can be migrated to such memory. However no one
>> + * should be allowed to pin such memory so that it can always be evicted.
>> + *
>>   * MEMORY_DEVICE_FS_DAX:
>>   * Host memory that has similar access semantics as System RAM i.e. DMA
>>   * coherent and supports page pinning. In support of coordinating page
>> @@ -59,6 +66,7 @@ struct vmem_altmap {
>>  enum memory_type {
>>      /* 0 is reserved to catch uninitialized type fields */
>>      MEMORY_DEVICE_PRIVATE = 1,
>> +    MEMORY_DEVICE_COHERENT,
>>      MEMORY_DEVICE_FS_DAX,
>>      MEMORY_DEVICE_GENERIC,
>>      MEMORY_DEVICE_PCI_P2PDMA,
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 73a52aba448f..d23b6ebd2177 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1162,6 +1162,7 @@ static inline bool page_is_devmap_managed(struct page *page)
>>              return false;
>>      switch (page->pgmap->type) {
>>      case MEMORY_DEVICE_PRIVATE:
>> +    case MEMORY_DEVICE_COHERENT:
>>      case MEMORY_DEVICE_FS_DAX:
>>              return true;
>>      default:
>> @@ -1191,6 +1192,14 @@ static inline bool is_device_private_page(const struct page *page)
>>              page->pgmap->type == MEMORY_DEVICE_PRIVATE;
>>  }
>>  
>> +static inline bool is_device_page(const struct page *page)
>> +{
>> +    return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
>> +            is_zone_device_page(page) &&
>> +            (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> +            page->pgmap->type == MEMORY_DEVICE_COHERENT);
>> +}
>> +
>>  static inline bool is_pci_p2pdma_page(const struct page *page)
>>  {
>>      return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 6da5020a8656..e0275ebe1198 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
>>   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>>   *     target for charge migration. if @target is not NULL, the entry is stored
>>   *     in target->ent.
>> - *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
>> - *     (so ZONE_DEVICE page and thus not on the lru).
>> + *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_COHERENT
>> + *     or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
>>   *     For now we such page is charge like a regular page would be as for all
>>   *     intent and purposes it is just special memory taking the place of a
>>   *     regular page.
>> @@ -5730,7 +5730,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>>               */
>>              if (page_memcg(page) == mc.from) {
>>                      ret = MC_TARGET_PAGE;
>> -                    if (is_device_private_page(page))
>> +                    if (is_device_page(page))
>>                              ret = MC_TARGET_DEVICE;
>>                      if (target)
>>                              target->page = page;
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index 3e6449f2102a..51b55fc5172c 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -1554,12 +1554,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>>              goto unlock;
>>      }
>>  
>> -    if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
>> +    switch (pgmap->type) {
>> +    case MEMORY_DEVICE_PRIVATE:
>> +    case MEMORY_DEVICE_COHERENT:
>>              /*
>>               * TODO: Handle HMM pages which may need coordination
>>               * with device-side memory.
>>               */
>>              goto unlock;
>> +    default:
>> +            break;
>>      }
>>  
>>      /*
>> diff --git a/mm/memremap.c b/mm/memremap.c
>> index ed593bf87109..94d6a1e01d42 100644
>> --- a/mm/memremap.c
>> +++ b/mm/memremap.c
>> @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
>>  static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>>  {
>>      if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> +        pgmap->type == MEMORY_DEVICE_COHERENT ||
>>          pgmap->type == MEMORY_DEVICE_FS_DAX)
>>              static_branch_dec(&devmap_managed_key);
>>  }
>> @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>>  static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
>>  {
>>      if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> +        pgmap->type == MEMORY_DEVICE_COHERENT ||
>>          pgmap->type == MEMORY_DEVICE_FS_DAX)
>>              static_branch_inc(&devmap_managed_key);
>>  }
>> @@ -328,6 +330,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>>  
>>      switch (pgmap->type) {
>>      case MEMORY_DEVICE_PRIVATE:
>> +    case MEMORY_DEVICE_COHERENT:
>>              if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
>>                      WARN(1, "Device private memory not supported\n");
>>                      return ERR_PTR(-EINVAL);
>> @@ -498,7 +501,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>>  void free_devmap_managed_page(struct page *page)
>>  {
>>      /* notify page idle for dax */
>> -    if (!is_device_private_page(page)) {
>> +    if (!is_device_page(page)) {
>>              wake_up_var(&page->_refcount);
>>              return;
>>      }
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 1852d787e6ab..f74422a42192 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
>>       * Device private pages have an extra refcount as they are
>>       * ZONE_DEVICE pages.
>>       */
>> -    expected_count += is_device_private_page(page);
>> +    expected_count += is_device_page(page);
>>      if (mapping)
>>              expected_count += thp_nr_pages(page) + page_has_private(page);
>>  
>> @@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
>>               * FIXME proper solution is to rework migration_entry_wait() so
>>               * it does not need to take a reference on page.
>>               */
> Note that I have posted a patch to fix this - see
> https://lore.kernel.org/all/20211118020754.954425-1-apop...@nvidia.com/ This
> looks ok for now assuming coherent pages can never be pinned.
>
> However that raises a question - what happens when something calls
> get_user_pages() on a pfn pointing to a coherent device page? I can't see
> anything in this series that prevents pinning of coherent device pages, so we
> can't just assume they aren't pinned.

I agree. I think this series needs to depend on your patch going in first.

I'm also wondering if we need to do something to prevent get_user_pages()
from pinning device pages. And by "pin", I think migrate_vma_check_page()
is not talking about FOLL_PIN, but about any get_user_pages() call. As
far as I can tell, there should be nothing fundamentally wrong with
pinning device pages for a short time. But I think we'll want to avoid
FOLL_LONGTERM, because a long-term pin would interfere with our memory
manager's ability to evict device memory.
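
Something along these lines in the GUP path is roughly what I have in
mind (a sketch only, not part of this series; the exact placement, the
flag handling and the error code are all open questions):

    /*
     * Refuse only long-term pins of coherent device pages, so the
     * driver can still evict that memory; short-term references
     * would remain allowed.
     */
    if ((gup_flags & FOLL_LONGTERM) && is_zone_device_page(page) &&
        page->pgmap->type == MEMORY_DEVICE_COHERENT)
            return -EOPNOTSUPP;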


>
> In the case of device-private pages this is enforced by the fact they never
> have present pte's, so any attempt to GUP them results in a fault. But if I'm
> understanding this series correctly that won't be the case for coherent device
> pages right?

Right.

Regards,
  Felix


>
>> -            return is_device_private_page(page);
>> +            return is_device_page(page);
>>      }
>>  
>>      /* For file back page */
>> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
>>   *     handle_pte_fault()
>>   *       do_anonymous_page()
>>  * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
>> - * private page.
>> + * private or coherent page.
>>   */
>>  static void migrate_vma_insert_page(struct migrate_vma *migrate,
>>                                  unsigned long addr,
>> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>>                              swp_entry = make_readable_device_private_entry(
>>                                                      page_to_pfn(page));
>>                      entry = swp_entry_to_pte(swp_entry);
>> +            } else if (is_device_page(page)) {
> How about adding an explicit `is_device_coherent_page()` helper? It would make
> the test more explicit that this is expected to handle just coherent pages and
> I bet there will be future changes that need to differentiate between private
> and coherent pages anyway.
>
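For reference, such a helper would presumably just mirror
is_device_private_page() from earlier in the patch; this is a sketch
only, not part of the posted series:

    static inline bool is_device_coherent_page(const struct page *page)
    {
            return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
                    is_zone_device_page(page) &&
                    page->pgmap->type == MEMORY_DEVICE_COHERENT;
    }

is_device_page() could then be expressed in terms of this helper and
is_device_private_page().
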
>> +                    entry = pte_mkold(mk_pte(page,
>> +                                             READ_ONCE(vma->vm_page_prot)));
>> +                    if (vma->vm_flags & VM_WRITE)
>> +                            entry = pte_mkwrite(pte_mkdirty(entry));
>>              } else {
>>                      /*
>> -                     * For now we only support migrating to un-addressable
>> -                     * device memory.
>> +                     * We support migrating to private and coherent types
>> +                     * for device zone memory.
>>                       */
>>                      pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
>>                      goto abort;
>> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>>              mapping = page_mapping(page);
>>  
>>              if (is_zone_device_page(newpage)) {
>> -                    if (is_device_private_page(newpage)) {
>> +                    if (is_device_page(newpage)) {
>>                              /*
>> -                             * For now only support private anonymous when
>> -                             * migrating to un-addressable device memory.
>> +                             * For now only support private and coherent
>> +                             * anonymous when migrating to device memory.
>>                               */
>>                              if (mapping) {
>>                                      migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>
>
>
