libnvdimm fixes 5.9-rc6

2020-09-20 Thread Dan Williams
Hi Linus, please pull from:

  git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-fixes-5.9-rc6

...to receive a handful of fixes to address a string of mistakes in
the mechanism for device-mapper to determine if its component devices
are dax capable. You will notice that this branch was rebased this
morning and it has not appeared in -next. I decided to cut short the
soak time because the infinite-recursion regression is currently
crashing anyone attempting to test filesystem-dax in 5.9-rc5+. The
most recent rebase folded in a compile fix reported by the kbuild
robot for the uncommon CONFIG_DAX=m case. It should, "should", be all
good now.

---

The following changes since commit 856deb866d16e29bd65952e0289066f6078af773:

  Linux 5.9-rc5 (2020-09-13 16:06:00 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-fixes-5.9-rc6

for you to fetch changes up to d4c5da5049ac27c6ef8f6f98548c3a1ade352d25:

  dax: Fix stack overflow when mounting fsdax pmem device (2020-09-20
08:57:36 -0700)


libnvdimm fixes for 5.9-rc6

- Fix a long-standing bug in device-mapper table reference counting when
  interrogating the dax capability of component devices. This bug was
  hidden by the following bug.

- Fix device-mapper to use the proper helper (dax_supported() instead of
  the leaf helper generic_fsdax_supported()) to determine dax operation
  of a stacked block device configuration. The original implementation
  is only valid for one level of dax-capable block device stacking. This
  bug was discovered while fixing the below regression.

- Fix an infinite recursion regression introduced by broken attempts to
  quiet the generic_fsdax_supported() path and make it bail out before
  logging "dax capability not found" errors.


Adrian Huang (1):
  dax: Fix stack overflow when mounting fsdax pmem device

Dan Williams (1):
  dm/dax: Fix table reference counts

Jan Kara (1):
  dm: Call proper helper to determine dax support

 drivers/dax/super.c   | 16 ++--
 drivers/md/dm-table.c | 10 +++---
 drivers/md/dm.c   |  5 +++--
 include/linux/dax.h   | 22 --
 4 files changed, 40 insertions(+), 13 deletions(-)


[PATCH v3] dm: Call proper helper to determine dax support

2020-09-18 Thread Dan Williams
From: Jan Kara 

DM was calling generic_fsdax_supported() to determine whether a device
referenced in the DM table supports DAX. However, this is a helper for
"leaf" device drivers so that they don't have to duplicate common
generic checks. High-level code should call the dax_supported() helper,
which calls into the appropriate helper for the particular device. This
problem manifested itself as kernel messages:

dm-3: error: dax access failed (-95)

when the lvm2-testsuite was run in cases where a DM device was stacked
on top of another DM device.

Fixes: 7bf7eac8d648 ("dax: Arrange for dax_supported check to span multiple 
devices")
Cc: 
Tested-by: Adrian Huang 
Signed-off-by: Jan Kara 
Acked-by: Mike Snitzer 
Reported-by: kernel test robot 
Signed-off-by: Dan Williams 
---
Changes since v2 [1]:
- Add dummy definitions for dax_read_{lock,unlock} in the CONFIG_DAX=n
  case (0day robot)

[1]: 
http://lore.kernel.org/r/160040692945.25320.13233625491405115889.st...@dwillia2-desk3.amr.corp.intel.com

 drivers/dax/super.c   |4 
 drivers/md/dm-table.c |   10 +++---
 include/linux/dax.h   |   22 --
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index e5767c83ea23..b6284c5cae0a 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -325,11 +325,15 @@ EXPORT_SYMBOL_GPL(dax_direct_access);
 bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
int blocksize, sector_t start, sector_t len)
 {
+   if (!dax_dev)
+   return false;
+
if (!dax_alive(dax_dev))
return false;
 
	return dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, start, len);
 }
+EXPORT_SYMBOL_GPL(dax_supported);
 
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5edc3079e7c1..229f461e7def 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -860,10 +860,14 @@ EXPORT_SYMBOL_GPL(dm_table_set_type);
 int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
 {
-   int blocksize = *(int *) data;
+   int blocksize = *(int *) data, id;
+   bool rc;
 
-   return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize,
-  start, len);
+   id = dax_read_lock();
+   rc = dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len);
+   dax_read_unlock(id);
+
+   return rc;
 }
 
 /* Check devices support synchronous DAX */
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6904d4e0b2e0..d0af16b23122 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -130,6 +130,8 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
return __generic_fsdax_supported(dax_dev, bdev, blocksize, start,
sectors);
 }
+bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
+   int blocksize, sector_t start, sector_t len);
 
 static inline void fs_put_dax(struct dax_device *dax_dev)
 {
@@ -157,6 +159,13 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
return false;
 }
 
+static inline bool dax_supported(struct dax_device *dax_dev,
+   struct block_device *bdev, int blocksize, sector_t start,
+   sector_t len)
+{
+   return false;
+}
+
 static inline void fs_put_dax(struct dax_device *dax_dev)
 {
 }
@@ -189,14 +198,23 @@ static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
 }
 #endif
 
+#ifdef CONFIG_DAX
 int dax_read_lock(void);
 void dax_read_unlock(int id);
+#else
+static inline int dax_read_lock(void)
+{
+   return 0;
+}
+
+static inline void dax_read_unlock(int id)
+{
+}
+#endif /* CONFIG_DAX */
 bool dax_alive(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
		void **kaddr, pfn_t *pfn);
-bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
-   int blocksize, sector_t start, sector_t len);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,



[PATCH] dm/dax: Fix table reference counts

2020-09-18 Thread Dan Williams
A recent fix to the dm_dax_supported() flow uncovered a latent bug. When
dm_get_live_table() fails it is still required to drop the
srcu_read_lock(). Without this change the lvm2 test-suite triggers this
warning:

# lvm2-testsuite --only pvmove-abort-all.sh

WARNING: lock held when returning to user space!
5.9.0-rc5+ #251 Tainted: G   OE

lvm/1318 is leaving the kernel with locks still held!
1 lock held by lvm/1318:
 #0: 9372abb5a340 (&md->io_barrier){}-{0:0}, at: dm_get_live_table+0x5/0xb0 [dm_mod]

...and later on this hang signature:

INFO: task lvm:1344 blocked for more than 122 seconds.
  Tainted: G   OE 5.9.0-rc5+ #251
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:lvm state:D stack:0 pid: 1344 ppid: 1 flags:0x4000
Call Trace:
 __schedule+0x45f/0xa80
 ? finish_task_switch+0x249/0x2c0
 ? wait_for_completion+0x86/0x110
 schedule+0x5f/0xd0
 schedule_timeout+0x212/0x2a0
 ? __schedule+0x467/0xa80
 ? wait_for_completion+0x86/0x110
 wait_for_completion+0xb0/0x110
 __synchronize_srcu+0xd1/0x160
 ? __bpf_trace_rcu_utilization+0x10/0x10
 __dm_suspend+0x6d/0x210 [dm_mod]
 dm_suspend+0xf6/0x140 [dm_mod]
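
The invariant, as a minimal sketch ("interrogate" is a placeholder, not
a real helper): dm_get_live_table() enters the SRCU read-side critical
section even when it returns NULL, so every call must be paired with
dm_put_live_table(), including on the failure path.

	int srcu_idx;
	struct dm_table *map = dm_get_live_table(md, &srcu_idx);

	if (map)
		ret = interrogate(map);
	dm_put_live_table(md, srcu_idx);	/* required even when !map */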

Fixes: 7bf7eac8d648 ("dax: Arrange for dax_supported check to span multiple 
devices")
Cc: 
Cc: Jan Kara 
Cc: Alasdair Kergon 
Cc: Mike Snitzer 
Reported-by: Adrian Huang 
Signed-off-by: Dan Williams 
---
 drivers/md/dm.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fb0255d25e4b..4a40df8af7d3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1136,15 +1136,16 @@ static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bd
 {
struct mapped_device *md = dax_get_private(dax_dev);
struct dm_table *map;
+   bool ret = false;
int srcu_idx;
-   bool ret;
 
	map = dm_get_live_table(md, &srcu_idx);
if (!map)
-   return false;
+   goto out;
 
	ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
 
+out:
dm_put_live_table(md, srcu_idx);
 
return ret;



Re: [dm-devel] [PATCH v2] dm: Call proper helper to determine dax support

2020-09-18 Thread Dan Williams
On Fri, Sep 18, 2020 at 8:31 AM Darrick J. Wong  wrote:
>
> On Thu, Sep 17, 2020 at 10:30:03PM -0700, Dan Williams wrote:
> > From: Jan Kara 
> >
> > DM was calling generic_fsdax_supported() to determine whether a device
> > referenced in the DM table supports DAX. However, this is a helper for
> > "leaf" device drivers so that they don't have to duplicate common
> > generic checks. High-level code should call the dax_supported() helper,
> > which calls into the appropriate helper for the particular device. This
> > problem manifested itself as kernel messages:
> >
> > dm-3: error: dax access failed (-95)
> >
> > when the lvm2-testsuite was run in cases where a DM device was stacked
> > on top of another DM device.
>
> Is there somewhere where it is documented which of:
>
> bdev_dax_supported, generic_fsdax_supported, and dax_supported
>
> one is supposed to use for a given circumstance?

generic_fsdax_supported should be private to device drivers populating
their dax_operations. I think it deserves a rename at this point.
dax_supported() knows how to route through multiple layers of stacked
block-devices to ask the "is dax supported" question at each level.
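
As a hedged sketch of that routing (real 5.9-era names, body heavily
simplified, not the literal implementation):

	bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
			int blocksize, sector_t start, sector_t len)
	{
		/* dm_dax_supported() for a DM device, or the leaf helper
		 * (generic_fsdax_supported()) for drivers like pmem */
		return dax_dev->ops->dax_supported(dax_dev, bdev, blocksize,
				start, len);
	}

So a DM-on-DM stack recurses one layer per dispatch: dm_dax_supported()
walks the live table and asks dax_supported() of each component device,
until a leaf's generic_fsdax_supported() performs the actual blocksize /
dax_direct_access() checks. bdev_dax_supported() is the filesystem-facing
entry point that resolves a bdev to its dax_device and then calls
dax_supported().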

> I guess the last two can test a given range w/ blocksize; the first one
> only does blocksize; and the middle one also checks with whatever fs
> might be mounted? 
>
> (I ask because it took me a while to figure out how to revert correctly
> the brokenness in rc3-5 that broke my nightly dax fstesting.)

Again, apologies for that.


[PATCH v2] dm: Call proper helper to determine dax support

2020-09-17 Thread Dan Williams
From: Jan Kara 

DM was calling generic_fsdax_supported() to determine whether a device
referenced in the DM table supports DAX. However, this is a helper for
"leaf" device drivers so that they don't have to duplicate common
generic checks. High-level code should call the dax_supported() helper,
which calls into the appropriate helper for the particular device. This
problem manifested itself as kernel messages:

dm-3: error: dax access failed (-95)

when the lvm2-testsuite was run in cases where a DM device was stacked
on top of another DM device.

Fixes: 7bf7eac8d648 ("dax: Arrange for dax_supported check to span multiple 
devices")
Cc: 
Tested-by: Adrian Huang 
Signed-off-by: Jan Kara 
Acked-by: Mike Snitzer 
Signed-off-by: Dan Williams 
---
Changes since v1 [1]:
- Add missing dax_read_lock() around dax_supported()

[1]: http://lore.kernel.org/r/20200916151445.450-1-j...@suse.cz

 drivers/dax/super.c   |4 
 drivers/md/dm-table.c |   10 +++---
 include/linux/dax.h   |   11 +--
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index e5767c83ea23..b6284c5cae0a 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -325,11 +325,15 @@ EXPORT_SYMBOL_GPL(dax_direct_access);
 bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
int blocksize, sector_t start, sector_t len)
 {
+   if (!dax_dev)
+   return false;
+
if (!dax_alive(dax_dev))
return false;
 
	return dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, start, len);
 }
+EXPORT_SYMBOL_GPL(dax_supported);
 
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5edc3079e7c1..229f461e7def 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -860,10 +860,14 @@ EXPORT_SYMBOL_GPL(dm_table_set_type);
 int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
 {
-   int blocksize = *(int *) data;
+   int blocksize = *(int *) data, id;
+   bool rc;
 
-   return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize,
-  start, len);
+   id = dax_read_lock();
+   rc = dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len);
+   dax_read_unlock(id);
+
+   return rc;
 }
 
 /* Check devices support synchronous DAX */
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6904d4e0b2e0..9f916326814a 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -130,6 +130,8 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
return __generic_fsdax_supported(dax_dev, bdev, blocksize, start,
sectors);
 }
+bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
+   int blocksize, sector_t start, sector_t len);
 
 static inline void fs_put_dax(struct dax_device *dax_dev)
 {
@@ -157,6 +159,13 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
return false;
 }
 
+static inline bool dax_supported(struct dax_device *dax_dev,
+   struct block_device *bdev, int blocksize, sector_t start,
+   sector_t len)
+{
+   return false;
+}
+
 static inline void fs_put_dax(struct dax_device *dax_dev)
 {
 }
@@ -195,8 +204,6 @@ bool dax_alive(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
		void **kaddr, pfn_t *pfn);
-bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
-   int blocksize, sector_t start, sector_t len);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,



Re: [PATCH] pmem: fix __copy_user_flushcache

2020-09-17 Thread Dan Williams
On Wed, Sep 16, 2020 at 11:57 AM Mikulas Patocka  wrote:
>
>
>
> On Wed, 16 Sep 2020, Dan Williams wrote:
>
> > On Wed, Sep 16, 2020 at 10:24 AM Mikulas Patocka  
> > wrote:
> > >
> > >
> > >
> > > On Wed, 16 Sep 2020, Dan Williams wrote:
> > >
> > > > On Wed, Sep 16, 2020 at 3:57 AM Mikulas Patocka  
> > > > wrote:
> > > > >
> > > > >
> > > > >
> > > > > I'm submitting this patch that adds the required exports (so that we 
> > > > > could
> > > > > use __copy_from_user_flushcache on x86, arm64 and powerpc). Please, 
> > > > > queue
> > > > > it for the next merge window.
> > > >
> > > > Why? This should go with the first user, and it's not clear that it
> > > > needs to be relative to the current dax_operations export scheme.
> > >
> > > Before nvfs gets included in the kernel, I need to distribute it as a
> > > module. So, it would make my maintenance easier. But if you don't want to
> > > export it now, no problem, I can just copy __copy_user_flushcache from the
> > > kernel to the module.
> >
> > That sounds like a better plan than exporting symbols with no in-kernel consumer.
>
> BTW, this function is buggy. Here I'm submitting the patch.
>
>
>
> From: Mikulas Patocka 
>
> If we copy less than 8 bytes and if the destination crosses a cache line,
> __copy_user_flushcache would invalidate only the first cache line. This
> patch makes it invalidate the second cache line as well.

Good catch.
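
To make the arithmetic concrete, a standalone userspace sketch assuming
64-byte cache lines (the addresses are made up):

	#include <stdio.h>
	#include <stdint.h>

	#define CLINE 64UL	/* assumed x86 cache-line size */

	/* report the cache lines spanned by clean_cache_range(addr, len) */
	static void lines_cleaned(uintptr_t addr, size_t len)
	{
		uintptr_t first = addr & ~(CLINE - 1);
		uintptr_t last = (addr + len - 1) & ~(CLINE - 1);

		printf("clean(%#lx, %zu) covers lines %#lx..%#lx\n",
		       (unsigned long)addr, len,
		       (unsigned long)first, (unsigned long)last);
	}

	int main(void)
	{
		/* a 4-byte copy that straddles the line boundary at 0x1040 */
		lines_cleaned(0x103e, 1);	/* old code: line 0x1000 only */
		lines_cleaned(0x103e, 4);	/* fixed: lines 0x1000 and 0x1040 */
		return 0;
	}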

> Signed-off-by: Mikulas Patocka 
> Cc: sta...@vger.kernel.org
>
> ---
>  arch/x86/lib/usercopy_64.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> Index: linux-2.6/arch/x86/lib/usercopy_64.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/lib/usercopy_64.c	2020-09-05 10:01:27.0 +0200
> +++ linux-2.6/arch/x86/lib/usercopy_64.c	2020-09-16 20:48:31.0 +0200
> @@ -120,7 +120,7 @@ long __copy_user_flushcache(void *dst, c
>  */
> if (size < 8) {
> if (!IS_ALIGNED(dest, 4) || size != 4)
> -   clean_cache_range(dst, 1);
> +   clean_cache_range(dst, size);
> } else {
> if (!IS_ALIGNED(dest, 8)) {
> dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
>

You can add:

Fixes: 0aed55af8834 ("x86, uaccess: introduce copy_from_iter_flushcache for pmem / cache-bypass operations")
Reviewed-by: Dan Williams 


Re: [PATCH] mm: remove extra ZONE_DEVICE struct page refcount

2020-09-16 Thread Dan Williams
On Wed, Sep 16, 2020 at 5:29 PM Ralph Campbell  wrote:
>
>
> On 9/15/20 10:36 PM, Christoph Hellwig wrote:
> > On Tue, Sep 15, 2020 at 09:39:47AM -0700, Ralph Campbell wrote:
> >>> I don't think any of the three ->page_free instances even cares about
> >>> the page refcount.
> >>>
> >> Not true. The page_free() callback records the page is free by setting
> >> a bit or putting the page on a free list but when it allocates a free
> >> device private struct page to be used with migrate_vma_setup(), it needs to
> >> increment the refcount.
> >>
> >> For the ZONE_DEVICE MEMORY_DEVICE_GENERIC and MEMORY_DEVICE_PCI_P2PDMA
> >> struct pages, I think you are correct because they don't define page_free()
> >> and from what I can see, don't decrement the page refcount to zero.
> >
> > Umm, the whole point of ZONE_DEVICE is to have a struct page for
> > something that is not system memory.  For both the ppc kvm case (magic
> > hypervisor pool) and Noveau (device internal) memory that clear is the
> > case.  But looks like test_hmm uses normal pages to fake this up, so
> > I was wrong about the third caller.  But I think we can just call
> > set_page_count just before freeing the page there with a comment
> > explaining what is goin on.
>
> Dan Williams thought that having the ZONE_DEVICE struct pages
> be on a free list with a refcount of one was a bit strange and
> that the driver should handle the zero to one transition.
> But, that would mean a bit more invasive change to the 3 drivers
> to set the reference count to zero after calling memremap_pages()
> and setting the reference count to one when allocating a struct
> page. What you are suggesting is what I also proposed in v1.

IIUC, isn't Christoph recommending that drivers handle set_page_count()
directly rather than the core, since some are prepared for it to be
zero on entry?


Re: [PATCH] pmem: export the symbols __copy_user_flushcache and __copy_from_user_flushcache

2020-09-16 Thread Dan Williams
On Wed, Sep 16, 2020 at 3:57 AM Mikulas Patocka  wrote:
>
>
>
> On Tue, 15 Sep 2020, Mikulas Patocka wrote:
>
> >
> >
> > On Tue, 15 Sep 2020, Mikulas Patocka wrote:
> >
> > > > > - __copy_from_user_inatomic_nocache doesn't flush cache for leading 
> > > > > and
> > > > > trailing bytes.
> > > >
> > > > You want copy_user_flushcache(). See how fs/dax.c arranges for
> > > > dax_copy_from_iter() to route to pmem_copy_from_iter().
> > >
> > > Is it something new for the kernel 5.10? I see only __copy_user_flushcache
> > > that is implemented just for x86 and arm64.
> > >
> > > There is __copy_from_user_flushcache implemented for x86, arm64 and power.
> > > It is used in lib/iov_iter.c under
> > > #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE - so should I use this?

Yes, but maybe not directly.

> > >
> > > Mikulas
> >
> > ... and __copy_user_flushcache is not exported for modules. So, I am stuck
> > with __copy_from_user_inatomic_nocache.
> >
> > Mikulas
>
> I'm submitting this patch that adds the required exports (so that we could
> use __copy_from_user_flushcache on x86, arm64 and powerpc). Please, queue
> it for the next merge window.

Why? This should go with the first user, and it's not clear that it
needs to be relative to the current dax_operations export scheme.

My first question about nvfs is how it compares to a daxfs with
executables and other binaries configured to use page cache with the
new per-file dax facility?


Re: [PATCH] pmem: export the symbols __copy_user_flushcache and __copy_from_user_flushcache

2020-09-16 Thread Dan Williams
On Wed, Sep 16, 2020 at 10:24 AM Mikulas Patocka  wrote:
>
>
>
> On Wed, 16 Sep 2020, Dan Williams wrote:
>
> > On Wed, Sep 16, 2020 at 3:57 AM Mikulas Patocka  wrote:
> > >
> > >
> > >
> > > I'm submitting this patch that adds the required exports (so that we could
> > > use __copy_from_user_flushcache on x86, arm64 and powerpc). Please, queue
> > > it for the next merge window.
> >
> > Why? This should go with the first user, and it's not clear that it
> > needs to be relative to the current dax_operations export scheme.
>
> Before nvfs gets included in the kernel, I need to distribute it as a
> module. So, it would make my maintenance easier. But if you don't want to
> export it now, no problem, I can just copy __copy_user_flushcache from the
> kernel to the module.

That sounds like a better plan than exporting symbols with no in-kernel consumer.

> > My first question about nvfs is how it compares to a daxfs with
> > executables and other binaries configured to use page cache with the
> > new per-file dax facility?
>
> nvfs is faster than dax-based filesystems on metadata-heavy operations
> because it doesn't have the overhead of the buffer cache and bios. See
> this: http://people.redhat.com/~mpatocka/nvfs/BENCHMARKS

...and that metadata problem is intractable upstream? Christoph poked
at bypassing the block layer for xfs metadata operations [1]; I just
have not had time to carry that further.

[1]: "xfs: use dax_direct_access for log writes", although it seems
he's dropped that branch from his xfs.git


Re: [RFC] nvfs: a filesystem for persistent memory

2020-09-15 Thread Dan Williams
On Tue, Sep 15, 2020 at 5:35 AM Mikulas Patocka  wrote:
>
> Hi
>
> I am developing a new filesystem suitable for persistent memory - nvfs.

Nice!

> The goal is to have a small and fast filesystem that can be used on
> DAX-based devices. Nvfs maps the whole device into linear address space
> and it completely bypasses the overhead of the block layer and buffer
> cache.

So does device-dax, but device-dax lacks read(2)/write(2).

> In the past, there was nova filesystem for pmem, but it was abandoned a
> year ago (the last version is for the kernel 5.1 -
> https://github.com/NVSL/linux-nova ). Nvfs is smaller and performs better.
>
> The design of nvfs is similar to ext2/ext4, so that it fits into the VFS
> layer naturally, without too much glue code.
>
> I'd like to ask you to review it.
>
>
> tarballs:
> http://people.redhat.com/~mpatocka/nvfs/
> git:
> git://leontynka.twibright.com/nvfs.git
> the description of filesystem internals:
> http://people.redhat.com/~mpatocka/nvfs/INTERNALS
> benchmarks:
> http://people.redhat.com/~mpatocka/nvfs/BENCHMARKS
>
>
> TODO:
>
> - programs run approximately 4% slower when running from Optane-based
> persistent memory. Therefore, programs and libraries should use page cache
> and not DAX mapping.

This needs to be based on platform firmware data (ACPI HMAT) for the
relative performance of a PMEM range vs DRAM. For example, this
tradeoff should not exist with battery-backed DRAM, or virtio-pmem.

>
> - when the fsck.nvfs tool mmaps the device /dev/pmem0, the kernel uses
> buffer cache for the mapping. The buffer cache slows down fsck by a factor
> of 5 to 10. Could it be possible to change the kernel so that it maps DAX
> based block devices directly?

We've been down this path before.

5a023cdba50c block: enable dax for raw block devices
9f4736fe7ca8 block: revert runtime dax control of the raw block device
acc93d30d7d4 Revert "block: enable dax for raw block devices"

EXT2/4 metadata buffer management depends on the page cache and we
eliminated a class of bugs by removing that support. The problems are
likely tractable, but there was not a straightforward fix visible at
the time.

> - __copy_from_user_inatomic_nocache doesn't flush cache for leading and
> trailing bytes.

You want copy_user_flushcache(). See how fs/dax.c arranges for
dax_copy_from_iter() to route to pmem_copy_from_iter().
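
For reference, the routing in the 5.9-era code looks roughly like this
(simplified excerpt, error handling trimmed):

	/* drivers/dax/super.c */
	size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
			void *addr, size_t bytes, struct iov_iter *i)
	{
		if (!dax_alive(dax_dev))
			return 0;
		return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
	}

	/* drivers/nvdimm/pmem.c supplies the op that does the flushcache copy */
	static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
			void *addr, size_t bytes, struct iov_iter *i)
	{
		return _copy_from_iter_flushcache(addr, bytes, i);
	}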


Re: [PATCH] mm: remove extra ZONE_DEVICE struct page refcount

2020-09-14 Thread Dan Williams
On Mon, Sep 14, 2020 at 3:45 PM Ralph Campbell  wrote:
>
> ZONE_DEVICE struct pages have an extra reference count that complicates the
> code for put_page() and several places in the kernel that need to check the
> reference count to see that a page is not being used (gup, compaction,
> migration, etc.). Clean up the code so the reference count doesn't need to
> be treated specially for ZONE_DEVICE.
>
> Signed-off-by: Ralph Campbell 
> ---
>
> Matthew Wilcox, Ira Weiny, and others have complained that ZONE_DEVICE
> struct page reference counting is ugly/broken. This is my attempt to
> fix it and it works for the HMM migration self tests.

Can you link to a technical description of what's broken? Or better
yet, summarize that argument in the changelog?

> I'm only sending this out as a RFC since I'm not that familiar with the
> DAX, PMEM, XEN, and other uses of ZONE_DEVICE struct pages allocated
> with devm_memremap_pages() or memremap_pages() but my best reading of
> the code looks like it might be OK. I could use help testing these
> configurations.

Back in the 4.15 days I could not convince myself that some code paths
blindly assumed that pages with refcount==0 were on an lru list. Since
then, struct page has been reorganized to not collide the ->pgmap back
pointer with the ->lru list and there have been other cleanups for
page pinning that might make this incremental cleanup viable.

You also need to fix up ext4_break_layouts() and
xfs_break_dax_layouts() to expect ->_refcount is 0 instead of 1. This
also needs some fstests exposure.
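
For example, the DAX-idle wait in ext4_break_layouts() would go from
waiting for the extra ZONE_DEVICE reference to waiting for true idle
(a hedged sketch of the expectation change, not a complete patch):

	error = ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 0, /* was == 1 */
			TASK_INTERRUPTIBLE, 0, 0,
			ext4_wait_dax_page(ei));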

> I have a modified THP migration patch series that applies on top of
> this one and is cleaner since I don't have to add code to handle the
> +1 reference count. The link below is for the earlier v2:
> ("mm/hmm/nouveau: add THP migration to migrate_vma_*")
> https://lore.kernel.org/linux-mm/20200902165830.5367-1-rcampb...@nvidia.com
>
>
>  arch/powerpc/kvm/book3s_hv_uvmem.c |  1 -
>  drivers/gpu/drm/nouveau/nouveau_dmem.c |  1 -
>  include/linux/memremap.h   |  6 +--
>  include/linux/mm.h | 39 ---
>  lib/test_hmm.c |  1 -
>  mm/gup.c   | 44 -
>  mm/memremap.c  | 20 
>  mm/migrate.c   |  5 --
>  mm/swap.c  | 66 +++---
>  9 files changed, 41 insertions(+), 142 deletions(-)

This diffstat is indeed appealing.

>
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 84e5a2dc8be5..00d97050d7ff 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -711,7 +711,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
>
> dpage = pfn_to_page(uvmem_pfn);
> dpage->zone_device_data = pvt;
> -   get_page(dpage);
> lock_page(dpage);
> return dpage;
>  out_clear:
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> index a13c6215bba8..2a4bbe01a455 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> @@ -324,7 +324,6 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
> return NULL;
> }
>
> -   get_page(page);
> lock_page(page);
> return page;
>  }
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 4e9c738f4b31..7dd9802d2612 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -67,9 +67,9 @@ enum memory_type {
>
>  struct dev_pagemap_ops {
> /*
> -* Called once the page refcount reaches 1.  (ZONE_DEVICE pages never
> -* reach 0 refcount unless there is a refcount bug. This allows the
> -* device driver to implement its own memory management.)
> +* Called once the page refcount reaches 0. The reference count is
> +* reset to 1 before calling page_free(). This allows the
> +* device driver to implement its own memory management.

I'd clarify the order events / responsibility of the common core
page_free() and the device specific page_free(). At the same time, why
not update drivers to expect that the page is already refcount==0 on
entry? Seems odd to go through all this trouble to make the reference
count appear to be zero to the wider kernel but expect that drivers
get a fake reference on entry to their ->page_free() callbacks.


Re: [PATCH v3] x86/uaccess: Use pointer masking to limit uaccess speculation

2020-09-14 Thread Dan Williams
 +   dst, len);

I look at this and wonder if the open-coded "(__force void *)" should
be subsumed in the new macro. It also feels like the name should be
"enforce" to distinguish it from the type cast case?

> user_access_end();
>
> return ret;
> @@ -177,8 +178,7 @@ static inline __wsum csum_and_copy_to_user(const void *src,
> might_sleep();
> if (!user_access_begin(dst, len))
> return 0;
> -
> -   ret = csum_partial_copy_generic(src, (__force void *)dst, len);
> +   ret = csum_partial_copy_generic(src, (__force void *)force_user_ptr(dst), len);
> user_access_end();
> return ret;
>  }
> diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
> index f9c00110a69a..0cecdaa362b1 100644
> --- a/arch/x86/include/asm/futex.h
> +++ b/arch/x86/include/asm/futex.h
> @@ -59,6 +59,8 @@ static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *o
> if (!user_access_begin(uaddr, sizeof(u32)))
> return -EFAULT;
>
> +   uaddr = force_user_ptr(uaddr);
> +
> switch (op) {
> case FUTEX_OP_SET:
> unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault);
> @@ -94,6 +96,9 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
>
> if (!user_access_begin(uaddr, sizeof(u32)))
> return -EFAULT;
> +
> +   uaddr = force_user_ptr(uaddr);
> +
> asm volatile("\n"
> "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
> "2:\n"
> diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
> index a4ceda0510ea..d35f6dc22341 100644
> --- a/arch/x86/include/asm/uaccess.h
> +++ b/arch/x86/include/asm/uaccess.h
> @@ -6,6 +6,7 @@
>   */
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -66,12 +67,23 @@ static inline bool pagefault_disabled(void);
>   * Return: true (nonzero) if the memory block may be valid, false (zero)
>   * if it is definitely invalid.
>   */
> -#define access_ok(addr, size)  \

unnecessary whitespace change?

Other than that and the optional s/force/enforce/ rename + cast
collapse you can add:

Reviewed-by: Dan Williams 


Re: [PATCH v3] x86/uaccess: Use pointer masking to limit uaccess speculation

2020-09-14 Thread Dan Williams
On Mon, Sep 14, 2020 at 10:56 AM Borislav Petkov  wrote:
>
> On Thu, Sep 10, 2020 at 12:22:53PM -0500, Josh Poimboeuf wrote:
> > +/*
> > + * Sanitize a user pointer such that it becomes NULL if it's not a valid 
> > user
> > + * pointer.  This prevents speculative dereferences of user-controlled 
> > pointers
> > + * to kernel space when access_ok() speculatively returns true.  This 
> > should be
> > + * done *after* access_ok(), to avoid affecting error handling behavior.
>
> Err, stupid question: can this macro then be folded into access_ok() so
> that you don't have to touch so many places and the check can happen
> automatically?

I think that ends up with more changes because it changes the flow of
access_ok() from returning a boolean to returning a modified user
address that can be used in the speculative path.
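
The masking idea itself, as a hedged illustration (mask_user_ptr() is a
made-up name and this is not the kernel's exact implementation, which
computes the mask branchlessly with a cmp/sbb sequence on x86):

	static inline unsigned long mask_user_ptr(unsigned long ptr,
						  unsigned long limit)
	{
		/* all-ones when ptr <= limit, all-zeroes otherwise */
		unsigned long mask = 0UL - (ptr <= limit);

		/* a mispredicted access_ok() now dereferences NULL rather
		 * than a speculatively chosen kernel address */
		return ptr & mask;
	}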


Re: [PATCH] PCI/ASPM: Enable ASPM for links under VMD domain

2020-08-27 Thread Dan Williams
On Thu, Aug 27, 2020 at 9:46 AM Derrick, Jonathan
 wrote:
>
> On Thu, 2020-08-27 at 17:23 +0100, h...@infradead.org wrote:
> > On Thu, Aug 27, 2020 at 04:13:44PM +, Derrick, Jonathan wrote:
> > > On Thu, 2020-08-27 at 06:34 +, h...@infradead.org wrote:
> > > > On Wed, Aug 26, 2020 at 09:43:27PM +, Derrick, Jonathan wrote:
> > > > > Feel free to review my set to disable the MSI remapping which will
> > > > > make
> > > > > it perform as well as direct-attached:
> > > > >
> > > > > https://patchwork.kernel.org/project/linux-pci/list/?series=325681
> > > >
> > > > So that then we have to deal with your schemes to make individual
> > > > device direct assignment work in a convoluted way?
> > >
> > > That's not the intent of that patchset -at all-. It was to address the
> > > performance bottlenecks with VMD that you constantly complain about.
> >
> > I know.  But once we fix that bottleneck we fix the next issue,
> > then to tackle the next.  While at the same time VMD brings zero
> > actual benefits.
> >
>
> Just a few benefits and there are other users with unique use cases:
> 1. Passthrough of the endpoint to OSes which don't natively support
> hotplug can enable hotplug for that OS using the guest VMD driver
> 2. Some hypervisors have a limit on the number of devices that can be
> passed through. VMD endpoint is a single device that expands to many.
> 3. Expansion of possible bus numbers beyond 256 by using other
> segments.
> 4. Custom RAID LED patterns driven by ledctl
>
> I'm not trying to market this. Just pointing out that this isn't
> "bringing zero actual benefits" to many users.
>

The initial intent of the VMD driver was to allow Linux to find and
initialize devices behind a VMD configuration where VMD was required
for a non-Linux OS. For Linux, if full native PCI-E is an available
configuration option I think it makes sense to recommend Linux users
to flip that knob rather than continue to wrestle with the caveats of
the VMD driver. Where that knob isn't possible / available VMD can be
a fallback, but full native PCI-E is what Linux wants in the end.


Re: [PATCH v4 00/23] device-dax: Support sub-dividing soft-reserved ranges

2020-08-21 Thread Dan Williams
On Fri, Aug 21, 2020 at 11:30 AM David Hildenbrand  wrote:
>
> On 21.08.20 20:27, Dan Williams wrote:
> > On Fri, Aug 21, 2020 at 3:15 AM David Hildenbrand  wrote:
> >>
> >>>>
> >>>> 1. On x86-64, e820 indicates "soft-reserved" memory. This memory is not
> >>>> automatically used in the buddy during boot, but remains untouched
> >>>> (similar to pmem). But as it involves ACPI as well, it could also be
> >>>> used on arm64 (-e820), correct?
> >>>
> >>> Correct, arm64 also gets the EFI support for enumerating memory this
> >>> way. However, I would clarify that whether soft-reserved is given to
> >>> the buddy allocator by default or not is the kernel's policy choice,
> >>> "buddy-by-default" is ok and is what will happen anyways with older
> >>> kernels on platforms that enumerate a memory range this way.
> >>
> >> Is "soft-reserved" then the right terminology for that? It sounds very
> >> x86-64/e820 specific. Maybe a compressed for of "performance
> >> differentiated memory" might be a better fit to expose to user space, no?
> >
> > No. The EFI "Specific Purpose" bit is an attribute independent of
> > e820, it's x86-Linux that entangles those together. There is no
> > requirement for platform firmware to use that designation even for
> > drastic performance differentiation between ranges, and conversely
> > there is no requirement that memory *with* that designation has any
> > performance difference compared to the default memory pool. So it
> > really is a reservation policy about a memory range to keep out of the
> > buddy allocator by default.
>
> Okay, still "soft-reserved" is x86-64 specific, no?

There's nothing preventing other EFI archs, or a similar designation
in another firmware spec, picking up this policy.

>   (AFAIK,
> "soft-reserved" will be visible in /proc/iomem, or am I confusing
> stuff?)

No, you're correct.

> IOW, it "performance differentiated" is not universally
> applicable, maybe  "specific purpose memory" is ?

Those bikeshed colors don't seem an improvement to me.

"Soft-reserved" actually tells you something about the kernel policy
for the memory. The criticism of "specific purpose" that led to
calling it "soft-reserved" in Linux is the fact that "specific" is
undefined as far as the firmware knows, and "specific" may have
different applications based on the platform user. "Soft-reserved"
like "Reserved" tells you that a driver policy might be in play for
that memory.

Also note that the current color of the bikeshed has already shipped since v5.5:

   262b45ae3ab4 x86/efi: EFI soft reservation to E820 enumeration


Re: [PATCH v4 00/23] device-dax: Support sub-dividing soft-reserved ranges

2020-08-21 Thread Dan Williams
On Fri, Aug 21, 2020 at 3:15 AM David Hildenbrand  wrote:
>
> >>
> >> 1. On x86-64, e820 indicates "soft-reserved" memory. This memory is not
> >> automatically used in the buddy during boot, but remains untouched
> >> (similar to pmem). But as it involves ACPI as well, it could also be
> >> used on arm64 (-e820), correct?
> >
> > Correct, arm64 also gets the EFI support for enumerating memory this
> > way. However, I would clarify that whether soft-reserved is given to
> > the buddy allocator by default or not is the kernel's policy choice,
> > "buddy-by-default" is ok and is what will happen anyways with older
> > kernels on platforms that enumerate a memory range this way.
>
> Is "soft-reserved" then the right terminology for that? It sounds very
> x86-64/e820 specific. Maybe a compressed form of "performance
> differentiated memory" might be a better fit to expose to user space, no?

No. The EFI "Specific Purpose" bit is an attribute independent of
e820, it's x86-Linux that entangles those together. There is no
requirement for platform firmware to use that designation even for
drastic performance differentiation between ranges, and conversely
there is no requirement that memory *with* that designation has any
performance difference compared to the default memory pool. So it
really is a reservation policy about a memory range to keep out of the
buddy allocator by default.

[..]
> > Both, but note that PMEM is already hard-reserved by default.
> > Soft-reserved is about a memory range that, for example, an
> > administrator may want to reserve 100% for a weather simulation where
> > if even a small amount of memory was stolen for the page cache the
> > application may not meet its performance targets. It could also be a
> > memory range that is so slow that only applications with higher
> > latency tolerances would be prepared to consume it.
> >
> > In other words the soft-reserved memory can be used to indicate memory
> > that is either too precious, or too slow for general purpose OS
> > allocations.
>
> Right, so actually performance-differentiated in any way :)

... or not differentiated at all which is Joao's use case for example.

[..]
> > Numa node numbers / are how performance differentiated memory ranges
> > are enumerated. The expectation is that all distinct performance
> > memory targets have unique ACPI proximity domains and Linux numa node
> > numbers as a result.
>
> Makes sense to me (although it's somehow weird, because memory of the
> same socket/node would be represented via different NUMA nodes), thanks!

Yes, numa ids as only physical socket identifiers is no longer a
reliable assumption since the introduction of the ACPI HMAT.


Re: [PATCH v4 00/23] device-dax: Support sub-dividing soft-reserved ranges

2020-08-19 Thread Dan Williams
On Mon, Aug 3, 2020 at 12:48 AM David Hildenbrand  wrote:
>
> [...]
>
> > Well, no v5.8-rc8 to line this up for v5.9, so next best is early
> > integration into -mm before other collisions develop.
> >
> > Chatted with Justin offline and it currently appears that the missing
> > numa information is the fault of the platform firmware to populate all
> > the necessary NUMA data in the NFIT.
>
> I'm planning on looking at some bits of this series this week, but some
> questions upfront ...
>
> >
> > ---
> > Cover:
> >
> > The device-dax facility allows an address range to be directly mapped
> > through a chardev, or optionally hotplugged to the core kernel page
> > allocator as System-RAM. It is the mechanism for converting persistent
> > memory (pmem) to be used as another volatile memory pool i.e. the
> > current Memory Tiering hot topic on linux-mm.
> >
> > In the case of pmem the nvdimm-namespace-label mechanism can sub-divide
> > it, but that labeling mechanism is not available / applicable to
> > soft-reserved ("EFI specific purpose") memory [3]. This series provides
> > a sysfs-mechanism for the daxctl utility to enable provisioning of
> > volatile-soft-reserved memory ranges.
> >
> > The motivations for this facility are:
> >
> > 1/ Allow performance differentiated memory ranges to be split between
> >kernel-managed and directly-accessed use cases.
> >
> > 2/ Allow physical memory to be provisioned along performance relevant
> >address boundaries. For example, divide a memory-side cache [4] along
> >cache-color boundaries.
> >
> > 3/ Parcel out soft-reserved memory to VMs using device-dax as a security
> >/ permissions boundary [5]. Specifically I have seen people (ab)using
> >memmap=nn!ss (mark System-RAM as Persistent Memory) just to get the
> >device-dax interface on custom address ranges. A follow-on for the VM
> >use case is to teach device-dax to dynamically allocate 'struct page' at
> >runtime to reduce the duplication of 'struct page' space in both the
> >guest and the host kernel for the same physical pages.
>
>
> I think I am missing some important pieces. Bear with me.

No worries, also bear with me, I'm going to be offline intermittently
until at least mid-September. Hopefully Joao and/or Vishal can jump in
on this discussion.

>
> 1. On x86-64, e820 indicates "soft-reserved" memory. This memory is not
> automatically used in the buddy during boot, but remains untouched
> (similar to pmem). But as it involves ACPI as well, it could also be
> used on arm64 (-e820), correct?

Correct, arm64 also gets the EFI support for enumerating memory this
way. However, I would clarify that whether soft-reserved is given to
the buddy allocator by default or not is the kernel's policy choice,
"buddy-by-default" is ok and is what will happen anyways with older
kernels on platforms that enumerate a memory range this way.

> 2. Soft-reserved memory is volatile RAM with differing performance
> characteristics ("performance differentiated memory"). What would be
> examples of such memory?

Likely the most prominent one that drove the creation of the "EFI
Specific Purpose" attribute bit is high-bandwidth memory. One concrete
example of that was a platform called Knights Landing [1] that ended
up shipping firmware that lied to the OS about the latency
characteristics of the memory to try to reverse engineer OS behavior
to not allocate from that memory range by default. With the EFI
attribute firmware performance tables can tell the truth about the
performance characteristics of the memory range *and* indicate that
the OS not use it for general purpose allocations by default.

[1]: 
https://software.intel.com/content/www/us/en/develop/blogs/an-intro-to-mcdram-high-bandwidth-memory-on-knights-landing.html

> Like, memory that is faster than RAM (scratch
> pad), or slower (pmem)? Or both? :)

Both, but note that PMEM is already hard-reserved by default.
Soft-reserved is about a memory range that, for example, an
administrator may want to reserve 100% for a weather simulation where
if even a small amount of memory was stolen for the page cache the
application may not meet its performance targets. It could also be a
memory range that is so slow that only applications with higher
latency tolerances would be prepared to consume it.

In other words the soft-reserved memory can be used to indicate memory
that is either too precious, or too slow for general purpose OS
allocations.

> Is it a valid use case to use pmem
> in a hypervisor to back this memory?

Depends on the pmem. That performance capability is indicated by the
ACPI HMAT, not the EFI soft-reserved designation.

> 3. There seem to be use cases where "soft-reserved" memory is used via
> DAX. What is an example use case? I assume it's *not* to treat it like
> PMEM but instead e.g., use it as a fast buffer inside applications or
> similar.

Right, in that weather-simulation example that application could just
mmap /dev/daxX.Y 
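
In userspace terms, a minimal consumer of such an instance looks like
this sketch (the device path and mapping size are assumptions; the size
must respect the instance's alignment):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/dax0.0", O_RDWR);
		size_t len = 1UL << 30;
		void *p;

		if (fd < 0) {
			perror("open");
			return 1;
		}
		p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* loads/stores to p go directly to the reserved range */
		munmap(p, len);
		close(fd);
		return 0;
	}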

Re: [PATCH] dma-debug: fix debug_dma_assert_idle(), use rcu_read_lock()

2020-08-13 Thread Dan Williams
On Thu, Aug 13, 2020 at 12:03 PM Linus Torvalds
 wrote:
>
> On Wed, Aug 12, 2020 at 8:17 PM Hugh Dickins  wrote:
> >
> > Since commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic")
> > improved unlock_page(), it has become more noticeable how cow_user_page()
> > in a kernel with CONFIG_DMA_API_DEBUG=y can create and suffer from heavy
> > contention on DMA debug's radix_lock in debug_dma_assert_idle().
>
> Ooh.
>
> Yeah, that's ridiculously expensive, and serializes things for no good reason.
>
> Your patch looks obviously correct to me (Christoph?), but it also
> makes me go "why are we doing this in the first place"?
>
> Because it looks to me like
>  (a) the debug check is wrong
>  (b) this is left-over from early debugging
>
> In particular, I don't see why we couldn't do a COW on a page that is
> under writeback at the same time. We're not changing the page that is
> doing DMA.
>
> In fact, the whole "COW with DMA" makes me feel like the real bug may
> have been due that whole "ambiguous COW" thing, which was fixed in
> 17839856fd58 ("gup: document and work around "COW can break either
> way" issue")
>
> That debug thing goes back almost 7 years, and I don't think it has
> caught anything in those seven years, but I could be wrong.
>
> The commit that adds it does talk about a bug, but that code was
> removed entirely eventually. And google shows no hits for
> debug_dma_assert_idle() since - until your email.
>
> So my gut feel is that we should remove the check entirely, although
> your patch does seem like a big improvement.
>
> Christoph?
>
> (And Dan too, of course, in case he happens to be relaxing in front of
> the computer away from a newborn baby ;)
>

I can at least confirm that it has not caught anything in a long while
except a false positive that needed a fix up.

https://lore.kernel.org/lkml/capcyv4hy_nne8g0o8smrz9a8hcdrzaukgxmvdjkusaaa3fo...@mail.gmail.com/

Part of me says it's not doing anything worthwhile upstream, but I
wonder if it is keeping some people from submitting patches that play
these page reference shenanigans? I know they're out there. The land
of gup and truncate is where questionable kernel changes go to die.

Outside of that, Hugh's patch looks like a definite improvement so I'd
be inclined to run with that, but rip the whole facility out at the
next sign of a false positive.


Re: [x86/copy_mc] a0ac629ebe: fio.read_iops -43.3% regression

2020-08-06 Thread Dan Williams
On Thu, Aug 6, 2020 at 6:35 AM Ingo Molnar  wrote:
>
>
> * kernel test robot  wrote:
>
> > Greeting,
> >
> > FYI, we noticed a -43.3% regression of fio.read_iops due to commit:
> >
> >
> > commit: a0ac629ebe7b3d248cb93807782a00d9142fdb98 ("x86/copy_mc: Introduce 
> > copy_mc_generic()")
> > url: 
> > https://github.com/0day-ci/linux/commits/Dan-Williams/Renovate-memcpy_mcsafe-with-copy_mc_to_-user-kernel/20200802-014046
> >
> >
> > in testcase: fio-basic
> > on test machine: 96 threads Intel(R) Xeon(R) Gold 6252 CPU @ 2.10GHz with 
> > 256G memory
> > with following parameters:
>
> So this performance regression, if it isn't a spurious result, looks
> concerning. Is this expected?

This is not expected, and I think it delays these patches until I'm back
from leave in a few weeks. I know that we might lose some inlining
effect due to replacing native memcpy, but I did not expect it would
have an impact like this. In my testing I was seeing a performance
improvement from replacing the careful / open-coded copy with rep;
mov;, which increases the surprise of this result.
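
For context, the "rep; mov" style copy in question is essentially the
fast-string idiom below (illustration only, not the patch's code):

	static inline void rep_movsb_copy(void *dst, const void *src, size_t len)
	{
		/* on ERMS-capable CPUs this is typically the fastest bulk copy */
		asm volatile("rep movsb"
			     : "+D" (dst), "+S" (src), "+c" (len)
			     : : "memory");
	}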


[PATCH v4 16/23] mm/memremap_pages: Convert to 'struct range'

2020-08-02 Thread Dan Williams
The 'struct resource' in 'struct dev_pagemap' is only used for holding
resource span information. The other fields, 'name', 'flags', 'desc',
'parent', 'sibling', and 'child' are all unused wasted space.

This is in preparation for introducing a multi-range extension of
devm_memremap_pages().

The bulk of this change is unwinding all the places internal to
libnvdimm that used 'struct resource' unnecessarily.

P2PDMA had a minor usage of the flags field, but only to report failures
with "%pR". That is replaced with an open coded print of the range.

Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Dan Williams 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ben Skeggs 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Ira Weiny 
Cc: Jason Gunthorpe 
Signed-off-by: Dan Williams 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c |   13 +++--
 drivers/dax/bus.c  |   10 ++--
 drivers/dax/bus.h  |2 -
 drivers/dax/dax-private.h  |5 --
 drivers/dax/device.c   |3 -
 drivers/dax/hmem/hmem.c|5 ++
 drivers/dax/pmem/core.c|   12 ++---
 drivers/gpu/drm/nouveau/nouveau_dmem.c |   14 +++---
 drivers/nvdimm/badrange.c  |   26 +--
 drivers/nvdimm/claim.c |   13 +++--
 drivers/nvdimm/nd.h|3 +
 drivers/nvdimm/pfn_devs.c  |   12 ++---
 drivers/nvdimm/pmem.c  |   26 ++-
 drivers/nvdimm/region.c|   21 +
 drivers/pci/p2pdma.c   |   11 ++---
 include/linux/memremap.h   |5 +-
 include/linux/range.h  |6 ++
 lib/test_hmm.c |   14 +++---
 mm/memremap.c  |   77 
 tools/testing/nvdimm/test/iomap.c  |2 -
 20 files changed, 147 insertions(+), 133 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 7705d5557239..29ec555055c2 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -687,9 +687,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
struct kvmppc_uvmem_page_pvt *pvt;
unsigned long pfn_last, pfn_first;
 
-   pfn_first = kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT;
+   pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT;
pfn_last = pfn_first +
-		   (resource_size(&kvmppc_uvmem_pgmap.res) >> PAGE_SHIFT);
+		   (range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT);
 
	spin_lock(&kvmppc_uvmem_bitmap_lock);
bit = find_first_zero_bit(kvmppc_uvmem_bitmap,
@@ -1007,7 +1007,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
 static void kvmppc_uvmem_page_free(struct page *page)
 {
unsigned long pfn = page_to_pfn(page) -
-   (kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT);
+   (kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT);
struct kvmppc_uvmem_page_pvt *pvt;
 
	spin_lock(&kvmppc_uvmem_bitmap_lock);
@@ -1170,7 +1170,8 @@ int kvmppc_uvmem_init(void)
}
 
kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
-   kvmppc_uvmem_pgmap.res = *res;
+   kvmppc_uvmem_pgmap.range.start = res->start;
+   kvmppc_uvmem_pgmap.range.end = res->end;
	kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
/* just one global instance: */
	kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
@@ -1205,7 +1206,7 @@ void kvmppc_uvmem_free(void)
return;
 
	memunmap_pages(&kvmppc_uvmem_pgmap);
-	release_mem_region(kvmppc_uvmem_pgmap.res.start,
-			   resource_size(&kvmppc_uvmem_pgmap.res));
+	release_mem_region(kvmppc_uvmem_pgmap.range.start,
+			   range_len(&kvmppc_uvmem_pgmap.range));
kfree(kvmppc_uvmem_bitmap);
 }
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 53d07f2f1285..00fa73a8dfb4 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -515,7 +515,7 @@ static void dax_region_unregister(void *region)
 }
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-   struct resource *res, int target_node, unsigned int align,
+   struct range *range, int target_node, unsigned int align,
unsigned long flags)
 {
struct dax_region *dax_region;
@@ -530,8 +530,8 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
return NULL;
}
 
-   if (!IS_ALIGNED(res->start, align)
-   || !IS_ALIGNED(resource_size(res), align))
+   if (!IS_ALIGNED(range->start, align)
+   || !IS_ALIGNED(range_len(range), align))
return NULL;
 
dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
@@ -546,8 +546,8 @@ struct dax_regi

[PATCH v4 18/23] device-dax: Add dis-contiguous resource support

2020-08-02 Thread Dan Williams
Break the requirement that device-dax instances are physically
contiguous. With this constraint removed it allows fragmented available
capacity to be fully allocated.

This capability is useful to mitigate the "noisy neighbor" problem with
memory-side-cache management for virtual machines, or any other scenario
where a platform address boundary also designates a performance
boundary. For example a direct mapped memory side cache might rotate
cache colors at 1GB boundaries.  With dis-contiguous allocations a
device-dax instance could be configured to contain only 1 cache color.
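
A worked example of the cache-color math, as a standalone sketch (the
base address, range size, and color count are all made up):

	#include <stdint.h>
	#include <stdio.h>

	#define GIB (1ULL << 30)

	int main(void)
	{
		uint64_t base = 0x1000000000ULL;	/* assumed range start */
		uint64_t size = 64 * GIB;		/* assumed range size */
		int colors = 8, want = 0;		/* keep only color 0 */

		/* with colors rotating every 1 GiB, color 0 is every 8th chunk */
		for (uint64_t off = 0; off < size; off += GIB)
			if ((off / GIB) % colors == want)
				printf("%#llx-%#llx\n",
				       (unsigned long long)(base + off),
				       (unsigned long long)(base + off + GIB - 1));
		return 0;
	}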

It also satisfies Joao's use case (see link) for partitioning memory for
exclusive guest access. It allows for a future potential mode where the
host kernel need not allocate 'struct page' capacity up-front.

Link: 
https://lore.kernel.org/lkml/20200110190313.17144-1-joao.m.mart...@oracle.com/
Reported-by: Joao Martins 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c  |  230 +++-
 drivers/dax/dax-private.h  |9 +-
 drivers/dax/device.c   |   55 ++
 drivers/dax/kmem.c |  132 +++
 tools/testing/nvdimm/dax-dev.c |   20 ++-
 5 files changed, 319 insertions(+), 127 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 00fa73a8dfb4..8dd82ea9d53d 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -136,15 +136,27 @@ static bool is_static(struct dax_region *dax_region)
return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
 }
 
+static u64 dev_dax_size(struct dev_dax *dev_dax)
+{
+   u64 size = 0;
+   int i;
+
+	device_lock_assert(&dev_dax->dev);
+
+   for (i = 0; i < dev_dax->nr_range; i++)
+		size += range_len(&dev_dax->ranges[i].range);
+
+   return size;
+}
+
 static int dax_bus_probe(struct device *dev)
 {
struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
struct dev_dax *dev_dax = to_dev_dax(dev);
struct dax_region *dax_region = dev_dax->region;
-	struct range *range = &dev_dax->range;
int rc;
 
-   if (range_len(range) == 0 || dev_dax->id < 0)
+   if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0)
return -ENXIO;
 
rc = dax_drv->probe(dev_dax);
@@ -354,15 +366,19 @@ void kill_dev_dax(struct dev_dax *dev_dax)
 }
 EXPORT_SYMBOL_GPL(kill_dev_dax);
 
-static void free_dev_dax_range(struct dev_dax *dev_dax)
+static void free_dev_dax_ranges(struct dev_dax *dev_dax)
 {
struct dax_region *dax_region = dev_dax->region;
-	struct range *range = &dev_dax->range;
+   int i;
 
device_lock_assert(dax_region->dev);
-   if (range_len(range))
+   for (i = 0; i < dev_dax->nr_range; i++) {
+		struct range *range = &dev_dax->ranges[i].range;
+
		__release_region(&dax_region->res, range->start,
range_len(range));
+   }
+   dev_dax->nr_range = 0;
 }
 
 static void unregister_dev_dax(void *dev)
@@ -372,7 +388,7 @@ static void unregister_dev_dax(void *dev)
dev_dbg(dev, "%s\n", __func__);
 
kill_dev_dax(dev_dax);
-   free_dev_dax_range(dev_dax);
+   free_dev_dax_ranges(dev_dax);
device_del(dev);
put_device(dev);
 }
@@ -423,7 +439,7 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr,
device_lock(dev);
device_lock(victim);
dev_dax = to_dev_dax(victim);
-	if (victim->driver || range_len(&dev_dax->range))
+   if (victim->driver || dev_dax_size(dev_dax))
rc = -EBUSY;
else {
/*
@@ -569,51 +585,83 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
struct dax_region *dax_region = dev_dax->region;
	struct resource *res = &dax_region->res;
	struct device *dev = &dev_dax->dev;
+   struct dev_dax_range *ranges;
+   unsigned long pgoff = 0;
struct resource *alloc;
+   int i;
 
device_lock_assert(dax_region->dev);
 
/* handle the seed alloc special case */
if (!size) {
-   dev_dax->range = (struct range) {
-   .start = res->start,
-   .end = res->start - 1,
-   };
+   if (dev_WARN_ONCE(dev, dev_dax->nr_range,
+   "0-size allocation must be first\n"))
+   return -EBUSY;
+   /* nr_range == 0 is elsewhere special cased as 0-size device */
return 0;
}
 
+   ranges = krealloc(dev_dax->ranges, sizeof(*ranges)
+   * (dev_dax->nr_range + 1), GFP_KERNEL);
+   if (!ranges)
+   return -ENOMEM;
+
alloc = __request_region(res, start, size, dev_name(dev), 0);
-   if (

[PATCH v4 17/23] mm/memremap_pages: Support multiple ranges per invocation

2020-08-02 Thread Dan Williams
In support of device-dax growing the ability to front physically
dis-contiguous ranges of memory, update devm_memremap_pages() to track
multiple ranges with a single reference counter and devm instance.
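
A hedged sketch of a caller after this change (the allocation idiom and
the range values are assumptions; nr_range plus a tail array of ranges
is the layout this series introduces):

	struct dev_pagemap *pgmap;

	/* room for one extra range beyond the embedded first one */
	pgmap = devm_kzalloc(dev, struct_size(pgmap, ranges, 1), GFP_KERNEL);
	pgmap->type = MEMORY_DEVICE_DEVDAX;
	pgmap->nr_range = 2;
	pgmap->ranges[0] = (struct range) {
		.start = 0x100000000ULL, .end = 0x13fffffffULL };
	pgmap->ranges[1] = (struct range) {
		.start = 0x180000000ULL, .end = 0x1bfffffffULL };
	addr = devm_memremap_pages(dev, pgmap);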

Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Dan Williams 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ben Skeggs 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Ira Weiny 
Cc: Jason Gunthorpe 
Signed-off-by: Dan Williams 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c |1 
 drivers/dax/device.c   |1 
 drivers/gpu/drm/nouveau/nouveau_dmem.c |1 
 drivers/nvdimm/pfn_devs.c  |1 
 drivers/nvdimm/pmem.c  |1 
 drivers/pci/p2pdma.c   |1 
 include/linux/memremap.h   |   10 +
 lib/test_hmm.c |1 
 mm/memremap.c  |  258 +++-
 9 files changed, 165 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 29ec555055c2..84e5a2dc8be5 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -1172,6 +1172,7 @@ int kvmppc_uvmem_init(void)
kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
kvmppc_uvmem_pgmap.range.start = res->start;
kvmppc_uvmem_pgmap.range.end = res->end;
+   kvmppc_uvmem_pgmap.nr_range = 1;
kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
/* just one global instance: */
kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index fffc54ce0911..f3755df4ae29 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -417,6 +417,7 @@ int dev_dax_probe(struct dev_dax *dev_dax)
if (!pgmap)
return -ENOMEM;
pgmap->range = *range;
+   pgmap->nr_range = 1;
}
pgmap->type = MEMORY_DEVICE_DEVDAX;
addr = devm_memremap_pages(dev, pgmap);
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 25811ed7e274..a13c6215bba8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -251,6 +251,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct 
page **ppage)
chunk->pagemap.type = MEMORY_DEVICE_PRIVATE;
chunk->pagemap.range.start = res->start;
chunk->pagemap.range.end = res->end;
+   chunk->pagemap.nr_range = 1;
chunk->pagemap.ops = &nouveau_dmem_pagemap_ops;
chunk->pagemap.owner = drm->dev;
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 3c4787b92a6a..b499df630d4d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -693,6 +693,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct 
dev_pagemap *pgmap)
.start = nsio->res.start + start_pad,
.end = nsio->res.end - end_trunc,
};
+   pgmap->nr_range = 1;
if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset < reserve)
return -EINVAL;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 69cc0e783709..1f45af363a94 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -442,6 +442,7 @@ static int pmem_attach_disk(struct device *dev,
} else if (pmem_should_map_pages(dev)) {
pmem->pgmap.range.start = res->start;
pmem->pgmap.range.end = res->end;
+   pmem->pgmap.nr_range = 1;
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index dd6b0d51a50c..403304785561 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -187,6 +187,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, 
size_t size,
pgmap = &p2p_pgmap->pgmap;
pgmap->range.start = pci_resource_start(pdev, bar) + offset;
pgmap->range.end = pgmap->range.start + size - 1;
+   pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 
p2p_pgmap->provider = pdev;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 6c21951bdb16..4e9c738f4b31 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -95,7 +95,6 @@ struct dev_pagemap_ops {
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
- * @range: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
  * @internal_ref: internal reference if @ref is not provided by the caller
  * @done: completion for @internal_ref
@@ -105,10 +104,12 @@ struct dev_pagemap_ops {
  * @owner: an opaque po

[PATCH v4 14/23] drivers/base: Make device_find_child_by_name() compatible with sysfs inputs

2020-08-02 Thread Dan Williams
Use sysfs_streq() in device_find_child_by_name() to allow it to use a
sysfs input string that might contain a trailing newline.

The other "device by name" interfaces,
{bus,driver,class}_find_device_by_name(), already account for sysfs
strings.
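
A minimal illustration of the behavior difference, given that shell
writes like 'echo dax0.1 > delete' land in the sysfs buffer with a
trailing newline:

    strcmp("dax0.1", "dax0.1\n")      /* != 0, lookup fails  */
    sysfs_streq("dax0.1", "dax0.1\n") /* true, lookup matches */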

Cc: "Rafael J. Wysocki" 
Reviewed-by: Greg Kroah-Hartman 
Signed-off-by: Dan Williams 
---
 drivers/base/core.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 2169c5132558..231189dd6599 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3328,7 +3328,7 @@ struct device *device_find_child_by_name(struct device 
*parent,
 
klist_iter_init(&parent->p->klist_children, &i);
while ((child = next_device(&i)))
-   if (!strcmp(dev_name(child), name) && get_device(child))
+   if (sysfs_streq(dev_name(child), name) && get_device(child))
break;
klist_iter_exit();
return child;



[PATCH v4 19/23] device-dax: Introduce 'mapping' devices

2020-08-02 Thread Dan Williams
In support of interrogating the physical address layout of a device with
dis-contiguous ranges, introduce a sysfs directory with 'start', 'end',
and 'page_offset' attributes. The alternative is trying to parse
/proc/iomem, and that file will not reflect the extent layout until the
device is enabled.
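
A usage sketch, assuming the mapping devices surface as mappingN
directories under the device-dax instance:

    cd /sys/bus/dax/devices/dax0.1
    for m in mapping*; do
        echo "$m: $(cat $m/start)-$(cat $m/end) @ $(cat $m/page_offset)"
    done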

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  191 +
 drivers/dax/dax-private.h |   14 +++
 2 files changed, 203 insertions(+), 2 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 8dd82ea9d53d..2779c65dc7c0 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -579,6 +579,167 @@ struct dax_region *alloc_dax_region(struct device 
*parent, int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
+static void dax_mapping_release(struct device *dev)
+{
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+
+   ida_free(&dev_dax->ida, mapping->id);
+   kfree(mapping);
+}
+
+static void unregister_dax_mapping(void *data)
+{
+   struct device *dev = data;
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   dev_dbg(dev, "%s\n", __func__);
+
+   device_lock_assert(dax_region->dev);
+
+   dev_dax->ranges[mapping->range_id].mapping = NULL;
+   mapping->range_id = -1;
+
+   device_del(dev);
+   put_device(dev);
+}
+
+static struct dev_dax_range *get_dax_range(struct device *dev)
+{
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   device_lock(dax_region->dev);
+   if (mapping->range_id < 0) {
+   device_unlock(dax_region->dev);
+   return NULL;
+   }
+
+   return &dev_dax->ranges[mapping->range_id];
+}
+
+static void put_dax_range(struct dev_dax_range *dax_range)
+{
+   struct dax_mapping *mapping = dax_range->mapping;
+   struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   device_unlock(dax_region->dev);
+}
+
+static ssize_t start_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#llx\n", dax_range->range.start);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(start, 0400, start_show, NULL);
+
+static ssize_t end_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#llx\n", dax_range->range.end);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(end, 0400, end_show, NULL);
+
+static ssize_t pgoff_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#lx\n", dax_range->pgoff);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(page_offset, 0400, pgoff_show, NULL);
+
+static struct attribute *dax_mapping_attributes[] = {
+   &dev_attr_start.attr,
+   &dev_attr_end.attr,
+   &dev_attr_page_offset.attr,
+   NULL,
+};
+
+static const struct attribute_group dax_mapping_attribute_group = {
+   .attrs = dax_mapping_attributes,
+};
+
+static const struct attribute_group *dax_mapping_attribute_groups[] = {
+   &dax_mapping_attribute_group,
+   NULL,
+};
+
+static struct device_type dax_mapping_type = {
+   .release = dax_mapping_release,
+   .groups = dax_mapping_attribute_groups,
+};
+
+static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id)
+{
+   struct dax_region *dax_region = dev_dax->region;
+   struct dax_mapping *mapping;
+   struct device *dev;
+   int rc;
+
+   device_lock_assert(dax_region->dev);
+
+   if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver,
+   "region disabled\n"))
+   return -ENXIO;
+
+   mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+   if (!mapping)
+   return -ENOMEM;
+   mapping->range_id = range_id;
+   mapping->id = ida_alloc(&dev_dax->ida, GFP_KERNEL);
+   if (mapping->id < 0) {
+   kfree(ma

[PATCH v4 22/23] dax/hmem: Introduce dax_hmem.region_idle parameter

2020-08-02 Thread Dan Williams
From: Joao Martins 

Introduce a new module parameter for dax_hmem which
initializes all region devices as free, rather than allocating
a pagemap for the region by default.

All hmem devices created with dax_hmem.region_idle=1 will have their
full size available for creating dynamic dax devices.
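
A usage sketch, assuming an hmem-backed region that would otherwise
come up fully allocated to dax0.0:

    modprobe dax_hmem region_idle=1
    cat /sys/bus/dax/devices/dax0.0/size
    # 0: nothing allocated, the full region shows up in the region's
    # available_size and can be handed to dynamically created devices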

Signed-off-by: Joao Martins 
Link: https://lore.kernel.org/r/20200716172913.19658-4-joao.m.mart...@oracle.com
Signed-off-by: Dan Williams 
---
 drivers/dax/hmem/hmem.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 1a3347bb6143..1bf040dbc834 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -5,6 +5,9 @@
 #include 
 #include "../bus.h"
 
+static bool region_idle;
+module_param_named(region_idle, region_idle, bool, 0644);
+
 static int dax_hmem_probe(struct platform_device *pdev)
 {
struct device *dev = >dev;
@@ -30,7 +33,7 @@ static int dax_hmem_probe(struct platform_device *pdev)
data = (struct dev_dax_data) {
.dax_region = dax_region,
.id = -1,
-   .size = resource_size(res),
+   .size = region_idle ? 0 : resource_size(res),
};
dev_dax = devm_create_dev_dax(&data);
if (IS_ERR(dev_dax))



[PATCH v4 23/23] device-dax: Add a range mapping allocation attribute

2020-08-02 Thread Dan Williams
From: Joao Martins 

Add a sysfs attribute which denotes a range from the dax region
to be allocated. It's a write-only @mapping sysfs attribute in
the format '<start>-<end>' to allocate a range. @start and
@end use hexadecimal values and the @pgoff is implicitly ordered
wrt previous writes to @mapping, e.g. for a first write of a range
of length 1G the pgoff spans 0..1G(-4K), and a second write will
continue @pgoff from 1G+4K...

This range mapping interface is useful for:

 1) Applications which want to implement their own allocation logic,
 and thus pick the desired ranges from the dax_region.

 2) Use cases like VMM fast restart[0] where after kexec we
 want to recreate the same gpa<->phys mappings (as originally created
 before kexec).

[0] 
https://static.sched.com/hosted_files/kvmforum2019/66/VMM-fast-restart_kvmforum2019.pdf
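
A usage sketch with hypothetical addresses (hexadecimal, inclusive
end, per range_parse() below):

    # allocate the 1G range [0x200000000, 0x23fffffff] to dax0.1
    echo 0x200000000-0x23fffffff > /sys/bus/dax/devices/dax0.1/mapping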

Signed-off-by: Joao Martins 
Link: https://lore.kernel.org/r/20200716172913.19658-5-joao.m.mart...@oracle.com
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |   64 +
 1 file changed, 64 insertions(+)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index b984213c315f..092112bba6ed 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1040,6 +1040,67 @@ static ssize_t size_store(struct device *dev, struct 
device_attribute *attr,
 }
 static DEVICE_ATTR_RW(size);
 
+static ssize_t range_parse(const char *opt, size_t len, struct range *range)
+{
+   unsigned long long addr = 0;
+   char *start, *end, *str;
+   ssize_t rc = -EINVAL;
+
+   str = kstrdup(opt, GFP_KERNEL);
+   if (!str)
+   return rc;
+
+   end = str;
+   start = strsep(, "-");
+   if (!start || !end)
+   goto err;
+
+   rc = kstrtoull(start, 16, &addr);
+   if (rc)
+   goto err;
+   range->start = addr;
+
+   rc = kstrtoull(end, 16, &addr);
+   if (rc)
+   goto err;
+   range->end = addr;
+
+err:
+   kfree(str);
+   return rc;
+}
+
+static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
+   const char *buf, size_t len)
+{
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+   struct dax_region *dax_region = dev_dax->region;
+   size_t to_alloc;
+   struct range r;
+   ssize_t rc;
+
+   rc = range_parse(buf, len, &r);
+   if (rc)
+   return rc;
+
+   rc = -ENXIO;
+   device_lock(dax_region->dev);
+   if (!dax_region->dev->driver) {
+   device_unlock(dax_region->dev);
+   return rc;
+   }
+   device_lock(dev);
+
+   to_alloc = range_len(&r);
+   if (alloc_is_aligned(dev_dax, to_alloc))
+   rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc);
+   device_unlock(dev);
+   device_unlock(dax_region->dev);
+
+   return rc == 0 ? len : rc;
+}
+static DEVICE_ATTR_WO(mapping);
+
 static ssize_t align_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
@@ -1172,6 +1233,8 @@ static umode_t dev_dax_visible(struct kobject *kobj, 
struct attribute *a, int n)
return 0;
if (a == _attr_numa_node.attr && !IS_ENABLED(CONFIG_NUMA))
return 0;
+   if (a == &dev_attr_mapping.attr && is_static(dax_region))
+   return 0;
if ((a == &dev_attr_align.attr ||
 a == &dev_attr_size.attr) && is_static(dax_region))
return 0444;
@@ -1181,6 +1244,7 @@ static umode_t dev_dax_visible(struct kobject *kobj, 
struct attribute *a, int n)
 static struct attribute *dev_dax_attributes[] = {
&dev_attr_modalias.attr,
&dev_attr_size.attr,
+   &dev_attr_mapping.attr,
&dev_attr_target_node.attr,
&dev_attr_align.attr,
&dev_attr_resource.attr,



[PATCH v4 21/23] device-dax: Add an 'align' attribute

2020-08-02 Thread Dan Williams
From: Joao Martins 

Introduce a device align attribute. While doing so,
rename the region align attribute internally to be more
explicit about its scope, but keep it exposed as @align to retain
the API for tools like daxctl.

Changes to align may not always be valid, when, say, certain
mappings were created with 2M alignment and we then switch to 1G. So,
we validate all ranges against the new value being attempted,
post resizing.
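
A usage sketch, assuming the attribute accepts byte values like the
region-level @align reports:

    echo dax0.1 > /sys/bus/dax/drivers/device_dax/unbind
    echo 1073741824 > /sys/bus/dax/devices/dax0.1/align	# 1G
    echo dax0.1 > /sys/bus/dax/drivers/device_dax/bind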

Signed-off-by: Joao Martins 
Link: https://lore.kernel.org/r/20200716172913.19658-3-joao.m.mart...@oracle.com
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |   93 -
 drivers/dax/dax-private.h |   18 +
 2 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 9edfdf83408e..b984213c315f 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -230,14 +230,15 @@ static ssize_t region_size_show(struct device *dev,
 static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
region_size_show, NULL);
 
-static ssize_t align_show(struct device *dev,
+static ssize_t region_align_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
struct dax_region *dax_region = dev_get_drvdata(dev);
 
return sprintf(buf, "%u\n", dax_region->align);
 }
-static DEVICE_ATTR_RO(align);
+static struct device_attribute dev_attr_region_align =
+   __ATTR(align, 0400, region_align_show, NULL);
 
 #define for_each_dax_region_resource(dax_region, res) \
for (res = (dax_region)->res.child; res; res = res->sibling)
@@ -488,7 +489,7 @@ static umode_t dax_region_visible(struct kobject *kobj, 
struct attribute *a,
 static struct attribute *dax_region_attributes[] = {
&dev_attr_available_size.attr,
&dev_attr_region_size.attr,
-   &dev_attr_align.attr,
+   &dev_attr_region_align.attr,
&dev_attr_create.attr,
&dev_attr_seed.attr,
&dev_attr_delete.attr,
@@ -855,15 +856,13 @@ static ssize_t size_show(struct device *dev,
return sprintf(buf, "%llu\n", size);
 }
 
-static bool alloc_is_aligned(struct dax_region *dax_region,
-   resource_size_t size)
+static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size)
 {
/*
 * The minimum mapping granularity for a device instance is a
 * single subsection, unless the arch says otherwise.
 */
-   return IS_ALIGNED(size, max_t(unsigned long, dax_region->align,
-   memremap_compat_align()));
+   return IS_ALIGNED(size, max_t(unsigned long, dev_dax->align,
+   memremap_compat_align()));
 }
 
 static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
@@ -958,7 +957,7 @@ static ssize_t dev_dax_resize(struct dax_region *dax_region,
return dev_dax_shrink(dev_dax, size);
 
to_alloc = size - dev_size;
-   if (dev_WARN_ONCE(dev, !alloc_is_aligned(dax_region, to_alloc),
+   if (dev_WARN_ONCE(dev, !alloc_is_aligned(dev_dax, to_alloc),
"resize of %pa misaligned\n", _alloc))
return -ENXIO;
 
@@ -1022,7 +1021,7 @@ static ssize_t size_store(struct device *dev, struct 
device_attribute *attr,
if (rc)
return rc;
 
-   if (!alloc_is_aligned(dax_region, val)) {
+   if (!alloc_is_aligned(dev_dax, val)) {
dev_dbg(dev, "%s: size: %lld misaligned\n", __func__, val);
return -EINVAL;
}
@@ -1041,6 +1040,78 @@ static ssize_t size_store(struct device *dev, struct 
device_attribute *attr,
 }
 static DEVICE_ATTR_RW(size);
 
+static ssize_t align_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+
+   return sprintf(buf, "%d\n", dev_dax->align);
+}
+
+static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax)
+{
+   resource_size_t dev_size = dev_dax_size(dev_dax);
+   struct device *dev = &dev_dax->dev;
+   int i;
+
+   if (dev_size > 0 && !alloc_is_aligned(dev_dax, dev_size)) {
+   dev_dbg(dev, "%s: align %u invalid for size %pa\n",
+   __func__, dev_dax->align, &dev_size);
+   return -EINVAL;
+   }
+
+   for (i = 0; i < dev_dax->nr_range; i++) {
+   size_t len = range_len(&dev_dax->ranges[i].range);
+
+   if (!alloc_is_aligned(dev_dax, len)) {
+   dev_dbg(dev, "%s: align %u invalid for range %d\n",
+   __func__, dev_dax->align, i);
+   return -EINVAL;
+   }
+   }
+
+   return 0;
+}
+
+static ssize_t align_store(struct device *dev, struct device_attribute *attr,
+   const char *buf, size_t len)
+{
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+   str

[PATCH v4 15/23] device-dax: Add resize support

2020-08-02 Thread Dan Williams
Make the device-dax 'size' attribute writable to allow capacity to be
split between multiple instances in a region. The intended consumers of
this capability are users that want to split a scarce memory resource
between device-dax and System-RAM access, or users that want to have
multiple security domains for a large region.

By default the hmem instance provider allocates an entire region to the
first instance. The process of creating a new instance (assuming a
region-id of 0) is to find the region and trigger the 'create'
attribute, which yields an empty instance to configure. For example:

cd /sys/bus/dax/devices
echo dax0.0 > dax0.0/driver/unbind
echo $new_size > dax0.0/size
echo 1 > $(readlink -f dax0.0)/../dax_region/create
seed=$(cat $(readlink -f dax0.0)/../dax_region/seed)
echo $new_size > $seed/size
echo dax0.0 > ../drivers/{device_dax,kmem}/bind
echo dax0.1 > ../drivers/{device_dax,kmem}/bind

Instances can be destroyed by:

echo $device > $(readlink -f $device)/../dax_region/delete

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  161 ++---
 1 file changed, 152 insertions(+), 9 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index dce9413a4394..53d07f2f1285 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "dax-private.h"
 #include "bus.h"
 
@@ -562,7 +563,8 @@ struct dax_region *alloc_dax_region(struct device *parent, 
int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
-static int alloc_dev_dax_range(struct dev_dax *dev_dax, resource_size_t size)
+static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
+   resource_size_t size)
 {
struct dax_region *dax_region = dev_dax->region;
struct resource *res = &dax_region->res;
@@ -580,12 +582,7 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, 
resource_size_t size)
return 0;
}
 
-   /* TODO: handle multiple allocations per region */
-   if (res->child)
-   return -ENOMEM;
-
-   alloc = __request_region(res, res->start, size, dev_name(dev), 0);
-
+   alloc = __request_region(res, start, size, dev_name(dev), 0);
if (!alloc)
return -ENOMEM;
 
@@ -597,6 +594,29 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, 
resource_size_t size)
return 0;
 }
 
+static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, 
resource_size_t size)
+{
+   struct dax_region *dax_region = dev_dax->region;
+   struct range *range = &dev_dax->range;
+   int rc = 0;
+
+   device_lock_assert(dax_region->dev);
+
+   if (size)
+   rc = adjust_resource(res, range->start, size);
+   else
+   __release_region(&dax_region->res, range->start,
+   range_len(range));
+   if (rc)
+   return rc;
+
+   dev_dax->range = (struct range) {
+   .start = range->start,
+   .end = range->start + size - 1,
+   };
+
+   return 0;
+}
+
 static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
@@ -605,7 +625,127 @@ static ssize_t size_show(struct device *dev,
 
return sprintf(buf, "%llu\n", size);
 }
-static DEVICE_ATTR_RO(size);
+
+static bool alloc_is_aligned(struct dax_region *dax_region,
+   resource_size_t size)
+{
+   /*
+* The minimum mapping granularity for a device instance is a
+* single subsection, unless the arch says otherwise.
+*/
+   return IS_ALIGNED(size, max_t(unsigned long, dax_region->align,
+   memremap_compat_align()));
+}
+
+static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
+{
+   struct dax_region *dax_region = dev_dax->region;
+   struct range *range = &dev_dax->range;
+   struct resource *res, *adjust = NULL;
+   struct device *dev = &dev_dax->dev;
+
+   for_each_dax_region_resource(dax_region, res)
+   if (strcmp(res->name, dev_name(dev)) == 0
+   && res->start == range->start) {
+   adjust = res;
+   break;
+   }
+
+   if (dev_WARN_ONCE(dev, !adjust, "failed to find matching resource\n"))
+   return -ENXIO;
+   return adjust_dev_dax_range(dev_dax, adjust, size);
+}
+
+static ssize_t dev_dax_resize(struct dax_region *dax_region,
+   struct dev_dax *dev_dax, resource_size_t size)
+{
+   resource_size_t avail = dax_region_avail_size(dax_region), to_alloc;
+   resource_size_t dev_size = range_len(&dev_dax->range);
+   struct resource *region_res = &dax_region->res;
+   struct device *dev = &dev_dax->dev;

[PATCH v4 20/23] device-dax: Make align a per-device property

2020-08-02 Thread Dan Williams
From: Joao Martins 

Introduce @align to struct dev_dax.

When creating a new device, we still initialize to the default
dax_region @align. Child devices belonging to a region may wish
to keep a different alignment property instead of a global
region-defined one.

Signed-off-by: Joao Martins 
Link: https://lore.kernel.org/r/20200716172913.19658-2-joao.m.mart...@oracle.com
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |1 +
 drivers/dax/dax-private.h |3 +++
 drivers/dax/device.c  |   37 +++--
 3 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 2779c65dc7c0..9edfdf83408e 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1215,6 +1215,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data 
*data)
 
dev_dax->dax_dev = dax_dev;
dev_dax->target_node = dax_region->target_node;
+   dev_dax->align = dax_region->align;
ida_init(&dev_dax->ida);
kref_get(&dax_region->kref);
 
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 13780f62b95e..5fd3a26cfcea 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -62,6 +62,7 @@ struct dax_mapping {
 struct dev_dax {
struct dax_region *region;
struct dax_device *dax_dev;
+   unsigned int align;
int target_node;
int id;
struct ida ida;
@@ -84,4 +85,6 @@ static inline struct dax_mapping *to_dax_mapping(struct 
device *dev)
 {
return container_of(dev, struct dax_mapping, dev);
 }
+
+phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned 
long size);
 #endif
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 2bfc5c83e3b0..d2b1892cb1b2 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -17,7 +17,6 @@
 static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
const char *func)
 {
-   struct dax_region *dax_region = dev_dax->region;
struct device *dev = _dax->dev;
unsigned long mask;
 
@@ -32,7 +31,7 @@ static int check_vma(struct dev_dax *dev_dax, struct 
vm_area_struct *vma,
return -EINVAL;
}
 
-   mask = dax_region->align - 1;
+   mask = dev_dax->align - 1;
if (vma->vm_start & mask || vma->vm_end & mask) {
dev_info_ratelimited(dev,
"%s: %s: fail, unaligned vma (%#lx - %#lx, 
%#lx)\n",
@@ -78,21 +77,19 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax 
*dev_dax,
struct vm_fault *vmf, pfn_t *pfn)
 {
struct device *dev = _dax->dev;
-   struct dax_region *dax_region;
phys_addr_t phys;
unsigned int fault_size = PAGE_SIZE;
 
if (check_vma(dev_dax, vmf->vma, __func__))
return VM_FAULT_SIGBUS;
 
-   dax_region = dev_dax->region;
-   if (dax_region->align > PAGE_SIZE) {
+   if (dev_dax->align > PAGE_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-   dax_region->align, fault_size);
+   dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
 
-   if (fault_size != dax_region->align)
+   if (fault_size != dev_dax->align)
return VM_FAULT_SIGBUS;
 
phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
@@ -120,15 +117,15 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
 
dax_region = dev_dax->region;
-   if (dax_region->align > PMD_SIZE) {
+   if (dev_dax->align > PMD_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-   dax_region->align, fault_size);
+   dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
 
-   if (fault_size < dax_region->align)
+   if (fault_size < dev_dax->align)
return VM_FAULT_SIGBUS;
-   else if (fault_size > dax_region->align)
+   else if (fault_size > dev_dax->align)
return VM_FAULT_FALLBACK;
 
/* if we are outside of the VMA */
@@ -164,15 +161,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
 
dax_region = dev_dax->region;
-   if (dax_region->align > PUD_SIZE) {
+   if (dev_dax->align > PUD_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-   dax_region->align, fault_size);
+   dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
 
-   if (fault_size < dax_region->align)
+   if (fault_size < dev_dax->alig

[PATCH v4 11/23] device-dax: Kill dax_kmem_res

2020-08-02 Thread Dan Williams
Several related issues around this unneeded attribute:

- The dax_kmem_res property allows the kmem driver to stash the adjusted
  resource range that was used for the hotplug operation, but that can be
  recalculated from the original base range (see the worked example below).

- kmem is using an open coded release_resource() + kfree() when an
  idiomatic release_mem_region() is sufficient.

- The driver-managed resource need only manage the busy flag. Other flags
  are of no concern to the kmem driver. In fact if kmem inherits some
  memory range that add_memory_driver_managed() rejects that is a
  memory-hotplug-core policy that the driver is in no position to
  override.

- The implementation trusts that failed remove_memory() results in the
  entire resource range remaining pinned busy. The driver need not make
  that layering violation assumption and just maintain the busy state in
  its local resource.

- The "Hot-remove not yet implemented." comment is stale since hotremove
  support is now included.
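
A worked example of that recalculation, assuming a 128M (0x8000000)
memory_block_size_bytes() and a hypothetical unaligned device range:

    /* dev_dax->range = [0x100001000, 0x17fffffff] */
    range.start = ALIGN(0x100001000, 0x8000000);          /* 0x108000000 */
    range.end = ALIGN_DOWN(0x180000000, 0x8000000) - 1;   /* 0x17fffffff */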

Cc: David Hildenbrand 
Cc: Vishal Verma 
Cc: Dave Hansen 
Cc: Pavel Tatashin 
Signed-off-by: Dan Williams 
---
 drivers/dax/dax-private.h |3 -
 drivers/dax/kmem.c|  123 +
 2 files changed, 58 insertions(+), 68 deletions(-)

diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 6779f683671d..12a2dbc43b40 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -42,8 +42,6 @@ struct dax_region {
  * @dev - device core
  * @pgmap - pgmap for memmap setup / lifetime (driver owned)
  * @range: resource range for the instance
- * @dax_mem_res: physical address range of hotadded DAX memory
- * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed()
  */
 struct dev_dax {
struct dax_region *region;
@@ -52,7 +50,6 @@ struct dev_dax {
struct device dev;
struct dev_pagemap *pgmap;
struct range range;
-   struct resource *dax_kmem_res;
 };
 
 static inline u64 range_len(struct range *range)
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 5bb133df147d..77e25361fbeb 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -19,16 +19,24 @@ static const char *kmem_name;
 /* Set if any memory will remain added when the driver will be unloaded. */
 static bool any_hotremove_failed;
 
+static struct range dax_kmem_range(struct dev_dax *dev_dax)
+{
+   struct range range;
+
+   /* memory-block align the hotplug range */
+   range.start = ALIGN(dev_dax->range.start, memory_block_size_bytes());
+   range.end = ALIGN_DOWN(dev_dax->range.end + 1,
+   memory_block_size_bytes()) - 1;
+   return range;
+}
+
 int dev_dax_kmem_probe(struct device *dev)
 {
struct dev_dax *dev_dax = to_dev_dax(dev);
-   struct range *range = &dev_dax->range;
-   resource_size_t kmem_start;
-   resource_size_t kmem_size;
-   resource_size_t kmem_end;
-   struct resource *new_res;
-   const char *new_res_name;
-   int numa_node;
+   struct range range = dax_kmem_range(dev_dax);
+   int numa_node = dev_dax->target_node;
+   struct resource *res;
+   char *res_name;
int rc;
 
/*
@@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
 * could be mixed in a node with faster memory, causing
 * unavoidable performance issues.
 */
-   numa_node = dev_dax->target_node;
if (numa_node < 0) {
dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
numa_node);
return -EINVAL;
}
 
-   /* Hotplug starting at the beginning of the next block: */
-   kmem_start = ALIGN(range->start, memory_block_size_bytes());
-
-   kmem_size = range_len(range);
-   /* Adjust the size down to compensate for moving up kmem_start: */
-   kmem_size -= kmem_start - range->start;
-   /* Align the size down to cover only complete blocks: */
-   kmem_size &= ~(memory_block_size_bytes() - 1);
-   kmem_end = kmem_start + kmem_size;
-
-   new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
-   if (!new_res_name)
+   res_name = kstrdup(dev_name(dev), GFP_KERNEL);
+   if (!res_name)
return -ENOMEM;
 
-   /* Region is permanently reserved if hotremove fails. */
-   new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
-   if (!new_res) {
-   dev_warn(dev, "could not reserve region [%pa-%pa]\n",
-   &kmem_start, &kmem_end);
-   kfree(new_res_name);
+   res = request_mem_region(range.start, range_len(&range), res_name);
+   if (!res) {
+   dev_warn(dev, "could not reserve region [%#llx-%#llx]\n",
+   range.start, range.end);
+   kfree(res_name);
return -EBUSY;
}
 
/*
- 

[PATCH v4 03/23] efi/fake_mem: Arrange for a resource entry per efi_fake_mem instance

2020-08-02 Thread Dan Williams
In preparation for attaching a platform device per iomem resource teach
the efi_fake_mem code to create an e820 entry per instance. Similar to
E820_TYPE_PRAM, bypass merging resources when the e820 map is sanitized.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Acked-by: Ard Biesheuvel 
Signed-off-by: Dan Williams 
---
 arch/x86/kernel/e820.c  |   16 +++-
 drivers/firmware/efi/x86_fake_mem.c |   12 +---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 983cd53ed4c9..22aad412f965 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -305,6 +305,20 @@ static int __init cpcompare(const void *a, const void *b)
return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
 }
 
+static bool e820_nomerge(enum e820_type type)
+{
+   /*
+* These types may indicate distinct platform ranges aligned to
+* numa node, protection domain, performance domain, or other
+* boundaries. Do not merge them.
+*/
+   if (type == E820_TYPE_PRAM)
+   return true;
+   if (type == E820_TYPE_SOFT_RESERVED)
+   return true;
+   return false;
+}
+
 int __init e820__update_table(struct e820_table *table)
 {
struct e820_entry *entries = table->entries;
@@ -380,7 +394,7 @@ int __init e820__update_table(struct e820_table *table)
}
 
/* Continue building up new map based on this information: */
-   if (current_type != last_type || current_type == E820_TYPE_PRAM) {
+   if (current_type != last_type || e820_nomerge(current_type)) {
if (last_type != 0)  {
new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
/* Move forward only if the new size was non-zero: */
diff --git a/drivers/firmware/efi/x86_fake_mem.c 
b/drivers/firmware/efi/x86_fake_mem.c
index e5d6d5a1b240..0bafcc1bb0f6 100644
--- a/drivers/firmware/efi/x86_fake_mem.c
+++ b/drivers/firmware/efi/x86_fake_mem.c
@@ -38,7 +38,7 @@ void __init efi_fake_memmap_early(void)
m_start = mem->range.start;
m_end = mem->range.end;
for_each_efi_memory_desc(md) {
-   u64 start, end;
+   u64 start, end, size;
 
if (md->type != EFI_CONVENTIONAL_MEMORY)
continue;
@@ -58,11 +58,17 @@ void __init efi_fake_memmap_early(void)
 */
start = max(start, m_start);
end = min(end, m_end);
+   size = end - start + 1;
 
if (end <= start)
continue;
-   e820__range_update(start, end - start + 1, E820_TYPE_RAM,
-   E820_TYPE_SOFT_RESERVED);
+
+   /*
+* Ensure each efi_fake_mem instance results in
+* a unique e820 resource
+*/
+   e820__range_remove(start, size, E820_TYPE_RAM, 1);
+   e820__range_add(start, size, E820_TYPE_SOFT_RESERVED);
e820__update_table(e820_table);
}
}



[PATCH v4 13/23] device-dax: Introduce 'seed' devices

2020-08-02 Thread Dan Williams
Add a seed device concept for dynamic dax regions to be able to split
the region amongst multiple sub-instances. The seed device, similar to
libnvdimm seed devices, is a device that starts with zero capacity
allocated and unbound to a driver. In contrast to libnvdimm seed devices
explicit 'create' and 'delete' interfaces are added to the region to
trigger seeds to be created and unused devices to be reclaimed. The
explicit create and delete replace the implicit create-as-a-side-effect
of probe and the implicit delete-on-writing-0-to-size semantics that
libnvdimm implements.

Delete can be performed on any 0-sized and idle device.  This avoids the
gymnastics of needing to move device_unregister() to its own async
context.  Specifically, it avoids the deadlock of deleting a device via
one of its own attributes. It is also less surprising to userspace which
never sees an extra device it did not request.

For now just add the device creation, teardown, and ->probe()
prevention. A later patch will arrange for the 'dax/size' attribute to
be writable to allocate capacity from the region.
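
A usage sketch of the resulting flow for region 0:

    region=$(readlink -f /sys/bus/dax/devices/dax0.0)/../dax_region
    echo 1 > $region/create         # instantiate an idle, 0-sized seed
    cat $region/seed                # e.g. dax0.1
    echo dax0.1 > $region/delete    # reclaim it while 0-sized and idle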

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  317 -
 drivers/dax/bus.h |4 -
 drivers/dax/dax-private.h |9 +
 drivers/dax/device.c  |   12 +-
 drivers/dax/hmem/hmem.c   |2 
 drivers/dax/kmem.c|   14 +-
 drivers/dax/pmem/compat.c |2 
 7 files changed, 304 insertions(+), 56 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 0a48ce378686..dce9413a4394 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -135,10 +135,46 @@ static bool is_static(struct dax_region *dax_region)
return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
 }
 
+static int dax_bus_probe(struct device *dev)
+{
+   struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+   struct dax_region *dax_region = dev_dax->region;
+   struct range *range = &dev_dax->range;
+   int rc;
+
+   if (range_len(range) == 0 || dev_dax->id < 0)
+   return -ENXIO;
+
+   rc = dax_drv->probe(dev_dax);
+
+   if (rc || is_static(dax_region))
+   return rc;
+
+   /*
+* Track new seed creation only after successful probe of the
+* previous seed.
+*/
+   if (dax_region->seed == dev)
+   dax_region->seed = NULL;
+
+   return 0;
+}
+
+static int dax_bus_remove(struct device *dev)
+{
+   struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+
+   return dax_drv->remove(dev_dax);
+}
+
 static struct bus_type dax_bus_type = {
.name = "dax",
.uevent = dax_bus_uevent,
.match = dax_bus_match,
+   .probe = dax_bus_probe,
+   .remove = dax_bus_remove,
.drv_groups = dax_drv_groups,
 };
 
@@ -219,14 +255,216 @@ static ssize_t available_size_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(available_size);
 
+static ssize_t seed_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+   struct device *seed;
+   ssize_t rc;
+
+   if (is_static(dax_region))
+   return -EINVAL;
+
+   device_lock(dev);
+   seed = dax_region->seed;
+   rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : "");
+   device_unlock(dev);
+
+   return rc;
+}
+static DEVICE_ATTR_RO(seed);
+
+static ssize_t create_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+   struct device *youngest;
+   ssize_t rc;
+
+   if (is_static(dax_region))
+   return -EINVAL;
+
+   device_lock(dev);
+   youngest = dax_region->youngest;
+   rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : "");
+   device_unlock(dev);
+
+   return rc;
+}
+
+static ssize_t create_store(struct device *dev, struct device_attribute *attr,
+   const char *buf, size_t len)
+{
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+   unsigned long long avail;
+   ssize_t rc;
+   int val;
+
+   if (is_static(dax_region))
+   return -EINVAL;
+
+   rc = kstrtoint(buf, 0, &val);
+   if (rc)
+   return rc;
+   if (val != 1)
+   return -EINVAL;
+
+   device_lock(dev);
+   avail = dax_region_avail_size(dax_region);
+   if (avail == 0)
+   rc = -ENOSPC;
+   else {
+   struct dev_dax_data data = {
+   .dax_region = dax_region,
+   .size = 0,
+   .id = -1,
+   };
+   struct dev_dax *dev_dax = devm_creat

[PATCH v4 10/23] device-dax: Make pgmap optional for instance creation

2020-08-02 Thread Dan Williams
The passed in dev_pagemap is only required in the pmem case as the
libnvdimm core may have reserved a vmem_altmap for devm_memremap_pages()
to place the memmap in pmem directly. In the hmem case there is no
agent reserving an altmap so it can all be handled by a core internal
default.

Pass the resource range via a new @range property of 'struct
dev_dax_data'.

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c  |   29 +++--
 drivers/dax/bus.h  |2 ++
 drivers/dax/dax-private.h  |9 -
 drivers/dax/device.c   |   28 +++-
 drivers/dax/hmem/hmem.c|8 
 drivers/dax/kmem.c |   12 ++--
 drivers/dax/pmem/core.c|4 
 tools/testing/nvdimm/dax-dev.c |8 
 8 files changed, 62 insertions(+), 38 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index dffa4655e128..96bd64ba95a5 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -271,7 +271,7 @@ static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
struct dev_dax *dev_dax = to_dev_dax(dev);
-   unsigned long long size = resource_size(_dax->region->res);
+   unsigned long long size = range_len(&dev_dax->range);
 
return sprintf(buf, "%llu\n", size);
 }
@@ -293,19 +293,12 @@ static ssize_t target_node_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(target_node);
 
-static unsigned long long dev_dax_resource(struct dev_dax *dev_dax)
-{
-   struct dax_region *dax_region = dev_dax->region;
-
-   return dax_region->res.start;
-}
-
 static ssize_t resource_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
struct dev_dax *dev_dax = to_dev_dax(dev);
 
-   return sprintf(buf, "%#llx\n", dev_dax_resource(dev_dax));
+   return sprintf(buf, "%#llx\n", dev_dax->range.start);
 }
 static DEVICE_ATTR(resource, 0400, resource_show, NULL);
 
@@ -376,6 +369,7 @@ static void dev_dax_release(struct device *dev)
 
dax_region_put(dax_region);
put_dax(dax_dev);
+   kfree(dev_dax->pgmap);
kfree(dev_dax);
 }
 
@@ -412,7 +406,12 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data 
*data)
if (!dev_dax)
return ERR_PTR(-ENOMEM);
 
-   memcpy(&dev_dax->pgmap, data->pgmap, sizeof(struct dev_pagemap));
+   if (data->pgmap) {
+   dev_dax->pgmap = kmemdup(data->pgmap,
+   sizeof(struct dev_pagemap), GFP_KERNEL);
+   if (!dev_dax->pgmap)
+   goto err_pgmap;
+   }
 
/*
 * No 'host' or dax_operations since there is no access to this
@@ -421,18 +420,19 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data 
*data)
dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
if (IS_ERR(dax_dev)) {
rc = PTR_ERR(dax_dev);
-   goto err;
+   goto err_alloc_dax;
}
 
/* a device_dax instance is dead while the driver is not attached */
kill_dax(dax_dev);
 
-   /* from here on we're committed to teardown via dax_dev_release() */
+   /* from here on we're committed to teardown via dev_dax_release() */
dev = &dev_dax->dev;
device_initialize(dev);
 
dev_dax->dax_dev = dax_dev;
dev_dax->region = dax_region;
+   dev_dax->range = data->range;
dev_dax->target_node = dax_region->target_node;
kref_get(&dax_region->kref);
 
@@ -458,8 +458,9 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data 
*data)
return ERR_PTR(rc);
 
return dev_dax;
-
- err:
+err_alloc_dax:
+   kfree(dev_dax->pgmap);
+err_pgmap:
kfree(dev_dax);
 
return ERR_PTR(rc);
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 299c2e7fac09..4aeb36da83a4 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -3,6 +3,7 @@
 #ifndef __DAX_BUS_H__
 #define __DAX_BUS_H__
 #include 
+#include 
 
 struct dev_dax;
 struct resource;
@@ -21,6 +22,7 @@ struct dev_dax_data {
struct dax_region *dax_region;
struct dev_pagemap *pgmap;
enum dev_dax_subsys subsys;
+   struct range range;
int id;
 };
 
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 8a4c40ccd2ef..6779f683671d 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -41,6 +41,7 @@ struct dax_region {
  * @target_node: effective numa node if dev_dax memory range is onlined
  * @dev - device core
  * @pgmap - pgmap for memmap setup / lifetime (driver owned)
+ * @range: resource range for the instance
  * @dax_mem_res: physical address range of hotadded DAX memory
  * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed()
  */
@@ -49,10 +50,16 @@ struct 

[PATCH v4 08/23] device-dax: Drop the dax_region.pfn_flags attribute

2020-08-02 Thread Dan Williams
All callers specify the same flags to alloc_dax_region(), so there is no
need to allow for anything other than PFN_DEV|PFN_MAP, or carry a
->pfn_flags around on the region. Device-dax instances are always page
backed.

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |4 +---
 drivers/dax/bus.h |3 +--
 drivers/dax/dax-private.h |2 --
 drivers/dax/device.c  |   26 +++---
 drivers/dax/hmem/hmem.c   |2 +-
 drivers/dax/pmem/core.c   |3 +--
 6 files changed, 7 insertions(+), 33 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index df238c8b6ef2..f06ffa66cd78 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -226,8 +226,7 @@ static void dax_region_unregister(void *region)
 }
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-   struct resource *res, int target_node, unsigned int align,
-   unsigned long long pfn_flags)
+   struct resource *res, int target_node, unsigned int align)
 {
struct dax_region *dax_region;
 
@@ -251,7 +250,6 @@ struct dax_region *alloc_dax_region(struct device *parent, 
int region_id,
 
dev_set_drvdata(parent, dax_region);
memcpy(&dax_region->res, res, sizeof(*res));
-   dax_region->pfn_flags = pfn_flags;
kref_init(&dax_region->kref);
dax_region->id = region_id;
dax_region->align = align;
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 9e4eba67e8b9..55577e9791da 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -10,8 +10,7 @@ struct dax_device;
 struct dax_region;
 void dax_region_put(struct dax_region *dax_region);
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-   struct resource *res, int target_node, unsigned int align,
-   unsigned long long flags);
+   struct resource *res, int target_node, unsigned int align);
 
 enum dev_dax_subsys {
DEV_DAX_BUS,
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 16850d5388ab..8a4c40ccd2ef 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -23,7 +23,6 @@ void dax_bus_exit(void);
  * @dev: parent device backing this region
  * @align: allocation and mapping alignment for child dax devices
  * @res: physical address range of the region
- * @pfn_flags: identify whether the pfns are paged back or not
  */
 struct dax_region {
int id;
@@ -32,7 +31,6 @@ struct dax_region {
struct device *dev;
unsigned int align;
struct resource res;
-   unsigned long long pfn_flags;
 };
 
 /**
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 4c0af2eb7e19..bffef1b21144 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -41,14 +41,6 @@ static int check_vma(struct dev_dax *dev_dax, struct 
vm_area_struct *vma,
return -EINVAL;
}
 
-   if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
-   && (vma->vm_flags & VM_DONTCOPY) == 0) {
-   dev_info_ratelimited(dev,
-   "%s: %s: fail, dax range requires 
MADV_DONTFORK\n",
-   current->comm, func);
-   return -EINVAL;
-   }
-
if (!vma_is_dax(vma)) {
dev_info_ratelimited(dev,
"%s: %s: fail, vma is not DAX capable\n",
@@ -102,7 +94,7 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
}
 
-   *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+   *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
 }
@@ -127,12 +119,6 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
}
 
-   /* dax pmd mappings require pfn_t_devmap() */
-   if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
-   dev_dbg(dev, "region lacks devmap flags\n");
-   return VM_FAULT_SIGBUS;
-   }
-
if (fault_size < dax_region->align)
return VM_FAULT_SIGBUS;
else if (fault_size > dax_region->align)
@@ -150,7 +136,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
}
 
-   *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+   *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
@@ -177,12 +163,6 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
}
 
-   /* dax pud mappings require pfn_t_devmap() */
-   if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (P

[PATCH v4 09/23] device-dax: Move instance creation parameters to 'struct dev_dax_data'

2020-08-02 Thread Dan Williams
In preparation for adding more parameters to instance creation, move
existing parameters to a new struct.

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c   |   14 +++---
 drivers/dax/bus.h   |   16 
 drivers/dax/hmem/hmem.c |8 +++-
 drivers/dax/pmem/core.c |9 -
 4 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index f06ffa66cd78..dffa4655e128 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -395,9 +395,9 @@ static void unregister_dev_dax(void *dev)
put_device(dev);
 }
 
-struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
-   struct dev_pagemap *pgmap, enum dev_dax_subsys subsys)
+struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
 {
+   struct dax_region *dax_region = data->dax_region;
struct device *parent = dax_region->dev;
struct dax_device *dax_dev;
struct dev_dax *dev_dax;
@@ -405,14 +405,14 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region 
*dax_region, int id,
struct device *dev;
int rc = -ENOMEM;
 
-   if (id < 0)
+   if (data->id < 0)
return ERR_PTR(-EINVAL);
 
dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL);
if (!dev_dax)
return ERR_PTR(-ENOMEM);
 
-   memcpy(&dev_dax->pgmap, pgmap, sizeof(*pgmap));
+   memcpy(&dev_dax->pgmap, data->pgmap, sizeof(struct dev_pagemap));
 
/*
 * No 'host' or dax_operations since there is no access to this
@@ -438,13 +438,13 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region 
*dax_region, int id,
 
inode = dax_inode(dax_dev);
dev->devt = inode->i_rdev;
-   if (subsys == DEV_DAX_BUS)
+   if (data->subsys == DEV_DAX_BUS)
dev->bus = &dax_bus_type;
else
dev->class = dax_class;
dev->parent = parent;
dev->type = &dev_dax_type;
-   dev_set_name(dev, "dax%d.%d", dax_region->id, id);
+   dev_set_name(dev, "dax%d.%d", dax_region->id, data->id);
 
rc = device_add(dev);
if (rc) {
@@ -464,7 +464,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region 
*dax_region, int id,
 
return ERR_PTR(rc);
 }
-EXPORT_SYMBOL_GPL(__devm_create_dev_dax);
+EXPORT_SYMBOL_GPL(devm_create_dev_dax);
 
 static int match_always_count;
 
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 55577e9791da..299c2e7fac09 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -13,18 +13,18 @@ struct dax_region *alloc_dax_region(struct device *parent, 
int region_id,
struct resource *res, int target_node, unsigned int align);
 
 enum dev_dax_subsys {
-   DEV_DAX_BUS,
+   DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */
DEV_DAX_CLASS,
 };
 
-struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
-   struct dev_pagemap *pgmap, enum dev_dax_subsys subsys);
+struct dev_dax_data {
+   struct dax_region *dax_region;
+   struct dev_pagemap *pgmap;
+   enum dev_dax_subsys subsys;
+   int id;
+};
 
-static inline struct dev_dax *devm_create_dev_dax(struct dax_region 
*dax_region,
-   int id, struct dev_pagemap *pgmap)
-{
-   return __devm_create_dev_dax(dax_region, id, pgmap, DEV_DAX_BUS);
-}
+struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
 
 /* to be deleted when DEV_DAX_CLASS is removed */
 struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys 
subsys);
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 506893861253..b84fe17178d8 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -11,6 +11,7 @@ static int dax_hmem_probe(struct platform_device *pdev)
struct dev_pagemap pgmap = { };
struct dax_region *dax_region;
struct memregion_info *mri;
+   struct dev_dax_data data;
struct dev_dax *dev_dax;
struct resource *res;
 
@@ -26,7 +27,12 @@ static int dax_hmem_probe(struct platform_device *pdev)
if (!dax_region)
return -ENOMEM;
 
-   dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap);
+   data = (struct dev_dax_data) {
+   .dax_region = dax_region,
+   .id = 0,
+   .pgmap = &pgmap,
+   };
+   dev_dax = devm_create_dev_dax(&data);
if (IS_ERR(dev_dax))
return PTR_ERR(dev_dax);
 
diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c
index ea52bb77a294..08ee5947a49c 100644
--- a/drivers/dax/pmem/core.c
+++ b/drivers/dax/pmem/core.c
@@ -14,6 +14,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum 
dev_dax_subsys subsys)
resource_size_t offset;
struct nd_pfn_sb *pfn_sb;
struct dev_dax *dev_dax;
+   struct dev_dax_data data;
struct nd_namespace_io *nsi

[PATCH v4 05/23] resource: Report parent to walk_iomem_res_desc() callback

2020-08-02 Thread Dan Williams
In support of detecting whether a resource might have been claimed,
report the parent to the walk_iomem_res_desc() callback. For example,
the ACPI HMAT parser publishes "hmem" platform devices per target range.
However, if the HMAT is disabled / missing a fallback driver can attach
devices to the raw memory ranges as a fallback if it sees unclaimed /
orphan "Soft Reserved" resources in the resource tree.

Otherwise, find_next_iomem_res() returns a resource with garbage data
from the stack allocation in __walk_iomem_res_desc() for the res->parent
field.

There are currently no users that expect ->child and ->sibling to be
valid, and the resource_lock would be needed to traverse them. Use a
compound literal to implicitly zero initialize the fields that are not
being returned in addition to setting ->parent.
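
A standalone sketch of the C semantics being relied on: a designated
initializer in a compound literal zero-fills every member that is not
explicitly named:

    struct point { int x, y, z; };

    void reset(struct point *p)
    {
        *p = (struct point) { .x = 1, .z = 3 }; /* p->y becomes 0 */
    }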

Cc: Jason Gunthorpe 
Cc: Dave Hansen 
Cc: Wei Yang 
Cc: Tom Lendacky 
Signed-off-by: Dan Williams 
---
 kernel/resource.c |   11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/resource.c b/kernel/resource.c
index 841737bbda9e..f1175ce93a1d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -382,10 +382,13 @@ static int find_next_iomem_res(resource_size_t start, 
resource_size_t end,
 
if (p) {
/* copy data */
-   res->start = max(start, p->start);
-   res->end = min(end, p->end);
-   res->flags = p->flags;
-   res->desc = p->desc;
+   *res = (struct resource) {
+   .start = max(start, p->start),
+   .end = min(end, p->end),
+   .flags = p->flags,
+   .desc = p->desc,
+   .parent = p->parent,
+   };
}
 
read_unlock(_lock);



[PATCH v4 06/23] mm/memory_hotplug: Introduce default phys_to_target_node() implementation

2020-08-02 Thread Dan Williams
In preparation to set a fallback value for dev_dax->target_node,
introduce generic fallback helpers for phys_to_target_node().

A generic implementation based on node-data or memblock was proposed,
but as noted by Mike:

"Here again, I would prefer to add a weak default for
 phys_to_target_node() because the "generic" implementation is not really
 generic.

 The fallback to reserved ranges is x86 specific because on x86 most of
 the reserved areas are not in memblock.memory. AFAIK, no other
 architecture does this."

The info message in the generic memory_add_physaddr_to_nid()
implementation is fixed up to properly reflect that
memory_add_physaddr_to_nid() communicates "online" node info and
phys_to_target_node() indicates "target / to-be-onlined" node info.
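
The override mechanism, in brief: the generic definitions are marked
__weak, so an architecture that supplies a strong definition of the
same symbol takes precedence at link time with no Kconfig coupling:

    /* mm/memory_hotplug.c: used when no arch override exists */
    int __weak phys_to_target_node(u64 start)
    {
        return 0;
    }

    /* arch/x86/mm/numa.c: strong definition, wins over the weak one */
    int phys_to_target_node(phys_addr_t start)
    {
        ...
    }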

Cc: David Hildenbrand 
Cc: Mike Rapoport 
Cc: Jia He 
Signed-off-by: Dan Williams 
---
 arch/x86/mm/numa.c |1 -
 include/linux/memory_hotplug.h |5 +
 include/linux/numa.h   |   11 ---
 mm/memory_hotplug.c|   10 +-
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f3805bbaa784..c62e274d52d0 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -917,7 +917,6 @@ int phys_to_target_node(phys_addr_t start)
 
return meminfo_to_nid(&numa_reserved_meminfo, start);
 }
-EXPORT_SYMBOL_GPL(phys_to_target_node);
 
 int memory_add_physaddr_to_nid(u64 start)
 {
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 375515803cd8..dcdc7d6206d5 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -151,11 +151,16 @@ int add_pages(int nid, unsigned long start_pfn, unsigned 
long nr_pages,
 
 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
+extern int phys_to_target_node(u64 start);
 #else
 static inline int memory_add_physaddr_to_nid(u64 start)
 {
return 0;
 }
+static inline int phys_to_target_node(u64 start)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
diff --git a/include/linux/numa.h b/include/linux/numa.h
index a42df804679e..8cb33ccfb671 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -23,22 +23,11 @@
 #ifdef CONFIG_NUMA
 /* Generic implementation available */
 int numa_map_to_online_node(int node);
-
-/*
- * Optional architecture specific implementation, users need a "depends
- * on $ARCH"
- */
-int phys_to_target_node(phys_addr_t addr);
 #else
 static inline int numa_map_to_online_node(int node)
 {
return NUMA_NO_NODE;
 }
-
-static inline int phys_to_target_node(phys_addr_t addr)
-{
-   return NUMA_NO_NODE;
-}
 #endif
 
 #endif /* _LINUX_NUMA_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dcdf3271f87e..426b79adf529 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -353,11 +353,19 @@ int __ref __add_pages(int nid, unsigned long pfn, 
unsigned long nr_pages,
 #ifdef CONFIG_NUMA
 int __weak memory_add_physaddr_to_nid(u64 start)
 {
-   pr_info_once("Unknown target node for memory at 0x%llx, assuming node 
0\n",
+   pr_info_once("Unknown online node for memory at 0x%llx, assuming node 
0\n",
start);
return 0;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+int __weak phys_to_target_node(u64 start)
+{
+   pr_info_once("Unknown target node for memory at 0x%llx, assuming node 
0\n",
+   start);
+   return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
 #endif
 
 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */



[PATCH v4 04/23] ACPI: HMAT: Refactor hmat_register_target_device to hmem_register_device

2020-08-02 Thread Dan Williams
In preparation for exposing "Soft Reserved" memory ranges without an
HMAT, move the hmem device registration to its own compilation unit and
make the implementation generic.

The generic implementation drops the usage of acpi_map_pxm_to_online_node()
that was translating ACPI proximity domain values and instead relies on
numa_map_to_online_node() to determine the numa node for the device.

Cc: "Rafael J. Wysocki" 
Link: https://lore.kernel.org/r/158318761484.2216124.2049322072599482736.st...@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams 
---
 drivers/acpi/numa/hmat.c  |   68 -
 drivers/dax/Kconfig   |4 +++
 drivers/dax/Makefile  |3 +-
 drivers/dax/hmem.c|   56 -
 drivers/dax/hmem/Makefile |5 +++
 drivers/dax/hmem/device.c |   65 +++
 drivers/dax/hmem/hmem.c   |   56 +
 include/linux/dax.h   |8 +
 8 files changed, 145 insertions(+), 120 deletions(-)
 delete mode 100644 drivers/dax/hmem.c
 create mode 100644 drivers/dax/hmem/Makefile
 create mode 100644 drivers/dax/hmem/device.c
 create mode 100644 drivers/dax/hmem/hmem.c

diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index a12e36a12618..134bcb40b2af 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u8 hmat_revision;
 static int hmat_disable __initdata;
@@ -640,66 +641,6 @@ static void hmat_register_target_perf(struct memory_target *target)
node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0);
 }
 
-static void hmat_register_target_device(struct memory_target *target,
-   struct resource *r)
-{
-   /* define a clean / non-busy resource for the platform device */
-   struct resource res = {
-   .start = r->start,
-   .end = r->end,
-   .flags = IORESOURCE_MEM,
-   };
-   struct platform_device *pdev;
-   struct memregion_info info;
-   int rc, id;
-
-   rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
-   IORES_DESC_SOFT_RESERVED);
-   if (rc != REGION_INTERSECTS)
-   return;
-
-   id = memregion_alloc(GFP_KERNEL);
-   if (id < 0) {
-   pr_err("memregion allocation failure for %pr\n", );
-   return;
-   }
-
-   pdev = platform_device_alloc("hmem", id);
-   if (!pdev) {
-   pr_err("hmem device allocation failure for %pr\n", );
-   goto out_pdev;
-   }
-
-   pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm);
-   info = (struct memregion_info) {
-   .target_node = acpi_map_pxm_to_node(target->memory_pxm),
-   };
-   rc = platform_device_add_data(pdev, &info, sizeof(info));
-   if (rc < 0) {
-   pr_err("hmem memregion_info allocation failure for %pr\n", 
);
-   goto out_pdev;
-   }
-
-   rc = platform_device_add_resources(pdev, &res, 1);
-   if (rc < 0) {
-   pr_err("hmem resource allocation failure for %pr\n", );
-   goto out_resource;
-   }
-
-   rc = platform_device_add(pdev);
-   if (rc < 0) {
-   dev_err(&pdev->dev, "device add failed for %pr\n", &res);
-   goto out_resource;
-   }
-
-   return;
-
-out_resource:
-   put_device(&pdev->dev);
-out_pdev:
-   memregion_free(id);
-}
-
 static void hmat_register_target_devices(struct memory_target *target)
 {
struct resource *res;
@@ -711,8 +652,11 @@ static void hmat_register_target_devices(struct memory_target *target)
if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM))
return;
 
-   for (res = target->memregions.child; res; res = res->sibling)
-   hmat_register_target_device(target, res);
+   for (res = target->memregions.child; res; res = res->sibling) {
+   int target_nid = acpi_map_pxm_to_node(target->memory_pxm);
+
+   hmem_register_device(target_nid, res);
+   }
 }
 
 static void hmat_register_target(struct memory_target *target)
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 3b6c06f07326..a229f45d34aa 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -48,6 +48,10 @@ config DEV_DAX_HMEM
 
  Say M if unsure.
 
+config DEV_DAX_HMEM_DEVICES
+   depends on DEV_DAX_HMEM
+   def_bool y
+
 config DEV_DAX_KMEM
tristate "KMEM DAX: volatile-use of persistent memory"
default DEV_DAX
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 80065b38b3c4..9d4ba672d305 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -2,11 +2,10 @@
 obj-$(CONFIG_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX) += d

[PATCH v4 07/23] ACPI: HMAT: Attach a device for each soft-reserved range

2020-08-02 Thread Dan Williams
The hmem enabling in commit 'cf8741ac57ed ("ACPI: NUMA: HMAT: Register
"soft reserved" memory as an "hmem" device")' only registered ranges to
the hmem driver for each soft-reservation that also appeared in the
HMAT. While this is meant to encourage platform firmware to "do the
right thing" and publish an HMAT, the corollary is that platforms that
fail to publish an accurate HMAT will strand memory from Linux usage.
Additionally, the "efi_fake_mem" kernel command line option enabling
will strand memory by default without an HMAT.

Arrange for "soft reserved" memory that goes unclaimed by HMAT entries
to be published as raw resource ranges for the hmem driver to consume.

Include a module parameter to disable either this fallback behavior, or
the hmat enabling from creating hmem devices. The module parameter
requires the hmem device enabling to have a unique name in the module
namespace: "device_hmem".

The driver depends on the architecture providing phys_to_target_node(),
which today is only x86 (via numa_meminfo()) and arm64 (via a generic
memblock implementation).

Cc: Jonathan Cameron 
Cc: Brice Goglin 
Cc: Ard Biesheuvel 
Cc: "Rafael J. Wysocki" 
Cc: Jeff Moyer 
Cc: Catalin Marinas 
Cc: Will Deacon 
Reviewed-by: Joao Martins 
Signed-off-by: Dan Williams 
---
 drivers/dax/hmem/Makefile |3 ++-
 drivers/dax/hmem/device.c |   35 +++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/dax/hmem/Makefile b/drivers/dax/hmem/Makefile
index a9d353d0c9ed..57377b4c3d47 100644
--- a/drivers/dax/hmem/Makefile
+++ b/drivers/dax/hmem/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o
-obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device.o
+obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device_hmem.o
 
+device_hmem-y := device.o
 dax_hmem-y := hmem.o
diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
index b9dd6b27745c..cb6401c9e9a4 100644
--- a/drivers/dax/hmem/device.c
+++ b/drivers/dax/hmem/device.c
@@ -5,6 +5,9 @@
 #include 
 #include 
 
+static bool nohmem;
+module_param_named(disable, nohmem, bool, 0444);
+
 void hmem_register_device(int target_nid, struct resource *r)
 {
/* define a clean / non-busy resource for the platform device */
@@ -17,6 +20,9 @@ void hmem_register_device(int target_nid, struct resource *r)
struct memregion_info info;
int rc, id;
 
+   if (nohmem)
+   return;
+
rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
IORES_DESC_SOFT_RESERVED);
if (rc != REGION_INTERSECTS)
@@ -63,3 +69,32 @@ void hmem_register_device(int target_nid, struct resource *r)
 out_pdev:
memregion_free(id);
 }
+
+static __init int hmem_register_one(struct resource *res, void *data)
+{
+   /*
+* If the resource is not a top-level resource it was already
+* assigned to a device by the HMAT parsing.
+*/
+   if (res->parent != &iomem_resource) {
+   pr_info("HMEM: skip %pr, already claimed\n", res);
+   return 0;
+   }
+
+   hmem_register_device(phys_to_target_node(res->start), res);
+
+   return 0;
+}
+
+static __init int hmem_init(void)
+{
+   walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED,
+   IORESOURCE_MEM, 0, -1, NULL, hmem_register_one);
+   return 0;
+}
+
+/*
+ * As this is a fallback for address ranges unclaimed by the ACPI HMAT
+ * parsing it must be at an initcall level greater than hmat_init().
+ */
+late_initcall(hmem_init);



[PATCH v4 12/23] device-dax: Add an allocation interface for device-dax instances

2020-08-02 Thread Dan Williams
In preparation for a facility that enables dax regions to be
sub-divided, introduce infrastructure to track and allocate region
capacity.

The new dax_region/available_size attribute is only enabled for volatile
hmem devices, not pmem devices that are defined by nvdimm namespace
boundaries. This is per Jeff's feedback the last time dynamic device-dax
capacity allocation support was discussed.
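
As a usage sketch, a tool could poll the new attribute like so (the
sysfs path is a hypothetical example; the attribute lives in the
"dax_region" group of the region's parent device):

  #include <stdio.h>

  int main(void)
  {
          /* hypothetical hmem platform device instance */
          const char *path =
                  "/sys/devices/platform/hmem.0/dax_region/available_size";
          unsigned long long avail;
          FILE *f = fopen(path, "r");

          if (!f)
                  return 1;
          if (fscanf(f, "%llu", &avail) != 1) {
                  fclose(f);
                  return 1;
          }
          fclose(f);
          printf("available: %llu bytes\n", avail);
          return 0;
  }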

Link: https://lore.kernel.org/linux-nvdimm/x49shpp3zn8@segfault.boston.devel.redhat.com
Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  120 +
 drivers/dax/bus.h |7 ++-
 drivers/dax/dax-private.h |2 -
 drivers/dax/hmem/hmem.c   |7 +--
 drivers/dax/pmem/core.c   |8 +--
 5 files changed, 121 insertions(+), 23 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 96bd64ba95a5..0a48ce378686 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -130,6 +130,11 @@ ATTRIBUTE_GROUPS(dax_drv);
 
 static int dax_bus_match(struct device *dev, struct device_driver *drv);
 
+static bool is_static(struct dax_region *dax_region)
+{
+   return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
+}
+
 static struct bus_type dax_bus_type = {
.name = "dax",
.uevent = dax_bus_uevent,
@@ -185,7 +190,48 @@ static ssize_t align_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(align);
 
+#define for_each_dax_region_resource(dax_region, res) \
+   for (res = (dax_region)->res.child; res; res = res->sibling)
+
+static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
+{
+   resource_size_t size = resource_size(&dax_region->res);
+   struct resource *res;
+
+   device_lock_assert(dax_region->dev);
+
+   for_each_dax_region_resource(dax_region, res)
+   size -= resource_size(res);
+   return size;
+}
+
+static ssize_t available_size_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+   unsigned long long size;
+
+   device_lock(dev);
+   size = dax_region_avail_size(dax_region);
+   device_unlock(dev);
+
+   return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(available_size);
+
+static umode_t dax_region_visible(struct kobject *kobj, struct attribute *a,
+   int n)
+{
+   struct device *dev = container_of(kobj, struct device, kobj);
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+
+   if (is_static(dax_region) && a == &dev_attr_available_size.attr)
+   return 0;
+   return a->mode;
+}
+
 static struct attribute *dax_region_attributes[] = {
+   &dev_attr_available_size.attr,
&dev_attr_region_size.attr,
&dev_attr_align.attr,
&dev_attr_id.attr,
@@ -195,6 +241,7 @@ static struct attribute *dax_region_attributes[] = {
 static const struct attribute_group dax_region_attribute_group = {
.name = "dax_region",
.attrs = dax_region_attributes,
+   .is_visible = dax_region_visible,
 };
 
 static const struct attribute_group *dax_region_attribute_groups[] = {
@@ -226,7 +273,8 @@ static void dax_region_unregister(void *region)
 }
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-   struct resource *res, int target_node, unsigned int align)
+   struct resource *res, int target_node, unsigned int align,
+   unsigned long flags)
 {
struct dax_region *dax_region;
 
@@ -249,12 +297,17 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
return NULL;
 
dev_set_drvdata(parent, dax_region);
-   memcpy(&dax_region->res, res, sizeof(*res));
kref_init(&dax_region->kref);
dax_region->id = region_id;
dax_region->align = align;
dax_region->dev = parent;
dax_region->target_node = target_node;
+   dax_region->res = (struct resource) {
+   .start = res->start,
+   .end = res->end,
+   .flags = IORESOURCE_MEM | flags,
+   };
+
if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
kfree(dax_region);
return NULL;
@@ -267,6 +320,32 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
+static int alloc_dev_dax_range(struct dev_dax *dev_dax, resource_size_t size)
+{
+   struct dax_region *dax_region = dev_dax->region;
+   struct resource *res = &dax_region->res;
+   struct device *dev = &dev_dax->dev;
+   struct resource *alloc;
+
+   device_lock_assert(dax_region->dev);
+
+   /* TODO: handle multiple allocations per region */
+   if (res->child)
+   return -ENOMEM;
+
+   alloc = __request_region(res, res->start, size,

[PATCH v4 00/23] device-dax: Support sub-dividing soft-reserved ranges

2020-08-02 Thread Dan Williams
Changes since v3 [1]:
- Update x86 boot options documentation for 'nohmat' (Randy)

- Fixup a handful of kbuild robot reports, the most significant being
  moving usage of PUD_SIZE and PMD_SIZE under
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE protection.

[1]: http://lore.kernel.org/r/159625229779.3040297.11363509688097221416.st...@dwillia2-desk3.amr.corp.intel.com

---
Merge notes:

Well, no v5.8-rc8 to line this up for v5.9, so next best is early
integration into -mm before other collisions develop.

Chatted with Justin offline and it currently appears that the missing
numa information is due to the platform firmware failing to populate all
the necessary NUMA data in the NFIT.

---
Cover:

The device-dax facility allows an address range to be directly mapped
through a chardev, or optionally hotplugged to the core kernel page
allocator as System-RAM. It is the mechanism for converting persistent
memory (pmem) to be used as another volatile memory pool i.e. the
current Memory Tiering hot topic on linux-mm.

In the case of pmem the nvdimm-namespace-label mechanism can sub-divide
it, but that labeling mechanism is not available / applicable to
soft-reserved ("EFI specific purpose") memory [3]. This series provides
a sysfs-mechanism for the daxctl utility to enable provisioning of
volatile-soft-reserved memory ranges.

The motivations for this facility are:

1/ Allow performance differentiated memory ranges to be split between
   kernel-managed and directly-accessed use cases.

2/ Allow physical memory to be provisioned along performance relevant
   address boundaries. For example, divide a memory-side cache [4] along
   cache-color boundaries.

3/ Parcel out soft-reserved memory to VMs using device-dax as a security
   / permissions boundary [5]. Specifically I have seen people (ab)using
   memmap=nn!ss (mark System-RAM as Persistent Memory) just to get the
   device-dax interface on custom address ranges. A follow-on for the VM
   use case is to teach device-dax to dynamically allocate 'struct page' at
   runtime to reduce the duplication of 'struct page' space in both the
   guest and the host kernel for the same physical pages.

[2]: http://lore.kernel.org/r/20200713160837.13774-11-joao.m.mart...@oracle.com
[3]: http://lore.kernel.org/r/157309097008.1579826.12818463304589384434.st...@dwillia2-desk3.amr.corp.intel.com
[4]: http://lore.kernel.org/r/154899811738.3165233.12325692939590944259.st...@dwillia2-desk3.amr.corp.intel.com
[5]: http://lore.kernel.org/r/20200110190313.17144-1-joao.m.mart...@oracle.com

---

Dan Williams (19):
  x86/numa: Cleanup configuration dependent command-line options
  x86/numa: Add 'nohmat' option
  efi/fake_mem: Arrange for a resource entry per efi_fake_mem instance
  ACPI: HMAT: Refactor hmat_register_target_device to hmem_register_device
  resource: Report parent to walk_iomem_res_desc() callback
  mm/memory_hotplug: Introduce default phys_to_target_node() implementation
  ACPI: HMAT: Attach a device for each soft-reserved range
  device-dax: Drop the dax_region.pfn_flags attribute
  device-dax: Move instance creation parameters to 'struct dev_dax_data'
  device-dax: Make pgmap optional for instance creation
  device-dax: Kill dax_kmem_res
  device-dax: Add an allocation interface for device-dax instances
  device-dax: Introduce 'seed' devices
  drivers/base: Make device_find_child_by_name() compatible with sysfs inputs
  device-dax: Add resize support
  mm/memremap_pages: Convert to 'struct range'
  mm/memremap_pages: Support multiple ranges per invocation
  device-dax: Add dis-contiguous resource support
  device-dax: Introduce 'mapping' devices

Joao Martins (4):
  device-dax: Make align a per-device property
  device-dax: Add an 'align' attribute
  dax/hmem: Introduce dax_hmem.region_idle parameter
  device-dax: Add a range mapping allocation attribute


 Documentation/x86/x86_64/boot-options.rst |4 
 arch/powerpc/kvm/book3s_hv_uvmem.c|   14 
 arch/x86/include/asm/numa.h   |8 
 arch/x86/kernel/e820.c|   16 
 arch/x86/mm/numa.c|   11 
 arch/x86/mm/numa_emulation.c  |3 
 arch/x86/xen/enlighten_pv.c   |2 
 drivers/acpi/numa/hmat.c  |   76 --
 drivers/acpi/numa/srat.c  |9 
 drivers/base/core.c   |2 
 drivers/dax/Kconfig   |4 
 drivers/dax/Makefile  |3 
 drivers/dax/bus.c | 1046 +++--
 drivers/dax/bus.h |   28 -
 drivers/dax/dax-private.h |   60 +-
 drivers/dax/device.c  |  134 ++--
 drivers/dax/hmem.c|   56 --
 drivers/dax/hmem/Makefile |6 
 drivers/dax/hmem/device.c |  100 +++
 drivers/dax/h

[PATCH v4 02/23] x86/numa: Add 'nohmat' option

2020-08-02 Thread Dan Williams
Disable parsing of the HMAT for debug, to workaround broken platform
instances, or cases where it is otherwise not wanted.

Cc: x...@kernel.org
Cc: "Rafael J. Wysocki" 
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Signed-off-by: Dan Williams 
---
 Documentation/x86/x86_64/boot-options.rst |4 
 arch/x86/mm/numa.c|2 ++
 drivers/acpi/numa/hmat.c  |8 +++-
 include/acpi/acpi_numa.h  |8 
 4 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst
index 2b98efb5ba7f..324cefff92e7 100644
--- a/Documentation/x86/x86_64/boot-options.rst
+++ b/Documentation/x86/x86_64/boot-options.rst
@@ -173,6 +173,10 @@ NUMA
   numa=noacpi
 Don't parse the SRAT table for NUMA setup
 
+  numa=nohmat
+Don't parse the HMAT table for NUMA setup, or soft-reserved memory
+partitioning.
+
   numa=fake=[MG]
 If given as a memory unit, fills all system RAM with nodes of
 size interleaved over physical nodes.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 87c52822cc44..f3805bbaa784 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -41,6 +41,8 @@ static __init int numa_setup(char *opt)
return numa_emu_cmdline(opt + 5);
if (!strncmp(opt, "noacpi", 6))
disable_srat();
+   if (!strncmp(opt, "nohmat", 6))
+   disable_hmat();
return 0;
 }
 early_param("numa", numa_setup);
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 2c32cfb72370..a12e36a12618 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -26,6 +26,12 @@
 #include 
 
 static u8 hmat_revision;
+static int hmat_disable __initdata;
+
+void __init disable_hmat(void)
+{
+   hmat_disable = 1;
+}
 
 static LIST_HEAD(targets);
 static LIST_HEAD(initiators);
@@ -814,7 +820,7 @@ static __init int hmat_init(void)
enum acpi_hmat_type i;
acpi_status status;
 
-   if (srat_disabled())
+   if (srat_disabled() || hmat_disable)
return 0;
 
status = acpi_get_table(ACPI_SIG_SRAT, 0, &tbl);
diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h
index 8784183b2204..0e9302285f14 100644
--- a/include/acpi/acpi_numa.h
+++ b/include/acpi/acpi_numa.h
@@ -27,4 +27,12 @@ static inline void disable_srat(void)
 {
 }
 #endif /* CONFIG_ACPI_NUMA */
+
+#ifdef CONFIG_ACPI_HMAT
+extern void disable_hmat(void);
+#else  /* CONFIG_ACPI_HMAT */
+static inline void disable_hmat(void)
+{
+}
+#endif /* CONFIG_ACPI_HMAT */
 #endif /* __ACP_NUMA_H */



[PATCH v4 01/23] x86/numa: Cleanup configuration dependent command-line options

2020-08-02 Thread Dan Williams
In preparation for adding a new numa= option clean up the existing ones
to avoid ifdefs in numa_setup(), and provide feedback when the
numa=fake= option is invalid due to kernel config. The same does not
need to be done for numa=noacpi, since the capability is already hard
disabled at compile-time.

Suggested-by: Rafael J. Wysocki 
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/numa.h  |8 +++-
 arch/x86/mm/numa.c   |8 ++--
 arch/x86/mm/numa_emulation.c |3 ++-
 arch/x86/xen/enlighten_pv.c  |2 +-
 drivers/acpi/numa/srat.c |9 +++--
 include/acpi/acpi_numa.h |6 +-
 6 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index bbfde3d2662f..0aecc0b629e0 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_NUMA_H
 
 #include 
+#include 
 
 #include 
 #include 
@@ -77,7 +78,12 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable);
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
 #define FAKE_NODE_MIN_HASH_MASK(~(FAKE_NODE_MIN_SIZE - 1UL))
-void numa_emu_cmdline(char *);
+int numa_emu_cmdline(char *str);
+#else /* CONFIG_NUMA_EMU */
+static inline int numa_emu_cmdline(char *str)
+{
+   return -EINVAL;
+}
 #endif /* CONFIG_NUMA_EMU */
 
 #endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index aa76ec2d359b..87c52822cc44 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -37,14 +37,10 @@ static __init int numa_setup(char *opt)
return -EINVAL;
if (!strncmp(opt, "off", 3))
numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
if (!strncmp(opt, "fake=", 5))
-   numa_emu_cmdline(opt + 5);
-#endif
-#ifdef CONFIG_ACPI_NUMA
+   return numa_emu_cmdline(opt + 5);
if (!strncmp(opt, "noacpi", 6))
-   acpi_numa = -1;
-#endif
+   disable_srat();
return 0;
 }
 early_param("numa", numa_setup);
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index c5174b4e318b..847c23196e57 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -13,9 +13,10 @@
 static int emu_nid_to_phys[MAX_NUMNODES];
 static char *emu_cmdline __initdata;
 
-void __init numa_emu_cmdline(char *str)
+int __init numa_emu_cmdline(char *str)
 {
emu_cmdline = str;
+   return 0;
 }
 
static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 2aab43a13a8c..64b81ba5a4d6 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1350,7 +1350,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 * any NUMA information the kernel tries to get from ACPI will
 * be meaningless.  Prevent it from trying.
 */
-   acpi_numa = -1;
+   disable_srat();
 #endif
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
 
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 15bbaab8500b..1b0ae0a1959b 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -27,7 +27,12 @@ static int node_to_pxm_map[MAX_NUMNODES]
= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
 
 unsigned char acpi_srat_revision __initdata;
-int acpi_numa __initdata;
+static int acpi_numa __initdata;
+
+void __init disable_srat(void)
+{
+   acpi_numa = -1;
+}
 
 int pxm_to_node(int pxm)
 {
@@ -163,7 +168,7 @@ static int __init slit_valid(struct acpi_table_slit *slit)
 void __init bad_srat(void)
 {
pr_err("SRAT: SRAT not used.\n");
-   acpi_numa = -1;
+   disable_srat();
 }
 
 int __init srat_disabled(void)
diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h
index fdebcfc6c8df..8784183b2204 100644
--- a/include/acpi/acpi_numa.h
+++ b/include/acpi/acpi_numa.h
@@ -17,10 +17,14 @@ extern int pxm_to_node(int);
 extern int node_to_pxm(int);
 extern int acpi_map_pxm_to_node(int);
 extern unsigned char acpi_srat_revision;
-extern int acpi_numa __initdata;
+extern void disable_srat(void);
 
 extern void bad_srat(void);
 extern int srat_disabled(void);
 
+#else  /* CONFIG_ACPI_NUMA */
+static inline void disable_srat(void)
+{
+}
 #endif /* CONFIG_ACPI_NUMA */
 #endif /* __ACP_NUMA_H */



[PATCH v8 1/2] x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()

2020-08-01 Thread Dan Williams
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled. Of particular concern is that even though x86
might be able to handle the semantics of copy_mc_to_user() with its
common copy_user_generic() implementation other archs likely need / want
an explicit path for this case:

  On Fri, May 1, 2020 at 11:28 AM Linus Torvalds wrote:
  >
  > On Thu, Apr 30, 2020 at 6:21 PM Dan Williams wrote:
  > >
  > > However now I see that copy_user_generic() works for the wrong reason.
  > > It works because the exception on the source address due to poison
  > > looks no different than a write fault on the user address to the
  > > caller, it's still just a short copy. So it makes copy_to_user() work
  > > for the wrong reason relative to the name.
  >
  > Right.
  >
  > And it won't work that way on other architectures. On x86, we have a
  > generic function that can take faults on either side, and we use it
  > for both cases (and for the "in_user" case too), but that's an
  > artifact of the architecture oddity.
  >
  > In fact, it's probably wrong even on x86 - because it can hide bugs -
  > but writing those things is painful enough that everybody prefers
  > having just one function.

The rename replaces a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().

An x86 copy_mc_fragile() name is introduced as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.

One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
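
For architectures that do not advertise machine check recovery the
copy_mc_to_kernel() entry point is expected to reduce to a plain copy.
A hedged sketch of that default shape (mirroring the usual
!CONFIG_ARCH_HAS_COPY_MC stub, not a quote of the actual header):

  #include <string.h>

  static inline unsigned long
  copy_mc_to_kernel(void *dst, const void *src, size_t cnt)
  {
          memcpy(dst, src, cnt);
          return 0; /* convention: return bytes *not* copied */
  }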

Cc: x...@kernel.org
Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Paul Mackerras 
Cc: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Mikulas Patocka 
Cc: Alexander Viro 
Cc: Arnaldo Carvalho de Melo 
Cc: Linus Torvalds 
Cc: Benjamin Herrenschmidt 
Reviewed-by: Tony Luck 
Acked-by: Michael Ellerman 
Link: http://lore.kernel.org/r/CAHk-=wjsqtxaqfujxftwnwmgufastgb0dz1dt3v-78quiez...@mail.gmail.com
Signed-off-by: Dan Williams 
---
 arch/powerpc/Kconfig   |2 
 arch/powerpc/include/asm/string.h  |2 
 arch/powerpc/include/asm/uaccess.h |   40 --
 arch/powerpc/lib/Makefile  |2 
 arch/powerpc/lib/copy_mc_64.S  |4 -
 arch/x86/Kconfig   |2 
 arch/x86/Kconfig.debug |2 
 arch/x86/include/asm/copy_mc_test.h|   75 
 arch/x86/include/asm/mcsafe_test.h |   75 
 arch/x86/include/asm/string_64.h   |   32 -
 arch/x86/include/asm/uaccess.h |   18 +++
 arch/x86/include/asm/uaccess_64.h  |   20 ---
 arch/x86/kernel/cpu/mce/core.c |8 -
 arch/x86/kernel/quirks.c   |9 -
 arch/x86/lib/Makefile  |1 
 arch/x86/lib/copy_mc.c |   66 +++
 arch/x86/lib/copy_mc_64.S  |  125 
 arch/x86/lib/memcpy_64.S   |  115 --
 arch/x86/lib/usercopy_64.c |   21 ---
 drivers/md/dm-writecache.c |   15 +-
 drivers/nvdimm/claim.c |2 
 drivers/nvdimm/pmem.c  |6 -
 include/linux/string.h |9 -
 include/linux/uaccess.h|9 +
 include/linux/uio.h|   10 +-
 lib/Kconfig|7 +
 lib/iov_iter.c |   43 ---
 tools/arch/x86/include/asm/mcsafe_test.h   |   13 --
 tools/arch/x86/lib/memcpy_64.S |  115 --
 tools/objtool/check.c  |4 -
 tools/perf/bench/Build |1 
 tools/perf/bench/mem-memcpy-x86-64-lib.c   |   24 
 tools/testing/nvdimm/test/nfit.c   |   48 
 .../testing/selftests/powerpc/copyloops/.gitignore |2 
 tools/testing/selftests/powerpc/copyloops/Makefile |6 -
 .../selftests/powerpc/copyloops/copy_mc_64.S   |1 
 .../selftests/powerpc/copyloops/memcpy_mcsafe_64.S |1 
 37 files changed, 409 insertions(+), 526 deletions(-)
 rename arch/powerpc/lib/{memcpy_mcsafe_64.S => copy_mc_64.S} (98%)

[PATCH v8 2/2] x86/copy_mc: Introduce copy_mc_generic()

2020-08-01 Thread Dan Williams
The original copy_mc_fragile() implementation had negative performance
implications since it did not use the fast-string instruction sequence
to perform copies. For this reason copy_mc_to_kernel() fell back to
plain memcpy() to preserve performance on platforms that did not indicate
the capability to recover from machine check exceptions. However, that
capability detection was not architectural and now that some platforms
can recover from fast-string consumption of memory errors the memcpy()
fallback now causes these more capable platforms to fail.

Introduce copy_mc_generic() as the fast default implementation of
copy_mc_to_kernel() and finalize the transition of copy_mc_fragile() to
be a platform quirk to indicate 'fragility'. With this in place
copy_mc_to_kernel() is fast and recovery-ready by default regardless of
hardware capability.

Thanks to Vivek for identifying that copy_user_generic() is not suitable
as the copy_mc_to_user() backend since the #MC handler explicitly checks
ex_has_fault_handler().

Cc: x...@kernel.org
Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Vivek Goyal 
Cc: "H. Peter Anvin" 
Cc: Andy Lutomirski 
Cc: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Linus Torvalds 
Reviewed-by: Tony Luck 
Reported-by: Erwin Tsaur 
Tested-by: Erwin Tsaur 
Fixes: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()")
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/uaccess.h |3 +++
 arch/x86/lib/copy_mc.c |   12 +---
 arch/x86/lib/copy_mc_64.S  |   40 
 tools/objtool/check.c  |1 +
 4 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 4b2082b61e3e..b038eda58958 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -464,6 +464,9 @@ copy_mc_to_user(void *to, const void *from, unsigned len);
 
 unsigned long __must_check
 copy_mc_fragile(void *dst, const void *src, unsigned cnt);
+
+unsigned long __must_check
+copy_mc_generic(void *dst, const void *src, unsigned cnt);
 #else
 static inline void enable_copy_mc_fragile(void)
 {
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index cdb8f5dc403d..9e6fac1ab72e 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -23,7 +23,7 @@ void enable_copy_mc_fragile(void)
  *
  * Call into the 'fragile' version on systems that have trouble
  * actually do machine check recovery. Everyone else can just
- * use memcpy().
+ * use copy_mc_generic().
  *
  * Return 0 for success, or number of bytes not copied if there was an
  * exception.
@@ -33,8 +33,7 @@ copy_mc_to_kernel(void *dst, const void *src, unsigned cnt)
 {
if (static_branch_unlikely(&copy_mc_fragile_key))
return copy_mc_fragile(dst, src, cnt);
-   memcpy(dst, src, cnt);
-   return 0;
+   return copy_mc_generic(dst, src, cnt);
 }
 EXPORT_SYMBOL_GPL(copy_mc_to_kernel);
 
@@ -56,11 +55,10 @@ copy_mc_to_user(void *to, const void *from, unsigned len)
 {
unsigned long ret;
 
-   if (!static_branch_unlikely(&copy_mc_fragile_key))
-   return copy_user_generic(to, from, len);
-
__uaccess_begin();
-   ret = copy_mc_fragile(to, from, len);
+   if (static_branch_unlikely(&copy_mc_fragile_key))
+   ret = copy_mc_fragile(to, from, len);
+   else
+   ret = copy_mc_generic(to, from, len);
__uaccess_end();
return ret;
 }
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index 35a67c50890b..a08e7a4d9e28 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -2,7 +2,9 @@
 /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */
 
 #include 
+#include 
 #include 
+#include 
 #include 
 #include 
 
@@ -122,4 +124,42 @@ EXPORT_SYMBOL_GPL(copy_mc_fragile)
_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
_ASM_EXTABLE(.L_write_words, .E_write_words)
_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
+
+/*
+ * copy_mc_generic - memory copy with exception handling
+ *
+ * Fast string copy + fault / exception handling. If the CPU does
+ * support machine check exception recovery, but does not support
+ * recovering from fast-string exceptions then this CPU needs to be
+ * added to the copy_mc_fragile_key set of quirks. Otherwise, absent any
+ * machine check recovery support this version should be no slower than
+ * standard memcpy.
+ */
+SYM_FUNC_START(copy_mc_generic)
+   ALTERNATIVE "jmp copy_mc_fragile", "", X86_FEATURE_ERMS
+   movq %rdi, %rax
+   movq %rdx, %rcx
+.L_copy:
+   rep movsb
+   /* Copy successful. Return zero */
+   xorl %eax, %eax
+   ret
+SYM_FUNC_END(copy_mc_generic)
+EXPORT_SYMBOL_GPL(copy_mc_generic)
+
+   .section .fixup, "ax"
+.E_copy:
+   /*
+* On fault %rcx is updated such that the copy instruction could
+* optionally be restarted 

[PATCH v8 0/2] Renovate memcpy_mcsafe with copy_mc_to_{user, kernel}

2020-08-01 Thread Dan Williams
Changes since v7 [1]:
- Rebased on v5.8-rc5 to resolve a conflict with commit eb25de276505
  ("tools arch: Update arch/x86/lib/memcpy_64.S copy used in 'perf bench
mem memcpy'")

[1]: http://lore.kernel.org/r/159408043801.2272533.17485467640602344900.st...@dwillia2-desk3.amr.corp.intel.com

---
Vishal, since this patch set has experienced unprecedented silence from
x86 folks I expect you will need to send it to Linus directly during the
merge window. It merges cleanly with recent -next.

Thomas, Ingo, Boris, please chime in to save Vishal from that
awkwardness. I am only going to be sporadically online for the next few
weeks.

---

The primary motivation to go touch memcpy_mcsafe() is that the existing
benefit of doing slow and careful copies is obviated on newer CPUs. That
fact solves the problem of needing to detect machine-check recovery
capability. Now the old "mcsafe_key" opt-in to careful copying can be made
an opt-out from the default fast copy implementation.

The discussion with Linus further made clear that this facility had
already lost its x86-machine-check specificity starting with commit
12c89130a56a ("x86/asm/memcpy_mcsafe: Add write-protection-fault
handling"). The new changes to not require a "careful copy" further
de-emphasizes the role that x86-MCA plays in the implementation to just
one more source of recoverable trap during the operation.

With the above realizations the name "mcsafe" is no longer accurate and
copy_mc_to_{user,kernel}() is introduced as its replacement. x86 grows a
copy_mc_generic() implementation as a default implementation that is
independent of detecting the presence of x86-MCA.

---

Dan Williams (2):
  x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user,kernel}()
  x86/copy_mc: Introduce copy_mc_generic()


 arch/powerpc/Kconfig   |2 
 arch/powerpc/include/asm/string.h  |2 
 arch/powerpc/include/asm/uaccess.h |   40 +++--
 arch/powerpc/lib/Makefile  |2 
 arch/powerpc/lib/copy_mc_64.S  |4 
 arch/x86/Kconfig   |2 
 arch/x86/Kconfig.debug |2 
 arch/x86/include/asm/copy_mc_test.h|   75 +
 arch/x86/include/asm/mcsafe_test.h |   75 -
 arch/x86/include/asm/string_64.h   |   32 
 arch/x86/include/asm/uaccess.h |   21 +++
 arch/x86/include/asm/uaccess_64.h  |   20 --
 arch/x86/kernel/cpu/mce/core.c |8 -
 arch/x86/kernel/quirks.c   |9 -
 arch/x86/lib/Makefile  |1 
 arch/x86/lib/copy_mc.c |   64 
 arch/x86/lib/copy_mc_64.S  |  165 
 arch/x86/lib/memcpy_64.S   |  115 --
 arch/x86/lib/usercopy_64.c |   21 ---
 drivers/md/dm-writecache.c |   15 +-
 drivers/nvdimm/claim.c |2 
 drivers/nvdimm/pmem.c  |6 -
 include/linux/string.h |9 -
 include/linux/uaccess.h|9 +
 include/linux/uio.h|   10 +
 lib/Kconfig|7 +
 lib/iov_iter.c |   43 +++--
 tools/arch/x86/include/asm/mcsafe_test.h   |   13 --
 tools/arch/x86/lib/memcpy_64.S |  115 --
 tools/objtool/check.c  |5 -
 tools/perf/bench/Build |1 
 tools/perf/bench/mem-memcpy-x86-64-lib.c   |   24 ---
 tools/testing/nvdimm/test/nfit.c   |   48 +++---
 .../testing/selftests/powerpc/copyloops/.gitignore |2 
 tools/testing/selftests/powerpc/copyloops/Makefile |6 -
 .../selftests/powerpc/copyloops/copy_mc_64.S   |1 
 .../selftests/powerpc/copyloops/memcpy_mcsafe_64.S |1 
 37 files changed, 451 insertions(+), 526 deletions(-)
 rename arch/powerpc/lib/{memcpy_mcsafe_64.S => copy_mc_64.S} (98%)
 create mode 100644 arch/x86/include/asm/copy_mc_test.h
 delete mode 100644 arch/x86/include/asm/mcsafe_test.h
 create mode 100644 arch/x86/lib/copy_mc.c
 create mode 100644 arch/x86/lib/copy_mc_64.S
 delete mode 100644 tools/arch/x86/include/asm/mcsafe_test.h
 delete mode 100644 tools/perf/bench/mem-memcpy-x86-64-lib.c
 create mode 120000 tools/testing/selftests/powerpc/copyloops/copy_mc_64.S
 delete mode 120000 tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S

base-commit: 11ba468877bb23f28956a35e896356252d63c983


Re: [PATCH v3 02/23] x86/numa: Add 'nohmat' option

2020-08-01 Thread Dan Williams
On Fri, Jul 31, 2020 at 8:51 PM Randy Dunlap  wrote:
>
> On 7/31/20 8:25 PM, Dan Williams wrote:
> > Disable parsing of the HMAT for debug, to workaround broken platform
> > instances, or cases where it is otherwise not wanted.
> >
> > ---
> >  arch/x86/mm/numa.c   |2 ++
> >  drivers/acpi/numa/hmat.c |8 +++-
> >  include/acpi/acpi_numa.h |8 
> >  3 files changed, 17 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 87c52822cc44..f3805bbaa784 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -41,6 +41,8 @@ static __init int numa_setup(char *opt)
> >   return numa_emu_cmdline(opt + 5);
> >   if (!strncmp(opt, "noacpi", 6))
> >   disable_srat();
> > + if (!strncmp(opt, "nohmat", 6))
> > + disable_hmat();
>
> Hopefully that will be documented in
> Documentation/x86/x86_64/boot-options.rst.

Sorry, yes, you gave that feedback before. I can do a quick respin
with this and the kbuild-robot compile fixups.


[PATCH v3 17/23] mm/memremap_pages: Support multiple ranges per invocation

2020-07-31 Thread Dan Williams
In support of device-dax growing the ability to front physically
dis-contiguous ranges of memory, update devm_memremap_pages() to track
multiple ranges with a single reference counter and devm instance.
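
A hedged sketch of the consumer-side pattern this enables, assuming the
final layout where 'range' grows into a 'ranges[]' array indexed up to
'nr_range' (a stand-in struct, not the kernel definition):

  #include <stdint.h>

  struct range { uint64_t start, end; };

  /* stand-in for struct dev_pagemap; only the fields discussed here */
  struct dev_pagemap_sketch {
          int nr_range;
          struct range ranges[4];
  };

  static void walk_ranges(struct dev_pagemap_sketch *pgmap)
  {
          for (int i = 0; i < pgmap->nr_range; i++) {
                  struct range *range = &pgmap->ranges[i];

                  /* every range shares one refcount and one set of ops */
                  (void)range; /* per-range setup would go here */
          }
  }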

Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Dan Williams 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ben Skeggs 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Ira Weiny 
Cc: Jason Gunthorpe 
Signed-off-by: Dan Williams 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c |1 
 drivers/dax/device.c   |1 
 drivers/gpu/drm/nouveau/nouveau_dmem.c |1 
 drivers/nvdimm/pfn_devs.c  |1 
 drivers/nvdimm/pmem.c  |1 
 drivers/pci/p2pdma.c   |1 
 include/linux/memremap.h   |   10 +
 lib/test_hmm.c |1 
 mm/memremap.c  |  258 +++-
 9 files changed, 165 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 29ec555055c2..84e5a2dc8be5 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -1172,6 +1172,7 @@ int kvmppc_uvmem_init(void)
kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
kvmppc_uvmem_pgmap.range.start = res->start;
kvmppc_uvmem_pgmap.range.end = res->end;
+   kvmppc_uvmem_pgmap.nr_range = 1;
kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
/* just one global instance: */
kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index fffc54ce0911..f3755df4ae29 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -417,6 +417,7 @@ int dev_dax_probe(struct dev_dax *dev_dax)
if (!pgmap)
return -ENOMEM;
pgmap->range = *range;
+   pgmap->nr_range = 1;
}
pgmap->type = MEMORY_DEVICE_DEVDAX;
addr = devm_memremap_pages(dev, pgmap);
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 25811ed7e274..a13c6215bba8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -251,6 +251,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage)
chunk->pagemap.type = MEMORY_DEVICE_PRIVATE;
chunk->pagemap.range.start = res->start;
chunk->pagemap.range.end = res->end;
+   chunk->pagemap.nr_range = 1;
chunk->pagemap.ops = &nouveau_dmem_pagemap_ops;
chunk->pagemap.owner = drm->dev;
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 3c4787b92a6a..b499df630d4d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -693,6 +693,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
.start = nsio->res.start + start_pad,
.end = nsio->res.end - end_trunc,
};
+   pgmap->nr_range = 1;
if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset < reserve)
return -EINVAL;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 69cc0e783709..1f45af363a94 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -442,6 +442,7 @@ static int pmem_attach_disk(struct device *dev,
} else if (pmem_should_map_pages(dev)) {
pmem->pgmap.range.start = res->start;
pmem->pgmap.range.end = res->end;
+   pmem->pgmap.nr_range = 1;
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index dd6b0d51a50c..403304785561 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -187,6 +187,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
pgmap = &p2p_pgmap->pgmap;
pgmap->range.start = pci_resource_start(pdev, bar) + offset;
pgmap->range.end = pgmap->range.start + size - 1;
+   pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 
p2p_pgmap->provider = pdev;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 6c21951bdb16..4e9c738f4b31 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -95,7 +95,6 @@ struct dev_pagemap_ops {
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
- * @range: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
  * @internal_ref: internal reference if @ref is not provided by the caller
  * @done: completion for @internal_ref
@@ -105,10 +104,12 @@ struct dev_pagemap_ops {
  * @owner: an opaque po

[PATCH v3 16/23] mm/memremap_pages: Convert to 'struct range'

2020-07-31 Thread Dan Williams
The 'struct resource' in 'struct dev_pagemap' is only used for holding
resource span information. The other fields, 'name', 'flags', 'desc',
'parent', 'sibling', and 'child' are all unused wasted space.

This is in preparation for introducing a multi-range extension of
devm_memremap_pages().

The bulk of this change is unwinding all the places internal to
libnvdimm that used 'struct resource' unnecessarily.

P2PDMA had a minor usage of the flags field, but only to report failures
with "%pR". That is replaced with an open coded print of the range.

Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Dan Williams 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ben Skeggs 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Ira Weiny 
Cc: Jason Gunthorpe 
Signed-off-by: Dan Williams 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c |   13 +++--
 drivers/dax/bus.c  |   10 ++--
 drivers/dax/bus.h  |2 -
 drivers/dax/dax-private.h  |5 --
 drivers/dax/device.c   |3 -
 drivers/dax/hmem/hmem.c|5 ++
 drivers/dax/pmem/core.c|   12 ++---
 drivers/gpu/drm/nouveau/nouveau_dmem.c |   14 +++---
 drivers/nvdimm/badrange.c  |   26 +--
 drivers/nvdimm/claim.c |   13 +++--
 drivers/nvdimm/nd.h|3 +
 drivers/nvdimm/pfn_devs.c  |   12 ++---
 drivers/nvdimm/pmem.c  |   26 ++-
 drivers/nvdimm/region.c|   21 +
 drivers/pci/p2pdma.c   |   11 ++---
 include/linux/memremap.h   |5 +-
 include/linux/range.h  |6 ++
 lib/test_hmm.c |   14 +++---
 mm/memremap.c  |   77 
 tools/testing/nvdimm/test/iomap.c  |2 -
 20 files changed, 147 insertions(+), 133 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 7705d5557239..29ec555055c2 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -687,9 +687,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
struct kvmppc_uvmem_page_pvt *pvt;
unsigned long pfn_last, pfn_first;
 
-   pfn_first = kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT;
+   pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT;
pfn_last = pfn_first +
-  (resource_size(&kvmppc_uvmem_pgmap.res) >> PAGE_SHIFT);
+  (range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT);
 
spin_lock(&kvmppc_uvmem_bitmap_lock);
bit = find_first_zero_bit(kvmppc_uvmem_bitmap,
@@ -1007,7 +1007,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
 static void kvmppc_uvmem_page_free(struct page *page)
 {
unsigned long pfn = page_to_pfn(page) -
-   (kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT);
+   (kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT);
struct kvmppc_uvmem_page_pvt *pvt;
 
spin_lock(&kvmppc_uvmem_bitmap_lock);
@@ -1170,7 +1170,8 @@ int kvmppc_uvmem_init(void)
}
 
kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
-   kvmppc_uvmem_pgmap.res = *res;
+   kvmppc_uvmem_pgmap.range.start = res->start;
+   kvmppc_uvmem_pgmap.range.end = res->end;
kvmppc_uvmem_pgmap.ops = _uvmem_ops;
/* just one global instance: */
kvmppc_uvmem_pgmap.owner = _uvmem_pgmap;
@@ -1205,7 +1206,7 @@ void kvmppc_uvmem_free(void)
return;
 
memunmap_pages(&kvmppc_uvmem_pgmap);
-   release_mem_region(kvmppc_uvmem_pgmap.res.start,
-  resource_size(&kvmppc_uvmem_pgmap.res));
+   release_mem_region(kvmppc_uvmem_pgmap.range.start,
+  range_len(&kvmppc_uvmem_pgmap.range));
kfree(kvmppc_uvmem_bitmap);
 }
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 53d07f2f1285..00fa73a8dfb4 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -515,7 +515,7 @@ static void dax_region_unregister(void *region)
 }
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-   struct resource *res, int target_node, unsigned int align,
+   struct range *range, int target_node, unsigned int align,
unsigned long flags)
 {
struct dax_region *dax_region;
@@ -530,8 +530,8 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
return NULL;
}
 
-   if (!IS_ALIGNED(res->start, align)
-   || !IS_ALIGNED(resource_size(res), align))
+   if (!IS_ALIGNED(range->start, align)
+   || !IS_ALIGNED(range_len(range), align))
return NULL;
 
dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
@@ -546,8 +546,8 @@ struct dax_regi

[PATCH v3 22/23] dax/hmem: Introduce dax_hmem.region_idle parameter

2020-07-31 Thread Dan Williams
From: Joao Martins 

Introduce a new module parameter for dax_hmem which
initializes all region devices as free, rather than allocating
a pagemap for the region by default.

All hmem devices created with dax_hmem.region_idle=1 will have their
full size available for creating dynamic dax devices.
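
As a usage sketch, the parameter can be set at module load or on the
kernel command line (the module.param spelling follows from the names
in the patch):

  dax_hmem.region_idle=1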

Signed-off-by: Joao Martins 
Link: https://lore.kernel.org/r/20200716172913.19658-4-joao.m.mart...@oracle.com
Signed-off-by: Dan Williams 
---
 drivers/dax/hmem/hmem.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 1a3347bb6143..1bf040dbc834 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -5,6 +5,9 @@
 #include 
 #include "../bus.h"
 
+static bool region_idle;
+module_param_named(region_idle, region_idle, bool, 0644);
+
 static int dax_hmem_probe(struct platform_device *pdev)
 {
struct device *dev = >dev;
@@ -30,7 +33,7 @@ static int dax_hmem_probe(struct platform_device *pdev)
data = (struct dev_dax_data) {
.dax_region = dax_region,
.id = -1,
-   .size = resource_size(res),
+   .size = region_idle ? 0 : resource_size(res),
};
dev_dax = devm_create_dev_dax(&data);
if (IS_ERR(dev_dax))



[PATCH v3 21/23] device-dax: Add an 'align' attribute

2020-07-31 Thread Dan Williams
From: Joao Martins 

Introduce a device align attribute. While doing so, rename the
region align attribute to be more explicitly named as such, but
keep it exposed as @align to retain the API for tools like daxctl.

A change of alignment may not always be valid, e.g. when certain
mappings were created with 2M alignment and we then switch to 1G.
So, we validate all ranges against the new value being attempted,
post resizing.
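
The rule being enforced reduces to: every existing allocation must stay
a multiple of the newly requested alignment. A stand-alone sketch of
that check (hypothetical helper, not the in-tree function):

  #include <stdbool.h>
  #include <stdint.h>

  struct range {
          uint64_t start;
          uint64_t end; /* inclusive */
  };

  /* true if all current ranges remain valid under a new power-of-2 align */
  static bool ranges_fit_align(const struct range *r, int nr, uint64_t align)
  {
          int i;

          for (i = 0; i < nr; i++) {
                  uint64_t len = r[i].end - r[i].start + 1;

                  if (len & (align - 1)) /* e.g. a 2M range fails a 1G align */
                          return false;
          }
          return true;
  }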

Signed-off-by: Joao Martins 
Link: https://lore.kernel.org/r/20200716172913.19658-3-joao.m.mart...@oracle.com
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  102 -
 1 file changed, 92 insertions(+), 10 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index feca1413481c..7a9439132573 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -230,14 +230,15 @@ static ssize_t region_size_show(struct device *dev,
 static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
region_size_show, NULL);
 
-static ssize_t align_show(struct device *dev,
+static ssize_t region_align_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
struct dax_region *dax_region = dev_get_drvdata(dev);
 
return sprintf(buf, "%u\n", dax_region->align);
 }
-static DEVICE_ATTR_RO(align);
+static struct device_attribute dev_attr_region_align =
+   __ATTR(align, 0400, region_align_show, NULL);
 
 #define for_each_dax_region_resource(dax_region, res) \
for (res = (dax_region)->res.child; res; res = res->sibling)
@@ -488,7 +489,7 @@ static umode_t dax_region_visible(struct kobject *kobj, struct attribute *a,
 static struct attribute *dax_region_attributes[] = {
&dev_attr_available_size.attr,
&dev_attr_region_size.attr,
-   &dev_attr_align.attr,
+   &dev_attr_region_align.attr,
&dev_attr_create.attr,
&dev_attr_seed.attr,
&dev_attr_delete.attr,
@@ -855,15 +856,13 @@ static ssize_t size_show(struct device *dev,
return sprintf(buf, "%llu\n", size);
 }
 
-static bool alloc_is_aligned(struct dax_region *dax_region,
-   resource_size_t size)
+static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size)
 {
/*
 * The minimum mapping granularity for a device instance is a
 * single subsection, unless the arch says otherwise.
 */
-   return IS_ALIGNED(size, max_t(unsigned long, dax_region->align,
-   memremap_compat_align()));
+   return IS_ALIGNED(size, max_t(unsigned long, dev_dax->align, memremap_compat_align()));
 }
 
 static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
@@ -958,7 +957,7 @@ static ssize_t dev_dax_resize(struct dax_region *dax_region,
return dev_dax_shrink(dev_dax, size);
 
to_alloc = size - dev_size;
-   if (dev_WARN_ONCE(dev, !alloc_is_aligned(dax_region, to_alloc),
+   if (dev_WARN_ONCE(dev, !alloc_is_aligned(dev_dax, to_alloc),
"resize of %pa misaligned\n", _alloc))
return -ENXIO;
 
@@ -1022,7 +1021,7 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr,
if (rc)
return rc;
 
-   if (!alloc_is_aligned(dax_region, val)) {
+   if (!alloc_is_aligned(dev_dax, val)) {
dev_dbg(dev, "%s: size: %lld misaligned\n", __func__, val);
return -EINVAL;
}
@@ -1041,6 +1040,87 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RW(size);
 
+static ssize_t align_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+
+   return sprintf(buf, "%d\n", dev_dax->align);
+}
+
+static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax)
+{
+   resource_size_t dev_size = dev_dax_size(dev_dax);
+   struct device *dev = &dev_dax->dev;
+   ssize_t rc, i;
+
+   if (dev->driver)
+   return -EBUSY;
+
+   rc = -EINVAL;
+   if (dev_size > 0 && !alloc_is_aligned(dev_dax, dev_size)) {
+   dev_dbg(dev, "%s: align %u invalid for size %llu\n",
+   __func__, dev_dax->align, dev_size);
+   return rc;
+   }
+
+   for (i = 0; i < dev_dax->nr_range; i++) {
+   size_t len = range_len(&dev_dax->ranges[i].range);
+
+   if (!alloc_is_aligned(dev_dax, len)) {
+   dev_dbg(dev, "%s: align %u invalid for range %ld\n",
+   __func__, dev_dax->align, i);
+   return rc;
+   }
+   }
+
+   switch (dev_dax->align) {
+   case PUD_SIZE:
+   if (!IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+   break;
+  

[PATCH v3 20/23] device-dax: Make align a per-device property

2020-07-31 Thread Dan Williams
From: Joao Martins 

Introduce @align to struct dev_dax.

When creating a new device, we still initialize to the default
dax_region @align. Child devices belonging to a region may wish
to keep a different alignment property instead of a global
region-defined one.

Signed-off-by: Joao Martins 
Link: https://lore.kernel.org/r/20200716172913.19658-2-joao.m.mart...@oracle.com
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |1 +
 drivers/dax/dax-private.h |1 +
 drivers/dax/device.c  |   35 +++
 3 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index ffb27964deb2..feca1413481c 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1215,6 +1215,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
 
dev_dax->dax_dev = dax_dev;
dev_dax->target_node = dax_region->target_node;
+   dev_dax->align = dax_region->align;
ida_init(&dev_dax->ida);
kref_get(&dax_region->kref);
 
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 13780f62b95e..96ef5a8ae0ba 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -62,6 +62,7 @@ struct dax_mapping {
 struct dev_dax {
struct dax_region *region;
struct dax_device *dax_dev;
+   unsigned int align;
int target_node;
int id;
struct ida ida;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 2bfc5c83e3b0..346c7bb8cf06 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -17,7 +17,6 @@
 static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
const char *func)
 {
-   struct dax_region *dax_region = dev_dax->region;
struct device *dev = &dev_dax->dev;
unsigned long mask;
 
@@ -32,7 +31,7 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
return -EINVAL;
}
 
-   mask = dax_region->align - 1;
+   mask = dev_dax->align - 1;
if (vma->vm_start & mask || vma->vm_end & mask) {
dev_info_ratelimited(dev,
"%s: %s: fail, unaligned vma (%#lx - %#lx, 
%#lx)\n",
@@ -86,13 +85,13 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
 
dax_region = dev_dax->region;
-   if (dax_region->align > PAGE_SIZE) {
+   if (dev_dax->align > PAGE_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-   dax_region->align, fault_size);
+   dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
 
-   if (fault_size != dax_region->align)
+   if (fault_size != dev_dax->align)
return VM_FAULT_SIGBUS;
 
phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
@@ -120,15 +119,15 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
 
dax_region = dev_dax->region;
-   if (dax_region->align > PMD_SIZE) {
+   if (dev_dax->align > PMD_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-   dax_region->align, fault_size);
+   dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
 
-   if (fault_size < dax_region->align)
+   if (fault_size < dev_dax->align)
return VM_FAULT_SIGBUS;
-   else if (fault_size > dax_region->align)
+   else if (fault_size > dev_dax->align)
return VM_FAULT_FALLBACK;
 
/* if we are outside of the VMA */
@@ -164,15 +163,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
return VM_FAULT_SIGBUS;
 
dax_region = dev_dax->region;
-   if (dax_region->align > PUD_SIZE) {
+   if (dev_dax->align > PUD_SIZE) {
dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
-   dax_region->align, fault_size);
+   dev_dax->align, fault_size);
return VM_FAULT_SIGBUS;
}
 
-   if (fault_size < dax_region->align)
+   if (fault_size < dev_dax->align)
return VM_FAULT_SIGBUS;
-   else if (fault_size > dax_region->align)
+   else if (fault_size > dev_dax->align)
return VM_FAULT_FALLBACK;
 
/* if we are outside of the VMA */
@@ -267,9 +266,8 @@ static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
 {
struct file *filp = vma->vm_file;
struct dev_dax *dev_dax = filp->private_data;
-   struct dax_region *dax_region = dev_dax->region;
 
-   if (!IS_ALIGNED(addr, dax_region

[PATCH v3 23/23] device-dax: Add a range mapping allocation attribute

2020-07-31 Thread Dan Williams
From: Joao Martins 

Add a sysfs attribute which denotes a range from the dax region
to be allocated. It's a write-only @mapping sysfs attribute in
the format of '<start>-<end>' to allocate a range. @start and
@end use hexadecimal values and the @pgoff is implicitly ordered
wrt previous writes to @mapping, e.g. for a write of a range of
length 1G the pgoff is 0..1G(-4K); a second write will use @pgoff
starting at 1G+4K, and so on.

This range mapping interface is useful for:

 1) Application which want to implement its own allocation logic,
 and thus pick the desired ranges from dax_region.

 2) For use cases like VMM fast restart[0] where after kexec we
 want to restore the same gpa<->phys mappings (as originally created
 before kexec).

[0] https://static.sched.com/hosted_files/kvmforum2019/66/VMM-fast-restart_kvmforum2019.pdf
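
A hedged user-space sketch of allocating one specific range through the
new attribute (the device path is a hypothetical example; the
'<start>-<end>' string uses the hexadecimal format parsed below):

  #include <stdio.h>

  int main(void)
  {
          /* hypothetical dax device instance */
          FILE *f = fopen("/sys/bus/dax/devices/dax0.0/mapping", "w");

          if (!f)
                  return 1;
          /* request [0x240000000, 0x27fffffff], a 1G range, inclusive end */
          fprintf(f, "0x240000000-0x27fffffff");
          return fclose(f) != 0;
  }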

Signed-off-by: Joao Martins 
Link: https://lore.kernel.org/r/20200716172913.19658-5-joao.m.mart...@oracle.com
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |   64 +
 1 file changed, 64 insertions(+)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 7a9439132573..aa67555ba183 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1040,6 +1040,67 @@ static ssize_t size_store(struct device *dev, struct 
device_attribute *attr,
 }
 static DEVICE_ATTR_RW(size);
 
+static ssize_t range_parse(const char *opt, size_t len, struct range *range)
+{
+   unsigned long long addr = 0;
+   char *start, *end, *str;
+	ssize_t rc = -EINVAL;
+
+   str = kstrdup(opt, GFP_KERNEL);
+   if (!str)
+   return rc;
+
+   end = str;
+	start = strsep(&str, "-");
+   if (!start || !end)
+   goto err;
+
+	rc = kstrtoull(start, 16, &addr);
+   if (rc)
+   goto err;
+   range->start = addr;
+
+	rc = kstrtoull(end, 16, &addr);
+   if (rc)
+   goto err;
+   range->end = addr;
+
+err:
+   kfree(str);
+   return rc;
+}
+
+static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
+   const char *buf, size_t len)
+{
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+   struct dax_region *dax_region = dev_dax->region;
+   size_t to_alloc;
+   struct range r;
+   ssize_t rc;
+
+	rc = range_parse(buf, len, &r);
+   if (rc)
+   return rc;
+
+   rc = -ENXIO;
+   device_lock(dax_region->dev);
+   if (!dax_region->dev->driver) {
+   device_unlock(dax_region->dev);
+   return rc;
+   }
+   device_lock(dev);
+
+	to_alloc = range_len(&r);
+   if (alloc_is_aligned(dev_dax, to_alloc))
+   rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc);
+   device_unlock(dev);
+   device_unlock(dax_region->dev);
+
+   return rc == 0 ? len : rc;
+}
+static DEVICE_ATTR_WO(mapping);
+
 static ssize_t align_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
@@ -1181,6 +1242,8 @@ static umode_t dev_dax_visible(struct kobject *kobj, 
struct attribute *a, int n)
return 0;
	if (a == &dev_attr_numa_node.attr && !IS_ENABLED(CONFIG_NUMA))
return 0;
+	if (a == &dev_attr_mapping.attr && is_static(dax_region))
+   return 0;
	if ((a == &dev_attr_align.attr ||
	     a == &dev_attr_size.attr) && is_static(dax_region))
return 0444;
@@ -1190,6 +1253,7 @@ static umode_t dev_dax_visible(struct kobject *kobj, 
struct attribute *a, int n)
 static struct attribute *dev_dax_attributes[] = {
	&dev_attr_modalias.attr,
	&dev_attr_size.attr,
+	&dev_attr_mapping.attr,
	&dev_attr_target_node.attr,
	&dev_attr_align.attr,
	&dev_attr_resource.attr,



[PATCH v3 18/23] device-dax: Add dis-contiguous resource support

2020-07-31 Thread Dan Williams
Break the requirement that device-dax instances are physically
contiguous. Removing this constraint allows fragmented available
capacity to be fully allocated.

This capability is useful to mitigate the "noisy neighbor" problem with
memory-side-cache management for virtual machines, or any other scenario
where a platform address boundary also designates a performance
boundary. For example a direct mapped memory side cache might rotate
cache colors at 1GB boundaries.  With dis-contiguous allocations a
device-dax instance could be configured to contain only 1 cache color.

It also satisfies Joao's use case (see link) for partitioning memory for
exclusive guest access. It allows for a future potential mode where the
host kernel need not allocate 'struct page' capacity up-front.
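
As a sketch of the end result (hypothetical addresses, using the
@mapping allocation attribute added later in this series), a single
instance can span two non-adjacent 1G extents:

echo 0x100000000-0x13fffffff > /sys/bus/dax/devices/dax0.1/mapping
echo 0x180000000-0x1bfffffff > /sys/bus/dax/devices/dax0.1/mapping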

Link: 
https://lore.kernel.org/lkml/20200110190313.17144-1-joao.m.mart...@oracle.com/
Reported-by: Joao Martins 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c  |  230 +++-
 drivers/dax/dax-private.h  |9 +-
 drivers/dax/device.c   |   55 ++
 drivers/dax/kmem.c |  132 +++
 tools/testing/nvdimm/dax-dev.c |   20 ++-
 5 files changed, 319 insertions(+), 127 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 00fa73a8dfb4..f342e36c69a1 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -136,15 +136,27 @@ static bool is_static(struct dax_region *dax_region)
return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
 }
 
+static u64 dev_dax_size(struct dev_dax *dev_dax)
+{
+   u64 size = 0;
+   int i;
+
+	device_lock_assert(&dev_dax->dev);
+
+   for (i = 0; i < dev_dax->nr_range; i++)
+		size += range_len(&dev_dax->ranges[i].range);
+
+   return size;
+}
+
 static int dax_bus_probe(struct device *dev)
 {
struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
struct dev_dax *dev_dax = to_dev_dax(dev);
struct dax_region *dax_region = dev_dax->region;
-	struct range *range = &dev_dax->range;
int rc;
 
-   if (range_len(range) == 0 || dev_dax->id < 0)
+   if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0)
return -ENXIO;
 
rc = dax_drv->probe(dev_dax);
@@ -354,15 +366,19 @@ void kill_dev_dax(struct dev_dax *dev_dax)
 }
 EXPORT_SYMBOL_GPL(kill_dev_dax);
 
-static void free_dev_dax_range(struct dev_dax *dev_dax)
+static void free_dev_dax_ranges(struct dev_dax *dev_dax)
 {
struct dax_region *dax_region = dev_dax->region;
-	struct range *range = &dev_dax->range;
+   int i;
 
device_lock_assert(dax_region->dev);
-   if (range_len(range))
+   for (i = 0; i < dev_dax->nr_range; i++) {
+		struct range *range = &dev_dax->ranges[i].range;
+
		__release_region(&dax_region->res, range->start,
range_len(range));
+   }
+   dev_dax->nr_range = 0;
 }
 
 static void unregister_dev_dax(void *dev)
@@ -372,7 +388,7 @@ static void unregister_dev_dax(void *dev)
dev_dbg(dev, "%s\n", __func__);
 
kill_dev_dax(dev_dax);
-   free_dev_dax_range(dev_dax);
+   free_dev_dax_ranges(dev_dax);
device_del(dev);
put_device(dev);
 }
@@ -423,7 +439,7 @@ static ssize_t delete_store(struct device *dev, struct 
device_attribute *attr,
device_lock(dev);
device_lock(victim);
dev_dax = to_dev_dax(victim);
-	if (victim->driver || range_len(&dev_dax->range))
+   if (victim->driver || dev_dax_size(dev_dax))
rc = -EBUSY;
else {
/*
@@ -569,51 +585,83 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, 
u64 start,
struct dax_region *dax_region = dev_dax->region;
	struct resource *res = &dax_region->res;
	struct device *dev = &dev_dax->dev;
+   struct dev_dax_range *ranges;
+   unsigned long pgoff = 0;
struct resource *alloc;
+   int i;
 
device_lock_assert(dax_region->dev);
 
/* handle the seed alloc special case */
if (!size) {
-   dev_dax->range = (struct range) {
-   .start = res->start,
-   .end = res->start - 1,
-   };
+   if (dev_WARN_ONCE(dev, dev_dax->nr_range,
+   "0-size allocation must be first\n"))
+   return -EBUSY;
+   /* nr_range == 0 is elsewhere special cased as 0-size device */
return 0;
}
 
+   ranges = krealloc(dev_dax->ranges, sizeof(*ranges)
+   * (dev_dax->nr_range + 1), GFP_KERNEL);
+   if (!ranges)
+   return -ENOMEM;
+
alloc = __request_region(res, start, size, dev_name(dev), 0);
-   if (

[PATCH v3 19/23] device-dax: Introduce 'mapping' devices

2020-07-31 Thread Dan Williams
In support of interrogating the physical address layout of a device with
dis-contiguous ranges, introduce a sysfs directory with 'start', 'end',
and 'page_offset' attributes. The alternative is trying to parse
/proc/iomem, and that file will not reflect the extent layout until the
device is enabled.
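
For example (instance name illustrative), the extent layout of dax0.1
can be read with:

cat /sys/bus/dax/devices/dax0.1/mapping0/start
cat /sys/bus/dax/devices/dax0.1/mapping0/end
cat /sys/bus/dax/devices/dax0.1/mapping0/page_offset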

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  191 +
 drivers/dax/dax-private.h |   14 +++
 2 files changed, 203 insertions(+), 2 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index f342e36c69a1..ffb27964deb2 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -579,6 +579,167 @@ struct dax_region *alloc_dax_region(struct device 
*parent, int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
+static void dax_mapping_release(struct device *dev)
+{
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+
+	ida_free(&dev_dax->ida, mapping->id);
+   kfree(mapping);
+}
+
+static void unregister_dax_mapping(void *data)
+{
+   struct device *dev = data;
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   dev_dbg(dev, "%s\n", __func__);
+
+   device_lock_assert(dax_region->dev);
+
+   dev_dax->ranges[mapping->range_id].mapping = NULL;
+   mapping->range_id = -1;
+
+   device_del(dev);
+   put_device(dev);
+}
+
+static struct dev_dax_range *get_dax_range(struct device *dev)
+{
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   device_lock(dax_region->dev);
+   if (mapping->range_id < 0) {
+   device_unlock(dax_region->dev);
+   return NULL;
+   }
+
+	return &dev_dax->ranges[mapping->range_id];
+}
+
+static void put_dax_range(struct dev_dax_range *dax_range)
+{
+   struct dax_mapping *mapping = dax_range->mapping;
+   struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   device_unlock(dax_region->dev);
+}
+
+static ssize_t start_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#llx\n", dax_range->range.start);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(start, 0400, start_show, NULL);
+
+static ssize_t end_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#llx\n", dax_range->range.end);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(end, 0400, end_show, NULL);
+
+static ssize_t pgoff_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#lx\n", dax_range->pgoff);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(page_offset, 0400, pgoff_show, NULL);
+
+static struct attribute *dax_mapping_attributes[] = {
+	&dev_attr_start.attr,
+	&dev_attr_end.attr,
+	&dev_attr_page_offset.attr,
+   NULL,
+};
+
+static const struct attribute_group dax_mapping_attribute_group = {
+   .attrs = dax_mapping_attributes,
+};
+
+static const struct attribute_group *dax_mapping_attribute_groups[] = {
+	&dax_mapping_attribute_group,
+   NULL,
+};
+
+static struct device_type dax_mapping_type = {
+   .release = dax_mapping_release,
+   .groups = dax_mapping_attribute_groups,
+};
+
+static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id)
+{
+   struct dax_region *dax_region = dev_dax->region;
+   struct dax_mapping *mapping;
+   struct device *dev;
+   int rc;
+
+   device_lock_assert(dax_region->dev);
+
+	if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver,
+   "region disabled\n"))
+   return -ENXIO;
+
+   mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+   if (!mapping)
+   return -ENOMEM;
+   mapping->range_id = range_id;
+	mapping->id = ida_alloc(&dev_dax->ida, GFP_KERNEL);
+   if (mapping->id < 0) {
+   kfree(ma

[PATCH v3 09/23] device-dax: Move instance creation parameters to 'struct dev_dax_data'

2020-07-31 Thread Dan Williams
In preparation for adding more parameters to instance creation, move
existing parameters to a new struct.

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c   |   14 +++---
 drivers/dax/bus.h   |   16 
 drivers/dax/hmem/hmem.c |8 +++-
 drivers/dax/pmem/core.c |9 -
 4 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index f06ffa66cd78..dffa4655e128 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -395,9 +395,9 @@ static void unregister_dev_dax(void *dev)
put_device(dev);
 }
 
-struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
-   struct dev_pagemap *pgmap, enum dev_dax_subsys subsys)
+struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
 {
+   struct dax_region *dax_region = data->dax_region;
struct device *parent = dax_region->dev;
struct dax_device *dax_dev;
struct dev_dax *dev_dax;
@@ -405,14 +405,14 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region 
*dax_region, int id,
struct device *dev;
int rc = -ENOMEM;
 
-   if (id < 0)
+   if (data->id < 0)
return ERR_PTR(-EINVAL);
 
dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL);
if (!dev_dax)
return ERR_PTR(-ENOMEM);
 
-	memcpy(&dev_dax->pgmap, pgmap, sizeof(*pgmap));
+	memcpy(&dev_dax->pgmap, data->pgmap, sizeof(struct dev_pagemap));
 
/*
 * No 'host' or dax_operations since there is no access to this
@@ -438,13 +438,13 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region 
*dax_region, int id,
 
inode = dax_inode(dax_dev);
dev->devt = inode->i_rdev;
-   if (subsys == DEV_DAX_BUS)
+   if (data->subsys == DEV_DAX_BUS)
		dev->bus = &dax_bus_type;
else
dev->class = dax_class;
dev->parent = parent;
	dev->type = &dev_dax_type;
-   dev_set_name(dev, "dax%d.%d", dax_region->id, id);
+   dev_set_name(dev, "dax%d.%d", dax_region->id, data->id);
 
rc = device_add(dev);
if (rc) {
@@ -464,7 +464,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region 
*dax_region, int id,
 
return ERR_PTR(rc);
 }
-EXPORT_SYMBOL_GPL(__devm_create_dev_dax);
+EXPORT_SYMBOL_GPL(devm_create_dev_dax);
 
 static int match_always_count;
 
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 55577e9791da..299c2e7fac09 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -13,18 +13,18 @@ struct dax_region *alloc_dax_region(struct device *parent, 
int region_id,
struct resource *res, int target_node, unsigned int align);
 
 enum dev_dax_subsys {
-   DEV_DAX_BUS,
+   DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */
DEV_DAX_CLASS,
 };
 
-struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
-   struct dev_pagemap *pgmap, enum dev_dax_subsys subsys);
+struct dev_dax_data {
+   struct dax_region *dax_region;
+   struct dev_pagemap *pgmap;
+   enum dev_dax_subsys subsys;
+   int id;
+};
 
-static inline struct dev_dax *devm_create_dev_dax(struct dax_region 
*dax_region,
-   int id, struct dev_pagemap *pgmap)
-{
-   return __devm_create_dev_dax(dax_region, id, pgmap, DEV_DAX_BUS);
-}
+struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
 
 /* to be deleted when DEV_DAX_CLASS is removed */
 struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys 
subsys);
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 506893861253..b84fe17178d8 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -11,6 +11,7 @@ static int dax_hmem_probe(struct platform_device *pdev)
struct dev_pagemap pgmap = { };
struct dax_region *dax_region;
struct memregion_info *mri;
+   struct dev_dax_data data;
struct dev_dax *dev_dax;
struct resource *res;
 
@@ -26,7 +27,12 @@ static int dax_hmem_probe(struct platform_device *pdev)
if (!dax_region)
return -ENOMEM;
 
-	dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap);
+	data = (struct dev_dax_data) {
+		.dax_region = dax_region,
+		.id = 0,
+		.pgmap = &pgmap,
+	};
+	dev_dax = devm_create_dev_dax(&data);
if (IS_ERR(dev_dax))
return PTR_ERR(dev_dax);
 
diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c
index ea52bb77a294..08ee5947a49c 100644
--- a/drivers/dax/pmem/core.c
+++ b/drivers/dax/pmem/core.c
@@ -14,6 +14,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum 
dev_dax_subsys subsys)
resource_size_t offset;
struct nd_pfn_sb *pfn_sb;
struct dev_dax *dev_dax;
+   struct dev_dax_data data;
struct nd_namespace_io *nsi

[PATCH v3 10/23] device-dax: Make pgmap optional for instance creation

2020-07-31 Thread Dan Williams
The passed in dev_pagemap is only required in the pmem case as the
libnvdimm core may have reserved a vmem_altmap for dev_memremap_pages()
to place the memmap in pmem directly. In the hmem case there is no
agent reserving an altmap so it can all be handled by a core internal
default.

Pass the resource range via a new @range property of 'struct
dev_dax_data'.

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c  |   29 +++--
 drivers/dax/bus.h  |2 ++
 drivers/dax/dax-private.h  |9 -
 drivers/dax/device.c   |   28 +++-
 drivers/dax/hmem/hmem.c|8 
 drivers/dax/kmem.c |   12 ++--
 drivers/dax/pmem/core.c|4 
 tools/testing/nvdimm/dax-dev.c |8 
 8 files changed, 62 insertions(+), 38 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index dffa4655e128..96bd64ba95a5 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -271,7 +271,7 @@ static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
struct dev_dax *dev_dax = to_dev_dax(dev);
-	unsigned long long size = resource_size(&dev_dax->region->res);
+	unsigned long long size = range_len(&dev_dax->range);
 
return sprintf(buf, "%llu\n", size);
 }
@@ -293,19 +293,12 @@ static ssize_t target_node_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(target_node);
 
-static unsigned long long dev_dax_resource(struct dev_dax *dev_dax)
-{
-   struct dax_region *dax_region = dev_dax->region;
-
-   return dax_region->res.start;
-}
-
 static ssize_t resource_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
struct dev_dax *dev_dax = to_dev_dax(dev);
 
-   return sprintf(buf, "%#llx\n", dev_dax_resource(dev_dax));
+   return sprintf(buf, "%#llx\n", dev_dax->range.start);
 }
 static DEVICE_ATTR(resource, 0400, resource_show, NULL);
 
@@ -376,6 +369,7 @@ static void dev_dax_release(struct device *dev)
 
dax_region_put(dax_region);
put_dax(dax_dev);
+   kfree(dev_dax->pgmap);
kfree(dev_dax);
 }
 
@@ -412,7 +406,12 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data 
*data)
if (!dev_dax)
return ERR_PTR(-ENOMEM);
 
-	memcpy(&dev_dax->pgmap, data->pgmap, sizeof(struct dev_pagemap));
+   if (data->pgmap) {
+   dev_dax->pgmap = kmemdup(data->pgmap,
+   sizeof(struct dev_pagemap), GFP_KERNEL);
+   if (!dev_dax->pgmap)
+   goto err_pgmap;
+   }
 
/*
 * No 'host' or dax_operations since there is no access to this
@@ -421,18 +420,19 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data 
*data)
dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
if (IS_ERR(dax_dev)) {
rc = PTR_ERR(dax_dev);
-   goto err;
+   goto err_alloc_dax;
}
 
/* a device_dax instance is dead while the driver is not attached */
kill_dax(dax_dev);
 
-   /* from here on we're committed to teardown via dax_dev_release() */
+   /* from here on we're committed to teardown via dev_dax_release() */
	dev = &dev_dax->dev;
device_initialize(dev);
 
dev_dax->dax_dev = dax_dev;
dev_dax->region = dax_region;
+   dev_dax->range = data->range;
dev_dax->target_node = dax_region->target_node;
	kref_get(&dax_region->kref);
 
@@ -458,8 +458,9 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data 
*data)
return ERR_PTR(rc);
 
return dev_dax;
-
- err:
+err_alloc_dax:
+   kfree(dev_dax->pgmap);
+err_pgmap:
kfree(dev_dax);
 
return ERR_PTR(rc);
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 299c2e7fac09..4aeb36da83a4 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -3,6 +3,7 @@
 #ifndef __DAX_BUS_H__
 #define __DAX_BUS_H__
 #include 
+#include 
 
 struct dev_dax;
 struct resource;
@@ -21,6 +22,7 @@ struct dev_dax_data {
struct dax_region *dax_region;
struct dev_pagemap *pgmap;
enum dev_dax_subsys subsys;
+   struct range range;
int id;
 };
 
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 8a4c40ccd2ef..6779f683671d 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -41,6 +41,7 @@ struct dax_region {
  * @target_node: effective numa node if dev_dax memory range is onlined
  * @dev - device core
  * @pgmap - pgmap for memmap setup / lifetime (driver owned)
+ * @range: resource range for the instance
  * @dax_mem_res: physical address range of hotadded DAX memory
  * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed()
  */
@@ -49,10 +50,16 @@ struct 

[PATCH v3 13/23] device-dax: Introduce 'seed' devices

2020-07-31 Thread Dan Williams
Add a seed device concept for dynamic dax regions to be able to split
the region amongst multiple sub-instances. The seed device, similar to
libnvdimm seed devices, is a device that starts with zero capacity
allocated and unbound to a driver. In contrast to libnvdimm seed devices
explicit 'create' and 'delete' interfaces are added to the region to
trigger seeds to be created and unused devices to be reclaimed. The
explicit create and delete replaces implicit create as a side effect of
probe and implicit delete when writing 0 to the size that libnvdimm
implements.

Delete can be performed on any 0-sized and idle device.  This avoids the
gymnastics of needing to move device_unregister() to its own async
context.  Specifically, it avoids the deadlock of deleting a device via
one of its own attributes. It is also less surprising to userspace which
never sees an extra device it did not request.

For now just add the device creation, teardown, and ->probe()
prevention. A later patch will arrange for the 'dax/size' attribute to
be writable to allocate capacity from the region.

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  317 -
 drivers/dax/bus.h |4 -
 drivers/dax/dax-private.h |9 +
 drivers/dax/device.c  |   12 +-
 drivers/dax/hmem/hmem.c   |2 
 drivers/dax/kmem.c|   14 +-
 drivers/dax/pmem/compat.c |2 
 7 files changed, 304 insertions(+), 56 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 0a48ce378686..dce9413a4394 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -135,10 +135,46 @@ static bool is_static(struct dax_region *dax_region)
return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
 }
 
+static int dax_bus_probe(struct device *dev)
+{
+   struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+   struct dax_region *dax_region = dev_dax->region;
+	struct range *range = &dev_dax->range;
+   int rc;
+
+   if (range_len(range) == 0 || dev_dax->id < 0)
+   return -ENXIO;
+
+   rc = dax_drv->probe(dev_dax);
+
+   if (rc || is_static(dax_region))
+   return rc;
+
+   /*
+* Track new seed creation only after successful probe of the
+* previous seed.
+*/
+   if (dax_region->seed == dev)
+   dax_region->seed = NULL;
+
+   return 0;
+}
+
+static int dax_bus_remove(struct device *dev)
+{
+   struct dax_device_driver *dax_drv = to_dax_drv(dev->driver);
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+
+   return dax_drv->remove(dev_dax);
+}
+
 static struct bus_type dax_bus_type = {
.name = "dax",
.uevent = dax_bus_uevent,
.match = dax_bus_match,
+   .probe = dax_bus_probe,
+   .remove = dax_bus_remove,
.drv_groups = dax_drv_groups,
 };
 
@@ -219,14 +255,216 @@ static ssize_t available_size_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(available_size);
 
+static ssize_t seed_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+   struct device *seed;
+   ssize_t rc;
+
+   if (is_static(dax_region))
+   return -EINVAL;
+
+   device_lock(dev);
+   seed = dax_region->seed;
+   rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : "");
+   device_unlock(dev);
+
+   return rc;
+}
+static DEVICE_ATTR_RO(seed);
+
+static ssize_t create_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+   struct device *youngest;
+   ssize_t rc;
+
+   if (is_static(dax_region))
+   return -EINVAL;
+
+   device_lock(dev);
+   youngest = dax_region->youngest;
+   rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : "");
+   device_unlock(dev);
+
+   return rc;
+}
+
+static ssize_t create_store(struct device *dev, struct device_attribute *attr,
+   const char *buf, size_t len)
+{
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+   unsigned long long avail;
+   ssize_t rc;
+   int val;
+
+   if (is_static(dax_region))
+   return -EINVAL;
+
+	rc = kstrtoint(buf, 0, &val);
+   if (rc)
+   return rc;
+   if (val != 1)
+   return -EINVAL;
+
+   device_lock(dev);
+   avail = dax_region_avail_size(dax_region);
+   if (avail == 0)
+   rc = -ENOSPC;
+   else {
+   struct dev_dax_data data = {
+   .dax_region = dax_region,
+   .size = 0,
+   .id = -1,
+   };
+   struct dev_dax *dev_dax = devm_creat

[PATCH v3 15/23] device-dax: Add resize support

2020-07-31 Thread Dan Williams
Make the device-dax 'size' attribute writable to allow capacity to be
split between multiple instances in a region. The intended consumers of
this capability are users that want to split a scarce memory resource
between device-dax and System-RAM access, or users that want to have
multiple security domains for a large region.

By default the hmem instance provider allocates an entire region to the
first instance. The process of creating a new instance (assuming a
region-id of 0) is to find the region and trigger the 'create'
attribute, which yields an empty instance to configure. For example:

cd /sys/bus/dax/devices
echo dax0.0 > dax0.0/driver/unbind
echo $new_size > dax0.0/size
echo 1 > $(readlink -f dax0.0)/../dax_region/create
seed=$(cat $(readlink -f dax0.0)/../dax_region/seed)
echo $new_size > $seed/size
echo dax0.0 > ../drivers/{device_dax,kmem}/bind
echo dax0.1 > ../drivers/{device_dax,kmem}/bind

Instances can be destroyed by:

echo $device > $(readlink -f $device)/../dax_region/delete

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  161 ++---
 1 file changed, 152 insertions(+), 9 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index dce9413a4394..53d07f2f1285 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "dax-private.h"
 #include "bus.h"
 
@@ -562,7 +563,8 @@ struct dax_region *alloc_dax_region(struct device *parent, 
int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
-static int alloc_dev_dax_range(struct dev_dax *dev_dax, resource_size_t size)
+static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
+   resource_size_t size)
 {
struct dax_region *dax_region = dev_dax->region;
	struct resource *res = &dax_region->res;
@@ -580,12 +582,7 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, 
resource_size_t size)
return 0;
}
 
-   /* TODO: handle multiple allocations per region */
-   if (res->child)
-   return -ENOMEM;
-
-   alloc = __request_region(res, res->start, size, dev_name(dev), 0);
-
+   alloc = __request_region(res, start, size, dev_name(dev), 0);
if (!alloc)
return -ENOMEM;
 
@@ -597,6 +594,29 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, 
resource_size_t size)
return 0;
 }
 
+static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, 
resource_size_t size)
+{
+   struct dax_region *dax_region = dev_dax->region;
+	struct range *range = &dev_dax->range;
+   int rc = 0;
+
+   device_lock_assert(dax_region->dev);
+
+   if (size)
+   rc = adjust_resource(res, range->start, size);
+   else
+		__release_region(&dax_region->res, range->start, 
range_len(range));
+   if (rc)
+   return rc;
+
+   dev_dax->range = (struct range) {
+   .start = range->start,
+   .end = range->start + size - 1,
+   };
+
+   return 0;
+}
+
 static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
@@ -605,7 +625,127 @@ static ssize_t size_show(struct device *dev,
 
return sprintf(buf, "%llu\n", size);
 }
-static DEVICE_ATTR_RO(size);
+
+static bool alloc_is_aligned(struct dax_region *dax_region,
+   resource_size_t size)
+{
+   /*
+* The minimum mapping granularity for a device instance is a
+* single subsection, unless the arch says otherwise.
+*/
+   return IS_ALIGNED(size, max_t(unsigned long, dax_region->align,
+   memremap_compat_align()));
+}
+
+static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
+{
+   struct dax_region *dax_region = dev_dax->region;
+	struct range *range = &dev_dax->range;
+	struct resource *res, *adjust = NULL;
+	struct device *dev = &dev_dax->dev;
+
+   for_each_dax_region_resource(dax_region, res)
+   if (strcmp(res->name, dev_name(dev)) == 0
+   && res->start == range->start) {
+   adjust = res;
+   break;
+   }
+
+   if (dev_WARN_ONCE(dev, !adjust, "failed to find matching resource\n"))
+   return -ENXIO;
+   return adjust_dev_dax_range(dev_dax, adjust, size);
+}
+
+static ssize_t dev_dax_resize(struct dax_region *dax_region,
+   struct dev_dax *dev_dax, resource_size_t size)
+{
+   resource_size_t avail = dax_region_avail_size(dax_region), to_alloc;
+	resource_size_t dev_size = range_len(&dev_dax->range);
+	struct resource *region_res = &dax_region->res;
+	struct device *dev = &dev_dax->dev

[PATCH v3 11/23] device-dax: Kill dax_kmem_res

2020-07-31 Thread Dan Williams
Several related issues around this unneeded attribute:

- The dax_kmem_res property allows the kmem driver to stash the adjusted
  resource range that was used for the hotplug operation, but that can be
  recalculated from the original base range.

- kmem is using an open coded release_resource() + kfree() when an
  idiomatic release_mem_region() is sufficient (see the sketch after
  this list).

- The driver managed resource need only manage the busy flag. Other flags
  are of no concern to the kmem driver. In fact if kmem inherits some
  memory range that add_memory_driver_managed() rejects that is a
  memory-hotplug-core policy that the driver is in no position to
  override.

- The implementation trusts that failed remove_memory() results in the
  entire resource range remaining pinned busy. The driver need not make
  that layering-violation assumption and can instead just maintain the
  busy state in its local resource.

- The "Hot-remove not yet implemented." comment is stale since hotremove
  support is now included.
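
A minimal sketch of the release idiom the second bullet refers to (an
illustration, not the driver code itself; error handling and the name
string are elided):

	/* open coded teardown */
	release_resource(res);
	kfree(res);

	/* idiomatic teardown for a request_mem_region() region */
	release_mem_region(range.start, range_len(&range));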

Cc: David Hildenbrand 
Cc: Vishal Verma 
Cc: Dave Hansen 
Cc: Pavel Tatashin 
Signed-off-by: Dan Williams 
---
 drivers/dax/dax-private.h |3 -
 drivers/dax/kmem.c|  123 +
 2 files changed, 58 insertions(+), 68 deletions(-)

diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 6779f683671d..12a2dbc43b40 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -42,8 +42,6 @@ struct dax_region {
  * @dev - device core
  * @pgmap - pgmap for memmap setup / lifetime (driver owned)
  * @range: resource range for the instance
- * @dax_mem_res: physical address range of hotadded DAX memory
- * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed()
  */
 struct dev_dax {
struct dax_region *region;
@@ -52,7 +50,6 @@ struct dev_dax {
struct device dev;
struct dev_pagemap *pgmap;
struct range range;
-   struct resource *dax_kmem_res;
 };
 
 static inline u64 range_len(struct range *range)
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 5bb133df147d..77e25361fbeb 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -19,16 +19,24 @@ static const char *kmem_name;
 /* Set if any memory will remain added when the driver will be unloaded. */
 static bool any_hotremove_failed;
 
+static struct range dax_kmem_range(struct dev_dax *dev_dax)
+{
+   struct range range;
+
+   /* memory-block align the hotplug range */
+   range.start = ALIGN(dev_dax->range.start, memory_block_size_bytes());
+   range.end = ALIGN_DOWN(dev_dax->range.end + 1,
+   memory_block_size_bytes()) - 1;
+   return range;
+}
+
 int dev_dax_kmem_probe(struct device *dev)
 {
struct dev_dax *dev_dax = to_dev_dax(dev);
-	struct range *range = &dev_dax->range;
-   resource_size_t kmem_start;
-   resource_size_t kmem_size;
-   resource_size_t kmem_end;
-   struct resource *new_res;
-   const char *new_res_name;
-   int numa_node;
+   struct range range = dax_kmem_range(dev_dax);
+   int numa_node = dev_dax->target_node;
+   struct resource *res;
+   char *res_name;
int rc;
 
/*
@@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
 * could be mixed in a node with faster memory, causing
 * unavoidable performance issues.
 */
-   numa_node = dev_dax->target_node;
if (numa_node < 0) {
dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
numa_node);
return -EINVAL;
}
 
-   /* Hotplug starting at the beginning of the next block: */
-   kmem_start = ALIGN(range->start, memory_block_size_bytes());
-
-   kmem_size = range_len(range);
-   /* Adjust the size down to compensate for moving up kmem_start: */
-   kmem_size -= kmem_start - range->start;
-   /* Align the size down to cover only complete blocks: */
-   kmem_size &= ~(memory_block_size_bytes() - 1);
-   kmem_end = kmem_start + kmem_size;
-
-   new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
-   if (!new_res_name)
+   res_name = kstrdup(dev_name(dev), GFP_KERNEL);
+   if (!res_name)
return -ENOMEM;
 
-   /* Region is permanently reserved if hotremove fails. */
-   new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
-   if (!new_res) {
-   dev_warn(dev, "could not reserve region [%pa-%pa]\n",
-			 &kmem_start, &kmem_end);
-   kfree(new_res_name);
+	res = request_mem_region(range.start, range_len(&range), res_name);
+   if (!res) {
+   dev_warn(dev, "could not reserve region [%#llx-%#llx]\n",
+   range.start, range.end);
+   kfree(res_name);
return -EBUSY;
}
 
/*
- 

[PATCH v3 14/23] drivers/base: Make device_find_child_by_name() compatible with sysfs inputs

2020-07-31 Thread Dan Williams
Use sysfs_streq() in device_find_child_by_name() to allow it to use a
sysfs input string that might contain a trailing newline.

The other "device by name" interfaces,
{bus,driver,class}_find_device_by_name(), already account for sysfs
strings.
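
For example (paths illustrative, per the dax_region 'delete' attribute
earlier in this series), a write like:

echo dax0.1 > $(readlink -f /sys/bus/dax/devices/dax0.0)/../dax_region/delete

delivers "dax0.1\n" to the kernel, which strcmp() fails to match
against the child named "dax0.1", while sysfs_streq() tolerates the
trailing newline.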

Cc: "Rafael J. Wysocki" 
Reviewed-by: Greg Kroah-Hartman 
Signed-off-by: Dan Williams 
---
 drivers/base/core.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 2169c5132558..231189dd6599 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3328,7 +3328,7 @@ struct device *device_find_child_by_name(struct device 
*parent,
 
	klist_iter_init(&parent->p->klist_children, &i);
	while ((child = next_device(&i)))
-   if (!strcmp(dev_name(child), name) && get_device(child))
+   if (sysfs_streq(dev_name(child), name) && get_device(child))
break;
klist_iter_exit();
return child;



[PATCH v3 04/23] ACPI: HMAT: Refactor hmat_register_target_device to hmem_register_device

2020-07-31 Thread Dan Williams
In preparation for exposing "Soft Reserved" memory ranges without an
HMAT, move the hmem device registration to its own compilation unit and
make the implementation generic.

The generic implementation drops usage of acpi_map_pxm_to_online_node()
that was translating ACPI proximity domain values and instead relies on
numa_map_to_online_node() to determine the numa node for the device.

Cc: "Rafael J. Wysocki" 
Link: 
https://lore.kernel.org/r/158318761484.2216124.2049322072599482736.st...@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams 
---
 drivers/acpi/numa/hmat.c  |   68 -
 drivers/dax/Kconfig   |4 +++
 drivers/dax/Makefile  |3 +-
 drivers/dax/hmem.c|   56 -
 drivers/dax/hmem/Makefile |5 +++
 drivers/dax/hmem/device.c |   65 +++
 drivers/dax/hmem/hmem.c   |   56 +
 include/linux/dax.h   |8 +
 8 files changed, 145 insertions(+), 120 deletions(-)
 delete mode 100644 drivers/dax/hmem.c
 create mode 100644 drivers/dax/hmem/Makefile
 create mode 100644 drivers/dax/hmem/device.c
 create mode 100644 drivers/dax/hmem/hmem.c

diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index a12e36a12618..134bcb40b2af 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static u8 hmat_revision;
 static int hmat_disable __initdata;
@@ -640,66 +641,6 @@ static void hmat_register_target_perf(struct memory_target 
*target)
	node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0);
 }
 
-static void hmat_register_target_device(struct memory_target *target,
-   struct resource *r)
-{
-   /* define a clean / non-busy resource for the platform device */
-   struct resource res = {
-   .start = r->start,
-   .end = r->end,
-   .flags = IORESOURCE_MEM,
-   };
-   struct platform_device *pdev;
-   struct memregion_info info;
-   int rc, id;
-
-	rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
-   IORES_DESC_SOFT_RESERVED);
-   if (rc != REGION_INTERSECTS)
-   return;
-
-   id = memregion_alloc(GFP_KERNEL);
-   if (id < 0) {
-		pr_err("memregion allocation failure for %pr\n", &res);
-   return;
-   }
-
-   pdev = platform_device_alloc("hmem", id);
-   if (!pdev) {
-		pr_err("hmem device allocation failure for %pr\n", &res);
-   goto out_pdev;
-   }
-
-   pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm);
-   info = (struct memregion_info) {
-   .target_node = acpi_map_pxm_to_node(target->memory_pxm),
-   };
-	rc = platform_device_add_data(pdev, &info, sizeof(info));
-   if (rc < 0) {
-   pr_err("hmem memregion_info allocation failure for %pr\n", 
&res);
-   goto out_pdev;
-   }
-
-	rc = platform_device_add_resources(pdev, &res, 1);
-   if (rc < 0) {
-		pr_err("hmem resource allocation failure for %pr\n", &res);
-   goto out_resource;
-   }
-
-   rc = platform_device_add(pdev);
-   if (rc < 0) {
-		dev_err(&pdev->dev, "device add failed for %pr\n", &res);
-   goto out_resource;
-   }
-
-   return;
-
-out_resource:
-	put_device(&pdev->dev);
-out_pdev:
-   memregion_free(id);
-}
-
 static void hmat_register_target_devices(struct memory_target *target)
 {
struct resource *res;
@@ -711,8 +652,11 @@ static void hmat_register_target_devices(struct 
memory_target *target)
if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM))
return;
 
-   for (res = target->memregions.child; res; res = res->sibling)
-   hmat_register_target_device(target, res);
+   for (res = target->memregions.child; res; res = res->sibling) {
+   int target_nid = acpi_map_pxm_to_node(target->memory_pxm);
+
+   hmem_register_device(target_nid, res);
+   }
 }
 
 static void hmat_register_target(struct memory_target *target)
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 3b6c06f07326..a229f45d34aa 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -48,6 +48,10 @@ config DEV_DAX_HMEM
 
  Say M if unsure.
 
+config DEV_DAX_HMEM_DEVICES
+   depends on DEV_DAX_HMEM
+   def_bool y
+
 config DEV_DAX_KMEM
tristate "KMEM DAX: volatile-use of persistent memory"
default DEV_DAX
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 80065b38b3c4..9d4ba672d305 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -2,11 +2,10 @@
 obj-$(CONFIG_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX) += d

[PATCH v3 06/23] mm/memory_hotplug: Introduce default phys_to_target_node() implementation

2020-07-31 Thread Dan Williams
In preparation to set a fallback value for dev_dax->target_node,
introduce generic fallback helpers for phys_to_target_node().

A generic implementation based on node-data or memblock was proposed,
but as noted by Mike:

"Here again, I would prefer to add a weak default for
 phys_to_target_node() because the "generic" implementation is not really
 generic.

 The fallback to reserved ranges is x86 specific because on x86 most of
 the reserved areas is not in memblock.memory. AFAIK, no other
 architecture does this."

The info message in the generic memory_add_physaddr_to_nid()
implementation is fixed up to properly reflect that
memory_add_physaddr_to_nid() communicates "online" node info and
phys_to_target_node() indicates "target / to-be-onlined" node info.

Cc: David Hildenbrand 
Cc: Mike Rapoport 
Cc: Jia He 
Signed-off-by: Dan Williams 
---
 arch/x86/mm/numa.c |1 -
 include/linux/memory_hotplug.h |5 +
 mm/memory_hotplug.c|   10 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f3805bbaa784..c62e274d52d0 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -917,7 +917,6 @@ int phys_to_target_node(phys_addr_t start)
 
	return meminfo_to_nid(&numa_reserved_meminfo, start);
 }
-EXPORT_SYMBOL_GPL(phys_to_target_node);
 
 int memory_add_physaddr_to_nid(u64 start)
 {
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 375515803cd8..dcdc7d6206d5 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -151,11 +151,16 @@ int add_pages(int nid, unsigned long start_pfn, unsigned 
long nr_pages,
 
 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
+extern int phys_to_target_node(u64 start);
 #else
 static inline int memory_add_physaddr_to_nid(u64 start)
 {
return 0;
 }
+static inline int phys_to_target_node(u64 start)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dcdf3271f87e..426b79adf529 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -353,11 +353,19 @@ int __ref __add_pages(int nid, unsigned long pfn, 
unsigned long nr_pages,
 #ifdef CONFIG_NUMA
 int __weak memory_add_physaddr_to_nid(u64 start)
 {
-   pr_info_once("Unknown target node for memory at 0x%llx, assuming node 
0\n",
+   pr_info_once("Unknown online node for memory at 0x%llx, assuming node 
0\n",
start);
return 0;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+int __weak phys_to_target_node(u64 start)
+{
+   pr_info_once("Unknown target node for memory at 0x%llx, assuming node 
0\n",
+   start);
+   return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
 #endif
 
 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */



[PATCH v3 12/23] device-dax: Add an allocation interface for device-dax instances

2020-07-31 Thread Dan Williams
In preparation for a facility that enables dax regions to be
sub-divided, introduce infrastructure to track and allocate region
capacity.

The new dax_region/available_size attribute is only enabled for volatile
hmem devices, not pmem devices that are defined by nvdimm namespace
boundaries. This is per Jeff's feedback the last time dynamic device-dax
capacity allocation support was discussed.
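
For example (path illustrative), the unallocated capacity of a dynamic
region can be read with:

cat $(readlink -f /sys/bus/dax/devices/dax0.0)/../dax_region/available_size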

Link: 
https://lore.kernel.org/linux-nvdimm/x49shpp3zn8@segfault.boston.devel.redhat.com
Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  120 +
 drivers/dax/bus.h |7 ++-
 drivers/dax/dax-private.h |2 -
 drivers/dax/hmem/hmem.c   |7 +--
 drivers/dax/pmem/core.c   |8 +--
 5 files changed, 121 insertions(+), 23 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 96bd64ba95a5..0a48ce378686 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -130,6 +130,11 @@ ATTRIBUTE_GROUPS(dax_drv);
 
 static int dax_bus_match(struct device *dev, struct device_driver *drv);
 
+static bool is_static(struct dax_region *dax_region)
+{
+   return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
+}
+
 static struct bus_type dax_bus_type = {
.name = "dax",
.uevent = dax_bus_uevent,
@@ -185,7 +190,48 @@ static ssize_t align_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(align);
 
+#define for_each_dax_region_resource(dax_region, res) \
+   for (res = (dax_region)->res.child; res; res = res->sibling)
+
+static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
+{
+	resource_size_t size = resource_size(&dax_region->res);
+   struct resource *res;
+
+   device_lock_assert(dax_region->dev);
+
+   for_each_dax_region_resource(dax_region, res)
+   size -= resource_size(res);
+   return size;
+}
+
+static ssize_t available_size_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+   unsigned long long size;
+
+   device_lock(dev);
+   size = dax_region_avail_size(dax_region);
+   device_unlock(dev);
+
+   return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(available_size);
+
+static umode_t dax_region_visible(struct kobject *kobj, struct attribute *a,
+   int n)
+{
+   struct device *dev = container_of(kobj, struct device, kobj);
+   struct dax_region *dax_region = dev_get_drvdata(dev);
+
+	if (is_static(dax_region) && a == &dev_attr_available_size.attr)
+   return 0;
+   return a->mode;
+}
+
 static struct attribute *dax_region_attributes[] = {
+	&dev_attr_available_size.attr,
	&dev_attr_region_size.attr,
	&dev_attr_align.attr,
	&dev_attr_id.attr,
@@ -195,6 +241,7 @@ static struct attribute *dax_region_attributes[] = {
 static const struct attribute_group dax_region_attribute_group = {
.name = "dax_region",
.attrs = dax_region_attributes,
+   .is_visible = dax_region_visible,
 };
 
 static const struct attribute_group *dax_region_attribute_groups[] = {
@@ -226,7 +273,8 @@ static void dax_region_unregister(void *region)
 }
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-   struct resource *res, int target_node, unsigned int align)
+   struct resource *res, int target_node, unsigned int align,
+   unsigned long flags)
 {
struct dax_region *dax_region;
 
@@ -249,12 +297,17 @@ struct dax_region *alloc_dax_region(struct device 
*parent, int region_id,
return NULL;
 
dev_set_drvdata(parent, dax_region);
-	memcpy(&dax_region->res, res, sizeof(*res));
kref_init(_region->kref);
dax_region->id = region_id;
dax_region->align = align;
dax_region->dev = parent;
dax_region->target_node = target_node;
+   dax_region->res = (struct resource) {
+   .start = res->start,
+   .end = res->end,
+   .flags = IORESOURCE_MEM | flags,
+   };
+
	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
kfree(dax_region);
return NULL;
@@ -267,6 +320,32 @@ struct dax_region *alloc_dax_region(struct device *parent, 
int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
+static int alloc_dev_dax_range(struct dev_dax *dev_dax, resource_size_t size)
+{
+   struct dax_region *dax_region = dev_dax->region;
+	struct resource *res = &dax_region->res;
+	struct device *dev = &dev_dax->dev;
+   struct resource *alloc;
+
+   device_lock_assert(dax_region->dev);
+
+   /* TODO: handle multiple allocations per region */
+   if (res->child)
+   return -ENOMEM;
+
+   alloc = __request_region(res, res->start, size,

[PATCH v3 07/23] ACPI: HMAT: Attach a device for each soft-reserved range

2020-07-31 Thread Dan Williams
The hmem enabling in commit 'cf8741ac57ed ("ACPI: NUMA: HMAT: Register
"soft reserved" memory as an "hmem" device")' only registered ranges to
the hmem driver for each soft-reservation that also appeared in the
HMAT. While this is meant to encourage platform firmware to "do the
right thing" and publish an HMAT, the corollary is that platforms that
fail to publish an accurate HMAT will strand memory from Linux usage.
Additionally, the "efi_fake_mem" kernel command line option will
strand memory by default without an HMAT.

Arrange for "soft reserved" memory that goes unclaimed by HMAT entries
to be published as raw resource ranges for the hmem driver to consume.

Include a module parameter to disable either this fallback behavior, or
the hmat enabling from creating hmem devices. The module parameter
requires the hmem device enabling to have a unique name in the module
namespace: "device_hmem".
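
For example, with the "device_hmem" module name below, the fallback
registration can be suppressed from the kernel command line (a usage
sketch):

device_hmem.disable=1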

The driver depends on the architecture providing phys_to_target_node()
which is only x86 via numa_meminfo() and arm64 via a generic memblock
implementation.

Cc: Jonathan Cameron 
Cc: Brice Goglin 
Cc: Ard Biesheuvel 
Cc: "Rafael J. Wysocki" 
Cc: Jeff Moyer 
Cc: Catalin Marinas 
Cc: Will Deacon 
Reviewed-by: Joao Martins 
Signed-off-by: Dan Williams 
---
 drivers/dax/hmem/Makefile |3 ++-
 drivers/dax/hmem/device.c |   35 +++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/dax/hmem/Makefile b/drivers/dax/hmem/Makefile
index a9d353d0c9ed..57377b4c3d47 100644
--- a/drivers/dax/hmem/Makefile
+++ b/drivers/dax/hmem/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o
-obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device.o
+obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device_hmem.o
 
+device_hmem-y := device.o
 dax_hmem-y := hmem.o
diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
index b9dd6b27745c..cb6401c9e9a4 100644
--- a/drivers/dax/hmem/device.c
+++ b/drivers/dax/hmem/device.c
@@ -5,6 +5,9 @@
 #include 
 #include 
 
+static bool nohmem;
+module_param_named(disable, nohmem, bool, 0444);
+
 void hmem_register_device(int target_nid, struct resource *r)
 {
/* define a clean / non-busy resource for the platform device */
@@ -17,6 +20,9 @@ void hmem_register_device(int target_nid, struct resource *r)
struct memregion_info info;
int rc, id;
 
+   if (nohmem)
+   return;
+
	rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
IORES_DESC_SOFT_RESERVED);
if (rc != REGION_INTERSECTS)
@@ -63,3 +69,32 @@ void hmem_register_device(int target_nid, struct resource *r)
 out_pdev:
memregion_free(id);
 }
+
+static __init int hmem_register_one(struct resource *res, void *data)
+{
+   /*
+* If the resource is not a top-level resource it was already
+* assigned to a device by the HMAT parsing.
+*/
+	if (res->parent != &iomem_resource) {
+   pr_info("HMEM: skip %pr, already claimed\n", res);
+   return 0;
+   }
+
+   hmem_register_device(phys_to_target_node(res->start), res);
+
+   return 0;
+}
+
+static __init int hmem_init(void)
+{
+   walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED,
+   IORESOURCE_MEM, 0, -1, NULL, hmem_register_one);
+   return 0;
+}
+
+/*
+ * As this is a fallback for address ranges unclaimed by the ACPI HMAT
+ * parsing it must be at an initcall level greater than hmat_init().
+ */
+late_initcall(hmem_init);



[PATCH v3 01/23] x86/numa: Cleanup configuration dependent command-line options

2020-07-31 Thread Dan Williams
In preparation for adding a new numa= option, clean up the existing ones
to avoid ifdefs in numa_setup(), and provide feedback when the
numa=fake= option is invalid due to the kernel config. The same does not
need to be done for numa=noacpi, since the capability is already hard
disabled at compile-time.
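
For example, on a kernel built without CONFIG_NUMA_EMU, booting with:

numa=fake=4

now propagates -EINVAL from the new numa_emu_cmdline() stub instead of
silently ignoring the option.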

Suggested-by: Rafael J. Wysocki 
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/numa.h  |8 +++-
 arch/x86/mm/numa.c   |8 ++--
 arch/x86/mm/numa_emulation.c |3 ++-
 arch/x86/xen/enlighten_pv.c  |2 +-
 drivers/acpi/numa/srat.c |9 +++--
 include/acpi/acpi_numa.h |6 +-
 6 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index bbfde3d2662f..0aecc0b629e0 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_NUMA_H
 
 #include 
+#include 
 
 #include 
 #include 
@@ -77,7 +78,12 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable);
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
 #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
-void numa_emu_cmdline(char *);
+int numa_emu_cmdline(char *str);
+#else /* CONFIG_NUMA_EMU */
+static inline int numa_emu_cmdline(char *str)
+{
+   return -EINVAL;
+}
 #endif /* CONFIG_NUMA_EMU */
 
 #endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index aa76ec2d359b..87c52822cc44 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -37,14 +37,10 @@ static __init int numa_setup(char *opt)
return -EINVAL;
if (!strncmp(opt, "off", 3))
numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
if (!strncmp(opt, "fake=", 5))
-   numa_emu_cmdline(opt + 5);
-#endif
-#ifdef CONFIG_ACPI_NUMA
+   return numa_emu_cmdline(opt + 5);
if (!strncmp(opt, "noacpi", 6))
-   acpi_numa = -1;
-#endif
+   disable_srat();
return 0;
 }
 early_param("numa", numa_setup);
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index c5174b4e318b..847c23196e57 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -13,9 +13,10 @@
 static int emu_nid_to_phys[MAX_NUMNODES];
 static char *emu_cmdline __initdata;
 
-void __init numa_emu_cmdline(char *str)
+int __init numa_emu_cmdline(char *str)
 {
emu_cmdline = str;
+   return 0;
 }
 
 static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo 
*mi)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 2aab43a13a8c..64b81ba5a4d6 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1350,7 +1350,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 * any NUMA information the kernel tries to get from ACPI will
 * be meaningless.  Prevent it from trying.
 */
-   acpi_numa = -1;
+   disable_srat();
 #endif
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
 
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 15bbaab8500b..1b0ae0a1959b 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -27,7 +27,12 @@ static int node_to_pxm_map[MAX_NUMNODES]
= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
 
 unsigned char acpi_srat_revision __initdata;
-int acpi_numa __initdata;
+static int acpi_numa __initdata;
+
+void __init disable_srat(void)
+{
+   acpi_numa = -1;
+}
 
 int pxm_to_node(int pxm)
 {
@@ -163,7 +168,7 @@ static int __init slit_valid(struct acpi_table_slit *slit)
 void __init bad_srat(void)
 {
pr_err("SRAT: SRAT not used.\n");
-   acpi_numa = -1;
+   disable_srat();
 }
 
 int __init srat_disabled(void)
diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h
index fdebcfc6c8df..8784183b2204 100644
--- a/include/acpi/acpi_numa.h
+++ b/include/acpi/acpi_numa.h
@@ -17,10 +17,14 @@ extern int pxm_to_node(int);
 extern int node_to_pxm(int);
 extern int acpi_map_pxm_to_node(int);
 extern unsigned char acpi_srat_revision;
-extern int acpi_numa __initdata;
+extern void disable_srat(void);
 
 extern void bad_srat(void);
 extern int srat_disabled(void);
 
+#else  /* CONFIG_ACPI_NUMA */
+static inline void disable_srat(void)
+{
+}
 #endif /* CONFIG_ACPI_NUMA */
 #endif /* __ACP_NUMA_H */



[PATCH v3 03/23] efi/fake_mem: Arrange for a resource entry per efi_fake_mem instance

2020-07-31 Thread Dan Williams
In preparation for attaching a platform device per iomem resource, teach
the efi_fake_mem code to create an e820 entry per instance. Similar to
E820_TYPE_PRAM, bypass resource merging when the e820 map is sanitized.
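
For example (ranges invented for illustration; 0x40000 is the
EFI_MEMORY_SP "soft reserved" attribute), booting with:

efi_fake_mem=2G@4G:0x40000,2G@6G:0x40000

now results in two distinct E820_TYPE_SOFT_RESERVED entries rather than
one merged 4G range.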

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Acked-by: Ard Biesheuvel 
Signed-off-by: Dan Williams 
---
 arch/x86/kernel/e820.c  |   16 +++-
 drivers/firmware/efi/x86_fake_mem.c |   12 +---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 983cd53ed4c9..22aad412f965 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -305,6 +305,20 @@ static int __init cpcompare(const void *a, const void *b)
return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
 }
 
+static bool e820_nomerge(enum e820_type type)
+{
+   /*
+* These types may indicate distinct platform ranges aligned to
+* numa node, protection domain, performance domain, or other
+* boundaries. Do not merge them.
+*/
+   if (type == E820_TYPE_PRAM)
+   return true;
+   if (type == E820_TYPE_SOFT_RESERVED)
+   return true;
+   return false;
+}
+
 int __init e820__update_table(struct e820_table *table)
 {
struct e820_entry *entries = table->entries;
@@ -380,7 +394,7 @@ int __init e820__update_table(struct e820_table *table)
}
 
/* Continue building up new map based on this information: */
-   if (current_type != last_type || current_type == 
E820_TYPE_PRAM) {
+   if (current_type != last_type || e820_nomerge(current_type)) {
if (last_type != 0)  {
new_entries[new_nr_entries].size = 
change_point[chg_idx]->addr - last_addr;
/* Move forward only if the new size was 
non-zero: */
diff --git a/drivers/firmware/efi/x86_fake_mem.c 
b/drivers/firmware/efi/x86_fake_mem.c
index e5d6d5a1b240..0bafcc1bb0f6 100644
--- a/drivers/firmware/efi/x86_fake_mem.c
+++ b/drivers/firmware/efi/x86_fake_mem.c
@@ -38,7 +38,7 @@ void __init efi_fake_memmap_early(void)
m_start = mem->range.start;
m_end = mem->range.end;
for_each_efi_memory_desc(md) {
-   u64 start, end;
+   u64 start, end, size;
 
if (md->type != EFI_CONVENTIONAL_MEMORY)
continue;
@@ -58,11 +58,17 @@ void __init efi_fake_memmap_early(void)
 */
start = max(start, m_start);
end = min(end, m_end);
+   size = end - start + 1;
 
if (end <= start)
continue;
-   e820__range_update(start, end - start + 1, 
E820_TYPE_RAM,
-   E820_TYPE_SOFT_RESERVED);
+
+   /*
+* Ensure each efi_fake_mem instance results in
+* a unique e820 resource
+*/
+   e820__range_remove(start, size, E820_TYPE_RAM, 1);
+   e820__range_add(start, size, E820_TYPE_SOFT_RESERVED);
e820__update_table(e820_table);
}
}



[PATCH v3 02/23] x86/numa: Add 'nohmat' option

2020-07-31 Thread Dan Williams
Disable parsing of the HMAT for debug, to work around broken platform
instances, or for cases where it is otherwise not wanted.
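
For example, HMAT parsing can be skipped with the new command line
option:

numa=nohmat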

Cc: x...@kernel.org
Cc: "Rafael J. Wysocki" 
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Signed-off-by: Dan Williams 
---
 arch/x86/mm/numa.c   |2 ++
 drivers/acpi/numa/hmat.c |8 +++-
 include/acpi/acpi_numa.h |8 
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 87c52822cc44..f3805bbaa784 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -41,6 +41,8 @@ static __init int numa_setup(char *opt)
return numa_emu_cmdline(opt + 5);
if (!strncmp(opt, "noacpi", 6))
disable_srat();
+   if (!strncmp(opt, "nohmat", 6))
+   disable_hmat();
return 0;
 }
 early_param("numa", numa_setup);
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 2c32cfb72370..a12e36a12618 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -26,6 +26,12 @@
 #include 
 
 static u8 hmat_revision;
+static int hmat_disable __initdata;
+
+void __init disable_hmat(void)
+{
+   hmat_disable = 1;
+}
 
 static LIST_HEAD(targets);
 static LIST_HEAD(initiators);
@@ -814,7 +820,7 @@ static __init int hmat_init(void)
enum acpi_hmat_type i;
acpi_status status;
 
-   if (srat_disabled())
+   if (srat_disabled() || hmat_disable)
return 0;
 
status = acpi_get_table(ACPI_SIG_SRAT, 0, );
diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h
index 8784183b2204..0e9302285f14 100644
--- a/include/acpi/acpi_numa.h
+++ b/include/acpi/acpi_numa.h
@@ -27,4 +27,12 @@ static inline void disable_srat(void)
 {
 }
 #endif /* CONFIG_ACPI_NUMA */
+
+#ifdef CONFIG_ACPI_HMAT
+extern void disable_hmat(void);
+#else  /* CONFIG_ACPI_HMAT */
+static inline void disable_hmat(void)
+{
+}
+#endif /* CONFIG_ACPI_HMAT */
 #endif /* __ACP_NUMA_H */
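
With this applied, HMAT parsing can be disabled from the kernel command
line via the existing "numa=" early parameter, e.g. (illustrative):

    numa=nohmat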



[PATCH v3 08/23] device-dax: Drop the dax_region.pfn_flags attribute

2020-07-31 Thread Dan Williams
All callers specify the same flags to alloc_dax_region(), so there is no
need to allow for anything other than PFN_DEV|PFN_MAP, or to carry a
->pfn_flags around on the region. Device-dax instances are always
page-backed.

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |4 +---
 drivers/dax/bus.h |3 +--
 drivers/dax/dax-private.h |2 --
 drivers/dax/device.c  |   26 +++---
 drivers/dax/hmem/hmem.c   |2 +-
 drivers/dax/pmem/core.c   |3 +--
 6 files changed, 7 insertions(+), 33 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index df238c8b6ef2..f06ffa66cd78 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -226,8 +226,7 @@ static void dax_region_unregister(void *region)
 }
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-   struct resource *res, int target_node, unsigned int align,
-   unsigned long long pfn_flags)
+   struct resource *res, int target_node, unsigned int align)
 {
struct dax_region *dax_region;
 
@@ -251,7 +250,6 @@ struct dax_region *alloc_dax_region(struct device *parent, 
int region_id,
 
dev_set_drvdata(parent, dax_region);
memcpy(_region->res, res, sizeof(*res));
-   dax_region->pfn_flags = pfn_flags;
kref_init(_region->kref);
dax_region->id = region_id;
dax_region->align = align;
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 9e4eba67e8b9..55577e9791da 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -10,8 +10,7 @@ struct dax_device;
 struct dax_region;
 void dax_region_put(struct dax_region *dax_region);
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-   struct resource *res, int target_node, unsigned int align,
-   unsigned long long flags);
+   struct resource *res, int target_node, unsigned int align);
 
 enum dev_dax_subsys {
DEV_DAX_BUS,
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 16850d5388ab..8a4c40ccd2ef 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -23,7 +23,6 @@ void dax_bus_exit(void);
  * @dev: parent device backing this region
  * @align: allocation and mapping alignment for child dax devices
  * @res: physical address range of the region
- * @pfn_flags: identify whether the pfns are paged back or not
  */
 struct dax_region {
int id;
@@ -32,7 +31,6 @@ struct dax_region {
struct device *dev;
unsigned int align;
struct resource res;
-   unsigned long long pfn_flags;
 };
 
 /**
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 4c0af2eb7e19..bffef1b21144 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -41,14 +41,6 @@ static int check_vma(struct dev_dax *dev_dax, struct 
vm_area_struct *vma,
return -EINVAL;
}
 
-   if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
-   && (vma->vm_flags & VM_DONTCOPY) == 0) {
-   dev_info_ratelimited(dev,
-   "%s: %s: fail, dax range requires 
MADV_DONTFORK\n",
-   current->comm, func);
-   return -EINVAL;
-   }
-
if (!vma_is_dax(vma)) {
dev_info_ratelimited(dev,
"%s: %s: fail, vma is not DAX capable\n",
@@ -102,7 +94,7 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
}
 
-   *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+   *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
 }
@@ -127,12 +119,6 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
}
 
-   /* dax pmd mappings require pfn_t_devmap() */
-   if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
-   dev_dbg(dev, "region lacks devmap flags\n");
-   return VM_FAULT_SIGBUS;
-   }
-
if (fault_size < dax_region->align)
return VM_FAULT_SIGBUS;
else if (fault_size > dax_region->align)
@@ -150,7 +136,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
}
 
-   *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+   *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
@@ -177,12 +163,6 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax 
*dev_dax,
return VM_FAULT_SIGBUS;
}
 
-   /* dax pud mappings require pfn_t_devmap() */
-   if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (P

[PATCH v3 05/23] resource: Report parent to walk_iomem_res_desc() callback

2020-07-31 Thread Dan Williams
In support of detecting whether a resource might have been claimed,
report the parent to the walk_iomem_res_desc() callback. For example,
the ACPI HMAT parser publishes "hmem" platform devices per target range.
However, if the HMAT is disabled / missing, a fallback driver can attach
devices to the raw memory ranges when it sees unclaimed / orphan "Soft
Reserved" resources in the resource tree.

Otherwise, find_next_iomem_res() returns a resource with garbage data
from the stack allocation in __walk_iomem_res_desc() for the res->parent
field.

There are currently no users that expect ->child and ->sibling to be
valid, and the resource_lock would be needed to traverse them. Use a
compound literal to implicitly zero initialize the fields that are not
being returned in addition to setting ->parent.

Cc: Jason Gunthorpe 
Cc: Dave Hansen 
Cc: Wei Yang 
Cc: Tom Lendacky 
Signed-off-by: Dan Williams 
---
 kernel/resource.c |   11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/resource.c b/kernel/resource.c
index 841737bbda9e..f1175ce93a1d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -382,10 +382,13 @@ static int find_next_iomem_res(resource_size_t start, 
resource_size_t end,
 
if (p) {
/* copy data */
-   res->start = max(start, p->start);
-   res->end = min(end, p->end);
-   res->flags = p->flags;
-   res->desc = p->desc;
+   *res = (struct resource) {
+   .start = max(start, p->start),
+   .end = min(end, p->end),
+   .flags = p->flags,
+   .desc = p->desc,
+   .parent = p->parent,
+   };
}
 
read_unlock(_lock);
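
A minimal sketch of the intended consumer: a walk_iomem_res_desc()
callback that uses the now-valid ->parent to skip ranges another driver
has already claimed (the function names and the attach step are
illustrative, not part of this patch):

static int __init hmem_register_one(struct resource *res, void *data)
{
        /*
         * A soft-reserved range already assigned to a device is no
         * longer a top-level resource; an orphan range still hangs
         * directly off the iomem root.
         */
        if (res->parent != &iomem_resource)
                return 0;

        /* ...attach a fallback device for this range here... */
        return 0;
}

static int __init hmem_init(void)
{
        walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, IORESOURCE_MEM,
                        0, -1, NULL, hmem_register_one);
        return 0;
}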



[PATCH v3 00/23] device-dax: Support sub-dividing soft-reserved ranges

2020-07-31 Thread Dan Williams
Changes since v2 [1]:
- Rebase on next/master to resolve conflicts with pending mem-hotplug
  and memremap_pages() changes in -mm

- Drop attempt at a generic phys_to_target_node() implementation and
  just follow the default fallback approach taken with
  memory_add_physaddr_to_nid() (Mike)

- Fix test_hmm and other compilation fixups (Ralph)

- Integrate Joao's extensions to the device-dax sub-division interface
  (per-device align, user-directed extent allocation). (Joao)

[1]: 
http://lore.kernel.org/r/159457116473.754248.7879464730875147365.st...@dwillia2-desk3.amr.corp.intel.com

---
Merge notes:

Andrew, this series is rebased on today's next/master to resolve
conflicts with some pending patches in -mm. I'd like to take it through
your tree given the intersections with memremap_pages() and memory
hotplug. If at all possible I'd like to see it in v5.10, but I realize
time is short. Outside of the Intel-identified use cases for this, Joao
has identified a use case for Oracle as well.

I would have sent this earlier save for the fact I am mostly offline
tending to a newborn these days. Vishal has stepped up to take on care
and feeding of this patchset if additional review / integration fixups
are needed.

The one piece of test feedback this wants is from Justin (justin...@arm.com):
whether this lights up dax_kmem, and now dax_hmem, for him on arm64.
Otherwise, Joao has written unit tests for this in his enabling of the
daxctl userspace utility [2].

---
Cover:

The device-dax facility allows an address range to be directly mapped
through a chardev, or optionally hotplugged to the core kernel page
allocator as System-RAM. It is the mechanism for converting persistent
memory (pmem) to be used as another volatile memory pool, i.e. the
current Memory Tiering hot topic on linux-mm.

In the case of pmem the nvdimm-namespace-label mechanism can sub-divide
it, but that labeling mechanism is not available / applicable to
soft-reserved ("EFI specific purpose") memory [3]. This series provides
a sysfs-mechanism for the daxctl utility to enable provisioning of
volatile-soft-reserved memory ranges.

The motivations for this facility are:

1/ Allow performance differentiated memory ranges to be split between
   kernel-managed and directly-accessed use cases.

2/ Allow physical memory to be provisioned along performance relevant
   address boundaries. For example, divide a memory-side cache [4] along
   cache-color boundaries.

3/ Parcel out soft-reserved memory to VMs using device-dax as a security
   / permissions boundary [5]. Specifically, I have seen people (ab)using
   memmap=nn!ss (mark System-RAM as Persistent Memory) just to get the
   device-dax interface on custom address ranges. A follow-on for the VM
   use case is to teach device-dax to dynamically allocate 'struct page' at
   runtime to reduce the duplication of 'struct page' space in both the
   guest and the host kernel for the same physical pages.
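
As a preview of the resulting interface, provisioning with the daxctl
extensions from [2] looks roughly like this (region / device names and
sizes are illustrative):

    # carve a 4G device-dax instance out of a soft-reserved region
    daxctl create-device -r region0 -s 4G

    # optionally hand it to the page allocator as hotplugged System-RAM
    daxctl reconfigure-device --mode=system-ram dax0.1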

[2]: http://lore.kernel.org/r/20200713160837.13774-11-joao.m.mart...@oracle.com
[3]: 
http://lore.kernel.org/r/157309097008.1579826.12818463304589384434.st...@dwillia2-desk3.amr.corp.intel.com
[4]: 
http://lore.kernel.org/r/154899811738.3165233.12325692939590944259.st...@dwillia2-desk3.amr.corp.intel.com
[5]: http://lore.kernel.org/r/20200110190313.17144-1-joao.m.mart...@oracle.com

---

Dan Williams (19):
  x86/numa: Cleanup configuration dependent command-line options
  x86/numa: Add 'nohmat' option
  efi/fake_mem: Arrange for a resource entry per efi_fake_mem instance
  ACPI: HMAT: Refactor hmat_register_target_device to hmem_register_device
  resource: Report parent to walk_iomem_res_desc() callback
  mm/memory_hotplug: Introduce default phys_to_target_node() implementation
  ACPI: HMAT: Attach a device for each soft-reserved range
  device-dax: Drop the dax_region.pfn_flags attribute
  device-dax: Move instance creation parameters to 'struct dev_dax_data'
  device-dax: Make pgmap optional for instance creation
  device-dax: Kill dax_kmem_res
  device-dax: Add an allocation interface for device-dax instances
  device-dax: Introduce 'seed' devices
  drivers/base: Make device_find_child_by_name() compatible with sysfs 
inputs
  device-dax: Add resize support
  mm/memremap_pages: Convert to 'struct range'
  mm/memremap_pages: Support multiple ranges per invocation
  device-dax: Add dis-contiguous resource support
  device-dax: Introduce 'mapping' devices

Joao Martins (4):
  device-dax: Make align a per-device property
  device-dax: Add an 'align' attribute
  dax/hmem: Introduce dax_hmem.region_idle parameter
  device-dax: Add a range mapping allocation attribute


 arch/powerpc/kvm/book3s_hv_uvmem.c |   14 
 arch/x86/include/asm/numa.h|8 
 arch/x86/kernel/e820.c |   16 
 arch/x86/mm/numa.c |   11 
 arch/x86/mm/numa_emulation.c   |3 
 ar

[PATCH] ACPI: NFIT: Fix ARS zero-sized allocation

2020-07-31 Thread Dan Williams
Pending commit in -next "devres: handle zero size in devm_kmalloc()"
triggers a boot regression due to the ARS implementation expecting NULL
from a zero-sized allocation. Avoid the zero-sized allocation by
skipping ARS; otherwise the driver crashes with the following signature
when dereferencing ZERO_SIZE_PTR.

 BUG: kernel NULL pointer dereference, address: 0018
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x) - not-present page
 RIP: 0010:__acpi_nfit_scrub+0x28a/0x350 [nfit]
 [..]
 Call Trace:
   ? acpi_nfit_query_poison+0x6a/0x180 [nfit]
   acpi_nfit_scrub+0x36/0xb0 [nfit]
   process_one_work+0x23c/0x580
   worker_thread+0x50/0x3b0

Otherwise the implementation correctly aborts when NULL is returned from
devm_kzalloc() in ars_status_alloc().

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Signed-off-by: Dan Williams 
---
 drivers/acpi/nfit/core.c |   15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index fb775b967c52..26dd208a0d63 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -3334,7 +3334,7 @@ static void acpi_nfit_init_ars(struct acpi_nfit_desc 
*acpi_desc,
 static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
 {
struct nfit_spa *nfit_spa;
-   int rc;
+   int rc, do_sched_ars = 0;
 
set_bit(ARS_VALID, _desc->scrub_flags);
list_for_each_entry(nfit_spa, _desc->spas, list) {
@@ -3346,7 +3346,7 @@ static int acpi_nfit_register_regions(struct 
acpi_nfit_desc *acpi_desc)
}
}
 
-   list_for_each_entry(nfit_spa, _desc->spas, list)
+   list_for_each_entry(nfit_spa, _desc->spas, list) {
switch (nfit_spa_type(nfit_spa->spa)) {
case NFIT_SPA_VOLATILE:
case NFIT_SPA_PM:
@@ -3354,6 +3354,13 @@ static int acpi_nfit_register_regions(struct 
acpi_nfit_desc *acpi_desc)
rc = ars_register(acpi_desc, nfit_spa);
if (rc)
return rc;
+
+   /*
+* Kick off background ARS if at least one
+* region successfully registered ARS
+*/
+   if (!test_bit(ARS_FAILED, _spa->ars_state))
+   do_sched_ars++;
break;
case NFIT_SPA_BDW:
/* nothing to register */
@@ -3372,8 +3379,10 @@ static int acpi_nfit_register_regions(struct 
acpi_nfit_desc *acpi_desc)
/* don't register unknown regions */
break;
}
+   }
 
-   sched_ars(acpi_desc);
+   if (do_sched_ars)
+   sched_ars(acpi_desc);
return 0;
 }
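
For background, the failure mode reduces to the following pattern (the
struct and helper are illustrative, not the actual driver code):

static u64 ars_peek(struct device *dev)
{
        struct ars_buf {                /* illustrative layout */
                u64 status;
                u64 out_length;         /* offset 8 */
        } *buf;

        /*
         * With the pending devres change a zero-size allocation
         * returns ZERO_SIZE_PTR (16), not NULL...
         */
        buf = devm_kzalloc(dev, 0, GFP_KERNEL);
        if (!buf)                       /* ...so this is not taken */
                return 0;
        /* ...and this faults at 16 + 8 == 0x18, as in the trace above */
        return buf->out_length;
}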
 



Re: [PATCH v3 0/6] Fix and enable pmem as RAM device on arm64

2020-07-31 Thread Dan Williams
On Wed, Jul 8, 2020 at 7:06 PM Jia He  wrote:
>
> This fixes a few issues when I tried to enable pmem as a RAM device on arm64.

What NVDIMM bus driver is being used in this case? The ACPI NFIT
driver? I'm just looking to see if currently deployed
phys_to_target_node() is sufficient, or if this is coming in a new
driver?


Re: [PATCH v1 2/4] device-dax: Add an 'align' attribute

2020-07-31 Thread Dan Williams
On Thu, Jul 16, 2020 at 10:31 AM Joao Martins  wrote:
>
> Introduce a device align attribute. While doing so,
> rename the region align attribute to be more explicitly
> named as so, but keep it named as @align to retain the API
> for tools like daxctl.
>
> Changes to align may not always be valid: say certain
> mappings were created with 2M and then we switch to 1G. So, we
> validate all ranges against the new value being attempted,
> post resizing.
>
> Signed-off-by: Joao Martins 
> ---
>  drivers/dax/bus.c | 101 +-
>  1 file changed, 92 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index 2578651c596e..eb384dd6a376 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -230,14 +230,15 @@ static ssize_t region_size_show(struct device *dev,
>  static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
> region_size_show, NULL);
>
> -static ssize_t align_show(struct device *dev,
> +static ssize_t region_align_show(struct device *dev,
> struct device_attribute *attr, char *buf)
>  {
> struct dax_region *dax_region = dev_get_drvdata(dev);
>
> return sprintf(buf, "%u\n", dax_region->align);
>  }
> -static DEVICE_ATTR_RO(align);
> +static struct device_attribute dev_attr_region_align =
> +   __ATTR(align, 0400, region_align_show, NULL);
>
>  #define for_each_dax_region_resource(dax_region, res) \
> for (res = (dax_region)->res.child; res; res = res->sibling)
> @@ -488,7 +489,7 @@ static umode_t dax_region_visible(struct kobject *kobj, 
> struct attribute *a,
>  static struct attribute *dax_region_attributes[] = {
> _attr_available_size.attr,
> _attr_region_size.attr,
> -   _attr_align.attr,
> +   _attr_region_align.attr,
> _attr_create.attr,
> _attr_seed.attr,
> _attr_delete.attr,
> @@ -855,14 +856,13 @@ static ssize_t size_show(struct device *dev,
> return sprintf(buf, "%llu\n", size);
>  }
>
> -static bool alloc_is_aligned(struct dax_region *dax_region,
> -   resource_size_t size)
> +static bool alloc_is_aligned(resource_size_t size, unsigned long align)

For type safety, let's make this take @dev_dax as a parameter. For the
dev_dax_set_align() case I think it is ok to provisionally adjust
dev_dax->align under the lock before entry and revert to the old
alignment on failure.

I can fix that up locally on applying.
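
For reference, the type-safe shape being suggested is roughly (a
sketch, not the final applied code):

static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size)
{
        /*
         * dev_dax->align may be provisionally updated under the lock
         * by the align store path and reverted if validation fails.
         */
        return IS_ALIGNED(size, max_t(unsigned long, dev_dax->align,
                                memremap_compat_align()));
}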


Re: [PATCH RFC v2 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver

2020-07-21 Thread Dan Williams
On Tue, Jul 21, 2020 at 9:29 AM Greg KH  wrote:
>
> On Tue, Jul 21, 2020 at 09:02:15AM -0700, Dave Jiang wrote:
> > v2:
>
> "RFC" to me means "I don't really think this is mergable, so I'm
> throwing it out there."  Which implies you know it needs more work
> before others should review it as you are not comfortable with it :(

There's a full-blown Reviewed-by from me on the irq changes. The VFIO /
mdev changes looked ok to me, but I did not feel comfortable / did not
have time to sign off on them. At the same time I did not see much to
be gained by keeping those internal. So "RFC" in this case is a bit
modest. It's more that an internal reviewer said this looks like it is
going in the right direction, but wants more community discussion on
the approach.

> So, back-of-the-queue you go...

Let's consider this not-RFC in that context. The drivers/base/ pieces
have my review for you; the rest are dmaengine and vfio subsystem
concerns that could use some commentary.


Re: [PATCH v3 10/11] PM, libnvdimm: Add runtime firmware activation support

2020-07-20 Thread Dan Williams
On Mon, Jul 20, 2020 at 5:14 PM Vishal Verma  wrote:
>
> On Mon, 2020-07-20 at 17:02 -0700, Randy Dunlap wrote:
> > Hi Dan,
> >
> > Documentation comments below:
>
> Dan, Randy,
>
> I'm happy to fix these up when applying.

Sounds good. Thanks Vishal.


Re: [PATCH v3 10/11] PM, libnvdimm: Add runtime firmware activation support

2020-07-20 Thread Dan Williams
On Mon, Jul 20, 2020 at 5:02 PM Randy Dunlap  wrote:
>
> Hi Dan,
>
> Documentation comments below:
>
> On 7/20/20 3:08 PM, Dan Williams wrote:
> > Abstract platform specific mechanics for nvdimm firmware activation
> > behind a handful of generic ops. At the bus level ->activate_state()
> > indicates the unified state (idle, busy, armed) of all DIMMs on the bus,
> > and ->capability() indicates the system state expectations for activate.
> > At the DIMM level ->activate_state() indicates the per-DIMM state,
> > ->activate_result() indicates the outcome of the last activation
> > attempt, and ->arm() attempts to transition the DIMM from 'idle' to
> > 'armed'.
> >
> > A new hibernate_quiet_exec() facility is added to support firmware
> > activation in an OS defined system quiesce state. It leverages the fact
> > that the hibernate-freeze state wants to assert that a memory
> > hibernation snapshot can be taken. This is in contrast to a platform
> > firmware defined quiesce state that may forcefully quiet the memory
> > controller independent of whether an individual device-driver properly
> > supports hibernate-freeze.
> >
> > The libnvdimm sysfs interface is extended to support detection of a
> > firmware activate capability. The mechanism supports enumeration and
> > triggering of firmware activate, optionally in the
> > hibernate_quiet_exec() context.
> >
> > Cc: Pavel Machek 
> > Cc: Ira Weiny 
> > Cc: Len Brown 
> > Cc: Jonathan Corbet 
> > Cc: Dave Jiang 
> > Cc: Vishal Verma 
> > [rafael: hibernate_quiet_exec() proposal]
> > Co-developed-by: "Rafael J. Wysocki" 
> > Signed-off-by: Dan Williams 
> > ---
> >  Documentation/ABI/testing/sysfs-bus-nvdimm |2
> >  .../driver-api/nvdimm/firmware-activate.rst|   86 
> >  drivers/nvdimm/core.c  |  149 
> > 
> >  drivers/nvdimm/dimm_devs.c |  115 +++
> >  drivers/nvdimm/nd-core.h   |1
> >  include/linux/libnvdimm.h  |   44 ++
> >  include/linux/suspend.h|6 +
> >  kernel/power/hibernate.c   |   97 +
> >  8 files changed, 500 insertions(+)
> >  create mode 100644 Documentation/ABI/testing/sysfs-bus-nvdimm
> >  create mode 100644 Documentation/driver-api/nvdimm/firmware-activate.rst
>
>
> > diff --git a/Documentation/driver-api/nvdimm/firmware-activate.rst 
> > b/Documentation/driver-api/nvdimm/firmware-activate.rst
> > new file mode 100644
> > index ..9eb98aa833c5
> > --- /dev/null
> > +++ b/Documentation/driver-api/nvdimm/firmware-activate.rst
> > @@ -0,0 +1,86 @@
> > +.. SPDX-License-Identifier: GPL-2.0
> > +
> > +==
> > +NVDIMM Runtime Firmware Activation
> > +==
> > +
> > +Some persistent memory devices run a firmware locally on the device /
>
>   run firmware

That works too. I was going to say "run a firmware image", but "run
firmware" is clearer.

>
> > +"DIMM" to perform tasks like media management, capacity provisioning,
> > +and health monitoring. The process of updating that firmware typically
> > +involves a reboot because it has implications for in-flight memory
> > +transactions. However, reboots are disruptive and at least the Intel
> > +persistent memory platform implementation, described by the Intel ACPI
> > +DSM specification [1], has added support for activating firmware at
>
> that's an Intel spec?  just checking.

Correct. It's a public specification of the ACPI methods that Intel
platform BIOS or virtual-machine BIOS deploys to talk to NVDIMM
devices.

>
> > +runtime.
> > +
> > +A native sysfs interface is implemented in libnvdimm to allow platform
>
>  platforms

Ack.

>
> > +to advertise and control their local runtime firmware activation
> > +capability.
> > +
> > +The libnvdimm bus object, ndbusX, implements an ndbusX/firmware/activate
> > +attribute that shows the state of the firmware activation as one of 'idle',
> > +'armed', 'overflow', and 'busy'.
>
> or

Yup.

>
> > +
> > +- idle:
> > +  No devices are set / armed to activate firmware
> > +
> > +- armed:
> > +  At least one device is armed
> > +
> > +- busy:
>

[PATCH v3 03/11] ACPI: NFIT: Define runtime firmware activation commands

2020-07-20 Thread Dan Williams
Platform reboots are expensive. Towards reducing the downtime to apply
firmware updates, the Intel NVDIMM command definition is growing support
for applying live firmware updates that only require temporarily
suspending memory traffic instead of a full reboot.

Follow-on commits add support for triggering firmware activation; this
patch only defines the commands, adds probe support, and validates that
they are blocked via the ioctl path. The ioctl-path block ensures that
the OS is in charge, since these commands have side effects only the OS
can handle. Specifically, firmware activation may cause the memory
controller to be quiesced on the order of 100s of milliseconds. In that
case Linux ensures the activation only takes place while the OS is in a
suspend state.

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Link: https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf
Signed-off-by: Dan Williams 
---
 drivers/acpi/nfit/core.c   |   86 ++--
 drivers/acpi/nfit/intel.h  |   53 +++
 drivers/acpi/nfit/nfit.h   |   25 -
 include/uapi/linux/ndctl.h |3 +-
 4 files changed, 137 insertions(+), 30 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 9fdd655bdf0e..78cc9e2d2aa3 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -73,6 +73,18 @@ const guid_t *to_nfit_uuid(enum nfit_uuids id)
 }
 EXPORT_SYMBOL(to_nfit_uuid);
 
+static const guid_t *to_nfit_bus_uuid(int family)
+{
+   if (WARN_ONCE(family == NVDIMM_BUS_FAMILY_NFIT,
+   "only secondary bus families can be translated\n"))
+   return NULL;
+   /*
+* The index of bus UUIDs starts immediately following the last
+* NVDIMM/leaf family.
+*/
+   return to_nfit_uuid(family + NVDIMM_FAMILY_MAX);
+}
+
 static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc)
 {
struct nvdimm_bus_descriptor *nd_desc = _desc->nd_desc;
@@ -362,24 +374,8 @@ static u8 nfit_dsm_revid(unsigned family, unsigned func)
 {
static const u8 revid_table[NVDIMM_FAMILY_MAX+1][NVDIMM_CMD_MAX+1] = {
[NVDIMM_FAMILY_INTEL] = {
-   [NVDIMM_INTEL_GET_MODES] = 2,
-   [NVDIMM_INTEL_GET_FWINFO] = 2,
-   [NVDIMM_INTEL_START_FWUPDATE] = 2,
-   [NVDIMM_INTEL_SEND_FWUPDATE] = 2,
-   [NVDIMM_INTEL_FINISH_FWUPDATE] = 2,
-   [NVDIMM_INTEL_QUERY_FWUPDATE] = 2,
-   [NVDIMM_INTEL_SET_THRESHOLD] = 2,
-   [NVDIMM_INTEL_INJECT_ERROR] = 2,
-   [NVDIMM_INTEL_GET_SECURITY_STATE] = 2,
-   [NVDIMM_INTEL_SET_PASSPHRASE] = 2,
-   [NVDIMM_INTEL_DISABLE_PASSPHRASE] = 2,
-   [NVDIMM_INTEL_UNLOCK_UNIT] = 2,
-   [NVDIMM_INTEL_FREEZE_LOCK] = 2,
-   [NVDIMM_INTEL_SECURE_ERASE] = 2,
-   [NVDIMM_INTEL_OVERWRITE] = 2,
-   [NVDIMM_INTEL_QUERY_OVERWRITE] = 2,
-   [NVDIMM_INTEL_SET_MASTER_PASSPHRASE] = 2,
-   [NVDIMM_INTEL_MASTER_SECURE_ERASE] = 2,
+   [NVDIMM_INTEL_GET_MODES ...
+   NVDIMM_INTEL_FW_ACTIVATE_ARM] = 2,
},
};
u8 id;
@@ -406,7 +402,7 @@ static bool payload_dumpable(struct nvdimm *nvdimm, 
unsigned int func)
 }
 
 static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned int cmd,
-   struct nd_cmd_pkg *call_pkg)
+   struct nd_cmd_pkg *call_pkg, int *family)
 {
if (call_pkg) {
int i;
@@ -417,6 +413,7 @@ static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned 
int cmd,
for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++)
if (call_pkg->nd_reserved2[i])
return -EINVAL;
+   *family = call_pkg->nd_family;
return call_pkg->nd_command;
}
 
@@ -450,13 +447,14 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, 
struct nvdimm *nvdimm,
acpi_handle handle;
const guid_t *guid;
int func, rc, i;
+   int family = 0;
 
if (cmd_rc)
*cmd_rc = -EINVAL;
 
if (cmd == ND_CMD_CALL)
call_pkg = buf;
-   func = cmd_to_func(nfit_mem, cmd, call_pkg);
+   func = cmd_to_func(nfit_mem, cmd, call_pkg, );
if (func < 0)
return func;
 
@@ -478,9 +476,17 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, 
struct nvdimm *nvdimm,
 
cmd_name = nvdimm_bus_cmd_name(cmd);
cmd_mask = nd_desc->cmd_mask;
-   dsm_mask = acpi_desc->bus_dsm_mask;
+ 
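
Aside: the table compaction above uses the GNU C range-designator
extension that the kernel relies on widely; a self-contained
illustration:

        /* elements 2 through 5 initialized to 7, the rest to 0 */
        static const unsigned char revid[8] = { [2 ... 5] = 7 };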

[PATCH v3 08/11] driver-core: Introduce DEVICE_ATTR_ADMIN_{RO,RW}

2020-07-20 Thread Dan Williams
A common pattern for using plain DEVICE_ATTR() instead of
DEVICE_ATTR_RO() and DEVICE_ATTR_RW() is for attributes that need to
limit read access to root only. I.e. many users of DEVICE_ATTR() are
specifying 0400 or 0600 for permissions.

Given the expectation that CAP_SYS_ADMIN is needed to access these
sensitive attributes add an explicit helper with the _ADMIN_ identifier
for DEVICE_ATTR_ADMIN_{RO,RW}.

Reviewed-by: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Signed-off-by: Dan Williams 
---
 include/linux/device.h |4 
 include/linux/sysfs.h  |7 +++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/device.h b/include/linux/device.h
index 15460a5ac024..d7c2570368fa 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -128,8 +128,12 @@ ssize_t device_store_bool(struct device *dev, struct 
device_attribute *attr,
__ATTR_PREALLOC(_name, _mode, _show, _store)
 #define DEVICE_ATTR_RW(_name) \
struct device_attribute dev_attr_##_name = __ATTR_RW(_name)
+#define DEVICE_ATTR_ADMIN_RW(_name) \
+   struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600)
 #define DEVICE_ATTR_RO(_name) \
struct device_attribute dev_attr_##_name = __ATTR_RO(_name)
+#define DEVICE_ATTR_ADMIN_RO(_name) \
+   struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400)
 #define DEVICE_ATTR_WO(_name) \
struct device_attribute dev_attr_##_name = __ATTR_WO(_name)
 #define DEVICE_ULONG_ATTR(_name, _mode, _var) \
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 86067dbe7745..34e84122f635 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -123,6 +123,13 @@ struct attribute_group {
.show   = _name##_show, \
 }
 
+#define __ATTR_RW_MODE(_name, _mode) { \
+   .attr   = { .name = __stringify(_name), \
+   .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },  \
+   .show   = _name##_show, \
+   .store  = _name##_store,\
+}
+
 #define __ATTR_WO(_name) { \
.attr   = { .name = __stringify(_name), .mode = 0200 }, \
.store  = _name##_store,\
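
Usage then mirrors the existing helpers; a minimal sketch (the show
routine body is a placeholder):

/* before: static DEVICE_ATTR(resource, 0400, resource_show, NULL); */
static ssize_t resource_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return sprintf(buf, "%#llx\n", 0ULL);   /* placeholder value */
}
static DEVICE_ATTR_ADMIN_RO(resource);          /* 0400, root-only read */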



[PATCH v3 04/11] tools/testing/nvdimm: Cleanup dimm index passing

2020-07-20 Thread Dan Williams
The ND_CMD_CALL path only applies to the nfit_test0 emulated DIMMs.
Clean up occurrences of (i - t->dcr_idx), since that offset fixup only
applies to cases where nfit_test1 needs a bus-local index.

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Signed-off-by: Dan Williams 
---
 tools/testing/nvdimm/test/nfit.c |   34 ++
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index a59174ba1d2a..ddf9b3095bfa 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1224,6 +1224,11 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor 
*nd_desc,
i = get_dimm(nfit_mem, func);
if (i < 0)
return i;
+   if (i >= NUM_DCR) {
+   dev_WARN_ONCE(>pdev.dev, 1,
+   "ND_CMD_CALL only valid for 
nfit_test0\n");
+   return -EINVAL;
+   }
 
switch (func) {
case NVDIMM_INTEL_GET_SECURITY_STATE:
@@ -1252,11 +1257,11 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor 
*nd_desc,
break;
case NVDIMM_INTEL_OVERWRITE:
rc = nd_intel_test_cmd_overwrite(t,
-   buf, buf_len, i - t->dcr_idx);
+   buf, buf_len, i);
break;
case NVDIMM_INTEL_QUERY_OVERWRITE:
rc = nd_intel_test_cmd_query_overwrite(t,
-   buf, buf_len, i - t->dcr_idx);
+   buf, buf_len, i);
break;
case NVDIMM_INTEL_SET_MASTER_PASSPHRASE:
rc = nd_intel_test_cmd_master_set_pass(t,
@@ -1272,48 +1277,45 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor 
*nd_desc,
break;
case ND_INTEL_FW_GET_INFO:
rc = nd_intel_test_get_fw_info(t, buf,
-   buf_len, i - t->dcr_idx);
+   buf_len, i);
break;
case ND_INTEL_FW_START_UPDATE:
rc = nd_intel_test_start_update(t, buf,
-   buf_len, i - t->dcr_idx);
+   buf_len, i);
break;
case ND_INTEL_FW_SEND_DATA:
rc = nd_intel_test_send_data(t, buf,
-   buf_len, i - t->dcr_idx);
+   buf_len, i);
break;
case ND_INTEL_FW_FINISH_UPDATE:
rc = nd_intel_test_finish_fw(t, buf,
-   buf_len, i - t->dcr_idx);
+   buf_len, i);
break;
case ND_INTEL_FW_FINISH_QUERY:
rc = nd_intel_test_finish_query(t, buf,
-   buf_len, i - t->dcr_idx);
+   buf_len, i);
break;
case ND_INTEL_SMART:
rc = nfit_test_cmd_smart(buf, buf_len,
-   >smart[i - t->dcr_idx]);
+   >smart[i]);
break;
case ND_INTEL_SMART_THRESHOLD:
rc = nfit_test_cmd_smart_threshold(buf,
buf_len,
-   >smart_threshold[i -
-   t->dcr_idx]);
+   >smart_threshold[i]);
break;
case ND_INTEL_SMART_SET_THRESHOLD:
rc = nfit_test_cmd_smart_set_threshold(buf,
buf_len,
-   >smart_threshold[i -
-   t->dcr_idx],
-   >smart[i - t->dcr_idx],
+   >

[PATCH v3 09/11] libnvdimm: Convert to DEVICE_ATTR_ADMIN_RO()

2020-07-20 Thread Dan Williams
Move libnvdimm sysfs attributes that currently use an open coded
DEVICE_ATTR() to hide sensitive root-only information (physical memory
layout) to the new DEVICE_ATTR_ADMIN_RO() helper.

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Signed-off-by: Dan Williams 
---
 drivers/nvdimm/namespace_devs.c |2 +-
 drivers/nvdimm/pfn_devs.c   |2 +-
 drivers/nvdimm/region_devs.c|2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index ae155e860fdc..6da67f4d641a 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1309,7 +1309,7 @@ static ssize_t resource_show(struct device *dev,
return -ENXIO;
return sprintf(buf, "%#llx\n", (unsigned long long) res->start);
 }
-static DEVICE_ATTR(resource, 0400, resource_show, NULL);
+static DEVICE_ATTR_ADMIN_RO(resource);
 
 static const unsigned long blk_lbasize_supported[] = { 512, 520, 528,
4096, 4104, 4160, 4224, 0 };
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 34db557dbad1..3e11ef8d3f5b 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -218,7 +218,7 @@ static ssize_t resource_show(struct device *dev,
 
return rc;
 }
-static DEVICE_ATTR(resource, 0400, resource_show, NULL);
+static DEVICE_ATTR_ADMIN_RO(resource);
 
 static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4502f9c4708d..20ff30c2ab93 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -605,7 +605,7 @@ static ssize_t resource_show(struct device *dev,
 
return sprintf(buf, "%#llx\n", nd_region->ndr_start);
 }
-static DEVICE_ATTR(resource, 0400, resource_show, NULL);
+static DEVICE_ATTR_ADMIN_RO(resource);
 
 static ssize_t persistence_domain_show(struct device *dev,
struct device_attribute *attr, char *buf)



[PATCH v3 11/11] ACPI: NFIT: Add runtime firmware activate support

2020-07-20 Thread Dan Williams
Plumb the platform specific backend for the generic libnvdimm firmware
activate interface. Register dimm level operations to arm/disarm
activation, and register bus level operations to report the dynamic
platform-quiesce time relative to the number of dimms armed for firmware
activation.

A new nfit-specific bus attribute "firmware_activate_noidle" is added to
allow the activation to switch between platform-enforced and
OS-opportunistic device quiesce. In other words, let the hibernate cycle
handle in-flight device-dma rather than have the platform attempt to
increase PCI-E timeouts and the like.

Cc: Dave Jiang 
Cc: Ira Weiny 
Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 Documentation/ABI/testing/sysfs-bus-nfit |   19 +
 drivers/acpi/nfit/core.c |   41 +++
 drivers/acpi/nfit/intel.c|  386 ++
 drivers/acpi/nfit/intel.h|3 
 drivers/acpi/nfit/nfit.h |   10 +
 drivers/nvdimm/dimm_devs.c   |4 
 include/linux/libnvdimm.h|5 
 7 files changed, 461 insertions(+), 7 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-nfit 
b/Documentation/ABI/testing/sysfs-bus-nfit
index a1cb44dcb908..e4f76e7eab93 100644
--- a/Documentation/ABI/testing/sysfs-bus-nfit
+++ b/Documentation/ABI/testing/sysfs-bus-nfit
@@ -202,6 +202,25 @@ Description:
functions. See the section named 'NVDIMM Root Device _DSMs' in
the ACPI specification.
 
+What:  /sys/bus/nd/devices/ndbusX/nfit/firmware_activate_noidle
+Date:  Apr, 2020
+KernelVersion: v5.8
+Contact:   linux-nvd...@lists.01.org
+Description:
+   (RW) The Intel platform implementation of firmware activate
+   support exposes an option to let the platform force-idle devices in
+   the system over the activation event, or to trust that the OS will
+   do it. The safe default is to let the platform force-idle
+   devices since the kernel is already in a suspend state, and on
+   the chance that a driver does not properly quiesce bus-mastering
+   after a suspend callback the platform will handle it.  However,
+   the activation might abort if, for example, platform firmware
+   determines that the activation time exceeds the max PCI-E
+   completion timeout. Since the platform does not know whether the
+   OS is running the activation from a suspend context, it aborts,
+   but if the system owner trusts the driver suspend callbacks to be
+   sufficient then 'firmware_activate_noidle' can be
+   enabled to bypass the activation abort.
 
 What:  /sys/bus/nd/devices/regionX/nfit/range_index
 Date:  Jun, 2015
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 78cc9e2d2aa3..fb775b967c52 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1392,8 +1392,12 @@ static umode_t nfit_visible(struct kobject *kobj, struct 
attribute *a, int n)
struct device *dev = container_of(kobj, struct device, kobj);
struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
 
-   if (a == _attr_scrub.attr && !ars_supported(nvdimm_bus))
-   return 0;
+   if (a == _attr_scrub.attr)
+   return ars_supported(nvdimm_bus) ? a->mode : 0;
+
+   if (a == _attr_firmware_activate_noidle.attr)
+   return intel_fwa_supported(nvdimm_bus) ? a->mode : 0;
+
return a->mode;
 }
 
@@ -1402,6 +1406,7 @@ static struct attribute *acpi_nfit_attributes[] = {
_attr_scrub.attr,
_attr_hw_error_scrub.attr,
_attr_bus_dsm_mask.attr,
+   _attr_firmware_activate_noidle.attr,
NULL,
 };
 
@@ -2019,6 +2024,26 @@ static const struct nvdimm_security_ops 
*acpi_nfit_get_security_ops(int family)
}
 }
 
+static const struct nvdimm_fw_ops *acpi_nfit_get_fw_ops(
+   struct nfit_mem *nfit_mem)
+{
+   unsigned long mask;
+   struct acpi_nfit_desc *acpi_desc = nfit_mem->acpi_desc;
+   struct nvdimm_bus_descriptor *nd_desc = _desc->nd_desc;
+
+   if (!nd_desc->fw_ops)
+   return NULL;
+
+   if (nfit_mem->family != NVDIMM_FAMILY_INTEL)
+   return NULL;
+
+   mask = nfit_mem->dsm_mask & NVDIMM_INTEL_FW_ACTIVATE_CMDMASK;
+   if (mask != NVDIMM_INTEL_FW_ACTIVATE_CMDMASK)
+   return NULL;
+
+   return intel_fw_ops;
+}
+
 static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 {
struct nfit_mem *nfit_mem;
@@ -2095,7 +2120,8 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc 
*acpi_desc)
acpi_nfit_dimm_attribute_groups,
flags, cmd_mask, flush ? flush->hint_count : 0,
  
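
The new control then amounts to a single sysfs toggle, e.g. (the bus
instance name is illustrative):

    # trust OS-driven quiesce instead of platform forced-idle
    echo 1 > /sys/bus/nd/devices/ndbus0/nfit/firmware_activate_noidle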

[PATCH v3 06/11] tools/testing/nvdimm: Prepare nfit_ctl_test() for ND_CMD_CALL emulation

2020-07-20 Thread Dan Williams
In preparation for adding a mocked implementation of the
firmware-activate bus-info command, rework nfit_ctl_test() to operate on
a local command payload wrapped in a 'struct nd_cmd_pkg'.

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Signed-off-by: Dan Williams 
---
 tools/testing/nvdimm/test/nfit.c |   83 --
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 9c6f475befe4..2b0bfbfc0abb 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -2726,14 +2726,17 @@ static int nfit_ctl_test(struct device *dev)
struct acpi_nfit_desc *acpi_desc;
const u64 test_val = 0x0123456789abcdefULL;
unsigned long mask, cmd_size, offset;
-   union {
-   struct nd_cmd_get_config_size cfg_size;
-   struct nd_cmd_clear_error clear_err;
-   struct nd_cmd_ars_status ars_stat;
-   struct nd_cmd_ars_cap ars_cap;
-   char buf[sizeof(struct nd_cmd_ars_status)
-   + sizeof(struct nd_ars_record)];
-   } cmds;
+   struct nfit_ctl_test_cmd {
+   struct nd_cmd_pkg pkg;
+   union {
+   struct nd_cmd_get_config_size cfg_size;
+   struct nd_cmd_clear_error clear_err;
+   struct nd_cmd_ars_status ars_stat;
+   struct nd_cmd_ars_cap ars_cap;
+   char buf[sizeof(struct nd_cmd_ars_status)
+   + sizeof(struct nd_ars_record)];
+   };
+   } cmd;
 
adev = devm_kzalloc(dev, sizeof(*adev), GFP_KERNEL);
if (!adev)
@@ -2793,21 +2796,21 @@ static int nfit_ctl_test(struct device *dev)
 
 
/* basic checkout of a typical 'get config size' command */
-   cmd_size = sizeof(cmds.cfg_size);
-   cmds.cfg_size = (struct nd_cmd_get_config_size) {
+   cmd_size = sizeof(cmd.cfg_size);
+   cmd.cfg_size = (struct nd_cmd_get_config_size) {
.status = 0,
.config_size = SZ_128K,
.max_xfer = SZ_4K,
};
-   rc = setup_result(cmds.buf, cmd_size);
+   rc = setup_result(cmd.buf, cmd_size);
if (rc)
return rc;
rc = acpi_nfit_ctl(_desc->nd_desc, nvdimm, ND_CMD_GET_CONFIG_SIZE,
-   cmds.buf, cmd_size, _rc);
+   cmd.buf, cmd_size, _rc);
 
-   if (rc < 0 || cmd_rc || cmds.cfg_size.status != 0
-   || cmds.cfg_size.config_size != SZ_128K
-   || cmds.cfg_size.max_xfer != SZ_4K) {
+   if (rc < 0 || cmd_rc || cmd.cfg_size.status != 0
+   || cmd.cfg_size.config_size != SZ_128K
+   || cmd.cfg_size.max_xfer != SZ_4K) {
dev_dbg(dev, "%s: failed at: %d rc: %d cmd_rc: %d\n",
__func__, __LINE__, rc, cmd_rc);
return -EIO;
@@ -2816,14 +2819,14 @@ static int nfit_ctl_test(struct device *dev)
 
/* test ars_status with zero output */
cmd_size = offsetof(struct nd_cmd_ars_status, address);
-   cmds.ars_stat = (struct nd_cmd_ars_status) {
+   cmd.ars_stat = (struct nd_cmd_ars_status) {
.out_length = 0,
};
-   rc = setup_result(cmds.buf, cmd_size);
+   rc = setup_result(cmd.buf, cmd_size);
if (rc)
return rc;
rc = acpi_nfit_ctl(_desc->nd_desc, NULL, ND_CMD_ARS_STATUS,
-   cmds.buf, cmd_size, _rc);
+   cmd.buf, cmd_size, _rc);
 
if (rc < 0 || cmd_rc) {
dev_dbg(dev, "%s: failed at: %d rc: %d cmd_rc: %d\n",
@@ -2833,16 +2836,16 @@ static int nfit_ctl_test(struct device *dev)
 
 
/* test ars_cap with benign extended status */
-   cmd_size = sizeof(cmds.ars_cap);
-   cmds.ars_cap = (struct nd_cmd_ars_cap) {
+   cmd_size = sizeof(cmd.ars_cap);
+   cmd.ars_cap = (struct nd_cmd_ars_cap) {
.status = ND_ARS_PERSISTENT << 16,
};
offset = offsetof(struct nd_cmd_ars_cap, status);
-   rc = setup_result(cmds.buf + offset, cmd_size - offset);
+   rc = setup_result(cmd.buf + offset, cmd_size - offset);
if (rc)
return rc;
rc = acpi_nfit_ctl(_desc->nd_desc, NULL, ND_CMD_ARS_CAP,
-   cmds.buf, cmd_size, _rc);
+   cmd.buf, cmd_size, _rc);
 
if (rc < 0 || cmd_rc) {
dev_dbg(dev, "%s: failed at: %d rc: %d cmd_rc: %d\n",
@@ -2852,19 +2855,19 @@ static int nfit_ctl_test(struct device *dev)
 
 
/* test ars_status with 'status' trimmed from 'out_length' */
-   cmd_size = sizeof(cmds.ars_stat) + sizeof(struct nd_ars_record);
-   cmds.ars_stat = (struct nd_cmd_ars_status) {
+   cmd_

[PATCH v3 07/11] tools/testing/nvdimm: Emulate firmware activation commands

2020-07-20 Thread Dan Williams
Augment the existing firmware update emulation to track activations and
validate proper update vs activate sequencing.

The DIMM firmware activate capability has a concept of a maximum amount
of time platform firmware will quiesce the system relative to how many
DIMMs are being activated in parallel. Simulate DIMM activation
happening serially, 1 second per DIMM, capped at a 3 second max. The
nfit_test0 bus emulates 5 DIMMs, so at 1 second each against a 3 second
budget it takes 2 activations to update all DIMMs.

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Reported-by: Andy Shevchenko 
Signed-off-by: Dan Williams 
---
 drivers/acpi/nfit/intel.h|5 +
 tools/testing/nvdimm/test/nfit.c |  209 +-
 2 files changed, 210 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/nfit/intel.h b/drivers/acpi/nfit/intel.h
index 868d073731cc..49a598623024 100644
--- a/drivers/acpi/nfit/intel.h
+++ b/drivers/acpi/nfit/intel.h
@@ -132,6 +132,9 @@ struct nd_intel_fw_activate_dimminfo {
u8 reserved[7];
 } __packed;
 
+#define ND_INTEL_DIMM_FWA_ARM 1
+#define ND_INTEL_DIMM_FWA_DISARM 0
+
 struct nd_intel_fw_activate_arm {
u8 activate_arm;
u32 status;
@@ -160,6 +163,8 @@ struct nd_intel_bus_fw_activate_businfo {
 #define ND_INTEL_BUS_FWA_STATUS_NOIDLE (6 | 5 << 16)
 #define ND_INTEL_BUS_FWA_STATUS_ABORT  (6 | 6 << 16)
 
+#define ND_INTEL_BUS_FWA_IODEV_FORCE_IDLE (0)
+#define ND_INTEL_BUS_FWA_IODEV_OS_IDLE (1)
 struct nd_intel_bus_fw_activate {
u8 iodev_state;
u32 status;
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 2b0bfbfc0abb..a1a5dc645b40 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -173,6 +173,9 @@ struct nfit_test_fw {
u64 version;
u32 size_received;
u64 end_time;
+   bool armed;
+   bool missed_activate;
+   unsigned long last_activate;
 };
 
 struct nfit_test {
@@ -345,7 +348,7 @@ static int nd_intel_test_finish_fw(struct nfit_test *t,
__func__, t, nd_cmd, buf_len, idx);
 
if (fw->state == FW_STATE_UPDATED) {
-   /* update already done, need cold boot */
+   /* update already done, need activation */
nd_cmd->status = 0x20007;
return 0;
}
@@ -430,6 +433,7 @@ static int nd_intel_test_finish_query(struct nfit_test *t,
}
dev_dbg(dev, "%s: transition out verify\n", __func__);
fw->state = FW_STATE_UPDATED;
+   fw->missed_activate = false;
/* fall through */
case FW_STATE_UPDATED:
nd_cmd->status = 0;
@@ -1178,6 +1182,134 @@ static int nd_intel_test_cmd_master_secure_erase(struct 
nfit_test *t,
return 0;
 }
 
+static unsigned long last_activate;
+
+static int nvdimm_bus_intel_fw_activate_businfo(struct nfit_test *t,
+   struct nd_intel_bus_fw_activate_businfo *nd_cmd,
+   unsigned int buf_len)
+{
+   int i, armed = 0;
+   int state;
+   u64 tmo;
+
+   for (i = 0; i < NUM_DCR; i++) {
+   struct nfit_test_fw *fw = >fw[i];
+
+   if (fw->armed)
+   armed++;
+   }
+
+   /*
+* Emulate 3 second activation max, and 1 second incremental
+* quiesce time per dimm requiring multiple activates to get all
+* DIMMs updated.
+*/
+   if (armed)
+   state = ND_INTEL_FWA_ARMED;
+   else if (!last_activate || time_after(jiffies, last_activate + 3 * HZ))
+   state = ND_INTEL_FWA_IDLE;
+   else
+   state = ND_INTEL_FWA_BUSY;
+
+   tmo = armed * USEC_PER_SEC;
+   *nd_cmd = (struct nd_intel_bus_fw_activate_businfo) {
+   .capability = ND_INTEL_BUS_FWA_CAP_FWQUIESCE
+   | ND_INTEL_BUS_FWA_CAP_OSQUIESCE
+   | ND_INTEL_BUS_FWA_CAP_RESET,
+   .state = state,
+   .activate_tmo = tmo,
+   .cpu_quiesce_tmo = tmo,
+   .io_quiesce_tmo = tmo,
+   .max_quiesce_tmo = 3 * USEC_PER_SEC,
+   };
+
+   return 0;
+}
+
+static int nvdimm_bus_intel_fw_activate(struct nfit_test *t,
+   struct nd_intel_bus_fw_activate *nd_cmd,
+   unsigned int buf_len)
+{
+   struct nd_intel_bus_fw_activate_businfo info;
+   u32 status = 0;
+   int i;
+
+   nvdimm_bus_intel_fw_activate_businfo(t, , sizeof(info));
+   if (info.state == ND_INTEL_FWA_BUSY)
+   status = ND_INTEL_BUS_FWA_STATUS_BUSY;
+   else if (info.activate_tmo > info.max_quiesce_tmo)
+   status = ND_INTEL_BUS_FWA_STATUS_TMO;
+   else if (info.state == ND_INTEL_FWA_IDLE)
+   status = ND_INTEL_BUS_FWA_STATUS_NOARM;
+
+   dev_dbg(>pdev.dev, "status: %d\n", status);
+   nd_cmd->status = st

[PATCH v3 10/11] PM, libnvdimm: Add runtime firmware activation support

2020-07-20 Thread Dan Williams
Abstract platform specific mechanics for nvdimm firmware activation
behind a handful of generic ops. At the bus level ->activate_state()
indicates the unified state (idle, busy, armed) of all DIMMs on the bus,
and ->capability() indicates the system state expectations for activate.
At the DIMM level ->activate_state() indicates the per-DIMM state,
->activate_result() indicates the outcome of the last activation
attempt, and ->arm() attempts to transition the DIMM from 'idle' to
'armed'.

A new hibernate_quiet_exec() facility is added to support firmware
activation in an OS defined system quiesce state. It leverages the fact
that the hibernate-freeze state wants to assert that a memory
hibernation snapshot can be taken. This is in contrast to a platform
firmware defined quiesce state that may forcefully quiet the memory
controller independent of whether an individual device-driver properly
supports hibernate-freeze.

The libnvdimm sysfs interface is extended to support detection of a
firmware activate capability. The mechanism supports enumeration and
triggering of firmware activate, optionally in the
hibernate_quiet_exec() context.
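
Concretely, the ops described above take roughly this shape
(paraphrased; the include/linux/libnvdimm.h changes in the diffstat
carry the authoritative definitions):

struct nvdimm_bus_fw_ops {
        enum nvdimm_fwa_state (*activate_state)
                (struct nvdimm_bus_descriptor *nd_desc);
        enum nvdimm_fwa_capability (*capability)
                (struct nvdimm_bus_descriptor *nd_desc);
        int (*activate)(struct nvdimm_bus_descriptor *nd_desc);
};

struct nvdimm_fw_ops {
        enum nvdimm_fwa_state (*activate_state)(struct nvdimm *nvdimm);
        enum nvdimm_fwa_result (*activate_result)(struct nvdimm *nvdimm);
        int (*arm)(struct nvdimm *nvdimm, enum nvdimm_fwa_trigger arg);
};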

Cc: Pavel Machek 
Cc: Ira Weiny 
Cc: Len Brown 
Cc: Jonathan Corbet 
Cc: Dave Jiang 
Cc: Vishal Verma 
[rafael: hibernate_quiet_exec() proposal]
Co-developed-by: "Rafael J. Wysocki" 
Signed-off-by: Dan Williams 
---
 Documentation/ABI/testing/sysfs-bus-nvdimm |2 
 .../driver-api/nvdimm/firmware-activate.rst|   86 
 drivers/nvdimm/core.c  |  149 
 drivers/nvdimm/dimm_devs.c |  115 +++
 drivers/nvdimm/nd-core.h   |1 
 include/linux/libnvdimm.h  |   44 ++
 include/linux/suspend.h|6 +
 kernel/power/hibernate.c   |   97 +
 8 files changed, 500 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-nvdimm
 create mode 100644 Documentation/driver-api/nvdimm/firmware-activate.rst

diff --git a/Documentation/ABI/testing/sysfs-bus-nvdimm 
b/Documentation/ABI/testing/sysfs-bus-nvdimm
new file mode 100644
index ..d64380262be8
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-nvdimm
@@ -0,0 +1,2 @@
+The libnvdimm sub-system implements a common sysfs interface for
+platform nvdimm resources. See Documentation/driver-api/nvdimm/.
diff --git a/Documentation/driver-api/nvdimm/firmware-activate.rst 
b/Documentation/driver-api/nvdimm/firmware-activate.rst
new file mode 100644
index ..9eb98aa833c5
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/firmware-activate.rst
@@ -0,0 +1,86 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==
+NVDIMM Runtime Firmware Activation
+==
+
+Some persistent memory devices run firmware locally on the device /
+"DIMM" to perform tasks like media management, capacity provisioning,
+and health monitoring. The process of updating that firmware typically
+involves a reboot because it has implications for in-flight memory
+transactions. However, reboots are disruptive and at least the Intel
+persistent memory platform implementation, described by the Intel ACPI
+DSM specification [1], has added support for activating firmware at
+runtime.
+
+A native sysfs interface is implemented in libnvdimm to allow platforms
+to advertise and control their local runtime firmware activation
+capability.
+
+The libnvdimm bus object, ndbusX, implements an ndbusX/firmware/activate
+attribute that shows the state of the firmware activation as one of 'idle',
+'armed', 'overflow', or 'busy'.
+
+- idle:
+  No devices are set / armed to activate firmware
+
+- armed:
+  At least one device is armed
+
+- busy:
+  In the busy state armed devices are in the process of transitioning
+  back to idle and completing an activation cycle.
+
+- overflow:
+  If the platform has a concept of incremental work needed to perform
+  the activation it could be the case that too many DIMMs are armed for
+  activation. In that scenario the potential for firmware activation to
+  timeout is indicated by the 'overflow' state.
+
+The 'ndbusX/firmware/activate' property can be written with a value of
+either 'live', or 'quiesce'. A value of 'quiesce' triggers the kernel to
+run firmware activation from within the equivalent of the hibernation
+'freeze' state where drivers and applications are notified to stop their
+modifications of system memory. A value of 'live' attempts
+firmware-activation without this hibernation cycle. The
+'ndbusX/firmware/activate' property will be elided completely if no
+firmware activation capability is detected.
+
+Another property 'ndbusX/firmware/capability' indicates a value of
+'live' or 'quiesce', where 'live' indicates that the firmware
+does not require or inflict any quiesce period on the system to u

[PATCH v3 05/11] tools/testing/nvdimm: Add command debug messages

2020-07-20 Thread Dan Williams
Arrange for the nfit_test_ctl() path to dump command payloads similarly
to the acpi_nfit_ctl() path. This is useful for comparing the
sequence of command events between an emulated ACPI-NFIT platform and a
real one.

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Signed-off-by: Dan Williams 
---
 tools/testing/nvdimm/test/nfit.c |   25 +
 1 file changed, 25 insertions(+)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index ddf9b3095bfa..9c6f475befe4 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1192,6 +1192,29 @@ static int get_dimm(struct nfit_mem *nfit_mem, unsigned 
int func)
return i;
 }
 
+static void nfit_ctl_dbg(struct acpi_nfit_desc *acpi_desc,
+   struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+   unsigned int len)
+{
+   struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc);
+   unsigned int func = cmd;
+   unsigned int family = 0;
+
+   if (cmd == ND_CMD_CALL) {
+   struct nd_cmd_pkg *pkg = buf;
+
+   len = pkg->nd_size_in;
+   family = pkg->nd_family;
+   buf = pkg->nd_payload;
+   func = pkg->nd_command;
+   }
+   dev_dbg(>pdev.dev, "%s family: %d cmd: %d: func: %d input length: 
%d\n",
+   nvdimm ? nvdimm_name(nvdimm) : "bus", family, cmd, func,
+   len);
+   print_hex_dump_debug("nvdimm in  ", DUMP_PREFIX_OFFSET, 16, 4,
+   buf, min(len, 256u), true);
+}
+
 static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
struct nvdimm *nvdimm, unsigned int cmd, void *buf,
unsigned int buf_len, int *cmd_rc)
@@ -1205,6 +1228,8 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor 
*nd_desc,
cmd_rc = &__cmd_rc;
*cmd_rc = 0;
 
+   nfit_ctl_dbg(acpi_desc, nvdimm, cmd, buf, buf_len);
+
if (nvdimm) {
struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm);
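
Since the added messages come from dev_dbg() / print_hex_dump_debug()
they are gated by dynamic debug and can be enabled at runtime, e.g.
(illustrative):

    echo 'module nfit_test +p' > /sys/kernel/debug/dynamic_debug/control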



[PATCH v3 00/11] ACPI/NVDIMM: Runtime Firmware Activation

2020-07-20 Thread Dan Williams
Changes since v2 [1]:
- Drop the "mem-quiet" pm-debug interface in favor of an explicit
  hibernate_quiet_exec() helper that executes firmware activation, or
  any other subsystem provided routine, in a system-quiet context.
  (Rafael)

- Rework the sysfs interface to add an explicit trigger to run
  activation under hibernate_quiet_exec(). Rename
  ndbusX/firmware_activate to ndbusX/firmware/activate, and add a
  ndbusX/firmware/capability. Some ndctl reworks are needed to catch up
  with this change.

- The new ndbusX/firmware/capability attribute indicates the default
  activation method / execution context between "live" and "suspend".

[1]: 
http://lore.kernel.org/r/159408711335.2385045.2567600405906448375.st...@dwillia2-desk3.amr.corp.intel.com

---

Quoting the documentation:

Some persistent memory devices run firmware locally on the device /
"DIMM" to perform tasks like media management, capacity provisioning,
and health monitoring. The process of updating that firmware typically
involves a reboot because it has implications for in-flight memory
transactions. However, reboots are disruptive and at least the Intel
persistent memory platform implementation, described by the Intel ACPI
DSM specification [1], has added support for activating firmware at
runtime.

[1]: https://docs.pmem.io/persistent-memory/

The approach taken is to abstract the Intel platform-specific mechanism
behind a libnvdimm-generic sysfs interface. The interface could support
runtime firmware activation on another architecture without needing to
change userspace tooling.

The ACPI NFIT implementation involves a set of device-specific methods
(DSMs) to 'arm' individual devices for activation and a bus-level
'trigger' method to execute the activation. Informational / enumeration
methods are also provided at the bus and device level.

One complicating aspect of the memory device firmware activation is that
the memory controller may need to be quiesced (no memory cycles) during
the activation. While the platform has mechanisms to support holding off
in-flight DMA during the activation, the device response to that delay
is potentially undefined. The platform may reject a runtime firmware
update if, for example, a PCI-E device does not support its completion
timeout value being increased to meet the activation time. Outside of
device timeouts the quiesce period may also violate application
timeouts.

Given the above device and application timeout considerations the
implementation uses a new hibernate_quiet_exec() facility to carry-out
firmware activation. This imposes the same conditions that allow for a
stable memory image snapshot to be taken for a hibernate-to-disk
sequence. However, if desired, runtime activation without the hibernate
freeze can be forced as an override.

The ndctl utility grows the following extensions / commands to drive
this mechanism:

1/ The existing update-firmware command will 'arm' devices where the
   firmware image is staged by default.

ndctl update-firmware all -f firmware_image.bin

2/ The existing ability to enumerate firmware-update capabilities now
   includes firmware activate capabilities at the 'bus' and 'dimm/device'
   level:

ndctl list -BDF -b nfit_test.0
[
  {
"provider":"nfit_test.0",
"dev":"ndbus2",
"scrub_state":"idle",
"firmware":{
  "activate_method":"suspend",
  "activate_state":"idle"
},
"dimms":[
  {
"dev":"nmem1",
"id":"cdab-0a-07e0-",
"handle":0,
"phys_id":0,
"security":"disabled",
"firmware":{
  "current_version":0,
  "can_update":true
}
  },
...

3/ The new activate-firmware command triggers firmware activation per
   the platform-enumerated context, "suspend" vs "live", or can be forced
   to "live" if there is explicit knowledge that allowing applications
   and devices to race the quiesce timeout will have no adverse effects.

ndctl activate-firmware nfit_test.0 [--force]

These patches are passing an updated version of the ndctl
"firmware-update.sh" unit test (to be posted).

---

Dan Williams (11):
  libnvdimm: Validate command family indices
  ACPI: NFIT: Move bus_dsm_mask out of generic nvdimm_bus_descriptor
  ACPI: NFIT: Define runtime firmware activation commands
  tools/testing/nvdimm: Cleanup dimm index passing
  tools/testing/nvdimm: Add command debug messages
  tools/testing/nvdimm: Prepare nfit_ctl_test() for ND_CMD_CALL emulation
  tools/testing/nvdimm: Emulate firmware activation

[PATCH v3 02/11] ACPI: NFIT: Move bus_dsm_mask out of generic nvdimm_bus_descriptor

2020-07-20 Thread Dan Williams
DSMs are strictly an ACPI mechanism, so evict the bus_dsm_mask concept from
the generic 'struct nvdimm_bus_descriptor' object.

As a side effect the test facility ->bus_nfit_cmd_force_en is no longer
necessary. The test infrastructure can communicate that information
directly in ->bus_dsm_mask.

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Signed-off-by: Dan Williams 
---
 drivers/acpi/nfit/core.c |8 
 drivers/acpi/nfit/nfit.h |2 +-
 include/linux/libnvdimm.h|1 -
 tools/testing/nvdimm/test/nfit.c |   16 
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 1f72ce1a782b..9fdd655bdf0e 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -478,7 +478,7 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, 
struct nvdimm *nvdimm,
 
cmd_name = nvdimm_bus_cmd_name(cmd);
cmd_mask = nd_desc->cmd_mask;
-   dsm_mask = nd_desc->bus_dsm_mask;
+   dsm_mask = acpi_desc->bus_dsm_mask;
desc = nd_cmd_bus_desc(cmd);
guid = to_nfit_uuid(NFIT_DEV_BUS);
handle = adev->handle;
@@ -1238,8 +1238,9 @@ static ssize_t bus_dsm_mask_show(struct device *dev,
 {
struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+   struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
 
-   return sprintf(buf, "%#lx\n", nd_desc->bus_dsm_mask);
+   return sprintf(buf, "%#lx\n", acpi_desc->bus_dsm_mask);
 }
 static struct device_attribute dev_attr_bus_dsm_mask =
__ATTR(dsm_mask, 0444, bus_dsm_mask_show, NULL);
@@ -2157,7 +2158,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc 
*acpi_desc)
int i;
 
nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
-   nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask);
 
@@ -2180,7 +2180,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc 
*acpi_desc)
(1 << NFIT_CMD_ARS_INJECT_GET);
for_each_set_bit(i, &dsm_mask, BITS_PER_LONG)
if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
-   set_bit(i, &nd_desc->bus_dsm_mask);
+   set_bit(i, &acpi_desc->bus_dsm_mask);
 }
 
 static ssize_t range_index_show(struct device *dev,
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 5c5e7ebba8dc..da097149d94d 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -237,7 +237,7 @@ struct acpi_nfit_desc {
unsigned long scrub_flags;
unsigned long dimm_cmd_force_en;
unsigned long bus_cmd_force_en;
-   unsigned long bus_nfit_cmd_force_en;
+   unsigned long bus_dsm_mask;
unsigned int platform_cap;
unsigned int scrub_tmo;
int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index bd39a2cf7972..ad9898ece7d3 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -76,7 +76,6 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
 struct device_node;
 struct nvdimm_bus_descriptor {
const struct attribute_group **attr_groups;
-   unsigned long bus_dsm_mask;
unsigned long cmd_mask;
unsigned long dimm_family_mask;
unsigned long bus_family_mask;
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index a8ee5c4d41eb..a59174ba1d2a 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -2507,10 +2507,10 @@ static void nfit_test0_setup(struct nfit_test *t)
set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
set_bit(ND_CMD_CALL, &acpi_desc->bus_cmd_force_en);
-   set_bit(NFIT_CMD_TRANSLATE_SPA, &acpi_desc->bus_nfit_cmd_force_en);
-   set_bit(NFIT_CMD_ARS_INJECT_SET, &acpi_desc->bus_nfit_cmd_force_en);
-   set_bit(NFIT_CMD_ARS_INJECT_CLEAR, &acpi_desc->bus_nfit_cmd_force_en);
-   set_bit(NFIT_CMD_ARS_INJECT_GET, &acpi_desc->bus_nfit_cmd_force_en);
+   set_bit(NFIT_CMD_TRANSLATE_SPA, &acpi_desc->bus_dsm_mask);
+   set_bit(NFIT_CMD_ARS_INJECT_SET, &acpi_desc->bus_dsm_mask);
+   set_bit(NFIT_CMD_ARS_INJECT_CLEAR, &acpi_desc->bus_dsm_mask);
+   set_bit(NFIT_CMD_ARS_INJECT_GET, &acpi_desc->bus_dsm_mask);
set_bit(ND_INTEL_FW_GET_INFO, &acpi_desc->dimm_cmd_force_en);
set_bit(ND_INTEL_FW_START_UPDATE, &acpi_desc->dimm_cmd_force_en);
set_bit(ND_INTEL_FW_SEND_DATA, &acpi_desc->dimm_cmd_force_en);
@@ -2731,11 +2731,11 @@ static int nfit_ctl_test(struct device *dev)
  

[PATCH v3 01/11] libnvdimm: Validate command family indices

2020-07-20 Thread Dan Williams
The ND_CMD_CALL format allows for a general passthrough of passlisted
commands targeting a given command set. However, there is no validation
of the family index relative to what the bus supports.

- Update the NFIT bus implementation (the only one that supports
  ND_CMD_CALL passthrough) to also passlist the valid set of command
  family indices.

- Update the generic __nd_ioctl() path to validate that field on behalf
  of all implementations.

Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Fixes: 31eca76ba2fc ("nfit, libnvdimm: limited/whitelisted dimm command 
marshaling mechanism")
Cc: 
Signed-off-by: Dan Williams 
---
 drivers/acpi/nfit/core.c   |   11 +--
 drivers/acpi/nfit/nfit.h   |1 -
 drivers/nvdimm/bus.c   |   16 
 include/linux/libnvdimm.h  |2 ++
 include/uapi/linux/ndctl.h |4 
 5 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 7c138a4edc03..1f72ce1a782b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1823,6 +1823,7 @@ static void populate_shutdown_status(struct nfit_mem 
*nfit_mem)
 static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
struct nfit_mem *nfit_mem, u32 device_handle)
 {
+   struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
struct acpi_device *adev, *adev_dimm;
struct device *dev = acpi_desc->dev;
unsigned long dsm_mask, label_mask;
@@ -1834,6 +1835,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc 
*acpi_desc,
/* nfit test assumes 1:1 relationship between commands and dsms */
nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
nfit_mem->family = NVDIMM_FAMILY_INTEL;
+   set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
 
if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID)
sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x",
@@ -1886,10 +1888,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc 
*acpi_desc,
 * Note, that checking for function0 (bit0) tells us if any commands
 * are reachable through this GUID.
 */
+   clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
-   if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
+   if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) {
+   set_bit(i, &nd_desc->dimm_family_mask);
if (family < 0 || i == default_dsm_family)
family = i;
+   }
 
/* limit the supported commands to those that are publicly documented */
nfit_mem->family = family;
@@ -2153,6 +2158,9 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc 
*acpi_desc)
 
nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
+   set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
+   set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask);
+
adev = to_acpi_dev(acpi_desc);
if (!adev)
return;
@@ -2160,7 +2168,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc 
*acpi_desc)
for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
set_bit(i, &nd_desc->cmd_mask);
-   set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
 
dsm_mask =
(1 << ND_CMD_ARS_CAP) |
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index f5525f8bb770..5c5e7ebba8dc 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -33,7 +33,6 @@
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
-#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
 #define NVDIMM_CMD_MAX 31
 
 #define NVDIMM_STANDARD_CMDMASK \
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 09087c38fabd..955265656b96 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1037,9 +1037,25 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, 
struct nvdimm *nvdimm,
dimm_name = "bus";
}
 
+   /* Validate command family support against bus declared support */
if (cmd == ND_CMD_CALL) {
+   unsigned long *mask;
+
if (copy_from_user(&pkg, p, sizeof(pkg)))
return -EFAULT;
+
+   if (nvdimm) {
+   if (pkg.nd_family > NVDIMM_FAMILY_MAX)
+   return -EINVAL;
mask = &nd_desc->dimm_family_mask;
+   } else {
+   if (pkg
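
(The archived diff truncates above. A hedged sketch of how the validation
plausibly completes, per the description at the top of this patch; the
merged commit is authoritative:)

                } else {
                        if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX)
                                return -EINVAL;
                        mask = &nd_desc->bus_family_mask;
                }

                /* reject family indices the bus never declared support for */
                if (!test_bit(pkg.nd_family, mask))
                        return -EINVAL;
        }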

Re: [PATCH v2 22/22] device-dax: Introduce 'mapping' devices

2020-07-16 Thread Dan Williams
On Thu, Jul 16, 2020 at 6:19 AM Joao Martins  wrote:
>
> On 7/12/20 5:28 PM, Dan Williams wrote:
> > In support of interrogating the physical address layout of a device with
> > dis-contiguous ranges, introduce a sysfs directory with 'start', 'end',
> > and 'page_offset' attributes. The alternative is trying to parse
> > /proc/iomem, and that file will not reflect the extent layout until the
> > device is enabled.
> >
> > Cc: Vishal Verma 
> > Signed-off-by: Dan Williams 
> > ---
> >  drivers/dax/bus.c |  191 
> > +
> >  drivers/dax/dax-private.h |   14 +++
> >  2 files changed, 203 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> > index f342e36c69a1..8b6c4ddc5f42 100644
> > --- a/drivers/dax/bus.c
> > +++ b/drivers/dax/bus.c
> > @@ -579,6 +579,167 @@ struct dax_region *alloc_dax_region(struct device 
> > *parent, int region_id,
> >  }
> >  EXPORT_SYMBOL_GPL(alloc_dax_region);
> >
> > +static void dax_mapping_release(struct device *dev)
> > +{
> > + struct dax_mapping *mapping = to_dax_mapping(dev);
> > + struct dev_dax *dev_dax = to_dev_dax(dev->parent);
> > +
> > + ida_free(&dev_dax->ida, mapping->id);
> > + kfree(mapping);
> > +}
> > +
> > +static void unregister_dax_mapping(void *data)
> > +{
> > + struct device *dev = data;
> > + struct dax_mapping *mapping = to_dax_mapping(dev);
> > + struct dev_dax *dev_dax = to_dev_dax(dev->parent);
> > + struct dax_region *dax_region = dev_dax->region;
> > +
> > + dev_dbg(dev, "%s\n", __func__);
> > +
> > + device_lock_assert(dax_region->dev);
> > +
> > + dev_dax->ranges[mapping->range_id].mapping = NULL;
> > + mapping->range_id = -1;
> > +
> > + device_del(dev);
> > + put_device(dev);
> > +}
> > +
> > +static struct dev_dax_range *get_dax_range(struct device *dev)
> > +{
> > + struct dax_mapping *mapping = to_dax_mapping(dev);
> > + struct dev_dax *dev_dax = to_dev_dax(dev->parent);
> > + struct dax_region *dax_region = dev_dax->region;
> > +
> > + device_lock(dax_region->dev);
> > + if (mapping->range_id < 1) {
> ^ it's 'mapping->range_id < 0'
>
> Otherwise 'mapping0' sysfs entries won't work.
> Disabled ranges use id -1.

Whoops, yes. Needs a unit test.
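
(For the record, a minimal sketch of the corrected check: id -1 marks a
disabled range, so only negative ids are invalid, and 'mapping0' with
range_id == 0 stays reachable:)

        device_lock(dax_region->dev);
        if (mapping->range_id < 0) {
                device_unlock(dax_region->dev);
                return NULL;
        }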


Re: [PATCH] /dev/mem: Add missing memory barriers for devmem_inode

2020-07-16 Thread Dan Williams
On Wed, Jul 15, 2020 at 11:07 PM Eric Biggers  wrote:
>
> From: Eric Biggers 
>
> WRITE_ONCE() isn't the correct way to publish a pointer to a data
> structure, since it doesn't include a write memory barrier.  Therefore
> other tasks may see that the pointer has been set but not see that the
> pointed-to memory has finished being initialized yet.  Instead a
> primitive with "release" semantics is needed.
>
> Use smp_store_release() for this.
>
> The use of READ_ONCE() on the read side is still potentially correct if
> there's no control dependency, i.e. if all memory being "published" is
> transitively reachable via the pointer itself.  But this pairing is
> somewhat confusing and error-prone.  So just upgrade the read side to
> smp_load_acquire() so that it clearly pairs with smp_store_release().
>
> Cc: Dan Williams 
> Cc: Arnd Bergmann 
> Cc: Ingo Molnar 
> Cc: Kees Cook 
> Cc: Matthew Wilcox 
> Cc: Russell King 
> Cc: Andrew Morton 
> Cc: Greg Kroah-Hartman 
> Fixes: 3234ac664a87 ("/dev/mem: Revoke mappings when a driver claims the 
> region")
> Signed-off-by: Eric Biggers 

Makes sense:

Acked-by: Dan Williams 
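
(A minimal sketch of the acquire/release pairing being applied here,
reduced to its essentials; devmem_inode is the pointer named in the
patch, and the surrounding /dev/mem logic is elided:)

/* publish side: finish all initialization of 'inode' first, then
 * publish with release semantics so those stores cannot be reordered
 * past the pointer store */
smp_store_release(&devmem_inode, inode);

/* read side: the acquire pairs with the release above, so a non-NULL
 * result guarantees the pointed-to initialization is visible */
struct inode *inode = smp_load_acquire(&devmem_inode);
if (inode)
        /* safe to dereference */;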


Re: [PATCH v2 19/22] mm/memremap_pages: Convert to 'struct range'

2020-07-13 Thread Dan Williams
On Mon, Jul 13, 2020 at 9:36 AM Ralph Campbell  wrote:
>
>
> On 7/12/20 9:27 AM, Dan Williams wrote:
> > The 'struct resource' in 'struct dev_pagemap' is only used for holding
> > resource span information. The other fields, 'name', 'flags', 'desc',
> > 'parent', 'sibling', and 'child' are all unused wasted space.
> >
> > This is in preparation for introducing a multi-range extension of
> > devm_memremap_pages().
> >
> > The bulk of this change is unwinding all the places internal to
> > libnvdimm that used 'struct resource' unnecessarily.
> >
> > P2PDMA had a minor usage of the flags field, but only to report failures
> > with "%pR". That is replaced with an open coded print of the range.
> >
> > Cc: Paul Mackerras 
> > Cc: Michael Ellerman 
> > Cc: Benjamin Herrenschmidt 
> > Cc: Dan Williams 
> > Cc: Vishal Verma 
> > Cc: Dave Jiang 
> > Cc: Ben Skeggs 
> > Cc: David Airlie 
> > Cc: Daniel Vetter 
> > Cc: Ira Weiny 
> > Cc: Jason Gunthorpe 
> > Signed-off-by: Dan Williams 
> > ---
> >   arch/powerpc/kvm/book3s_hv_uvmem.c |   13 +++--
> >   drivers/dax/bus.c  |   10 ++--
> >   drivers/dax/bus.h  |2 -
> >   drivers/dax/dax-private.h  |5 --
> >   drivers/dax/device.c   |3 -
> >   drivers/dax/hmem/hmem.c|5 ++
> >   drivers/dax/pmem/core.c|   12 ++---
> >   drivers/gpu/drm/nouveau/nouveau_dmem.c |3 +
> >   drivers/nvdimm/badrange.c  |   26 +--
> >   drivers/nvdimm/claim.c |   13 +++--
> >   drivers/nvdimm/nd.h|3 +
> >   drivers/nvdimm/pfn_devs.c  |   12 ++---
> >   drivers/nvdimm/pmem.c  |   26 ++-
> >   drivers/nvdimm/region.c|   21 +
> >   drivers/pci/p2pdma.c   |   11 ++---
> >   include/linux/memremap.h   |5 +-
> >   include/linux/range.h  |6 ++
> >   mm/memremap.c  |   77 
> > 
> >   tools/testing/nvdimm/test/iomap.c  |2 -
> >   19 files changed, 135 insertions(+), 120 deletions(-)
>
> I think you are missing a call to memremap_pages() in lib/test_hmm.c
> and a call to release_mem_region() that need to be converted too.
> Try setting CONFIG_TEST_HMM=m.

Thanks Ralph, looks like I overlooked these changes since the rebase.

> Also, what about the call to release_mem_region() in
> drivers/gpu/drm/nouveau/nouveau_dmem.c? Doesn't that need a small change too?

I'll double check my config, that one should have been flagged at build time.


Re: [PATCH v2 17/22] drivers/base: Make device_find_child_by_name() compatible with sysfs inputs

2020-07-13 Thread Dan Williams
On Mon, Jul 13, 2020 at 9:13 AM Greg Kroah-Hartman
 wrote:
>
> On Mon, Jul 13, 2020 at 09:09:18AM -0700, Dan Williams wrote:
> > On Mon, Jul 13, 2020 at 8:52 AM Greg Kroah-Hartman
> >  wrote:
> > >
> > > On Mon, Jul 13, 2020 at 08:39:43AM -0700, Dan Williams wrote:
> > > > On Sun, Jul 12, 2020 at 10:09 AM Greg Kroah-Hartman
> > > >  wrote:
> > > > >
> > > > > On Sun, Jul 12, 2020 at 09:27:37AM -0700, Dan Williams wrote:
> > > > > > Use sysfs_streq() in device_find_child_by_name() to allow it to use 
> > > > > > a
> > > > > > sysfs input string that might contain a trailing newline.
> > > > > >
> > > > > > The other "device by name" interfaces,
> > > > > > {bus,driver,class}_find_device_by_name(), already account for sysfs
> > > > > > strings.
> > > > > >
> > > > > > Cc: Greg Kroah-Hartman 
> > > > > > Cc: "Rafael J. Wysocki" 
> > > > > > Signed-off-by: Dan Williams 
> > > > > > ---
> > > > > >  drivers/base/core.c |2 +-
> > > > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > > > > > index 67d39a90b45c..5d31b962c898 100644
> > > > > > --- a/drivers/base/core.c
> > > > > > +++ b/drivers/base/core.c
> > > > > > @@ -3078,7 +3078,7 @@ struct device 
> > > > > > *device_find_child_by_name(struct device *parent,
> > > > > >
> > > > > >   klist_iter_init(&parent->p->klist_children, &iter);
> > > > > >   while ((child = next_device(&iter)))
> > > > > > - if (!strcmp(dev_name(child), name) && 
> > > > > > get_device(child))
> > > > > > + if (sysfs_streq(dev_name(child), name) && 
> > > > > > get_device(child))
> > > > >
> > > > > Who wants to call this function with a name passed from userspace?
> > > > >
> > > > > Not objecting to it, just curious...
> > > > >
> > > >
> > > > The series that incorporates this patch adds a partitioning mechanism
> > > > to "device-dax region" devices with an:
> > > > "echo 1 > regionX/create" to create a new partition / sub-instance
> > > > of a region, and...
> > > > "echo $devname > regionX/delete" to delete. Where $devname is
> > > > searched in the child devices of regionX to trigger device_del().
> > >
> > > Shouldn't that be done in configfs, not sysfs?
> >
> > I see configfs as an awkward fit for this situation. configfs wants to
> > define kernel objects purely in software, whereas this facility wants to
> > augment existing kernel-enumerated device objects. The region device is
> > created by firmware policy and is optionally partitioned; configfs
> > objects don't exist at all until created. So for this I see sysfs +
> > 'scheme to trigger child device creation' as just enough mechanism
> > that does not warrant full-blown configfs.
> >
> > I believe it was debates like this [1] that led me to the camp of
> > sysfs being capable of some device-creation dynamism, leaving
> > configfs for purely software-constructed objects.
> >
> > [1]: https://lore.kernel.org/lkml/17377.42813.479466.690...@cse.unsw.edu.au/
>
> "some" :)

Yes, lowercase and quoted: "some" :).

> And that was from 2006, ugh, how did you find that...

Oh, public-inbox is a wonderful thing. "I kinda sort of remember Neil
laying out a configfs vs sysfs argument", /me searches for "f:neil
configfs" and voila.

> Ok, that's fine, no objection from me for this patch:
>
> Reviewed-by: Greg Kroah-Hartman 

Thanks, Greg.


Re: [PATCH v2 17/22] drivers/base: Make device_find_child_by_name() compatible with sysfs inputs

2020-07-13 Thread Dan Williams
On Mon, Jul 13, 2020 at 8:52 AM Greg Kroah-Hartman
 wrote:
>
> On Mon, Jul 13, 2020 at 08:39:43AM -0700, Dan Williams wrote:
> > On Sun, Jul 12, 2020 at 10:09 AM Greg Kroah-Hartman
> >  wrote:
> > >
> > > On Sun, Jul 12, 2020 at 09:27:37AM -0700, Dan Williams wrote:
> > > > Use sysfs_streq() in device_find_child_by_name() to allow it to use a
> > > > sysfs input string that might contain a trailing newline.
> > > >
> > > > The other "device by name" interfaces,
> > > > {bus,driver,class}_find_device_by_name(), already account for sysfs
> > > > strings.
> > > >
> > > > Cc: Greg Kroah-Hartman 
> > > > Cc: "Rafael J. Wysocki" 
> > > > Signed-off-by: Dan Williams 
> > > > ---
> > > >  drivers/base/core.c |2 +-
> > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > > > index 67d39a90b45c..5d31b962c898 100644
> > > > --- a/drivers/base/core.c
> > > > +++ b/drivers/base/core.c
> > > > @@ -3078,7 +3078,7 @@ struct device *device_find_child_by_name(struct 
> > > > device *parent,
> > > >
> > > >   klist_iter_init(&parent->p->klist_children, &iter);
> > > >   while ((child = next_device(&iter)))
> > > > - if (!strcmp(dev_name(child), name) && get_device(child))
> > > > + if (sysfs_streq(dev_name(child), name) && 
> > > > get_device(child))
> > >
> > > Who wants to call this function with a name passed from userspace?
> > >
> > > Not objecting to it, just curious...
> > >
> >
> > The series that incorporates this patch adds a partitioning mechanism
> > to "device-dax region" devices with an:
> > "echo 1 > regionX/create" to create a new partition / sub-instance
> > of a region, and...
> > "echo $devname > regionX/delete" to delete. Where $devname is
> > searched in the child devices of regionX to trigger device_del().
>
> Shouldn't that be done in configfs, not sysfs?

I see configfs as an awkward fit for this situation. configfs wants to
define kernel objects purely in software, whereas this facility wants to
augment existing kernel-enumerated device objects. The region device is
created by firmware policy and is optionally partitioned; configfs
objects don't exist at all until created. So for this I see sysfs +
'scheme to trigger child device creation' as just enough mechanism
that does not warrant full-blown configfs.

I believe it was debates like this [1] that led me to the camp of
sysfs being capable of some device-creation dynamism, leaving
configfs for purely software-constructed objects.

[1]: https://lore.kernel.org/lkml/17377.42813.479466.690...@cse.unsw.edu.au/

> > This arrangement avoids one of the design mistakes of libnvdimm which
> > uses a sysfs attribute of the device to delete itself. Parent-device
> > triggered deletion rather than self-deletion avoids those locking
> > entanglements.
>
> Ugh, yeah, getting rid of that would be great, it's a mess.  I think
> scsi still does that :(

Yeah, nvdimm and scsi both end up needing to delay device deletion
to their own threads, and it has led to bugs in the nvdimm case.


Re: [Ksummit-discuss] [PATCH] CodingStyle: Inclusive Terminology

2020-07-13 Thread Dan Williams
On Sun, Jul 12, 2020 at 9:26 PM Vinod Koul  wrote:
>
> Hi Mauro,
>
> On 09-07-20, 13:11, Mauro Carvalho Chehab wrote:
> > Em Mon, 06 Jul 2020 06:30:01 -0700
> > Joe Perches  escreveu:
> > >
> > > $ git grep -i -w -P '\w*slave\w*' drivers | \
> > >   cut -f1,2 -d/ | uniq -c | sort -rn | head -20 | cat -n
> > >  1 5683 drivers/net
> > >  2 2118 drivers/gpu
> > >  3 1807 drivers/dma
> > >  4 1389 drivers/i2c
> > >  5  866 drivers/interconnect
> > >  6  835 drivers/soundwire
> > >  7  821 drivers/spi
> > >  8  698 drivers/w1
> > >  9  508 drivers/media
> > > 10  481 drivers/infiniband
> > > 11  440 drivers/ata
> > > 12  317 drivers/scsi
> > > 13  267 drivers/fsi
> > > 14  240 drivers/tty
> > > 15  225 drivers/vme
> > > 16  223 drivers/staging
> > > 17  157 drivers/mmc
> > > 18  155 drivers/usb
> > > 19  141 drivers/video
> > > 20  140 drivers/char
> >
> > It sounds like, as soon as this patch gets merged, the mailing lists
> > will be flooded with lots of patches replacing such terms with something
> > else :-(
> >
> > Doing a quick look at the media subsystem, it sounds like most terms
> > come from I2C master/slave and DiSEqC terminology, as defined by their
> > specs (and the others seem to be derived from some hardware-vendor-
> > specific terminology).
> >
> > As they're all supported by the current specs, if one wanted
> > to replace them, one should first ensure that the supporting specs
> > use different terminology, as otherwise replacing
> > them would just make it harder for anyone trying to understand the
> > code.
>
> I think waiting for specs may result in long delays; we all know how
> 'fast' spec bodies work!
>
> Putting on my soundwire maintainer hat, I see more than 1K uses of 'slave'
> in the subsystem due to the MIPI-defined SoundWire Master/Slave terms, so
> I am planning to replace that and not wait for MIPI to update the spec.

Sounds good.

> A similar approach, where we discuss with the relevant stakeholders, arrive
> at replacement terms, and swap them, would be great

Right, just like any other coding-style cleanup, stage it the way that
makes the most sense for the subsystem you maintain.


Re: [PATCH v2 08/22] memblock: Introduce a generic phys_addr_to_target_node()

2020-07-13 Thread Dan Williams
On Mon, Jul 13, 2020 at 12:04 AM Mike Rapoport  wrote:
>
> Hi Dan,
>
> On Sun, Jul 12, 2020 at 09:26:48AM -0700, Dan Williams wrote:
> > Similar to how generic memory_add_physaddr_to_nid() interrogates
> > memblock data for numa information, introduce
> > get_reserved_pfn_range_from_nid() to enable the same operation for
> > reserved memory ranges. Example memory ranges that are reserved, but
> > still have associated numa-info are persistent memory or Soft Reserved
> > (EFI_MEMORY_SP) memory.
>
> Here again, I would prefer to add a weak default for
> phys_to_target_node() because the "generic" implementation is not really
> generic.
>
> The fallback to reserved ranges is x86-specific because on x86 most of the
> reserved areas are not in memblock.memory. AFAIK, no other architecture
> does this.

True, I was pre-fetching ARM using the new EFI "Special Purpose"
memory attribute. However, until that becomes something that platforms
deploy in practice, I'm ok with not solving that problem for now.

> And x86 anyway has implementation of phys_to_target_node().

Sure, let's go with the default stub for non-x86.

Justin, do you think it would make sense to fold your dax_kmem
enabling for arm64 series into my enabling of dax_hmem for all
memory-hotplug archs?


Re: [PATCH v2 07/22] numa: Introduce a generic memory_add_physaddr_to_nid()

2020-07-13 Thread Dan Williams
On Sun, Jul 12, 2020 at 11:58 PM Mike Rapoport  wrote:
>
> Hi Dan,
>
> On Sun, Jul 12, 2020 at 09:26:43AM -0700, Dan Williams wrote:
> > For architectures that opt into storing their numa data in memblock
> > (only ARM64 currently), add a memblock generic way to interrogate that
> > data for memory_add_physaddr_to_nid(). This requires ARCH_KEEP_MEMBLOCK
> > to keep memblock text and data around after boot.
>
> I'm afraid we are too far from using memblock as a generic placeholder for
> numa data. Although all architectures now have the numa info in
> memblock, only arm64 uses memblock as the primary source of that data.
>
> I'd rather prefer Jia's solution [1] to have a weak default for
> memory_add_physaddr_to_nid() and let architectures override it.

I'm ok with that as long as we do the same for phys_to_target_node().

Will had concerns about adding a generic numa-info facility the last
time I tried this. I just don't see a practical way to get there in
the near term.
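
(The shape of the weak-default approach being converged on, roughly per
Jia's patch referenced above; a sketch, with the final bodies subject to
the merged version:)

/* arch-overridable fallbacks; architectures with real numa data override */
int __weak memory_add_physaddr_to_nid(u64 start)
{
        pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
                        start);
        return 0;
}

int __weak phys_to_target_node(u64 start)
{
        pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
                        start);
        return 0;
}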


Re: [PATCH v2 17/22] drivers/base: Make device_find_child_by_name() compatible with sysfs inputs

2020-07-13 Thread Dan Williams
On Sun, Jul 12, 2020 at 10:09 AM Greg Kroah-Hartman
 wrote:
>
> On Sun, Jul 12, 2020 at 09:27:37AM -0700, Dan Williams wrote:
> > Use sysfs_streq() in device_find_child_by_name() to allow it to use a
> > sysfs input string that might contain a trailing newline.
> >
> > The other "device by name" interfaces,
> > {bus,driver,class}_find_device_by_name(), already account for sysfs
> > strings.
> >
> > Cc: Greg Kroah-Hartman 
> > Cc: "Rafael J. Wysocki" 
> > Signed-off-by: Dan Williams 
> > ---
> >  drivers/base/core.c |2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > index 67d39a90b45c..5d31b962c898 100644
> > --- a/drivers/base/core.c
> > +++ b/drivers/base/core.c
> > @@ -3078,7 +3078,7 @@ struct device *device_find_child_by_name(struct 
> > device *parent,
> >
> >   klist_iter_init(&parent->p->klist_children, &iter);
> >   while ((child = next_device(&iter)))
> > - if (!strcmp(dev_name(child), name) && get_device(child))
> > + if (sysfs_streq(dev_name(child), name) && get_device(child))
>
> Who wants to call this function with a name passed from userspace?
>
> Not objecting to it, just curious...
>

The series that incorporates this patch adds a partitioning mechanism
to "device-dax region" devices with an:
"echo 1 > regionX/create" to create a new partition / sub-instance
of a region, and...
"echo $devname > regionX/delete" to delete. Where $devname is
searched in the child devices of regionX to trigger device_del().

This arrangement avoids one of the design mistakes of libnvdimm which
uses a sysfs attribute of the device to delete itself. Parent-device
triggered deletion rather than self-deletion avoids those locking
entanglements.
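
(Why sysfs_streq() is the right comparison for this, as a sketch:
"echo dax0.1 > regionX/delete" hands the store handler "dax0.1\n", so a
plain strcmp() against dev_name() never matches:)

        const char *input = "dax0.1\n";         /* buffer as written by echo */

        strcmp(dev_name(child), input);         /* != 0: newline mismatch */
        sysfs_streq(dev_name(child), input);    /* true: treats '\n' as end */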


[PATCH v2 20/22] mm/memremap_pages: Support multiple ranges per invocation

2020-07-12 Thread Dan Williams
In support of device-dax growing the ability to front physically
dis-contiguous ranges of memory, update devm_memremap_pages() to track
multiple ranges with a single reference counter and devm instance.
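
(A hedged sketch of the resulting calling convention for a genuinely
multi-range user; the single-range conversions below just set
nr_range = 1. The allocation idiom is approximate, assuming the extra
ranges follow the embedded first range in 'struct dev_pagemap':)

        /* one pgmap fronting two dis-contiguous ranges, one refcount */
        pgmap = kzalloc(sizeof(*pgmap) + sizeof(struct range), GFP_KERNEL);
        if (!pgmap)
                return -ENOMEM;
        pgmap->nr_range = 2;
        pgmap->ranges[0] = range0;
        pgmap->ranges[1] = range1;
        addr = devm_memremap_pages(dev, pgmap);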

Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Dan Williams 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ben Skeggs 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Ira Weiny 
Cc: Jason Gunthorpe 
Signed-off-by: Dan Williams 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c |1 
 drivers/dax/device.c   |1 
 drivers/gpu/drm/nouveau/nouveau_dmem.c |1 
 drivers/nvdimm/pfn_devs.c  |1 
 drivers/nvdimm/pmem.c  |1 
 drivers/pci/p2pdma.c   |1 
 include/linux/memremap.h   |   10 +
 mm/memremap.c  |  259 +++-
 8 files changed, 165 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index a3032cc9a4da..f5108c20d926 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -808,6 +808,7 @@ int kvmppc_uvmem_init(void)
kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
kvmppc_uvmem_pgmap.range.start = res->start;
kvmppc_uvmem_pgmap.range.end = res->end;
+   kvmppc_uvmem_pgmap.nr_range = 1;
kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
/* just one global instance: */
kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index fffc54ce0911..f3755df4ae29 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -417,6 +417,7 @@ int dev_dax_probe(struct dev_dax *dev_dax)
if (!pgmap)
return -ENOMEM;
pgmap->range = *range;
+   pgmap->nr_range = 1;
}
pgmap->type = MEMORY_DEVICE_DEVDAX;
addr = devm_memremap_pages(dev, pgmap);
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 395a5b7cb76d..ab0788dd8a97 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -241,6 +241,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct 
page **ppage)
chunk->pagemap.type = MEMORY_DEVICE_PRIVATE;
chunk->pagemap.range.start = res->start;
chunk->pagemap.range.end = res->end;
+   chunk->pagemap.nr_range = 1;
chunk->pagemap.ops = &nouveau_dmem_pagemap_ops;
chunk->pagemap.owner = drm->dev;
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 7ad9ea107810..88a1aabe0657 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -693,6 +693,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct 
dev_pagemap *pgmap)
.start = nsio->res.start + start_pad,
.end = nsio->res.end - end_trunc,
};
+   pgmap->nr_range = 1;
if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset < reserve)
return -EINVAL;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 08efe0cc9903..705d6df43d72 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -443,6 +443,7 @@ static int pmem_attach_disk(struct device *dev,
} else if (pmem_should_map_pages(dev)) {
pmem->pgmap.range.start = res->start;
pmem->pgmap.range.end = res->end;
+   pmem->pgmap.nr_range = 1;
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 5ed28a8c7264..4ed77f0bd9e3 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -187,6 +187,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, 
size_t size,
pgmap = &p2p_pgmap->pgmap;
pgmap->range.start = pci_resource_start(pdev, bar) + offset;
pgmap->range.end = pgmap->range.start + size - 1;
+   pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 
p2p_pgmap->provider = pdev;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index d5d5e37db916..e63f3aff8f7d 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -95,7 +95,6 @@ struct dev_pagemap_ops {
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
- * @range: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
  * @internal_ref: internal reference if @ref is not provided by the caller
  * @done: completion for @internal_ref
@@ -105,10 +104,12 @@ struct dev_pagemap_ops {
  * @owner: an opaque pointer identifying the entity that manages t

[PATCH v2 22/22] device-dax: Introduce 'mapping' devices

2020-07-12 Thread Dan Williams
In support of interrogating the physical address layout of a device with
dis-contiguous ranges, introduce a sysfs directory with 'start', 'end',
and 'page_offset' attributes. The alternative is trying to parse
/proc/iomem, and that file will not reflect the extent layout until the
device is enabled.

Cc: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/dax/bus.c |  191 +
 drivers/dax/dax-private.h |   14 +++
 2 files changed, 203 insertions(+), 2 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index f342e36c69a1..8b6c4ddc5f42 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -579,6 +579,167 @@ struct dax_region *alloc_dax_region(struct device 
*parent, int region_id,
 }
 EXPORT_SYMBOL_GPL(alloc_dax_region);
 
+static void dax_mapping_release(struct device *dev)
+{
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+
+   ida_free(&dev_dax->ida, mapping->id);
+   kfree(mapping);
+}
+
+static void unregister_dax_mapping(void *data)
+{
+   struct device *dev = data;
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   dev_dbg(dev, "%s\n", __func__);
+
+   device_lock_assert(dax_region->dev);
+
+   dev_dax->ranges[mapping->range_id].mapping = NULL;
+   mapping->range_id = -1;
+
+   device_del(dev);
+   put_device(dev);
+}
+
+static struct dev_dax_range *get_dax_range(struct device *dev)
+{
+   struct dax_mapping *mapping = to_dax_mapping(dev);
+   struct dev_dax *dev_dax = to_dev_dax(dev->parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   device_lock(dax_region->dev);
+   if (mapping->range_id < 1) {
+   device_unlock(dax_region->dev);
+   return NULL;
+   }
+
+   return &dev_dax->ranges[mapping->range_id];
+}
+
+static void put_dax_range(struct dev_dax_range *dax_range)
+{
+   struct dax_mapping *mapping = dax_range->mapping;
+   struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent);
+   struct dax_region *dax_region = dev_dax->region;
+
+   device_unlock(dax_region->dev);
+}
+
+static ssize_t start_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#llx\n", dax_range->range.start);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(start, 0400, start_show, NULL);
+
+static ssize_t end_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#llx\n", dax_range->range.end);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(end, 0400, end_show, NULL);
+
+static ssize_t pgoff_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_dax_range *dax_range;
+   ssize_t rc;
+
+   dax_range = get_dax_range(dev);
+   if (!dax_range)
+   return -ENXIO;
+   rc = sprintf(buf, "%#lx\n", dax_range->pgoff);
+   put_dax_range(dax_range);
+
+   return rc;
+}
+static DEVICE_ATTR(page_offset, 0400, pgoff_show, NULL);
+
+static struct attribute *dax_mapping_attributes[] = {
+   &dev_attr_start.attr,
+   &dev_attr_end.attr,
+   &dev_attr_page_offset.attr,
+   NULL,
+};
+
+static const struct attribute_group dax_mapping_attribute_group = {
+   .attrs = dax_mapping_attributes,
+};
+
+static const struct attribute_group *dax_mapping_attribute_groups[] = {
+   &dax_mapping_attribute_group,
+   NULL,
+};
+
+static struct device_type dax_mapping_type = {
+   .release = dax_mapping_release,
+   .groups = dax_mapping_attribute_groups,
+};
+
+static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id)
+{
+   struct dax_region *dax_region = dev_dax->region;
+   struct dax_mapping *mapping;
+   struct device *dev;
+   int rc;
+
+   device_lock_assert(dax_region->dev);
+
+   if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver,
+   "region disabled\n"))
+   return -ENXIO;
+
+   mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+   if (!mapping)
+   return -ENOMEM;
+   mapping->range_id = range_id;
+   mapping->id = ida_alloc(&dev_dax->ida, GFP_KERNEL);
+   if (mapping->id < 0) {
+   kfree(ma
