Re: [PATCH v3 08/12] x86/memory_failure: Introduce {set, clear}_mce_nospec()

2018-06-06 Thread Dan Williams
On Mon, Jun 4, 2018 at 4:12 PM, Dan Williams  wrote:
> Currently memory_failure() returns zero if the error was handled. On
> that result mce_unmap_kpfn() is called to zap the page out of the kernel
> linear mapping to prevent speculative fetches of potentially poisoned
> memory. However, in the case of dax mapped devmap pages the page may be
> in active permanent use by the device driver, so it cannot be unmapped
> from the kernel.
>
> Instead of marking the page not present, marking the page UC should
> be sufficient for preventing poison from being pre-fetched into the
> cache. Convert mce_unmap_kpfn() to set_mce_nospec(), remapping the page as
> UC, to hide it from speculative accesses.
>
> Given that persistent memory errors can be cleared by the driver,
> include a facility to restore the page to cacheable operation,
> clear_mce_nospec().
>
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar 
> Cc: "H. Peter Anvin" 
> Cc: Tony Luck 

Tony, safe to assume you are ok with this patch now that the
decoy_addr approach is back?

> Cc: Borislav Petkov 
> Cc: 
> Cc: 
> Signed-off-by: Dan Williams 
> ---
>  arch/x86/include/asm/set_memory.h |   42 +
>  arch/x86/kernel/cpu/mcheck/mce-internal.h |   15 --
>  arch/x86/kernel/cpu/mcheck/mce.c  |   38 ++
>  include/linux/set_memory.h|   14 ++
>  4 files changed, 59 insertions(+), 50 deletions(-)
>
> diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
> index bd090367236c..cf5e9124b45e 100644
> --- a/arch/x86/include/asm/set_memory.h
> +++ b/arch/x86/include/asm/set_memory.h
> @@ -88,4 +88,46 @@ extern int kernel_set_to_readonly;
>  void set_kernel_text_rw(void);
>  void set_kernel_text_ro(void);
>
> +#ifdef CONFIG_X86_64
> +static inline int set_mce_nospec(unsigned long pfn)
> +{
> +   unsigned long decoy_addr;
> +   int rc;
> +
> +   /*
> +* Mark the linear address as UC to make sure we don't log more
> +* errors because of speculative access to the page.
> +* We would like to just call:
> +*  set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
> +* but doing that would radically increase the odds of a
> +* speculative access to the poison page because we'd have
> +* the virtual address of the kernel 1:1 mapping sitting
> +* around in registers.
> +* Instead we get tricky.  We create a non-canonical address
> +* that looks just like the one we want, but has bit 63 flipped.
> +* This relies on set_memory_uc() properly sanitizing any __pa()
> +* results with __PHYSICAL_MASK or PTE_PFN_MASK.
> +*/
> +   decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
> +
> +   rc = set_memory_uc(decoy_addr, 1);
> +   if (rc)
> +   pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
> +   return rc;
> +}
> +#define set_mce_nospec set_mce_nospec
> +
> +/* Restore full speculative operation to the pfn. */
> +static inline int clear_mce_nospec(unsigned long pfn)
> +{
> +   return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
> +}
> +#define clear_mce_nospec clear_mce_nospec
> +#else
> +/*
> + * Few people would run a 32-bit kernel on a machine that supports
> + * recoverable errors because they have too much memory to boot 32-bit.
> + */
> +#endif
> +
>  #endif /* _ASM_X86_SET_MEMORY_H */
> diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
> index 374d1aa66952..ceb67cd5918f 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
> +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
> @@ -113,21 +113,6 @@ static inline void mce_register_injector_chain(struct notifier_block *nb)  { }
>  static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
>  #endif
>
> -#ifndef CONFIG_X86_64
> -/*
> - * On 32-bit systems it would be difficult to safely unmap a poison page
> - * from the kernel 1:1 map because there are no non-canonical addresses that
> - * we can use to refer to the address without risking a speculative access.
> - * However, this isn't much of an issue because:
> - * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which
> - *are only mapped into the kernel as needed
> - * 2) Few people would run a 32-bit kernel on a machine that supports
> - *recoverable errors because they have too much memory to boot 32-bit.
> - */
> -static inline void mce_unmap_kpfn(unsigned long pfn) {}
> -#define mce_unmap_kpfn mce_unmap_kpfn
> -#endif
> -
>  struct mca_config {
> bool dont_log_ce;
> bool cmci_disabled;
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 42cf2880d0ed..a0fbf0a8b7e6 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -42,6 +42,7 @@
>  #include 
>  #include 
>  #include 
> 
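
As a standalone illustration of the decoy-address arithmetic in the comment
above (the PAGE_OFFSET and __PHYSICAL_MASK values below are representative
x86_64 examples for the demo, not taken from the patch):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_OFFSET	0xffff888000000000ULL	/* example direct-map base */
#define PHYSICAL_MASK	((1ULL << 46) - 1)	/* example __PHYSICAL_MASK */
#define BIT(n)		(1ULL << (n))

int main(void)
{
	uint64_t pfn = 0xaf34214;	/* pfn from the mce log in patch 11/12 */
	/* same expression as set_mce_nospec(): direct-map address, bit 63 flipped */
	uint64_t decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
	/* __pa()-style conversion: subtract the map base, mask to physical bits */
	uint64_t phys = (decoy - PAGE_OFFSET) & PHYSICAL_MASK;

	printf("decoy = 0x%016llx (non-canonical)\n", (unsigned long long)decoy);
	printf("phys  = 0x%llx, pfn = 0x%llx\n", (unsigned long long)phys,
			(unsigned long long)(phys >> PAGE_SHIFT));
	return 0;
}

The decoy is non-canonical, so a stale copy in a register can never be
speculatively dereferenced, yet the masking still lands on the right pfn.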

[ndctl PATCH] test: Add device-dax MADV_HWPOISON test

2018-06-06 Thread Dan Williams
Reuse test_dax_poison() for triggering memory_failure() on device-dax
huge/gigantic page mappings.

Signed-off-by: Dan Williams 
---
 test.h|4 
 test/dax-pmd.c|   34 +-
 test/device-dax.c |   28 +++-
 3 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/test.h b/test.h
index 5f2d6293c104..ce873f51f7aa 100644
--- a/test.h
+++ b/test.h
@@ -12,6 +12,8 @@
  */
 #ifndef __TEST_H__
 #define __TEST_H__
+#include 
+
 struct ndctl_test;
 struct ndctl_ctx;
 struct ndctl_test *ndctl_test_new(unsigned int kver);
@@ -36,6 +38,8 @@ struct ndctl_ctx;
 int test_parent_uuid(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx);
 int test_multi_pmem(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx);
 int test_dax_directio(int dax_fd, unsigned long align, void *dax_addr, off_t offset);
+int test_dax_poison(int dax_fd, unsigned long align, void *dax_addr,
+   off_t offset, bool fsdax);
 int test_dpa_alloc(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx);
 int test_dsm_fail(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx);
 int test_libndctl(int loglevel, struct ndctl_test *test, struct ndctl_ctx *ctx);
diff --git a/test/dax-pmd.c b/test/dax-pmd.c
index abff4f9fd199..65110b7c6a4c 100644
--- a/test/dax-pmd.c
+++ b/test/dax-pmd.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define NUM_EXTENTS 5
 #define fail() fprintf(stderr, "%s: failed at: %d (%s)\n", \
@@ -217,8 +218,8 @@ static void sigbus_hdl(int sig, siginfo_t *si, void *ptr)
siglongjmp(sj_env, 1);
 }
 
-static int test_dax_poison(int dax_fd, unsigned long align, void *dax_addr,
-   off_t offset)
+int test_dax_poison(int dax_fd, unsigned long align, void *dax_addr,
+   off_t offset, bool fsdax)
 {
unsigned char *addr = MAP_FAILED;
struct sigaction act;
@@ -226,6 +227,13 @@ static int test_dax_poison(int dax_fd, unsigned long align, void *dax_addr,
void *buf;
int rc;
 
+   /*
+* MADV_HWPOISON must be page aligned, and this routine assumes
+* align is >= 8K
+*/
+   if (align < SZ_2M)
+   return 0;
+
	if (posix_memalign(&buf, 4096, 4096) != 0)
return -ENOMEM;
 
@@ -240,13 +248,15 @@ static int test_dax_poison(int dax_fd, unsigned long align, void *dax_addr,
}
 
/* dirty the block on disk to bypass the default zero page */
-   rc = pwrite(dax_fd, buf, 4096, offset + align / 2);
-   if (rc < 4096) {
-   fail();
-   rc = -ENXIO;
-   goto out;
+   if (fsdax) {
+   rc = pwrite(dax_fd, buf, 4096, offset + align / 2);
+   if (rc < 4096) {
+   fail();
+   rc = -ENXIO;
+   goto out;
+   }
+   fsync(dax_fd);
}
-   fsync(dax_fd);
 
addr = mmap(dax_addr, 2*align, PROT_READ|PROT_WRITE,
			MAP_SHARED_VALIDATE|MAP_POPULATE|MAP_SYNC, dax_fd, offset);
@@ -281,6 +291,11 @@ clear_error:
goto out;
}
 
+   if (!fsdax) {
+   rc = 0;
+   goto out;
+   }
+
rc = fallocate(dax_fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
offset + align / 2, 4096);
if (rc) {
@@ -312,6 +327,7 @@ out:
 static int test_pmd(int fd)
 {
unsigned long long m_align, p_align, pmd_off;
+   static const bool fsdax = true;
struct fiemap_extent *ext;
void *base, *pmd_addr;
struct fiemap *map;
@@ -371,7 +387,7 @@ static int test_pmd(int fd)
if (rc)
goto err_directio;
 
-   rc = test_dax_poison(fd, HPAGE_SIZE, pmd_addr, pmd_off);
+   rc = test_dax_poison(fd, HPAGE_SIZE, pmd_addr, pmd_off, fsdax);
 
  err_directio:
  err_extent:
diff --git a/test/device-dax.c b/test/device-dax.c
index 0a42a327c96d..712c247adfb2 100644
--- a/test/device-dax.c
+++ b/test/device-dax.c
@@ -151,15 +151,6 @@ static int __test_device_dax(unsigned long align, int loglevel,
struct daxctl_region *dax_region;
char *buf, path[100], data[VERIFY_BUF_SIZE];
 
-	memset(&act, 0, sizeof(act));
-   act.sa_sigaction = sigbus;
-   act.sa_flags = SA_SIGINFO;
-
-	if (sigaction(SIGBUS, &act, 0)) {
-   perror("sigaction");
-   return 1;
-   }
-
ndctl_set_log_priority(ctx, loglevel);
 
ndns = ndctl_get_test_dev(ctx);
@@ -276,6 +267,7 @@ static int __test_device_dax(unsigned long align, int loglevel,
 * otherwise not supported.
 */
if (ndctl_test_attempt(test, KERNEL_VERSION(4, 9, 0))) {
+   static const bool devdax = false;
int fd2;
 
rc = test_dax_directio(fd, align, NULL, 0);
@@ -285,6 +277,15 @@ static int __test_device_dax(unsigned long align, int loglevel,
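
For reference, the core pattern the reworked test relies on, inject poison
with madvise(MADV_HWPOISON) and catch the SIGBUS on the next access, looks
roughly like this as a standalone program (anonymous memory instead of dax;
needs root and CONFIG_MEMORY_FAILURE; error handling trimmed):

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static sigjmp_buf sj_env;

static void sigbus_hdl(int sig, siginfo_t *si, void *ptr)
{
	fprintf(stderr, "SIGBUS at %p\n", si->si_addr);
	siglongjmp(sj_env, 1);
}

int main(void)
{
	struct sigaction act;
	unsigned char *addr;

	memset(&act, 0, sizeof(act));
	act.sa_sigaction = sigbus_hdl;
	act.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &act, NULL);

	addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE,
			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED)
		return 1;
	addr[0] = 1;				/* fault the page in */

	if (madvise(addr, 4096, MADV_HWPOISON))	/* memory_failure() runs here */
		perror("madvise");

	if (sigsetjmp(sj_env, 1) == 0)
		fprintf(stderr, "read %d, poison not injected?\n", addr[0]);
	else
		fprintf(stderr, "consumed poison as expected\n");
	return 0;
}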
  

Re: [PATCH v3 11/12] mm, memory_failure: Teach memory_failure() about dev_pagemap pages

2018-06-06 Thread Dan Williams
On Mon, Jun 4, 2018 at 4:12 PM, Dan Williams  wrote:
> mce: Uncorrected hardware memory error in user-access at af34214200
> {1}[Hardware Error]: It has been corrected by h/w and requires no further action
> mce: [Hardware Error]: Machine check events logged
> {1}[Hardware Error]: event severity: corrected
> Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
> [..]
> Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
> mce: Memory error not recovered
>
> In contrast to typical memory, dev_pagemap pages may be dax mapped. With
> dax there is no possibility to map in another page dynamically since dax
> establishes 1:1 physical address to file offset associations. Also
> dev_pagemap pages associated with NVDIMM / persistent memory devices can
> internally remap/repair addresses with poison. While memory_failure()
> assumes that it can discard typical poisoned pages and keep them
> unmapped indefinitely, dev_pagemap pages may be returned to service
> after the error is cleared.
>
> Teach memory_failure() to detect and handle MEMORY_DEVICE_HOST
> dev_pagemap pages that have poison consumed by userspace. Mark the
> memory as UC instead of unmapping it completely to allow ongoing access
> via the device driver (nd_pmem). Later, nd_pmem will grow support for
> marking the page back to WB when the error is cleared.
>
> Cc: Jan Kara 
> Cc: Christoph Hellwig 
> Cc: Jérôme Glisse 
> Cc: Matthew Wilcox 
> Cc: Naoya Horiguchi 

Naoya, your thoughts on this patch? It is passing my unit tests for
filesystem-dax and device-dax.

> Cc: Ross Zwisler 
> Signed-off-by: Dan Williams 
> ---
>  include/linux/mm.h  |1
>  mm/memory-failure.c |  145 +++
>  2 files changed, 146 insertions(+)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1ac1f06a4be6..566c972e03e7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2669,6 +2669,7 @@ enum mf_action_page_type {
> MF_MSG_TRUNCATED_LRU,
> MF_MSG_BUDDY,
> MF_MSG_BUDDY_2ND,
> +   MF_MSG_DAX,
> MF_MSG_UNKNOWN,
>  };
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index b6efb78ba49b..de0bc897d6e7 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -55,6 +55,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include "internal.h"
> @@ -531,6 +532,7 @@ static const char * const action_page_types[] = {
> [MF_MSG_TRUNCATED_LRU]  = "already truncated LRU page",
> [MF_MSG_BUDDY]  = "free buddy page",
> [MF_MSG_BUDDY_2ND]  = "free buddy page (2nd try)",
> +   [MF_MSG_DAX]= "dax page",
> [MF_MSG_UNKNOWN]= "unknown page",
>  };
>
> @@ -1132,6 +1134,144 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
> return res;
>  }
>
> +static unsigned long dax_mapping_size(struct address_space *mapping,
> +   struct page *page)
> +{
> +   pgoff_t pgoff = page_to_pgoff(page);
> +   struct vm_area_struct *vma;
> +   unsigned long size = 0;
> +
> +   i_mmap_lock_read(mapping);
> +   xa_lock_irq(&mapping->i_pages);
> +   /* validate that @page is still linked to @mapping */
> +   if (page->mapping != mapping) {
> +   xa_unlock_irq(&mapping->i_pages);
> +   i_mmap_unlock_read(mapping);
> +   return 0;
> +   }
> +   vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
> +   unsigned long address = vma_address(page, vma);
> +   pgd_t *pgd;
> +   p4d_t *p4d;
> +   pud_t *pud;
> +   pmd_t *pmd;
> +   pte_t *pte;
> +
> +   pgd = pgd_offset(vma->vm_mm, address);
> +   if (!pgd_present(*pgd))
> +   continue;
> +   p4d = p4d_offset(pgd, address);
> +   if (!p4d_present(*p4d))
> +   continue;
> +   pud = pud_offset(p4d, address);
> +   if (!pud_present(*pud))
> +   continue;
> +   if (pud_devmap(*pud)) {
> +   size = PUD_SIZE;
> +   break;
> +   }
> +   pmd = pmd_offset(pud, address);
> +   if (!pmd_present(*pmd))
> +   continue;
> +   if (pmd_devmap(*pmd)) {
> +   size = PMD_SIZE;
> +   break;
> +   }
> +   pte = pte_offset_map(pmd, address);
> +   if (!pte_present(*pte))
> +   continue;
> +   if (pte_devmap(*pte)) {
> +   size = PAGE_SIZE;
> +   break;
> +   }
> +   }
> +   xa_unlock_irq(&mapping->i_pages);
> +   i_mmap_unlock_read(mapping);
> +
> +   return size;
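
The rest of the hunk is clipped by the archive; in outline, the series
routes dev_pagemap pfns away from the regular unmap-and-discard path
roughly like this (paraphrased sketch, not the patch text; the wrapper
name is invented for illustration):

static int memory_failure_route_devmap(unsigned long pfn, int flags)
{
	struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);

	if (!pgmap)
		return -ENXIO;	/* not a dev_pagemap pfn; take the normal path */
	/*
	 * dax page: keep it mapped for the driver and mark it UC via
	 * set_mce_nospec() rather than unmapping and discarding it.
	 */
	return memory_failure_dev_pagemap(pfn, flags, pgmap);
}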

Re: [qemu PATCH v4 3/4] nvdimm, acpi: support NFIT platform capabilities

2018-06-06 Thread Dan Williams
On Wed, Jun 6, 2018 at 4:20 PM, Elliott, Robert (Persistent Memory)
 wrote:
>
>> > Okay, we can move to the symbolic names.  Do you want them to be that
>> long, or
>> > would:
>> >
>> > nvdimm-cap-cpu
>> > nvdimm-cap-mem-ctrl
>> > nvdimm-cap-mirroring
>>
>> Wait, why is mirroring part of this?
>
> This data structure is intended to report any kind of platform capability, not
> just platform persistence capabilities.

Yes, but there's nothing actionable that a qemu guest OS can do with
that mirroring information, so there's no need at this time to add cli
cruft and code to support it.


RE: [qemu PATCH v4 3/4] nvdimm, acpi: support NFIT platform capabilities

2018-06-06 Thread Elliott, Robert (Persistent Memory)


> > Okay, we can move to the symbolic names.  Do you want them to be that
> long, or
> > would:
> >
> > nvdimm-cap-cpu
> > nvdimm-cap-mem-ctrl
> > nvdimm-cap-mirroring
> 
> Wait, why is mirroring part of this?

This data structure is intended to report any kind of platform capability, not 
just platform persistence capabilities.

We could add a short symbolic name to the definitions in ACPI that matches
the ones selected for this command line option, if that'll help people
find the right names to use.

I recommend mc rather than mem-ctrl to keep dashes as special.




Re: [PATCH v2 5/7] dm: remove DM_TYPE_DAX_BIO_BASED dm_queue_mode

2018-06-06 Thread Mike Snitzer
On Wed, Jun 06 2018 at  1:24pm -0400,
Ross Zwisler  wrote:

> On Mon, Jun 04, 2018 at 08:46:28PM -0400, Mike Snitzer wrote:
> > On Mon, Jun 04 2018 at  7:24pm -0400,
> > Ross Zwisler  wrote:
> > 
> > > On Fri, Jun 01, 2018 at 06:04:43PM -0400, Mike Snitzer wrote:
> > > > On Tue, May 29 2018 at  3:51pm -0400,
> > > > Ross Zwisler  wrote:
> > > > 
> > > > > The DM_TYPE_DAX_BIO_BASED dm_queue_mode was introduced to prevent DM
> > > > > devices that could possibly support DAX from transitioning into DM 
> > > > > devices
> > > > > that cannot support DAX.
> > > > > 
> > > > > For example, the following transition will currently fail:
> > > > > 
> > > > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > > > > DM_TYPE_DAX_BIO_BASED   DM_TYPE_BIO_BASED
> > > > > 
> > > > > but these will both succeed:
> > > > > 
> > > > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > > > > DM_TYPE_DAX_BIO_BASEDDM_TYPE_BIO_BASED
> > > > > 
> > > > 
> > > > I fail to see how this succeeds given
> > > > drivers/md/dm-ioctl.c:is_valid_type() only allows transitions from:
> > > > 
> > > > DM_TYPE_BIO_BASED => DM_TYPE_DAX_BIO_BASED
> > > 
> > > Right, sorry, that was a typo.  What I meant was:
> > > 
> > > > For example, the following transition will currently fail:
> > > > 
> > > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > > >   DM_TYPE_DAX_BIO_BASED   DM_TYPE_BIO_BASED
> > > > 
> > > > but these will both succeed:
> > > > 
> > > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > > > DM_TYPE_BIO_BASEDDM_TYPE_BIO_BASED
> > > > 
> > > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > > > DM_TYPE_BIO_BASEDDM_TYPE_DAX_BIO_BASED
> > > 
> > > So we allow 2 of the 3 transitions, but the reason that we disallow the 
> > > third
> > > isn't fully clear to me.
> > > 
> > > > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > > > >   DM_TYPE_BIO_BASEDDM_TYPE_DAX_BIO_BASED
> > > > > 
> > > > > This seems arbitrary, as really the choice on whether to use DAX 
> > > > > happens at
> > > > > filesystem mount time.  There's no guarantee that in the first 
> > > > > case
> > > > > (double fsdax pmem) we were using the dax mount option with our file
> > > > > system.
> > > > > 
> > > > > Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing 
> > > > > around
> > > > > it, and instead make the request queue's QUEUE_FLAG_DAX be our one 
> > > > > source
> > > > > of truth.  If this is set, we can use DAX, and if not, not.  We keep 
> > > > > this
> > > > > up to date in table_load() as the table changes.  As with regular 
> > > > > block
> > > > > devices the filesystem will then know at mount time whether DAX is a
> > > > > supported mount option or not.
> > > > 
> > > > If you don't think you need this specialization that is fine.. but DM
> > > > devices supporting suspending (as part of table reloads) so is there any
> > > > risk that there will be inflight IO (say if someone did 'dmsetup suspend
> > > > --noflush').. and then upon reload the device type changed out from
> > > > under us.. anyway, I don't have all the PMEM DAX stuff paged back into
> > > > my head yet.
> > > > 
> > > > But this just seems like we really shouldn't be allowing the
> > > > transition from what was DM_TYPE_DAX_BIO_BASED back to DM_TYPE_BIO_BASED
> > > 
> > > I admit I don't fully understand all the ways that DM supports suspending 
> > > and
> > > resuming devices.  Is there actually a case where we can change out the DM
> > > devices while I/O is running, and somehow end up trying to issue a DAX 
> > > I/O to
> > > a device that doesn't support DAX?
> > 
> > Yes, provided root permissions, it's very easy to dmsetup 
> > suspend/load/resume
> > to replace any portion of the DM device's logical address space to map to an
> > entirely different DM target (with a different backing store).  It's
> > pretty intrusive to do such things, but easily done and powerful.
> > 
> > Mike
> 
> Hmmm, I don't understand how you can do this if there is a filesystem built on
> your DM device?  Say you have a DM device, either striped or linear, that is
> made up of 2 devices, and then you use dmsetup to replace one of the DM member
> devices with something else.  You've just swapped out half of your LBA space
> with new data, right? 
> 
> I don't understand how you can expect a filesystem built on the old DM device
> to still work?  You especially can't do this while the filesystem is mounted -
> all the in-core filesystem metadata would be garbage because the on-media data
> would have totally changed.

Sure it can cause you to no longer have access to the original backing
store (e.g. swapping in an "error" target instead of linear).

But this ability to suspend a DM device with a table that is using
"linear", load a new table 

[ndctl PATCH v4] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Vishal Verma
The ARS status command defines a 'flags' field that wasn't being exposed
via an API yet. Since there is only one flag defined in ACPI 6.2, add a
helper for retrieving it (overflow flag).

Reported-by: Jacek Zloch 
Cc: Dan Williams 
Reviewed-by: Dan Williams 
Signed-off-by: Vishal Verma 
---
 ndctl/lib/ars.c| 11 +++
 ndctl/lib/libndctl.sym |  1 +
 ndctl/lib/private.h|  3 +++
 ndctl/libndctl.h   |  1 +
 4 files changed, 16 insertions(+)

v4: move the flag mask to private.h (Dan)

diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
index e04b51e..c78e3bf 100644
--- a/ndctl/lib/ars.c
+++ b/ndctl/lib/ars.c
@@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
return ars_stat->ars_status->records[rec_index].length;
 }
 
+NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
+   struct ndctl_cmd *ars_stat)
+{
+   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
+
+   if (!validate_ars_stat(ctx, ars_stat))
+   return -EINVAL;
+
+   return !!(ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
+}
+
 NDCTL_EXPORT struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(
unsigned long long address, unsigned long long len,
struct ndctl_cmd *ars_cap)
diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
index c1228e5..e939993 100644
--- a/ndctl/lib/libndctl.sym
+++ b/ndctl/lib/libndctl.sym
@@ -365,4 +365,5 @@ global:
ndctl_cmd_ars_cap_get_clear_unit;
ndctl_namespace_inject_error2;
ndctl_namespace_uninject_error2;
+   ndctl_cmd_ars_stat_get_flag_overflow;
 } LIBNDCTL_15;
diff --git a/ndctl/lib/private.h b/ndctl/lib/private.h
index 73bbeed..b756b74 100644
--- a/ndctl/lib/private.h
+++ b/ndctl/lib/private.h
@@ -278,6 +278,9 @@ struct ndctl_bb {
struct list_node list;
 };
 
+/* ars_status flags */
+#define ND_ARS_STAT_FLAG_OVERFLOW (1 << 0)
+
 struct ndctl_dimm_ops {
const char *(*cmd_desc)(int);
struct ndctl_cmd *(*new_smart)(struct ndctl_dimm *);
diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
index be997ac..9270bae 100644
--- a/ndctl/libndctl.h
+++ b/ndctl/libndctl.h
@@ -210,6 +210,7 @@ struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(unsigned 
long long address,
 unsigned long long ndctl_cmd_clear_error_get_cleared(
struct ndctl_cmd *clear_err);
 unsigned int ndctl_cmd_ars_cap_get_clear_unit(struct ndctl_cmd *ars_cap);
+int ndctl_cmd_ars_stat_get_flag_overflow(struct ndctl_cmd *ars_stat);
 
 /*
  * Note: ndctl_cmd_smart_get_temperature is an alias for
-- 
2.17.0
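
A minimal caller sketch for the new helper (this assumes the existing
ars_cap-derived ars_status plumbing declared in libndctl.h; error handling
trimmed):

#include <errno.h>
#include <stdio.h>
#include <ndctl/libndctl.h>

/* returns 1 if the ARS record list overflowed, 0 if not, negative on error */
static int ars_overflowed(struct ndctl_cmd *ars_cap)
{
	struct ndctl_cmd *ars_stat = ndctl_bus_cmd_new_ars_status(ars_cap);
	int rc;

	if (!ars_stat)
		return -ENOMEM;
	rc = ndctl_cmd_submit(ars_stat);
	if (rc == 0)
		rc = ndctl_cmd_ars_stat_get_flag_overflow(ars_stat);
	if (rc == 1)
		fprintf(stderr, "ars_status overflowed, more records pending\n");
	ndctl_cmd_unref(ars_stat);
	return rc;
}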



Re: [ndctl PATCH v3] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Verma, Vishal L
On Wed, 2018-06-06 at 13:22 -0700, Dan Williams wrote:
> On Wed, Jun 6, 2018 at 1:18 PM, Vishal Verma 
> wrote:
> > The ARS status command defines a 'flags' field that wasn't being
> > exposed
> > via an API yet. Since there is only one flag defined in ACPI 6.2, add a
> > helper for retrieving it (overflow flag).
> > 
> > Reported-by: Jacek Zloch 
> > Cc: Dan Williams 
> > Signed-off-by: Vishal Verma 
> > ---
> >  ndctl/lib/ars.c| 11 +++
> >  ndctl/lib/libndctl.sym |  1 +
> >  ndctl/libndctl.h   |  4 
> >  3 files changed, 16 insertions(+)
> > 
> > v3: ensure we can only return 0, 1, or -error from this interface.
> > (Dan)
> > 
> > diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
> > index e04b51e..c78e3bf 100644
> > --- a/ndctl/lib/ars.c
> > +++ b/ndctl/lib/ars.c
> > @@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
> > return ars_stat->ars_status->records[rec_index].length;
> >  }
> > 
> > +NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
> > +   struct ndctl_cmd *ars_stat)
> > +{
> > +   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
> > +
> > +   if (!validate_ars_stat(ctx, ars_stat))
> > +   return -EINVAL;
> > +
> > +   return !!(ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
> > +}
> > +
> >  NDCTL_EXPORT struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(
> > unsigned long long address, unsigned long long len,
> > struct ndctl_cmd *ars_cap)
> > diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
> > index c1228e5..e939993 100644
> > --- a/ndctl/lib/libndctl.sym
> > +++ b/ndctl/lib/libndctl.sym
> > @@ -365,4 +365,5 @@ global:
> > ndctl_cmd_ars_cap_get_clear_unit;
> > ndctl_namespace_inject_error2;
> > ndctl_namespace_uninject_error2;
> > +   ndctl_cmd_ars_stat_get_flag_overflow;
> >  } LIBNDCTL_15;
> > diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
> > index be997ac..3d141a6 100644
> > --- a/ndctl/libndctl.h
> > +++ b/ndctl/libndctl.h
> > @@ -187,6 +187,9 @@ int ndctl_dimm_is_enabled(struct ndctl_dimm *dimm);
> >  int ndctl_dimm_disable(struct ndctl_dimm *dimm);
> >  int ndctl_dimm_enable(struct ndctl_dimm *dimm);
> > 
> > +/* ars_status flags */
> > +#define ND_ARS_STAT_FLAG_OVERFLOW (1 << 0)
> 
> Oh, sorry, one more thing. This should move to ndctl/lib/private.h,
> right?

Yes, good catch, thanks!

> 
> With that you can add:
> 
> Reviewed-by: Dan Williams 


Re: [ndctl PATCH v3] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Dan Williams
On Wed, Jun 6, 2018 at 1:18 PM, Vishal Verma  wrote:
> The ARS status command defines a 'flags' field that wasn't being exposed
> via an API yet. Since there is only one flag defined in ACPI 6.2, add a
> helper for retrieving it (overflow flag).
>
> Reported-by: Jacek Zloch 
> Cc: Dan Williams 
> Signed-off-by: Vishal Verma 
> ---
>  ndctl/lib/ars.c| 11 +++
>  ndctl/lib/libndctl.sym |  1 +
>  ndctl/libndctl.h   |  4 
>  3 files changed, 16 insertions(+)
>
> v3: ensure we can only return 0, 1, or -error from this interface. (Dan)
>
> diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
> index e04b51e..c78e3bf 100644
> --- a/ndctl/lib/ars.c
> +++ b/ndctl/lib/ars.c
> @@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
> return ars_stat->ars_status->records[rec_index].length;
>  }
>
> +NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
> +   struct ndctl_cmd *ars_stat)
> +{
> +   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
> +
> +   if (!validate_ars_stat(ctx, ars_stat))
> +   return -EINVAL;
> +
> +   return !!(ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
> +}
> +
>  NDCTL_EXPORT struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(
> unsigned long long address, unsigned long long len,
> struct ndctl_cmd *ars_cap)
> diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
> index c1228e5..e939993 100644
> --- a/ndctl/lib/libndctl.sym
> +++ b/ndctl/lib/libndctl.sym
> @@ -365,4 +365,5 @@ global:
> ndctl_cmd_ars_cap_get_clear_unit;
> ndctl_namespace_inject_error2;
> ndctl_namespace_uninject_error2;
> +   ndctl_cmd_ars_stat_get_flag_overflow;
>  } LIBNDCTL_15;
> diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
> index be997ac..3d141a6 100644
> --- a/ndctl/libndctl.h
> +++ b/ndctl/libndctl.h
> @@ -187,6 +187,9 @@ int ndctl_dimm_is_enabled(struct ndctl_dimm *dimm);
>  int ndctl_dimm_disable(struct ndctl_dimm *dimm);
>  int ndctl_dimm_enable(struct ndctl_dimm *dimm);
>
> +/* ars_status flags */
> +#define ND_ARS_STAT_FLAG_OVERFLOW (1 << 0)

Oh, sorry, one more thing. This should move to ndctl/lib/private.h, right?

With that you can add:

Reviewed-by: Dan Williams 


[ndctl PATCH v3] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Vishal Verma
The ARS status command defines a 'flags' field that wasn't being exposed
via an API yet. Since there is only one flag defined in ACPI 6.2, add a
helper for retrieving it (overflow flag).

Reported-by: Jacek Zloch 
Cc: Dan Williams 
Signed-off-by: Vishal Verma 
---
 ndctl/lib/ars.c| 11 +++
 ndctl/lib/libndctl.sym |  1 +
 ndctl/libndctl.h   |  4 
 3 files changed, 16 insertions(+)

v3: ensure we can only return 0, 1, or -error from this interface. (Dan)

diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
index e04b51e..c78e3bf 100644
--- a/ndctl/lib/ars.c
+++ b/ndctl/lib/ars.c
@@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
return ars_stat->ars_status->records[rec_index].length;
 }
 
+NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
+   struct ndctl_cmd *ars_stat)
+{
+   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
+
+   if (!validate_ars_stat(ctx, ars_stat))
+   return -EINVAL;
+
+   return !!(ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
+}
+
 NDCTL_EXPORT struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(
unsigned long long address, unsigned long long len,
struct ndctl_cmd *ars_cap)
diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
index c1228e5..e939993 100644
--- a/ndctl/lib/libndctl.sym
+++ b/ndctl/lib/libndctl.sym
@@ -365,4 +365,5 @@ global:
ndctl_cmd_ars_cap_get_clear_unit;
ndctl_namespace_inject_error2;
ndctl_namespace_uninject_error2;
+   ndctl_cmd_ars_stat_get_flag_overflow;
 } LIBNDCTL_15;
diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
index be997ac..3d141a6 100644
--- a/ndctl/libndctl.h
+++ b/ndctl/libndctl.h
@@ -187,6 +187,9 @@ int ndctl_dimm_is_enabled(struct ndctl_dimm *dimm);
 int ndctl_dimm_disable(struct ndctl_dimm *dimm);
 int ndctl_dimm_enable(struct ndctl_dimm *dimm);
 
+/* ars_status flags */
+#define ND_ARS_STAT_FLAG_OVERFLOW (1 << 0)
+
 struct ndctl_cmd;
 struct ndctl_cmd *ndctl_bus_cmd_new_ars_cap(struct ndctl_bus *bus,
unsigned long long address, unsigned long long len);
@@ -210,6 +213,7 @@ struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(unsigned long long address,
 unsigned long long ndctl_cmd_clear_error_get_cleared(
struct ndctl_cmd *clear_err);
 unsigned int ndctl_cmd_ars_cap_get_clear_unit(struct ndctl_cmd *ars_cap);
+int ndctl_cmd_ars_stat_get_flag_overflow(struct ndctl_cmd *ars_stat);
 
 /*
  * Note: ndctl_cmd_smart_get_temperature is an alias for
-- 
2.17.0



Re: [ndctl PATCH v2] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Verma, Vishal L
On Wed, 2018-06-06 at 13:00 -0700, Dan Williams wrote:
> On Wed, Jun 6, 2018 at 12:53 PM, Verma, Vishal L
>  wrote:
> > On Wed, 2018-06-06 at 12:51 -0700, Dan Williams wrote:
> > > On Wed, Jun 6, 2018 at 11:26 AM, Vishal Verma wrote:
> > > > The ARS status command defines a 'flags' field that wasn't being
> > > > exposed
> > > > via an API yet. Since there is only one flag defined in ACPI 6.2,
> > > > add a
> > > > helper for retrieving it (overflow flag).
> > > > 
> > > > Reported-by: Jacek Zloch 
> > > > Cc: Dan Williams 
> > > > Signed-off-by: Vishal Verma 
> > > > ---
> > > >  ndctl/lib/ars.c| 11 +++
> > > >  ndctl/lib/libndctl.sym |  1 +
> > > >  ndctl/libndctl.h   |  4 
> > > >  3 files changed, 16 insertions(+)
> > > > 
> > > > v2: instead of exposing the binary representation of flags, provide
> > > > a
> > > > helper for each flag. ACPI currently defines a single 'overflow'
> > > > flag.
> > > > (Dan)
> > > > 
> > > > diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
> > > > index e04b51e..b765c88 100644
> > > > --- a/ndctl/lib/ars.c
> > > > +++ b/ndctl/lib/ars.c
> > > > @@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
> > > > return ars_stat->ars_status->records[rec_index].length;
> > > >  }
> > > > 
> > > > +NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
> > > > +   struct ndctl_cmd *ars_stat)
> > > > +{
> > > > +   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
> > > > +
> > > > +   if (!validate_ars_stat(ctx, ars_stat))
> > > > +   return -EINVAL;
> > > > +
> > > > +   return (ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
> > > > +}
> > > 
> > > How about return bool since it's a flag?
> > 
> > I considered it, but int allows us to return an error for an invalid
> > status
> > command. If we use a bool, should we just return 'false' for a bad
> > command?
> 
> Oh, sorry, missed that. In that case let's do:
> 
>return !!(ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
> 
> So it's defined to be 0, 1, or -error.

Ok, yep that is better, I'll fix and send a new version.


Re: [ndctl PATCH v2] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Dan Williams
On Wed, Jun 6, 2018 at 12:53 PM, Verma, Vishal L
 wrote:
> On Wed, 2018-06-06 at 12:51 -0700, Dan Williams wrote:
>> On Wed, Jun 6, 2018 at 11:26 AM, Vishal Verma 
>> wrote:
>> > The ARS status command defines a 'flags' field that wasn't being
>> > exposed
>> > via an API yet. Since there is only one flag defined in ACPI 6.2, add a
>> > helper for retrieving it (overflow flag).
>> >
>> > Reported-by: Jacek Zloch 
>> > Cc: Dan Williams 
>> > Signed-off-by: Vishal Verma 
>> > ---
>> >  ndctl/lib/ars.c| 11 +++
>> >  ndctl/lib/libndctl.sym |  1 +
>> >  ndctl/libndctl.h   |  4 
>> >  3 files changed, 16 insertions(+)
>> >
>> > v2: instead of exposing the binary representation of flags, provide a
>> > helper for each flag. ACPI currently defines a single 'overflow' flag.
>> > (Dan)
>> >
>> > diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
>> > index e04b51e..b765c88 100644
>> > --- a/ndctl/lib/ars.c
>> > +++ b/ndctl/lib/ars.c
>> > @@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
>> > return ars_stat->ars_status->records[rec_index].length;
>> >  }
>> >
>> > +NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
>> > +   struct ndctl_cmd *ars_stat)
>> > +{
>> > +   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
>> > +
>> > +   if (!validate_ars_stat(ctx, ars_stat))
>> > +   return -EINVAL;
>> > +
>> > +   return (ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
>> > +}
>>
>> How about return bool since it's a flag?
>
> I considered it, but int allows us to return an error for an invalid status
> command. If we use a bool, should we just return 'false' for a bad command?

Oh, sorry, missed that. In that case let's do:

   return !!(ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);

So it's defined to be 0, 1, or -error.


Re: [ndctl PATCH v2] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Verma, Vishal L
On Wed, 2018-06-06 at 12:51 -0700, Dan Williams wrote:
> On Wed, Jun 6, 2018 at 11:26 AM, Vishal Verma 
> wrote:
> > The ARS status command defines a 'flags' field that wasn't being
> > exposed
> > via an API yet. Since there is only one flag defined in ACPI 6.2, add a
> > helper for retrieving it (overflow flag).
> > 
> > Reported-by: Jacek Zloch 
> > Cc: Dan Williams 
> > Signed-off-by: Vishal Verma 
> > ---
> >  ndctl/lib/ars.c| 11 +++
> >  ndctl/lib/libndctl.sym |  1 +
> >  ndctl/libndctl.h   |  4 
> >  3 files changed, 16 insertions(+)
> > 
> > v2: instead of exposing the binary representation of flags, provide a
> > helper for each flag. ACPI currently defines a single 'overflow' flag.
> > (Dan)
> > 
> > diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
> > index e04b51e..b765c88 100644
> > --- a/ndctl/lib/ars.c
> > +++ b/ndctl/lib/ars.c
> > @@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
> > return ars_stat->ars_status->records[rec_index].length;
> >  }
> > 
> > +NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
> > +   struct ndctl_cmd *ars_stat)
> > +{
> > +   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
> > +
> > +   if (!validate_ars_stat(ctx, ars_stat))
> > +   return -EINVAL;
> > +
> > +   return (ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
> > +}
> 
> How about return bool since it's a flag?

I considered it, but int allows us to return an error for an invalid status
command. If we use a bool, should we just return 'false' for a bad command?


Re: [ndctl PATCH v2] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Dan Williams
On Wed, Jun 6, 2018 at 11:26 AM, Vishal Verma  wrote:
> The ARS status command defines a 'flags' field that wasn't being exposed
> via an API yet. Since there is only one flag defined in ACPI 6.2, add a
> helper for retrieving it (overflow flag).
>
> Reported-by: Jacek Zloch 
> Cc: Dan Williams 
> Signed-off-by: Vishal Verma 
> ---
>  ndctl/lib/ars.c| 11 +++
>  ndctl/lib/libndctl.sym |  1 +
>  ndctl/libndctl.h   |  4 
>  3 files changed, 16 insertions(+)
>
> v2: instead of exposing the binary representation of flags, provide a
> helper for each flag. ACPI currently defines a single 'overflow' flag.
> (Dan)
>
> diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
> index e04b51e..b765c88 100644
> --- a/ndctl/lib/ars.c
> +++ b/ndctl/lib/ars.c
> @@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
> return ars_stat->ars_status->records[rec_index].length;
>  }
>
> +NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
> +   struct ndctl_cmd *ars_stat)
> +{
> +   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
> +
> +   if (!validate_ars_stat(ctx, ars_stat))
> +   return -EINVAL;
> +
> +   return (ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
> +}

How about return bool since it's a flag?


Re: [qemu PATCH v4 3/4] nvdimm, acpi: support NFIT platform capabilities

2018-06-06 Thread Michael S. Tsirkin
On Wed, Jun 06, 2018 at 10:50:51AM -0600, Ross Zwisler wrote:
> On Tue, Jun 05, 2018 at 03:21:30PM -0700, Dan Williams wrote:
> > On Tue, Jun 5, 2018 at 3:07 PM, Ross Zwisler
> >  wrote:
> > > On Tue, Jun 05, 2018 at 09:37:25PM +0300, Michael S. Tsirkin wrote:
> > >> On Tue, Jun 05, 2018 at 11:15:00AM -0700, Dan Williams wrote:
> > >> > On Tue, Jun 5, 2018 at 9:42 AM, Ross Zwisler
> > >> >  wrote:
> > >> > > On Tue, Jun 05, 2018 at 06:25:27PM +0300, Michael S. Tsirkin wrote:
> > >> > >> On Mon, May 21, 2018 at 10:32:02AM -0600, Ross Zwisler wrote:
> > >> > >> > Add a machine command line option to allow the user to control 
> > >> > >> > the Platform
> > >> > >> > Capabilities Structure in the virtualized NFIT.  This Platform 
> > >> > >> > Capabilities
> > >> > >> > Structure was added in ACPI 6.2 Errata A.
> > >> > >> >
> > >> > >> > Signed-off-by: Ross Zwisler 
> > >> > >>
> > >> > >> I tried playing with it and encoding the capabilities is
> > >> > >> quite awkward.
> > >> > >>
> > >> > >> Can we add bits for specific capabilities instead of nvdimm-cap?
> > >> > >>
> > >> > >> How about:
> > >> > >>
> > >> > >> "cpu-flush-on-power-loss-cap"
> > >> > >> "memory-flush-on-power-loss-cap"
> > >> > >> "byte-addressable-mirroring-cap"
> > >> > >
> > >> > > Hmmm...I don't like that as much because:
> > >> > >
> > >> > > a) It's very verbose.  Looking at my current qemu command line few 
> > >> > > other
> > >> > >options require that many characters, and you'd commonly be 
> > >> > > defining more
> > >> > >than one of these for a given VM.
> > >> > >
> > >> > > b) It means that the QEMU will need to be updated if/when new flags 
> > >> > > are added,
> > >> > >because we'll have to have new options for each flag.  The current
> > >> > >implementation is more future-proof because you can specify any 
> > >> > > flags
> > >> > >value you want.
> > >> > >
> > >> > > However, if you feel strongly about this, I'll make the change.
> > >> >
> > >> > Straw-man: Could we do something similar with what we are doing in 
> > >> > ndctl?
> > >> >
> > >> > enum ndctl_persistence_domain {
> > >> > PERSISTENCE_NONE = 0,
> > >> > PERSISTENCE_MEM_CTRL = 10,
> > >> > PERSISTENCE_CPU_CACHE = 20,
> > >> > PERSISTENCE_UNKNOWN = INT_MAX,
> > >> > };
> > >> >
> > >> > ...and have the command line take a number where "10" and "20" are
> > >> > supported today, but allows us to adapt to new persistence domains in
> > >> > the future.
> > >>
> > >> I'm fine with that except can we have symbolic names instead of numbers
> > >> on command line?
> > >>
> > >> --
> > >> MST
> > >
> > > Okay, we can move to the symbolic names.  Do you want them to be that 
> > > long, or
> > > would:
> > >
> > > nvdimm-cap-cpu
> > > nvdimm-cap-mem-ctrl
> > > nvdimm-cap-mirroring
> > 
> > Wait, why is mirroring part of this?
> > 
> > I was thinking this option would be:
> > 
> > --persistence-domain={cpu,mem-ctrl}
> > 
> > ...and try not to let ACPI specifics leak into the qemu command line
> > interface. For example PowerPC qemu could have a persistence domain
> > communicated via Open Firmware or some other mechanism.
> 
> Sure, this seems fine, though we may want to throw an "nvdimm" in the name
> somewhere so it's clear what the option affects.
> 
> nvdimm-persistence={cpu,mem-ctrl} maybe?
> 
> Michael, does this work for you?

Sounds good to me.
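
For the archive, the spellings under discussion would look like this on a
command line (illustrative only; at this point only the numeric form had
landed in QEMU, and no release had shipped it):

# numeric form currently upstream: bit 1 == memory-controller flush
qemu-system-x86_64 -machine pc,nvdimm=on,nvdimm-cap=2 ...
# proposed symbolic alias for the same capability
qemu-system-x86_64 -machine pc,nvdimm=on,nvdimm-cap=mem-ctrl ...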


Re: [qemu PATCH v4 3/4] nvdimm, acpi: support NFIT platform capabilities

2018-06-06 Thread Michael S. Tsirkin
On Wed, Jun 06, 2018 at 11:08:49AM -0600, Ross Zwisler wrote:
> On Wed, Jun 06, 2018 at 11:00:33AM -0600, Ross Zwisler wrote:
> > On Wed, Jun 06, 2018 at 10:50:51AM -0600, Ross Zwisler wrote:
> > > On Tue, Jun 05, 2018 at 03:21:30PM -0700, Dan Williams wrote:
> > > > On Tue, Jun 5, 2018 at 3:07 PM, Ross Zwisler
> > > >  wrote:
> > > > > On Tue, Jun 05, 2018 at 09:37:25PM +0300, Michael S. Tsirkin wrote:
> > > > >> On Tue, Jun 05, 2018 at 11:15:00AM -0700, Dan Williams wrote:
> > > > >> > On Tue, Jun 5, 2018 at 9:42 AM, Ross Zwisler
> > > > >> >  wrote:
> > > > >> > > On Tue, Jun 05, 2018 at 06:25:27PM +0300, Michael S. Tsirkin 
> > > > >> > > wrote:
> > > > >> > >> On Mon, May 21, 2018 at 10:32:02AM -0600, Ross Zwisler wrote:
> > > > >> > >> > Add a machine command line option to allow the user to 
> > > > >> > >> > control the Platform
> > > > >> > >> > Capabilities Structure in the virtualized NFIT.  This 
> > > > >> > >> > Platform Capabilities
> > > > >> > >> > Structure was added in ACPI 6.2 Errata A.
> > > > >> > >> >
> > > > >> > >> > Signed-off-by: Ross Zwisler 
> > > > >> > >>
> > > > >> > >> I tried playing with it and encoding the capabilities is
> > > > >> > >> quite awkward.
> > > > >> > >>
> > > > >> > >> Can we add bits for specific capabilities instead of nvdimm-cap?
> > > > >> > >>
> > > > >> > >> How about:
> > > > >> > >>
> > > > >> > >> "cpu-flush-on-power-loss-cap"
> > > > >> > >> "memory-flush-on-power-loss-cap"
> > > > >> > >> "byte-addressable-mirroring-cap"
> > > > >> > >
> > > > >> > > Hmmm...I don't like that as much because:
> > > > >> > >
> > > > >> > > a) It's very verbose.  Looking at my current qemu command line 
> > > > >> > > few other
> > > > >> > >options require that many characters, and you'd commonly be 
> > > > >> > > defining more
> > > > >> > >than one of these for a given VM.
> > > > >> > >
> > > > >> > > b) It means that the QEMU will need to be updated if/when new 
> > > > >> > > flags are added,
> > > > >> > >because we'll have to have new options for each flag.  The 
> > > > >> > > current
> > > > >> > >implementation is more future-proof because you can specify 
> > > > >> > > any flags
> > > > >> > >value you want.
> > > > >> > >
> > > > >> > > However, if you feel strongly about this, I'll make the change.
> > > > >> >
> > > > >> > Straw-man: Could we do something similar with what we are doing in 
> > > > >> > ndctl?
> > > > >> >
> > > > >> > enum ndctl_persistence_domain {
> > > > >> > PERSISTENCE_NONE = 0,
> > > > >> > PERSISTENCE_MEM_CTRL = 10,
> > > > >> > PERSISTENCE_CPU_CACHE = 20,
> > > > >> > PERSISTENCE_UNKNOWN = INT_MAX,
> > > > >> > };
> > > > >> >
> > > > >> > ...and have the command line take a number where "10" and "20" are
> > > > >> > supported today, but allows us to adapt to new persistence domains 
> > > > >> > in
> > > > >> > the future.
> > > > >>
> > > > >> I'm fine with that except can we have symbolic names instead of 
> > > > >> numbers
> > > > >> on command line?
> > > > >>
> > > > >> --
> > > > >> MST
> > > > >
> > > > > Okay, we can move to the symbolic names.  Do you want them to be that 
> > > > > long, or
> > > > > would:
> > > > >
> > > > > nvdimm-cap-cpu
> > > > > nvdimm-cap-mem-ctrl
> > > > > nvdimm-cap-mirroring
> > > > 
> > > > Wait, why is mirroring part of this?
> > > > 
> > > > I was thinking this option would be:
> > > > 
> > > > --persistence-domain={cpu,mem-ctrl}
> > > > 
> > > > ...and try not to let ACPI specifics leak into the qemu command line
> > > > interface. For example PowerPC qemu could have a persistence domain
> > > > communicated via Open Firmware or some other mechanism.
> > > 
> > > Sure, this seems fine, though we may want to throw an "nvdimm" in the name
> > > somewhere so it's clear what the option affects.
> > > 
> > > nvdimm-persistence={cpu,mem-ctrl} maybe?
> > > 
> > > Michael, does this work for you?
> > 
> > Hmm...also, the version of these patches that used numeric values did go
> > upstream in QEMU.  So, do we need to leave that interface intact, and just 
> > add
> > a new one that uses symbolic names?  Or did you still just want to replace 
> > the
> > numeric one since it hasn't appeared in a QEMU release yet?
> 
> I guess another alternative would be to just add symbolic name options to the
> existing command line option, i.e. allow all of these:
> 
> nvdimm-cap=3  # CPU cache
> nvdimm-cap=cpu# CPU cache
> nvdimm-cap=2  # memory controller
> nvdimm-cap=mem-ctrl   # memory controller
> 
> And just have them as aliases.  This retains backwards compatibility with
> what is already there, allows for other numeric values without QEMU updates if
> other bits are defined (though we are still tightly tied to ACPI in this
> case), and provides a symbolic name which is easier to use.
> 
> Thoughts?

I'm inclined to say let's just keep the symbolic names.
Less of a chance users configure something 

Re: [qemu PATCH v4 3/4] nvdimm, acpi: support NFIT platform capabilities

2018-06-06 Thread Michael S. Tsirkin
On Wed, Jun 06, 2018 at 11:00:33AM -0600, Ross Zwisler wrote:
> On Wed, Jun 06, 2018 at 10:50:51AM -0600, Ross Zwisler wrote:
> > On Tue, Jun 05, 2018 at 03:21:30PM -0700, Dan Williams wrote:
> > > On Tue, Jun 5, 2018 at 3:07 PM, Ross Zwisler
> > >  wrote:
> > > > On Tue, Jun 05, 2018 at 09:37:25PM +0300, Michael S. Tsirkin wrote:
> > > >> On Tue, Jun 05, 2018 at 11:15:00AM -0700, Dan Williams wrote:
> > > >> > On Tue, Jun 5, 2018 at 9:42 AM, Ross Zwisler
> > > >> >  wrote:
> > > >> > > On Tue, Jun 05, 2018 at 06:25:27PM +0300, Michael S. Tsirkin wrote:
> > > >> > >> On Mon, May 21, 2018 at 10:32:02AM -0600, Ross Zwisler wrote:
> > > >> > >> > Add a machine command line option to allow the user to control 
> > > >> > >> > the Platform
> > > >> > >> > Capabilities Structure in the virtualized NFIT.  This Platform 
> > > >> > >> > Capabilities
> > > >> > >> > Structure was added in ACPI 6.2 Errata A.
> > > >> > >> >
> > > >> > >> > Signed-off-by: Ross Zwisler 
> > > >> > >>
> > > >> > >> I tried playing with it and encoding the capabilities is
> > > >> > >> quite awkward.
> > > >> > >>
> > > >> > >> Can we add bits for specific capabilities instead of nvdimm-cap?
> > > >> > >>
> > > >> > >> How about:
> > > >> > >>
> > > >> > >> "cpu-flush-on-power-loss-cap"
> > > >> > >> "memory-flush-on-power-loss-cap"
> > > >> > >> "byte-addressable-mirroring-cap"
> > > >> > >
> > > >> > > Hmmm...I don't like that as much because:
> > > >> > >
> > > >> > > a) It's very verbose.  Looking at my current qemu command line few 
> > > >> > > other
> > > >> > >options require that many characters, and you'd commonly be 
> > > >> > > defining more
> > > >> > >than one of these for a given VM.
> > > >> > >
> > > >> > > b) It means that the QEMU will need to be updated if/when new 
> > > >> > > flags are added,
> > > >> > >because we'll have to have new options for each flag.  The 
> > > >> > > current
> > > >> > >implementation is more future-proof because you can specify any 
> > > >> > > flags
> > > >> > >value you want.
> > > >> > >
> > > >> > > However, if you feel strongly about this, I'll make the change.
> > > >> >
> > > >> > Straw-man: Could we do something similar with what we are doing in 
> > > >> > ndctl?
> > > >> >
> > > >> > enum ndctl_persistence_domain {
> > > >> > PERSISTENCE_NONE = 0,
> > > >> > PERSISTENCE_MEM_CTRL = 10,
> > > >> > PERSISTENCE_CPU_CACHE = 20,
> > > >> > PERSISTENCE_UNKNOWN = INT_MAX,
> > > >> > };
> > > >> >
> > > >> > ...and have the command line take a number where "10" and "20" are
> > > >> > supported today, but allows us to adapt to new persistence domains in
> > > >> > the future.
> > > >>
> > > >> I'm fine with that except can we have symbolic names instead of numbers
> > > >> on command line?
> > > >>
> > > >> --
> > > >> MST
> > > >
> > > > Okay, we can move to the symbolic names.  Do you want them to be that 
> > > > long, or
> > > > would:
> > > >
> > > > nvdimm-cap-cpu
> > > > nvdimm-cap-mem-ctrl
> > > > nvdimm-cap-mirroring
> > > 
> > > Wait, why is mirroring part of this?
> > > 
> > > I was thinking this option would be:
> > > 
> > > --persistence-domain={cpu,mem-ctrl}
> > > 
> > > ...and try not to let ACPI specifics leak into the qemu command line
> > > interface. For example PowerPC qemu could have a persistence domain
> > > communicated via Open Firmware or some other mechanism.
> > 
> > Sure, this seems fine, though we may want to throw an "nvdimm" in the name
> > somewhere so it's clear what the option affects.
> > 
> > nvdimm-persistence={cpu,mem-ctrl} maybe?
> > 
> > Michael, does this work for you?
> 
> Hmm...also, the version of these patches that used numeric values did go
> upstream in QEMU.  So, do we need to leave that interface intact, and just add
> a new one that uses symbolic names?  Or did you still just want to replace the
> numeric one since it hasn't appeared in a QEMU release yet?

The latter. No release => no stable APIs.

-- 
MST


Re: [PATCH v3 2/4] libnvdimm: unconditionally deep flush on *sync

2018-06-06 Thread Dan Williams
On Wed, Jun 6, 2018 at 11:16 AM, Ross Zwisler
 wrote:
> On Wed, Jun 06, 2018 at 10:57:59AM -0700, Dan Williams wrote:
>> On Wed, Jun 6, 2018 at 9:45 AM, Ross Zwisler
>>  wrote:
>> > Prior to this commit we would only do a "deep flush" (have nvdimm_flush()
>> > write to each of the flush hints for a region) in response to an
>> > msync/fsync/sync call if the nvdimm_has_cache() returned true at the time
>> > we were setting up the request queue.  This happens due to the write cache
>> > value passed in to blk_queue_write_cache(), which then causes the block
>> > layer to send down BIOs with REQ_FUA and REQ_PREFLUSH set.  We do have a
>> > "write_cache" sysfs entry for namespaces, i.e.:
>> >
>> >   /sys/bus/nd/devices/pfn0.1/block/pmem0/dax/write_cache
>> >
>> > which can be used to control whether or not the kernel thinks a given
>> > namespace has a write cache, but this didn't modify the deep flush behavior
>> > that we set up when the driver was initialized.  Instead, it only modified
>> > whether or not DAX would flush CPU caches via dax_flush() in response to
>> > *sync calls.
>> >
>> > Simplify this by making the *sync deep flush always happen, regardless of
>> > the write cache setting of a namespace.  The DAX CPU cache flushing will
>> > still be controlled by the write_cache setting of the namespace.
>> >
>> > Signed-off-by: Ross Zwisler 
>> > Suggested-by: Dan Williams 
>>
>> Looks, good. I believe we want this one and ["PATCH v3 4/4] libnvdimm:
>> don't flush power-fail protected CPU caches" marked for -stable and
>> tagged with:
>>
>> Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices
>> via fsync()")
>>
>> ...any concerns with that?
>
> Nope, sounds good.  Can you fix that up when you apply, or would it be helpful
> for me to send another revision with those tags?

I'll fix it up, thanks Ross.
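
For the archive, the namespace knob referenced above is exercised like so
(path as quoted in the patch; device names vary by system):

# does the kernel treat pmem0 as fronted by a volatile CPU write cache?
cat /sys/bus/nd/devices/pfn0.1/block/pmem0/dax/write_cache
# force it off, e.g. on platforms whose CPU caches are power-fail protected
echo 0 > /sys/bus/nd/devices/pfn0.1/block/pmem0/dax/write_cache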


[ndctl PATCH v2] ndctl: add an api for getting the ars_status overflow flag

2018-06-06 Thread Vishal Verma
The ARS status command defines a 'flags' field that wasn't being exposed
via an API yet. Since there is only one flag defined in ACPI 6.2, add a
helper for retrieving it (overflow flag).

Reported-by: Jacek Zloch 
Cc: Dan Williams 
Signed-off-by: Vishal Verma 
---
 ndctl/lib/ars.c| 11 +++
 ndctl/lib/libndctl.sym |  1 +
 ndctl/libndctl.h   |  4 
 3 files changed, 16 insertions(+)

v2: instead of exposing the binary representation of flags, provide a
helper for each flag. ACPI currently defines a single 'overflow' flag.
(Dan)

diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
index e04b51e..b765c88 100644
--- a/ndctl/lib/ars.c
+++ b/ndctl/lib/ars.c
@@ -269,6 +269,17 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
return ars_stat->ars_status->records[rec_index].length;
 }
 
+NDCTL_EXPORT int ndctl_cmd_ars_stat_get_flag_overflow(
+   struct ndctl_cmd *ars_stat)
+{
+   struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
+
+   if (!validate_ars_stat(ctx, ars_stat))
+   return -EINVAL;
+
+   return (ars_stat->ars_status->flags & ND_ARS_STAT_FLAG_OVERFLOW);
+}
+
 NDCTL_EXPORT struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(
unsigned long long address, unsigned long long len,
struct ndctl_cmd *ars_cap)
diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
index c1228e5..e939993 100644
--- a/ndctl/lib/libndctl.sym
+++ b/ndctl/lib/libndctl.sym
@@ -365,4 +365,5 @@ global:
ndctl_cmd_ars_cap_get_clear_unit;
ndctl_namespace_inject_error2;
ndctl_namespace_uninject_error2;
+   ndctl_cmd_ars_stat_get_flag_overflow;
 } LIBNDCTL_15;
diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
index be997ac..3d141a6 100644
--- a/ndctl/libndctl.h
+++ b/ndctl/libndctl.h
@@ -187,6 +187,9 @@ int ndctl_dimm_is_enabled(struct ndctl_dimm *dimm);
 int ndctl_dimm_disable(struct ndctl_dimm *dimm);
 int ndctl_dimm_enable(struct ndctl_dimm *dimm);
 
+/* ars_status flags */
+#define ND_ARS_STAT_FLAG_OVERFLOW (1 << 0)
+
 struct ndctl_cmd;
 struct ndctl_cmd *ndctl_bus_cmd_new_ars_cap(struct ndctl_bus *bus,
unsigned long long address, unsigned long long len);
@@ -210,6 +213,7 @@ struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(unsigned long long address,
 unsigned long long ndctl_cmd_clear_error_get_cleared(
struct ndctl_cmd *clear_err);
 unsigned int ndctl_cmd_ars_cap_get_clear_unit(struct ndctl_cmd *ars_cap);
+int ndctl_cmd_ars_stat_get_flag_overflow(struct ndctl_cmd *ars_stat);
 
 /*
  * Note: ndctl_cmd_smart_get_temperature is an alias for
-- 
2.17.0



Re: [PATCH v3 2/4] libnvdimm: unconditionally deep flush on *sync

2018-06-06 Thread Ross Zwisler
On Wed, Jun 06, 2018 at 10:57:59AM -0700, Dan Williams wrote:
> On Wed, Jun 6, 2018 at 9:45 AM, Ross Zwisler
>  wrote:
> > Prior to this commit we would only do a "deep flush" (have nvdimm_flush()
> > write to each of the flush hints for a region) in response to an
> > msync/fsync/sync call if the nvdimm_has_cache() returned true at the time
> > we were setting up the request queue.  This happens due to the write cache
> > value passed in to blk_queue_write_cache(), which then causes the block
> > layer to send down BIOs with REQ_FUA and REQ_PREFLUSH set.  We do have a
> > "write_cache" sysfs entry for namespaces, i.e.:
> >
> >   /sys/bus/nd/devices/pfn0.1/block/pmem0/dax/write_cache
> >
> > which can be used to control whether or not the kernel thinks a given
> > namespace has a write cache, but this didn't modify the deep flush behavior
> > that we set up when the driver was initialized.  Instead, it only modified
> > whether or not DAX would flush CPU caches via dax_flush() in response to
> > *sync calls.
> >
> > Simplify this by making the *sync deep flush always happen, regardless of
> > the write cache setting of a namespace.  The DAX CPU cache flushing will
> > still be controlled by the write_cache setting of the namespace.
> >
> > Signed-off-by: Ross Zwisler 
> > Suggested-by: Dan Williams 
> 
> Looks good. I believe we want this one and "[PATCH v3 4/4] libnvdimm:
> don't flush power-fail protected CPU caches" marked for -stable and
> tagged with:
> 
> Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")
> 
> ...any concerns with that?

Nope, sounds good.  Can you fix that up when you apply, or would it be helpful
for me to send another revision with those tags?


Re: [PATCH v3 2/4] libnvdimm: unconditionally deep flush on *sync

2018-06-06 Thread Dan Williams
On Wed, Jun 6, 2018 at 9:45 AM, Ross Zwisler
 wrote:
> Prior to this commit we would only do a "deep flush" (have nvdimm_flush()
> write to each of the flush hints for a region) in response to an
> msync/fsync/sync call if nvdimm_has_cache() returned true at the time
> we were setting up the request queue.  This happens due to the write cache
> value passed in to blk_queue_write_cache(), which then causes the block
> layer to send down BIOs with REQ_FUA and REQ_PREFLUSH set.  We do have a
> "write_cache" sysfs entry for namespaces, i.e.:
>
>   /sys/bus/nd/devices/pfn0.1/block/pmem0/dax/write_cache
>
> which can be used to control whether or not the kernel thinks a given
> namespace has a write cache, but this didn't modify the deep flush behavior
> that we set up when the driver was initialized.  Instead, it only modified
> whether or not DAX would flush CPU caches via dax_flush() in response to
> *sync calls.
>
> Simplify this by making the *sync deep flush always happen, regardless of
> the write cache setting of a namespace.  The DAX CPU cache flushing will
> still be controlled by the write_cache setting of the namespace.
>
> Signed-off-by: Ross Zwisler 
> Suggested-by: Dan Williams 

Looks good. I believe we want this one and "[PATCH v3 4/4] libnvdimm:
don't flush power-fail protected CPU caches" marked for -stable and
tagged with:

Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")

...any concerns with that?


Re: [PATCH v3 1/4] pmem, libnvdimm: complete REQ_FLUSH => REQ_PREFLUSH

2018-06-06 Thread Dan Williams
On Wed, Jun 6, 2018 at 9:45 AM, Ross Zwisler
 wrote:
> Complete the move from REQ_FLUSH to REQ_PREFLUSH that apparently started
> way back in v4.8.
>
> Signed-off-by: Ross Zwisler 

Yup, long overdue. Applied for 4.18.


Re: [PATCH v2 5/7] dm: remove DM_TYPE_DAX_BIO_BASED dm_queue_mode

2018-06-06 Thread Ross Zwisler
On Mon, Jun 04, 2018 at 08:46:28PM -0400, Mike Snitzer wrote:
> On Mon, Jun 04 2018 at  7:24pm -0400,
> Ross Zwisler  wrote:
> 
> > On Fri, Jun 01, 2018 at 06:04:43PM -0400, Mike Snitzer wrote:
> > > On Tue, May 29 2018 at  3:51pm -0400,
> > > Ross Zwisler  wrote:
> > > 
> > > > The DM_TYPE_DAX_BIO_BASED dm_queue_mode was introduced to prevent DM
> > > > devices that could possibly support DAX from transitioning into DM
> > > > devices that cannot support DAX.
> > > > 
> > > > For example, the following transition will currently fail:
> > > > 
> > > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > > >   DM_TYPE_DAX_BIO_BASED   DM_TYPE_BIO_BASED
> > > > 
> > > > but these will both succeed:
> > > > 
> > > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > > >   DM_TYPE_DAX_BIO_BASED        DM_TYPE_BIO_BASED
> > > > 
> > > 
> > > I fail to see how this succeeds given
> > > drivers/md/dm-ioctl.c:is_valid_type() only allows transitions from:
> > > 
> > > DM_TYPE_BIO_BASED => DM_TYPE_DAX_BIO_BASED
> > 
> > Right, sorry, that was a typo.  What I meant was:
> > 
> > > For example, the following transition will currently fail:
> > > 
> > >  dm-linear: [fsdax pmem][fsdax pmem] => [fsdax pmem][fsdax raw]
> > >   DM_TYPE_DAX_BIO_BASED   DM_TYPE_BIO_BASED
> > > 
> > > but these will both succeed:
> > > 
> > >  dm-linear: [fsdax pmem][brd ramdisk] => [fsdax pmem][fsdax raw]
> > >    DM_TYPE_BIO_BASED            DM_TYPE_BIO_BASED
> > > 
> > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > >    DM_TYPE_BIO_BASED            DM_TYPE_DAX_BIO_BASED
> > 
> > So we allow 2 of the 3 transitions, but the reason that we disallow the
> > third isn't fully clear to me.
> > 
> > > >  dm-linear: [fsdax pmem][fsdax raw] => [fsdax pmem][fsdax pmem]
> > > >    DM_TYPE_BIO_BASED           DM_TYPE_DAX_BIO_BASED
> > > > 
> > > > This seems arbitrary, as really the choice on whether to use DAX
> > > > happens at filesystem mount time.  There's no guarantee that in the
> > > > first case (double fsdax pmem) we were using the dax mount option
> > > > with our file system.
> > > > 
> > > > Instead, get rid of DM_TYPE_DAX_BIO_BASED and all the special casing
> > > > around it, and instead make the request queue's QUEUE_FLAG_DAX be our
> > > > one source of truth.  If this is set, we can use DAX, and if not, not.
> > > > We keep this up to date in table_load() as the table changes.  As with
> > > > regular block devices the filesystem will then know at mount time
> > > > whether DAX is a supported mount option or not.
> > > 
> > > If you don't think you need this specialization that is fine.. but DM
> > > devices support suspending (as part of table reloads), so is there any
> > > risk that there will be inflight IO (say if someone did 'dmsetup suspend
> > > --noflush').. and then upon reload the device type changed out from
> > > under us.. anyway, I don't have all the PMEM DAX stuff paged back into
> > > my head yet.
> > > 
> > > But this just seems like we really shouldn't be allowing the
> > > transition from what was DM_TYPE_DAX_BIO_BASED back to DM_TYPE_BIO_BASED
> > 
> > I admit I don't fully understand all the ways that DM supports suspending
> > and resuming devices.  Is there actually a case where we can change out
> > the DM devices while I/O is running, and somehow end up trying to issue a
> > DAX I/O to a device that doesn't support DAX?
> 
> Yes, provided root permissions, it's very easy to dmsetup suspend/load/resume
> to replace any portion of the DM device's logical address space to map to an
> entirely different DM target (with a different backing store).  It's
> pretty intrusive to do such things, but easily done and powerful.
> 
> Mike

Hmmm, I don't understand how you can do this if there is a filesystem built on
your DM device?  Say you have a DM device, either striped or linear, that is
made up of 2 devices, and then you use dmsetup to replace one of the DM member
devices with something else.  You've just swapped out half of your LBA space
with new data, right? 

I don't understand how you can expect a filesystem built on the old DM device
to still work?  You especially can't do this while the filesystem is mounted -
all the in-core filesystem metadata would be garbage because the on-media data
would have totally changed.

So, when dealing with a filesystem, the flow must be:

unmount your filesystem
redo your DM device, changing out devices
reformat your filesystem on the new DM device
remount your filesystem

Right?  If so, then I don't see how a transition of the DM device from
supporting DAX to not supporting DAX or vice versa could harm us, as we can't
be doing filesystem I/O at the time when we change the composition of the DM
device.

Re: [qemu PATCH v4 3/4] nvdimm, acpi: support NFIT platform capabilities

2018-06-06 Thread Ross Zwisler
On Wed, Jun 06, 2018 at 11:00:33AM -0600, Ross Zwisler wrote:
> On Wed, Jun 06, 2018 at 10:50:51AM -0600, Ross Zwisler wrote:
> > On Tue, Jun 05, 2018 at 03:21:30PM -0700, Dan Williams wrote:
> > > On Tue, Jun 5, 2018 at 3:07 PM, Ross Zwisler
> > >  wrote:
> > > > On Tue, Jun 05, 2018 at 09:37:25PM +0300, Michael S. Tsirkin wrote:
> > > >> On Tue, Jun 05, 2018 at 11:15:00AM -0700, Dan Williams wrote:
> > > >> > On Tue, Jun 5, 2018 at 9:42 AM, Ross Zwisler
> > > >> >  wrote:
> > > >> > > On Tue, Jun 05, 2018 at 06:25:27PM +0300, Michael S. Tsirkin wrote:
> > > >> > >> On Mon, May 21, 2018 at 10:32:02AM -0600, Ross Zwisler wrote:
> > > >> > >> > Add a machine command line option to allow the user to control
> > > >> > >> > the Platform Capabilities Structure in the virtualized NFIT.
> > > >> > >> > This Platform Capabilities Structure was added in ACPI 6.2
> > > >> > >> > Errata A.
> > > >> > >> >
> > > >> > >> > Signed-off-by: Ross Zwisler 
> > > >> > >>
> > > >> > >> I tried playing with it and encoding the capabilities is
> > > >> > >> quite awkward.
> > > >> > >>
> > > >> > >> Can we add bits for specific capabilities instead of nvdimm-cap?
> > > >> > >>
> > > >> > >> How about:
> > > >> > >>
> > > >> > >> "cpu-flush-on-power-loss-cap"
> > > >> > >> "memory-flush-on-power-loss-cap"
> > > >> > >> "byte-addressable-mirroring-cap"
> > > >> > >
> > > >> > > Hmmm...I don't like that as much because:
> > > >> > >
> > > >> > > a) It's very verbose.  Looking at my current qemu command line, few
> > > >> > >    other options require that many characters, and you'd commonly be
> > > >> > >    defining more than one of these for a given VM.
> > > >> > >
> > > >> > > b) It means that QEMU will need to be updated if/when new flags are
> > > >> > >    added, because we'll have to have new options for each flag.  The
> > > >> > >    current implementation is more future-proof because you can specify
> > > >> > >    any flags value you want.
> > > >> > >
> > > >> > > However, if you feel strongly about this, I'll make the change.
> > > >> >
> > > >> > Straw-man: Could we do something similar with what we are doing in ndctl?
> > > >> >
> > > >> > enum ndctl_persistence_domain {
> > > >> > PERSISTENCE_NONE = 0,
> > > >> > PERSISTENCE_MEM_CTRL = 10,
> > > >> > PERSISTENCE_CPU_CACHE = 20,
> > > >> > PERSISTENCE_UNKNOWN = INT_MAX,
> > > >> > };
> > > >> >
> > > >> > ...and have the command line take a number where "10" and "20" are
> > > >> > supported today, but allows us to adapt to new persistence domains in
> > > >> > the future.
> > > >>
> > > >> I'm fine with that except can we have symbolic names instead of numbers
> > > >> on command line?
> > > >>
> > > >> --
> > > >> MST
> > > >
> > > > Okay, we can move to the symbolic names.  Do you want them to be that
> > > > long, or would:
> > > >
> > > > nvdimm-cap-cpu
> > > > nvdimm-cap-mem-ctrl
> > > > nvdimm-cap-mirroring
> > > 
> > > Wait, why is mirroring part of this?
> > > 
> > > I was thinking this option would be:
> > > 
> > > --persistence-domain={cpu,mem-ctrl}
> > > 
> > > ...and try not to let ACPI specifics leak into the qemu command line
> > > interface. For example PowerPC qemu could have a persistence domain
> > > communicated via Open Firmware or some other mechanism.
> > 
> > Sure, this seems fine, though we may want to throw an "nvdimm" in the name
> > somewhere so it's clear what the option affects.
> > 
> > nvdimm-persistence={cpu,mem-ctrl} maybe?
> > 
> > Michael, does this work for you?
> 
> Hmm...also, the version of these patches that used numeric values did go
> upstream in QEMU.  So, do we need to leave that interface intact, and just add
> a new one that uses symbolic names?  Or did you still just want to replace the
> numeric one since it hasn't appeared in a QEMU release yet?

I guess another alternative would be to just add symbolic name options to the
existing command line option, i.e. allow all of these:

nvdimm-cap=3        # CPU cache
nvdimm-cap=cpu      # CPU cache
nvdimm-cap=2        # memory controller
nvdimm-cap=mem-ctrl # memory controller

And just have them as aliases.  This retains backwards compatibility with
what is already there, allows for other numeric values without QEMU updates if
other bits are defined (though we are still tightly tied to ACPI in this
case), and provides a symbolic name which is easier to use.

Thoughts?
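
For illustration only (a standalone sketch, not QEMU code or part of any
posted patch), a parser that accepts either form could be as small as
the following, with the symbolic cases simply aliasing the raw values
above so that existing command lines keep working:

#include <stdlib.h>
#include <string.h>

static int parse_nvdimm_cap(const char *arg, unsigned long *caps)
{
        char *end;

        if (strcmp(arg, "cpu") == 0) {
                *caps = 3;      /* CPU cache persistence */
                return 0;
        }
        if (strcmp(arg, "mem-ctrl") == 0) {
                *caps = 2;      /* memory controller persistence */
                return 0;
        }

        /* fall back to a raw numeric flags value */
        *caps = strtoul(arg, &end, 0);
        return (end == arg || *end != '\0') ? -1 : 0;
}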


Re: [qemu PATCH v4 3/4] nvdimm, acpi: support NFIT platform capabilities

2018-06-06 Thread Ross Zwisler
On Wed, Jun 06, 2018 at 10:50:51AM -0600, Ross Zwisler wrote:
> On Tue, Jun 05, 2018 at 03:21:30PM -0700, Dan Williams wrote:
> > On Tue, Jun 5, 2018 at 3:07 PM, Ross Zwisler
> >  wrote:
> > > On Tue, Jun 05, 2018 at 09:37:25PM +0300, Michael S. Tsirkin wrote:
> > >> On Tue, Jun 05, 2018 at 11:15:00AM -0700, Dan Williams wrote:
> > >> > On Tue, Jun 5, 2018 at 9:42 AM, Ross Zwisler
> > >> >  wrote:
> > >> > > On Tue, Jun 05, 2018 at 06:25:27PM +0300, Michael S. Tsirkin wrote:
> > >> > >> On Mon, May 21, 2018 at 10:32:02AM -0600, Ross Zwisler wrote:
> > >> > >> > Add a machine command line option to allow the user to control
> > >> > >> > the Platform Capabilities Structure in the virtualized NFIT.
> > >> > >> > This Platform Capabilities Structure was added in ACPI 6.2
> > >> > >> > Errata A.
> > >> > >> >
> > >> > >> > Signed-off-by: Ross Zwisler 
> > >> > >>
> > >> > >> I tried playing with it and encoding the capabilities is
> > >> > >> quite awkward.
> > >> > >>
> > >> > >> Can we add bits for specific capabilities instead of nvdimm-cap?
> > >> > >>
> > >> > >> How about:
> > >> > >>
> > >> > >> "cpu-flush-on-power-loss-cap"
> > >> > >> "memory-flush-on-power-loss-cap"
> > >> > >> "byte-addressable-mirroring-cap"
> > >> > >
> > >> > > Hmmm...I don't like that as much because:
> > >> > >
> > >> > > a) It's very verbose.  Looking at my current qemu command line, few
> > >> > >    other options require that many characters, and you'd commonly be
> > >> > >    defining more than one of these for a given VM.
> > >> > >
> > >> > > b) It means that QEMU will need to be updated if/when new flags are
> > >> > >    added, because we'll have to have new options for each flag.  The
> > >> > >    current implementation is more future-proof because you can specify
> > >> > >    any flags value you want.
> > >> > >
> > >> > > However, if you feel strongly about this, I'll make the change.
> > >> >
> > >> > Straw-man: Could we do something similar with what we are doing in ndctl?
> > >> >
> > >> > enum ndctl_persistence_domain {
> > >> > PERSISTENCE_NONE = 0,
> > >> > PERSISTENCE_MEM_CTRL = 10,
> > >> > PERSISTENCE_CPU_CACHE = 20,
> > >> > PERSISTENCE_UNKNOWN = INT_MAX,
> > >> > };
> > >> >
> > >> > ...and have the command line take a number where "10" and "20" are
> > >> > supported today, but allows us to adapt to new persistence domains in
> > >> > the future.
> > >>
> > >> I'm fine with that except can we have symbolic names instead of numbers
> > >> on command line?
> > >>
> > >> --
> > >> MST
> > >
> > > Okay, we can move to the symbolic names.  Do you want them to be that
> > > long, or would:
> > >
> > > nvdimm-cap-cpu
> > > nvdimm-cap-mem-ctrl
> > > nvdimm-cap-mirroring
> > 
> > Wait, why is mirroring part of this?
> > 
> > I was thinking this option would be:
> > 
> > --persistence-domain={cpu,mem-ctrl}
> > 
> > ...and try not to let ACPI specifics leak into the qemu command line
> > interface. For example PowerPC qemu could have a persistence domain
> > communicated via Open Firmware or some other mechanism.
> 
> Sure, this seems fine, though we may want to throw an "nvdimm" in the name
> somewhere so it's clear what the option affects.
> 
> nvdimm-persistence={cpu,mem-ctrl} maybe?
> 
> Michael, does this work for you?

Hmm...also, the version of these patches that used numeric values did go
upstream in QEMU.  So, do we need to leave that interface intact, and just add
a new one that uses symbolic names?  Or did you still just want to replace the
numeric one since it hasn't appeared in a QEMU release yet?


Re: [qemu PATCH v4 3/4] nvdimm, acpi: support NFIT platform capabilities

2018-06-06 Thread Ross Zwisler
On Tue, Jun 05, 2018 at 03:21:30PM -0700, Dan Williams wrote:
> On Tue, Jun 5, 2018 at 3:07 PM, Ross Zwisler
>  wrote:
> > On Tue, Jun 05, 2018 at 09:37:25PM +0300, Michael S. Tsirkin wrote:
> >> On Tue, Jun 05, 2018 at 11:15:00AM -0700, Dan Williams wrote:
> >> > On Tue, Jun 5, 2018 at 9:42 AM, Ross Zwisler
> >> >  wrote:
> >> > > On Tue, Jun 05, 2018 at 06:25:27PM +0300, Michael S. Tsirkin wrote:
> >> > >> On Mon, May 21, 2018 at 10:32:02AM -0600, Ross Zwisler wrote:
> >> > >> > Add a machine command line option to allow the user to control
> >> > >> > the Platform Capabilities Structure in the virtualized NFIT.
> >> > >> > This Platform Capabilities Structure was added in ACPI 6.2
> >> > >> > Errata A.
> >> > >> >
> >> > >> > Signed-off-by: Ross Zwisler 
> >> > >>
> >> > >> I tried playing with it and encoding the capabilities is
> >> > >> quite awkward.
> >> > >>
> >> > >> Can we add bits for specific capabilities instead of nvdimm-cap?
> >> > >>
> >> > >> How about:
> >> > >>
> >> > >> "cpu-flush-on-power-loss-cap"
> >> > >> "memory-flush-on-power-loss-cap"
> >> > >> "byte-addressable-mirroring-cap"
> >> > >
> >> > > Hmmm...I don't like that as much because:
> >> > >
> >> > > a) It's very verbose.  Looking at my current qemu command line, few
> >> > >    other options require that many characters, and you'd commonly be
> >> > >    defining more than one of these for a given VM.
> >> > >
> >> > > b) It means that QEMU will need to be updated if/when new flags are
> >> > >    added, because we'll have to have new options for each flag.  The
> >> > >    current implementation is more future-proof because you can specify
> >> > >    any flags value you want.
> >> > >
> >> > > However, if you feel strongly about this, I'll make the change.
> >> >
> >> > Straw-man: Could we do something similar with what we are doing in ndctl?
> >> >
> >> > enum ndctl_persistence_domain {
> >> > PERSISTENCE_NONE = 0,
> >> > PERSISTENCE_MEM_CTRL = 10,
> >> > PERSISTENCE_CPU_CACHE = 20,
> >> > PERSISTENCE_UNKNOWN = INT_MAX,
> >> > };
> >> >
> >> > ...and have the command line take a number where "10" and "20" are
> >> > supported today, but allows us to adapt to new persistence domains in
> >> > the future.
> >>
> >> I'm fine with that except can we have symbolic names instead of numbers
> >> on command line?
> >>
> >> --
> >> MST
> >
> > Okay, we can move to the symbolic names.  Do you want them to be that
> > long, or would:
> >
> > nvdimm-cap-cpu
> > nvdimm-cap-mem-ctrl
> > nvdimm-cap-mirroring
> 
> Wait, why is mirroring part of this?
> 
> I was thinking this option would be:
> 
> --persistence-domain={cpu,mem-ctrl}
> 
> ...and try not to let ACPI specifics leak into the qemu command line
> interface. For example PowerPC qemu could have a persistence domain
> communicated via Open Firmware or some other mechanism.

Sure, this seems fine, though we may want to throw an "nvdimm" in the name
somewhere so it's clear what the option affects.

nvdimm-persistence={cpu,mem-ctrl} maybe?

Michael, does this work for you?


[PATCH v3 1/4] pmem, libnvdimm: complete REQ_FLUSH => REQ_PREFLUSH

2018-06-06 Thread Ross Zwisler
Complete the move from REQ_FLUSH to REQ_PREFLUSH that apparently started
way back in v4.8.

Signed-off-by: Ross Zwisler 
---
 drivers/nvdimm/pmem.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9d714926ecf5..252adfab1e47 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -164,11 +164,6 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
return rc;
 }
 
-/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
-#ifndef REQ_FLUSH
-#define REQ_FLUSH REQ_PREFLUSH
-#endif
-
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
blk_status_t rc = 0;
@@ -179,7 +174,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
struct pmem_device *pmem = q->queuedata;
struct nd_region *nd_region = to_region(pmem);
 
-   if (bio->bi_opf & REQ_FLUSH)
+   if (bio->bi_opf & REQ_PREFLUSH)
nvdimm_flush(nd_region);
 
do_acct = nd_iostat_start(bio, &start);
-- 
2.14.4



[PATCH v3 4/4] libnvdimm: don't flush power-fail protected CPU caches

2018-06-06 Thread Ross Zwisler
This commit:

5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")

intended to make sure that deep flush was always available even on
platforms which support a power-fail protected CPU cache.  An unintended
side effect of this change was that we also lost the ability to skip
flushing CPU caches on platforms where those caches are power-fail protected.

Fix this by skipping the low level cache flushing in dax_flush() if we have
CPU caches which are power-fail protected.  The user can still override this
behavior by manually setting the write_cache state of a namespace.  See
libndctl's ndctl_namespace_write_cache_is_enabled(),
ndctl_namespace_enable_write_cache() and
ndctl_namespace_disable_write_cache() functions.

Signed-off-by: Ross Zwisler 
Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")
---
 drivers/nvdimm/region_devs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index a612be6f019d..ec3543b83330 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1132,7 +1132,8 @@ EXPORT_SYMBOL_GPL(nvdimm_has_flush);
 
 int nvdimm_has_cache(struct nd_region *nd_region)
 {
-   return is_nd_pmem(&nd_region->dev);
+   return is_nd_pmem(&nd_region->dev) &&
+   !test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags);
 }
 EXPORT_SYMBOL_GPL(nvdimm_has_cache);
 
-- 
2.14.4
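
For reference, the write_cache override mentioned in the changelog is
driven from userspace through libndctl. A minimal, hypothetical sketch
using the helpers named above (the wrapper name is invented):

#include <ndctl/libndctl.h>

/* Re-enable CPU cache flushing for a namespace, e.g. for testing, even
 * on a platform whose caches are power-fail protected. */
static int force_cpu_cache_flush(struct ndctl_namespace *ndns)
{
        if (ndctl_namespace_write_cache_is_enabled(ndns) == 1)
                return 0;       /* dax_flush() already flushes CPU caches */

        return ndctl_namespace_enable_write_cache(ndns);
}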



[PATCH v3 2/4] libnvdimm: unconditionally deep flush on *sync

2018-06-06 Thread Ross Zwisler
Prior to this commit we would only do a "deep flush" (have nvdimm_flush()
write to each of the flush hints for a region) in response to an
msync/fsync/sync call if nvdimm_has_cache() returned true at the time
we were setting up the request queue.  This happens due to the write cache
value passed in to blk_queue_write_cache(), which then causes the block
layer to send down BIOs with REQ_FUA and REQ_PREFLUSH set.  We do have a
"write_cache" sysfs entry for namespaces, i.e.:

  /sys/bus/nd/devices/pfn0.1/block/pmem0/dax/write_cache

which can be used to control whether or not the kernel thinks a given
namespace has a write cache, but this didn't modify the deep flush behavior
that we set up when the driver was initialized.  Instead, it only modified
whether or not DAX would flush CPU caches via dax_flush() in response to
*sync calls.

Simplify this by making the *sync deep flush always happen, regardless of
the write cache setting of a namespace.  The DAX CPU cache flushing will
still be controlled by the write_cache setting of the namespace.

Signed-off-by: Ross Zwisler 
Suggested-by: Dan Williams 
---
 drivers/nvdimm/pmem.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 252adfab1e47..97b4c39a9267 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -294,7 +294,7 @@ static int pmem_attach_disk(struct device *dev,
 {
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
struct nd_region *nd_region = to_nd_region(dev->parent);
-   int nid = dev_to_node(dev), fua, wbc;
+   int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
struct resource bb_res;
struct nd_pfn *nd_pfn = NULL;
@@ -330,7 +330,6 @@ static int pmem_attach_disk(struct device *dev,
dev_warn(dev, "unable to guarantee persistence of writes\n");
fua = 0;
}
-   wbc = nvdimm_has_cache(nd_region);
 
if (!devm_request_mem_region(dev, res->start, resource_size(res),
dev_name(&ndns->dev))) {
@@ -377,7 +376,7 @@ static int pmem_attach_disk(struct device *dev,
return PTR_ERR(addr);
pmem->virt_addr = addr;
 
-   blk_queue_write_cache(q, wbc, fua);
+   blk_queue_write_cache(q, true, fua);
blk_queue_make_request(q, pmem_make_request);
blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_logical_block_size(q, pmem_sector_size(ndns));
@@ -408,7 +407,7 @@ static int pmem_attach_disk(struct device *dev,
put_disk(disk);
return -ENOMEM;
}
-   dax_write_cache(dax_dev, wbc);
+   dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
pmem->dax_dev = dax_dev;
 
gendev = disk_to_dev(disk);
-- 
2.14.4



[PATCH v3 3/4] libnvdimm: use dax_write_cache* helpers

2018-06-06 Thread Ross Zwisler
Use dax_write_cache() and dax_write_cache_enabled() instead of open coding
the bit operations.

Signed-off-by: Ross Zwisler 
---
 drivers/dax/super.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 2b2332b605e4..c2c46f96b18c 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -182,8 +182,7 @@ static ssize_t write_cache_show(struct device *dev,
if (!dax_dev)
return -ENXIO;
 
-   rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE,
-   &dax_dev->flags));
+   rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev));
put_dax(dax_dev);
return rc;
 }
@@ -201,10 +200,8 @@ static ssize_t write_cache_store(struct device *dev,
 
if (rc)
len = rc;
-   else if (write_cache)
-   set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
else
-   clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
+   dax_write_cache(dax_dev, write_cache);
 
put_dax(dax_dev);
return len;
@@ -286,7 +283,7 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter);
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
 {
-   if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
+   if (unlikely(!dax_write_cache_enabled(dax_dev)))
return;
 
arch_wb_cache_pmem(addr, size);
-- 
2.14.4



Re: [PATCH v2 3/3] libnvdimm: don't flush power-fail protected CPU caches

2018-06-06 Thread Ross Zwisler
On Tue, Jun 05, 2018 at 07:00:14PM -0700, Dan Williams wrote:
> On Tue, Jun 5, 2018 at 4:58 PM, Ross Zwisler
>  wrote:
> > This commit:
> >
> > 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")
> >
> > intended to make sure that deep flush was always available even on
> > platforms which support a power-fail protected CPU cache.  An unintended
> > side effect of this change was that we also lost the ability to skip
> > flushing CPU caches on platforms where those caches are power-fail protected.
> >
> > Signed-off-by: Ross Zwisler 
> > Fixes: 5fdf8e5ba566 ("libnvdimm: re-enable deep flush for pmem devices via fsync()")
> > ---
> >  drivers/dax/super.c   | 14 +-
> >  drivers/nvdimm/pmem.c |  2 ++
> >  include/linux/dax.h   |  4 
> >  3 files changed, 19 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> > index c2c46f96b18c..80253c531a9b 100644
> > --- a/drivers/dax/super.c
> > +++ b/drivers/dax/super.c
> > @@ -152,6 +152,8 @@ enum dax_device_flags {
> > DAXDEV_ALIVE,
> > /* gate whether dax_flush() calls the low level flush routine */
> > DAXDEV_WRITE_CACHE,
> > +   /* only flush the CPU caches if they are not power fail protected */
> > +   DAXDEV_FLUSH_ON_SYNC,
> 
> I'm not grokking why we need DAXDEV_FLUSH_ON_SYNC. The power fail
> protected status of the cache only determines the default for
> DAXDEV_WRITE_CACHE.

Ah, yea, that's much cleaner.  I'll send out a v3.


Re: [PATCH v2 00/11] mm: Teach memory_failure() about ZONE_DEVICE pages

2018-06-06 Thread Dan Williams
On Wed, Jun 6, 2018 at 12:39 AM, Michal Hocko  wrote:
> On Tue 05-06-18 07:33:17, Dan Williams wrote:
>> On Tue, Jun 5, 2018 at 7:11 AM, Michal Hocko  wrote:
>> > On Mon 04-06-18 07:31:25, Dan Williams wrote:
>> > [...]
>> >> I'm trying to solve this real world problem when real poison is
>> >> consumed through a dax mapping:
>> >>
>> >> mce: Uncorrected hardware memory error in user-access at af34214200
>> >> {1}[Hardware Error]: It has been corrected by h/w and requires no further action
>> >> mce: [Hardware Error]: Machine check events logged
>> >> {1}[Hardware Error]: event severity: corrected
>> >> Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
>> >> [..]
>> >> Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
>> >> mce: Memory error not recovered
>> >>
>> >> ...i.e. currently all poison consumed through dax mappings is
>> >> needlessly system fatal.
>> >
>> > Thanks. That should be a part of the changelog.
>>
>> ...added for v3:
>> https://lists.01.org/pipermail/linux-nvdimm/2018-June/016153.html
>>
>> > It would be great to
>> > describe why this cannot be simply handled by hwpoison code without any
>> > ZONE_DEVICE specific hacks? The error is recoverable so why does
>> > hwpoison code even care?
>> >
>>
>> Up until we started testing hardware poison recovery for persistent
>> memory I assumed that the kernel did not need any new enabling to get
>> basic support for recovering userspace consumed poison.
>>
>> However, the recovery code has a dedicated path for many different
>> page states (see: action_page_types). Without any changes it
>> incorrectly assumes that a dax mapped page is a page cache page
>> undergoing dma, or some other pinned operation. It also assumes that
>> the page must be offlined which is not correct / possible for dax
>> mapped pages. There is a possibility to repair poison to dax mapped
>> persistent memory pages, and the pages can't otherwise be offlined
>> because they 1:1 correspond with a physical storage block, i.e.
>> offlining pmem would be equivalent to punching a hole in the physical
>> address space.
>>
>> There's also the entanglement of device-dax which guarantees a given
>> mapping size (4K, 2M, 1G). This requires determining the size of the
>> mapping encompassing a given pfn to know how much to unmap. Since dax
>> mapped pfns don't come from the page allocator we need to read the
>> page size from the page tables, not compound_order(page).
>
> OK, but my question is still. Do we really want to do more on top of the
> existing code and add even more special casing or it is time to rethink
> the whole hwpoison design?

Well, there's the immediate problem that the current implementation is
broken for dax and then the longer term problem that the current
design appears to be too literal with a lot of custom marshaling.

At least for dax in the long term we want to offer an alternative
error handling model and get the filesystem much more involved. That
filesystem redesign work has been waiting for the reverse-block-map
effort to settle in xfs. However, that's more custom work for dax and
not a redesign that helps the core-mm more generically.

I think the unmap and SIGBUS portion of poison handling is relatively
straightforward. It's the handling of the HWPoison page flag that
seems a bit ad hoc. The current implementation certainly was not
prepared for the concept that memory can be repaired. set_mce_nospec()
is a step in the direction of generic memory error handling.

Thoughts on other pain points in the design that are on your mind, Michal?


Re: [PATCH v2 00/11] mm: Teach memory_failure() about ZONE_DEVICE pages

2018-06-06 Thread Michal Hocko
On Tue 05-06-18 07:33:17, Dan Williams wrote:
> On Tue, Jun 5, 2018 at 7:11 AM, Michal Hocko  wrote:
> > On Mon 04-06-18 07:31:25, Dan Williams wrote:
> > [...]
> >> I'm trying to solve this real world problem when real poison is
> >> consumed through a dax mapping:
> >>
> >> mce: Uncorrected hardware memory error in user-access at af34214200
> >> {1}[Hardware Error]: It has been corrected by h/w and requires no further action
> >> mce: [Hardware Error]: Machine check events logged
> >> {1}[Hardware Error]: event severity: corrected
> >> Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
> >> [..]
> >> Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
> >> mce: Memory error not recovered
> >>
> >> ...i.e. currently all poison consumed through dax mappings is
> >> needlessly system fatal.
> >
> > Thanks. That should be a part of the changelog.
> 
> ...added for v3:
> https://lists.01.org/pipermail/linux-nvdimm/2018-June/016153.html
> 
> > It would be great to
> > describe why this cannot be simply handled by hwpoison code without any
> > ZONE_DEVICE specific hacks? The error is recoverable so why does
> > hwpoison code even care?
> >
> 
> Up until we started testing hardware poison recovery for persistent
> memory I assumed that the kernel did not need any new enabling to get
> basic support for recovering userspace consumed poison.
> 
> However, the recovery code has a dedicated path for many different
> page states (see: action_page_types). Without any changes it
> incorrectly assumes that a dax mapped page is a page cache page
> undergoing dma, or some other pinned operation. It also assumes that
> the page must be offlined which is not correct / possible for dax
> mapped pages. There is a possibility to repair poison to dax mapped
> persistent memory pages, and the pages can't otherwise be offlined
> because they 1:1 correspond with a physical storage block, i.e.
> offlining pmem would be equivalent to punching a hole in the physical
> address space.
> 
> There's also the entanglement of device-dax which guarantees a given
> mapping size (4K, 2M, 1G). This requires determining the size of the
> mapping encompassing a given pfn to know how much to unmap. Since dax
> mapped pfns don't come from the page allocator we need to read the
> page size from the page tables, not compound_order(page).

OK, but my question is still. Do we really want to do more on top of the
existing code and add even more special casing or it is time to rethink
the whole hwpoison design?
-- 
Michal Hocko
SUSE Labs