Re: [PATCH 3/3] ndctl: add filtering based on numa_node

2018-03-07 Thread Ross Zwisler
On Wed, Mar 07, 2018 at 10:36:47AM -0800, Dan Williams wrote:
> > @@ -342,6 +372,10 @@ int util_filter_walk(struct ndctl_ctx *ctx, struct 
> > util_filter_ctx *fctx,
> > param->namespace))
> > continue;
> >
> > +   if (numa_node != -1 &&
> > +   ndctl_region_get_numa_node(region) != numa_node)
> > +   continue;
> 
> Maybe "numa_node >= 0", or "#define NUMA_NO_NODE  (-1)" and use that?

Yea, that does make it easier to read.  Thanks for the review.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH 1/3] ndctl: don't print erroneous namespace numa_nodes

2018-03-07 Thread Ross Zwisler
If the kernel has CONFIG_NUMA unset namespaces in sysfs will lack a
numa_node attribute.  In such cases ndctl will report a value of 0 for the
namespace numa_node in 'ndctl list'.  Instead of reporting potentially bad
data just hide the numa_node field if it is unsupported.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Fixes: commit f7d3de80a121 ("ndctl: support machines without numa")
---
 ndctl/lib/libndctl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
index ed5a65b..b7180e8 100644
--- a/ndctl/lib/libndctl.c
+++ b/ndctl/lib/libndctl.c
@@ -3008,6 +3008,8 @@ static void *add_namespace(void *parent, int id, const 
char *ndns_base)
sprintf(path, "%s/numa_node", ndns_base);
if (sysfs_read_attr(ctx, path, buf) == 0)
ndns->numa_node = strtol(buf, NULL, 0);
+   else
+   ndns->numa_node = -1;
 
sprintf(path, "%s/holder_class", ndns_base);
if (sysfs_read_attr(ctx, path, buf) == 0)
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH 3/3] ndctl: add filtering based on numa_node

2018-03-07 Thread Ross Zwisler
Add support to 'ndctl list' so that we can filter DIMMs, regions and
namespaces based on numa node.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 Documentation/ndctl/ndctl-list.txt |  4 
 ndctl/list.c   |  2 ++
 util/filter.c  | 38 --
 util/filter.h  |  1 +
 4 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/Documentation/ndctl/ndctl-list.txt 
b/Documentation/ndctl/ndctl-list.txt
index 02d4f04..c711c1f 100644
--- a/Documentation/ndctl/ndctl-list.txt
+++ b/Documentation/ndctl/ndctl-list.txt
@@ -83,6 +83,10 @@ include::xable-region-options.txt[]
Filter listing by the mode ('raw', 'fsdax', 'sector' or 'devdax')
of the namespace(s).
 
+-U::
+--numa_node=::
+   Filter listing by numa node
+
 -B::
 --buses::
Include bus info in the listing
diff --git a/ndctl/list.c b/ndctl/list.c
index 4cb66de..e861f8b 100644
--- a/ndctl/list.c
+++ b/ndctl/list.c
@@ -400,6 +400,8 @@ int cmd_list(int argc, const char **argv, void *ctx)
"filter by namespace mode"),
OPT_STRING('t', "type", , "region-type",
"filter by region-type"),
+   OPT_STRING('U', "numa_node", _node, "numa node",
+   "filter by numa node"),
OPT_BOOLEAN('B', "buses", , "include bus info"),
OPT_BOOLEAN('D', "dimms", , "include dimm info"),
OPT_BOOLEAN('F', "firmware", , "include firmware 
info"),
diff --git a/util/filter.c b/util/filter.c
index b0b7fdf..85630ca 100644
--- a/util/filter.c
+++ b/util/filter.c
@@ -146,7 +146,6 @@ struct ndctl_bus *util_bus_filter_by_region(struct 
ndctl_bus *bus,
return NULL;
 }
 
-
 struct ndctl_bus *util_bus_filter_by_namespace(struct ndctl_bus *bus,
const char *ident)
 {
@@ -223,6 +222,25 @@ struct ndctl_dimm *util_dimm_filter_by_namespace(struct 
ndctl_dimm *dimm,
return NULL;
 }
 
+struct ndctl_dimm *util_dimm_filter_by_numa_node(struct ndctl_dimm *dimm,
+   int numa_node)
+{
+   struct ndctl_bus *bus = ndctl_dimm_get_bus(dimm);
+   struct ndctl_region *region;
+   struct ndctl_dimm *check;
+
+   if (numa_node == -1)
+   return dimm;
+
+   ndctl_region_foreach(bus, region)
+   ndctl_dimm_foreach_in_region(region, check)
+   if (check == dimm &&
+   ndctl_region_get_numa_node(region) == numa_node)
+   return dimm;
+
+   return NULL;
+}
+
 struct ndctl_region *util_region_filter_by_namespace(struct ndctl_region 
*region,
const char *ident)
 {
@@ -285,6 +303,8 @@ int util_filter_walk(struct ndctl_ctx *ctx, struct 
util_filter_ctx *fctx,
 {
struct ndctl_bus *bus;
unsigned int type = 0;
+   int numa_node = -1;
+   char *end = NULL;
 
if (param->type && (strcmp(param->type, "pmem") != 0
&& strcmp(param->type, "blk") != 0)) {
@@ -305,6 +325,14 @@ int util_filter_walk(struct ndctl_ctx *ctx, struct 
util_filter_ctx *fctx,
return -EINVAL;
}
 
+   if (param->numa_node && strcmp(param->numa_node, "all") != 0) {
+   numa_node = strtol(param->numa_node, , 0);
+   if (end == param->numa_node || end[0]) {
+   error("invalid numa_node: '%s'\n", param->numa_node);
+   return -EINVAL;
+   }
+   }
+
ndctl_bus_foreach(ctx, bus) {
struct ndctl_region *region;
struct ndctl_dimm *dimm;
@@ -326,7 +354,9 @@ int util_filter_walk(struct ndctl_ctx *ctx, struct 
util_filter_ctx *fctx,
|| !util_dimm_filter_by_region(dimm,
param->region)
|| !util_dimm_filter_by_namespace(dimm,
-   param->namespace))
+   param->namespace)
+   || !util_dimm_filter_by_numa_node(dimm,
+   numa_node))
continue;
 
fctx->filter_dimm(dimm, fctx);
@@ -342,6 +372,10 @@ int util_filter_walk(struct ndctl_ctx *ctx, struct 
util_filter_ctx *fctx,
param->namespace))
continue;
 
+   if (numa_node != -1 &&
+   ndctl_region_get_numa_node(region) != numa_node)
+ 

[PATCH 2/3] ndctl: add numa_node support for regions

2018-03-07 Thread Ross Zwisler
Add an API for getting the numa node of a given region and add a numa_node
field to the "ndctl list" region output.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 ndctl/lib/libndctl.c   | 13 +
 ndctl/lib/libndctl.sym |  1 +
 ndctl/libndctl.h   |  1 +
 ndctl/list.c   |  8 
 4 files changed, 23 insertions(+)

diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
index b7180e8..a165e69 100644
--- a/ndctl/lib/libndctl.c
+++ b/ndctl/lib/libndctl.c
@@ -132,6 +132,7 @@ struct ndctl_mapping {
  * @type_name: 'pmem' or 'block'
  * @generation: incremented everytime the region is disabled
  * @nstype: the resulting type of namespace this region produces
+ * @numa_node: numa node attribute
  *
  * A region may alias between pmem and block-window access methods.  The
  * region driver is tasked with parsing the label (if their is one) and
@@ -157,6 +158,7 @@ struct ndctl_region {
char *region_buf;
int buf_len;
int generation;
+   int numa_node;
struct list_head btts;
struct list_head pfns;
struct list_head daxs;
@@ -1808,6 +1810,12 @@ static void *add_region(void *parent, int id, const char 
*region_base)
goto err_read;
region->module = to_module(ctx, buf);
 
+   sprintf(path, "%s/numa_node", region_base);
+   if (sysfs_read_attr(ctx, path, buf) == 0)
+   region->numa_node = strtol(buf, NULL, 0);
+   else
+   region->numa_node = -1;
+
if (region_set_type(region, path) < 0)
goto err_read;
 
@@ -2011,6 +2019,11 @@ NDCTL_EXPORT struct ndctl_dimm 
*ndctl_region_get_next_dimm(struct ndctl_region *
return NULL;
 }
 
+NDCTL_EXPORT int ndctl_region_get_numa_node(struct ndctl_region *region)
+{
+   return region->numa_node;
+}
+
 static int regions_badblocks_init(struct ndctl_region *region)
 {
struct ndctl_ctx *ctx = ndctl_region_get_ctx(region);
diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
index af9b7d5..175a482 100644
--- a/ndctl/lib/libndctl.sym
+++ b/ndctl/lib/libndctl.sym
@@ -344,4 +344,5 @@ global:
ndctl_cmd_fw_fquery_get_fw_rev;
ndctl_cmd_fw_xlat_firmware_status;
ndctl_dimm_cmd_new_ack_shutdown_count;
+   ndctl_region_get_numa_node;
 } LIBNDCTL_13;
diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
index 9db775b..017c9cc 100644
--- a/ndctl/libndctl.h
+++ b/ndctl/libndctl.h
@@ -340,6 +340,7 @@ struct ndctl_ctx *ndctl_region_get_ctx(struct ndctl_region 
*region);
 struct ndctl_dimm *ndctl_region_get_first_dimm(struct ndctl_region *region);
 struct ndctl_dimm *ndctl_region_get_next_dimm(struct ndctl_region *region,
struct ndctl_dimm *dimm);
+int ndctl_region_get_numa_node(struct ndctl_region *region);
 struct ndctl_region *ndctl_bus_get_region_by_physical_address(struct ndctl_bus 
*bus,
unsigned long long address);
 #define ndctl_dimm_foreach_in_region(region, dimm) \
diff --git a/ndctl/list.c b/ndctl/list.c
index af1c024..4cb66de 100644
--- a/ndctl/list.c
+++ b/ndctl/list.c
@@ -73,6 +73,7 @@ static struct json_object *region_to_json(struct ndctl_region 
*region,
struct ndctl_interleave_set *iset;
struct ndctl_mapping *mapping;
unsigned int bb_count = 0;
+   int numa;
 
if (!jregion)
return NULL;
@@ -107,6 +108,13 @@ static struct json_object *region_to_json(struct 
ndctl_region *region,
goto err;
json_object_object_add(jregion, "type", jobj);
 
+   numa = ndctl_region_get_numa_node(region);
+   if (numa >= 0) {
+   jobj = json_object_new_int(numa);
+   if (jobj)
+   json_object_object_add(jregion, "numa_node", jobj);
+   }
+
iset = ndctl_region_get_interleave_set(region);
if (iset) {
jobj = util_json_object_hex(
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] libnvdimm: remove redundant __func__ in dev_dbg

2018-03-06 Thread Ross Zwisler
On Tue, Mar 06, 2018 at 08:59:11AM -0800, Dan Williams wrote:
> On Mon, Mar 5, 2018 at 7:54 PM, Ross Zwisler
> <ross.zwis...@linux.intel.com> wrote:
> > On Mon, Mar 05, 2018 at 05:09:21PM -0800, Dan Williams wrote:
> >> Dynamic debug can be instructed to add the function name to the debug
> >> output using the +f switch, so there is no need for the libnvdimm
> >> modules to do it again. If a user decides to add the +f switch for
> >> libnvdimm's dynamic debug this results in double prints of the function
> >> name.
> >>
> >> Reported-by: Johannes Thumshirn <jthumsh...@suse.de>
> >> Reported-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> >> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
> >> ---
> >>  drivers/nvdimm/badrange.c   |3 +-
> >>  drivers/nvdimm/btt_devs.c   |   21 
> >>  drivers/nvdimm/bus.c|   13 +-
> >>  drivers/nvdimm/claim.c  |2 +-
> >>  drivers/nvdimm/core.c   |4 ++-
> >>  drivers/nvdimm/dax_devs.c   |5 ++--
> >>  drivers/nvdimm/dimm_devs.c  |7 ++---
> >>  drivers/nvdimm/label.c  |   51 
> >> ++-
> >>  drivers/nvdimm/namespace_devs.c |   38 -
> >>  drivers/nvdimm/pfn_devs.c   |   25 +--
> >>  drivers/nvdimm/pmem.c   |2 +-
> >>  11 files changed, 77 insertions(+), 94 deletions(-)
> >>
> >> diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
> >> index e068d72b4357..df17f1cd696d 100644
> >> --- a/drivers/nvdimm/badrange.c
> >> +++ b/drivers/nvdimm/badrange.c
> >> @@ -176,8 +176,7 @@ static void set_badblock(struct badblocks *bb, 
> >> sector_t s, int num)
> >>   (u64) s * 512, (u64) num * 512);
> >>   /* this isn't an error as the hardware will still throw an exception 
> >> */
> >>   if (badblocks_set(bb, s, num, 1))
> >> - dev_info_once(bb->dev, "%s: failed for sector %llx\n",
> >> - __func__, (u64) s);
> >> + dev_info_once(bb->dev, "failed for sector %llx\n", (u64) s);
> >
> > I don't think you should remove this one.  dev_info_once() is just a 
> > printk(),
> > and doesn't inherit the +f flag from the dynamic debugging code. The 
> > __func__
> > here does add value.
> >
> > The rest of these look correct, though I think you missed one in each of
> > nvdimm_map_release()
> 
> This one is now fixed.
> 
> > and validate_dimm().  (I made these changes as well, but
> > you sent out your patch first. :)
> 
> The validate_dimm() one is printing the callee function one level up
> in the call chain, so it's fine as is.

Sounds good.  you can add:

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] acpi, nfit: remove redundant __func__ in dev_dbg

2018-03-05 Thread Ross Zwisler
On Fri, Mar 02, 2018 at 01:20:49PM +0100, Johannes Thumshirn wrote:
> Dynamic debug can be instructed to add the function name to the debug
> output using the +f switch, so there is no need for the nfit module to
> do it again. If a user decides to add the +f switch for nfit's dynamic
> debug this results in double prints of the function name like the
> following:
> 
> [ 2391.935383] acpi_nfit_ctl: nfit ACPI0012:00: acpi_nfit_ctl:nmem8 cmd: 10: 
> func: 1 input length: 0
> 
> Thus remove the stray __func__ printing.
> 
> Signed-off-by: Johannes Thumshirn 

Oh, Johannes I noticed that here is one stray one still in
drivers/acpi/nfit/mce.c.  Do you mind pulling it into your patch to keep the
drivers/acpi/nfit/* changes together?
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] device-dax: remove redundant __func__ in dev_dbg

2018-03-05 Thread Ross Zwisler
On Mon, Mar 05, 2018 at 05:09:32PM -0800, Dan Williams wrote:
> Dynamic debug can be instructed to add the function name to the debug
> output using the +f switch, so there is no need for the dax modules to
> do it again. If a user decides to add the +f switch for the dax modules'
> dynamic debug this results in double prints of the function name.
> 
> Reported-by: Johannes Thumshirn <jthumsh...@suse.de>
> Reported-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>

Looks good to me.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] libnvdimm: remove redundant __func__ in dev_dbg

2018-03-05 Thread Ross Zwisler
On Mon, Mar 05, 2018 at 05:09:21PM -0800, Dan Williams wrote:
> Dynamic debug can be instructed to add the function name to the debug
> output using the +f switch, so there is no need for the libnvdimm
> modules to do it again. If a user decides to add the +f switch for
> libnvdimm's dynamic debug this results in double prints of the function
> name.
> 
> Reported-by: Johannes Thumshirn <jthumsh...@suse.de>
> Reported-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
> ---
>  drivers/nvdimm/badrange.c   |3 +-
>  drivers/nvdimm/btt_devs.c   |   21 
>  drivers/nvdimm/bus.c|   13 +-
>  drivers/nvdimm/claim.c  |2 +-
>  drivers/nvdimm/core.c   |4 ++-
>  drivers/nvdimm/dax_devs.c   |5 ++--
>  drivers/nvdimm/dimm_devs.c  |7 ++---
>  drivers/nvdimm/label.c  |   51 
> ++-
>  drivers/nvdimm/namespace_devs.c |   38 -
>  drivers/nvdimm/pfn_devs.c   |   25 +--
>  drivers/nvdimm/pmem.c   |2 +-
>  11 files changed, 77 insertions(+), 94 deletions(-)
> 
> diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
> index e068d72b4357..df17f1cd696d 100644
> --- a/drivers/nvdimm/badrange.c
> +++ b/drivers/nvdimm/badrange.c
> @@ -176,8 +176,7 @@ static void set_badblock(struct badblocks *bb, sector_t 
> s, int num)
>   (u64) s * 512, (u64) num * 512);
>   /* this isn't an error as the hardware will still throw an exception */
>   if (badblocks_set(bb, s, num, 1))
> - dev_info_once(bb->dev, "%s: failed for sector %llx\n",
> - __func__, (u64) s);
> + dev_info_once(bb->dev, "failed for sector %llx\n", (u64) s);

I don't think you should remove this one.  dev_info_once() is just a printk(),
and doesn't inherit the +f flag from the dynamic debugging code. The __func__
here does add value.

The rest of these look correct, though I think you missed one in each of
nvdimm_map_release() and validate_dimm().  (I made these changes as well, but
you sent out your patch first. :)
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] acpi, nfit: remove redundant __func__ in dev_dbg

2018-03-05 Thread Ross Zwisler
On Fri, Mar 02, 2018 at 01:20:49PM +0100, Johannes Thumshirn wrote:
> Dynamic debug can be instructed to add the function name to the debug
> output using the +f switch, so there is no need for the nfit module to
> do it again. If a user decides to add the +f switch for nfit's dynamic
> debug this results in double prints of the function name like the
> following:
> 
> [ 2391.935383] acpi_nfit_ctl: nfit ACPI0012:00: acpi_nfit_ctl:nmem8 cmd: 10: 
> func: 1 input length: 0
> 
> Thus remove the stray __func__ printing.
> 
> Signed-off-by: Johannes Thumshirn <jthumsh...@suse.de>

I looked through this and this all looked right to me.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 2 2/2] xfs: fix rt_dev usage for DAX

2018-03-05 Thread Ross Zwisler
On Tue, Feb 06, 2018 at 03:19:15PM -0800, Darrick J. Wong wrote:
<>
> The last time I paid much attention to DAX was the thread "re-enable XFS
> per-inode DAX"[1] last September.  Motivating me to merge anything else
> into DAX involves convincing me that we (mm, fs, dax developers) have
> some kind of agreement about what we want the user-visible interfaces to
> DAX to look like.  

Yep, I agree that is the next step.

> Namely:
> 
> 0. On what level do we allow users / administrators to control usage of
> the dax paths?  Can the hardware convey enough detail to the kernel that
> the kernel can make a reasonable decision on its own whether buffered or
> dax io make more sense?  If so, can we please just have that?  If not,
> why?

Maybe eventually via the HMAT, but I don't think we have any systems today
that do a good job of this.

> 1. If we want to let users override whatever decision the kernel makes,
> how should we do this?  One mount option that applies to everything,
> like ext4?  Inheritable inode flags, like xfs?  Do we have one to force
> it on even if the kernel doesn't want to?  Do we have another to force
> it off even if the kernel wants to?  Do we even want to go down this
> path?  Can we get away with making the answer to Q0 "yes" and then see
> if anyone actually complains about not having fine-grained control?

I agree with Dan's assessment that even if we can make the kernel smart enough
to know when it's not a performance loss to use DAX (i.e. the persistent
memory you're using DAX on is just as fast as the page cache), users will
probably still want to retain the ability to force it on for use cases like
MAP_SYNC, and force it off for things like RDMA or VFIO, at least until the
page pinning work is complete.

Personally I'm still hopeful that we can have both the mount option and the
inheritable inode flags, and that we can figure out what we need to to get
S_DAX transitions happening again.

> 2. Under what conditions can we support dynamic changing of S_DAX on
> inodes at runtime?  Will this switching work at any time?  Only for
> files that are open but not mmap'd?  Only for files that are empty?
>
> 3. The MAP_SYNC support that was merged into 4.15 -- is this sufficient
> to allow this fsyncless clflush business that everyone seems to want?

Yep, I think so.  The next big battles are S_DAX transitions, per-inode DAX
support, and of course the page pinning / leases code that Dan & Christoph
have been talking about.

> 4. Can someone please fix the XFS iomap_begin function to handle CoW
> properly?  I think it's a simple matter of allocate blocks, memcpy, and
> remap, though I don't know how to do that. ;)
> 
> 5. Do we test any of this stuff?

Yes, I think in general we do a pretty good job of DAX test case coverage
between a combination of xfstests (which I have added to as I've fixed DAX
related bugs), nfit_test and the ndctl unit tests.  hch has recently suggested
we start using blktests as well, though I don't think we've actually made any
new tests there yet.  Suggestions on how we can get better test coverage are
welcome.

> The thread from last September left off with promises to go define what
> interface and behaviors we are providing to userspace, but afaict none
> of that ever happened?  If we don't resolve these questions before LSF
> then I think what's needed is to lock everyone in a room to hash all
> this out. :P

Yep, that's accurate.  I got pulled off onto other work and am just now
finding my way back.  I think talking about it at LSF sounds great, but it's a
shame that hch won't be available.  It'll be nice to finally meet dchinner,
though. :)

> --D
> 
> PS: My personal inclination is {yes, get rid of all that until someone
> complains, i think so but haven't tested it, ???, i sure hope so}.
> 
> [1] https://marc.info/?l=linux-xfs=150638135225793=2
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 1/3] nfit_test: improve structure offset handling

2018-03-05 Thread Ross Zwisler
On Tue, Feb 27, 2018 at 10:29:50AM -0700, Ross Zwisler wrote:
> In nfit_test0_setup() and nfit_test1_setup() we keep an 'offset' value
> which we use to calculate where in our 'nfit_buf' we will place our next
> structure.  The handling of 'offset' and the calculation of the placement
> of the next structure is a bit inconsistent, though.  We don't update
> 'offset' after we insert each structure, sometimes causing us to update it
> for multiple structures' sizes at once.  When calculating the position of
> the next structure we aren't always able to just use 'offset', but
> sometimes have to add in other structure sizes as well.
> 
> Fix this by updating 'offset' after each structure insertion in a
> consistent way, allowing us to always calculate the position of the next
> structure to be inserted by just using 'nfit_buf + offset'.
> 
> Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>

Ping on this series.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] acpi, nfit: remove redundant __func__ in dev_dbg

2018-03-05 Thread Ross Zwisler
On Mon, Mar 05, 2018 at 09:53:47AM -0800, Dan Williams wrote:
> On Mon, Mar 5, 2018 at 9:49 AM, Ross Zwisler
> <ross.zwis...@linux.intel.com> wrote:
> > On Fri, Mar 02, 2018 at 01:20:49PM +0100, Johannes Thumshirn wrote:
> >> Dynamic debug can be instructed to add the function name to the debug
> >> output using the +f switch, so there is no need for the nfit module to
> >> do it again. If a user decides to add the +f switch for nfit's dynamic
> >> debug this results in double prints of the function name like the
> >> following:
> >>
> >> [ 2391.935383] acpi_nfit_ctl: nfit ACPI0012:00: acpi_nfit_ctl:nmem8 cmd: 
> >> 10: func: 1 input length: 0
> >>
> >> Thus remove the stray __func__ printing.
> >>
> >> Signed-off-by: Johannes Thumshirn <jthumsh...@suse.de>
> >
> > This makes sense to me, but I guess we'll see what Dan thinks.  If we decide
> > to go this route we may also want to do the same to all the instances of 
> > this
> > pattern in the libnvdimm debug output.
> 
> Is there a way to turn on this '+f' flag from the kernel boot command
> line? My primary debug scenario is specifying "libnvdimm.dyndbg
> nfit.dyndbg" on the command line.

Yep, it's just:

libnvdimm.dyndbg="+fp"
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] acpi, nfit: remove redundant __func__ in dev_dbg

2018-03-05 Thread Ross Zwisler
On Fri, Mar 02, 2018 at 01:20:49PM +0100, Johannes Thumshirn wrote:
> Dynamic debug can be instructed to add the function name to the debug
> output using the +f switch, so there is no need for the nfit module to
> do it again. If a user decides to add the +f switch for nfit's dynamic
> debug this results in double prints of the function name like the
> following:
> 
> [ 2391.935383] acpi_nfit_ctl: nfit ACPI0012:00: acpi_nfit_ctl:nmem8 cmd: 10: 
> func: 1 input length: 0
> 
> Thus remove the stray __func__ printing.
> 
> Signed-off-by: Johannes Thumshirn 

This makes sense to me, but I guess we'll see what Dan thinks.  If we decide
to go this route we may also want to do the same to all the instances of this
pattern in the libnvdimm debug output.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 3/5] acpi, nfit: Add function to look up nvdimm device and provide SMBIOS handle

2018-02-28 Thread Ross Zwisler
On Thu, Feb 22, 2018 at 11:58:09AM -0800, Tony Luck wrote:
> EDAC driver needs to look up attributes of NVDIMMs provided in SMBIOS.
> 
> Provide a function that looks up an acpi_nfit_memory_map from a device
> handle (node/socket/mc/channel/dimm) and returns the SMBIOS handle.
> Also pass back the "flags" so we can see if the NVDIMM is OK.
> 
> Signed-off-by: Tony Luck 
> ---
<>
> diff --git a/include/acpi/nfit.h b/include/acpi/nfit.h
> new file mode 100644
> index ..6ccc6eacd855
> --- /dev/null
> +++ b/include/acpi/nfit.h
> @@ -0,0 +1,26 @@
> +/*
> + * Copyright(c) 2017 Intel Corporation. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of version 2 of the GNU General Public License as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + */

Just a quick nit - I think we're supposed to use SPDX license identifiers for
new files?
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH 3/3] nfit_test: prevent parsing error of nfit_test.0

2018-02-27 Thread Ross Zwisler
When you load nfit_test you currently see the following error in dmesg:

 nfit_test nfit_test.0: found a zero length table '0' parsing nfit

This happens because when we parse the nfit_test.0 table via
acpi_nfit_init(), we specify a size of nfit_test->nfit_size.  For the first
pass through nfit_test.0 where (t->setup_hotplug == 0) this is the size of
the entire buffer we allocated, including space for the hot plug
structures, not the size that we've actually filled in.

Fix this by only trying to parse the size of the structures that we've
filled in.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index fcd233342273..d785235ba04e 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -154,6 +154,7 @@ struct nfit_test {
void *nfit_buf;
dma_addr_t nfit_dma;
size_t nfit_size;
+   size_t nfit_filled;
int dcr_idx;
int num_dcr;
int num_pm;
@@ -2053,6 +2054,8 @@ static void nfit_test0_setup(struct nfit_test *t)
WARN_ON(offset != t->nfit_size);
}
 
+   t->nfit_filled = offset;
+
post_ars_status(>ars_state, >badrange, t->spa_set_dma[0],
SPA0_SIZE);
 
@@ -2172,6 +2175,8 @@ static void nfit_test1_setup(struct nfit_test *t)
/* sanity check to make sure we've filled the buffer */
WARN_ON(offset != t->nfit_size);
 
+   t->nfit_filled = offset;
+
post_ars_status(>ars_state, >badrange, t->spa_set_dma[0],
SPA2_SIZE);
 
@@ -2529,7 +2534,7 @@ static int nfit_test_probe(struct platform_device *pdev)
nd_desc->ndctl = nfit_test_ctl;
 
rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_buf,
-   nfit_test->nfit_size);
+   nfit_test->nfit_filled);
if (rc)
return rc;
 
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH 2/3] nfit_test: fix buffer overrun, add sanity check

2018-02-27 Thread Ross Zwisler
It turns out that we were overrunning the 'nfit_buf' buffer in
nfit_test0_setup() in the (t->setup_hotplug == 1) case because we failed to
correctly account for all of the acpi_nfit_memory_map structures.

Fix the structure count which will increase the allocation size of
'nfit_buf' in nfit_test0_alloc().  Also add some WARN_ON()s to
nfit_test0_setup() and nfit_test1_setup() to catch future issues where the
size of the buffer doesn't match the amount of data we're writing.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 1376fc95c33a..fcd233342273 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -104,7 +104,8 @@ enum {
NUM_HINTS = 8,
NUM_BDW = NUM_DCR,
NUM_SPA = NUM_PM + NUM_DCR + NUM_BDW,
-   NUM_MEM = NUM_DCR + NUM_BDW + 2 /* spa0 iset */ + 4 /* spa1 iset */,
+   NUM_MEM = NUM_DCR + NUM_BDW + 2 /* spa0 iset */
+   + 4 /* spa1 iset */ + 1 /* spa11 iset */,
DIMM_SIZE = SZ_32M,
LABEL_SIZE = SZ_128K,
SPA_VCD_SIZE = SZ_4M,
@@ -2047,6 +2048,9 @@ static void nfit_test0_setup(struct nfit_test *t)
flush->hint_address[i] = t->flush_dma[4]
+ i * sizeof(u64);
offset += flush->header.length;
+
+   /* sanity check to make sure we've filled the buffer */
+   WARN_ON(offset != t->nfit_size);
}
 
post_ars_status(>ars_state, >badrange, t->spa_set_dma[0],
@@ -2165,6 +2169,9 @@ static void nfit_test1_setup(struct nfit_test *t)
dcr->windows = 0;
offset += dcr->header.length;
 
+   /* sanity check to make sure we've filled the buffer */
+   WARN_ON(offset != t->nfit_size);
+
post_ars_status(>ars_state, >badrange, t->spa_set_dma[0],
SPA2_SIZE);
 
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH 1/3] nfit_test: improve structure offset handling

2018-02-27 Thread Ross Zwisler
In nfit_test0_setup() and nfit_test1_setup() we keep an 'offset' value
which we use to calculate where in our 'nfit_buf' we will place our next
structure.  The handling of 'offset' and the calculation of the placement
of the next structure is a bit inconsistent, though.  We don't update
'offset' after we insert each structure, sometimes causing us to update it
for multiple structures' sizes at once.  When calculating the position of
the next structure we aren't always able to just use 'offset', but
sometimes have to add in other structure sizes as well.

Fix this by updating 'offset' after each structure insertion in a
consistent way, allowing us to always calculate the position of the next
structure to be inserted by just using 'nfit_buf + offset'.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 183 +++
 1 file changed, 109 insertions(+), 74 deletions(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 620fa78b3b1b..1376fc95c33a 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1366,7 +1366,7 @@ static void nfit_test0_setup(struct nfit_test *t)
struct acpi_nfit_data_region *bdw;
struct acpi_nfit_flush_address *flush;
struct acpi_nfit_capabilities *pcap;
-   unsigned int offset, i;
+   unsigned int offset = 0, i;
 
/*
 * spa0 (interleave first half of dimm0 and dimm1, note storage
@@ -1380,93 +1380,102 @@ static void nfit_test0_setup(struct nfit_test *t)
spa->range_index = 0+1;
spa->address = t->spa_set_dma[0];
spa->length = SPA0_SIZE;
+   offset += spa->header.length;
 
/*
 * spa1 (interleave last half of the 4 DIMMS, note storage
 * does not actually alias the related block-data-window
 * regions)
 */
-   spa = nfit_buf + sizeof(*spa);
+   spa = nfit_buf + offset;
spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
spa->header.length = sizeof(*spa);
memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16);
spa->range_index = 1+1;
spa->address = t->spa_set_dma[1];
spa->length = SPA1_SIZE;
+   offset += spa->header.length;
 
/* spa2 (dcr0) dimm0 */
-   spa = nfit_buf + sizeof(*spa) * 2;
+   spa = nfit_buf + offset;
spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
spa->header.length = sizeof(*spa);
memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
spa->range_index = 2+1;
spa->address = t->dcr_dma[0];
spa->length = DCR_SIZE;
+   offset += spa->header.length;
 
/* spa3 (dcr1) dimm1 */
-   spa = nfit_buf + sizeof(*spa) * 3;
+   spa = nfit_buf + offset;
spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
spa->header.length = sizeof(*spa);
memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
spa->range_index = 3+1;
spa->address = t->dcr_dma[1];
spa->length = DCR_SIZE;
+   offset += spa->header.length;
 
/* spa4 (dcr2) dimm2 */
-   spa = nfit_buf + sizeof(*spa) * 4;
+   spa = nfit_buf + offset;
spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
spa->header.length = sizeof(*spa);
memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
spa->range_index = 4+1;
spa->address = t->dcr_dma[2];
spa->length = DCR_SIZE;
+   offset += spa->header.length;
 
/* spa5 (dcr3) dimm3 */
-   spa = nfit_buf + sizeof(*spa) * 5;
+   spa = nfit_buf + offset;
spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
spa->header.length = sizeof(*spa);
memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
spa->range_index = 5+1;
spa->address = t->dcr_dma[3];
spa->length = DCR_SIZE;
+   offset += spa->header.length;
 
/* spa6 (bdw for dcr0) dimm0 */
-   spa = nfit_buf + sizeof(*spa) * 6;
+   spa = nfit_buf + offset;
spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
spa->header.length = sizeof(*spa);
memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
spa->range_index = 6+1;
spa->address = t->dimm_dma[0];
spa->length = DIMM_SIZE;
+   offset += spa->header.length;
 
/* spa7 (bdw for dcr1) dimm1 */
-   spa = nfit_buf + sizeof(*spa) * 7;
+   spa = nfit_buf + offset;
spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
spa->header.length = sizeof(*spa);
memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
spa->range_index = 7+1;
spa->address = t->dimm_dma[1];
spa->length = DIMM_SIZE;
+   offset += spa->header.length

Re: [ndctl PATCH] ndctl, test: kill usage of fallocate in firmware-update.sh

2018-02-23 Thread Ross Zwisler
On Fri, Feb 23, 2018 at 01:45:40PM -0800, Dan Williams wrote:
> On Fri, Feb 23, 2018 at 12:58 PM, Ross Zwisler
> <ross.zwis...@linux.intel.com> wrote:
> > On Thu, Feb 22, 2018 at 10:59:22PM -0800, Dan Williams wrote:
> >> The 'fallocate -l 196608 $image' step in the test fails when $image is
> >> on an NFS mount. Use dd instead to create a sparse file. We do not need
> >> to allocate anything since we are only writing zeros.
> >>
> >> Cc: Dave Jiang <dave.ji...@intel.com>
> >> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
> >> ---
> >>  test/firmware-update.sh |2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/test/firmware-update.sh b/test/firmware-update.sh
> >> index 0d5bcdb3cc42..173647218c28 100755
> >> --- a/test/firmware-update.sh
> >> +++ b/test/firmware-update.sh
> >> @@ -49,7 +49,7 @@ detect()
> >>
> >>  do_tests()
> >>  {
> >> - fallocate -l 196608 $image
> >> + dd if=/dev/zero of=$image bs=1 count=1 skip=196607
> >>   $ndctl update-firmware -d $dev -f $image
> >>  }
> >
> > Hmm, I'm not seeing this failure in my NFS based setup.  Out of curiosity, 
> > do
> > you know why it's failing?  Some difference in our NFS configs?
> 
> Probably, here are my mount options:
> 
> root on / type nfs4
> (rw,relatime,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=192.168.100.127,local_lock=none,addr=192.168.100.1)
> 
> > Anyway, this seems fine, but
> >
> > fallocate -l 196608 $image
> >
> > does the same thing and seems a little simpler, IMO.
> 
> Not simpler if it randomly fails depending on the filesystem, and
> there is no need to allocate that space since we're just creating a
> file full of zeros.

sorry, I meant "truncate -s 196608 $image".

Anyway, this is fine with either dd or truncate.   I agree that not actually
reserving filesystem space is a win.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[ndctl PATCH] ndctl, tests: firmware-update.sh post-test cleanup

2018-02-23 Thread Ross Zwisler
When firmware-update.sh is run multiple times by itself, all runs but the
first fail with this:

+ ../ndctl/ndctl update-firmware -d nmem1 -f update-fw.img
  Error: START FIRMWARE UPDATE failed: 0x9

  Error: Another firmware upload in progress or finished.

Fix this by having the test properly clean up and disable all the nfit_test
regions and unloading the nfit_test module, as we do in test/pmem-errors.sh
et al.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 test/firmware-update.sh | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/test/firmware-update.sh b/test/firmware-update.sh
index 1736472..44c1412 100755
--- a/test/firmware-update.sh
+++ b/test/firmware-update.sh
@@ -6,6 +6,7 @@
 [ -f "./ndctl/ndctl" ] && [ -x "./ndctl/ndctl" ] && ndctl="./ndctl/ndctl"
 [ -z "$ndctl" ] && echo "Couldn't find an ndctl binary" && exit 1
 bus="nfit_test.0"
+bus1="nfit_test.1"
 json2var="s/[{}\",]//g; s/:/=/g"
 rc=77
 dev=""
@@ -41,6 +42,13 @@ reset()
fi
 }
 
+cleanup()
+{
+   $ndctl disable-region -b "$bus" all
+   $ndctl disable-region -b "$bus1" all
+   modprobe -r nfit_test
+}
+
 detect()
 {
dev=$($ndctl list -b "$bus" -D | jq .[0].dev | tr -d '"')
@@ -59,6 +67,5 @@ rc=1
 reset
 detect
 do_tests
-reset
+cleanup
 exit 0
-
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [ndctl PATCH] ndctl, test: skip btt-pad compat test on pre-4K capable kernels

2018-02-23 Thread Ross Zwisler
On Thu, Feb 22, 2018 at 10:59:17PM -0800, Dan Williams wrote:
> Prior to the introduction of v1.2 namespace labels it was not possible
> to specify a pmem namespace with a 4K sector size. Skip this test when
> that namespace creation attempt fails.
> 
> Cc: Vishal Verma <vishal.l.ve...@intel.com>
> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>

Sure, seems fine.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [ndctl PATCH] ndctl, test: kill usage of fallocate in firmware-update.sh

2018-02-23 Thread Ross Zwisler
On Thu, Feb 22, 2018 at 10:59:22PM -0800, Dan Williams wrote:
> The 'fallocate -l 196608 $image' step in the test fails when $image is
> on an NFS mount. Use dd instead to create a sparse file. We do not need
> to allocate anything since we are only writing zeros.
> 
> Cc: Dave Jiang 
> Signed-off-by: Dan Williams 
> ---
>  test/firmware-update.sh |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/test/firmware-update.sh b/test/firmware-update.sh
> index 0d5bcdb3cc42..173647218c28 100755
> --- a/test/firmware-update.sh
> +++ b/test/firmware-update.sh
> @@ -49,7 +49,7 @@ detect()
>  
>  do_tests()
>  {
> - fallocate -l 196608 $image
> + dd if=/dev/zero of=$image bs=1 count=1 skip=196607
>   $ndctl update-firmware -d $dev -f $image
>  }

Hmm, I'm not seeing this failure in my NFS based setup.  Out of curiosity, do
you know why it's failing?  Some difference in our NFS configs?

Anyway, this seems fine, but 

fallocate -l 196608 $image

does the same thing and seems a little simpler, IMO.

- Ross
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [ndctl PATCH] ndctl, firmware-update: kill usage of flock() in verify_fw_file()

2018-02-23 Thread Ross Zwisler
On Thu, Feb 22, 2018 at 10:59:27PM -0800, Dan Williams wrote:
> It serves no purpose, we never explicitly unlock it, and it causes
> needless failures if the firmware file happens to be on a filesystem
> that does not support file locks.
> 
> Cc: Dave Jiang <dave.ji...@intel.com>
> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>

Yay, I was hitting this as well in my setup which uses NFS, but wasn't sure if
the flock was necessary.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] acpi: add NFIT and HMAT to the initrd override list

2018-02-22 Thread Ross Zwisler
On Mon, Dec 18, 2017 at 12:54:05PM -0700, Ross Zwisler wrote:
> On Fri, Dec 08, 2017 at 08:29:25AM -0800, Dan Williams wrote:
> > These tables, NFIT and HMAT, are essential for describing
> > next-generation platform memory topologies and performance
> > characteristics. Allow them to be overridden for debug and test and
> > purposes.
> > 
> > Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
> 
> Yes please!  This works in my test setup for inserting custom HMAT tables, and
> I would really like to have this for testing my HMAT series:
> 
> https://lists.01.org/pipermail/linux-nvdimm/2017-December/013571.html
> 
> Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> 
> > Signed-off-by: Dan Williams <dan.j.willi...@intel.com>

Ping on this patch - can we get this merged?
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH] ndctl, test: explicitly request namespace size

2018-02-21 Thread Ross Zwisler
The btt-pad-compat.sh test was failing for me because the namespace it was
creating was 32 MiB in size, but it was trying to do 64 MiB worth of I/O
using xxd.  This happened because when we created the namespace we didn't
specify a size, and in nfit_test we could have hit either a 64 MiB or a 32
MiB region.  Vishal happened to be getting the 64 MiB region, and I
happened to get the 32 MiB.

Fix this by explicitly requesting a namespace size of 64 MiB.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 test/btt-pad-compat.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/btt-pad-compat.sh b/test/btt-pad-compat.sh
index 4dfe8b0..a157c58 100755
--- a/test/btt-pad-compat.sh
+++ b/test/btt-pad-compat.sh
@@ -137,7 +137,7 @@ copy_xxd_img()
 create_oldfmt_ns()
 {
# create null-uuid namespace
-   json=$($ndctl create-namespace -b "$bus" -t pmem -m raw -l 4096 -u 
----)
+   json=$($ndctl create-namespace -b "$bus" -s 64M -t pmem -m raw -l 4096 
-u ----)
eval "$(echo "$json" | sed -e "$json2var")"
[ -n "$dev" ] || err "$LINENO" 2
[ -n "$size" ] || err "$LINENO" 2
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH 2/2] ndctl: add test files to .gitignore

2018-02-21 Thread Ross Zwisler
Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 .gitignore | 21 +
 1 file changed, 21 insertions(+)

diff --git a/.gitignore b/.gitignore
index 3c2de0e..20a04de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,24 @@ version.m4
 cscope.files
 cscope*.out
 tags
+test/*.log
+test/*.trs
+test/blk-ns
+test/dax-dev
+test/dax-errors
+test/dax-pmd
+test/daxdev-errors
+test/device-dax
+test/dpa-alloc
+test/dsm-fail
+test/hugetlb
+test/image
+test/libndctl
+test/mmap
+test/multi-pmem
+test/parent-uuid
+test/pmem-ns
+test/smart-listen
+test/smart-notify
+test/fio.job
+test/local-write-0-verify.state
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v6] ndctl: add option to list firmware information for a DIMM

2018-02-20 Thread Ross Zwisler
On Tue, Feb 20, 2018 at 11:18:28AM -0700, Dave Jiang wrote:
> Adding firmware output of firmware information when ndctl list -D -F is used.
> Components displayed are current firmware version, next firmware version,
> and if a powercycle is required (firmware updated).
> 
> Signed-off-by: Dave Jiang 
> Tested-by: Jeff Moyer 
<>
> +struct json_object *util_dimm_firmware_to_json(struct ndctl_dimm *dimm,
> + unsigned long flags)
> +{
> + struct json_object *jfirmware = json_object_new_object();
> + struct json_object *jobj;
> + struct ndctl_cmd *cmd;
> + int rc;
> + uint64_t run, next;
> + bool reboot = false;
> +
> + if (!jfirmware)
> + return NULL;
> +
> + cmd = ndctl_dimm_cmd_new_fw_get_info(dimm);
> + if (!cmd)
> + goto err;
> +
> + rc = ndctl_cmd_submit(cmd);
> + if (rc || ndctl_cmd_fw_xlat_firmware_status(cmd) != FW_SUCCESS) {
> + int run_err = -1;

Just a few nits.  Looks good otherwise.

This new version defines three different local variables with the value -1
(run_err, run_err, next_err) and uses each one only once.  Can we just use -1
instead of a variable in util_json_object_hex(), or maybe just have one
"error_val" variable that is defined for the whole function?

> +
> + jobj = util_json_object_hex(run_err, flags);
> + if (jobj)
> + json_object_object_add(jfirmware, "current_version",
> + jobj);
> + goto out;
> + }
> +
> + run = ndctl_cmd_fw_info_get_run_version(cmd);
> + if (run == ULLONG_MAX) {
> + int run_err = -1;
> +
> + jobj = util_json_object_hex(run_err, flags);
> + if (jobj)
> + json_object_object_add(jfirmware, "current_version",
> + jobj);
> + goto out;
> + }
> +
> + jobj = util_json_object_hex(run, flags);
> + if (jobj)
> + json_object_object_add(jfirmware, "current_version", jobj);
> +
> + next = ndctl_cmd_fw_info_get_next_version(cmd);
> + if (next == ULLONG_MAX) {
> + int next_err = -1;
> +
> + jobj = util_json_object_hex(next_err, flags);
> + if (jobj)
> + json_object_object_add(jfirmware, "next_version",
> + jobj);
> + goto out;
> + }
> +
> + if (next != 0) {
> + reboot = true;
> + jobj = util_json_object_hex(next, flags);
> + if (jobj)
> + json_object_object_add(jfirmware,
> + "next_version", jobj);
> + }
> +
> + if (reboot) {
> + jobj = json_object_new_boolean(reboot);
> + if (jobj)
> + json_object_object_add(jfirmware,
> + "need_powercycle", jobj);
> + }

You don't need this 'if (reboot) block' because 'reboot == (next != 0)' - you
can just combine this with the above conditional:

if (next != 0) {
jobj = util_json_object_hex(next, flags);
if (jobj)
json_object_object_add(jfirmware,
"next_version", jobj);

jobj = json_object_new_boolean(true);
if (jobj)
json_object_object_add(jfirmware,
"need_powercycle", jobj);
}

and you can get rid of the variable 'reboot' entirely.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v4] ndctl: add option to list firmware information for a DIMM

2018-02-16 Thread Ross Zwisler
On Tue, Feb 13, 2018 at 02:43:54PM -0700, Dave Jiang wrote:
> Adding firmware output of firmware information when ndctl list -D -F is used.
> Components displayed are current firmware version, updated firmware version,
> and if a coldboot is required (firmware updated).
> 
> Signed-off-by: Dave Jiang 
> Tested-by: Jeff Moyer 
> ---
> v4:
> - Remove output when updated_version is 0. That indicates no updated firmware.

I think you mean 'next_version'.  It's a little confusing because the variable
you use for this is 'updated', but the json field is 'next_version'.   Maybe
it'd be better to keep things consistent and just s/updated/next/g in
util_dimm_firmware_to_json()?  Although it's still not totally consistent
because of the naming of ndctl_cmd_fw_info_get_updated_version()...up to you.

> v3:
> - Fixed issue where it skips displaying rest of the details if there's no
>   firmware details.
> 
> v2:
> - Added copyright
> - Added support for human readable option (hex) for versions
> - Removed check against CMD_CALL as it's not useful
> 
>  Documentation/ndctl/ndctl-list.txt |   13 +
>  ndctl/Makefile.am  |1 
>  ndctl/list.c   |   13 +
>  ndctl/util/json-firmware.c |   87 
> 
>  util/json.h|2 +
>  5 files changed, 116 insertions(+)
>  create mode 100644 ndctl/util/json-firmware.c
> 
> diff --git a/Documentation/ndctl/ndctl-list.txt 
> b/Documentation/ndctl/ndctl-list.txt
> index fc07a71..85b87d4 100644
> --- a/Documentation/ndctl/ndctl-list.txt
> +++ b/Documentation/ndctl/ndctl-list.txt
> @@ -110,6 +110,19 @@ include::xable-region-options.txt[]
>}
>  }
>  
> +-F::
> +--firmware::
> + Include dimm firmware info in the listing. For example:
> +[verse]
> +{
> +  "dev":"nmem0",
> +  "firmware":{
> +  "current_version":0,
> +  "next_version":1,
> +  "coldboot_required":true
> +  }
> +}
> +
>  -X::
>  --device-dax::
>   Include device-dax ("daxregion") details when a namespace is in
> diff --git a/ndctl/Makefile.am b/ndctl/Makefile.am
> index 2054c1a..e0db97b 100644
> --- a/ndctl/Makefile.am
> +++ b/ndctl/Makefile.am
> @@ -13,6 +13,7 @@ ndctl_SOURCES = ndctl.c \
>   test.c \
>   ../util/json.c \
>   util/json-smart.c \
> + util/json-firmware.c \
>   inject-error.c \
>   update.c \
>   inject-smart.c
> diff --git a/ndctl/list.c b/ndctl/list.c
> index 37a224a..8bb9920 100644
> --- a/ndctl/list.c
> +++ b/ndctl/list.c
> @@ -35,6 +35,7 @@ static struct {
>   bool dax;
>   bool media_errors;
>   bool human;
> + bool firmware;
>  } list;
>  
>  static unsigned long listopts_to_flags(void)
> @@ -277,6 +278,7 @@ int cmd_list(int argc, const char **argv, void *ctx)
>   "filter by region-type"),
>   OPT_BOOLEAN('B', "buses", , "include bus info"),
>   OPT_BOOLEAN('D', "dimms", , "include dimm info"),
> + OPT_BOOLEAN('F', "firmware", , "include firmware 
> info"),
>   OPT_BOOLEAN('H', "health", , "include dimm health"),
>   OPT_BOOLEAN('R', "regions", ,
>   "include region info"),
> @@ -420,6 +422,17 @@ int cmd_list(int argc, const char **argv, void *ctx)
>   }
>   }
>  
> + if (list.firmware) {
> + struct json_object *jfirmware;
> +
> + jfirmware = util_dimm_firmware_to_json(dimm,
> + listopts_to_flags());
> + if (jfirmware)
> + json_object_object_add(jdimm,
> + "firmware",
> + jfirmware);
> + }
> +
>   /*
>* Without a bus we are collecting dimms anonymously
>* across the platform.
> diff --git a/ndctl/util/json-firmware.c b/ndctl/util/json-firmware.c
> new file mode 100644
> index 000..04297ec
> --- /dev/null
> +++ b/ndctl/util/json-firmware.c
> @@ -0,0 +1,87 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/* Copyright(c) 2018 Intel Corporation. All rights reserved. */
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +struct json_object *util_dimm_firmware_to_json(struct ndctl_dimm *dimm,
> + unsigned long flags)
> +{
> + struct json_object *jfirmware = json_object_new_object();
> + struct json_object *jobj;
> + struct ndctl_cmd *cmd;
> + int rc;
> + uint64_t run, updated;
> + bool reboot = false;
> +
> + if (!jfirmware)
> + return NULL;
> +
> + cmd = ndctl_dimm_cmd_new_fw_get_info(dimm);
> + if (!cmd)

Re: [PATCH v3 2/3] dax: change bdev_dax_supported() to support boolean returns

2018-02-13 Thread Ross Zwisler
On Tue, Feb 13, 2018 at 01:35:19PM -0700, Dave Jiang wrote:
> 
> 
> On 02/13/2018 01:31 PM, Ross Zwisler wrote:
> > On Thu, Feb 08, 2018 at 05:38:19PM -0700, Dave Jiang wrote:
> >> The function return values are confusing with the way the function is
> >> named. We expect a true or false return value but it actually returns
> >> 0/-errno.  This makes the code very confusing. Changing the return values
> >> to return a bool where if DAX is supported then return true and no DAX
> >> support returns false.
> >>
> >> Signed-off-by: Dave Jiang <dave.ji...@intel.com>
> >> ---
> > 
> >> diff --git a/fs/ext2/super.c b/fs/ext2/super.c
> >> index 655699321c45..636b9c5e1bff 100644
> >> --- a/fs/ext2/super.c
> >> +++ b/fs/ext2/super.c
> >> @@ -958,9 +958,10 @@ static int ext2_fill_super(struct super_block *sb, 
> >> void *data, int silent)
> >>blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
> >>  
> >>if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
> >> -  err = sb_dax_supported(sb, blocksize);
> >> -  if (err)
> >> +  if(!sb_dax_supported(sb, blocksize)) {
> >> +  err = -EIO;
> > 
> > No need to set 'err' here.  This is just a temporary variable used for some
> > local checks later in the function.  'ret' is the value that will be 
> > returned,
> > and that is already initialized to -EINVAL which should be fine.
> 
> Change ret to -EIO instead to set the correct error return code?

I'm not sure that -EIO is the 'correct' return code.  The old
sb_dax_supported() code could have returned -EINVAL, -EOPNOTSUPP or -EIO,
based on what went wrong.  All the other error cases in this function just
goto failed_mount without messing with 'ret', and we should probably do the
same.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 2/3] dax: change bdev_dax_supported() to support boolean returns

2018-02-13 Thread Ross Zwisler
On Thu, Feb 08, 2018 at 05:38:19PM -0700, Dave Jiang wrote:
> The function return values are confusing with the way the function is
> named. We expect a true or false return value but it actually returns
> 0/-errno.  This makes the code very confusing. Changing the return values
> to return a bool where if DAX is supported then return true and no DAX
> support returns false.
> 
> Signed-off-by: Dave Jiang 
> ---

> diff --git a/fs/ext2/super.c b/fs/ext2/super.c
> index 655699321c45..636b9c5e1bff 100644
> --- a/fs/ext2/super.c
> +++ b/fs/ext2/super.c
> @@ -958,9 +958,10 @@ static int ext2_fill_super(struct super_block *sb, void 
> *data, int silent)
>   blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
>  
>   if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
> - err = sb_dax_supported(sb, blocksize);
> - if (err)
> + if(!sb_dax_supported(sb, blocksize)) {
> + err = -EIO;

No need to set 'err' here.  This is just a temporary variable used for some
local checks later in the function.  'ret' is the value that will be returned,
and that is already initialized to -EINVAL which should be fine.

>   goto failed_mount;
> + }
>   }
>  
>   /* If the blocksize doesn't match, re-read the thing.. */
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 804a2d6729af..7b7650ac9c53 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -3712,9 +3712,10 @@ static int ext4_fill_super(struct super_block *sb, 
> void *data, int silent)
>   " that may contain inline data");
>   goto failed_mount;
>   }
> - err = sb_dax_supported(sb, blocksize);
> - if (err)
> + if (!sb_dax_supported(sb, blocksize)) {
> + err = -EIO;

Same comment as with the ext2 case - no need to set 'err', 'ret' is -EINVAL
and has you covered.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] loop: Fix lost writes caused by missing flag

2018-02-13 Thread Ross Zwisler
On Tue, Feb 13, 2018 at 03:54:04PM +0100, Christoph Hellwig wrote:
> Looks good:
> 
> Reviewed-by: Christoph Hellwig 
> 
> Can you wire up your test cases for blktests?

Is blktests really the right place for this test?  This failure is highly
dependent on the configuration of the filesystem that is holding the file that
we are using for the loopback device.  It doesn't seem like blktests has
support for mount options (dax), etc?

Because of the interaction with the underlying filesystem this seems like a
better fit with xfstests to me, but I don't know if we need to add tests there
because we already have pretty good coverage of loopback device failures.
That's how we found this - this bug causes all these tests to fail:
xfs/074 xfs/078 xfs/216 xfs/217 xfs/250
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2] libnvdimm: re-enable deep flush for pmem devices

2018-02-12 Thread Ross Zwisler
On Mon, Feb 12, 2018 at 03:05:10PM -0800, Dan Williams wrote:
> On Mon, Feb 12, 2018 at 2:53 PM, Jeff Moyer  wrote:
> > Dave Jiang  writes:
> >
> >> Re-enable deep flush so that users always have a way to be sure that a 
> >> write
> >> does make it all the way out to the NVDIMM. The PMEM driver writes always
> >> make it "all the way to the NVDIMM", and it relies on the ADR mechanism to
> >> flush the write buffers on power failure. Deep flush is there to explicitly
> >> flush those write buffers to protect against (rare) ADR failure.
> >> This change prevents a regression in deep flush behavior so that 
> >> applications
> >> can continue to depend on fsync() as a mechanism to trigger deep flush in 
> >> the
> >> filesystem-dax case.
> >
> > That's still very confusing text.  Specifically, the part where you say
> > that pmem driver writes always make it to the DIMM.  I think the
> > changelog could start with "Deep flush is there to explicitly flush
> > write buffers"  Anyway, the fix looks right to me.
> 
> I ended up changing the commit message to this, let me know if it reads 
> better:
> 
> 
> libnvdimm: re-enable deep flush for pmem devices via fsync()
> 
> Re-enable deep flush so that users always have a way to be sure that a
> write makes it all the way out to media. The PMEM driver writes always
> arrive at the NVDIMM, and it relies on the ADR (Asynchronous DRAM
> Refresh) mechanism to flush the write buffers on power failure. Deep
> flush is there to explicitly flush those write buffers to protect
> against (rare) ADR failure.  This change prevents a regression in deep
> flush behavior so that applications can continue to depend on fsync() as
> a mechanism to trigger deep flush in the filesystem-DAX case.
> 
> Fixes: 06e8ccdab15f4 ("acpi: nfit: Add support for detect platform
> CPU cache...")
> Signed-off-by: Dave Jiang 
> Signed-off-by: Dan Williams 

Plus Jeff's reviewed-by.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH] loop: Fix lost writes caused by missing flag

2018-02-12 Thread Ross Zwisler
The following commit:

commit aa4d86163e4e ("block: loop: switch to VFS ITER_BVEC")

replaced __do_lo_send_write(), which used ITER_KVEC iterators, with
lo_write_bvec() which uses ITER_BVEC iterators.  In this change, though,
the WRITE flag was lost:

-   iov_iter_kvec(, ITER_KVEC | WRITE, , 1, len);
+   iov_iter_bvec(, ITER_BVEC, bvec, 1, bvec->bv_len);

This flag is necessary for the DAX case because we make decisions based on
whether or not the iterator is a READ or a WRITE in dax_iomap_actor() and
in dax_iomap_rw().

We end up going through this path in configurations where we combine a PMEM
device with 4k sectors, a loopback device and DAX.  The consequence of this
missed flag is that what we intend as a write actually turns into a read in
the DAX code, so no data is ever written.

The very simplest test case is to create a loopback device and try and
write a small string to it, then hexdump a few bytes of the device to see
if the write took.  Without this patch you read back all zeros, with this
you read back the string you wrote.

For XFS this causes us to fail or panic during the following xfstests:

xfs/074 xfs/078 xfs/216 xfs/217 xfs/250

For ext4 we have a similar issue where writes never happen, but we don't
currently have any xfstests that use loopback and show this issue.

Fix this by restoring the WRITE flag argument to iov_iter_bvec().  This
causes the xfstests to all pass.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Fixes: commit aa4d86163e4e ("block: loop: switch to VFS ITER_BVEC")
Cc: Christoph Hellwig <h...@lst.de>
Cc: Al Viro <v...@zeniv.linux.org.uk>
Cc: sta...@vger.kernel.org
---
 drivers/block/loop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d5fe720cf149..89d2ee00cced 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -266,7 +266,7 @@ static int lo_write_bvec(struct file *file, struct bio_vec 
*bvec, loff_t *ppos)
struct iov_iter i;
ssize_t bw;
 
-   iov_iter_bvec(, ITER_BVEC, bvec, 1, bvec->bv_len);
+   iov_iter_bvec(, ITER_BVEC | WRITE, bvec, 1, bvec->bv_len);
 
file_start_write(file);
bw = vfs_iter_write(file, , ppos, 0);
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[GIT PULL] libnvdimm for 4.16

2018-02-05 Thread Ross Zwisler
Hi Linus, please pull from:

  git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm 
tags/libnvdimm-for-4.16

...to receive the libnvdimm update for 4.16.

All of these commits have shipped in -next and have received build verification
notices from 0day.

There is one small merge conflict that Michael Ellerman also noted in his pull
request.  In Michael's tree the axonram driver was removed, and in our tree it
received two small updates.  The merge can be trivially resolved by just
keeping the driver removed per Michael's tree.

Regarding the vmem_altmap patches, Michal Hocko took a quick look and while it
wasn't a full review he has future plans to reuse vmem_altmap in wider places
in the memory hotplug code, so he's in favor of getting it properly plumbed.

In case you wanted to inspect them or verify the merge I've also pushed the
three topic branches for this cycle to our nvdimm tree: 

  for-4.16/nfit
  for-4.16/libnvdimm
  for-4.16/dax

---

The following changes since commit a7f2766ac7c359216da4e714dc117c881e39a74a:

  Merge branches 'acpi-gpio', 'acpi-button', 'acpi-battery' and 'acpi-video' 
(2018-01-18 03:02:16 +0100)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm 
tags/libnvdimm-for-4.16

for you to fetch changes up to ee95f4059a833839bf52972191b2d4c3d3cec552:

  Merge branch 'for-4.16/nfit' into libnvdimm-for-next (2018-02-03 00:26:26 
-0700)


libnvdimm for 4.16

* Require struct page by default for filesystem DAX to remove a number of
  surprising failure cases.  This includes failures with direct I/O, gdb and
  fork(2).

* Add support for the new Platform Capabilities Structure added to the NFIT in
  ACPI 6.2a.  This new table tells us whether the platform supports flushing
  of CPU and memory controller caches on unexpected power loss events.

* Revamp vmem_altmap and dev_pagemap handling to clean up code and better
  support future future PCI P2P uses.

* Deprecate the ND_IOCTL_SMART_THRESHOLD command whose payload has become
  out-of-sync with recent versions of the NVDIMM_FAMILY_INTEL spec, and
  instead rely on the generic ND_CMD_CALL approach used by the two other IOCTL
  families, NVDIMM_FAMILY_{HPE,MSFT}.

* Enhance nfit_test so we can test some of the new things added in version 1.6
  of the DSM specification.  This includes testing firmware download and
  simulating the Last Shutdown State (LSS) status.


Christoph Hellwig (16):
  memremap: provide stubs for vmem_altmap_offset and vmem_altmap_free
  mm: don't export arch_add_memory
  mm: don't export __add_pages
  mm: pass the vmem_altmap to arch_add_memory and __add_pages
  mm: pass the vmem_altmap to vmemmap_populate
  mm: pass the vmem_altmap to arch_remove_memory and __remove_pages
  mm: pass the vmem_altmap to vmemmap_free
  mm: pass the vmem_altmap to memmap_init_zone
  mm: split altmap memory map allocation from normal case
  mm: merge vmem_altmap_alloc into altmap_alloc_block_buf
  mm: move get_dev_pagemap out of line
  mm: optimize dev_pagemap reference counting around get_dev_pagemap
  memremap: remove to_vmem_altmap
  memremap: simplify duplicate region handling in devm_memremap_pages
  memremap: change devm_memremap_pages interface to use struct dev_pagemap
  memremap: merge find_dev_pagemap into get_dev_pagemap

Colin Ian King (1):
  libnvdimm, namespace: remove redundant initialization of 'nd_mapping'

Dan Williams (8):
  nfit, libnvdimm: deprecate the generic SMART ioctl
  tools/testing/nvdimm: smart alarm/threshold control
  mm, dax: introduce pfn_t_special()
  ext4: auto disable dax instead of failing mount
  ext2: auto disable dax instead of failing mount
  dax: require 'struct page' by default for filesystem dax
  tools/testing/nvdimm: force nfit_test to depend on instrumented modules
  libnvdimm, namespace: make min namespace size 4K

Dave Jiang (6):
  acpi: nfit: Add support for detect platform CPU cache flush on power loss
  acpi: nfit: add persistent memory control flag for nd_region
  libnvdimm: expose platform persistence attribute for nd_region
  nfit-test: Add platform cap support from ACPI 6.2a to test
  libnvdimm/nfit_test: add firmware download emulation
  libnvdimm/nfit_test: adding support for unit testing enable LSS status

Jan H. Schönherr (2):
  mm: Fix memory size alignment in devm_memremap_pages_release()
  mm: Fix devm_memremap_pages() collision handling

Jeff Moyer (1):
  libnvdimm, btt: fix uninitialized err_lock

Logan Gunthorpe (1):
  memremap: drop private struct page_map

Luis de Bethencourt (1):
  device-dax: Fix trailing semicolon

Ross Zwisler (2):
  Merge branch 'for-4.16/dax' into libnvdimm-for-next
  Merge branch 'for-4.16/nfit' into libnvdimm

Re: [PATCH] libnvdimm: remove redundant assignment to pointer 'dev'

2018-02-05 Thread Ross Zwisler
On Mon, Feb 05, 2018 at 10:44:07AM -0800, Dan Williams wrote:
> On Mon, Feb 5, 2018 at 10:20 AM, Ross Zwisler
> <ross.zwis...@linux.intel.com> wrote:
> > On Mon, Feb 05, 2018 at 02:08:52PM +, Colin King wrote:
> >> From: Colin Ian King <colin.k...@canonical.com>
> >>
> >> Pointer dev is being assigned a value that is never read, it is being
> >> re-assigned the same value later on, hence the initialization is redundant
> >> and can be removed.
> >>
> >> Cleans up clang warning:
> >> drivers/nvdimm/pfn_devs.c:307:17: warning: Value stored to 'dev' during
> >> its initialization is never read
> >>
> >> Signed-off-by: Colin Ian King <colin.k...@canonical.com>
> >
> > Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> >
> > More importantly this fixes a potential NULL pointer dereference.  nd_pfn
> > is checked for NULL a few lines down, but we would have crashed here trying 
> > to
> > get nd_pfn->dev.
> >
> 
> No we wouldn't crash. We're just calculating the address, not
> de-referencing a NULL pointer.

Ah, yep, you're right of course.  This is exactly how offsetof() is
implemented in some architectures.  Thanks.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] libnvdimm: remove redundant assignment to pointer 'dev'

2018-02-05 Thread Ross Zwisler
On Mon, Feb 05, 2018 at 02:08:52PM +, Colin King wrote:
> From: Colin Ian King <colin.k...@canonical.com>
> 
> Pointer dev is being assigned a value that is never read, it is being
> re-assigned the same value later on, hence the initialization is redundant
> and can be removed.
> 
> Cleans up clang warning:
> drivers/nvdimm/pfn_devs.c:307:17: warning: Value stored to 'dev' during
> its initialization is never read
> 
> Signed-off-by: Colin Ian King <colin.k...@canonical.com>

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>

More importantly this fixes a potential NULL pointer dereference.  nd_pfn
is checked for NULL a few lines down, but we would have crashed here trying to
get nd_pfn->dev.

We can append the above info to the changelog when we apply.

> ---
>  drivers/nvdimm/pfn_devs.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> index f5c4e8c6e29d..2f4d18752c97 100644
> --- a/drivers/nvdimm/pfn_devs.c
> +++ b/drivers/nvdimm/pfn_devs.c
> @@ -304,7 +304,7 @@ static const struct attribute_group 
> *nd_pfn_attribute_groups[] = {
>  struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
>   struct nd_namespace_common *ndns)
>  {
> - struct device *dev = _pfn->dev;
> + struct device *dev;
>  
>   if (!nd_pfn)
>   return NULL;
> -- 
> 2.15.1
> 
> ___
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v4 4/4] nfit-test: Add platform cap support from ACPI 6.2a to test

2018-01-31 Thread Ross Zwisler
On Tue, Jan 30, 2018 at 05:21:39PM -0700, Dave Jiang wrote:
> Adding NFIT platform capabilities sub table in nfit_test simulated ACPI
> NFIT table. Only the first NFIT table is added with the capability
> sub-table.
> 
> Signed-off-by: Dave Jiang <dave.ji...@intel.com>
> Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> ---
>  tools/testing/nvdimm/test/nfit.c |   14 --
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/testing/nvdimm/test/nfit.c 
> b/tools/testing/nvdimm/test/nfit.c
> index 7217b2b953b5..f4791e6e70cf 100644
> --- a/tools/testing/nvdimm/test/nfit.c
> +++ b/tools/testing/nvdimm/test/nfit.c
> @@ -881,7 +881,8 @@ static int nfit_test0_alloc(struct nfit_test *t)
>   window_size) * NUM_DCR
>   + sizeof(struct acpi_nfit_data_region) * NUM_BDW
>   + (sizeof(struct acpi_nfit_flush_address)
> - + sizeof(u64) * NUM_HINTS) * NUM_DCR;
> + + sizeof(u64) * NUM_HINTS) * NUM_DCR
> + + sizeof(struct acpi_nfit_capabilities);
>   int i;
>  
>   t->nfit_buf = test_alloc(t, nfit_size, >nfit_dma);
> @@ -993,6 +994,7 @@ static void nfit_test0_setup(struct nfit_test *t)
>   struct acpi_nfit_control_region *dcr;
>   struct acpi_nfit_data_region *bdw;
>   struct acpi_nfit_flush_address *flush;
> + struct acpi_nfit_capabilities *pcap;
>   unsigned int offset, i;
>  
>   /*
> @@ -1500,8 +1502,16 @@ static void nfit_test0_setup(struct nfit_test *t)
>   for (i = 0; i < NUM_HINTS; i++)
>   flush->hint_address[i] = t->flush_dma[3] + i * sizeof(u64);
>  
> + /* platform capabilities */
> + pcap = nfit_buf + offset + flush_hint_size * 4;
> + pcap->header.type = ACPI_NFIT_TYPE_CAPABILITIES;
> + pcap->header.length = sizeof(*pcap);
> + pcap->highest_capability = BIT(2);

One last thing I noticed: I'm pretty sure this needs to be 
pcap->highest_capability = 1;

The way I read the spec, this is the value of the highest bit index which is
valid.  We define bits 0 and 1, so this should be 1.  BIT(2) = 0x4, which
gives us a mask of 0x1f for bits 0, 1, 2, 3, and 4.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 4/4] nfit-test: Add platform cap support from ACPI 6.2a to test

2018-01-30 Thread Ross Zwisler
On Tue, Jan 30, 2018 at 05:01:29PM -0700, Dave Jiang wrote:
> Adding NFIT platform capabilities sub table in nfit_test simulated ACPI
> NFIT table. Only the first NFIT table is added with the capability
> sub-table.
> 
> Signed-off-by: Dave Jiang <dave.ji...@intel.com>

This looks good to me.  You can add:

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 1/4] acpi: nfit: Add support for detect platform CPU cache flush on power loss

2018-01-30 Thread Ross Zwisler
On Tue, Jan 30, 2018 at 05:01:13PM -0700, Dave Jiang wrote:
> In ACPI 6.2a the platform capability structure has been added to the NFIT
> tables. That provides software the ability to determine whether a system
> supports the auto flushing of CPU caches on power loss. If the capability
> is supported, we do not need to do dax_flush(). Plumbing the path to set the
> property on per region from the NFTI tables.
  NFIT
> 
> This patch depends on the ACPI NFIT 6.2a platform capabilities support code
> in include/acpi/actbl1.h.
> 
> Signed-off-by: Dave Jiang <dave.ji...@intel.com>

Looks good aside from the one nit.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] libnvdimm, namespace: remove redundant initialization of 'nd_mapping'

2018-01-30 Thread Ross Zwisler
On Tue, Jan 30, 2018 at 05:47:07PM +, Colin King wrote:
> From: Colin Ian King <colin.k...@canonical.com>
> 
> Pointer nd_mapping is being initialized to a value that is never read,
> instead it is being updated to a new value in all the cases where it
> is being read afterwards, hence the initialization is redundant and
> can be removed.
> 
> Cleans up clang warning:
> drivers/nvdimm/namespace_devs.c:2411:21: warning: Value stored to
> 'nd_mapping' during its initialization is never rea
> 
> Signed-off-by: Colin Ian King <colin.k...@canonical.com>

Sure, this looks good.  Thanks.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 3/4] libnvdimm: expose platform persistence attribute for nd_region

2018-01-30 Thread Ross Zwisler
On Fri, Dec 08, 2017 at 02:00:20PM -0700, Dave Jiang wrote:
> Providing a sysfs attribute for nd_region that shows the persistence
> capabilities for the platform.
> 
> Signed-off-by: Dave Jiang <dave.ji...@intel.com>

Sure, looks fine.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 2/4] acpi: nfit: add persistent memory control flag for nd_region

2018-01-30 Thread Ross Zwisler
On Fri, Dec 08, 2017 at 02:00:14PM -0700, Dave Jiang wrote:
> Propogate the ADR attribute flag from the NFIT platform capabilities
> sub-table to nd_region.
> 
> Signed-off-by: Dave Jiang <dave.ji...@intel.com>

Sure, looks fine.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 4/4] nfit-test: Add platform cap support from ACPI 6.2a to test

2018-01-30 Thread Ross Zwisler
On Fri, Dec 08, 2017 at 02:00:25PM -0700, Dave Jiang wrote:
> Adding NFIT platform capabilities sub table in nfit_test simulated ACPI
> NFIT table. Only the first NFIT table is added with the capability
> sub-table.
> 
> Signed-off-by: Dave Jiang 
> ---
>  tools/testing/nvdimm/test/nfit.c |   11 ++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/testing/nvdimm/test/nfit.c 
> b/tools/testing/nvdimm/test/nfit.c
> index 7217b2b953b5..8a16f90a9573 100644
> --- a/tools/testing/nvdimm/test/nfit.c
> +++ b/tools/testing/nvdimm/test/nfit.c
> @@ -993,6 +993,7 @@ static void nfit_test0_setup(struct nfit_test *t)
>   struct acpi_nfit_control_region *dcr;
>   struct acpi_nfit_data_region *bdw;
>   struct acpi_nfit_flush_address *flush;
> + struct acpi_nfit_capabilities *pcap;
>   unsigned int offset, i;
>  
>   /*
> @@ -1500,8 +1501,16 @@ static void nfit_test0_setup(struct nfit_test *t)
>   for (i = 0; i < NUM_HINTS; i++)
>   flush->hint_address[i] = t->flush_dma[3] + i * sizeof(u64);
>  
> + /* platform capabilities */
> + pcap = nfit_buf + offset + flush_hint_size * 4;
> + pcap->header.type = ACPI_NFIT_TYPE_CAPABILITIES;
> + pcap->header.length = sizeof(*pcap);
> + pcap->highest_capability = BIT(2);
> + pcap->capabilities = ACPI_NFIT_CAPABILITY_CACHE_FLUSH |
> + ACPI_NFIT_CAPABILITY_MEM_FLUSH;
> +
>   if (t->setup_hotplug) {
> - offset = offset + flush_hint_size * 4;
> + offset = offset + sizeof(*pcap);

You still need to increase the offset by flush_hint_size*4, in addition to
sizeof(*pcap):

offset = offset + flush_hint_size * 4 + sizeof(*pcap);

Also, you need to increase the size of the memory allocation for t->nfit_buf
in nfit_test_alloc() by adding in a sizeof(struct acpi_nfit_capabilities) to
the calculation of nfit_size.  Otherwise you'll increment offset off the end
of nfit_buf in nfit_test0_setup().

>   /* dcr-descriptor4: blk */
>   dcr = nfit_buf + offset;
>   dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
> 
> ___
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 1/4] acpi: nfit: Add support for detect platform CPU cache flush on power loss

2018-01-30 Thread Ross Zwisler
On Fri, Dec 08, 2017 at 02:00:09PM -0700, Dave Jiang wrote:
> In ACPI 6.2a the platform capability structure has been added to the NFIT
> tables. That provides software the ability to determine whether a system
> supports the auto flushing of CPU caches on power loss. If the capability
> is supported, we do not need to do dax_write_cache(). Plumbing the path
 dax_flush() ?

dax_write_cache() just sets or clears DAXDEV_WRITE_CACHE.  dax_flush() is the
place where we check DAXDEV_WRITE_CACHE and avoid calling arch_wb_cache_pmem()
if the platform supports a flush-on-fail CPU cache.

> to set the property on per region from the NFTI tables.
 NFIT
> 
> This patch depends on the ACPI NFIT 6.2a platform capabilities support code
> in include/acpi/actbl1.h.
>
> Signed-off-by: Dave Jiang 
> ---
>  drivers/acpi/nfit/core.c |   20 
>  drivers/acpi/nfit/nfit.h |1 +
>  drivers/nvdimm/pmem.c|4 +++-
>  drivers/nvdimm/region_devs.c |1 +
>  include/linux/libnvdimm.h|5 +
>  5 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
> index ff2580e7611d..c08b3da61b93 100644
> --- a/drivers/acpi/nfit/core.c
> +++ b/drivers/acpi/nfit/core.c
> @@ -838,6 +838,18 @@ static bool add_flush(struct acpi_nfit_desc *acpi_desc,
>   return true;
>  }
>  
> +static bool add_platform_cap(struct acpi_nfit_desc *acpi_desc,
> + struct acpi_nfit_capabilities *pcap)
> +{
> + struct device *dev = acpi_desc->dev;
> + u8 mask;

'mask' cannot be a u8, else you'll lose all the upper bits when you compute
it.  It needs to be a u32.

> +
> + mask = pcap->highest_capability - 1 + pcap->highest_capability;

This calculation is broken.   Take the simplest case, highest_capability = 0.
This should translate into a mask of 0x1, so you only look at bit 0, but:

0 - 1 + 0 == -1

giving you a mask that includes all the bits.  I think the mask you're looking
for is:
mask = (1 << (pcap->highest_capability + 1)) - 1;

> + acpi_desc->platform_cap = pcap->capabilities & (u32)mask;
> + dev_dbg(dev, "%s: cap: %#x\n", __func__, acpi_desc->platform_cap);
> + return true;
> +}
> +
>  static void *add_table(struct acpi_nfit_desc *acpi_desc,
>   struct nfit_table_prev *prev, void *table, const void *end)
>  {
<>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 7fbc5c5dc8e1..13f2ed80899e 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -35,6 +35,7 @@
>  #include "pmem.h"
>  #include "pfn.h"
>  #include "nd.h"
> +#include "nd-core.h"
>  
>  static struct device *to_dev(struct pmem_device *pmem)
>  {
> @@ -334,7 +335,8 @@ static int pmem_attach_disk(struct device *dev,
>   dev_warn(dev, "unable to guarantee persistence of writes\n");
>   fua = 0;
>   }
> - wbc = nvdimm_has_cache(nd_region);
> + wbc = nvdimm_has_cache(nd_region) &
I'm guessing you meant:   &&

> + !test_bit(ND_REGION_PERSIST_CACHE, _region->flags);
>  
>   if (!devm_request_mem_region(dev, res->start, resource_size(res),
>   dev_name(>dev))) {
> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> index abaf38c61220..87e8cd10de05 100644
> --- a/drivers/nvdimm/region_devs.c
> +++ b/drivers/nvdimm/region_devs.c
> @@ -994,6 +994,7 @@ static struct nd_region *nd_region_create(struct 
> nvdimm_bus *nvdimm_bus,
>   dev->groups = ndr_desc->attr_groups;
>   nd_region->ndr_size = resource_size(ndr_desc->res);
>   nd_region->ndr_start = ndr_desc->res->start;
> +
Unrelated whitespace change.

>   nd_device_register(dev);
>  
>   return nd_region;
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[ndctl PATCH v3 2/2] ndctl: add sector_size to 'ndctl list' output

2018-01-29 Thread Ross Zwisler
It used to be that the only PMEM namespaces with a variable sector size
were ones with a BTT, but that has changed.  sector_size still doesn't make
sense for device DAX since we don't have a block I/O path.

If we don't have a specified sector size, as happens with namespaces of
devtype nd_namespace_io and older v1.1 namespace labels, we will display
the default sector size of 512.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Reviewed-by: Dave Jiang <dave.ji...@intel.com>
---
 util/json.c | 29 -
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/util/json.c b/util/json.c
index 95beaa2..17d8f13 100644
--- a/util/json.c
+++ b/util/json.c
@@ -718,11 +718,6 @@ struct json_object *util_namespace_to_json(struct 
ndctl_namespace *ndns,
goto err;
json_object_object_add(jndns, "uuid", jobj);
 
-   jobj = json_object_new_int(ndctl_btt_get_sector_size(btt));
-   if (!jobj)
-   goto err;
-   json_object_object_add(jndns, "sector_size", jobj);
-
bdev = ndctl_btt_get_block_device(btt);
} else if (pfn) {
ndctl_pfn_get_uuid(pfn, uuid);
@@ -774,6 +769,30 @@ struct json_object *util_namespace_to_json(struct 
ndctl_namespace *ndns,
} else
bdev = ndctl_namespace_get_block_device(ndns);
 
+   jobj = NULL;
+   if (btt) {
+   jobj = json_object_new_int(ndctl_btt_get_sector_size(btt));
+   if (!jobj)
+   goto err;
+   } else if (!dax) {
+   unsigned int sector_size = 
ndctl_namespace_get_sector_size(ndns);
+
+   /*
+* The kernel will default to a 512 byte sector size on PMEM
+* namespaces that don't explicitly have a sector size. This
+* happens because they use pre-v1.2 labels or because they
+* don't have a label space (devtype=nd_namespace_io).
+*/
+   if (!sector_size)
+   sector_size = 512;
+
+   jobj = json_object_new_int(sector_size);
+   if (!jobj)
+   goto err;
+   }
+   if (jobj)
+   json_object_object_add(jndns, "sector_size", jobj);
+
if (bdev && bdev[0]) {
jobj = json_object_new_string(bdev);
if (!jobj)
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[ndctl PATCH v2 2/2] ndctl: add sector_size to 'ndctl list' output

2018-01-26 Thread Ross Zwisler
It used to be that the only PMEM namespaces with a variable sector size
were ones with a BTT, but that has changed.  sector_size still doesn't make
sense for device DAX since we don't have a block I/O path.

If we don't have a specified sector size, as happens with namespaces of
devtype nd_namespace_io and older v1.1 namespace labels, we will display
the default sector size of 512.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 util/json.c | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/util/json.c b/util/json.c
index 95beaa2..647bf03 100644
--- a/util/json.c
+++ b/util/json.c
@@ -718,11 +718,6 @@ struct json_object *util_namespace_to_json(struct 
ndctl_namespace *ndns,
goto err;
json_object_object_add(jndns, "uuid", jobj);
 
-   jobj = json_object_new_int(ndctl_btt_get_sector_size(btt));
-   if (!jobj)
-   goto err;
-   json_object_object_add(jndns, "sector_size", jobj);
-
bdev = ndctl_btt_get_block_device(btt);
} else if (pfn) {
ndctl_pfn_get_uuid(pfn, uuid);
@@ -774,6 +769,24 @@ struct json_object *util_namespace_to_json(struct 
ndctl_namespace *ndns,
} else
bdev = ndctl_namespace_get_block_device(ndns);
 
+   jobj = NULL;
+   if (btt) {
+   jobj = json_object_new_int(ndctl_btt_get_sector_size(btt));
+   if (!jobj)
+   goto err;
+   } else if (!dax) {
+   unsigned int sector_size = 
ndctl_namespace_get_sector_size(ndns);
+
+   if (!sector_size)
+   sector_size = 512;
+
+   jobj = json_object_new_int(sector_size);
+   if (!jobj)
+   goto err;
+   }
+   if (jobj)
+   json_object_object_add(jndns, "sector_size", jobj);
+
if (bdev && bdev[0]) {
jobj = json_object_new_string(bdev);
if (!jobj)
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[ndctl PATCH 1/2] ndctl: update .gitignore

2018-01-25 Thread Ross Zwisler
I noticed that my 'git status' output was quite noisy.  Here's what I get
with a full build using the suggested
./configure CFLAGS='-g -O0' --prefix=/usr --sysconfdir=/etc --libdir=/usr/lib64
command and with some cscope files built:

  $ git status
  On branch pending
  Your branch is up-to-date with 'github/pending'.

  Untracked files:
(use "git add ..." to include in what will be committed)

Documentation/daxctl/asciidoc.conf
Documentation/daxctl/daxctl-io.1
Documentation/daxctl/daxctl-list.1
Documentation/daxctl/daxctl.1
Documentation/ndctl/asciidoc.conf
Documentation/ndctl/ndctl-check-labels.1
Documentation/ndctl/ndctl-check-namespace.1
Documentation/ndctl/ndctl-create-namespace.1
Documentation/ndctl/ndctl-destroy-namespace.1
Documentation/ndctl/ndctl-disable-dimm.1
Documentation/ndctl/ndctl-disable-namespace.1
Documentation/ndctl/ndctl-disable-region.1
Documentation/ndctl/ndctl-enable-dimm.1
Documentation/ndctl/ndctl-enable-namespace.1
Documentation/ndctl/ndctl-enable-region.1
Documentation/ndctl/ndctl-init-labels.1
Documentation/ndctl/ndctl-inject-error.1
Documentation/ndctl/ndctl-list.1
Documentation/ndctl/ndctl-read-labels.1
Documentation/ndctl/ndctl-write-labels.1
Documentation/ndctl/ndctl-zero-labels.1
Documentation/ndctl/ndctl.1
ccan/list/.dirstamp
ccan/str/.dirstamp
cscope.files
cscope.out
daxctl/daxctl
daxctl/lib/libdaxctl.la
daxctl/lib/libdaxctl.lo
daxctl/lib/libdaxctl.pc
libccan.a
libutil.a
ndctl/lib/libndctl.pc
ndctl/ndctl
ndctl/util/.dirstamp
rhel/
sles/ndctl.spec
util/.dirstamp
util/log.lo
util/sysfs.lo
version.m4

  nothing added to commit but untracked files present (use "git add" to track)

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 .gitignore | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/.gitignore b/.gitignore
index 4899534..64e58fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,22 @@ Makefile.in
 /configure
 /libtool
 /stamp-h1
+*.1
+Documentation/daxctl/asciidoc.conf
+Documentation/ndctl/asciidoc.conf
+.dirstamp
+daxctl/daxctl
+daxctl/lib/libdaxctl.la
+daxctl/lib/libdaxctl.lo
+daxctl/lib/libdaxctl.pc
+*.a
+ndctl/lib/libndctl.pc
+ndctl/ndctl
+rhel/
+sles/ndctl.spec
+util/log.lo
+util/sysfs.lo
+version.m4
+*.swp
+cscope.files
+cscope.out
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[ndctl PATCH 2/2] ndctl: add sector_size to 'ndctl list' output

2018-01-25 Thread Ross Zwisler
It used to be that the only PMEM namespaces with a variable sector size
were ones with a BTT, but that has changed.  sector_size still doesn't make
sense for device DAX since we don't have a block I/O path, and we also skip
it if we don't have a specified sector size, as happens with namespaces of
devtype nd_namespace_io.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 util/json.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/util/json.c b/util/json.c
index 95beaa2..89306f0 100644
--- a/util/json.c
+++ b/util/json.c
@@ -718,11 +718,6 @@ struct json_object *util_namespace_to_json(struct 
ndctl_namespace *ndns,
goto err;
json_object_object_add(jndns, "uuid", jobj);
 
-   jobj = json_object_new_int(ndctl_btt_get_sector_size(btt));
-   if (!jobj)
-   goto err;
-   json_object_object_add(jndns, "sector_size", jobj);
-
bdev = ndctl_btt_get_block_device(btt);
} else if (pfn) {
ndctl_pfn_get_uuid(pfn, uuid);
@@ -774,6 +769,23 @@ struct json_object *util_namespace_to_json(struct 
ndctl_namespace *ndns,
} else
bdev = ndctl_namespace_get_block_device(ndns);
 
+   jobj = NULL;
+   if (btt) {
+   jobj = json_object_new_int(ndctl_btt_get_sector_size(btt));
+   if (!jobj)
+   goto err;
+   } else if (!dax) {
+   unsigned int sector_size = 
ndctl_namespace_get_sector_size(ndns);
+
+   if (sector_size) {
+   jobj = json_object_new_int(sector_size);
+   if (!jobj)
+   goto err;
+   }
+   }
+   if (jobj)
+   json_object_object_add(jndns, "sector_size", jobj);
+
if (bdev && bdev[0]) {
jobj = json_object_new_string(bdev);
if (!jobj)
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[fstests PATCH v2] generic/347: dm-thin lacks DAX support

2018-01-18 Thread Ross Zwisler
generic/347 currently fails when run in conjunction with the DAX mount
option:

generic/347 72s ... - output mismatch (see
/root/project/xfstests/results//generic/347.out.bad)
--- tests/generic/347.out   2016-05-12 11:56:32.086618744 -0600
+++ /root/project/xfstests/results//generic/347.out.bad 2018-01-17
16:04:33.459348448 -0700
@@ -1,2 +1,3 @@
 QA output created by 347
+mount: /mnt/xfstests_scratch: can't read superblock on
/dev/mapper/thin-vol.
 === completed
...
(Run 'diff -u tests/generic/347.out
/root/project/xfstests/results//generic/347.out.bad'  to see the entire
diff)

This is expected because the dm-thin target currently lacks DAX support.

Just skip this test if we are using DAX.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 common/dmthin | 5 +
 1 file changed, 5 insertions(+)

diff --git a/common/dmthin b/common/dmthin
index baab6284..da0ad284 100644
--- a/common/dmthin
+++ b/common/dmthin
@@ -35,6 +35,11 @@ DMTHIN_POOL_DEV="/dev/mapper/$DMTHIN_POOL_NAME"
 DMTHIN_VOL_NAME="thin-vol"
 DMTHIN_VOL_DEV="/dev/mapper/$DMTHIN_VOL_NAME"
 
+echo $MOUNT_OPTIONS | grep -q dax
+if [ $? -eq 0 ]; then
+   _notrun "Cannot run tests with DAX on dmthin devices"
+fi
+
 _dmthin_cleanup()
 {
$UMOUNT_PROG $SCRATCH_MNT > /dev/null 2>&1
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [fstests PATCH 2/2] generic/347: dm-thin lacks DAX support

2018-01-18 Thread Ross Zwisler
On Thu, Jan 18, 2018 at 12:23:43PM +0800, Eryu Guan wrote:
> On Wed, Jan 17, 2018 at 04:23:51PM -0700, Ross Zwisler wrote:
> > generic/347 currently fails when run in cojunction with the DAX mount
> > option:
> > 
> > generic/347 72s ... - output mismatch (see
> > /root/project/xfstests/results//generic/347.out.bad)
> > --- tests/generic/347.out   2016-05-12 11:56:32.086618744 -0600
> > +++ /root/project/xfstests/results//generic/347.out.bad 2018-01-17
> > 16:04:33.459348448 -0700
> > @@ -1,2 +1,3 @@
> >  QA output created by 347
> > +mount: /mnt/xfstests_scratch: can't read superblock on
> > /dev/mapper/thin-vol.
> >  === completed
> > ...
> > (Run 'diff -u tests/generic/347.out
> > /root/project/xfstests/results//generic/347.out.bad'  to see the entire
> > diff)
> > 
> > This is expected because the dm-thin target currently lacks DAX support.
> > 
> > Just skip this test if we are using DAX.
> > 
> > Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> > ---
> >  tests/generic/347 | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/tests/generic/347 b/tests/generic/347
> > index 3adc6744..0c2dec98 100755
> > --- a/tests/generic/347
> > +++ b/tests/generic/347
> > @@ -72,6 +72,7 @@ _supported_fs generic
> >  _supported_os Linux
> >  _require_scratch_nocheck
> >  _require_dm_target thin-pool
> > +_exclude_scratch_mount_option dax
> 
> IMHO, it'd be better to put this in common/dmthin file, as what's done
> in common/dmerror or common/dmflakey (we grep for dax directly there,
> that's because they've been added before _exclude_scratch_mount_option
> was introduced), so all thinp tests are excluded from dax tests.

Sure, will do.  Thanks for the review.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[fstests PATCH 1/2] shared/272: don't use data journaling with DAX

2018-01-17 Thread Ross Zwisler
shared/272 fails with kernels v4.15-rc1 and beyond when you are mounted
with DAX:

shared/272   [failed, exit status 1] - output mismatch (see
/root/project/xfstests/results//shared/272.out.bad)
--- tests/shared/272.out2015-12-05 13:12:17.038257578 -0700
+++ /root/project/xfstests/results//shared/272.out.bad  2018-01-17
15:37:18.581631116 -0700
@@ -1,3 +1,3 @@
 QA output created by 272
 Switch data journalling mode. Silence is golden.
-Check filesystem
+/usr/bin/chattr: Device or resource busy while setting flags on
/mnt/xfstests_scratch/file.1
...
(Run 'diff -u tests/shared/272.out
/root/project/xfstests/results//shared/272.out.bad'  to see the entire
diff)

This is expected.  The following kernel commit:

commit e9072d859df3 ("ext4: prevent data corruption with journaling + DAX")

makes "chattr +j", which is attempting to turn on data journaling, return
-EBUSY if the ext4 DAX mount option is in use.  This was done to prevent
the data corruption shown in xfstest ext4/030, added by this xfstests
commit:

commit 750a24e99e48 ("ext4: test for DAX + journaling corruption")

So, just skip shared/272 if the DAX mount option is in use.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 tests/shared/272 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/shared/272 b/tests/shared/272
index 7023b657..0c9763df 100755
--- a/tests/shared/272
+++ b/tests/shared/272
@@ -83,6 +83,7 @@ chattr_opt: $chattr_opt" >>$seqres.full
 _supported_fs ext3 ext4
 _supported_os Linux
 _require_scratch
+_exclude_scratch_mount_option dax
 
 rm -f $seqres.full
 _scratch_mkfs_sized $((64 * 1024 * 1024)) >> $seqres.full 2>&1
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[fstests PATCH 2/2] generic/347: dm-thin lacks DAX support

2018-01-17 Thread Ross Zwisler
generic/347 currently fails when run in cojunction with the DAX mount
option:

generic/347 72s ... - output mismatch (see
/root/project/xfstests/results//generic/347.out.bad)
--- tests/generic/347.out   2016-05-12 11:56:32.086618744 -0600
+++ /root/project/xfstests/results//generic/347.out.bad 2018-01-17
16:04:33.459348448 -0700
@@ -1,2 +1,3 @@
 QA output created by 347
+mount: /mnt/xfstests_scratch: can't read superblock on
/dev/mapper/thin-vol.
 === completed
...
(Run 'diff -u tests/generic/347.out
/root/project/xfstests/results//generic/347.out.bad'  to see the entire
diff)

This is expected because the dm-thin target currently lacks DAX support.

Just skip this test if we are using DAX.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 tests/generic/347 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/generic/347 b/tests/generic/347
index 3adc6744..0c2dec98 100755
--- a/tests/generic/347
+++ b/tests/generic/347
@@ -72,6 +72,7 @@ _supported_fs generic
 _supported_os Linux
 _require_scratch_nocheck
 _require_dm_target thin-pool
+_exclude_scratch_mount_option dax
 
 _setup_thin
 _workout
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH] NVDIMM: Reduced-the-ND_MIN_NAMESPACE_SIZE-from-4MB-to-4KB

2018-01-16 Thread Ross Zwisler
On Thu, Jan 11, 2018 at 06:49:17PM +, Cheng-mean Liu (SOCCER) wrote:
> In the case of emulated NVDIMM devices in the VM environment, there
> are scenarios that NVDIMM device with much smaller sizes are desired, for 
> example, we might
> use a single enumerated NVDIMM DAX device for representing each container 
> layer, which in some
> cases could be just a few KBs size.The current ND_MIN_NAMESPACE_SIZE is 4MB. 
> To avoid wasting
> address and inefficient zero padding for meeting this 4MB min requirement, 
> the proposed change is to
> reduce it to 4KB, a single page size, is a size good for all platforms.

A few comments:

- You need to trim the lines of your changelog just as you would in an email
  on-list.  No more than 80 characters per line, please.

- The above information needs to be in the changelogs for the individual
  patches so that it will live on if they are committed.  This allows
  developers to find this info later as well when they bisect the code, etc.

- It doesn't work to have multiple patches for multiple git repos appended
  together in the same email.  Developers need to be able to save off the text
  from your email and feed it into "git am" without having to edit it.  These
  two patches need to be sent separately (possibly with different CC lists),
  and each with it's own complete changelog.  It's fine if they reference one
  another, but they need to each stand on their own.

- Both of your patches have broken whitespace.  Here's what happens if I try
  and apply the kernel patch from your mail:

  $ git am ~/patch/kernel.patch
  Applying: reduced the ND_MIN_NAMESPACE_SIZE from 4MB to 4KB
  error: corrupt patch at line 10
  error: could not build fake ancestor
  Patch failed at 0001 reduced the ND_MIN_NAMESPACE_SIZE from 4MB to 4KB
  The copy of the patch that failed is found in:
  /home/rzwisler/project/linux/.git/worktrees/review/rebase-apply/patch
  When you have resolved this problem, run "git am --continue".
  If you prefer to skip this patch, run "git am --skip" instead.
  To restore the original branch and stop patching, run "git am --abort".

  You should be able to save off your mail and feed it into 'git am', because
  that's exactly what your reviewers and the maintainer will do.

I'd highly recommend just using git for creating the patches (git
format-patch) and sending them out (git send-email).  If you need help doing
this from within the Microsoft network I'm betting Matthew can probably help
you out.

- Ross

> Two patches are included in this request :
> 
> 1. A patch for Linux kernel changes
> 2. A patch for ndctl project to keep it in sync with the Linux kernel header 
> file
> 
> 
> From 29e173c32661d976cda073438979991167ee13fc Mon Sep 17 00:00:00 2001
> From: Cheng-mean Liu 
> Date: Thu, 11 Jan 2018 10:06:13 -0800
> Subject: [PATCH] reduced the ND_MIN_NAMESPACE_SIZE from 4MB to 4KB
> 
> Signed-off-by: Cheng-mean Liu 
> ---
> include/uapi/linux/ndctl.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
> index 3f03567631cb..e63c201ed1ef 100644
> --- a/include/uapi/linux/ndctl.h
> +++ b/include/uapi/linux/ndctl.h
> @@ -263,7 +263,7 @@ enum nd_driver_flags {
> };
>  enum {
> -  ND_MIN_NAMESPACE_SIZE = 0x0040,
> + ND_MIN_NAMESPACE_SIZE = 0x1000,
> };
>  enum ars_masks {
> --
> 2.11.0
> 
> 
> 
> From 2bf3e2bbfae81ab50d141571414c0e6556bc0e0c Mon Sep 17 00:00:00 2001
> From: Cheng-mean Liu 
> Date: Thu, 11 Jan 2018 10:02:52 -0800
> Subject: [PATCH] reduced the ND_MIN_NAMESPACE_SIZE from 4MB to 4KB
> 
> Signed-off-by: Cheng-mean Liu 
> ---
> ndctl/ndctl.h| 2 +-
> test/dpa-alloc.c | 6 ++
> 2 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/ndctl/ndctl.h b/ndctl/ndctl.h
> index 5e6905c..8c14d90 100644
> --- a/ndctl/ndctl.h
> +++ b/ndctl/ndctl.h
> @@ -263,7 +263,7 @@ enum nd_driver_flags {
> };
>  enum {
> -  ND_MIN_NAMESPACE_SIZE = 0x0040,
> + ND_MIN_NAMESPACE_SIZE = 0x1000,
> };
>  enum ars_masks {
> diff --git a/test/dpa-alloc.c b/test/dpa-alloc.c
> index d13cf5d..ba3deed 100644
> --- a/test/dpa-alloc.c
> +++ b/test/dpa-alloc.c
> @@ -237,6 +237,12 @@ static int do_test(struct ndctl_ctx *ctx, struct 
> ndctl_test *test)
> uuid_unparse(namespaces[i].uuid, uuid_str);
>size = ndctl_namespace_get_size(victim);
> +
> + rc = ndctl_namespace_disable_invalidate(victim);
> +if (rc) {
> + fprintf(stderr, "failed to 
> disable %s\n", uuid_str);
> + return rc;
> + }
>rc = ndctl_namespace_delete(victim);
>if (rc) {
>  

Re: [PATCH v3 2/4] KVM: X86: Fix loss of exception which has not yet injected

2018-01-06 Thread Ross Zwisler
On Wed, Aug 23, 2017 at 10:21 PM, Wanpeng Li  wrote:
> From: Wanpeng Li 
>
> vmx_complete_interrupts() assumes that the exception is always injected,
> so it would be dropped by kvm_clear_exception_queue(). This patch separates
> exception.pending from exception.injected, exception.inject represents the
> exception is injected or the exception should be reinjected due to vmexit
> occurs during event delivery in VMX non-root operation. exception.pending
> represents the exception is queued and will be cleared when injecting the
> exception to the guest. So exception.pending and exception.injected can
> cooperate to guarantee exception will not be lost.
>
> Reported-by: Radim Krčmář 
> Cc: Paolo Bonzini 
> Cc: Radim Krčmář 
> Signed-off-by: Wanpeng Li 
> ---

I'm seeing a regression in my QEMU based NVDIMM testing system, and I
bisected it to this commit.

The behavior I'm seeing is that heavy I/O to simulated NVDIMMs in
multiple virtual machines causes the QEMU guests to receive double
faults, crashing them.  Here's an example backtrace:

[ 1042.653816] PANIC: double fault, error_code: 0x0
[ 1042.654398] CPU: 2 PID: 30257 Comm: fsstress Not tainted 4.15.0-rc5 #1
[ 1042.655169] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.10.2-2.fc27 04/01/2014
[ 1042.656121] RIP: 0010:memcpy_flushcache+0x4d/0x180
[ 1042.656631] RSP: 0018:ac098c7d3808 EFLAGS: 00010286
[ 1042.657245] RAX: ac0d18ca8000 RBX: 0fe0 RCX: ac0d18ca8000
[ 1042.658085] RDX: 921aaa5df000 RSI: 921aaa5e RDI: 19f26e6c9000
[ 1042.658802] RBP: 1000 R08:  R09: 
[ 1042.659503] R10:  R11:  R12: 921aaa5df020
[ 1042.660306] R13: ac0d18ca8000 R14: f4c102a977c0 R15: 1000
[ 1042.661132] FS:  7f71530b90c0() GS:921b3b28()
knlGS:
[ 1042.662051] CS:  0010 DS:  ES:  CR0: 80050033
[ 1042.662528] CR2: 01156002 CR3: 00012a936000 CR4: 06e0
[ 1042.663093] Call Trace:
[ 1042.663329]  write_pmem+0x6c/0xa0 [nd_pmem]
[ 1042.663668]  pmem_do_bvec+0x15f/0x330 [nd_pmem]
[ 1042.664056]  ? kmem_alloc+0x61/0xe0 [xfs]
[ 1042.664393]  pmem_make_request+0xdd/0x220 [nd_pmem]
[ 1042.664781]  generic_make_request+0x11f/0x300
[ 1042.665135]  ? submit_bio+0x6c/0x140
[ 1042.665436]  submit_bio+0x6c/0x140
[ 1042.665754]  ? next_bio+0x18/0x40
[ 1042.666025]  ? _cond_resched+0x15/0x40
[ 1042.666341]  submit_bio_wait+0x53/0x80
[ 1042.666804]  blkdev_issue_zeroout+0xdc/0x210
[ 1042.667336]  ? __dax_zero_page_range+0xb5/0x140
[ 1042.667810]  __dax_zero_page_range+0xb5/0x140
[ 1042.668197]  ? xfs_file_iomap_begin+0x2bd/0x8e0 [xfs]
[ 1042.668611]  iomap_zero_range_actor+0x7c/0x1b0
[ 1042.668974]  ? iomap_write_actor+0x170/0x170
[ 1042.669318]  iomap_apply+0xa4/0x110
[ 1042.669616]  ? iomap_write_actor+0x170/0x170
[ 1042.669958]  iomap_zero_range+0x52/0x80
[ 1042.670255]  ? iomap_write_actor+0x170/0x170
[ 1042.670616]  xfs_setattr_size+0xd4/0x330 [xfs]
[ 1042.670995]  xfs_ioc_space+0x27e/0x2f0 [xfs]
[ 1042.671332]  ? terminate_walk+0x87/0xf0
[ 1042.671662]  xfs_file_ioctl+0x862/0xa40 [xfs]
[ 1042.672035]  ? _copy_to_user+0x22/0x30
[ 1042.672346]  ? cp_new_stat+0x150/0x180
[ 1042.672663]  do_vfs_ioctl+0xa1/0x610
[ 1042.672960]  ? SYSC_newfstat+0x3c/0x60
[ 1042.673264]  SyS_ioctl+0x74/0x80
[ 1042.673661]  entry_SYSCALL_64_fastpath+0x1a/0x7d
[ 1042.674239] RIP: 0033:0x7f71525a2dc7
[ 1042.674681] RSP: 002b:7ffef97aa778 EFLAGS: 0246 ORIG_RAX:
0010
[ 1042.675664] RAX: ffda RBX: 000112bc RCX: 7f71525a2dc7
[ 1042.676592] RDX: 7ffef97aa7a0 RSI: 40305825 RDI: 0003
[ 1042.677520] RBP: 0009 R08: 0045 R09: 7ffef97aa78c
[ 1042.678442] R10:  R11: 0246 R12: 0003
[ 1042.679330] R13: 00019e38 R14: 000fcca7 R15: 0016
[ 1042.680216] Code: 48 8d 5d e0 4c 8d 62 20 48 89 cf 48 29 d7 48 89
de 48 83 e6 e0 4c 01 e6 48 8d 04 17 4c 8b 02 4c 8b 4a 08 4c 8b 52 10
4c 8b 5a 18 <4c> 0f c3 00 4c 0f c3 48 08 4c 0f c3 50 10 4c 0f c3 58 18
48 83

This appears to be independent of both the guest kernel version (this
backtrace has v4.15.0-rc5, but I've seen it with other kernels) as
well as independent of the host QMEU version (mine happens to be
qemu-2.10.1-2.fc27 in Fedora 27).

The new behavior is due to this commit being present in the host OS
kernel.  Prior to this commit I could fire up 4 VMs and run xfstests
on my simulated NVDIMMs, but after this commit such testing results in
multiple of my VMs crashing almost immediately.

Reproduction is very simple, at least on my development box.  All you
need are a pair of VMs (I just did it with clean installs of Fedora
27) with NVDIMMs.  Here's a sample QEMU command to get 

[PATCH 2/2] ext4: test for inline data + DAX corruption

2018-01-04 Thread Ross Zwisler
Add a regression test for the following kernel commit:

  ext4: prevent data corruption with inline data + DAX

The test passes either if we don't encounter corruption, or if mounting
with DAX + inline data fails.  The latter is the way that we prevent this
issue in the kernel.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>

---

Changes since v1:

 - Changed ordering of .gitignore entries. (Eryu)

 - Added '_require_scratch_ext4_feature "inline_data"' test. (Eryu)
---
 .gitignore |  1 +
 src/Makefile   |  2 +-
 src/t_ext4_dax_inline_corruption.c | 70 +++
 tests/ext4/031 | 86 ++
 tests/ext4/031.out |  2 +
 tests/ext4/group   |  1 +
 6 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 src/t_ext4_dax_inline_corruption.c
 create mode 100755 tests/ext4/031
 create mode 100644 tests/ext4/031.out

diff --git a/.gitignore b/.gitignore
index 840e4fe4..09dd01cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -116,6 +116,7 @@
 /src/t_dir_type
 /src/t_encrypted_d_revalidate
 /src/t_ext4_dax_journal_corruption
+/src/t_ext4_dax_inline_corruption
 /src/t_futimens
 /src/t_getcwd
 /src/t_holes
diff --git a/src/Makefile b/src/Makefile
index 86c5440c..b96b8cf2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,7 +14,7 @@ TARGETS = dirstress fill fill2 getpagesize holes lstat64 \
t_mmap_writev t_truncate_cmtime dirhash_collide t_rename_overwrite \
holetest t_truncate_self t_mmap_dio af_unix t_mmap_stale_pmd \
t_mmap_cow_race t_mmap_fallocate fsync-err t_mmap_write_ro \
-   t_ext4_dax_journal_corruption
+   t_ext4_dax_journal_corruption t_ext4_dax_inline_corruption
 
 LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
preallo_rw_pattern_writer ftrunc trunc fs_perms testx looptest \
diff --git a/src/t_ext4_dax_inline_corruption.c 
b/src/t_ext4_dax_inline_corruption.c
new file mode 100644
index ..4b7d8938
--- /dev/null
+++ b/src/t_ext4_dax_inline_corruption.c
@@ -0,0 +1,70 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define PAGE(a) ((a)*0x1000)
+#define STRLEN 256
+
+void err_exit(char *op)
+{
+   fprintf(stderr, "%s: %s\n", op, strerror(errno));
+   exit(1);
+}
+
+int main(int argc, char *argv[])
+{
+   int fd, err, len = PAGE(1);
+   char *dax_data, *data;
+   char string[STRLEN];
+
+   if (argc < 2) {
+   printf("Usage: %s \n", basename(argv[0]));
+   exit(0);
+   }
+
+   srand(time(NULL));
+   snprintf(string, STRLEN, "random number %d\n", rand());
+
+   fd = open(argv[1], O_RDWR);
+   if (fd < 0)
+   err_exit("fd");
+
+   data = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+   if (!data)
+   err_exit("mmap data");
+
+   /* this fallocate turns off inline data and turns on DAX */
+   fallocate(fd, 0, 0, PAGE(2));
+
+   dax_data = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+   if (!dax_data)
+   err_exit("mmap dax_data");
+
+   /*
+* Write the data using the non-DAX mapping, and try and read it back
+* using the DAX mapping.
+*/
+   strcpy(data, string);
+   if (strcmp(dax_data, string) != 0)
+   printf("Data miscompare\n");
+
+   err = munmap(dax_data, len);
+   if (err < 0)
+   err_exit("munmap dax_data");
+
+   err = munmap(data, len);
+   if (err < 0)
+   err_exit("munmap data");
+
+   err = close(fd);
+   if (err < 0)
+   err_exit("close");
+   return 0;
+}
diff --git a/tests/ext4/031 b/tests/ext4/031
new file mode 100755
index ..58177b44
--- /dev/null
+++ b/tests/ext4/031
@@ -0,0 +1,86 @@
+#! /bin/bash
+# FS QA Test ext4/031
+#
+# This is a regression test for kernel patch:
+#   ext4: prevent data corruption with inline data + DAX
+# created by Ross Zwisler <ross.zwis...@linux.intel.com>
+#
+#---
+# Copyright (c) 2017-2018 Intel Corporation.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+

[PATCH 1/2] ext4: test for DAX + journaling corruption

2018-01-04 Thread Ross Zwisler
Add a regression test for the following kernel commit:

  ext4: prevent data corruption with journaling + DAX

The test passes if either we successfully compare the data between the mmap
with journaling turned on and the one with journaling turned off, or if we
fail the chattr command to turn on or off journaling.  The latter is how we
prevent this issue in the kernel.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>

---

Changes since v1:

 - Reordered .gitignore entry. (Eryu)

 - Added comments about how "chattr +j" turns off DAX and about why we need
   the 'nodelalloc' mount option. (Eryu)

 - Added a _require_command for chattr. (Eryu)

 - Added $here for src/t_ext4_dax_journal_corruption command. (Eryu)

The previous version of this series is here:

https://lists.01.org/pipermail/linux-nvdimm/2017-September/012463.html

The related kernel patches were merged in v4.15-rc1.
---
 .gitignore  |   1 +
 src/Makefile|   3 +-
 src/t_ext4_dax_journal_corruption.c | 102 
 tests/ext4/030  |  74 ++
 tests/ext4/030.out  |   2 +
 tests/ext4/group|   1 +
 6 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 src/t_ext4_dax_journal_corruption.c
 create mode 100755 tests/ext4/030
 create mode 100644 tests/ext4/030.out

diff --git a/.gitignore b/.gitignore
index f27c30af..840e4fe4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -115,6 +115,7 @@
 /src/t_dir_offset2
 /src/t_dir_type
 /src/t_encrypted_d_revalidate
+/src/t_ext4_dax_journal_corruption
 /src/t_futimens
 /src/t_getcwd
 /src/t_holes
diff --git a/src/Makefile b/src/Makefile
index b1012172..86c5440c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -13,7 +13,8 @@ TARGETS = dirstress fill fill2 getpagesize holes lstat64 \
multi_open_unlink dmiperf unwritten_sync genhashnames t_holes \
t_mmap_writev t_truncate_cmtime dirhash_collide t_rename_overwrite \
holetest t_truncate_self t_mmap_dio af_unix t_mmap_stale_pmd \
-   t_mmap_cow_race t_mmap_fallocate fsync-err t_mmap_write_ro
+   t_mmap_cow_race t_mmap_fallocate fsync-err t_mmap_write_ro \
+   t_ext4_dax_journal_corruption
 
 LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
preallo_rw_pattern_writer ftrunc trunc fs_perms testx looptest \
diff --git a/src/t_ext4_dax_journal_corruption.c 
b/src/t_ext4_dax_journal_corruption.c
new file mode 100644
index ..18a2acdc
--- /dev/null
+++ b/src/t_ext4_dax_journal_corruption.c
@@ -0,0 +1,102 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define PAGE(a) ((a)*0x1000)
+#define STRLEN 256
+
+void err_exit(char *op)
+{
+   fprintf(stderr, "%s: %s\n", op, strerror(errno));
+   exit(1);
+}
+
+void chattr_cmd(char *chattr, char *cmd, char *file)
+{
+   int ret;
+   char command[STRLEN];
+
+   ret = snprintf(command, STRLEN, "%s %s %s 2>/dev/null", chattr, cmd, 
file);
+   if (ret < 0)
+   err_exit("snprintf");
+
+   ret = system(command);
+   if (ret) /* Success - the kernel fix is to have this chattr fail */
+   exit(77);
+}
+
+int main(int argc, char *argv[])
+{
+   int fd, err, len = PAGE(1);
+   char *data, *dax_data, *chattr, *file;
+   char string[STRLEN];
+
+   if (argc < 3) {
+   printf("Usage: %s  \n", 
basename(argv[0]));
+   exit(0);
+   }
+
+   chattr = argv[1];
+   file = argv[2];
+
+   srand(time(NULL));
+   snprintf(string, STRLEN, "random number %d\n", rand());
+
+   fd = open(file, O_RDWR|O_CREAT, S_IRUSR|S_IWUSR);
+   if (fd < 0)
+   err_exit("fd");
+
+   /* begin with journaling off and DAX on */
+   chattr_cmd(chattr, "-j", file);
+
+   ftruncate(fd, 0);
+   fallocate(fd, 0, 0, len);
+
+   dax_data = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+   if (!dax_data)
+   err_exit("mmap dax_data");
+
+   /*
+* This turns on journaling.  It also has the side-effect that it
+* turns off DAX for the given inode since journaling and DAX aren't
+* allowed to be on at the same time.  This happens in
+* ext4_change_inode_journal_flag() in kernel v4.14 and before.
+*
+* Note that this turns off the runtime DAX flag (S_DAX) in the
+* in-memory inode, and has nothing to do with per-inode on-media DAX
+* inode flags.
+*/
+   chattr_cmd(chattr, "+j", file);
+
+   data = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+   if (!data)
+   err_exit("mmap data");
+
+   /*
+* Write the data using the non-DAX mapping, and try and re

Re: [PATCH v4 08/18] tools/testing/nvdimm: add 'bio_delay' mechanism

2017-12-27 Thread Ross Zwisler
On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote:
> In support of testing truncate colliding with dma add a mechanism that
> delays the completion of block I/O requests by a programmable number of
> seconds. This allows a truncate operation to be issued while page
> references are held for direct-I/O.
> 
> Signed-off-by: Dan Williams 

> @@ -387,4 +389,64 @@ union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle 
> handle, const guid_t *g
>  }
>  EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm);
>  
> +static DEFINE_SPINLOCK(bio_lock);
> +static struct bio *biolist;
> +int bio_do_queue;
> +
> +static void run_bio(struct work_struct *work)
> +{
> + struct delayed_work *dw = container_of(work, typeof(*dw), work);
> + struct bio *bio, *next;
> +
> + pr_info("%s\n", __func__);

Did you mean to leave this print in, or was it part of your debug while
developing?  I don't see any other prints in the rest of the nvdimm testing
code?

> + spin_lock(_lock);
> + bio_do_queue = 0;
> + bio = biolist;
> + biolist = NULL;
> + spin_unlock(_lock);
> +
> + while (bio) {
> + next = bio->bi_next;
> + bio->bi_next = NULL;
> + bio_endio(bio);
> + bio = next;
> + }
> + kfree(dw);
> +}
> +
> +void nfit_test_inject_bio_delay(int sec)
> +{
> + struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL);
> +
> + spin_lock(_lock);
> + if (!bio_do_queue) {
> + pr_info("%s: %d seconds\n", __func__, sec);

Ditto with this print - did you mean to leave it in?

> + INIT_DELAYED_WORK(dw, run_bio);
> + bio_do_queue = 1;
> + schedule_delayed_work(dw, sec * HZ);
> + dw = NULL;

Why set dw = NULL here?  In the else case we leak dw - was this dw=NULL meant
to allow a kfree(dw) after we get out of the if() (and probably after we drop
the spinlock)?

> + }
> + spin_unlock(_lock);
> +}
> +EXPORT_SYMBOL_GPL(nfit_test_inject_bio_delay);
> +

> diff --git a/tools/testing/nvdimm/test/nfit.c 
> b/tools/testing/nvdimm/test/nfit.c
> index 7217b2b953b5..9362b01e9a8f 100644
> --- a/tools/testing/nvdimm/test/nfit.c
> +++ b/tools/testing/nvdimm/test/nfit.c
> @@ -872,6 +872,39 @@ static const struct attribute_group 
> *nfit_test_dimm_attribute_groups[] = {
>   NULL,
>  };
>  
> +static ssize_t bio_delay_show(struct device_driver *drv, char *buf)
> +{
> + return sprintf(buf, "0\n");
> +}

It doesn't seem like this _show() routine adds much?  We could have it print
out the value of 'bio_do_queue' so we can see if we are currently queueing
bios in a workqueue element, but that suffers pretty badly from a TOCTOU race.

Otherwise we could just omit the _show() altogether and just use
DRIVER_ATTR_WO(bio_delay).

> +
> +static ssize_t bio_delay_store(struct device_driver *drv, const char *buf,
> + size_t count)
> +{
> + unsigned long delay;
> + int rc = kstrtoul(buf, 0, );
> +
> + if (rc < 0)
> + return rc;
> +
> + nfit_test_inject_bio_delay(delay);
> + return count;
> +}
> +DRIVER_ATTR_RW(bio_delay);

   DRIVER_ATTR_WO(bio_delay);  ?
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v4 07/18] dax: store pfns in the radix

2017-12-26 Thread Ross Zwisler
On Sat, Dec 23, 2017 at 04:56:38PM -0800, Dan Williams wrote:
> In preparation for examining the busy state of dax pages in the truncate
> path, switch from sectors to pfns in the radix.
> 
> Cc: Jan Kara <j...@suse.cz>
> Cc: Jeff Moyer <jmo...@redhat.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: Matthew Wilcox <mawil...@microsoft.com>
> Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
> ---
>  drivers/dax/super.c |   15 --
>  fs/dax.c|   75 
> ++-
>  2 files changed, 39 insertions(+), 51 deletions(-)
<>
> @@ -688,7 +685,7 @@ static int dax_writeback_one(struct block_device *bdev,
>* compare sectors as we must not bail out due to difference in lockbit
>* or entry type.
>*/

Can you please also fix the comment above this test so it talks about pfns
instead of sectors?

> - if (dax_radix_sector(entry2) != dax_radix_sector(entry))
> + if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
>   goto put_unlocked;
>   if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
>   dax_is_zero_entry(entry))) {
> @@ -718,29 +715,11 @@ static int dax_writeback_one(struct block_device *bdev,
>* 'entry'.  This allows us to flush for PMD_SIZE and not have to
>* worry about partial PMD writebacks.
>*/

Ditto for this comment ^^^

> - sector = dax_radix_sector(entry);
> + pfn = dax_radix_pfn(entry);
>   size = PAGE_SIZE << dax_radix_order(entry);
>  
> - id = dax_read_lock();
> - ret = bdev_dax_pgoff(bdev, sector, size, );
> - if (ret)
> - goto dax_unlock;
> -
> - /*
> -  * dax_direct_access() may sleep, so cannot hold tree_lock over
> -  * its invocation.
> -  */
> - ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, , );
> - if (ret < 0)
> - goto dax_unlock;
> -
> - if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
> - ret = -EIO;
> - goto dax_unlock;
> - }
> -
> - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
> - dax_flush(dax_dev, kaddr, size);
> + dax_mapping_entry_mkclean(mapping, index, pfn);
> + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
>   /*
>* After we have flushed the cache, we can clear the dirty tag. There
>* cannot be new dirty data in the pfn after the flush has completed as
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 0/3] create sysfs representation of ACPI HMAT

2017-12-22 Thread Ross Zwisler
On Fri, Dec 22, 2017 at 02:53:42PM -0800, Dan Williams wrote:
> On Thu, Dec 21, 2017 at 12:31 PM, Brice Goglin <brice.gog...@gmail.com> wrote:
> > Le 20/12/2017 à 23:41, Ross Zwisler a écrit :
> [..]
> > Hello
> >
> > I can confirm that HPC runtimes are going to use these patches (at least
> > all runtimes that use hwloc for topology discovery, but that's the vast
> > majority of HPC anyway).
> >
> > We really didn't like KNL exposing a hacky SLIT table [1]. We had to
> > explicitly detect that specific crazy table to find out which NUMA nodes
> > were local to which cores, and to find out which NUMA nodes were
> > HBM/MCDRAM or DDR. And then we had to hide the SLIT values to the
> > application because the reported latencies didn't match reality. Quite
> > annoying.
> >
> > With Ross' patches, we can easily get what we need:
> > * which NUMA nodes are local to which CPUs? /sys/devices/system/node/
> > can only report a single local node per CPU (doesn't work for KNL and
> > upcoming architectures with HBM+DDR+...)
> > * which NUMA nodes are slow/fast (for both bandwidth and latency)
> > And we can still look at SLIT under /sys/devices/system/node if really
> > needed.
> >
> > And of course having this in sysfs is much better than parsing ACPI
> > tables that are only accessible to root :)
> 
> On this point, it's not clear to me that we should allow these sysfs
> entries to be world readable. Given /proc/iomem now hides physical
> address information from non-root we at least need to be careful not
> to undo that with new sysfs HMAT attributes.

This enabling does not expose any physical addresses to userspace.  It only
provides performance numbers from the HMAT and associates them with existing
NUMA nodes.  Are you worried that exposing performance numbers to non-root
users via sysfs poses a security risk?

> Once you need to be root for this info, is parsing binary HMAT vs sysfs a
> blocker for the HPC use case?
> 
> Perhaps we can enlist /proc/iomem or a similar enumeration interface
> to tell userspace the NUMA node and whether the kernel thinks it has
> better or worse performance characteristics relative to base
> system-RAM, i.e. new IORES_DESC_* values. I'm worried that if we start
> publishing absolute numbers in sysfs userspace will default to looking
> for specific magic numbers in sysfs vs asking the kernel for memory
> that has performance characteristics relative to base "System RAM". In
> other words the absolute performance information that the HMAT
> publishes is useful to the kernel, but it's not clear that userspace
> needs that vs a relative indicator for making NUMA node preference
> decisions.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 0/3] create sysfs representation of ACPI HMAT

2017-12-22 Thread Ross Zwisler
On Fri, Dec 22, 2017 at 08:39:41AM +0530, Anshuman Khandual wrote:
> On 12/14/2017 07:40 AM, Ross Zwisler wrote:
<>
> > We solve this issue by providing userspace with performance information on
> > individual memory ranges.  This performance information is exposed via
> > sysfs:
> > 
> >   # grep . mem_tgt2/* mem_tgt2/local_init/* 2>/dev/null
> >   mem_tgt2/firmware_id:1
> >   mem_tgt2/is_cached:0
> >   mem_tgt2/local_init/read_bw_MBps:40960
> >   mem_tgt2/local_init/read_lat_nsec:50
> >   mem_tgt2/local_init/write_bw_MBps:40960
> >   mem_tgt2/local_init/write_lat_nsec:50
<>
> We will enlist properties for all possible "source --> target" on the system?

Nope, just 'local' initiator/target pairs.  I talk about the reasoning for
this in the cover letter for patch 3:

https://lists.01.org/pipermail/linux-nvdimm/2017-December/013574.html

> Right now it shows only bandwidth and latency properties, can it accommodate
> other properties as well in future ?

We also have an 'is_cached' attribute for the memory targets if they are
involved in a caching hierarchy, but right now those are all the things we
expose.  We can potentially expose whatever we want that is present in the
HMAT, but those seemed like a good start.

I noticed that in your presentation you had some other examples of attributes
you cared about:

 * reliability
 * power consumption
 * density

The HMAT doesn't provide this sort of information at present, but we
could/would add them to sysfs if the HMAT ever grew support for them.

> > This allows applications to easily find the memory that they want to use.
> > We expect that the existing NUMA APIs will be enhanced to use this new
> > information so that applications can continue to use them to select their
> > desired memory.
> 
> I had presented a proposal for NUMA redesign in the Plumbers Conference this
> year where various memory devices with different kind of memory attributes
> can be represented in the kernel and be used explicitly from the user space.
> Here is the link to the proposal if you feel interested. The proposal is
> very intrusive and also I dont have a RFC for it yet for discussion here.
> 
> https://linuxplumbersconf.org/2017/ocw//system/presentations/4656/original/Hierarchical_NUMA_Design_Plumbers_2017.pdf
> 
> Problem is, designing the sysfs interface for memory attribute detection
> from user space without first thinking about redesigning the NUMA for
> heterogeneous memory may not be a good idea. Will look into this further.

I took another look at your presentation, and overall I think that if/when a
NUMA redesign like this takes place ACPI systems with HMAT tables will be able
to participate.  But I think we are probably a ways away from that, and like I
said in my previous mail ACPI systems with memory-only NUMA nodes are going to
exist and need to be supported with the current NUMA scheme.  Hence I don't
think that this patch series conflicts with your proposal.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 0/3] create sysfs representation of ACPI HMAT

2017-12-22 Thread Ross Zwisler
On Fri, Dec 22, 2017 at 08:39:41AM +0530, Anshuman Khandual wrote:
> On 12/14/2017 07:40 AM, Ross Zwisler wrote:
> >  Quick Summary 
> > 
> > Platforms exist today which have multiple types of memory attached to a
> > single CPU.  These disparate memory ranges have some characteristics in
> > common, such as CPU cache coherence, but they can have wide ranges of
> > performance both in terms of latency and bandwidth.
> 
> Right.
> 
> > 
> > For example, consider a system that contains persistent memory, standard
> > DDR memory and High Bandwidth Memory (HBM), all attached to the same CPU.
> > There could potentially be an order of magnitude or more difference in
> > performance between the slowest and fastest memory attached to that CPU.
> 
> Right.
> 
> > 
> > With the current Linux code NUMA nodes are CPU-centric, so all the memory
> > attached to a given CPU will be lumped into the same NUMA node.  This makes
> > it very difficult for userspace applications to understand the performance
> > of different memory ranges on a given CPU.
> 
> Right but that might require fundamental changes to the NUMA representation.
> Plugging those memory as separate NUMA nodes, identify them through sysfs
> and try allocating from it through mbind() seems like a short term solution.
> 
> Though if we decide to go in this direction, sysfs interface or something
> similar is required to enumerate memory properties.

Yep, and this patch series is trying to be the sysfs interface that is
required to the memory properties.  :)  It's a certainty that we will have
memory-only NUMA nodes, at least on platforms that support ACPI.  Supporting
memory-only proximity domains (which Linux turns in to memory-only NUMA nodes)
is explicitly supported with the introduction of the HMAT in ACPI 6.2.

It also turns out that the existing memory management code already deals with
them just fine - you see this with my hmat_examples setup:

https://github.com/rzwisler/hmat_examples

Both configurations created by this repo create memory-only NUMA nodes, even
with upstream kernels.  My patches don't change that, they just provide a
sysfs representation of the HMAT so users can discover the memory that exists
in the system.

> > We solve this issue by providing userspace with performance information on
> > individual memory ranges.  This performance information is exposed via
> > sysfs:
> > 
> >   # grep . mem_tgt2/* mem_tgt2/local_init/* 2>/dev/null
> >   mem_tgt2/firmware_id:1
> >   mem_tgt2/is_cached:0
> >   mem_tgt2/local_init/read_bw_MBps:40960
> >   mem_tgt2/local_init/read_lat_nsec:50
> >   mem_tgt2/local_init/write_bw_MBps:40960
> >   mem_tgt2/local_init/write_lat_nsec:50
> 
> I might have missed discussions from earlier versions, why we have this
> kind of a "source --> target" model ? We will enlist properties for all
> possible "source --> target" on the system ? Right now it shows only
> bandwidth and latency properties, can it accommodate other properties
> as well in future ?

The initiator/target model is useful in preventing us from needing a
MAX_NUMA_NODES x MAX_NUMA_NODES sized table for each performance attribute.  I
talked about it a little more here:

https://lists.01.org/pipermail/linux-nvdimm/2017-December/013654.html

> > This allows applications to easily find the memory that they want to use.
> > We expect that the existing NUMA APIs will be enhanced to use this new
> > information so that applications can continue to use them to select their
> > desired memory.
> 
> I had presented a proposal for NUMA redesign in the Plumbers Conference this
> year where various memory devices with different kind of memory attributes
> can be represented in the kernel and be used explicitly from the user space.
> Here is the link to the proposal if you feel interested. The proposal is
> very intrusive and also I dont have a RFC for it yet for discussion here.
> 
> https://linuxplumbersconf.org/2017/ocw//system/presentations/4656/original/Hierarchical_NUMA_Design_Plumbers_2017.pdf

I'll take a look, but my first reaction is that I agree with Dave that it
seems hard to re-teach systems a new NUMA scheme.  This patch series doesn't
attempt to do that - it is very unintrusive and only informs users about the
memory-only NUMA nodes that will already exist in their ACPI-based systems.

> Problem is, designing the sysfs interface for memory attribute detection
> from user space without first thinking about redesigning the NUMA for
> heterogeneous memory may not be a good idea. Will look into this further.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[xfsprogs PATCH v4 2/2] xfs_io: add a new 'log_writes' command

2017-12-21 Thread Ross Zwisler
Add a new 'log_writes' command to xfs_io so that we can add dm-log-writes
log marks.  It's helpful to allow users of xfs_io to adds these marks from
within xfs_io instead of waiting until after xfs_io exits because then they
are able to replay the dm-log-writes log up to immediately after another
xfs_io operation such as mwrite.  This isolates the log replay from other
operations that happen as part of xfs_io exiting (file handles being
closed, mmaps being torn down, etc.).  This also allows users to insert
multiple marks between different xfs_io commands.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Suggested-by: Dave Chinner <da...@fromorbit.com>
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>
---
 configure.ac|   1 +
 debian/control  |   2 +-
 include/builddefs.in|   2 +
 io/Makefile |   6 +++
 io/init.c   |   1 +
 io/io.h |   6 +++
 io/log_writes.c | 106 
 m4/Makefile |   1 +
 m4/package_devmapper.m4 |  11 +
 man/man8/xfs_io.8   |  23 ++-
 10 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 io/log_writes.c
 create mode 100644 m4/package_devmapper.m4

diff --git a/configure.ac b/configure.ac
index f3325aa0..f83d5817 100644
--- a/configure.ac
+++ b/configure.ac
@@ -164,6 +164,7 @@ AC_NEED_INTERNAL_FSXATTR
 AC_HAVE_GETFSMAP
 AC_HAVE_STATFS_FLAGS
 AC_HAVE_MAP_SYNC
+AC_HAVE_DEVMAPPER
 
 if test "$enable_blkid" = yes; then
 AC_HAVE_BLKID_TOPO
diff --git a/debian/control b/debian/control
index ad816622..5c26e3d7 100644
--- a/debian/control
+++ b/debian/control
@@ -3,7 +3,7 @@ Section: admin
 Priority: optional
 Maintainer: XFS Development Team <linux-...@vger.kernel.org>
 Uploaders: Nathan Scott <nath...@debian.org>, Anibal Monsalve Salazar 
<ani...@debian.org>
-Build-Depends: uuid-dev, dh-autoreconf, debhelper (>= 5), gettext, libtool, 
libreadline-gplv2-dev | libreadline5-dev, libblkid-dev (>= 2.17), linux-libc-dev
+Build-Depends: uuid-dev, dh-autoreconf, debhelper (>= 5), gettext, libtool, 
libreadline-gplv2-dev | libreadline5-dev, libblkid-dev (>= 2.17), 
linux-libc-dev, libdevmapper-dev
 Standards-Version: 3.9.1
 Homepage: http://xfs.org/
 
diff --git a/include/builddefs.in b/include/builddefs.in
index 126f7e95..66bdbfa2 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -35,6 +35,7 @@ LIBTERMCAP = @libtermcap@
 LIBEDITLINE = @libeditline@
 LIBREADLINE = @libreadline@
 LIBBLKID = @libblkid@
+LIBDEVMAPPER = @libdevmapper@
 LIBXFS = $(TOPDIR)/libxfs/libxfs.la
 LIBXCMD = $(TOPDIR)/libxcmd/libxcmd.la
 LIBXLOG = $(TOPDIR)/libxlog/libxlog.la
@@ -116,6 +117,7 @@ NEED_INTERNAL_FSXATTR = @need_internal_fsxattr@
 HAVE_GETFSMAP = @have_getfsmap@
 HAVE_STATFS_FLAGS = @have_statfs_flags@
 HAVE_MAP_SYNC = @have_map_sync@
+HAVE_DEVMAPPER = @have_devmapper@
 
 GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
 # -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl
diff --git a/io/Makefile b/io/Makefile
index 2987ee11..b7585a1b 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -107,6 +107,12 @@ ifeq ($(HAVE_MAP_SYNC),yes)
 LCFLAGS += -DHAVE_MAP_SYNC
 endif
 
+ifeq ($(HAVE_DEVMAPPER),yes)
+CFILES += log_writes.c
+LLDLIBS += $(LIBDEVMAPPER)
+LCFLAGS += -DHAVE_DEVMAPPER
+endif
+
 # On linux we get fsmap from the system or define it ourselves
 # so include this based on platform type.  If this reverts to only
 # the autoconf check w/o local definition, change to testing HAVE_GETFSMAP
diff --git a/io/init.c b/io/init.c
index 20d5f80d..34d87b5f 100644
--- a/io/init.c
+++ b/io/init.c
@@ -72,6 +72,7 @@ init_commands(void)
help_init();
imap_init();
inject_init();
+   log_writes_init();
madvise_init();
mincore_init();
mmap_init();
diff --git a/io/io.h b/io/io.h
index 8b2753b3..9349cc75 100644
--- a/io/io.h
+++ b/io/io.h
@@ -186,3 +186,9 @@ extern void fsmap_init(void);
 #else
 # define fsmap_init()  do { } while (0)
 #endif
+
+#ifdef HAVE_DEVMAPPER
+extern voidlog_writes_init(void);
+#else
+#define log_writes_init()  do { } while (0)
+#endif
diff --git a/io/log_writes.c b/io/log_writes.c
new file mode 100644
index ..46ea1c26
--- /dev/null
+++ b/io/log_writes.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this pr

[xfsprogs PATCH v4 1/2] xfs_io: add MAP_SYNC support to mmap()

2017-12-21 Thread Ross Zwisler
Add support for a new -S flag to xfs_io's mmap command.  This opens the
mapping with the (MAP_SYNC | MAP_SHARED_VALIDATE) flags instead of the
standard MAP_SHARED flag.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Suggested-by: Dave Chinner <da...@fromorbit.com>
---
 configure.ac  |  1 +
 include/builddefs.in  |  1 +
 include/linux.h   |  8 
 io/Makefile   |  4 
 io/io.h   |  1 +
 io/mmap.c | 29 -
 m4/package_libcdev.m4 | 16 
 man/man8/xfs_io.8 |  6 +-
 8 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/configure.ac b/configure.ac
index e5d0309f..f3325aa0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -163,6 +163,7 @@ AC_HAVE_MREMAP
 AC_NEED_INTERNAL_FSXATTR
 AC_HAVE_GETFSMAP
 AC_HAVE_STATFS_FLAGS
+AC_HAVE_MAP_SYNC
 
 if test "$enable_blkid" = yes; then
 AC_HAVE_BLKID_TOPO
diff --git a/include/builddefs.in b/include/builddefs.in
index fd274ddc..126f7e95 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -115,6 +115,7 @@ HAVE_MREMAP = @have_mremap@
 NEED_INTERNAL_FSXATTR = @need_internal_fsxattr@
 HAVE_GETFSMAP = @have_getfsmap@
 HAVE_STATFS_FLAGS = @have_statfs_flags@
+HAVE_MAP_SYNC = @have_map_sync@
 
 GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
 # -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl
diff --git a/include/linux.h b/include/linux.h
index 6ce344c5..1998941a 100644
--- a/include/linux.h
+++ b/include/linux.h
@@ -327,4 +327,12 @@ fsmap_advance(
 #define HAVE_GETFSMAP
 #endif /* HAVE_GETFSMAP */
 
+#ifndef HAVE_MAP_SYNC
+#define MAP_SYNC 0
+#define MAP_SHARED_VALIDATE 0
+#else
+#include 
+#include 
+#endif /* HAVE_MAP_SYNC */
+
 #endif /* __XFS_LINUX_H__ */
diff --git a/io/Makefile b/io/Makefile
index 6725936d..2987ee11 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -103,6 +103,10 @@ ifeq ($(HAVE_MREMAP),yes)
 LCFLAGS += -DHAVE_MREMAP
 endif
 
+ifeq ($(HAVE_MAP_SYNC),yes)
+LCFLAGS += -DHAVE_MAP_SYNC
+endif
+
 # On linux we get fsmap from the system or define it ourselves
 # so include this based on platform type.  If this reverts to only
 # the autoconf check w/o local definition, change to testing HAVE_GETFSMAP
diff --git a/io/io.h b/io/io.h
index 3862985f..8b2753b3 100644
--- a/io/io.h
+++ b/io/io.h
@@ -65,6 +65,7 @@ typedef struct mmap_region {
size_t  length; /* length of mapping */
off64_t offset; /* start offset into backing file */
int prot;   /* protection mode of the mapping */
+   boolmap_sync;   /* is this a MAP_SYNC mapping? */
char*name;  /* name of backing file */
 } mmap_region_t;
 
diff --git a/io/mmap.c b/io/mmap.c
index b0c1f764..04a828a4 100644
--- a/io/mmap.c
+++ b/io/mmap.c
@@ -42,7 +42,7 @@ print_mapping(
int index,
int braces)
 {
-   unsigned char   buffer[8] = { 0 };
+   charbuffer[8] = { 0 };
int i;
 
static struct {
@@ -57,6 +57,10 @@ print_mapping(
 
for (i = 0, p = pflags; p->prot != PROT_NONE; i++, p++)
buffer[i] = (map->prot & p->prot) ? p->mode : '-';
+
+   if (map->map_sync)
+   sprintf([i], " S");
+
printf("%c%03d%c 0x%lx - 0x%lx %s  %14s (%lld : %ld)\n",
braces? '[' : ' ', index, braces? ']' : ' ',
(unsigned long)map->addr,
@@ -147,6 +151,7 @@ mmap_help(void)
 " -r -- map with PROT_READ protection\n"
 " -w -- map with PROT_WRITE protection\n"
 " -x -- map with PROT_EXEC protection\n"
+" -S -- map with MAP_SYNC and MAP_SHARED_VALIDATE flags\n"
 " -s  -- first do mmap(size)/munmap(size), try to reserve some free 
space\n"
 " If no protection mode is specified, all are used by default.\n"
 "\n"));
@@ -162,7 +167,7 @@ mmap_f(
void*address = NULL;
char*filename;
size_t  blocksize, sectsize;
-   int c, prot = 0;
+   int c, prot = 0, flags = MAP_SHARED;
 
exitcode = 1;
if (argc == 1) {
@@ -186,7 +191,7 @@ mmap_f(
 
init_cvtnum(, );
 
-   while ((c = getopt(argc, argv, "rwxs:")) != EOF) {
+   while ((c = getopt(argc, argv, "rwxSs:")) != EOF) {
switch (c) {
case 'r':
prot |= PROT_READ;
@@ -197,6 +202,19 @@ mmap_f(
case 'x':
prot |= PROT_EXEC;
break;
+   case 'S':
+   flags = MAP_SYNC | MAP_SHARED_VALIDATE;
+
+   /*
+* If MAP_SYNC and MAP_SHARED_VALIDATE aren't defined
+* in the system headers we will have defined the

Re: [xfsprogs PATCH v2 2/3] xfs_io: add MAP_SYNC support to mmap()

2017-12-21 Thread Ross Zwisler
On Thu, Dec 21, 2017 at 09:09:08AM -0800, Darrick J. Wong wrote:
> On Tue, Dec 05, 2017 at 04:56:50PM -0700, Ross Zwisler wrote:

> > @@ -195,6 +200,13 @@ mmap_f(
> > case 'x':
> > prot |= PROT_EXEC;
> > break;
> > +   case 'S':
> > +   flags = MAP_SYNC | MAP_SHARED_VALIDATE;
> > +   if (!flags) {
> 
> Heh, subtle. :)
> 
> /me wonders if it'd be better to do this explicitly:
> 
> #ifdef HAVE_MAP_SYNC
>   flags = MAP_SYNC | MAP_SHARED_VALIDATE;
> #else
>   printf("MAP_SYNC not supported\n");
>   return 1;
> #endif
> 
> ...though it's ugly.

Yea, I was trying to avoid #ifdefery.  If you prefer this, though, I'm happy
to change it.  Or maybe a comment would be enough?

/*
 * If MAP_SYNC and MAP_SHARED_VALIDATE aren't defined in the system headers we
 * will have defined them both as 0.
 */

?

> > +   printf("MAP_SYNC not supported\n");
> > +   return 0;
> 
> Are we supposed to be returning nonzero values for failing commands?

I don't think so.  All the other error conditions in this function also return
0.  I think the important thing is that 'exitcode' is set to 1 at the
beginning of the function per Dave's patch,

https://www.spinics.net/lists/linux-xfs/msg13605.html

which I pulled into my baseline:

https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/xfsprogs-dev.git/refs/?h=map_sync_v3

So, I likewise leave 'exitcode' as 1, bail out with a return code of 0, and
then you get the overall failure return of 1 from xfs_io at the shell.

> > @@ -764,7 +764,7 @@ Each (sec, nsec) pair constitutes a single timestamp 
> > value.
> >  
> >  .SH MEMORY MAPPED I/O COMMANDS
> >  .TP
> > -.BI "mmap [ " N " | [[ \-rwx ] [\-s " size " ] " "offset length " ]]
> > +.BI "mmap [ " N " | [[ \-rwxS ] [\-s " size " ] " "offset length " ]]
> >  With no arguments,
> >  .B mmap
> >  shows the current mappings. Specifying a single numeric argument
> > @@ -780,6 +780,10 @@ PROT_WRITE
> >  .RB ( \-w ),
> >  and PROT_EXEC
> >  .RB ( \-x ).
> > +The mapping will be created with the MAP_SHARED flag by default, or with 
> > the
> > +Linux specific (MAP_SYNC | MAP_SHARED_VALIDATE) flags if
> 
> I assume I'll be able to look up exactly what MAP_SYNC provides in the mmap
> manpage, right?

Yep, Jan updated the man page for both MAP_SYNC and MAP_SHARED_VALIDATE:
https://lists.01.org/pipermail/linux-nvdimm/2017-November/013158.html

Thank you for the review.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 2/2] ext4: Fix ENOSPC handling in DAX page fault handler

2017-12-21 Thread Ross Zwisler
On Thu, Dec 21, 2017 at 05:30:55PM +0100, Jan Kara wrote:
> When allocation of underlying block for a page fault fails, we fail the
> fault with SIGBUS. However we may well hit ENOSPC just due to lots of
> free blocks being held by the running / committing transaction. So
> propagate the error from ext4_iomap_begin() and implement do standard
> allocation retry loop in ext4_dax_huge_fault().
> 
> Signed-off-by: Jan Kara <j...@suse.cz>

Looks good.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 1/2] dax: Pass detailed error code from dax_iomap_fault()

2017-12-21 Thread Ross Zwisler
On Thu, Dec 21, 2017 at 05:30:54PM +0100, Jan Kara wrote:
> Ext4 needs to pass through error from its iomap handler to the page
> fault handler so that it can properly detect ENOSPC and force
> transaction commit and retry the fault (and block allocation). Add
> argument to dax_iomap_fault() for passing such error.
> 
> Signed-off-by: Jan Kara <j...@suse.cz>

I like how much simpler this version is.  Looks good.

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [xfsprogs PATCH v2 0/3] Add necessary items for MAP_SYNC testing

2017-12-21 Thread Ross Zwisler
On Wed, Dec 13, 2017 at 09:45:52AM -0700, Ross Zwisler wrote:
> On Tue, Dec 05, 2017 at 04:56:48PM -0700, Ross Zwisler wrote:
> > This is the second revision of my MAP_SYNC + dm-log-writes support for
> > xfsprogs.
> 
> Friendly ping on this xfsprogs series.

ping
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 0/3] create sysfs representation of ACPI HMAT

2017-12-20 Thread Ross Zwisler
On Wed, Dec 20, 2017 at 10:19:37AM -0800, Matthew Wilcox wrote:
> On Mon, Dec 18, 2017 at 01:35:47PM -0700, Ross Zwisler wrote:
> > What I'm hoping to do with this series is to just provide a sysfs
> > representation of the HMAT so that applications can know which NUMA nodes to
> > select with existing utilities like numactl.  This series does not currently
> > alter any kernel behavior, it only provides a sysfs interface.
> > 
> > Say for example you had a system with some high bandwidth memory (HBM), and
> > you wanted to use it for a specific application.  You could use the sysfs
> > representation of the HMAT to figure out which memory target held your HBM.
> > You could do this by looking at the local bandwidth values for the various
> > memory targets, so:
> > 
> > # grep . /sys/devices/system/hmat/mem_tgt*/local_init/write_bw_MBps
> > /sys/devices/system/hmat/mem_tgt2/local_init/write_bw_MBps:81920
> > /sys/devices/system/hmat/mem_tgt3/local_init/write_bw_MBps:40960
> > /sys/devices/system/hmat/mem_tgt4/local_init/write_bw_MBps:40960
> > /sys/devices/system/hmat/mem_tgt5/local_init/write_bw_MBps:40960
> > 
> > and look for the one that corresponds to your HBM speed. (These numbers are
> > made up, but you get the idea.)
> 
> Presumably ACPI-based platforms will not be the only ones who have the
> ability to expose different bandwidth memories in the future.  I think
> we need a platform-agnostic way ... right, PowerPC people?

Hey Matthew,

Yep, this is where I started as well.  My plan with my initial implementation
was to try and make the sysfs representation as platform agnostic as possible,
and just have the ACPI HMAT as one of the many places to gather the data
needed to populate sysfs.

However, as I began coding the implementation became very specific to the
HMAT, probably because I don't know of way that this type of info is
represented on another platform.  John Hubbard noticed the same thing and
asked me to s/HMEM/HMAT/ everywhere and just make it HMAT specific, and to
prevent it from being confused with the HMM work:

https://lkml.org/lkml/2017/7/7/33
https://lkml.org/lkml/2017/7/7/442

I'm open to making it more platform agnostic if I can get my hands on a
parallel effort in another platform and tease out the commonality, but trying
to do that without a second example hasn't worked out.  If we don't have a
good second example right now I think maybe we should put this in and then
merge it with the second example when it comes along.

> I don't know what the right interface is, but my laptop has a set of
> /sys/devices/system/memory/memoryN/ directories.  Perhaps this is the
> right place to expose write_bw (etc).
> 
> > Once you know the NUMA node of your HBM, you can figure out the NUMA node of
> > it's local initiator:
> > 
> > # ls -d /sys/devices/system/hmat/mem_tgt2/local_init/mem_init*
> > /sys/devices/system/hmat/mem_tgt2/local_init/mem_init0
> > 
> > So, in our made-up example our HBM is located in numa node 2, and the local
> > CPU for that HBM is at numa node 0.
> 
> initiator is a CPU?  I'd have expected you to expose a memory controller
> abstraction rather than re-use storage terminology.

Yea, I agree that at first blush it seems weird.  It turns out that looking at
it in sort of a storage initiator/target way is beneficial, though, because it
allows us to cut down on the number of data values we need to represent.

For example the SLIT, which doesn't differentiate between initiator and target
proximity domains (and thus nodes) always represents a system with N proximity
domains using a NxN distance table.  This makes sense if every node contains
both CPUs and memory.

With the introduction of the HMAT, though, we can have memory-only initiator
nodes and we can explicitly associate them with their local CPU.  This is
necessary so that we can separate memory with different performance
characteristics (HBM vs normal memory vs persistent memory, for example) that
are all attached to the same CPU.

So, say we now have a system with 4 CPUs, and each of those CPUs has 3
different types of memory attached to it.  We now have 16 total proximity
domains, 4 CPU and 12 memory.

If we represent this with the SLIT we end up with a 16 X 16 distance table
(256 entries), most of which don't matter because they are memory-to-memory
distances which don't make sense.

In the HMAT, though, we separate out the initiators and the targets and put
them into separate lists.  (See 5.2.27.4 System Locality Latency and Bandwidth
Information Structure in ACPI 6.2 for details.)  So, this same config in the
HMAT only has 4*12=48 performance values of each type, all of which convey
meaningful information.

The HMAT indeed even uses the storage "initiator" and "target" terminology. :)
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 0/3] create sysfs representation of ACPI HMAT

2017-12-20 Thread Ross Zwisler
On Mon, Dec 18, 2017 at 01:35:47PM -0700, Ross Zwisler wrote:
> On Thu, Dec 14, 2017 at 02:00:32PM +0100, Michal Hocko wrote:
<>
> > What is the testing procedure? How can I setup qemu to simlate such HW?
> 
> Well, the QEMU table simulation is gross, so I'd rather not get everyone
> testing with that.  Injecting custom HMAT and SRAT tables via initrd/initramfs
> is a much better way:
> 
> https://www.kernel.org/doc/Documentation/acpi/initrd_table_override.txt
> 
> Dan recently posted a patch that lets this happen for the HMAT:
> 
> https://lists.01.org/pipermail/linux-nvdimm/2017-December/013545.html
> 
> I'm working right now on getting an easier way to generate HMAT tables - I'll
> let you know when I have something working.

I've posted details on how to set up test configurations using injected HMAT
and SRAT tables here:

https://github.com/rzwisler/hmat_examples

So far I've got two different sample configs, and we can add more as they are
useful.  Having the sample configs in github is also nice because if someone
finds a config that causes a kernel issue it can be reported then added to
this list of example configs for future testing.

Please let me know if you have trouble getting this working.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 0/2] ext4: Fix ENOSPC handling for DAX faults

2017-12-19 Thread Ross Zwisler
On Tue, Dec 19, 2017 at 03:30:51PM +0100, Jan Kara wrote:
> On Fri 15-12-17 12:36:25, Ross Zwisler wrote:
> > On Wed, Dec 13, 2017 at 10:13:50AM +0100, Jan Kara wrote:
> > > Hello,
> > > 
> > > these two patches fix handling of ENOSPC during DAX faults. The problem is
> > > that currently running transaction may be holding lots of already freed
> > > blocks which can be reallocated only once the transaction commits. 
> > > Standard
> > > retry logic in ext4_iomap_end() does not work for DAX page fault handler
> > > since we hold current transaction open in ext4_dax_huge_fault() and thus
> > > retry logic cannot force the running transaction and as a result 
> > > application
> > > gets SIGBUS due to premature ENOSPC error.
> > > 
> > > These two patches fix the problem. I'm not too happy about patch 1/2 
> > > passing
> > > additional info (error code) from the fault handler but I don't see an
> > > easy better option since we want to also pass back special return values
> > > like VM_FAULT_FALLBACK. Comments are welcome.
> > 
> > I also don't love having two error codes coming back out of the DAX fault
> > handlers.  I'm worried that we'll end up forgetting to set errp in some 
> > cases,
> > and will only set the VM_FAULT_* error code.
> > 
> > I wonder if a better way to solve this would be to define a new
> > VM_FAULT_NOSPC, just like we have VM_FAULT_OOM for ENOMEM errors?  
> > Essentially
> > what it seems like we are saying is that the very general return of just
> > VM_FAULT_SIGBUS doesn't provide us enough information, and that being able 
> > to
> > distinguish that from ENOSPC errors would be useful.
> 
> Yes, we need to propagate more information than just VM_FAULT_SIGBUS from
> ->iomap_begin() down into the page fault handler.
> 
> > With that flag we would have 2 choices:
> > 
> > 1) Add VM_FAULT_NOSPC to the VM_FAULT_ERROR mask, and then update things 
> > like
> > mm_fault_error() appropriately so, like the other errors in this class, it
> > results in SIGBUS, or
> > 
> > 2) Just always return (VM_FAULT_NOSPC|VM_FAULT_SIGBUS), which I think would
> > mean that you wouldn't need to alter mm_fault_error() et al.
> > 
> > Do either of these sound appealing?
> 
> What I don't like about VM_FAULT_NOSPC is that it is polluting generic
> VM_FAULT_ namespace for something that just needs to be propagated from one
> ext4 function to another through the DAX helper function. In particular
> generic MM has nothing to do with such error.
> 
> If forgetting to set *errp is your only concern, I think I can address that
> by making the argument there 'int *iomap_ret' and storing there return from
> ->iomap_begin() unconditionally. That will work for ext4 as well and we
> won't forget to set it. Thoughts?

Yea, this sounds nicer to me.  Less chance we'll forget to set it somewhere,
and it's much more specific about the error that it's trying to capture.
Sounds good.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 0/3] create sysfs representation of ACPI HMAT

2017-12-18 Thread Ross Zwisler
On Thu, Dec 14, 2017 at 02:00:32PM +0100, Michal Hocko wrote:
> [CC linix-api]

Oh, thanks.  I'll add them to my CC list for sysfs related changes in the
future.

> On Wed 13-12-17 19:10:16, Ross Zwisler wrote:
> > This is the third revision of my patches adding a sysfs representation
> > of the ACPI Heterogeneous Memory Attribute Table (HMAT).  These patches
> > are based on v4.15-rc3 and a working tree can be found here:
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/linux.git/log/?h=hmat_v3
> > 
> > My goal is to get these patches merged for v4.16.
> 
> Has actually reviewed the overal design already for this to be 4.16
> thing? I do not see any acks/reviewed-bys in any of the patches...
> 
> > Changes from previous version (https://lkml.org/lkml/2017/7/6/749):
> 
> ... comments on this last posting are touching the surface rather than
> really discuss the overal design.

Yep, that's a fair assessment.  I would love a more in-depth review of the
code so far. :)

What I'm hoping to do with this series is to just provide a sysfs
representation of the HMAT so that applications can know which NUMA nodes to
select with existing utilities like numactl.  This series does not currently
alter any kernel behavior, it only provides a sysfs interface.

Say for example you had a system with some high bandwidth memory (HBM), and
you wanted to use it for a specific application.  You could use the sysfs
representation of the HMAT to figure out which memory target held your HBM.
You could do this by looking at the local bandwidth values for the various
memory targets, so:

# grep . /sys/devices/system/hmat/mem_tgt*/local_init/write_bw_MBps
/sys/devices/system/hmat/mem_tgt2/local_init/write_bw_MBps:81920
/sys/devices/system/hmat/mem_tgt3/local_init/write_bw_MBps:40960
/sys/devices/system/hmat/mem_tgt4/local_init/write_bw_MBps:40960
/sys/devices/system/hmat/mem_tgt5/local_init/write_bw_MBps:40960

and look for the one that corresponds to your HBM speed. (These numbers are
made up, but you get the idea.)

Alternatively if you knew the physical addresses of your HBM you could look
for it by finding the numa node that owns the appropriate memory sections, so:

# ls -d /sys/devices/system/hmat/mem_tgt2/node2/memory*
/sys/devices/system/hmat/mem_tgt2/node2/memory0
/sys/devices/system/hmat/mem_tgt2/node2/memory1
etc.

Once you know the NUMA node of your HBM, you can figure out the NUMA node of
it's local initiator:

# ls -d /sys/devices/system/hmat/mem_tgt2/local_init/mem_init*
/sys/devices/system/hmat/mem_tgt2/local_init/mem_init0

So, in our made-up example our HBM is located in numa node 2, and the local
CPU for that HBM is at numa node 0.

You would then use numactl to bind your app to those numa nodes:

numactl --membind=2 --cpunodebind=0 ./my_application

Does that make sense?

Eventually we can enhance numactl so it can automatically choose memory with
higher bandwidth, etc., but I think just this bit of kernel enabling gets us
started in the right direction.

> >  - Changed "HMEM" to "HMAT" and "hmem" to "hmat" throughout to make sure
> >that this effort doesn't get confused with Jerome's HMM work and to
> >make it clear that this enabling is tightly tied to the ACPI HMAT
> >table.  (John Hubbard)
> > 
> >  - Moved the link in the initiator (i.e. mem_init0/mem_tgt2) from
> >pointing to the "mem_tgt2/local_init" attribute group to instead
> >point at the mem_tgt2 target itself.  (Brice Goglin)
> > 
> >  - Simplified the contents of both the initiators and the targets so
> >that we just symlink to the NUMA node and don't duplicate
> >information.  For initiators this means that we no longer enumerate
> >CPUs, and for targets this means that we don't provide physical
> >address start and length information.  All of this is already
> >available in the NUMA node directory itself (i.e.
> >/sys/devices/system/node/node0), and it already accounts for the fact
> >that both multiple CPUs and multiple memory regions can be owned by a
> >given NUMA node.  Also removed some extra attributes (is_enabled,
> >is_isolated) which I don't think are useful at this point in time.
> > 
> > I have tested this against many different configs that I implemented
> > using qemu.
> 
> What is the testing procedure? How can I setup qemu to simlate such HW?

Well, the QEMU table simulation is gross, so I'd rather not get everyone
testing with that.  Injecting custom HMAT and SRAT tables via initrd/initramfs
is a much better way:

https://www.kernel.org/doc/Documentation/acpi/initrd_table_override.txt

Dan recently posted a patch 

Re: [PATCH] acpi: add NFIT and HMAT to the initrd override list

2017-12-18 Thread Ross Zwisler
On Fri, Dec 08, 2017 at 08:29:25AM -0800, Dan Williams wrote:
> These tables, NFIT and HMAT, are essential for describing
> next-generation platform memory topologies and performance
> characteristics. Allow them to be overridden for debug and test and
> purposes.
> 
> Cc: Ross Zwisler <ross.zwis...@linux.intel.com>

Yes please!  This works in my test setup for inserting custom HMAT tables, and
I would really like to have this for testing my HMAT series:

https://lists.01.org/pipermail/linux-nvdimm/2017-December/013571.html

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>

> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
> ---
>  drivers/acpi/tables.c |3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
> index 80ce2a7d224b..67a44fd79449 100644
> --- a/drivers/acpi/tables.c
> +++ b/drivers/acpi/tables.c
> @@ -456,7 +456,8 @@ static const char * const table_sigs[] = {
>   ACPI_SIG_SLIC, ACPI_SIG_SPCR, ACPI_SIG_SPMI, ACPI_SIG_TCPA,
>   ACPI_SIG_UEFI, ACPI_SIG_WAET, ACPI_SIG_WDAT, ACPI_SIG_WDDT,
>   ACPI_SIG_WDRT, ACPI_SIG_DSDT, ACPI_SIG_FADT, ACPI_SIG_PSDT,
> - ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, NULL };
> + ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, ACPI_SIG_NFIT,
> + ACPI_SIG_HMAT, NULL };
>  
>  #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header)
>  
> 
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 2/3] hmat: add heterogeneous memory sysfs support

2017-12-15 Thread Ross Zwisler
On Fri, Dec 15, 2017 at 01:52:03AM +0100, Rafael J. Wysocki wrote:
<>
> > diff --git a/drivers/acpi/hmat/core.c b/drivers/acpi/hmat/core.c
> > new file mode 100644
> > index ..61b90dadf84b
> > --- /dev/null
> > +++ b/drivers/acpi/hmat/core.c
> > @@ -0,0 +1,536 @@
> > +/*
> > + * Heterogeneous Memory Attributes Table (HMAT) representation in sysfs
> > + *
> > + * Copyright (c) 2017, Intel Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms and conditions of the GNU General Public License,
> > + * version 2, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but WITHOUT
> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
> > for
> > + * more details.
> > + */
> 
> Minor nit for starters: you should use SPDX license indentifiers in
> new files and if you do so, the license boilerplace is not necessary
> any more.

Okay, I'll fix that up.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 0/2] ext4: Fix ENOSPC handling for DAX faults

2017-12-15 Thread Ross Zwisler
On Wed, Dec 13, 2017 at 10:13:50AM +0100, Jan Kara wrote:
> Hello,
> 
> these two patches fix handling of ENOSPC during DAX faults. The problem is
> that currently running transaction may be holding lots of already freed
> blocks which can be reallocated only once the transaction commits. Standard
> retry logic in ext4_iomap_end() does not work for DAX page fault handler
> since we hold current transaction open in ext4_dax_huge_fault() and thus
> retry logic cannot force the running transaction and as a result application
> gets SIGBUS due to premature ENOSPC error.
> 
> These two patches fix the problem. I'm not too happy about patch 1/2 passing
> additional info (error code) from the fault handler but I don't see an
> easy better option since we want to also pass back special return values
> like VM_FAULT_FALLBACK. Comments are welcome.

I also don't love having two error codes coming back out of the DAX fault
handlers.  I'm worried that we'll end up forgetting to set errp in some cases,
and will only set the VM_FAULT_* error code.

I wonder if a better way to solve this would be to define a new
VM_FAULT_NOSPC, just like we have VM_FAULT_OOM for ENOMEM errors?  Essentially
what it seems like we are saying is that the very general return of just
VM_FAULT_SIGBUS doesn't provide us enough information, and that being able to
distinguish that from ENOSPC errors would be useful.

With that flag we would have 2 choices:

1) Add VM_FAULT_NOSPC to the VM_FAULT_ERROR mask, and then update things like
mm_fault_error() appropriately so, like the other errors in this class, it
results in SIGBUS, or

2) Just always return (VM_FAULT_NOSPC|VM_FAULT_SIGBUS), which I think would
mean that you wouldn't need to alter mm_fault_error() et al.

Do either of these sound appealing?
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH v3 1/3] acpi: HMAT support in acpi_parse_entries_array()

2017-12-13 Thread Ross Zwisler
The current implementation of acpi_parse_entries_array() assumes that each
subtable has a standard ACPI subtable entry of type struct
acpi_subtable_header.  This standard subtable header has a one byte length
followed by a one byte type.

The HMAT subtables have to allow for a longer length so they have subtable
headers of type struct acpi_hmat_structure which has a 2 byte type and a 4
byte length.

Enhance the subtable parsing in acpi_parse_entries_array() so that it can
handle these new HMAT subtables.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 drivers/acpi/tables.c | 52 ---
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 80ce2a7d224b..f777b94c234a 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -218,6 +218,33 @@ void acpi_table_print_madt_entry(struct 
acpi_subtable_header *header)
}
 }
 
+static unsigned long __init
+acpi_get_entry_type(char *id, void *entry)
+{
+   if (strncmp(id, ACPI_SIG_HMAT, 4) == 0)
+   return ((struct acpi_hmat_structure *)entry)->type;
+   else
+   return ((struct acpi_subtable_header *)entry)->type;
+}
+
+static unsigned long __init
+acpi_get_entry_length(char *id, void *entry)
+{
+   if (strncmp(id, ACPI_SIG_HMAT, 4) == 0)
+   return ((struct acpi_hmat_structure *)entry)->length;
+   else
+   return ((struct acpi_subtable_header *)entry)->length;
+}
+
+static unsigned long __init
+acpi_get_subtable_header_length(char *id)
+{
+   if (strncmp(id, ACPI_SIG_HMAT, 4) == 0)
+   return sizeof(struct acpi_hmat_structure);
+   else
+   return sizeof(struct acpi_subtable_header);
+}
+
 /**
  * acpi_parse_entries_array - for each proc_num find a suitable subtable
  *
@@ -242,10 +269,10 @@ acpi_parse_entries_array(char *id, unsigned long 
table_size,
struct acpi_subtable_proc *proc, int proc_num,
unsigned int max_entries)
 {
-   struct acpi_subtable_header *entry;
-   unsigned long table_end;
+   unsigned long table_end, subtable_header_length;
int count = 0;
int errs = 0;
+   void *entry;
int i;
 
if (acpi_disabled)
@@ -263,19 +290,23 @@ acpi_parse_entries_array(char *id, unsigned long 
table_size,
}
 
table_end = (unsigned long)table_header + table_header->length;
+   subtable_header_length = acpi_get_subtable_header_length(id);
 
/* Parse all entries looking for a match. */
 
-   entry = (struct acpi_subtable_header *)
-   ((unsigned long)table_header + table_size);
+   entry = (void *)table_header + table_size;
+
+   while (((unsigned long)entry) + subtable_header_length  < table_end) {
+   unsigned long entry_type, entry_length;
 
-   while (((unsigned long)entry) + sizeof(struct acpi_subtable_header) <
-  table_end) {
if (max_entries && count >= max_entries)
break;
 
+   entry_type = acpi_get_entry_type(id, entry);
+   entry_length = acpi_get_entry_length(id, entry);
+
for (i = 0; i < proc_num; i++) {
-   if (entry->type != proc[i].id)
+   if (entry_type != proc[i].id)
continue;
if (!proc[i].handler ||
 (!errs && proc[i].handler(entry, table_end))) {
@@ -290,16 +321,15 @@ acpi_parse_entries_array(char *id, unsigned long 
table_size,
count++;
 
/*
-* If entry->length is 0, break from this loop to avoid
+* If entry_length is 0, break from this loop to avoid
 * infinite loop.
 */
-   if (entry->length == 0) {
+   if (entry_length == 0) {
pr_err("[%4.4s:0x%02x] Invalid zero length\n", id, 
proc->id);
return -EINVAL;
}
 
-   entry = (struct acpi_subtable_header *)
-   ((unsigned long)entry + entry->length);
+   entry += entry_length;
}
 
if (max_entries && count > max_entries) {
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH v3 2/3] hmat: add heterogeneous memory sysfs support

2017-12-13 Thread Ross Zwisler
Add a new sysfs subsystem, /sys/devices/system/hmat, which surfaces
information about memory initiators and memory targets to the user.  These
initiators and targets are described by the ACPI SRAT and HMAT tables.

A "memory initiator" in this case is a NUMA node containing one or more
devices such as CPU or separate memory I/O devices that can initiate
memory requests.  A "memory target" is NUMA node containing at least one
CPU-accessible physical address range.

The key piece of information surfaced by this patch is the mapping between
the ACPI table "proximity domain" numbers, held in the "firmware_id"
attribute, and Linux NUMA node numbers.  Every ACPI proximity domain will
end up being a unique NUMA node in Linux, but the numbers may get reordered
and Linux can create extra NUMA nodes that don't map back to ACPI proximity
domains.  The firmware_id value is needed if anyone ever wants to look at
the ACPI HMAT and SRAT tables directly and make sense of how they map to
NUMA nodes in Linux.

Initiators are found at /sys/devices/system/hmat/mem_initX, and the
attributes for a given initiator look like this:

  # tree mem_init0
  mem_init0
  ├── firmware_id
  ├── node0 -> ../../node/node0
  ├── power
  │   ├── async
  │   ...
  ├── subsystem -> ../../../../bus/hmat
  └── uevent

Where "mem_init0" on my system represents the CPU acting as a memory
initiator at NUMA node 0.  Users can discover which CPUs are part of this
memory initiator by following the node0 symlink and looking at cpumap,
cpulist and the cpu* symlinks.

Targets are found at /sys/devices/system/hmat/mem_tgtX, and the attributes
for a given target look like this:

  # tree mem_tgt2
  mem_tgt2
  ├── firmware_id
  ├── is_cached
  ├── node2 -> ../../node/node2
  ├── power
  │   ├── async
  │   ...
  ├── subsystem -> ../../../../bus/hmat
  └── uevent

Users can discover information about the memory owned by this memory target
by following the node2 symlink and looking at meminfo, vmstat and at the
memory* memory section symlinks.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 MAINTAINERS   |   6 +
 drivers/acpi/Kconfig  |   1 +
 drivers/acpi/Makefile |   1 +
 drivers/acpi/hmat/Kconfig |   7 +
 drivers/acpi/hmat/Makefile|   2 +
 drivers/acpi/hmat/core.c  | 536 ++
 drivers/acpi/hmat/hmat.h  |  47 
 drivers/acpi/hmat/initiator.c |  43 
 drivers/acpi/hmat/target.c|  55 +
 9 files changed, 698 insertions(+)
 create mode 100644 drivers/acpi/hmat/Kconfig
 create mode 100644 drivers/acpi/hmat/Makefile
 create mode 100644 drivers/acpi/hmat/core.c
 create mode 100644 drivers/acpi/hmat/hmat.h
 create mode 100644 drivers/acpi/hmat/initiator.c
 create mode 100644 drivers/acpi/hmat/target.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 82ad0eabce4f..64ebec0708de 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6366,6 +6366,12 @@ S:   Supported
 F: drivers/scsi/hisi_sas/
 F: Documentation/devicetree/bindings/scsi/hisilicon-sas.txt
 
+HMAT - ACPI Heterogeneous Memory Attribute Table Support
+M: Ross Zwisler <ross.zwis...@linux.intel.com>
+L: linux...@kvack.org
+S: Supported
+F: drivers/acpi/hmat/
+
 HMM - Heterogeneous Memory Management
 M: Jérôme Glisse <jgli...@redhat.com>
 L: linux...@kvack.org
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 46505396869e..21cdd1288430 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -466,6 +466,7 @@ config ACPI_REDUCED_HARDWARE_ONLY
  If you are unsure what to do, do not enable this option.
 
 source "drivers/acpi/nfit/Kconfig"
+source "drivers/acpi/hmat/Kconfig"
 
 source "drivers/acpi/apei/Kconfig"
 source "drivers/acpi/dptf/Kconfig"
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 41954a601989..ed5eab6b0412 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_ACPI_PROCESSOR)  += processor.o
 obj-$(CONFIG_ACPI) += container.o
 obj-$(CONFIG_ACPI_THERMAL) += thermal.o
 obj-$(CONFIG_ACPI_NFIT)+= nfit/
+obj-$(CONFIG_ACPI_HMAT)+= hmat/
 obj-$(CONFIG_ACPI) += acpi_memhotplug.o
 obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o
 obj-$(CONFIG_ACPI_BATTERY) += battery.o
diff --git a/drivers/acpi/hmat/Kconfig b/drivers/acpi/hmat/Kconfig
new file mode 100644
index ..954ad4701005
--- /dev/null
+++ b/drivers/acpi/hmat/Kconfig
@@ -0,0 +1,7 @@
+config ACPI_HMAT
+   bool "ACPI Heterogeneous Memory Attribute Table Support"
+   depends on ACPI_NUMA
+   depends on SYSFS
+   help
+ Exports a sysfs representation of the ACPI Heterogeneous Memory
+ Attributes Table (HMAT).
diff --git a/drivers/acpi/hmat/Makefile b/drivers/acpi/hmat/Makefile
new file mode 100644
index 

[PATCH v3 0/3] create sysfs representation of ACPI HMAT

2017-12-13 Thread Ross Zwisler
This is the third revision of my patches adding a sysfs representation
of the ACPI Heterogeneous Memory Attribute Table (HMAT).  These patches
are based on v4.15-rc3 and a working tree can be found here:

https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/linux.git/log/?h=hmat_v3

My goal is to get these patches merged for v4.16.

Changes from previous version (https://lkml.org/lkml/2017/7/6/749):

 - Changed "HMEM" to "HMAT" and "hmem" to "hmat" throughout to make sure
   that this effort doesn't get confused with Jerome's HMM work and to
   make it clear that this enabling is tightly tied to the ACPI HMAT
   table.  (John Hubbard)

 - Moved the link in the initiator (i.e. mem_init0/mem_tgt2) from
   pointing to the "mem_tgt2/local_init" attribute group to instead
   point at the mem_tgt2 target itself.  (Brice Goglin)

 - Simplified the contents of both the initiators and the targets so
   that we just symlink to the NUMA node and don't duplicate
   information.  For initiators this means that we no longer enumerate
   CPUs, and for targets this means that we don't provide physical
   address start and length information.  All of this is already
   available in the NUMA node directory itself (i.e.
   /sys/devices/system/node/node0), and it already accounts for the fact
   that both multiple CPUs and multiple memory regions can be owned by a
   given NUMA node.  Also removed some extra attributes (is_enabled,
   is_isolated) which I don't think are useful at this point in time.

I have tested this against many different configs that I implemented
using qemu.

---

 Quick Summary 

Platforms exist today which have multiple types of memory attached to a
single CPU.  These disparate memory ranges have some characteristics in
common, such as CPU cache coherence, but they can have wide ranges of
performance both in terms of latency and bandwidth.

For example, consider a system that contains persistent memory, standard
DDR memory and High Bandwidth Memory (HBM), all attached to the same CPU.
There could potentially be an order of magnitude or more difference in
performance between the slowest and fastest memory attached to that CPU.

With the current Linux code NUMA nodes are CPU-centric, so all the memory
attached to a given CPU will be lumped into the same NUMA node.  This makes
it very difficult for userspace applications to understand the performance
of different memory ranges on a given CPU.

We solve this issue by providing userspace with performance information on
individual memory ranges.  This performance information is exposed via
sysfs:

  # grep . mem_tgt2/* mem_tgt2/local_init/* 2>/dev/null
  mem_tgt2/firmware_id:1
  mem_tgt2/is_cached:0
  mem_tgt2/local_init/read_bw_MBps:40960
  mem_tgt2/local_init/read_lat_nsec:50
  mem_tgt2/local_init/write_bw_MBps:40960
  mem_tgt2/local_init/write_lat_nsec:50

This allows applications to easily find the memory that they want to use.
We expect that the existing NUMA APIs will be enhanced to use this new
information so that applications can continue to use them to select their
desired memory.

 Lots of Details 

This patch set provides a sysfs representation of parts of the
Heterogeneous Memory Attribute Table (HMAT), newly defined in ACPI 6.2.
One major conceptual change in ACPI 6.2 related to this work is that
proximity domains no longer need to contain a processor.  We can now
have memory-only proximity domains, which means that we can now have
memory-only Linux NUMA nodes.

Here is an example configuration where we have a single processor, one
range of regular memory and one range of HBM:

  +---+   ++
  | Processor |   | Memory |
  | prox domain 0 +---+ prox domain 1  |
  | NUMA node 1   |   | NUMA node 2|
  +---+---+   ++
  |
  +---+--+
  | HBM  |
  | prox domain 2|
  | NUMA node 0  |
  +--+

This gives us one initiator (the processor) and two targets (the two memory
ranges).  Each of these three has its own ACPI proximity domain and
associated Linux NUMA node.  Note also that while there is a 1:1 mapping
from each proximity domain to each NUMA node, the numbers don't necessarily
match up.  Additionally we can have extra NUMA nodes that don't map back to
ACPI proximity domains.

The above configuration could also have the processor and one of the two
memory ranges sharing a proximity domain and NUMA node, but for the
purposes of the HMAT the two memory ranges will need to be separated.

The overall goal of this series and of the HMAT is to allow users to
identify memory using its performance characteristics.  This is
complicated by the amount of HMAT data that could be present in very
large systems, so in this series we only surface performance information
for local (initiator,target) pairings.  The changelog for patch 5
discusses this in detail.

Ross Zwisler 

[PATCH v3 3/3] hmat: add performance attributes

2017-12-13 Thread Ross Zwisler
Add performance information found in the HMAT to the sysfs representation.
This information lives as an attribute group named "local_init" in the
memory target:

  # tree mem_tgt2
  mem_tgt2
  ├── firmware_id
  ├── is_cached
  ├── local_init
  │   ├── mem_init0 -> ../../mem_init0
  │   ├── mem_init1 -> ../../mem_init1
  │   ├── read_bw_MBps
  │   ├── read_lat_nsec
  │   ├── write_bw_MBps
  │   └── write_lat_nsec
  ├── node2 -> ../../node/node2
  ├── power
  │   ├── async
  │   ...
  ├── subsystem -> ../../../../bus/hmat
  └── uevent

This attribute group surfaces latency and bandwidth performance for a
memory target and its local initiators.  For example:

  # grep . mem_tgt2/local_init/* 2>/dev/null
  mem_tgt2/local_init/read_bw_MBps:30720
  mem_tgt2/local_init/read_lat_nsec:100
  mem_tgt2/local_init/write_bw_MBps:30720
  mem_tgt2/local_init/write_lat_nsec:100

The initiators also have a symlink to their local targets:

  # ls -l mem_init0/mem_tgt2
  lrwxrwxrwx. 1 root root 0 Dec 13 16:45 mem_init0/mem_tgt2 -> ../mem_tgt2

We create performance attribute groups only for local (initiator,target)
pairings, where the first local initiator for a given target is defined by
the "Processor Proximity Domain" field in the HMAT's Memory Subsystem
Address Range Structure table.  After we have one local initiator we scan
the performance data to link to any other "local" initiators with the same
local performance to a given memory target.

A given target only has one set of local performance values, so each target
will have at most one "local_init" attribute group, though that group can
contain links to multiple initiators that all have local performance.  A
given memory initiator may have multiple local memory targets, so multiple
"mem_tgtX" links may exist for a given initiator.

If a given memory target is cached we give performance numbers only for the
media itself, and rely on the "is_cached" attribute to represent the
fact that there is a caching layer.

The fact that we only expose a subset of the performance information
presented in the HMAT via sysfs as a compromise, driven by fact that those
usages will be the highest performing and because to represent all possible
paths could cause an unmanageable explosion of sysfs entries.

If we dump everything from the HMAT into sysfs we end up with
O(num_targets * num_initiators * num_caching_levels) attributes.  Each of
these attributes only takes up 2 bytes in a System Locality Latency and
Bandwidth Information Structure, but if we have to create a directory entry
for each it becomes much more expensive.

For example, very large systems today can have on the order of thousands of
NUMA nodes.  Say we have a system which used to have 1,000 NUMA nodes that
each had both a CPU and local memory.  The HMAT allows us to separate the
CPUs and memory into separate NUMA nodes, so we can end up with 1,000 CPU
initiator NUMA nodes and 1,000 memory target NUMA nodes.  If we represented
the performance information for each possible CPU/memory pair in sysfs we
would end up with 1,000,000 attribute groups.

This is a lot to pass in a set of packed data tables, but I think we'll
break sysfs if we try to create millions of attributes, regardless of how
we nest them in a directory hierarchy.

By only representing performance information for local (initiator,target)
pairings, we reduce the number of sysfs entries to O(num_targets).

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 drivers/acpi/hmat/Makefile  |   2 +-
 drivers/acpi/hmat/core.c| 263 +++-
 drivers/acpi/hmat/hmat.h|  17 +++
 drivers/acpi/hmat/perf_attributes.c |  56 
 4 files changed, 336 insertions(+), 2 deletions(-)
 create mode 100644 drivers/acpi/hmat/perf_attributes.c

diff --git a/drivers/acpi/hmat/Makefile b/drivers/acpi/hmat/Makefile
index edf4bcb1c97d..b5d1d83684da 100644
--- a/drivers/acpi/hmat/Makefile
+++ b/drivers/acpi/hmat/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_ACPI_HMAT) := hmat.o
-hmat-y := core.o initiator.o target.o
+hmat-y := core.o initiator.o target.o perf_attributes.o
diff --git a/drivers/acpi/hmat/core.c b/drivers/acpi/hmat/core.c
index 61b90dadf84b..89e84658fc73 100644
--- a/drivers/acpi/hmat/core.c
+++ b/drivers/acpi/hmat/core.c
@@ -23,11 +23,225 @@
 #include 
 #include "hmat.h"
 
+#define NO_VALUE   -1
+#define LOCAL_INIT "local_init"
+
 static LIST_HEAD(target_list);
 static LIST_HEAD(initiator_list);
+LIST_HEAD(locality_list);
 
 static bool bad_hmat;
 
+/* Performance attributes for an initiator/target pair. */
+static int get_performance_data(u32 init_pxm, u32 tgt_pxm,
+   struct acpi_hmat_locality *hmat_loc)
+{
+   int num_init = hmat_loc->number_of_initiator_Pds;
+   int num_tgt = hmat_loc->number_of_target_Pds;
+   int init_idx = NO_VALUE;
+   int tgt_idx = NO_

Re: [xfsprogs PATCH v2 0/3] Add necessary items for MAP_SYNC testing

2017-12-13 Thread Ross Zwisler
On Tue, Dec 05, 2017 at 04:56:48PM -0700, Ross Zwisler wrote:
> This is the second revision of my MAP_SYNC + dm-log-writes support for
> xfsprogs.

Friendly ping on this xfsprogs series.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [fstests PATCH v6 2/2] generic: add test for DAX MAP_SYNC support

2017-12-08 Thread Ross Zwisler
On Fri, Dec 08, 2017 at 02:36:10PM +0800, Eryu Guan wrote:

> (Test was re-numbered as generic/470, BTW.)

Thanks!  For future reference, does the pattern of us submitting tests with
high numbers (generic/999) to avoid merge conflicts and asking you to renumber
them when you merge work for you?  Or would you prefer that we number our
tests to the next available, which may change from submission to submission?
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [fstests PATCH v5 2/2] generic: add test for DAX MAP_SYNC support

2017-12-07 Thread Ross Zwisler
On Thu, Dec 07, 2017 at 06:36:57PM +0800, Eryu Guan wrote:

> Now we have two _require rules to test log_writes dm target:
> _require_log_writes   # _notrun explicitly when MOUNT_OPTIONS contains dax
> _require_log_writes_dax   # _notrun if log-writes target doesn't support 
> dax
> 
> I think we can merge the two into one, i.e. extend _require_log_writes
> to check dax support status only when
> - MOUNT_OPTIONS contains dax, or
> - dax is given as a param explicitly, e.g. _require_log_writes dax
> 
> So old kernels that don't support dax log-writes still _notrun, and new
> kernels that have dax log-writes support could run all log-writes tests,
> like generic/455 and generic/457, in dax environment.
> 
> (I did a quick test with generic/45[57] on v4.15-rc2 kernel, 455 always
> fails due to md5sum mismatch, not sure where the problem is yet; 457 is
> _notrun, perhaps due to there's no dax support on reflink XFS.)

I looked in to generic/455 md5sum mismatch, and it is expected with the
current DAX support found in dm-log-writes.  The issue is that we snoop I/O,
but we *don't* snoop the data written by mmap().

For DAX mmaps, the sync calls msync()/fsync() don't cause writes to the block
driver of flushed page cache pages as they would in the page cache case.
Instead the user writes directly into the persistent memory, and all we see is
a flush call.  For us to properly handle mmap() writes we'll need to catch the
flush happening in dm-log-writes, iterate through the flushed data and copy it
into the dm-log-writes log.

I actually had this implemented in my initial version of the DAX support for
dm-log-writes, but by the time I was ready to merge the DAX flush path had
been removed from DM.  See
commit c3ca015fab6d ("dax: remove the pmem_dax_ops->flush abstraction")
for more info.

I can look at adding some level of flush support back into DM so we can handle
this case.  Until/unless that happens, though, I think we should continue to
make users specifically request DAX+dm-log-writes support that lacks mmap()
data verfication capabilities via _require_log_writes_dax.

Thank you for the feedback.  I'll post an updated patch that takes care of the
rest of your comments.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [xfsprogs PATCH v2 1/3] xfs_io: fix compiler warnings in getfsmap code

2017-12-06 Thread Ross Zwisler
On Wed, Dec 06, 2017 at 12:47:49PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 06, 2017 at 01:10:14PM -0700, Ross Zwisler wrote:
> > On Wed, Dec 06, 2017 at 11:27:43AM +1100, Dave Chinner wrote:
> > > On Tue, Dec 05, 2017 at 04:56:49PM -0700, Ross Zwisler wrote:
> > > > I recently upgraded my compiler from
> > > > gcc (GCC) 6.4.1 20170727 (Red Hat 6.4.1-1)
> > > > to
> > > > gcc (GCC) 7.2.1 20170915 (Red Hat 7.2.1-2)
> > > > and started getting a bunch of compiler warnings in io/fsmap.c:
> > > > 
> > > >   fsmap.c: In function ‘fsmap_f’:
> > > >   fsmap.c:228:40: warning: ‘%lld’ directive output may be truncated 
> > > > writing
> > > >   between 1 and 17 bytes into a region of size between 12 and 28
> > > >   [-Wformat-truncation=]
> > > >  snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
> > > >   ^~~~
> > > >   fsmap.c:228:32: note: directive argument in the range [0, 
> > > > 36028797018963967]
> > > >  snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
> > > >   ^~~
> > > >   fsmap.c:228:3: note: ‘snprintf’ output between 8 and 40 bytes into a
> > > >   destination of size 32
> > > >  snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
> > > >  ^
> > > >   (long long)BTOBBT(p->fmr_physical),
> > > >   ~~
> > > >   (long long)BTOBBT(p->fmr_physical + p->fmr_length - 1));
> > > >   ~~
> > > > 
> > > > The issue is that 'bbuf' is only defined to be 32 characters wide, but 
> > > > each
> > > > signed long long can potentially print as many as 19 characters
> > > > (9223372036854775807 is the max value).  The format we're using for 
> > > > bbuf is
> > > > "[%lld..%lld]:" which has 2 signed long longs plus 6 other characters
> > > > "[..]:\0", which means it's possible we'll print up to 44 characters,
> > > > overflowing our 32 char buffer.
> > > > 
> > > > Fix this by bumping all the buffer sizes in dump_map_verbose() to 64
> > > > characters.
> > > > 
> > > > Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> > > > Cc: Darrick J. Wong <darrick.w...@oracle.com>
> > > > Fixes: 3fcab549a234 ("xfs_io: support the new getfsmap ioctl")
> > > 
> > > FYI, I posted a fix for this weeks ago. I think Eric has already
> > > picked it up, but it hasn't been pushed out into the for-next branch
> > > yet.
> > 
> > I'm seeing similar new compiler warnings when compiling xfstests:
> > 
> > write_log.c: In function ‘wlog_open’:
> > write_log.c:124:37: warning: ‘%s’ directive writing up to 1023 bytes into a 
> > region of size 224 [-Wformat-overflow=]
> > "Could not open write_log - open(%s, %#o, %#o) failed:  %s\n",
> >  ^~
> > write_log.c:124:4: note: directive argument in the range [1089, 2047]
> > "Could not open write_log - open(%s, %#o, %#o) failed:  %s\n",
> > ^
> > 
> > etc.
> > 
> > I don't see any patches posted that fix these, as of yet.  As far as you 
> > know,
> > am I correct in thinking that these still need to be fixed?
> 
> It sure looks that way.

Cool, thanks.  I'll take a crack at them.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [xfsprogs PATCH v2 1/3] xfs_io: fix compiler warnings in getfsmap code

2017-12-06 Thread Ross Zwisler
On Wed, Dec 06, 2017 at 11:27:43AM +1100, Dave Chinner wrote:
> On Tue, Dec 05, 2017 at 04:56:49PM -0700, Ross Zwisler wrote:
> > I recently upgraded my compiler from
> > gcc (GCC) 6.4.1 20170727 (Red Hat 6.4.1-1)
> > to
> > gcc (GCC) 7.2.1 20170915 (Red Hat 7.2.1-2)
> > and started getting a bunch of compiler warnings in io/fsmap.c:
> > 
> >   fsmap.c: In function ‘fsmap_f’:
> >   fsmap.c:228:40: warning: ‘%lld’ directive output may be truncated writing
> >   between 1 and 17 bytes into a region of size between 12 and 28
> >   [-Wformat-truncation=]
> >  snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
> >   ^~~~
> >   fsmap.c:228:32: note: directive argument in the range [0, 
> > 36028797018963967]
> >  snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
> >   ^~~
> >   fsmap.c:228:3: note: ‘snprintf’ output between 8 and 40 bytes into a
> >   destination of size 32
> >  snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
> >  ^
> >   (long long)BTOBBT(p->fmr_physical),
> >   ~~
> >   (long long)BTOBBT(p->fmr_physical + p->fmr_length - 1));
> >   ~~
> > 
> > The issue is that 'bbuf' is only defined to be 32 characters wide, but each
> > signed long long can potentially print as many as 19 characters
> > (9223372036854775807 is the max value).  The format we're using for bbuf is
> > "[%lld..%lld]:" which has 2 signed long longs plus 6 other characters
> > "[..]:\0", which means it's possible we'll print up to 44 characters,
> > overflowing our 32 char buffer.
> > 
> > Fix this by bumping all the buffer sizes in dump_map_verbose() to 64
> > characters.
> > 
> > Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> > Cc: Darrick J. Wong <darrick.w...@oracle.com>
> > Fixes: 3fcab549a234 ("xfs_io: support the new getfsmap ioctl")
> 
> FYI, I posted a fix for this weeks ago. I think Eric has already
> picked it up, but it hasn't been pushed out into the for-next branch
> yet.

I'm seeing similar new compiler warnings when compiling xfstests:

write_log.c: In function ‘wlog_open’:
write_log.c:124:37: warning: ‘%s’ directive writing up to 1023 bytes into a 
region of size 224 [-Wformat-overflow=]
"Could not open write_log - open(%s, %#o, %#o) failed:  %s\n",
 ^~
write_log.c:124:4: note: directive argument in the range [1089, 2047]
"Could not open write_log - open(%s, %#o, %#o) failed:  %s\n",
^

etc.

I don't see any patches posted that fix these, as of yet.  As far as you know,
am I correct in thinking that these still need to be fixed?

- Ross
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[xfsprogs PATCH v3 3/3] xfs_io: add a new 'log_writes' command

2017-12-06 Thread Ross Zwisler
Add a new 'log_writes' command to xfs_io so that we can add dm-log-writes
log marks.  It's helpful to allow users of xfs_io to adds these marks from
within xfs_io instead of waiting until after xfs_io exits because then they
are able to replay the dm-log-writes log up to immediately after another
xfs_io operation such as mwrite.  This isolates the log replay from other
operations that happen as part of xfs_io exiting (file handles being
closed, mmaps being torn down, etc.).  This also allows users to insert
multiple marks between different xfs_io commands.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Suggested-by: Dave Chinner <da...@fromorbit.com>
---

Changes from v2:
 - Use 'exitcode' to have the xfs_io shell command error out with status
   1 if we hit any of our many error condtions during execution. (Dave)

---
 configure.ac|   1 +
 debian/control  |   2 +-
 include/builddefs.in|   2 +
 io/Makefile |   6 +++
 io/init.c   |   1 +
 io/io.h |   6 +++
 io/log_writes.c | 106 
 m4/Makefile |   1 +
 m4/package_devmapper.m4 |  11 +
 man/man8/xfs_io.8   |  23 ++-
 10 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 io/log_writes.c
 create mode 100644 m4/package_devmapper.m4

diff --git a/configure.ac b/configure.ac
index f3325aa0..f83d5817 100644
--- a/configure.ac
+++ b/configure.ac
@@ -164,6 +164,7 @@ AC_NEED_INTERNAL_FSXATTR
 AC_HAVE_GETFSMAP
 AC_HAVE_STATFS_FLAGS
 AC_HAVE_MAP_SYNC
+AC_HAVE_DEVMAPPER
 
 if test "$enable_blkid" = yes; then
 AC_HAVE_BLKID_TOPO
diff --git a/debian/control b/debian/control
index ad816622..5c26e3d7 100644
--- a/debian/control
+++ b/debian/control
@@ -3,7 +3,7 @@ Section: admin
 Priority: optional
 Maintainer: XFS Development Team <linux-...@vger.kernel.org>
 Uploaders: Nathan Scott <nath...@debian.org>, Anibal Monsalve Salazar 
<ani...@debian.org>
-Build-Depends: uuid-dev, dh-autoreconf, debhelper (>= 5), gettext, libtool, 
libreadline-gplv2-dev | libreadline5-dev, libblkid-dev (>= 2.17), linux-libc-dev
+Build-Depends: uuid-dev, dh-autoreconf, debhelper (>= 5), gettext, libtool, 
libreadline-gplv2-dev | libreadline5-dev, libblkid-dev (>= 2.17), 
linux-libc-dev, libdevmapper-dev
 Standards-Version: 3.9.1
 Homepage: http://xfs.org/
 
diff --git a/include/builddefs.in b/include/builddefs.in
index 126f7e95..66bdbfa2 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -35,6 +35,7 @@ LIBTERMCAP = @libtermcap@
 LIBEDITLINE = @libeditline@
 LIBREADLINE = @libreadline@
 LIBBLKID = @libblkid@
+LIBDEVMAPPER = @libdevmapper@
 LIBXFS = $(TOPDIR)/libxfs/libxfs.la
 LIBXCMD = $(TOPDIR)/libxcmd/libxcmd.la
 LIBXLOG = $(TOPDIR)/libxlog/libxlog.la
@@ -116,6 +117,7 @@ NEED_INTERNAL_FSXATTR = @need_internal_fsxattr@
 HAVE_GETFSMAP = @have_getfsmap@
 HAVE_STATFS_FLAGS = @have_statfs_flags@
 HAVE_MAP_SYNC = @have_map_sync@
+HAVE_DEVMAPPER = @have_devmapper@
 
 GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
 # -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl
diff --git a/io/Makefile b/io/Makefile
index 2987ee11..b7585a1b 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -107,6 +107,12 @@ ifeq ($(HAVE_MAP_SYNC),yes)
 LCFLAGS += -DHAVE_MAP_SYNC
 endif
 
+ifeq ($(HAVE_DEVMAPPER),yes)
+CFILES += log_writes.c
+LLDLIBS += $(LIBDEVMAPPER)
+LCFLAGS += -DHAVE_DEVMAPPER
+endif
+
 # On linux we get fsmap from the system or define it ourselves
 # so include this based on platform type.  If this reverts to only
 # the autoconf check w/o local definition, change to testing HAVE_GETFSMAP
diff --git a/io/init.c b/io/init.c
index 20d5f80d..34d87b5f 100644
--- a/io/init.c
+++ b/io/init.c
@@ -72,6 +72,7 @@ init_commands(void)
help_init();
imap_init();
inject_init();
+   log_writes_init();
madvise_init();
mincore_init();
mmap_init();
diff --git a/io/io.h b/io/io.h
index 8b2753b3..9349cc75 100644
--- a/io/io.h
+++ b/io/io.h
@@ -186,3 +186,9 @@ extern void fsmap_init(void);
 #else
 # define fsmap_init()  do { } while (0)
 #endif
+
+#ifdef HAVE_DEVMAPPER
+extern voidlog_writes_init(void);
+#else
+#define log_writes_init()  do { } while (0)
+#endif
diff --git a/io/log_writes.c b/io/log_writes.c
new file mode 100644
index ..c7b7392e
--- /dev/null
+++ b/io/log_writes.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for m

[fstests PATCH v5 2/2] generic: add test for DAX MAP_SYNC support

2017-12-05 Thread Ross Zwisler
This test creates a file and writes to it via an mmap(), but never syncs
via fsync/msync.  This process is tracked via dm-log-writes, then replayed.

If MAP_SYNC is working the dm-log-writes replay will show the test file
with 1 MiB of on-media block allocations.  This is because each allocating
page fault included an implicit metadata sync.  If MAP_SYNC isn't working
(which you can test by removing the "-S" flag to xfs_io mmap) the file
will be smaller or missing entirely.

Note that dm-log-writes doesn't track the data that we write via the
mmap(), so we can't do any data integrity checking.  We can only verify
that the metadata writes for the page faults happened.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 common/dmlogwrites| 20 +
 tests/generic/999 | 82 +++
 tests/generic/999.out |  3 ++
 tests/generic/group   |  1 +
 4 files changed, 106 insertions(+)
 create mode 100755 tests/generic/999
 create mode 100644 tests/generic/999.out

diff --git a/common/dmlogwrites b/common/dmlogwrites
index 05829dbc..2b697bec 100644
--- a/common/dmlogwrites
+++ b/common/dmlogwrites
@@ -28,6 +28,26 @@ _require_log_writes()
_require_test_program "log-writes/replay-log"
 }
 
+_require_log_writes_dax()
+{
+   [ -z "$LOGWRITES_DEV" -o ! -b "$LOGWRITES_DEV" ] && \
+   _notrun "This test requires a valid \$LOGWRITES_DEV"
+
+   _require_dm_target log-writes
+   _require_test_program "log-writes/replay-log"
+
+   _scratch_unmount
+   _log_writes_init
+   _log_writes_mkfs > /dev/null 2>&1
+   _log_writes_mount -o dax
+   # Check options to be sure. XFS ignores dax option
+   # and goes on if dev underneath does not support dax.
+   _fs_options $LOGWRITES_DMDEV | grep -qw "dax" || \
+   _notrun "$LOGWRITES_DMDEV $FSTYP does not support -o dax"
+   _log_writes_unmount
+   _log_writes_remove
+}
+
 _log_writes_init()
 {
local BLK_DEV_SIZE=`blockdev --getsz $SCRATCH_DEV`
diff --git a/tests/generic/999 b/tests/generic/999
new file mode 100755
index ..ca5772da
--- /dev/null
+++ b/tests/generic/999
@@ -0,0 +1,82 @@
+#! /bin/bash
+# FS QA Test No. 999
+#
+# Use dm-log-writes to verify that MAP_SYNC actually syncs metadata during
+# page faults.
+#
+#---
+# Copyright (c) 2017 Intel Corporation.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _log_writes_cleanup
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmlogwrites
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_log_writes_dax
+_require_xfs_io_command "log_writes"
+
+_log_writes_init
+_log_writes_mkfs >> $seqres.full 2>&1
+_log_writes_mount -o dax
+
+LEN=$((1024 * 1024)) # 1 MiB
+
+xfs_io -t -c "truncate $LEN" -c "mmap -S 0 $LEN" -c "mwrite 0 $LEN" \
+   -c "log_writes -d $LOGWRITES_NAME -m preunmap" \
+   -f $SCRATCH_MNT/test
+
+# Unmount the scratch dir and tear down the log writes target
+_log_writes_unmount
+_log_writes_remove
+_check_scratch_fs
+
+# destroy previous filesystem so we can be sure our rebuild works
+_scratch_mkfs >> $seqres.full 2>&1
+
+# check pre-unmap state
+_log_writes_replay_log preunmap
+_scratch_mount
+
+# We should see $SCRATCH_MNT/test as having 1 MiB in block allocations
+du -sh $SCRATCH_MNT/test | _filter_scratch | _filter_spaces
+
+_scratch_unmount
+_check_scratch_fs
+
+echo "Silence is golden"
+status=0
+exit
diff --git a/tests/generic/999.out b/tests/generic/999.out
new file mode 100644
index ..c7b8f8a2
--- /dev/null
+++ b/tests/generic/999.out
@@ -0,0 +1,3 @@
+QA output created by 999
+1.0M SCRATCH_MNT/test
+Silence is golden
diff --git a/tests/gene

[xfsprogs PATCH v2 1/3] xfs_io: fix compiler warnings in getfsmap code

2017-12-05 Thread Ross Zwisler
I recently upgraded my compiler from
gcc (GCC) 6.4.1 20170727 (Red Hat 6.4.1-1)
to
gcc (GCC) 7.2.1 20170915 (Red Hat 7.2.1-2)
and started getting a bunch of compiler warnings in io/fsmap.c:

  fsmap.c: In function ‘fsmap_f’:
  fsmap.c:228:40: warning: ‘%lld’ directive output may be truncated writing
  between 1 and 17 bytes into a region of size between 12 and 28
  [-Wformat-truncation=]
 snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
  ^~~~
  fsmap.c:228:32: note: directive argument in the range [0, 36028797018963967]
 snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
  ^~~
  fsmap.c:228:3: note: ‘snprintf’ output between 8 and 40 bytes into a
  destination of size 32
 snprintf(bbuf, sizeof(bbuf), "[%lld..%lld]:",
 ^
  (long long)BTOBBT(p->fmr_physical),
  ~~
  (long long)BTOBBT(p->fmr_physical + p->fmr_length - 1));
  ~~

The issue is that 'bbuf' is only defined to be 32 characters wide, but each
signed long long can potentially print as many as 19 characters
(9223372036854775807 is the max value).  The format we're using for bbuf is
"[%lld..%lld]:" which has 2 signed long longs plus 6 other characters
"[..]:\0", which means it's possible we'll print up to 44 characters,
overflowing our 32 char buffer.

Fix this by bumping all the buffer sizes in dump_map_verbose() to 64
characters.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Cc: Darrick J. Wong <darrick.w...@oracle.com>
Fixes: 3fcab549a234 ("xfs_io: support the new getfsmap ioctl")
---
 io/fsmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io/fsmap.c b/io/fsmap.c
index 448fb535..3d8a6700 100644
--- a/io/fsmap.c
+++ b/io/fsmap.c
@@ -184,8 +184,8 @@ dump_map_verbose(
off64_t agoff, bperag;
int foff_w, boff_w, aoff_w, tot_w, agno_w, own_w;
int nr_w, dev_w;
-   charrbuf[32], bbuf[32], abuf[32], obuf[32];
-   charnbuf[32], dbuf[32], gbuf[32];
+   charrbuf[64], bbuf[64], abuf[64], obuf[64];
+   charnbuf[64], dbuf[64], gbuf[64];
charowner[OWNER_BUF_SZ];
int sunit, swidth;
int flg = 0;
-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[xfsprogs PATCH v2 3/3] xfs_io: add a new 'log_writes' command

2017-12-05 Thread Ross Zwisler
Add a new 'log_writes' command to xfs_io so that we can add dm-log-writes
log marks.  It's helpful to allow users of xfs_io to adds these marks from
within xfs_io instead of waiting until after xfs_io exits because then they
are able to replay the dm-log-writes log up to immediately after another
xfs_io operation such as mwrite.  This isolates the log replay from other
operations that happen as part of xfs_io exiting (file handles being
closed, mmaps being torn down, etc.).  This also allows users to insert
multiple marks between different xfs_io commands.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Suggested-by: Dave Chinner <da...@fromorbit.com>
---
 configure.ac|   1 +
 debian/control  |   2 +-
 include/builddefs.in|   2 +
 io/Makefile |   6 +++
 io/init.c   |   1 +
 io/io.h |   6 +++
 io/log_writes.c | 101 
 m4/Makefile |   1 +
 m4/package_devmapper.m4 |  11 ++
 man/man8/xfs_io.8   |  23 ++-
 10 files changed, 152 insertions(+), 2 deletions(-)
 create mode 100644 io/log_writes.c
 create mode 100644 m4/package_devmapper.m4

diff --git a/configure.ac b/configure.ac
index f3325aa0..f83d5817 100644
--- a/configure.ac
+++ b/configure.ac
@@ -164,6 +164,7 @@ AC_NEED_INTERNAL_FSXATTR
 AC_HAVE_GETFSMAP
 AC_HAVE_STATFS_FLAGS
 AC_HAVE_MAP_SYNC
+AC_HAVE_DEVMAPPER
 
 if test "$enable_blkid" = yes; then
 AC_HAVE_BLKID_TOPO
diff --git a/debian/control b/debian/control
index ad816622..5c26e3d7 100644
--- a/debian/control
+++ b/debian/control
@@ -3,7 +3,7 @@ Section: admin
 Priority: optional
 Maintainer: XFS Development Team <linux-...@vger.kernel.org>
 Uploaders: Nathan Scott <nath...@debian.org>, Anibal Monsalve Salazar 
<ani...@debian.org>
-Build-Depends: uuid-dev, dh-autoreconf, debhelper (>= 5), gettext, libtool, 
libreadline-gplv2-dev | libreadline5-dev, libblkid-dev (>= 2.17), linux-libc-dev
+Build-Depends: uuid-dev, dh-autoreconf, debhelper (>= 5), gettext, libtool, 
libreadline-gplv2-dev | libreadline5-dev, libblkid-dev (>= 2.17), 
linux-libc-dev, libdevmapper-dev
 Standards-Version: 3.9.1
 Homepage: http://xfs.org/
 
diff --git a/include/builddefs.in b/include/builddefs.in
index 126f7e95..66bdbfa2 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -35,6 +35,7 @@ LIBTERMCAP = @libtermcap@
 LIBEDITLINE = @libeditline@
 LIBREADLINE = @libreadline@
 LIBBLKID = @libblkid@
+LIBDEVMAPPER = @libdevmapper@
 LIBXFS = $(TOPDIR)/libxfs/libxfs.la
 LIBXCMD = $(TOPDIR)/libxcmd/libxcmd.la
 LIBXLOG = $(TOPDIR)/libxlog/libxlog.la
@@ -116,6 +117,7 @@ NEED_INTERNAL_FSXATTR = @need_internal_fsxattr@
 HAVE_GETFSMAP = @have_getfsmap@
 HAVE_STATFS_FLAGS = @have_statfs_flags@
 HAVE_MAP_SYNC = @have_map_sync@
+HAVE_DEVMAPPER = @have_devmapper@
 
 GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
 # -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl
diff --git a/io/Makefile b/io/Makefile
index 2987ee11..b7585a1b 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -107,6 +107,12 @@ ifeq ($(HAVE_MAP_SYNC),yes)
 LCFLAGS += -DHAVE_MAP_SYNC
 endif
 
+ifeq ($(HAVE_DEVMAPPER),yes)
+CFILES += log_writes.c
+LLDLIBS += $(LIBDEVMAPPER)
+LCFLAGS += -DHAVE_DEVMAPPER
+endif
+
 # On linux we get fsmap from the system or define it ourselves
 # so include this based on platform type.  If this reverts to only
 # the autoconf check w/o local definition, change to testing HAVE_GETFSMAP
diff --git a/io/init.c b/io/init.c
index 20d5f80d..34d87b5f 100644
--- a/io/init.c
+++ b/io/init.c
@@ -72,6 +72,7 @@ init_commands(void)
help_init();
imap_init();
inject_init();
+   log_writes_init();
madvise_init();
mincore_init();
mmap_init();
diff --git a/io/io.h b/io/io.h
index 8b2753b3..9349cc75 100644
--- a/io/io.h
+++ b/io/io.h
@@ -186,3 +186,9 @@ extern void fsmap_init(void);
 #else
 # define fsmap_init()  do { } while (0)
 #endif
+
+#ifdef HAVE_DEVMAPPER
+extern voidlog_writes_init(void);
+#else
+#define log_writes_init()  do { } while (0)
+#endif
diff --git a/io/log_writes.c b/io/log_writes.c
new file mode 100644
index ..37c0024f
--- /dev/null
+++ b/io/log_writes.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * I

[xfsprogs PATCH v2 0/3] Add necessary items for MAP_SYNC testing

2017-12-05 Thread Ross Zwisler
This is the second revision of my MAP_SYNC + dm-log-writes support for
xfsprogs.  The previous revision can be found here:

https://lists.01.org/pipermail/linux-nvdimm/2017-November/013326.html

Changes since v1:

 - Updated the dm-log-writes support so that it uses libdevmapper
   instead of calling the "dmsetup" stand-alone exectuable via system().
   (Eric and Darrick)

 - Fixed our MAP_SYNC handling so that instead of defining the flags for
   systems that don't have them in the headers, just set them to 0 and
   fail when the -S flag is used. (Dan)

You can find an xfsprogs branch with this series here:

https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/xfsprogs-dev.git/log/?h=map_sync_v2

Both MAP_SYNC and the DAX enhancements for dm-log-writes can be found in
v4.15-rc*.  For ease of testing I've posted a kernel that is v4.14 plus
just those two patch series here:

https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/linux.git/log/?h=map_sync_dm_log_writes

---

As suggested by Dave Chinner:

As I say to all these sorts of one-off test prgrams: please add the
new MAP_SYNC flag to xfs_io rather than writing a one-off
test program to set it and write some data.

And if we're going to be adding special custom tests just because
we need to insert dm-log marks, add that functionality to xfs_io,
too.

That way we can create complex custom dm logwrite tests without
needing one-off test programs for them all...

This series enhances xfs_io by adding support for the MAP_SYNC mmap() flag
and for dm-log-writes marks.  This allows the resulting xfstest for
MAP_SYNC to be much simpler and have no custom C programs.

Ross Zwisler (3):
  xfs_io: fix compiler warnings in getfsmap code
  xfs_io: add MAP_SYNC support to mmap()
  xfs_io: add a new 'log_writes' command

 configure.ac|   2 +
 debian/control  |   2 +-
 include/builddefs.in|   3 ++
 include/linux.h |   8 
 io/Makefile |  10 +
 io/fsmap.c  |   4 +-
 io/init.c   |   1 +
 io/io.h |   7 
 io/log_writes.c | 101 
 io/mmap.c   |  23 ---
 m4/Makefile |   1 +
 m4/package_devmapper.m4 |  11 ++
 m4/package_libcdev.m4   |  16 
 man/man8/xfs_io.8   |  29 +-
 14 files changed, 208 insertions(+), 10 deletions(-)
 create mode 100644 io/log_writes.c
 create mode 100644 m4/package_devmapper.m4

-- 
2.14.3

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [fstests PATCH v4 3/4] dm-log-writes: allow DAX to be used when possible

2017-12-05 Thread Ross Zwisler
On Sat, Nov 18, 2017 at 02:17:03PM +0800, Eryu Guan wrote:
> On Fri, Nov 17, 2017 at 01:28:27PM -0700, Ross Zwisler wrote:
> > Enhance _require_dm_target so that a user can request a minimum version of a
> > given dm target.
> > 
> > DAX support was added to v1.1.0 of the dm-log-writes kernel module, so
> > allow the DAX mount option starting with that version.
> 
> Is is possible not relying on the version numbers but actually trying
> what you want to do and _notrun if that fails? Currently fstests does no
> version number checking at all, and it'd be great if we keep that
> tradition.

Yep, and I agree that this is better.  I'll be fixed in the next version.
Thanks for the feedback.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [xfsprogs PATCH 1/2] xfs_io: add MAP_SYNC support to mmap()

2017-11-17 Thread Ross Zwisler
On Fri, Nov 17, 2017 at 12:40:21PM -0800, Darrick J. Wong wrote:
> On Fri, Nov 17, 2017 at 01:25:23PM -0700, Ross Zwisler wrote:
> > Add support for a new -S flag to xfs_io's mmap command.  This opens the
> > mapping with the (MAP_SYNC | MAP_SHARED_VALIDATE) flags instead of the
> > standard MAP_SHARED flag.
> > 
> > Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> > Suggested-by: Dave Chinner <da...@fromorbit.com>
> > ---
<>
> > diff --git a/include/linux.h b/include/linux.h
> > index 6ce344c..4ee03ed 100644
> > --- a/include/linux.h
> > +++ b/include/linux.h
> > @@ -327,4 +327,9 @@ fsmap_advance(
> >  #define HAVE_GETFSMAP
> >  #endif /* HAVE_GETFSMAP */
> >  
> > +#ifndef HAVE_MAP_SYNC
> > +#define MAP_SYNC 0x8
> > +#define MAP_SHARED_VALIDATE 0x3
> > +#endif /* HAVE_MAP_SYNC */
> 
> Hmm, what's the point of ifndef/define if you have an configure.ac check?

I'm following the example of HAVE_GETFSMAP.  It does a check to see if the
headers have proper support in m4/package_libcdev.m4, then has code in this
file to provide defines if they aren't provided in the system.

> > diff --git a/io/mmap.c b/io/mmap.c
> > index 7a8150e..520b037 100644
> > --- a/io/mmap.c
> > +++ b/io/mmap.c
> > @@ -42,7 +42,7 @@ print_mapping(
> > int index,
> > int braces)
> >  {
> > -   unsigned char   buffer[8] = { 0 };
> > +   charbuffer[8] = { 0 };
> > int i;
> >  
> > static struct {
> > @@ -57,6 +57,10 @@ print_mapping(
> >  
> > for (i = 0, p = pflags; p->prot != PROT_NONE; i++, p++)
> > buffer[i] = (map->prot & p->prot) ? p->mode : '-';
> > +
> > +   if (map->map_sync)
> > +   sprintf([i], " S");
> 
> Does buffer need enlarging here?

Nope.  The buffer is 8 chars, and the 'rwx\0' string only uses 4.
"rwx S\0" uses 6, so we're still good to go.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [xfsprogs PATCH 1/2] xfs_io: add MAP_SYNC support to mmap()

2017-11-17 Thread Ross Zwisler
On Fri, Nov 17, 2017 at 12:35:43PM -0800, Dan Williams wrote:
> On Fri, Nov 17, 2017 at 12:25 PM, Ross Zwisler
> <ross.zwis...@linux.intel.com> wrote:
> > Add support for a new -S flag to xfs_io's mmap command.  This opens the
> > mapping with the (MAP_SYNC | MAP_SHARED_VALIDATE) flags instead of the
> > standard MAP_SHARED flag.
> >
> > Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> > Suggested-by: Dave Chinner <da...@fromorbit.com>
> > ---
> >  configure.ac  |  1 +
> >  include/builddefs.in  |  1 +
> >  include/linux.h   |  5 +
> >  io/io.h   |  1 +
> >  io/mmap.c | 19 ++-
> >  m4/package_libcdev.m4 | 16 
> >  man/man8/xfs_io.8 |  6 +-
> >  7 files changed, 43 insertions(+), 6 deletions(-)
> >
> > diff --git a/configure.ac b/configure.ac
> > index cb23fb8..7153c9f 100644
> > --- a/configure.ac
> > +++ b/configure.ac
> > @@ -162,6 +162,7 @@ AC_HAVE_FSETXATTR
> >  AC_HAVE_MREMAP
> >  AC_NEED_INTERNAL_FSXATTR
> >  AC_HAVE_GETFSMAP
> > +AC_HAVE_MAP_SYNC
> >
> >  if test "$enable_blkid" = yes; then
> >  AC_HAVE_BLKID_TOPO
> > diff --git a/include/builddefs.in b/include/builddefs.in
> > index 1d454b6..78b71fe 100644
> > --- a/include/builddefs.in
> > +++ b/include/builddefs.in
> > @@ -114,6 +114,7 @@ HAVE_FSETXATTR = @have_fsetxattr@
> >  HAVE_MREMAP = @have_mremap@
> >  NEED_INTERNAL_FSXATTR = @need_internal_fsxattr@
> >  HAVE_GETFSMAP = @have_getfsmap@
> > +HAVE_MAP_SYNC = @have_map_sync@
> >
> >  GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
> >  # -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl
> > diff --git a/include/linux.h b/include/linux.h
> > index 6ce344c..4ee03ed 100644
> > --- a/include/linux.h
> > +++ b/include/linux.h
> > @@ -327,4 +327,9 @@ fsmap_advance(
> >  #define HAVE_GETFSMAP
> >  #endif /* HAVE_GETFSMAP */
> >
> > +#ifndef HAVE_MAP_SYNC
> > +#define MAP_SYNC 0x8
> 
> Hmm, this is the x86 specific value of MAP_SYNC. I think it would be
> better to define MAP_SYNC to zero in this case and check for MAP_SYNC
> == 0 at run time.

Ah, so instead of defining MAP_SYNC if the proper headers aren't installed,
set to 0 and disallow the -S option?  Fair enough.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [xfsprogs PATCH 2/2] xfs_io: add a new 'log_writes' command

2017-11-17 Thread Ross Zwisler
On Fri, Nov 17, 2017 at 03:03:39PM -0600, Eric Sandeen wrote:
> On 11/17/17 2:48 PM, Ross Zwisler wrote:
> > On Fri, Nov 17, 2017 at 02:39:07PM -0600, Eric Sandeen wrote:
> >> On 11/17/17 2:25 PM, Ross Zwisler wrote:
> >>> Add a new 'log_writes' command to xfs_io so that we can add dm-log-writes
> >>> log marks via the external 'dmsetup' executable.  It's helpful to allow
> >>> users of xfs_io to adds these marks from within xfs_io instead of waiting
> >>> until after xfs_io exits because then they are able to replay the
> >>> dm-log-writes log up to immediately after another xfs_io operation such as
> >>> mwrite.  This isolates the log replay from other operations that happen as
> >>> part of xfs_io exiting (file handles being closed, mmaps being torn down,
> >>> etc.).  This also allows users to insert multiple marks between different
> >>> xfs_io commands.
> >>>
> >>> Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> >>> Suggested-by: Dave Chinner <da...@fromorbit.com>
> >>
> >> Without reviewing in detail, what is the advantage of wrapping dmsetup
> >> into xfs_io?  My first inclination is that there is none at all, and
> >> xfstests can call dmsetup as easily as they can call xfs_io.  No?
> >>
> >> -Eric
> > 
> > I commented on this a bit in the changelog for the 2nd patch:
> > 
> > It's helpful to allow users of xfs_io to adds these marks from within xfs_io
> > instead of waiting until after xfs_io exits because then they are able to
> > replay the dm-log-writes log up to immediately after another xfs_io 
> > operation
> > such as mwrite.  This isolates the log replay from other operations that
> > happen as part of xfs_io exiting (file handles being closed, mmaps being 
> > torn
> > down, etc.).  This also allows users to insert multiple marks between
> > different xfs_io commands.
> > 
> > I agree that the shell-out to dmsetup isn't awesome...  For the current 
> > test I
> > have written I think we can get away with just assuming that the xfs_io exit
> > stuff won't interact too heavily with the dm-log-writes log, and we could
> > potentially move the dmsetup call back into the fstest.  This is how I
> > initially had it, and moved it into the C program via shell-out in response 
> > to
> > Amir's feedback:
> 
> Sorry, terrible of me to not have read that.  :(  Ok, so next question - 
> DM_TARGET_MSG seems to be public, can we just invoke the ioctl directly
> instead of shelling out to dmsetup?
> 
> I'm checking w/ the dm folks too, to make sure that's expected to work.  As
> long as the use isn't too tricky it seems like that might be better.

Yea, that seems like a better option - I'll take a look.  Thanks for the
suggestion.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [xfsprogs PATCH 2/2] xfs_io: add a new 'log_writes' command

2017-11-17 Thread Ross Zwisler
On Fri, Nov 17, 2017 at 02:39:07PM -0600, Eric Sandeen wrote:
> On 11/17/17 2:25 PM, Ross Zwisler wrote:
> > Add a new 'log_writes' command to xfs_io so that we can add dm-log-writes
> > log marks via the external 'dmsetup' executable.  It's helpful to allow
> > users of xfs_io to adds these marks from within xfs_io instead of waiting
> > until after xfs_io exits because then they are able to replay the
> > dm-log-writes log up to immediately after another xfs_io operation such as
> > mwrite.  This isolates the log replay from other operations that happen as
> > part of xfs_io exiting (file handles being closed, mmaps being torn down,
> > etc.).  This also allows users to insert multiple marks between different
> > xfs_io commands.
> > 
> > Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> > Suggested-by: Dave Chinner <da...@fromorbit.com>
> 
> Without reviewing in detail, what is the advantage of wrapping dmsetup
> into xfs_io?  My first inclination is that there is none at all, and
> xfstests can call dmsetup as easily as they can call xfs_io.  No?
> 
> -Eric

I commented on this a bit in the changelog for the 2nd patch:

It's helpful to allow users of xfs_io to adds these marks from within xfs_io
instead of waiting until after xfs_io exits because then they are able to
replay the dm-log-writes log up to immediately after another xfs_io operation
such as mwrite.  This isolates the log replay from other operations that
happen as part of xfs_io exiting (file handles being closed, mmaps being torn
down, etc.).  This also allows users to insert multiple marks between
different xfs_io commands.

I agree that the shell-out to dmsetup isn't awesome...  For the current test I
have written I think we can get away with just assuming that the xfs_io exit
stuff won't interact too heavily with the dm-log-writes log, and we could
potentially move the dmsetup call back into the fstest.  This is how I
initially had it, and moved it into the C program via shell-out in response to
Amir's feedback:

https://lists.01.org/pipermail/linux-nvdimm/2017-October/012976.html
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[fstests PATCH v4 3/4] dm-log-writes: allow DAX to be used when possible

2017-11-17 Thread Ross Zwisler
Enhance _require_dm_target so that a user can request a minimum version of a
given dm target.

DAX support was added to v1.1.0 of the dm-log-writes kernel module, so
allow the DAX mount option starting with that version.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Suggested-by: Amir Goldstein <amir7...@gmail.com>
---
 common/dmlogwrites   |  5 +++--
 common/rc| 23 +--
 doc/requirement-checking.txt |  5 +++--
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/common/dmlogwrites b/common/dmlogwrites
index 80ed4fc..2cb804c 100644
--- a/common/dmlogwrites
+++ b/common/dmlogwrites
@@ -23,8 +23,9 @@ _require_log_writes()
[ -z "$LOGWRITES_DEV" -o ! -b "$LOGWRITES_DEV" ] && \
_notrun "This test requires a valid \$LOGWRITES_DEV"
 
-   _exclude_scratch_mount_option dax
-   _require_dm_target log-writes
+   local z=0
+   _scratch_has_mount_option dax && z=1
+   _require_dm_target log-writes 1 $z 0
_require_test_program "log-writes/replay-log"
 }
 
diff --git a/common/rc b/common/rc
index 701b1ff..c1f24ed 100644
--- a/common/rc
+++ b/common/rc
@@ -1782,6 +1782,18 @@ _require_sane_bdev_flush()
fi
 }
 
+_compare_dm_target_versions()
+{
+   for i in $(seq 0 2); do
+   if [[ ${_actual_ver[$i]} > ${_required_ver[$i]} ]]; then
+   return
+   fi
+   if [[ ${_actual_ver[$i]} < ${_required_ver[$i]} ]]; then
+   _notrun "dm $_target version ${_required_ver[@]} 
required"
+   fi
+   done
+}
+
 # this test requires a specific device mapper target
 _require_dm_target()
 {
@@ -1795,8 +1807,15 @@ _require_dm_target()
 
modprobe dm-$_target >/dev/null 2>&1
 
-   $DMSETUP_PROG targets 2>&1 | grep -q ^$_target
-   if [ $? -ne 0 ]; then
+   local _version=$($DMSETUP_PROG targets 2>&1 | grep ^$_target)
+   if [[ $_version =~ .*v([0-9]+)\.([0-9]+)\.([0-9]+) ]]; then
+   # check for a required minimum version?
+   if [[ $# == 4 ]]; then
+   local _actual_ver=(${BASH_REMATCH[@]:1})
+   local _required_ver=($2 $3 $4)
+   _compare_dm_target_versions
+   fi
+   else
_notrun "This test requires dm $_target support"
fi
 }
diff --git a/doc/requirement-checking.txt b/doc/requirement-checking.txt
index 4e01b1f..bbfc34f 100644
--- a/doc/requirement-checking.txt
+++ b/doc/requirement-checking.txt
@@ -112,10 +112,11 @@ _require_statx
 DEVICE MAPPER REQUIREMENTS
 ==
 
-_require_dm_target 
+_require_dm_target  [  ]
 
  The test requires the use of the device mapper target and will be skipped
- if it isn't available in the kernel.
+ if it isn't available in the kernel.  Optionally specify the minimum
+ three part version number of the dm target that is required.
 
 _require_log_writes
 
-- 
2.9.5

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[fstests PATCH v4 1/4] common/rc: add _scratch_has_mount_option()

2017-11-17 Thread Ross Zwisler
Allow tests to inquire about whether a mount option is set, rather than
just disallowing it via _exclude_scratch_mount_option().

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Suggested-by: Amir Goldstein <amir7...@gmail.com>
---
 common/rc | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/common/rc b/common/rc
index 0cda9da..701b1ff 100644
--- a/common/rc
+++ b/common/rc
@@ -3105,6 +3105,16 @@ _exclude_scratch_mount_option()
done
 }
 
+_scratch_has_mount_option()
+{
+   local mnt_opts=$(_normalize_mount_options)
+
+   if echo $mnt_opts | grep -qw "$1"; then
+   return 0
+   fi
+   return 1
+}
+
 _require_atime()
 {
_exclude_scratch_mount_option "noatime"
-- 
2.9.5

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[fstests PATCH v4 2/4] dm-log-writes: only replay log to marks that exist

2017-11-17 Thread Ross Zwisler
The 'replay-log' executable will replay the dm-log-writes log until the
given mark, or until the end of the log if the mark isn't found.

This means that if the mark you're looking for was never inserted in the
log or if you give garbage to _log_writes_replay_log() the entire log will
be replayed.  This can cause unexpected test results.

Fix this by making sure that the mark we're given actually exists in the
log before we allow the replay.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 common/dmlogwrites | 4 
 1 file changed, 4 insertions(+)

diff --git a/common/dmlogwrites b/common/dmlogwrites
index 247c744..80ed4fc 100644
--- a/common/dmlogwrites
+++ b/common/dmlogwrites
@@ -72,6 +72,10 @@ _log_writes_replay_log()
 {
_mark=$1
 
+   $here/src/log-writes/replay-log --log $LOGWRITES_DEV --find \
+   --end-mark $_mark >> $seqres.full 2>&1
+   [ $? -ne 0 ] && _fail "mark does not exist"
+
$here/src/log-writes/replay-log --log $LOGWRITES_DEV --replay 
$SCRATCH_DEV \
--end-mark $_mark >> $seqres.full 2>&1
[ $? -ne 0 ] && _fail "replay failed"
-- 
2.9.5

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[fstests PATCH v4 4/4] generic: add test for DAX MAP_SYNC support

2017-11-17 Thread Ross Zwisler
This test creates a file and writes to it via an mmap(), but never syncs
via fsync/msync.  This process is tracked via dm-log-writes, then replayed.

If MAP_SYNC is working the dm-log-writes replay will show the test file
with 1 MiB of on-media block allocations.  This is because each allocating
page fault included an implicit metadata sync.  If MAP_SYNC isn't working
(which you can test by removing the "-S" flag to xfs_io mmap) the file
will be smaller or missing entirely.

Note that dm-log-writes doesn't track the data that we write via the
mmap(), so we can't do any data integrity checking.  We can only verify
that the metadata writes for the page faults happened.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 tests/generic/468 | 83 +++
 tests/generic/468.out |  3 ++
 tests/generic/group   |  1 +
 3 files changed, 87 insertions(+)
 create mode 100755 tests/generic/468
 create mode 100644 tests/generic/468.out

diff --git a/tests/generic/468 b/tests/generic/468
new file mode 100755
index 000..fcbbf34
--- /dev/null
+++ b/tests/generic/468
@@ -0,0 +1,83 @@
+#! /bin/bash
+# FS QA Test No. 468
+#
+# Use dm-log-writes to verify that MAP_SYNC actually syncs metadata during
+# page faults.
+#
+#---
+# Copyright (c) 2017 Intel Corporation.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _log_writes_cleanup
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmlogwrites
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_log_writes
+_require_scratch_dax
+_require_xfs_io_command "log_writes"
+
+_log_writes_init
+_log_writes_mkfs >> $seqres.full 2>&1
+_log_writes_mount -o dax
+
+LEN=$((1024 * 1024)) # 1 MiB
+
+xfs_io -t -c "truncate $LEN" -c "mmap -S 0 $LEN" -c "mwrite 0 $LEN" \
+   -c "log_writes -d $LOGWRITES_NAME -m preunmap" \
+   -f $SCRATCH_MNT/test > /dev/null 2>&1
+
+# Unmount the scratch dir and tear down the log writes target
+_log_writes_unmount
+_log_writes_remove
+_check_scratch_fs
+
+# destroy previous filesystem so we can be sure our rebuild works
+_scratch_mkfs >> $seqres.full 2>&1
+
+# check pre-unmap state
+_log_writes_replay_log preunmap
+_scratch_mount
+
+# We should see $SCRATCH_MNT/test as having 1 MiB in block allocations
+stat -c "%s" $SCRATCH_MNT/test
+
+_scratch_unmount
+_check_scratch_fs
+
+echo "Silence is golden"
+status=0
+exit
diff --git a/tests/generic/468.out b/tests/generic/468.out
new file mode 100644
index 000..5f897af
--- /dev/null
+++ b/tests/generic/468.out
@@ -0,0 +1,3 @@
+QA output created by 468
+1048576
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index 9183950..c061842 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -470,3 +470,4 @@
 465 auto rw quick aio
 466 auto quick rw
 467 auto quick exportfs
+468 auto quick dax
-- 
2.9.5

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[fstests PATCH v4 0/4] add test for DAX MAP_SYNC support

2017-11-17 Thread Ross Zwisler
The purpose of this series is to exercise the new MAP_SYNC mmap()
functionality [1].  It adds a test which uses dm-log-writes to try and
replay filesystem metadata operations for a file that is being written via
mmap().

If MAP_SYNC is active the dm-log-writes replay will recreate the file's
block allocations and you'll end up with a test file which is a known
size.

If MAP_SYNC is not active the metadata writes will most likely be lost and
the replay will either fail to create the test file at all or it will may
be smaller.  In all of my testing the file simply doesn't exist on the
replay if MAP_SYNC is ommited.

This test relies on a kernel with both the MAP_SYNC mmap() functionality
and the DAX enabling in dm-log-writes.  These will both appear in kernel
v4.15-rc1.  For ease of testing I've posted a kernel that is v4.14 plus
just those two patch series [2].

This test also relies on xfsprogs having support for MAP_SYNC and for
dm-log-writes.  I've posed a tree adding that support [3].  This xfsprogs
series is still under review so if the xfs_io interfaces change this
test will of course need to be updated as well.

Lastly, I've also posted a working version of this series [4].

Changes since v3:
 - Enhanced xfs_io with MAP_SYNC and dm-log-writes functionality instead of
   creating a one-off test program.  (Dave Chinner)

 - Improved dm target version checking. (Amir)

 - Fixed dm-log-writes replay issue, some general cleanup, broke changes
   out into a series.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2017-November/013164.html
[2]: 
https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/linux.git/log/?h=map_sync_dm_log_writes
[3]: 
https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/xfsprogs-dev.git/log/?h=map_sync
[4]: 
https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/xfstests-dev.git/log/?h=map_sync_test_v4

Ross Zwisler (4):
  common/rc: add _scratch_has_mount_option()
  dm-log-writes: only replay log to marks that exist
  dm-log-writes: allow DAX to be used when possible
  generic: add test for DAX MAP_SYNC support

 common/dmlogwrites   |  9 +++--
 common/rc| 33 --
 doc/requirement-checking.txt |  5 +--
 tests/generic/468| 83 
 tests/generic/468.out|  3 ++
 tests/generic/group  |  1 +
 6 files changed, 128 insertions(+), 6 deletions(-)
 create mode 100755 tests/generic/468
 create mode 100644 tests/generic/468.out

-- 
2.9.5

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[xfsprogs PATCH 0/2] Add necessary items for MAP_SYNC testing

2017-11-17 Thread Ross Zwisler
As suggested by Dave Chinner:

As I say to all these sorts of one-off test prgrams: please add the
new MAP_SYNC flag to xfs_io rather than writing a one-off
test program to set it and write some data.

And if we're going to be adding special custom tests just because
we need to insert dm-log marks, add that functionality to xfs_io,
too.

That way we can create complex custom dm logwrite tests without
needing one-off test programs for them all...

This series enhances xfs_io by adding support for the MAP_SYNC mmap() flag
and for dm-log-writes marks.  This allows the resulting xfstest for
MAP_SYNC to be much simpler and have no custom C programs.

You can find an xfsprogs branch with this series here:

https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/xfsprogs-dev.git/log/?h=map_sync

Both MAP_SYNC and the DAX enhancements for dm-log-writes will be found in
the upcoming v4.15-rc1.  For ease of testing I've posted a kernel that is
v4.14 plus just those two patch series here:

https://git.kernel.org/pub/scm/linux/kernel/git/zwisler/linux.git/log/?h=map_sync_dm_log_writes

Ross Zwisler (2):
  xfs_io: add MAP_SYNC support to mmap()
  xfs_io: add a new 'log_writes' command

 configure.ac  |  1 +
 include/builddefs.in  |  1 +
 include/linux.h   |  5 
 io/Makefile   |  5 ++--
 io/init.c |  1 +
 io/io.h   |  2 ++
 io/log_writes.c   | 78 +++
 io/mmap.c | 19 +
 m4/package_libcdev.m4 | 16 +++
 man/man8/xfs_io.8 | 25 -
 10 files changed, 145 insertions(+), 8 deletions(-)
 create mode 100644 io/log_writes.c

--
2.9.5

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[xfsprogs PATCH 2/2] xfs_io: add a new 'log_writes' command

2017-11-17 Thread Ross Zwisler
Add a new 'log_writes' command to xfs_io so that we can add dm-log-writes
log marks via the external 'dmsetup' executable.  It's helpful to allow
users of xfs_io to adds these marks from within xfs_io instead of waiting
until after xfs_io exits because then they are able to replay the
dm-log-writes log up to immediately after another xfs_io operation such as
mwrite.  This isolates the log replay from other operations that happen as
part of xfs_io exiting (file handles being closed, mmaps being torn down,
etc.).  This also allows users to insert multiple marks between different
xfs_io commands.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
Suggested-by: Dave Chinner <da...@fromorbit.com>
---
 io/Makefile   |  5 ++--
 io/init.c |  1 +
 io/io.h   |  1 +
 io/log_writes.c   | 78 +++
 man/man8/xfs_io.8 | 19 ++
 5 files changed, 102 insertions(+), 2 deletions(-)
 create mode 100644 io/log_writes.c

diff --git a/io/Makefile b/io/Makefile
index 050d6bd..51b2eae 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -10,8 +10,9 @@ LSRCFILES = xfs_bmap.sh xfs_freeze.sh xfs_mkfile.sh
 HFILES = init.h io.h
 CFILES = init.c \
attr.c bmap.c cowextsize.c encrypt.c file.c freeze.c fsync.c \
-   getrusage.c imap.c link.c mmap.c open.c parent.c pread.c prealloc.c \
-   pwrite.c reflink.c seek.c shutdown.c stat.c sync.c truncate.c utimes.c
+   getrusage.c imap.c link.c log_writes.c mmap.c open.c parent.c pread.c \
+   prealloc.c pwrite.c reflink.c seek.c shutdown.c stat.c sync.c \
+   truncate.c utimes.c
 
 LLDLIBS = $(LIBXCMD) $(LIBHANDLE) $(LIBPTHREAD)
 LTDEPENDENCIES = $(LIBXCMD) $(LIBHANDLE)
diff --git a/io/init.c b/io/init.c
index 20d5f80..34d87b5 100644
--- a/io/init.c
+++ b/io/init.c
@@ -72,6 +72,7 @@ init_commands(void)
help_init();
imap_init();
inject_init();
+   log_writes_init();
madvise_init();
mincore_init();
mmap_init();
diff --git a/io/io.h b/io/io.h
index 8b2753b..d62034a 100644
--- a/io/io.h
+++ b/io/io.h
@@ -109,6 +109,7 @@ extern void getrusage_init(void);
 extern voidhelp_init(void);
 extern voidimap_init(void);
 extern voidinject_init(void);
+extern voidlog_writes_init(void);
 extern voidmmap_init(void);
 extern voidopen_init(void);
 extern voidparent_init(void);
diff --git a/io/log_writes.c b/io/log_writes.c
new file mode 100644
index 000..bc3952c
--- /dev/null
+++ b/io/log_writes.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "platform_defs.h"
+#include "command.h"
+#include "init.h"
+
+static cmdinfo_t log_writes_cmd;
+
+static int
+mark_log(char *device, char *mark)
+{
+   char command[256];
+
+   snprintf(command, 256, "dmsetup message %s 0 mark %s",
+   device, mark);
+
+   return system(command);
+}
+
+static int
+log_writes_f(
+   int argc,
+   char**argv)
+{
+   char *device = NULL;
+   char *mark = NULL;
+   int c;
+
+   while ((c = getopt(argc, argv, "d:m:")) != EOF) {
+   switch (c) {
+   case 'd':
+   device = optarg;
+   break;
+   case 'm':
+   mark = optarg;
+   break;
+   default:
+   return command_usage(_writes_cmd);
+   }
+   }
+
+   if (device == NULL || mark == NULL)
+   return command_usage(_writes_cmd);
+
+   return mark_log(device, mark);
+}
+
+void
+log_writes_init(void)
+{
+   log_writes_cmd.name = "log_writes";
+   log_writes_cmd.altname = "lw";
+   log_writes_cmd.cfunc = log_writes_f;
+   log_writes_cmd.flags = CMD_NOMAP_OK | CMD_NOFILE_OK | CMD_FOREIGN_OK;
+   log_writes_cmd.argmin = 0;
+   log_writes_cmd.argmax = -1;
+   log_writes_cmd.args = _("-d device -m mark");
+   log_writes_cmd.oneline =
+   _("uses dmsetup to interact with the dm-log-writes module");
+
+   add_command(_writes_cm

Re: [fstests PATCH v3] generic: add test for DAX MAP_SYNC support

2017-11-16 Thread Ross Zwisler
On Thu, Oct 26, 2017 at 07:59:39AM +0300, Amir Goldstein wrote:
> On Wed, Oct 25, 2017 at 11:47 PM, Ross Zwisler
> <ross.zwis...@linux.intel.com> wrote:
> > Add a test that exercises DAX's new MAP_SYNC flag.
> >
> > This test creates a file and writes to it via an mmap(), but never syncs
> > via fsync/msync.  This process is tracked via dm-log-writes, then replayed.
> >
> > If MAP_SYNC is working the dm-log-writes replay will show the test file
> > with 1MiB of on-media block allocations.  This is because each allocating
> > page fault included an implicit metadata sync.  If MAP_SYNC isn't working
> > (which you can test by fiddling with the parameters to mmap()) the file
> > will be smaller or missing entirely.
> >
> > Note that dm-log-writes doesn't track the data that we write via the
> > mmap(), so we can't do any data integrity checking.  We can only verify
> > that the metadata writes for the page faults happened.
> >
> > Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> >
> > ---
> >
> > Changes since v2:
> >  - Fixed _require_log_writes() so that DAX will be disallowed if the
> >version of the dm-log-writes target is older than v1.1.0.  (Amir)
> 
> It seems like your kernel patch bumped the version to 1.0.1...

Ah, yep, that's the version number that I submitted but Mike changed it to
v1.1.0 in his tree.  Here's the patch that was merged for v4.15-rc1:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers?id=98d82f48f1983ceef5c8d2f6c87bfee2918790ee

> >
> > ---
> >  .gitignore|  1 +
> >  common/dmlogwrites|  4 ++-
> >  common/rc |  6 ++--
> >  src/Makefile  |  3 +-
> >  src/t_map_sync.c  | 92 
> > +++
> >  tests/generic/466 | 76 ++
> >  tests/generic/466.out |  3 ++
> >  tests/generic/group   |  1 +
> >  8 files changed, 182 insertions(+), 4 deletions(-)
> >  create mode 100644 src/t_map_sync.c
> >  create mode 100755 tests/generic/466
> >  create mode 100644 tests/generic/466.out
> >
> > diff --git a/.gitignore b/.gitignore
> > index 2014c08..9fc0695 100644
> > --- a/.gitignore
> > +++ b/.gitignore
> > @@ -119,6 +119,7 @@
> >  /src/t_getcwd
> >  /src/t_holes
> >  /src/t_immutable
> > +/src/t_map_sync
> >  /src/t_mmap_cow_race
> >  /src/t_mmap_dio
> >  /src/t_mmap_fallocate
> > diff --git a/common/dmlogwrites b/common/dmlogwrites
> > index 247c744..71d008d 100644
> > --- a/common/dmlogwrites
> > +++ b/common/dmlogwrites
> > @@ -23,8 +23,10 @@ _require_log_writes()
> > [ -z "$LOGWRITES_DEV" -o ! -b "$LOGWRITES_DEV" ] && \
> > _notrun "This test requires a valid \$LOGWRITES_DEV"
> >
> > -   _exclude_scratch_mount_option dax
> > _require_dm_target log-writes
> > +   if [[ ${DMTARGET_VER[0]} == 1 && ${DMTARGET_VER[1]} < 1 ]]; then
> > +   _exclude_scratch_mount_option dax
> > +   fi
> 
> IMO, this would be better as:
> 
> local z=0
> _scratch_has_mount_option dax && z=1
> _require_dm_target log-writes 1 0 $z
> 
> Or something like that

Yep, this is nicer.  Fixed.

Thanks for the review.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


<    1   2   3   4   5   6   7   8   >