[ndctl PATCH] ndctl, list: export mapping position data

2017-08-04 Thread Dan Williams
If the kernel provides position data include it in the region mapping
listing.

Signed-off-by: Dan Williams 
---
 ndctl/lib/libndctl.c   |   16 ++--
 ndctl/lib/libndctl.sym |1 +
 ndctl/libndctl.h.in|1 +
 util/json.c|9 +
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
index dda134549842..c2e0efbd87b0 100644
--- a/ndctl/lib/libndctl.c
+++ b/ndctl/lib/libndctl.c
@@ -166,6 +166,7 @@ struct ndctl_dimm {
  * @dimm: backing dimm for the mapping
  * @offset: dimm relative offset
  * @length: span of the extent
+ * @position: interleave-order of the extent
  *
  * This data can be used to identify the dimm ranges contributing to a
  * region / interleave-set and identify how regions alias each other.
@@ -174,6 +175,7 @@ struct ndctl_mapping {
struct ndctl_region *region;
struct ndctl_dimm *dimm;
unsigned long long offset, length;
+   int position;
struct list_node list;
 };
 
@@ -2723,6 +2725,7 @@ static void mappings_init(struct ndctl_region *region)
unsigned long long offset, length;
struct ndctl_dimm *dimm;
unsigned int dimm_id;
+   int position, match;
 
sprintf(mapping_path, "%s/mapping%d", region->region_path, i);
if (sysfs_read_attr(ctx, mapping_path, buf) < 0) {
@@ -2731,8 +2734,11 @@ static void mappings_init(struct ndctl_region *region)
continue;
}
 
-   if (sscanf(buf, "nmem%u,%llu,%llu", _id, ,
-   ) != 3) {
+   match = sscanf(buf, "nmem%u,%llu,%llu,%d", _id, ,
+   , );
+   if (match < 4)
+   position = -1;
+   if (match < 3) {
err(ctx, "bus%d mapping parse failure\n",
ndctl_bus_get_id(bus));
continue;
@@ -2756,6 +2762,7 @@ static void mappings_init(struct ndctl_region *region)
mapping->offset = offset;
mapping->length = length;
mapping->dimm = dimm;
+   mapping->position = position;
list_add(>mappings, >list);
}
free(mapping_path);
@@ -2790,6 +2797,11 @@ NDCTL_EXPORT unsigned long long 
ndctl_mapping_get_length(struct ndctl_mapping *m
return mapping->length;
 }
 
+NDCTL_EXPORT int ndctl_mapping_get_position(struct ndctl_mapping *mapping)
+{
+   return mapping->position;
+}
+
 NDCTL_EXPORT struct ndctl_region *ndctl_mapping_get_region(
struct ndctl_mapping *mapping)
 {
diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
index 0e592435ff70..b8ac65f2917f 100644
--- a/ndctl/lib/libndctl.sym
+++ b/ndctl/lib/libndctl.sym
@@ -157,6 +157,7 @@ global:
ndctl_mapping_get_region;
ndctl_mapping_get_offset;
ndctl_mapping_get_length;
+   ndctl_mapping_get_position;
ndctl_namespace_get_first;
ndctl_namespace_get_next;
ndctl_namespace_get_ctx;
diff --git a/ndctl/libndctl.h.in b/ndctl/libndctl.h.in
index 200c5cf8781a..855d883c3380 100644
--- a/ndctl/libndctl.h.in
+++ b/ndctl/libndctl.h.in
@@ -460,6 +460,7 @@ struct ndctl_bus *ndctl_mapping_get_bus(struct 
ndctl_mapping *mapping);
 struct ndctl_region *ndctl_mapping_get_region(struct ndctl_mapping *mapping);
 unsigned long long ndctl_mapping_get_offset(struct ndctl_mapping *mapping);
 unsigned long long ndctl_mapping_get_length(struct ndctl_mapping *mapping);
+int ndctl_mapping_get_position(struct ndctl_mapping *mapping);
 
 struct ndctl_namespace;
 struct ndctl_namespace *ndctl_namespace_get_first(struct ndctl_region *region);
diff --git a/util/json.c b/util/json.c
index 5f1492711c47..98165b789ff8 100644
--- a/util/json.c
+++ b/util/json.c
@@ -737,6 +737,7 @@ struct json_object *util_mapping_to_json(struct 
ndctl_mapping *mapping,
struct json_object *jmapping = json_object_new_object();
struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(mapping);
struct json_object *jobj;
+   int position;
 
if (!jmapping)
return NULL;
@@ -756,6 +757,14 @@ struct json_object *util_mapping_to_json(struct 
ndctl_mapping *mapping,
goto err;
json_object_object_add(jmapping, "length", jobj);
 
+   position = ndctl_mapping_get_position(mapping);
+   if (position >= 0) {
+   jobj = json_object_new_int(position);
+   if (!jobj)
+   goto err;
+   json_object_object_add(jmapping, "position", jobj);
+   }
+
return jmapping;
  err:
json_object_put(jmapping);

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH] nfit, libnvdimm, region: export 'position' in mapping info

2017-08-04 Thread Dan Williams
It is useful to be able to know the position of a DIMM in an
interleave-set. Consider the case where the order of the DIMMs changes
causing a namespace to be invalidated because the interleave-set cookie no
longer matches. If the before and after state of each DIMM position is
known, this state can be debugged by the system owner.

Signed-off-by: Dan Williams 
---
 drivers/acpi/nfit/core.c |   24 
 drivers/nvdimm/nd.h  |1 +
 drivers/nvdimm/region_devs.c |6 --
 include/linux/libnvdimm.h|1 +
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 19182d091587..be231a549eb0 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1835,6 +1835,30 @@ static int acpi_nfit_init_interleave_set(struct 
acpi_nfit_desc *acpi_desc,
cmp_map_compat, NULL);
nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
 
+   /* record the result of the sort for the mapping position */
+   for (i = 0; i < nr; i++) {
+   struct nfit_set_info_map2 *map2 = >mapping[i];
+   int j;
+
+   for (j = 0; j < nr; j++) {
+   struct nd_mapping_desc *mapping = _desc->mapping[j];
+   struct nvdimm *nvdimm = mapping->nvdimm;
+   struct nfit_mem *nfit_mem = 
nvdimm_provider_data(nvdimm);
+
+   if (map2->serial_number
+   == nfit_mem->dcr->serial_number &&
+   map2->vendor_id
+   == nfit_mem->dcr->vendor_id &&
+   map2->manufacturing_date
+   == nfit_mem->dcr->manufacturing_date &&
+   map2->manufacturing_location
+   == nfit_mem->dcr->manufacturing_location) {
+   mapping->position = i;
+   break;
+   }
+   }
+   }
+
ndr_desc->nd_set = nd_set;
devm_kfree(dev, info);
devm_kfree(dev, info2);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index e9fa9e84b364..a08fc2e24fb3 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -134,6 +134,7 @@ struct nd_mapping {
struct nvdimm *nvdimm;
u64 start;
u64 size;
+   int position;
struct list_head labels;
struct mutex lock;
/*
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 5954cfbea3fc..829d760f651c 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -723,8 +723,9 @@ static ssize_t mappingN(struct device *dev, char *buf, int 
n)
nd_mapping = _region->mapping[n];
nvdimm = nd_mapping->nvdimm;
 
-   return sprintf(buf, "%s,%llu,%llu\n", dev_name(>dev),
-   nd_mapping->start, nd_mapping->size);
+   return sprintf(buf, "%s,%llu,%llu,%d\n", dev_name(>dev),
+   nd_mapping->start, nd_mapping->size,
+   nd_mapping->position);
 }
 
 #define REGION_MAPPING(idx) \
@@ -965,6 +966,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus 
*nvdimm_bus,
nd_region->mapping[i].nvdimm = nvdimm;
nd_region->mapping[i].start = mapping->start;
nd_region->mapping[i].size = mapping->size;
+   nd_region->mapping[i].position = mapping->position;
INIT_LIST_HEAD(_region->mapping[i].labels);
mutex_init(_region->mapping[i].lock);
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index f3d3e6af8838..9b8d81a7b80e 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -87,6 +87,7 @@ struct nd_mapping_desc {
struct nvdimm *nvdimm;
u64 start;
u64 size;
+   int position;
 };
 
 struct nd_region_desc {

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 2/5] fs, xfs: introduce FALLOC_FL_SEAL_BLOCK_MAP

2017-08-04 Thread Dave Chinner
On Fri, Aug 04, 2017 at 04:43:50PM -0700, Dan Williams wrote:
> On Fri, Aug 4, 2017 at 4:31 PM, Dave Chinner  wrote:
> > On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
> >> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> >> index fe0f8f7f4bb7..46d8eb9e19fc 100644
> >> --- a/fs/xfs/xfs_bmap_util.c
> >> +++ b/fs/xfs/xfs_bmap_util.c
> >> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
> >>
> >>  }
> >>
> >> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
> >> +STATIC int
> >> +xfs_file_has_holes(
> >> + struct xfs_inode*ip)
> >> +{
> >
> > Why do we need this function?
> >
> > We've just run xfs_alloc_file_space() across the entire range we
> > are sealing, so we've already guaranteed that it won't have holes
> > in it.
> 
> I'm sure this is due to my ignorance of the scope of XFS_IOLOCK_EXCL
> vs XFS_ILOCK_EXCL. I had assumed that since we drop and retake
> XFS_ILOCK_EXCL that we need to re-validate the block map before
> setting S_IOMAP_IMMUTABLE.

The ILOCK is there to protect the inode metadata when there is
concurrent access through the IO/MMAP lock paths.  However, if we
hold the IOLOCK_EXCL and the MMAPLOCK_EXCL, then nothing can get
through the IO interfaces to modify the data in the file.  This is
required because APIs that directly modify the extent map (e.g.
fallocate, truncate, etc) have to lock out the IO path to ensure
there are no IOs in flight across the range we are manipulating.

Holding these locks also locks out other APIs that modify the extent
map and so effectively nothing else can be accessing or modifying
the extent map while a fallocate or truncate operation is in
progress.

Cheers,

Dave.

-- 
Dave Chinner
da...@fromorbit.com
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [RFC/Patch 4/5] libndctl Make interfaces to use Translate SPA

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 2:12 AM, Yasunori Goto  wrote:
> ndctl:libndctl Make interfaces to use Translate SPA.

A process note, please separate kernel patches and ndctl patches into
a different set. The expectation is that all the [PATCH 1/n] to [PATCH
n/n] should apply to the same project.

I have typically been marking my ndctl patches with a prefix like
[ndctl PATCH 1/n], where the kernel patches just say [PATCH].

>
> This patch makes 3 new interfaces :
>   - to ask bus has translate SPA feature.
>   - to call translate SPA.
>   - to find DIMM by SPA address.
>
>
> Note) I'm not sure how many buffer should be prepared, because
>   it depends on max # of mirroring way.
>   This patch assumes the maximum # is 4-way.

I think it is ok for now to just reserve space for 1-DIMM. When / if
mirroring support materializes in the kernel you should then be able
to detect how many DIMMs are in the mirror set. Since mirror support
would require changes to the Namespace Label specification I think we
have time to circle back and fix up ndctl for the multiple DIMM per
Translate SPA request case.

Otherwise, the kernel will handle the case where the provided buffer
is too small and return an error.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 4/5] xfs: introduce XFS_DIFLAG2_IOMAP_IMMUTABLE

2017-08-04 Thread Darrick J. Wong
On Sat, Aug 05, 2017 at 09:46:15AM +1000, Dave Chinner wrote:
> On Fri, Aug 04, 2017 at 01:33:12PM -0700, Darrick J. Wong wrote:
> > On Thu, Aug 03, 2017 at 07:28:30PM -0700, Dan Williams wrote:
> > > Add an on-disk inode flag to record the state of the S_IOMAP_IMMUTABLE
> > > in-memory vfs inode flags. This allows the protections against reflink
> > > and hole punch to be automatically restored on a sub-sequent boot when
> > > the in-memory inode is established.
> > > 
> > > The FS_XFLAG_IOMAP_IMMUTABLE is introduced to allow xfs_io to read the
> > > state of the flag, but toggling the flag requires going through
> > > fallocate(FALLOC_FL_[UN]SEAL_BLOCK_MAP). Support for toggling this
> > > on-disk state is saved for a later patch.
> > > 
> > > Cc: Jan Kara 
> > > Cc: Jeff Moyer 
> > > Cc: Christoph Hellwig 
> > > Cc: Ross Zwisler 
> > > Suggested-by: Dave Chinner 
> > > Suggested-by: "Darrick J. Wong" 
> > > Signed-off-by: Dan Williams 
> > > ---
> > >  fs/xfs/libxfs/xfs_format.h |5 -
> > >  fs/xfs/xfs_inode.c |2 ++
> > >  fs/xfs/xfs_ioctl.c |1 +
> > >  fs/xfs/xfs_iops.c  |8 +---
> > >  include/uapi/linux/fs.h|1 +
> > >  5 files changed, 13 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > > index d4d9bef20c3a..9e720e55776b 100644
> > > --- a/fs/xfs/libxfs/xfs_format.h
> > > +++ b/fs/xfs/libxfs/xfs_format.h
> > > @@ -1063,12 +1063,15 @@ static inline void xfs_dinode_put_rdev(struct 
> > > xfs_dinode *dip, xfs_dev_t rdev)
> > >  #define XFS_DIFLAG2_DAX_BIT  0   /* use DAX for this inode */
> > >  #define XFS_DIFLAG2_REFLINK_BIT  1   /* file's blocks may be shared 
> > > */
> > >  #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size 
> > > hint */
> > > +#define XFS_DIFLAG2_IOMAP_IMMUTABLE_BIT 3 /* set S_IOMAP_IMMUTABLE for 
> > > this inode */
> > 
> > So... the greedy part of my brain that doesn't want to give out flags2
> > bits has been wondering,
> 
> FWIW, I made di_flags2 a 64 bit value in the first place precisely
> so we didn't have a scarcity problem and can just give out flag
> bits for enabling new functionality like this...

Ok.  That's what I thought.

> > what if we just didn't have an on-disk
> > IOMAP_IMMUTABLE bit, and set FS_XFLAG based only on the in-core
> > S_IOMAP_IMMUTABLE bit?  If a program wants the immutable iomap
> > semantics, they will have to code some variant on the following:
> > 
> > fd = open(...);
> > ret = fallocate(fd, FALLOC_FL_SEAL_BLOCK_MAP, 0, len...)
> > if (ret) {
> > printf("couldn't seal block map");
> > close(fd);
> > return;
> > }
> > 
> > mmap(fd...);
> > /* do sensitive io operations here */
> > munmap(fd...);
> > 
> > close(fd);
> > 
> > Therefore the cost of not having the on-disk flag is that we'll have to
> > do more unshare/alloc/test/set cycles than we would if we could remember
> > the iomap-immutable state across unmounts and inode reclaiming.
> > However, if the data map is already ready to go, this shouldn't have a
> > lot of overhead since we only have to iterate the in-core extents.
> > 
> > Just trying to make sure we /need/ the inode flag bit. :)
> 
> IMO, fallocate() is for making permanent changes to file extents. If
> this is not going to be a permanent state change but only a
> runtime-while-the-inode-is-in-cache flag, then it's probably not the
> right interface to use.
> 
> This also seems problematic for applications other than DAX where
> the block map may be sealed, the fd closed and access handed off to
> another entity for remote storage access. If the inode gets
> reclaimed due to memory pressure, the system loses the fact that
> that the inode has been sealed. Hence another process can come
> along, re-read the inode and modify the block map because it hasn't
> been sealed in this new cache life cycle.

 Ok, I'm convinced. :)

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 4/5] xfs: introduce XFS_DIFLAG2_IOMAP_IMMUTABLE

2017-08-04 Thread Dave Chinner
On Fri, Aug 04, 2017 at 01:33:12PM -0700, Darrick J. Wong wrote:
> On Thu, Aug 03, 2017 at 07:28:30PM -0700, Dan Williams wrote:
> > Add an on-disk inode flag to record the state of the S_IOMAP_IMMUTABLE
> > in-memory vfs inode flags. This allows the protections against reflink
> > and hole punch to be automatically restored on a sub-sequent boot when
> > the in-memory inode is established.
> > 
> > The FS_XFLAG_IOMAP_IMMUTABLE is introduced to allow xfs_io to read the
> > state of the flag, but toggling the flag requires going through
> > fallocate(FALLOC_FL_[UN]SEAL_BLOCK_MAP). Support for toggling this
> > on-disk state is saved for a later patch.
> > 
> > Cc: Jan Kara 
> > Cc: Jeff Moyer 
> > Cc: Christoph Hellwig 
> > Cc: Ross Zwisler 
> > Suggested-by: Dave Chinner 
> > Suggested-by: "Darrick J. Wong" 
> > Signed-off-by: Dan Williams 
> > ---
> >  fs/xfs/libxfs/xfs_format.h |5 -
> >  fs/xfs/xfs_inode.c |2 ++
> >  fs/xfs/xfs_ioctl.c |1 +
> >  fs/xfs/xfs_iops.c  |8 +---
> >  include/uapi/linux/fs.h|1 +
> >  5 files changed, 13 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > index d4d9bef20c3a..9e720e55776b 100644
> > --- a/fs/xfs/libxfs/xfs_format.h
> > +++ b/fs/xfs/libxfs/xfs_format.h
> > @@ -1063,12 +1063,15 @@ static inline void xfs_dinode_put_rdev(struct 
> > xfs_dinode *dip, xfs_dev_t rdev)
> >  #define XFS_DIFLAG2_DAX_BIT0   /* use DAX for this inode */
> >  #define XFS_DIFLAG2_REFLINK_BIT1   /* file's blocks may be shared 
> > */
> >  #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint 
> > */
> > +#define XFS_DIFLAG2_IOMAP_IMMUTABLE_BIT 3 /* set S_IOMAP_IMMUTABLE for 
> > this inode */
> 
> So... the greedy part of my brain that doesn't want to give out flags2
> bits has been wondering,

FWIW, I made di_flags2 a 64 bit value in the first place precisely
so we didn't have a scarcity problem and can just give out flag
bits for enabling new functionality like this...

> what if we just didn't have an on-disk
> IOMAP_IMMUTABLE bit, and set FS_XFLAG based only on the in-core
> S_IOMAP_IMMUTABLE bit?  If a program wants the immutable iomap
> semantics, they will have to code some variant on the following:
> 
> fd = open(...);
> ret = fallocate(fd, FALLOC_FL_SEAL_BLOCK_MAP, 0, len...)
> if (ret) {
>   printf("couldn't seal block map");
>   close(fd);
>   return;
> }
> 
> mmap(fd...);
> /* do sensitive io operations here */
> munmap(fd...);
> 
> close(fd);
> 
> Therefore the cost of not having the on-disk flag is that we'll have to
> do more unshare/alloc/test/set cycles than we would if we could remember
> the iomap-immutable state across unmounts and inode reclaiming.
> However, if the data map is already ready to go, this shouldn't have a
> lot of overhead since we only have to iterate the in-core extents.
> 
> Just trying to make sure we /need/ the inode flag bit. :)

IMO, fallocate() is for making permanent changes to file extents. If
this is not going to be a permanent state change but only a
runtime-while-the-inode-is-in-cache flag, then it's probably not the
right interface to use.

This also seems problematic for applications other than DAX where
the block map may be sealed, the fd closed and access handed off to
another entity for remote storage access. If the inode gets
reclaimed due to memory pressure, the system loses the fact that
that the inode has been sealed. Hence another process can come
along, re-read the inode and modify the block map because it hasn't
been sealed in this new cache life cycle.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 2/5] fs, xfs: introduce FALLOC_FL_SEAL_BLOCK_MAP

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 4:31 PM, Dave Chinner  wrote:
> On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index fe0f8f7f4bb7..46d8eb9e19fc 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
>>
>>  }
>>
>> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
>> +STATIC int
>> +xfs_file_has_holes(
>> + struct xfs_inode*ip)
>> +{
>
> Why do we need this function?
>
> We've just run xfs_alloc_file_space() across the entire range we
> are sealing, so we've already guaranteed that it won't have holes
> in it.

I'm sure this is due to my ignorance of the scope of XFS_IOLOCK_EXCL
vs XFS_ILOCK_EXCL. I had assumed that since we drop and retake
XFS_ILOCK_EXCL that we need to re-validate the block map before
setting S_IOMAP_IMMUTABLE.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 2/5] fs, xfs: introduce FALLOC_FL_SEAL_BLOCK_MAP

2017-08-04 Thread Dave Chinner
On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index fe0f8f7f4bb7..46d8eb9e19fc 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
>  
>  }
>  
> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
> +STATIC int
> +xfs_file_has_holes(
> + struct xfs_inode*ip)
> +{

Why do we need this function?

We've just run xfs_alloc_file_space() across the entire range we
are sealing, so we've already guaranteed that it won't have holes
in it.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [RFC/Patch 2/5] Support Translate SPA for NVDIMM Root Device.

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 2:08 AM, Yasunori Goto  wrote:
>
> Support Translate SPA for NVDIMM Root Device.
>
> ACPI 6.2 has new specification on _DSM of NVDIMM Root Device
> It is "Translate SPA" which translate from system physical address(SPA)
> to NVDIMM handle and dimm physical address(DPA).
>
> This patch is to support Translate SPA.
>

Ah, yes, this is not needed. The Translate SPA command can be
submitted via ND_CMD_CALL mechanism.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 5/5] xfs: toggle XFS_DIFLAG2_IOMAP_IMMUTABLE in response to fallocate

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 1:53 PM, Darrick J. Wong  wrote:
> On Fri, Aug 04, 2017 at 01:47:32PM -0700, Dan Williams wrote:
>> On Fri, Aug 4, 2017 at 1:14 PM, Darrick J. Wong  
>> wrote:
>> > On Thu, Aug 03, 2017 at 07:28:35PM -0700, Dan Williams wrote:
>> >> After validating the state of the file as not having holes, shared
>> >> extents, or active mappings try to commit the
>> >> XFS_DIFLAG2_IOMAP_IMMUTABLE flag to the on-disk inode metadata. If that
>> >> succeeds then allow the S_IOMAP_IMMUTABLE to be set on the vfs inode.
>> >>
>> >> Cc: Jan Kara 
>> >> Cc: Jeff Moyer 
>> >> Cc: Christoph Hellwig 
>> >> Cc: Ross Zwisler 
>> >> Suggested-by: Dave Chinner 
>> >> Suggested-by: "Darrick J. Wong" 
>> >> Signed-off-by: Dan Williams 
>> >> ---
>> >>  fs/xfs/xfs_bmap_util.c |   32 
>> >>  1 file changed, 32 insertions(+)
>> >>
>> >> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> >> index 70ac2d33ab27..8464c25a2403 100644
>> >> --- a/fs/xfs/xfs_bmap_util.c
>> >> +++ b/fs/xfs/xfs_bmap_util.c
>> >> @@ -1436,9 +1436,11 @@ xfs_seal_file_space(
>> >>   xfs_off_t   offset,
>> >>   xfs_off_t   len)
>> >>  {
>> >> + struct xfs_mount*mp = ip->i_mount;
>> >>   struct inode*inode = VFS_I(ip);
>> >>   struct address_space*mapping = inode->i_mapping;
>> >>   int error;
>> >> + struct xfs_trans*tp;
>> >>
>> >>   ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
>> >>
>> >> @@ -1454,6 +1456,10 @@ xfs_seal_file_space(
>> >>   if (error)
>> >>   return error;
>> >>
>> >> + error = xfs_trans_alloc(mp, _RES(mp)->tr_ichange, 0, 0, 0, );
>> >> + if (error)
>> >> + return error;
>> >> +
>> >>   xfs_ilock(ip, XFS_ILOCK_EXCL);
>> >>   /*
>> >>* Either the size changed after we performed allocation /
>> >> @@ -1486,10 +1492,20 @@ xfs_seal_file_space(
>> >>   if (error < 0)
>> >>   goto out_unlock;
>> >>
>> >> + xfs_trans_ijoin(tp, ip, 0);
>> >
>> > FWIW if you change that third parameter to XFS_ILOCK_EXCL then
>> > xfs_trans_commit will do the xfs_iunlock(ip, XFS_ILOCK_EXCL) for you if
>> > the commit succeeds...
>> >
>> >> + ip->i_d.di_flags2 |= XFS_DIFLAG2_IOMAP_IMMUTABLE;
>> >> + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
>> >> + error = xfs_trans_commit(tp);
>> >> + tp = NULL; /* nothing to cancel */
>> >> + if (error)
>> >> + goto out_unlock;
>> >> +
>> >>   inode->i_flags |= S_IOMAP_IMMUTABLE;
>> >
>> > ...and then you can just return out here.
>>
>> Do we not need to hold XFS_ILOCK_EXCL over ->i_flags changes, or is
>> XFS_IOLOCK_EXCL enough?
>
> Oh, heh, I missed a piece.  Set the flag before the transaction commit,
> because if the commit fails the fs is shut down anyway. :)

Ah, got it, thanks.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 5/5] xfs: toggle XFS_DIFLAG2_IOMAP_IMMUTABLE in response to fallocate

2017-08-04 Thread Darrick J. Wong
On Fri, Aug 04, 2017 at 01:47:32PM -0700, Dan Williams wrote:
> On Fri, Aug 4, 2017 at 1:14 PM, Darrick J. Wong  
> wrote:
> > On Thu, Aug 03, 2017 at 07:28:35PM -0700, Dan Williams wrote:
> >> After validating the state of the file as not having holes, shared
> >> extents, or active mappings try to commit the
> >> XFS_DIFLAG2_IOMAP_IMMUTABLE flag to the on-disk inode metadata. If that
> >> succeeds then allow the S_IOMAP_IMMUTABLE to be set on the vfs inode.
> >>
> >> Cc: Jan Kara 
> >> Cc: Jeff Moyer 
> >> Cc: Christoph Hellwig 
> >> Cc: Ross Zwisler 
> >> Suggested-by: Dave Chinner 
> >> Suggested-by: "Darrick J. Wong" 
> >> Signed-off-by: Dan Williams 
> >> ---
> >>  fs/xfs/xfs_bmap_util.c |   32 
> >>  1 file changed, 32 insertions(+)
> >>
> >> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> >> index 70ac2d33ab27..8464c25a2403 100644
> >> --- a/fs/xfs/xfs_bmap_util.c
> >> +++ b/fs/xfs/xfs_bmap_util.c
> >> @@ -1436,9 +1436,11 @@ xfs_seal_file_space(
> >>   xfs_off_t   offset,
> >>   xfs_off_t   len)
> >>  {
> >> + struct xfs_mount*mp = ip->i_mount;
> >>   struct inode*inode = VFS_I(ip);
> >>   struct address_space*mapping = inode->i_mapping;
> >>   int error;
> >> + struct xfs_trans*tp;
> >>
> >>   ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
> >>
> >> @@ -1454,6 +1456,10 @@ xfs_seal_file_space(
> >>   if (error)
> >>   return error;
> >>
> >> + error = xfs_trans_alloc(mp, _RES(mp)->tr_ichange, 0, 0, 0, );
> >> + if (error)
> >> + return error;
> >> +
> >>   xfs_ilock(ip, XFS_ILOCK_EXCL);
> >>   /*
> >>* Either the size changed after we performed allocation /
> >> @@ -1486,10 +1492,20 @@ xfs_seal_file_space(
> >>   if (error < 0)
> >>   goto out_unlock;
> >>
> >> + xfs_trans_ijoin(tp, ip, 0);
> >
> > FWIW if you change that third parameter to XFS_ILOCK_EXCL then
> > xfs_trans_commit will do the xfs_iunlock(ip, XFS_ILOCK_EXCL) for you if
> > the commit succeeds...
> >
> >> + ip->i_d.di_flags2 |= XFS_DIFLAG2_IOMAP_IMMUTABLE;
> >> + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> >> + error = xfs_trans_commit(tp);
> >> + tp = NULL; /* nothing to cancel */
> >> + if (error)
> >> + goto out_unlock;
> >> +
> >>   inode->i_flags |= S_IOMAP_IMMUTABLE;
> >
> > ...and then you can just return out here.
> 
> Do we not need to hold XFS_ILOCK_EXCL over ->i_flags changes, or is
> XFS_IOLOCK_EXCL enough?

Oh, heh, I missed a piece.  Set the flag before the transaction commit,
because if the commit fails the fs is shut down anyway. :)

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 5/5] xfs: toggle XFS_DIFLAG2_IOMAP_IMMUTABLE in response to fallocate

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 1:14 PM, Darrick J. Wong  wrote:
> On Thu, Aug 03, 2017 at 07:28:35PM -0700, Dan Williams wrote:
>> After validating the state of the file as not having holes, shared
>> extents, or active mappings try to commit the
>> XFS_DIFLAG2_IOMAP_IMMUTABLE flag to the on-disk inode metadata. If that
>> succeeds then allow the S_IOMAP_IMMUTABLE to be set on the vfs inode.
>>
>> Cc: Jan Kara 
>> Cc: Jeff Moyer 
>> Cc: Christoph Hellwig 
>> Cc: Ross Zwisler 
>> Suggested-by: Dave Chinner 
>> Suggested-by: "Darrick J. Wong" 
>> Signed-off-by: Dan Williams 
>> ---
>>  fs/xfs/xfs_bmap_util.c |   32 
>>  1 file changed, 32 insertions(+)
>>
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index 70ac2d33ab27..8464c25a2403 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -1436,9 +1436,11 @@ xfs_seal_file_space(
>>   xfs_off_t   offset,
>>   xfs_off_t   len)
>>  {
>> + struct xfs_mount*mp = ip->i_mount;
>>   struct inode*inode = VFS_I(ip);
>>   struct address_space*mapping = inode->i_mapping;
>>   int error;
>> + struct xfs_trans*tp;
>>
>>   ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
>>
>> @@ -1454,6 +1456,10 @@ xfs_seal_file_space(
>>   if (error)
>>   return error;
>>
>> + error = xfs_trans_alloc(mp, _RES(mp)->tr_ichange, 0, 0, 0, );
>> + if (error)
>> + return error;
>> +
>>   xfs_ilock(ip, XFS_ILOCK_EXCL);
>>   /*
>>* Either the size changed after we performed allocation /
>> @@ -1486,10 +1492,20 @@ xfs_seal_file_space(
>>   if (error < 0)
>>   goto out_unlock;
>>
>> + xfs_trans_ijoin(tp, ip, 0);
>
> FWIW if you change that third parameter to XFS_ILOCK_EXCL then
> xfs_trans_commit will do the xfs_iunlock(ip, XFS_ILOCK_EXCL) for you if
> the commit succeeds...
>
>> + ip->i_d.di_flags2 |= XFS_DIFLAG2_IOMAP_IMMUTABLE;
>> + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
>> + error = xfs_trans_commit(tp);
>> + tp = NULL; /* nothing to cancel */
>> + if (error)
>> + goto out_unlock;
>> +
>>   inode->i_flags |= S_IOMAP_IMMUTABLE;
>
> ...and then you can just return out here.

Do we not need to hold XFS_ILOCK_EXCL over ->i_flags changes, or is
XFS_IOLOCK_EXCL enough?
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 4/5] xfs: introduce XFS_DIFLAG2_IOMAP_IMMUTABLE

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 1:33 PM, Darrick J. Wong  wrote:
> On Thu, Aug 03, 2017 at 07:28:30PM -0700, Dan Williams wrote:
>> Add an on-disk inode flag to record the state of the S_IOMAP_IMMUTABLE
>> in-memory vfs inode flags. This allows the protections against reflink
>> and hole punch to be automatically restored on a sub-sequent boot when
>> the in-memory inode is established.
>>
>> The FS_XFLAG_IOMAP_IMMUTABLE is introduced to allow xfs_io to read the
>> state of the flag, but toggling the flag requires going through
>> fallocate(FALLOC_FL_[UN]SEAL_BLOCK_MAP). Support for toggling this
>> on-disk state is saved for a later patch.
>>
>> Cc: Jan Kara 
>> Cc: Jeff Moyer 
>> Cc: Christoph Hellwig 
>> Cc: Ross Zwisler 
>> Suggested-by: Dave Chinner 
>> Suggested-by: "Darrick J. Wong" 
>> Signed-off-by: Dan Williams 
>> ---
>>  fs/xfs/libxfs/xfs_format.h |5 -
>>  fs/xfs/xfs_inode.c |2 ++
>>  fs/xfs/xfs_ioctl.c |1 +
>>  fs/xfs/xfs_iops.c  |8 +---
>>  include/uapi/linux/fs.h|1 +
>>  5 files changed, 13 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
>> index d4d9bef20c3a..9e720e55776b 100644
>> --- a/fs/xfs/libxfs/xfs_format.h
>> +++ b/fs/xfs/libxfs/xfs_format.h
>> @@ -1063,12 +1063,15 @@ static inline void xfs_dinode_put_rdev(struct 
>> xfs_dinode *dip, xfs_dev_t rdev)
>>  #define XFS_DIFLAG2_DAX_BIT  0   /* use DAX for this inode */
>>  #define XFS_DIFLAG2_REFLINK_BIT  1   /* file's blocks may be shared 
>> */
>>  #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
>> +#define XFS_DIFLAG2_IOMAP_IMMUTABLE_BIT 3 /* set S_IOMAP_IMMUTABLE for this 
>> inode */
>
> So... the greedy part of my brain that doesn't want to give out flags2
> bits has been wondering, what if we just didn't have an on-disk
> IOMAP_IMMUTABLE bit, and set FS_XFLAG based only on the in-core
> S_IOMAP_IMMUTABLE bit?  If a program wants the immutable iomap
> semantics, they will have to code some variant on the following:
>
> fd = open(...);
> ret = fallocate(fd, FALLOC_FL_SEAL_BLOCK_MAP, 0, len...)
> if (ret) {
> printf("couldn't seal block map");
> close(fd);
> return;
> }
>
> mmap(fd...);
> /* do sensitive io operations here */
> munmap(fd...);
>
> close(fd);
>
> Therefore the cost of not having the on-disk flag is that we'll have to
> do more unshare/alloc/test/set cycles than we would if we could remember
> the iomap-immutable state across unmounts and inode reclaiming.
> However, if the data map is already ready to go, this shouldn't have a
> lot of overhead since we only have to iterate the in-core extents.
>
> Just trying to make sure we /need/ the inode flag bit. :)

A fair point.

The use case I imagine is a privileged (CAP_IMMUTABLE) process setting
up the data file space at server provisioning time, and then
unprivileged processes using the immutable semantic thereafter.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 4/5] xfs: introduce XFS_DIFLAG2_IOMAP_IMMUTABLE

2017-08-04 Thread Darrick J. Wong
On Thu, Aug 03, 2017 at 07:28:30PM -0700, Dan Williams wrote:
> Add an on-disk inode flag to record the state of the S_IOMAP_IMMUTABLE
> in-memory vfs inode flags. This allows the protections against reflink
> and hole punch to be automatically restored on a subsequent boot when
> the in-memory inode is established.
> 
> The FS_XFLAG_IOMAP_IMMUTABLE is introduced to allow xfs_io to read the
> state of the flag, but toggling the flag requires going through
> fallocate(FALLOC_FL_[UN]SEAL_BLOCK_MAP). Support for toggling this
> on-disk state is saved for a later patch.
> 
> Cc: Jan Kara 
> Cc: Jeff Moyer 
> Cc: Christoph Hellwig 
> Cc: Ross Zwisler 
> Suggested-by: Dave Chinner 
> Suggested-by: "Darrick J. Wong" 
> Signed-off-by: Dan Williams 
> ---
>  fs/xfs/libxfs/xfs_format.h |5 -
>  fs/xfs/xfs_inode.c |2 ++
>  fs/xfs/xfs_ioctl.c |1 +
>  fs/xfs/xfs_iops.c  |8 +---
>  include/uapi/linux/fs.h|1 +
>  5 files changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index d4d9bef20c3a..9e720e55776b 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -1063,12 +1063,15 @@ static inline void xfs_dinode_put_rdev(struct 
> xfs_dinode *dip, xfs_dev_t rdev)
>  #define XFS_DIFLAG2_DAX_BIT  0   /* use DAX for this inode */
>  #define XFS_DIFLAG2_REFLINK_BIT  1   /* file's blocks may be shared 
> */
>  #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
> +#define XFS_DIFLAG2_IOMAP_IMMUTABLE_BIT 3 /* set S_IOMAP_IMMUTABLE for this 
> inode */

So... the greedy part of my brain that doesn't want to give out flags2
bits has been wondering, what if we just didn't have an on-disk
IOMAP_IMMUTABLE bit, and set FS_XFLAG based only on the in-core
S_IOMAP_IMMUTABLE bit?  If a program wants the immutable iomap
semantics, they will have to code some variant on the following:

fd = open(...);
ret = fallocate(fd, FALLOC_FL_SEAL_BLOCK_MAP, 0, len...)
if (ret) {
printf("couldn't seal block map");
close(fd);
return;
}

mmap(fd...);
/* do sensitive io operations here */
munmap(fd...);

close(fd);

Therefore the cost of not having the on-disk flag is that we'll have to
do more unshare/alloc/test/set cycles than we would if we could remember
the iomap-immutable state across unmounts and inode reclaiming.
However, if the data map is already ready to go, this shouldn't have a
lot of overhead since we only have to iterate the in-core extents.

Just trying to make sure we /need/ the inode flag bit. :)

--D

>  #define XFS_DIFLAG2_DAX  (1 << XFS_DIFLAG2_DAX_BIT)
>  #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
>  #define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
> +#define XFS_DIFLAG2_IOMAP_IMMUTABLE (1 << XFS_DIFLAG2_IOMAP_IMMUTABLE_BIT)
>  
>  #define XFS_DIFLAG2_ANY \
> - (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
> + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
> +  XFS_DIFLAG2_IOMAP_IMMUTABLE)
>  
>  /*
>   * Inode number format:
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index ceef77c0416a..4ca22e272ce6 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -674,6 +674,8 @@ _xfs_dic2xflags(
>   flags |= FS_XFLAG_DAX;
>   if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
>   flags |= FS_XFLAG_COWEXTSIZE;
> + if (di_flags2 & XFS_DIFLAG2_IOMAP_IMMUTABLE)
> + flags |= FS_XFLAG_IOMAP_IMMUTABLE;
>   }
>  
>   if (has_attr)
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 2e64488bc4de..df2eef0f9d45 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -978,6 +978,7 @@ xfs_set_diflags(
>   return;
>  
>   di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
> + di_flags2 |= (ip->i_d.di_flags2 & XFS_DIFLAG2_IOMAP_IMMUTABLE);
>   if (xflags & FS_XFLAG_DAX)
>   di_flags2 |= XFS_DIFLAG2_DAX;
>   if (xflags & FS_XFLAG_COWEXTSIZE)
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 469c9fa4c178..174ef95453f5 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -1186,9 +1186,10 @@ xfs_diflags_to_iflags(
>   struct xfs_inode*ip)
>  {
>   uint16_tflags = ip->i_d.di_flags;
> + uint64_tflags2 = ip->i_d.di_flags2;
>  
>   inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
> - S_NOATIME | S_DAX);
> + S_NOATIME | S_DAX | S_IOMAP_IMMUTABLE);
>  
>   if (flags & XFS_DIFLAG_IMMUTABLE)
>   inode->i_flags |= S_IMMUTABLE;
> @@ -1201,9 +1202,10 @@ xfs_diflags_to_iflags(
>   if (S_ISREG(inode->i_mode) 

Re: [PATCH v2 1/5] fs, xfs: introduce S_IOMAP_IMMUTABLE

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 1:00 PM, Darrick J. Wong  wrote:
> On Thu, Aug 03, 2017 at 07:28:10PM -0700, Dan Williams wrote:
>> An inode with this flag set indicates that the file's block map cannot
>> be changed from the currently allocated set.
>>
>> The implementation of toggling the flag and sealing the state of the
>> extent map is saved for a later patch. The functionality provided by
>> S_IOMAP_IMMUTABLE, once toggle support is added, will be a superset of
>> that provided by S_SWAPFILE, and it is targeted to replace it.
>>
>> For now, only xfs and the core vfs are updated to consider the new flag.
>>
>> The additional checks that are added for this flag, beyond what we are
>> already doing for swapfiles, are:
>> * fail writes that try to extend the file size
>> * fail attempts to directly change the allocation map via fallocate or
>>   xfs ioctls. This can be done centrally by blocking
>>   xfs_alloc_file_space and xfs_free_file_space when the flag is set.
>>
>> Cc: Jan Kara 
>> Cc: Jeff Moyer 
>> Cc: Christoph Hellwig 
>> Cc: Ross Zwisler 
>> Cc: Alexander Viro 
>> Suggested-by: Dave Chinner 
>> Suggested-by: "Darrick J. Wong" 
>> Signed-off-by: Dan Williams 
>> ---
>>  fs/attr.c  |   10 ++
>>  fs/open.c  |6 ++
>>  fs/read_write.c|3 +++
>>  fs/xfs/xfs_bmap_util.c |6 ++
>>  fs/xfs/xfs_ioctl.c |6 ++
>>  include/linux/fs.h |2 ++
>>  mm/filemap.c   |5 +
>>  7 files changed, 38 insertions(+)
>>
>> diff --git a/fs/attr.c b/fs/attr.c
>> index 135304146120..8573e364bd06 100644
>> --- a/fs/attr.c
>> +++ b/fs/attr.c
>> @@ -112,6 +112,16 @@ EXPORT_SYMBOL(setattr_prepare);
>>   */
>>  int inode_newsize_ok(const struct inode *inode, loff_t offset)
>>  {
>> + if (IS_IOMAP_IMMUTABLE(inode)) {
>> + /*
>> +  * Any size change is disallowed. Size increases may
>> +  * dirty metadata that an application is not prepared to
>> +  * sync, and a size decrease may expose free blocks to
>> +  * in-flight DMA.
>> +  */
>> + return -ETXTBSY;
>> + }
>> +
>>   if (inode->i_size < offset) {
>>   unsigned long limit;
>>
>> diff --git a/fs/open.c b/fs/open.c
>> index 35bb784763a4..7395860d7164 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -292,6 +292,12 @@ int vfs_fallocate(struct file *file, int mode, loff_t 
>> offset, loff_t len)
>>   return -ETXTBSY;
>>
>>   /*
>> +  * We cannot allow any allocation changes on an iomap immutable file
>> +  */
>> + if (IS_IOMAP_IMMUTABLE(inode))
>> + return -ETXTBSY;
>> +
>> + /*
>>* Revalidate the write permissions, in case security policy has
>>* changed since the files were opened.
>>*/
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 0cc7033aa413..dc673be7c7cb 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1706,6 +1706,9 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
>> loff_t pos_in,
>>   if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
>>   return -ETXTBSY;
>>
>> + if (IS_IOMAP_IMMUTABLE(inode_in) || IS_IOMAP_IMMUTABLE(inode_out))
>> + return -ETXTBSY;
>> +
>>   /* Don't reflink dirs, pipes, sockets... */
>>   if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>>   return -EISDIR;
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index 93e955262d07..fe0f8f7f4bb7 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -1044,6 +1044,9 @@ xfs_alloc_file_space(
>>   if (XFS_FORCED_SHUTDOWN(mp))
>>   return -EIO;
>>
>> + if (IS_IOMAP_IMMUTABLE(VFS_I(ip)))
>> + return -ETXTBSY;
>> +
>
> Hm.  The 'seal this up' caller in the next patch doesn't check for
> ETXTBSY (or if it does I missed that), so if you try to seal an already
> sealed file you'll get an error code even though you actually got the
> state you wanted.

That's a good point, I'll fix that up.

>
> Second question: How might we handle the situation where a filesystem
> /has/ to alter a block mapping?  Hypothetically, if the block layer
> tells the fs that some range of storage has gone bad and the fs decides
> to punch out that part of the file (or mark it unwritten or whatever) to
> avoid a machine check, can we lock out file IO, forcibly remove the
> mapping from memory, make whatever block map updates we want, and then
> unlock?

It's not clear that the filesystem /has/ to change the block mappings
when the backing media supports error clearing. Unlike bad DRAM ranges
where the address is permanently mapped out, we can clear pmem and
disk errors by writing new data. The bad block can be repaired 

Re: [PATCH v2 5/5] xfs: toggle XFS_DIFLAG2_IOMAP_IMMUTABLE in response to fallocate

2017-08-04 Thread Darrick J. Wong
On Thu, Aug 03, 2017 at 07:28:35PM -0700, Dan Williams wrote:
> After validating the state of the file as not having holes, shared
> extents, or active mappings try to commit the
> XFS_DIFLAG2_IOMAP_IMMUTABLE flag to the on-disk inode metadata. If that
> succeeds then allow the S_IOMAP_IMMUTABLE to be set on the vfs inode.
> 
> Cc: Jan Kara 
> Cc: Jeff Moyer 
> Cc: Christoph Hellwig 
> Cc: Ross Zwisler 
> Suggested-by: Dave Chinner 
> Suggested-by: "Darrick J. Wong" 
> Signed-off-by: Dan Williams 
> ---
>  fs/xfs/xfs_bmap_util.c |   32 
>  1 file changed, 32 insertions(+)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 70ac2d33ab27..8464c25a2403 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1436,9 +1436,11 @@ xfs_seal_file_space(
>   xfs_off_t   offset,
>   xfs_off_t   len)
>  {
> + struct xfs_mount*mp = ip->i_mount;
>   struct inode*inode = VFS_I(ip);
>   struct address_space*mapping = inode->i_mapping;
>   int error;
> + struct xfs_trans*tp;
>
>   ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
>  
> @@ -1454,6 +1456,10 @@ xfs_seal_file_space(
>   if (error)
>   return error;
>  
> + error = xfs_trans_alloc(mp, _RES(mp)->tr_ichange, 0, 0, 0, );
> + if (error)
> + return error;
> +
>   xfs_ilock(ip, XFS_ILOCK_EXCL);
>   /*
>* Either the size changed after we performed allocation /
> @@ -1486,10 +1492,20 @@ xfs_seal_file_space(
>   if (error < 0)
>   goto out_unlock;
>  
> + xfs_trans_ijoin(tp, ip, 0);

FWIW if you change that third parameter to XFS_ILOCK_EXCL then
xfs_trans_commit will do the xfs_iunlock(ip, XFS_ILOCK_EXCL) for you if
the commit succeeds...

> + ip->i_d.di_flags2 |= XFS_DIFLAG2_IOMAP_IMMUTABLE;
> + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> + error = xfs_trans_commit(tp);
> + tp = NULL; /* nothing to cancel */
> + if (error)
> + goto out_unlock;
> +
>   inode->i_flags |= S_IOMAP_IMMUTABLE;

...and then you can just return out here.

--D

>  out_unlock:
>   xfs_iunlock(ip, XFS_ILOCK_EXCL);
> + if (tp)
> + xfs_trans_cancel(tp);
>  
>   return error;
>  }
> @@ -1500,15 +1516,21 @@ xfs_unseal_file_space(
>   xfs_off_t   offset,
>   xfs_off_t   len)
>  {
> + struct xfs_mount*mp = ip->i_mount;
>   struct inode*inode = VFS_I(ip);
>   struct address_space*mapping = inode->i_mapping;
>   int error;
> + struct xfs_trans*tp;
>  
>   ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
>  
>   if (offset)
>   return -EINVAL;
>  
> + error = xfs_trans_alloc(mp, _RES(mp)->tr_ichange, 0, 0, 0, );
> + if (error)
> + return error;
> +
>   xfs_ilock(ip, XFS_ILOCK_EXCL);
>   /*
>* It does not make sense to unseal less than the full range of
> @@ -1527,11 +1549,21 @@ xfs_unseal_file_space(
>   if (mapping_mapped(mapping))
>   goto out_unlock;
>  
> + xfs_trans_ijoin(tp, ip, 0);
> + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_IOMAP_IMMUTABLE;
> + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> + error = xfs_trans_commit(tp);
> + tp = NULL; /* nothing to cancel */
> + if (error)
> + goto out_unlock;
> +
>   inode->i_flags &= ~S_IOMAP_IMMUTABLE;
>   error = 0;
>  
>  out_unlock:
>   xfs_iunlock(ip, XFS_ILOCK_EXCL);
> + if (tp)
> + xfs_trans_cancel(tp);
>  
>   return error;
>  }
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 3/5] fs, xfs: introduce FALLOC_FL_UNSEAL_BLOCK_MAP

2017-08-04 Thread Darrick J. Wong
On Thu, Aug 03, 2017 at 07:28:23PM -0700, Dan Williams wrote:
> Provide an explicit fallocate operation type for clearing the
> S_IOMAP_IMMUTABLE flag. Like the enable case it requires CAP_IMMUTABLE
> and it can only be performed while no process has the file mapped.
> 
> Cc: Jan Kara 
> Cc: Jeff Moyer 
> Cc: Christoph Hellwig 
> Cc: Ross Zwisler 
> Cc: Alexander Viro 
> Cc: "Darrick J. Wong" 
> Suggested-by: Dave Chinner 
> Signed-off-by: Dan Williams 
> ---
>  fs/open.c   |   17 +++--
>  fs/xfs/xfs_bmap_util.c  |   42 ++
>  fs/xfs/xfs_bmap_util.h  |3 +++
>  fs/xfs/xfs_file.c   |4 +++-
>  include/linux/falloc.h  |3 ++-
>  include/uapi/linux/falloc.h |1 +
>  6 files changed, 62 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/open.c b/fs/open.c
> index e3aae59785ae..ccfd8d3becc8 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -274,13 +274,17 @@ int vfs_fallocate(struct file *file, int mode, loff_t 
> offset, loff_t len)
>   return -EINVAL;
>  
>   /*
> -  * Seal block map operation should only be used exclusively, and
> -  * with the IMMUTABLE capability.
> +  * Seal/unseal block map operations should only be used
> +  * exclusively, and with the IMMUTABLE capability.
>*/
> - if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
> + if (mode & (FALLOC_FL_SEAL_BLOCK_MAP | FALLOC_FL_UNSEAL_BLOCK_MAP)) {
>   if (!capable(CAP_LINUX_IMMUTABLE))
>   return -EPERM;
> - if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
> + if (mode == (FALLOC_FL_SEAL_BLOCK_MAP
> + | FALLOC_FL_UNSEAL_BLOCK_MAP))
> + return -EINVAL;
> + if (mode & ~(FALLOC_FL_SEAL_BLOCK_MAP
> + | FALLOC_FL_UNSEAL_BLOCK_MAP))
>   return -EINVAL;
>   }
>  
> @@ -303,9 +307,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t 
> offset, loff_t len)
>   return -ETXTBSY;
>  
>   /*
> -  * We cannot allow any allocation changes on an iomap immutable file
> +  * We cannot allow any allocation changes on an iomap immutable
> +  * file, but we can allow clearing the immutable state.
>*/
> - if (IS_IOMAP_IMMUTABLE(inode))
> + if (IS_IOMAP_IMMUTABLE(inode) && !(mode & FALLOC_FL_UNSEAL_BLOCK_MAP))
>   return -ETXTBSY;
>  
>   /*
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 46d8eb9e19fc..70ac2d33ab27 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1494,6 +1494,48 @@ xfs_seal_file_space(
>   return error;
>  }
>  
> +int
> +xfs_unseal_file_space(
> + struct xfs_inode*ip,
> + xfs_off_t   offset,
> + xfs_off_t   len)
> +{
> + struct inode*inode = VFS_I(ip);
> + struct address_space*mapping = inode->i_mapping;
> + int error;
> +
> + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));

Same assert-on-the-iolock comment as the previous patch.

> +
> + if (offset)
> + return -EINVAL;
> +
> + xfs_ilock(ip, XFS_ILOCK_EXCL);
> + /*
> +  * It does not make sense to unseal less than the full range of
> +  * the file.
> +  */
> + error = -EINVAL;
> + if (len < i_size_read(inode))
> + goto out_unlock;

Hmm, should we be picky and require len == i_size_read() here?

> + /*
> +  * Provide safety against one thread changing the policy of not
> +  * requiring fsync/msync (for block allocations) behind another
> +  * thread's back.
> +  */
> + error = -EBUSY;
> + if (mapping_mapped(mapping))
> + goto out_unlock;
> +
> + inode->i_flags &= ~S_IOMAP_IMMUTABLE;

It occurred to me, should we jump out early from the seal/unseal
operations if the flag state matches whatever the user is asking for?
This is perhaps not necessary for unseal since we don't do a lot of
work.

--D

> + error = 0;
> +
> +out_unlock:
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> + return error;
> +}
> +
>  /*
>   * @next_fsb will keep track of the extent currently undergoing shift.
>   * @stop_fsb will keep track of the extent at which we have to stop.
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index 5115a32a2483..b64653a75942 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -62,6 +62,9 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t 
> offset,
>   xfs_off_t len);
>  int  xfs_seal_file_space(struct xfs_inode *, xfs_off_t offset,
>   xfs_off_t len);
> +int  xfs_unseal_file_space(struct xfs_inode *, xfs_off_t 

Re: [4.9-stable PATCH] device-dax: fix sysfs duplicate warnings

2017-08-04 Thread Greg KH
On Thu, Aug 03, 2017 at 05:25:02PM -0700, Dan Williams wrote:
> commit bbb3be170ac2891526ad07b18af7db226879a8e7 upstream.
> 
> Fix warnings of the form...
> 
>  WARNING: CPU: 10 PID: 4983 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x62/0x80
>  sysfs: cannot create duplicate filename '/class/dax/dax12.0'
>  Call Trace:
>   dump_stack+0x63/0x86
>   __warn+0xcb/0xf0
>   warn_slowpath_fmt+0x5a/0x80
>   ? kernfs_path_from_node+0x4f/0x60
>   sysfs_warn_dup+0x62/0x80
>   sysfs_do_create_link_sd.isra.2+0x97/0xb0
>   sysfs_create_link+0x25/0x40
>   device_add+0x266/0x630
>   devm_create_dax_dev+0x2cf/0x340 [dax]
>   dax_pmem_probe+0x1f5/0x26e [dax_pmem]
>   nvdimm_bus_probe+0x71/0x120
> 
> ...by reusing the namespace id for the device-dax instance name.
> 
> Now that we have decided that there will never by more than one
> device-dax instance per libnvdimm-namespace parent device [1], we can
> directly reuse the namepace ids. There are some possible follow-on
> cleanups, but those are saved for a later patch to simplify the -stable
> backport.
> 
> [1]: https://lists.01.org/pipermail/linux-nvdimm/2016-December/008266.html
> 
> Fixes: 98a29c39dc68 ("libnvdimm, namespace: allow creation of multiple 
> pmem...")
> Cc: Jeff Moyer 
> Cc: 
> Reported-by: Dariusz Dokupil 
> Signed-off-by: Dan Williams 
> ---
>  drivers/dax/dax.c  |   24 
>  drivers/dax/dax.h  |2 +-
>  drivers/dax/pmem.c |   12 +++-
>  3 files changed, 24 insertions(+), 14 deletions(-)

Thanks for the backport, now applied.

greg k-h
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v2 1/5] fs, xfs: introduce S_IOMAP_IMMUTABLE

2017-08-04 Thread Darrick J. Wong
On Thu, Aug 03, 2017 at 07:28:10PM -0700, Dan Williams wrote:
> An inode with this flag set indicates that the file's block map cannot
> be changed from the currently allocated set.
> 
> The implementation of toggling the flag and sealing the state of the
> extent map is saved for a later patch. The functionality provided by
> S_IOMAP_IMMUTABLE, once toggle support is added, will be a superset of
> that provided by S_SWAPFILE, and it is targeted to replace it.
> 
> For now, only xfs and the core vfs are updated to consider the new flag.
> 
> The additional checks that are added for this flag, beyond what we are
> already doing for swapfiles, are:
> * fail writes that try to extend the file size
> * fail attempts to directly change the allocation map via fallocate or
>   xfs ioctls. This can be done centrally by blocking
>   xfs_alloc_file_space and xfs_free_file_space when the flag is set.
> 
> Cc: Jan Kara 
> Cc: Jeff Moyer 
> Cc: Christoph Hellwig 
> Cc: Ross Zwisler 
> Cc: Alexander Viro 
> Suggested-by: Dave Chinner 
> Suggested-by: "Darrick J. Wong" 
> Signed-off-by: Dan Williams 
> ---
>  fs/attr.c  |   10 ++
>  fs/open.c  |6 ++
>  fs/read_write.c|3 +++
>  fs/xfs/xfs_bmap_util.c |6 ++
>  fs/xfs/xfs_ioctl.c |6 ++
>  include/linux/fs.h |2 ++
>  mm/filemap.c   |5 +
>  7 files changed, 38 insertions(+)
> 
> diff --git a/fs/attr.c b/fs/attr.c
> index 135304146120..8573e364bd06 100644
> --- a/fs/attr.c
> +++ b/fs/attr.c
> @@ -112,6 +112,16 @@ EXPORT_SYMBOL(setattr_prepare);
>   */
>  int inode_newsize_ok(const struct inode *inode, loff_t offset)
>  {
> + if (IS_IOMAP_IMMUTABLE(inode)) {
> + /*
> +  * Any size change is disallowed. Size increases may
> +  * dirty metadata that an application is not prepared to
> +  * sync, and a size decrease may expose free blocks to
> +  * in-flight DMA.
> +  */
> + return -ETXTBSY;
> + }
> +
>   if (inode->i_size < offset) {
>   unsigned long limit;
>  
> diff --git a/fs/open.c b/fs/open.c
> index 35bb784763a4..7395860d7164 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -292,6 +292,12 @@ int vfs_fallocate(struct file *file, int mode, loff_t 
> offset, loff_t len)
>   return -ETXTBSY;
>  
>   /*
> +  * We cannot allow any allocation changes on an iomap immutable file
> +  */
> + if (IS_IOMAP_IMMUTABLE(inode))
> + return -ETXTBSY;
> +
> + /*
>* Revalidate the write permissions, in case security policy has
>* changed since the files were opened.
>*/
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 0cc7033aa413..dc673be7c7cb 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1706,6 +1706,9 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
> loff_t pos_in,
>   if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
>   return -ETXTBSY;
>  
> + if (IS_IOMAP_IMMUTABLE(inode_in) || IS_IOMAP_IMMUTABLE(inode_out))
> + return -ETXTBSY;
> +
>   /* Don't reflink dirs, pipes, sockets... */
>   if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>   return -EISDIR;
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 93e955262d07..fe0f8f7f4bb7 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1044,6 +1044,9 @@ xfs_alloc_file_space(
>   if (XFS_FORCED_SHUTDOWN(mp))
>   return -EIO;
>  
> + if (IS_IOMAP_IMMUTABLE(VFS_I(ip)))
> + return -ETXTBSY;
> +

Hm.  The 'seal this up' caller in the next patch doesn't check for
ETXTBSY (or if it does I missed that), so if you try to seal an already
sealed file you'll get an error code even though you actually got the
state you wanted.

Second question: How might we handle the situation where a filesystem
/has/ to alter a block mapping?  Hypothetically, if the block layer
tells the fs that some range of storage has gone bad and the fs decides
to punch out that part of the file (or mark it unwritten or whatever) to
avoid a machine check, can we lock out file IO, forcibly remove the
mapping from memory, make whatever block map updates we want, and then
unlock?

(Conceptually, the bmbt rebuilder in the online fsck patchset operates
in a similar manner...)

--D

>   error = xfs_qm_dqattach(ip, 0);
>   if (error)
>   return error;
> @@ -1294,6 +1297,9 @@ xfs_free_file_space(
>  
>   trace_xfs_free_file_space(ip);
>  
> + if (IS_IOMAP_IMMUTABLE(VFS_I(ip)))
> + return -ETXTBSY;
> +
>   error = xfs_qm_dqattach(ip, 0);
>   if (error)
>   return error;
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c

Re: [PATCH v2 2/5] fs, xfs: introduce FALLOC_FL_SEAL_BLOCK_MAP

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 12:46 PM, Darrick J. Wong
 wrote:
> On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
>> >From falloc.h:
>>
>> FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
>> file logical-to-physical extent offset mappings in the file. The
>> purpose is to allow an application to assume that there are no holes
>> or shared extents in the file and that the metadata needed to find
>> all the physical extents of the file is stable and can never be
>> dirtied.
>>
>> For now this patch only permits setting the in-memory state of
>> S_IOMAP_IMMUTABLE. Support for clearing and persisting the state is
>> saved for later patches.
>>
>> The implementation is careful to not allow the immutable state to change
>> while any process might have any established mappings. It reuses the
>> existing xfs_reflink_unshare() and xfs_alloc_file_space() to unshare
>> extents and fill all holes in the file. It then holds XFS_ILOCK_EXCL
>> while it validates the file is in the proper state and sets
>> S_IOMAP_IMMUTABLE.
>>
>> Cc: Jan Kara 
>> Cc: Jeff Moyer 
>> Cc: Christoph Hellwig 
>> Cc: Ross Zwisler 
>> Cc: Alexander Viro 
>> Suggested-by: Dave Chinner 
>> Suggested-by: "Darrick J. Wong" 
>> Signed-off-by: Dan Williams 
>> ---
>>  fs/open.c   |   11 +
>>  fs/xfs/xfs_bmap_util.c  |  101 
>> +++
>>  fs/xfs/xfs_bmap_util.h  |2 +
>>  fs/xfs/xfs_file.c   |   14 --
>>  include/linux/falloc.h  |3 +
>>  include/uapi/linux/falloc.h |   19 
>>  6 files changed, 145 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/open.c b/fs/open.c
>> index 7395860d7164..e3aae59785ae 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -273,6 +273,17 @@ int vfs_fallocate(struct file *file, int mode, loff_t 
>> offset, loff_t len)
>>   (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
>>   return -EINVAL;
>>
>> + /*
>> +  * Seal block map operation should only be used exclusively, and
>> +  * with the IMMUTABLE capability.
>> +  */
>> + if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
>> + if (!capable(CAP_LINUX_IMMUTABLE))
>> + return -EPERM;
>> + if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
>> + return -EINVAL;
>> + }
>> +
>>   if (!(file->f_mode & FMODE_WRITE))
>>   return -EBADF;
>>
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index fe0f8f7f4bb7..46d8eb9e19fc 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
>>
>>  }
>>
>> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
>> +STATIC int
>> +xfs_file_has_holes(
>> + struct xfs_inode*ip)
>> +{
>> + struct xfs_mount*mp = ip->i_mount;
>> + struct xfs_bmbt_irec*map;
>> + const int   map_size = 10;  /* constrain memory overhead */
>> + int i, nmaps;
>> + int error = 0;
>> + xfs_fileoff_t   lblkno = 0;
>> + xfs_filblks_t   maxlblkcnt;
>> +
>> + map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
>
> Sleeping with an inode fully locked and (eventually) a running
> transaction?  Yikes.
>
> Just allocate one xfs_bmbt_irec on the stack and pass in nmaps=1.
>
> This method might fit better in libxfs/xfs_bmap.c where it'll be
> able to scan the extent list more quickly with the iext helpers.

Ok, I'll take a look.

>
>> +
>> + maxlblkcnt = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
>> + do {
>> + nmaps = map_size;
>> + error = xfs_bmapi_read(ip, lblkno, maxlblkcnt - lblkno,
>> +map, , 0);
>> + if (error)
>> + break;
>> +
>> + ASSERT(nmaps <= map_size);
>> + for (i = 0; i < nmaps; i++) {
>> + lblkno += map[i].br_blockcount;
>> + if (map[i].br_startblock == HOLESTARTBLOCK) {
>
> I think we also need to check for unwritten extents here, because a
>> write to an unwritten block requires some zeroing and a mapping metadata
> update.

Will do.

>
>> + error = 1;
>> + break;
>> + }
>> + }
>> + } while (nmaps > 0 && error == 0);
>> +
>> + kmem_free(map);
>> + return error;
>> +}
>> +
>> +int
>> +xfs_seal_file_space(
>> + struct xfs_inode*ip,
>> + xfs_off_t   offset,
>> + xfs_off_t   len)
>> +{
>> + struct inode*inode = VFS_I(ip);
>> + struct address_space*mapping = inode->i_mapping;
>> + 

Re: [PATCH v2 2/5] fs, xfs: introduce FALLOC_FL_SEAL_BLOCK_MAP

2017-08-04 Thread Darrick J. Wong
On Thu, Aug 03, 2017 at 07:28:17PM -0700, Dan Williams wrote:
> >From falloc.h:
> 
> FALLOC_FL_SEAL_BLOCK_MAP is used to seal (make immutable) all of the
> file logical-to-physical extent offset mappings in the file. The
> purpose is to allow an application to assume that there are no holes
> or shared extents in the file and that the metadata needed to find
> all the physical extents of the file is stable and can never be
> dirtied.
> 
> For now this patch only permits setting the in-memory state of
> S_IOMAP_IMMUTABLE. Support for clearing and persisting the state is
> saved for later patches.
> 
> The implementation is careful to not allow the immutable state to change
> while any process might have any established mappings. It reuses the
> existing xfs_reflink_unshare() and xfs_alloc_file_space() to unshare
> extents and fill all holes in the file. It then holds XFS_ILOCK_EXCL
> while it validates the file is in the proper state and sets
> S_IOMAP_IMMUTABLE.
> 
> Cc: Jan Kara 
> Cc: Jeff Moyer 
> Cc: Christoph Hellwig 
> Cc: Ross Zwisler 
> Cc: Alexander Viro 
> Suggested-by: Dave Chinner 
> Suggested-by: "Darrick J. Wong" 
> Signed-off-by: Dan Williams 
> ---
>  fs/open.c   |   11 +
>  fs/xfs/xfs_bmap_util.c  |  101 
> +++
>  fs/xfs/xfs_bmap_util.h  |2 +
>  fs/xfs/xfs_file.c   |   14 --
>  include/linux/falloc.h  |3 +
>  include/uapi/linux/falloc.h |   19 
>  6 files changed, 145 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/open.c b/fs/open.c
> index 7395860d7164..e3aae59785ae 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -273,6 +273,17 @@ int vfs_fallocate(struct file *file, int mode, loff_t 
> offset, loff_t len)
>   (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
>   return -EINVAL;
>  
> + /*
> +  * Seal block map operation should only be used exclusively, and
> +  * with the IMMUTABLE capability.
> +  */
> + if (mode & FALLOC_FL_SEAL_BLOCK_MAP) {
> + if (!capable(CAP_LINUX_IMMUTABLE))
> + return -EPERM;
> + if (mode & ~FALLOC_FL_SEAL_BLOCK_MAP)
> + return -EINVAL;
> + }
> +
>   if (!(file->f_mode & FMODE_WRITE))
>   return -EBADF;
>  
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index fe0f8f7f4bb7..46d8eb9e19fc 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1393,6 +1393,107 @@ xfs_zero_file_space(
>  
>  }
>  
> +/* Return 1 if hole detected, 0 if not, and < 0 if fail to determine */
> +STATIC int
> +xfs_file_has_holes(
> + struct xfs_inode*ip)
> +{
> + struct xfs_mount*mp = ip->i_mount;
> + struct xfs_bmbt_irec*map;
> + const int   map_size = 10;  /* constrain memory overhead */
> + int i, nmaps;
> + int error = 0;
> + xfs_fileoff_t   lblkno = 0;
> + xfs_filblks_t   maxlblkcnt;
> +
> + map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);

Sleeping with an inode fully locked and (eventually) a running
transaction?  Yikes.

Just allocate one xfs_bmbt_irec on the stack and pass in nmaps=1.

This method might fit better in libxfs/xfs_bmap.c where it'll be
able to scan the extent list more quickly with the iext helpers.

> +
> + maxlblkcnt = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
> + do {
> + nmaps = map_size;
> + error = xfs_bmapi_read(ip, lblkno, maxlblkcnt - lblkno,
> +map, , 0);
> + if (error)
> + break;
> +
> + ASSERT(nmaps <= map_size);
> + for (i = 0; i < nmaps; i++) {
> + lblkno += map[i].br_blockcount;
> + if (map[i].br_startblock == HOLESTARTBLOCK) {

I think we also need to check for unwritten extents here, because a
> write to an unwritten block requires some zeroing and a mapping metadata
update.

> + error = 1;
> + break;
> + }
> + }
> + } while (nmaps > 0 && error == 0);
> +
> + kmem_free(map);
> + return error;
> +}
> +
> +int
> +xfs_seal_file_space(
> + struct xfs_inode*ip,
> + xfs_off_t   offset,
> + xfs_off_t   len)
> +{
> + struct inode*inode = VFS_I(ip);
> + struct address_space*mapping = inode->i_mapping;
> + int error;
> +
> + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));

The IOLOCK must be held here too.

> +
> + if (offset)
> + return -EINVAL;
> +
> + error = xfs_reflink_unshare(ip, offset, len);
> + 

Re: [PATCH 5/6] arm64: Implement pmem API support

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 11:35 AM, Robin Murphy  wrote:
> On 04/08/17 19:09, Dan Williams wrote:
>> On Fri, Aug 4, 2017 at 10:43 AM, Robin Murphy  wrote:
>>> On 04/08/17 16:25, Catalin Marinas wrote:
 Two minor comments below.

 On Tue, Jul 25, 2017 at 11:55:42AM +0100, Robin Murphy wrote:
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -960,6 +960,17 @@ config ARM64_UAO
>regular load/store instructions if the cpu does not implement the
>feature.
>
> +config ARM64_PMEM
> +bool "Enable support for persistent memory"
> +select ARCH_HAS_PMEM_API
> +help
> +  Say Y to enable support for the persistent memory API based on the
> +  ARMv8.2 DCPoP feature.
> +
> +  The feature is detected at runtime, and the kernel will use DC CVAC
> +  operations if DC CVAP is not supported (following the behaviour of
> +  DC CVAP itself if the system does not define a point of 
> persistence).

 Any reason not to have this default y?
>>>
>>> Mostly because it's untested, and not actually useful without some way
>>> of describing persistent memory regions to the kernel (I'm currently
>>> trying to make sense of what exactly ARCH_HAS_MMIO_FLUSH is supposed to
>>> mean in order to enable ACPI NFIT support).
>>
>> This is related to block-aperture support described by the NFIT where
>> a sliding-memory-mapped window can be programmed to access different
>> ranges of the NVDIMM. Before the window is programmed to a new
>> DIMM-address we need to flush any dirty data through the current
>> window setting to media. See the call to mmio_flush_range() in
>> acpi_nfit_blk_single_io(). I think it's ok to omit ARCH_HAS_MMIO_FLUSH
>> support, and add a configuration option to compile out the
>> block-aperture support.
>
> Oh, I have every intention of implementing it one way or another if
> necessary - it's not difficult, it's just been a question of working
> through the NFIT code to figure out the subtleties of translation to
> arm64 ;)
>
> If mmio_flush_range() is for true MMIO (i.e. __iomem) mappings, then
> arm64 should only need a barrier, rather than actual cache operations.
> If on the other hand it's misleadingly named and only actually used on
> MEMREMAP_WB mappings (as I'm starting to think it might be), then I can't
> help thinking it could simply go away in favour of arch_wb_pmem(), since
> that now seems to have those same semantics and intent, plus a much more
> appropriate name.
>

The mapping type of block-apertures is up to the architecture, so you
could mark them uncacheable and not worry about mmio_flush_range().
Also, arch_wb_pmem() is not a replacement for mmio_flush_range() since
we also need the cache to be invalidated. arch_wb_pmem() is allowed to
leave clean cache lines present.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 5/6] arm64: Implement pmem API support

2017-08-04 Thread Robin Murphy
On 04/08/17 19:09, Dan Williams wrote:
> On Fri, Aug 4, 2017 at 10:43 AM, Robin Murphy  wrote:
>> On 04/08/17 16:25, Catalin Marinas wrote:
>>> Two minor comments below.
>>>
>>> On Tue, Jul 25, 2017 at 11:55:42AM +0100, Robin Murphy wrote:
 --- a/arch/arm64/Kconfig
 +++ b/arch/arm64/Kconfig
 @@ -960,6 +960,17 @@ config ARM64_UAO
regular load/store instructions if the cpu does not implement the
feature.

 +config ARM64_PMEM
 +bool "Enable support for persistent memory"
 +select ARCH_HAS_PMEM_API
 +help
 +  Say Y to enable support for the persistent memory API based on the
 +  ARMv8.2 DCPoP feature.
 +
 +  The feature is detected at runtime, and the kernel will use DC CVAC
 +  operations if DC CVAP is not supported (following the behaviour of
 +  DC CVAP itself if the system does not define a point of 
 persistence).
>>>
>>> Any reason not to have this default y?
>>
>> Mostly because it's untested, and not actually useful without some way
>> of describing persistent memory regions to the kernel (I'm currently
>> trying to make sense of what exactly ARCH_HAS_MMIO_FLUSH is supposed to
>> mean in order to enable ACPI NFIT support).
> 
> This is related to block-aperture support described by the NFIT where
> a sliding-memory-mapped window can be programmed to access different
> ranges of the NVDIMM. Before the window is programmed to a new
> DIMM-address we need to flush any dirty data through the current
> window setting to media. See the call to mmio_flush_range() in
> acpi_nfit_blk_single_io(). I think it's ok to omit ARCH_HAS_MMIO_FLUSH
> support, and add a configuration option to compile out the
> block-aperture support.

Oh, I have every intention of implementing it one way or another if
necessary - it's not difficult, it's just been a question of working
through the NFIT code to figure out the subtleties of translation to
arm64 ;)

If mmio_flush_range() is for true MMIO (i.e. __iomem) mappings, then
arm64 should only need a barrier, rather than actual cache operations.
If on the other hand it's misleadingly named and only actually used on
MEMREMAP_WB mappings (as I'm starting to think it might be), then I can't
help thinking it could simply go away in favour of arch_wb_pmem(), since
that now seems to have those same semantics and intent, plus a much more
appropriate name.

Robin.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 0/3] remove rw_page() from brd, pmem and btt

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 11:21 AM, Ross Zwisler
 wrote:
> On Fri, Aug 04, 2017 at 11:01:08AM -0700, Dan Williams wrote:
>> [ adding Dave who is working on a blk-mq + dma offload version of the
>> pmem driver ]
>>
>> On Fri, Aug 4, 2017 at 1:17 AM, Minchan Kim  wrote:
>> > On Fri, Aug 04, 2017 at 12:54:41PM +0900, Minchan Kim wrote:
>> [..]
>> >> Thanks for the testing. Your testing number is within noise level?
>> >>
>> >> I cannot understand why PMEM doesn't have enough gain while BTT is 
>> >> significant
>> >> win(8%). I guess no rw_page with BTT testing had more chances to wait bio 
>> >> dynamic
>> >> allocation and mine and rw_page testing reduced it significantly. However,
>> >> in no rw_page with pmem, there wasn't many cases to wait bio allocations 
>> >> due
>> >> to the device is so fast so the number comes from purely the number of
>> >> instructions has done. At a quick glance of bio init/submit, it's not 
>> >> trivial
>> >> so indeed, i understand where the 12% enhancement comes from but I'm not 
>> >> sure
>> it's really big difference in real practice at the cost of maintenance 
>> >> burden.
>> >
>> > I tested pmbench 10 times in my local machine(4 core) with zram-swap.
>> > In my machine, even, on-stack bio is faster than rw_page. Unbelievable.
>> >
>> > I guess it's really hard to get stable result in severe memory pressure.
>> > It would be a result within noise level(see below stddev).
>> > So, I think it's hard to conclude rw_page is far faster than onstack-bio.
>> >
>> > rw_page
>> > avg 5.54us
>> > stddev  8.89%
>> > max 6.02us
>> > min 4.20us
>> >
>> > onstack bio
>> > avg 5.27us
>> > stddev  13.03%
>> > max 5.96us
>> > min 3.55us
>>
>> The maintenance burden of having alternative submission paths is
>> significant especially as we consider the pmem driver using more
>> services of the core block layer. Ideally, I'd want to complete the
>> rw_page removal work before we look at the blk-mq + dma offload
>> reworks.
>>
>> The change to introduce BDI_CAP_SYNC is interesting because we might
>> have use for switching between dma offload and cpu copy based on
>> whether the I/O is synchronous or otherwise hinted to be a low latency
>> request. Right now the dma offload patches are using "bio_segments() >
>> 1" as the gate for selecting offload vs cpu copy which seem
>> inadequate.
>
> Okay, so based on the feedback above and from Jens[1], it sounds like we want
> to go forward with removing the rw_page() interface, and instead optimize the
> regular I/O path via on-stack BIOs and dma offload, correct?
>
> If so, I'll prepare patches that fully remove the rw_page() code, and let
> Minchan and Dave work on their optimizations.

I think the conversion to on-stack-bio should be done in the same
patchset that removes rw_page. We don't want to leave a known
performance regression while the on-stack-bio work is in-flight.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 0/3] remove rw_page() from brd, pmem and btt

2017-08-04 Thread Ross Zwisler
On Fri, Aug 04, 2017 at 11:01:08AM -0700, Dan Williams wrote:
> [ adding Dave who is working on a blk-mq + dma offload version of the
> pmem driver ]
> 
> On Fri, Aug 4, 2017 at 1:17 AM, Minchan Kim  wrote:
> > On Fri, Aug 04, 2017 at 12:54:41PM +0900, Minchan Kim wrote:
> [..]
> >> Thanks for the testing. Your testing number is within noise level?
> >>
> >> I cannot understand why PMEM doesn't have enough gain while BTT is 
> >> significant
> >> win(8%). I guess no rw_page with BTT testing had more chances to wait bio 
> >> dynamic
> >> allocation and mine and rw_page testing reduced it significantly. However,
> >> in no rw_page with pmem, there wasn't many cases to wait bio allocations 
> >> due
> >> to the device is so fast so the number comes from purely the number of
> >> instructions has done. At a quick glance of bio init/submit, it's not 
> >> trivial
> >> so indeed, i understand where the 12% enhancement comes from but I'm not 
> >> sure
> >> it's really big difference in real practice at the cost of maintenance 
> >> burden.
> >
> > I tested pmbench 10 times in my local machine(4 core) with zram-swap.
> > In my machine, even, on-stack bio is faster than rw_page. Unbelievable.
> >
> > I guess it's really hard to get stable result in severe memory pressure.
> > It would be a result within noise level(see below stddev).
> > So, I think it's hard to conclude rw_page is far faster than onstack-bio.
> >
> > rw_page
> > avg 5.54us
> > stddev  8.89%
> > max 6.02us
> > min 4.20us
> >
> > onstack bio
> > avg 5.27us
> > stddev  13.03%
> > max 5.96us
> > min 3.55us
> 
> The maintenance burden of having alternative submission paths is
> significant especially as we consider the pmem driver using more
> services of the core block layer. Ideally, I'd want to complete the
> rw_page removal work before we look at the blk-mq + dma offload
> reworks.
> 
> The change to introduce BDI_CAP_SYNC is interesting because we might
> have use for switching between dma offload and cpu copy based on
> whether the I/O is synchronous or otherwise hinted to be a low latency
> request. Right now the dma offload patches are using "bio_segments() >
> 1" as the gate for selecting offload vs cpu copy which seem
> inadequate.

Okay, so based on the feedback above and from Jens[1], it sounds like we want
to go forward with removing the rw_page() interface, and instead optimize the
regular I/O path via on-stack BIOs and dma offload, correct?

If so, I'll prepare patches that fully remove the rw_page() code, and let
Minchan and Dave work on their optimizations.

[1]: https://lkml.org/lkml/2017/8/3/803
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 5/6] arm64: Implement pmem API support

2017-08-04 Thread Dan Williams
On Fri, Aug 4, 2017 at 10:43 AM, Robin Murphy  wrote:
> On 04/08/17 16:25, Catalin Marinas wrote:
>> Two minor comments below.
>>
>> On Tue, Jul 25, 2017 at 11:55:42AM +0100, Robin Murphy wrote:
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -960,6 +960,17 @@ config ARM64_UAO
>>>regular load/store instructions if the cpu does not implement the
>>>feature.
>>>
>>> +config ARM64_PMEM
>>> +bool "Enable support for persistent memory"
>>> +select ARCH_HAS_PMEM_API
>>> +help
>>> +  Say Y to enable support for the persistent memory API based on the
>>> +  ARMv8.2 DCPoP feature.
>>> +
>>> +  The feature is detected at runtime, and the kernel will use DC CVAC
>>> +  operations if DC CVAP is not supported (following the behaviour of
>>> +  DC CVAP itself if the system does not define a point of persistence).
>>
>> Any reason not to have this default y?
>
> Mostly because it's untested, and not actually useful without some way
> of describing persistent memory regions to the kernel (I'm currently
> trying to make sense of what exactly ARCH_HAS_MMIO_FLUSH is supposed to
> mean in order to enable ACPI NFIT support).

This is related to block-aperture support described by the NFIT where
a sliding-memory-mapped window can be programmed to access different
ranges of the NVDIMM. Before the window is programmed to a new
DIMM-address we need to flush any dirty data through the current
window setting to media. See the call to mmio_flush_range() in
acpi_nfit_blk_single_io(). I think it's ok to omit ARCH_HAS_MMIO_FLUSH
support, and add a configuration option to compile out the
block-aperture support.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 0/3] remove rw_page() from brd, pmem and btt

2017-08-04 Thread Dan Williams
[ adding Dave who is working on a blk-mq + dma offload version of the
pmem driver ]

On Fri, Aug 4, 2017 at 1:17 AM, Minchan Kim  wrote:
> On Fri, Aug 04, 2017 at 12:54:41PM +0900, Minchan Kim wrote:
[..]
>> Thanks for the testing. Your testing number is within noise level?
>>
>> I cannot understand why PMEM doesn't have enough gain while BTT is 
>> significant
>> win(8%). I guess no rw_page with BTT testing had more chances to wait bio 
>> dynamic
>> allocation and mine and rw_page testing reduced it significantly. However,
>> in no rw_page with pmem, there wasn't many cases to wait bio allocations due
>> to the device is so fast so the number comes from purely the number of
>> instructions has done. At a quick glance of bio init/submit, it's not trivial
>> so indeed, i understand where the 12% enhancement comes from but I'm not sure
>> it's really big difference in real practice at the cost of maintenance burden.
>
> I tested pmbench 10 times in my local machine(4 core) with zram-swap.
> In my machine, even, on-stack bio is faster than rw_page. Unbelievable.
>
> I guess it's really hard to get stable result in severe memory pressure.
> It would be a result within noise level(see below stddev).
> So, I think it's hard to conclude rw_page is far faster than onstack-bio.
>
> rw_page
> avg 5.54us
> stddev  8.89%
> max 6.02us
> min 4.20us
>
> onstack bio
> avg 5.27us
> stddev  13.03%
> max 5.96us
> min 3.55us

The maintenance burden of having alternative submission paths is
significant especially as we consider the pmem driver using more
services of the core block layer. Ideally, I'd want to complete the
rw_page removal work before we look at the blk-mq + dma offload
reworks.

The change to introduce BDI_CAP_SYNC is interesting because we might
have use for switching between dma offload and cpu copy based on
whether the I/O is synchronous or otherwise hinted to be a low latency
request. Right now the dma offload patches are using "bio_segments() >
1" as the gate for selecting offload vs cpu copy which seems
inadequate.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 5/6] arm64: Implement pmem API support

2017-08-04 Thread Robin Murphy
On 04/08/17 16:25, Catalin Marinas wrote:
> Two minor comments below.
> 
> On Tue, Jul 25, 2017 at 11:55:42AM +0100, Robin Murphy wrote:
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -960,6 +960,17 @@ config ARM64_UAO
>>regular load/store instructions if the cpu does not implement the
>>feature.
>>  
>> +config ARM64_PMEM
>> +bool "Enable support for persistent memory"
>> +select ARCH_HAS_PMEM_API
>> +help
>> +  Say Y to enable support for the persistent memory API based on the
>> +  ARMv8.2 DCPoP feature.
>> +
>> +  The feature is detected at runtime, and the kernel will use DC CVAC
>> +  operations if DC CVAP is not supported (following the behaviour of
>> +  DC CVAP itself if the system does not define a point of persistence).
> 
> Any reason not to have this default y?

Mostly because it's untested, and not actually useful without some way
of describing persistent memory regions to the kernel (I'm currently
trying to make sense of what exactly ARCH_HAS_MMIO_FLUSH is supposed to
mean in order to enable ACPI NFIT support).

There's also the potential issue that we can't disable ARCH_HAS_PMEM_API
at runtime on pre-v8.2 systems where DC CVAC may not strictly give the
guarantee of persistence that that is supposed to imply. However, I
guess that's more of an open problem, since even on a v8.2 CPU reporting
(mandatory) DC CVAP support we've still no way to actually know whether
the interconnect/memory controller/etc. of any old system is up to the
job. At this point I'm mostly hoping that people will only be sticking
NVDIMMs into systems that *are* properly designed to support them, v8.2
CPUs or not.

>> --- a/arch/arm64/mm/cache.S
>> +++ b/arch/arm64/mm/cache.S
>> @@ -172,6 +172,20 @@ ENDPIPROC(__clean_dcache_area_poc)
>>  ENDPROC(__dma_clean_area)
>>  
>>  /*
>> + *  __clean_dcache_area_pop(kaddr, size)
>> + *
>> + *  Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
>> + *  are cleaned to the PoP.
>> + *
>> + *  - kaddr   - kernel address
>> + *  - size- size in question
>> + */
>> +ENTRY(__clean_dcache_area_pop)
>> +dcache_by_line_op cvap, sy, x0, x1, x2, x3
>> +ret
>> +ENDPIPROC(__clean_dcache_area_pop)
>> +
>> +/*
>>   *  __dma_flush_area(start, size)
>>   *
>>   *  clean & invalidate D / U line
>> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
>> index a682a0a2a0fa..a461a00ceb3e 100644
>> --- a/arch/arm64/mm/pageattr.c
>> +++ b/arch/arm64/mm/pageattr.c
>> @@ -183,3 +183,21 @@ bool kernel_page_present(struct page *page)
>>  }
>>  #endif /* CONFIG_HIBERNATION */
>>  #endif /* CONFIG_DEBUG_PAGEALLOC */
>> +
>> +#ifdef CONFIG_ARCH_HAS_PMEM_API
>> +#include 
>> +
>> +static inline void arch_wb_cache_pmem(void *addr, size_t size)
>> +{
>> +/* Ensure order against any prior non-cacheable writes */
>> +dmb(sy);
>> +__clean_dcache_area_pop(addr, size);
>> +}
> 
> Could we keep the dmb() in the actual __clean_dcache_area_pop()
> implementation?

Mark held the opinion that it should follow the same pattern as the
other cache maintenance primitives - e.g. we don't have such a dmb in
__inval_cache_range(), but do place them at callsites where we know it
may be necessary (head.S) - and I found it hard to disagree. The callers
in patch #6 should never need a barrier, and arguably we may not even
need this one, since it looks like pmem should currently always be
mapped as MEMREMAP_WB if ARCH_HAS_PMEM_API.

> I can do the changes myself if you don't have any objections.

If you would prefer to play safe and move it back into the assembly
that's fine by me, but note that the associated comments in patch #6
should also be removed if so.

Robin.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 5/5] libnvdimm: add DMA support for pmem blk-mq

2017-08-04 Thread Dan Williams
On Thu, Aug 3, 2017 at 11:07 PM, Johannes Thumshirn  wrote:
> On Thu, Aug 03, 2017 at 09:15:12AM -0700, Dan Williams wrote:
>> > I can put in a tunable knob like Johannes suggested and leave the
>> > default for that to what it is currently. And we can adjust as necessary
>> > as we do more testing.
>>
>> That's the problem, we need a multi-dimensional knob. It's not just
>> transfer-size that effects total delivered bandwidth.
>
> If Dan opposes it so much, I'm OK with no tunable. I'm just expressing my
> concern that we get reports like this [1] one here as a result.

Sorry if I came off harsh. In that report from Mel he's showing a slow
down in a representative workload not just a canned bandwidth
benchmark. I'm worried that tuning for a single dimension on an fio
run doesn't actually improve throughput delivered to an application.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 5/6] arm64: Implement pmem API support

2017-08-04 Thread Catalin Marinas
Two minor comments below.

On Tue, Jul 25, 2017 at 11:55:42AM +0100, Robin Murphy wrote:
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -960,6 +960,17 @@ config ARM64_UAO
> regular load/store instructions if the cpu does not implement the
> feature.
>  
> +config ARM64_PMEM
> + bool "Enable support for persistent memory"
> + select ARCH_HAS_PMEM_API
> + help
> +   Say Y to enable support for the persistent memory API based on the
> +   ARMv8.2 DCPoP feature.
> +
> +   The feature is detected at runtime, and the kernel will use DC CVAC
> +   operations if DC CVAP is not supported (following the behaviour of
> +   DC CVAP itself if the system does not define a point of persistence).

Any reason not to have this default y?

> --- a/arch/arm64/mm/cache.S
> +++ b/arch/arm64/mm/cache.S
> @@ -172,6 +172,20 @@ ENDPIPROC(__clean_dcache_area_poc)
>  ENDPROC(__dma_clean_area)
>  
>  /*
> + *   __clean_dcache_area_pop(kaddr, size)
> + *
> + *   Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
> + *   are cleaned to the PoP.
> + *
> + *   - kaddr   - kernel address
> + *   - size- size in question
> + */
> +ENTRY(__clean_dcache_area_pop)
> + dcache_by_line_op cvap, sy, x0, x1, x2, x3
> + ret
> +ENDPIPROC(__clean_dcache_area_pop)
> +
> +/*
>   *   __dma_flush_area(start, size)
>   *
>   *   clean & invalidate D / U line
> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> index a682a0a2a0fa..a461a00ceb3e 100644
> --- a/arch/arm64/mm/pageattr.c
> +++ b/arch/arm64/mm/pageattr.c
> @@ -183,3 +183,21 @@ bool kernel_page_present(struct page *page)
>  }
>  #endif /* CONFIG_HIBERNATION */
>  #endif /* CONFIG_DEBUG_PAGEALLOC */
> +
> +#ifdef CONFIG_ARCH_HAS_PMEM_API
> +#include 
> +
> +static inline void arch_wb_cache_pmem(void *addr, size_t size)
> +{
> + /* Ensure order against any prior non-cacheable writes */
> + dmb(sy);
> + __clean_dcache_area_pop(addr, size);
> +}

Could we keep the dmb() in the actual __clean_dcache_area_pop()
implementation?

I can do the changes myself if you don't have any objections.

-- 
Catalin
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [RFC 01/16] NOVA: Documentation

2017-08-04 Thread Bart Van Assche
On Thu, 2017-08-03 at 00:48 -0700, Steven Swanson wrote:
> +### DAX Support
> +
> +Supporting DAX efficiently is a core feature of NOVA and one of the 
> challenges
> +in designing NOVA is reconciling DAX support which aims to avoid file system
> +intervention when file data changes, and other features that require such
> +intervention.
> +
> +NOVA's philosophy with respect to DAX is that when a program uses DAX mmap to
> +modify a file, the program must take full responsibility for that data and
> +NOVA must ensure that the memory will behave as expected.  At other times, 
> the
> +file system provides protection.  This approach has several implications:
> +
> +1. Implementing `msync()` in user space works fine.
> +
> +2. While a file is mmap'd, it is not protected by NOVA's RAID-style parity
> +mechanism, because protecting it would be too expensive.  When the file is
> +unmapped and/or during file system recovery, protection is restored.
> +
> +3. The snapshot mechanism must be careful about the order in which it adds
> +pages to the file's snapshot image.

Hello Steven,

Thank you for having shared this very interesting work. After having read the
NOVA paper and patch 01/16 I have a question for you. Does the above mean that
COW is disabled for writable mmap-ed files? If so, what is the reason behind
this? Is there a fundamental issue that does not allow to implement COW for
writable mmap-ed files? Or have you perhaps tried to implement this and was the
performance not sufficient? Please note that I'm neither a filesystem nor a
persistent memory expert.

Thanks,

Bart.
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH v3 6/8] dmaengine: add SG support to dmaengine_unmap

2017-08-04 Thread kbuild test robot
Hi Dave,

[auto build test WARNING on linus/master]
[also build test WARNING on v4.13-rc3]
[cannot apply to next-20170804]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Dave-Jiang/Adding-blk-mq-and-DMA-support-to-pmem-block-driver/20170804-191719
config: i386-randconfig-x003-201731 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All warnings (new ones prefixed by >>):

   drivers//dma/dmaengine.c: In function 'dmaengine_unmap':
>> drivers//dma/dmaengine.c:1140:8: warning: cast to pointer from integer of 
>> different size [-Wint-to-pointer-cast]
  sg = (struct scatterlist *)unmap->addr[i];
   ^
   drivers//dma/dmaengine.c:1151:8: warning: cast to pointer from integer of 
different size [-Wint-to-pointer-cast]
  sg = (struct scatterlist *)unmap->addr[i];
   ^
   Cyclomatic Complexity 5 include/linux/compiler.h:__read_once_size
   Cyclomatic Complexity 5 include/linux/compiler.h:__write_once_size
   Cyclomatic Complexity 3 arch/x86/include/asm/bitops.h:set_bit
   Cyclomatic Complexity 3 arch/x86/include/asm/bitops.h:clear_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:constant_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:variable_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:fls
   Cyclomatic Complexity 1 include/linux/log2.h:__ilog2_u32
   Cyclomatic Complexity 1 include/linux/list.h:__list_add_valid
   Cyclomatic Complexity 1 include/linux/list.h:__list_del_entry_valid
   Cyclomatic Complexity 1 include/linux/list.h:__list_del
   Cyclomatic Complexity 2 include/linux/list.h:__list_del_entry
   Cyclomatic Complexity 1 include/linux/err.h:ERR_PTR
   Cyclomatic Complexity 1 include/linux/err.h:PTR_ERR
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_read
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_set
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_inc
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_dec
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_dec_and_test
   Cyclomatic Complexity 1 include/asm-generic/getorder.h:__get_order
   Cyclomatic Complexity 1 arch/x86/include/asm/processor.h:rep_nop
   Cyclomatic Complexity 1 arch/x86/include/asm/processor.h:cpu_relax
   Cyclomatic Complexity 1 include/linux/jiffies.h:_msecs_to_jiffies
   Cyclomatic Complexity 5 include/linux/jiffies.h:msecs_to_jiffies
   Cyclomatic Complexity 1 include/linux/refcount.h:refcount_set
   Cyclomatic Complexity 1 include/linux/kref.h:kref_init
   Cyclomatic Complexity 1 include/linux/kobject.h:kobject_name
   Cyclomatic Complexity 1 include/linux/rculist.h:list_del_rcu
   Cyclomatic Complexity 1 include/linux/device.h:dev_to_node
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_unmap_page
   Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_unmap_sg
   Cyclomatic Complexity 1 include/linux/dma-mapping.h:valid_dma_direction
   Cyclomatic Complexity 1 arch/x86/include/asm/dma-mapping.h:get_arch_dma_ops
   Cyclomatic Complexity 1 include/linux/module.h:__module_get
   Cyclomatic Complexity 1 include/linux/module.h:try_module_get
   Cyclomatic Complexity 1 include/linux/module.h:module_put
   Cyclomatic Complexity 1 include/linux/dmaengine.h:txd_lock
   Cyclomatic Complexity 1 include/linux/dmaengine.h:txd_unlock
   Cyclomatic Complexity 1 include/linux/dmaengine.h:txd_clear_parent
   Cyclomatic Complexity 1 include/linux/dmaengine.h:txd_clear_next
   Cyclomatic Complexity 1 include/linux/dmaengine.h:txd_next
   Cyclomatic Complexity 1 include/linux/dmaengine.h:__dma_cap_set
   Cyclomatic Complexity 1 include/linux/dmaengine.h:__dma_cap_clear
   Cyclomatic Complexity 1 include/linux/dmaengine.h:dma_async_issue_pending
   Cyclomatic Complexity 56 include/linux/slab.h:kmalloc_index
   Cyclomatic Complexity 67 include/linux/slab.h:kmalloc_large
   Cyclomatic Complexity 9 include/linux/slab.h:kmalloc
   Cyclomatic Complexity 1 include/linux/slab.h:kzalloc
   Cyclomatic Complexity 1 drivers//dma/dmaengine.c:dma_chan_to_owner
   Cyclomatic Complexity 2 drivers//dma/dmaengine.c:balance_ref_count
   Cyclomatic Complexity 5 include/linux/dmaengine.h:dma_async_is_tx_complete
   Cyclomatic Complexity 2 include/linux/dmaengine.h:__dma_has_cap
   Cyclomatic Complexity 7 drivers//dma/dmaengine.c:device_has_all_tx_types
   Cyclomatic Complexity 9 drivers//dma/dmaengine.c:dma_chan_get
   Cyclomatic Complexity 3 include/linux/device.h:dev_name
   Cyclomatic Complexity 1 include/linux/dmaengine.h:dma_chan_name
   Cyclomatic Complexity 10 include/linux/dma-mapping.h:get_dma_ops
   Cyclomatic Complexity 3 include/linux/bitops.h:get_count_order
   Cyclomatic Complexity 9 include/linux/bitmap.h:bitmap_fill
   Cyclomatic C

Re: [PATCH v3 8/8] libnvdimm: add DMA support for pmem blk-mq

2017-08-04 Thread kbuild test robot
Hi Dave,

[auto build test WARNING on linus/master]
[also build test WARNING on v4.13-rc3]
[cannot apply to next-20170804]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Dave-Jiang/Adding-blk-mq-and-DMA-support-to-pmem-block-driver/20170804-191719
config: i386-randconfig-x018-201731 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All warnings (new ones prefixed by >>):

   drivers//nvdimm/pmem.c: In function 'pmem_handle_cmd_dma':
>> drivers//nvdimm/pmem.c:416:20: warning: cast from pointer to integer of 
>> different size [-Wpointer-to-int-cast]
  unmap->addr[1] = (dma_addr_t)cmd->sg;
   ^
   drivers//nvdimm/pmem.c:421:20: warning: cast from pointer to integer of 
different size [-Wpointer-to-int-cast]
  unmap->addr[0] = (dma_addr_t)cmd->sg;
   ^

vim +416 drivers//nvdimm/pmem.c

   347  
   348  static int pmem_handle_cmd_dma(struct pmem_cmd *cmd, bool is_write)
   349  {
   350  struct request *req = cmd->rq;
   351  struct request_queue *q = req->q;
   352  struct pmem_device *pmem = q->queuedata;
   353  struct device *dev = to_dev(pmem);
   354  phys_addr_t pmem_off = blk_rq_pos(req) * 512 + 
pmem->data_offset;
   355  void *pmem_addr = pmem->virt_addr + pmem_off;
   356  struct nd_region *nd_region = to_region(pmem);
   357  size_t len;
   358  struct dma_device *dma = cmd->chan->device;
   359  struct dmaengine_unmap_data *unmap;
   360  dma_cookie_t cookie;
   361  struct dma_async_tx_descriptor *txd;
   362  struct page *page;
   363  unsigned int off;
   364  int rc;
   365  enum dma_data_direction dir;
   366  dma_addr_t dma_addr;
   367  
   368  if (req->cmd_flags & REQ_FLUSH)
   369  nvdimm_flush(nd_region);
   370  
   371  unmap = dmaengine_get_unmap_data(dma->dev, 2, GFP_NOWAIT);
   372  if (!unmap) {
   373  dev_dbg(dev, "failed to get dma unmap data\n");
   374  rc = -ENOMEM;
   375  goto err;
   376  }
   377  
   378  /*
   379   * If reading from pmem, writing to scatterlist,
   380   * and if writing to pmem, reading from scatterlist.
   381   */
   382  dir = is_write ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
   383  cmd->sg_nents = blk_rq_map_sg(req->q, req, cmd->sg);
   384  if (cmd->sg_nents < 1) {
   385  rc = -EINVAL;
   386  goto err;
   387  }
   388  
   389  if (cmd->sg_nents > 128) {
   390  rc = -ENOMEM;
   391  dev_warn(dev, "Number of sg greater than allocated\n");
   392  goto err;
   393  }
   394  
   395  rc = dma_map_sg(dma->dev, cmd->sg, cmd->sg_nents, dir);
   396  if (rc < 1) {
   397  rc = -ENXIO;
   398  goto err;
   399  }
   400  
   401  len = blk_rq_payload_bytes(req);
   402  page = virt_to_page(pmem_addr);
   403  off = offset_in_page(pmem_addr);
   404  dir = is_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
   405  dma_addr = dma_map_page(dma->dev, page, off, len, dir);
   406  if (dma_mapping_error(dma->dev, unmap->addr[0])) {
   407  dev_dbg(dma->dev, "src DMA mapping error\n");
   408  rc = -ENXIO;
   409  goto err_unmap_sg;
   410  }
   411  
   412  unmap->len = len;
   413  
   414  if (is_write) {
   415  unmap->addr[0] = dma_addr;
 > 416  unmap->addr[1] = (dma_addr_t)cmd->sg;
   417  unmap->to_cnt = 1;
   418  unmap->from_cnt = 0;
   419  dma_unmap_data_sg_from_nents(unmap, 2) = cmd->sg_nents;
   420  } else {
   421  unmap->addr[0] = (dma_addr_t)cmd->sg;
   422  unmap->addr[1] = dma_addr;
   423  unmap->from_cnt = 1;
   424  unmap->to_cnt = 0;
   425  dma_unmap_data_sg_to_nents(unmap, 2) = cmd->sg_nents;
   426  }
   427  
   428  txd = dma->device_prep_dma_memcpy_sg(cmd->chan,
   429  cmd->sg, cmd->sg_nents, dma_addr,
   430  !is_write, DMA_PREP_INTERRUPT);
   431  if (!txd) {
   432  dev_dbg(dma->dev, "dma prep failed\n");
   433  rc = -ENXIO;
   434  

[RFC/Patch 5/5] ndctl: show dimm's name which has badblock by ndctl list command.

2017-08-04 Thread Yasunori Goto
ndctl: show dimm's name which has badblock by ndctl list command.

This patch uses translate SPA interface to get bad dimm info.

This patch is likely a Proof of Concept, because the libnvdimm
functions backing this feature are still subject to change.
So, I don't think this patch can be merged yet.

However, I hope this patch is a good start for discussion.



Signed-off-by: Yasunori Goto 
---
 util/json.c | 49 +
 1 file changed, 49 insertions(+)

diff --git a/util/json.c b/util/json.c
index 2b2b5af..b791054 100644
--- a/util/json.c
+++ b/util/json.c
@@ -381,6 +381,22 @@ struct json_object *util_region_badblocks_to_json(struct 
ndctl_region *region,
 
ndctl_region_badblock_foreach(region, bb) {
if (flags & UTIL_JSON_MEDIA_ERRORS) {
+   struct ndctl_bus *bus;
+   struct ndctl_dimm *dimms[ND_MIRROR_MAX_WAY];
+   memset(dimms, 0, sizeof(dimms));
+   bus = ndctl_region_get_bus(region);
+   if (ndctl_bus_has_trans_spa(bus)) {
+   int rc;
+   unsigned long long spa;
+   spa = ndctl_region_get_resource(region);
+   if (spa == ULONG_MAX)
+   goto err_array;
+   spa += bb->offset << 9;
+   rc = ndctl_dimms_get_by_spa(bus, spa, dimms);
+   if (rc)
+   goto err_array;
+   }
+
jbb = json_object_new_object();
if (!jbb)
goto err_array;
@@ -395,6 +411,14 @@ struct json_object *util_region_badblocks_to_json(struct 
ndctl_region *region,
goto err;
json_object_object_add(jbb, "length", jobj);
 
+   for (int i = 0; i < ND_MIRROR_MAX_WAY; i++) {
+   if (dimms[i]) {
+   jobj = 
json_object_new_string(ndctl_dimm_get_devname(dimms[i]));
+   if (!jobj)
+   goto err;
+   json_object_object_add(jbb, "dimm", 
jobj);
+   }
+   }
json_object_array_add(jbbs, jbb);
}
 
@@ -436,6 +460,8 @@ static struct json_object *dev_badblocks_to_json(struct 
ndctl_region *region,
 
ndctl_region_badblock_foreach(region, bb) {
unsigned long long bb_begin, bb_end, begin, end;
+   struct ndctl_bus *bus;
+   struct ndctl_dimm *dimms[ND_MIRROR_MAX_WAY];
 
bb_begin = region_begin + (bb->offset << 9);
bb_end = bb_begin + (bb->len << 9) - 1;
@@ -453,6 +479,20 @@ static struct json_object *dev_badblocks_to_json(struct 
ndctl_region *region,
else
end = bb_end;
 
+   memset(dimms, 0, sizeof(dimms));
+   bus = ndctl_region_get_bus(region);
+   if (ndctl_bus_has_trans_spa(bus)) {
+   int rc;
+   unsigned long long spa;
+   spa = ndctl_region_get_resource(region);
+   if (spa == ULLONG_MAX)
+   goto err_array;
+   spa += bb->offset << 9;
+   rc = ndctl_dimms_get_by_spa(bus, spa, dimms);
+   if (rc)
+   goto err_array;
+   }
+
offset = (begin - dev_begin) >> 9;
len = (end - begin + 1) >> 9;
 
@@ -472,6 +512,15 @@ static struct json_object *dev_badblocks_to_json(struct 
ndctl_region *region,
goto err;
json_object_object_add(jbb, "length", jobj);
 
+   for (int i = 1; i < ND_MIRROR_MAX_WAY; i++) {
+   if (dimms[i]) {
+   jobj = 
json_object_new_string(ndctl_dimm_get_devname(dimms[i]));
+   if (!jobj)
+   goto err;
+   json_object_object_add(jbb, "dimm", 
jobj);
+   }
+   }
+
json_object_array_add(jbbs, jbb);
}
bbs += len;



___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[RFC/Patch 4/5] libndctl Make interfaces to use Translate SPA

2017-08-04 Thread Yasunori Goto
ndctl:libndctl Make interfaces to use Translate SPA.

This patch adds 3 new interfaces:
  - to ask whether the bus has the Translate SPA feature.
  - to call Translate SPA.
  - to find a DIMM by SPA address.


Note) I'm not sure how many buffers should be prepared, because
  it depends on the maximum # of mirroring ways.
  This patch assumes the maximum # is 4-way.


Signed-off-by: Yasunori Goto 

---
 configure.ac |  19 
 ndctl/lib/libndctl-private.h |   7 +++
 ndctl/lib/libndctl.c | 103 ++-
 ndctl/lib/libndctl.sym   |   3 ++
 ndctl/libndctl.h.in  |  23 ++
 ndctl/ndctl.h|   8 
 6 files changed, 162 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 316f5b7..653fde0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -162,6 +162,25 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 )
 AM_CONDITIONAL([ENABLE_CLEAR_ERROR], [test "x$enable_clear_err" = "xyes"])
 
+AC_MSG_CHECKING([for TRANSLATE SPA support])
+AC_LANG(C)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+   #ifdef HAVE_NDCTL_H
+   #include 
+   #else
+   #include "ndctl/ndctl.h"
+   #endif
+   ]], [[
+   int x = ND_CMD_TRANS_SPA;
+   ]]
+   )], [AC_MSG_RESULT([yes])
+enable_trans_spa=yes
+AC_DEFINE([HAVE_NDCTL_TRANS_SPA], [1],
+   [Define to 1 if ndctl.h has TRANSLATE SPA 
support.])
+   ], [AC_MSG_RESULT([no])]
+)
+AM_CONDITIONAL([ENABLE_TRANS_SPA], [test "x$enable_trans_spa" = "xyes"])
+
 AC_MSG_CHECKING([for device DAX support])
 AC_LANG(C)
 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
diff --git a/ndctl/lib/libndctl-private.h b/ndctl/lib/libndctl-private.h
index 8f10fbc..a1fcd2f 100644
--- a/ndctl/lib/libndctl-private.h
+++ b/ndctl/lib/libndctl-private.h
@@ -196,6 +196,7 @@ struct ndctl_cmd {
 #ifdef HAVE_NDCTL_CLEAR_ERROR
struct nd_cmd_clear_error clear_err[0];
 #endif
+   struct nd_cmd_trans_spa trans_spa[0];
struct ndn_pkg_hpe1 hpe1[0];
struct ndn_pkg_msft msft[0];
struct nd_cmd_smart smart[0];
@@ -250,6 +251,12 @@ static const int nd_cmd_clear_error = ND_CMD_CLEAR_ERROR;
 static const int nd_cmd_clear_error;
 #endif
 
+#ifdef HAVE_NDCTL_TRANS_SPA
+static const int nd_cmd_trans_spa = ND_CMD_TRANS_SPA;
+#else
+static const int nd_cmd_trans_spa;
+#endif
+
 static inline struct ndctl_bus *cmd_to_bus(struct ndctl_cmd *cmd)
 {
if (cmd->dimm)
diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
index 68d8064..5ebcc45 100644
--- a/ndctl/lib/libndctl.c
+++ b/ndctl/lib/libndctl.c
@@ -744,7 +744,9 @@ static int to_dsm_index(const char *name, int dimm)
end_cmd = ND_CMD_CALL;
cmd_name_fn = nvdimm_cmd_name;
} else {
-   end_cmd = nd_cmd_clear_error;
+   end_cmd = nd_cmd_trans_spa;
+   if (!end_cmd)
+   end_cmd = nd_cmd_clear_error;
if (!end_cmd)
end_cmd = nd_cmd_ars_status;
cmd_name_fn = nvdimm_bus_cmd_name;
@@ -1943,6 +1945,102 @@ NDCTL_EXPORT struct badblock 
*ndctl_region_get_first_badblock(struct ndctl_regio
return ndctl_region_get_next_badblock(region);
 }
 
+#ifdef HAVE_NDCTL_TRANS_SPA
+NDCTL_EXPORT int ndctl_bus_has_trans_spa(struct ndctl_bus *bus)
+{
+   if (!bus)
+   return 0;
+
+   return ndctl_bus_is_cmd_supported(bus, ND_CMD_TRANS_SPA);
+}
+
+static struct ndctl_cmd *ndctl_bus_cmd_new_trans_spa(struct ndctl_bus *bus)
+{
+   struct ndctl_cmd *cmd;
+   size_t size, spa_length;
+
+   spa_length = sizeof(struct nd_cmd_trans_spa)
+   + sizeof(struct nd_nvdimm_device) * ND_MIRROR_MAX_WAY;
+   size = sizeof(*cmd) + spa_length;
+   cmd = calloc(1, size);
+   if (!cmd)
+   return NULL;
+
+   cmd->bus = bus;
+   ndctl_cmd_ref(cmd);
+   cmd->type = ND_CMD_TRANS_SPA;
+   cmd->size = size;
+   cmd->status = 1;
+   cmd->firmware_status = >trans_spa->status;
+   cmd->trans_spa->trans_length = spa_length;
+
+   return cmd;
+}
+
+static int ndctl_bus_cmd_get_trans_spa(struct ndctl_cmd *cmd,
+   unsigned int *handles, unsigned long 
long *dpas)
+{
+   int i;
+   int num_nvdimms;
+
+   if (cmd->trans_spa->status == ND_TRANS_SPA_STATUS_INVALID_SPA)
+   return -EINVAL;
+
+   num_nvdimms = cmd->trans_spa->num_nvdimms;
+   for (i = 0; i < num_nvdimms; i++) {
+   handles[i] = cmd->trans_spa->devices[i].nfit_device_handle;
+   dpas[i] = cmd->trans_spa->devices[i].dpa;
+   }
+
+   ndctl_cmd_unref(cmd);
+   return 0;
+}
+
+NDCTL_EXPORT int 

[RFC/Patch 3/5] nfit_test supports Translate SPA

2017-08-04 Thread Yasunori Goto
nfit_test supports Translate SPA

To test the Translate SPA interface, nfit_test must emulate it.
This test module searches for the region which includes the spa and
returns only one dimm handle, which is currently the last one.



Signed-off-by: Yasunori Goto 

---
 drivers/nvdimm/region_devs.c |  1 +
 tools/testing/nvdimm/test/nfit.c | 80 +++-
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 5954cfb..3c8cc7f 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -162,6 +162,7 @@ bool is_nd_pmem(struct device *dev)
 {
return dev ? dev->type == _pmem_device_type : false;
 }
+EXPORT_SYMBOL_GPL(is_nd_pmem);
 
 bool is_nd_blk(struct device *dev)
 {
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 4c2fa98..09dcdf5 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -342,6 +342,78 @@ static int nfit_test_cmd_clear_error(struct 
nd_cmd_clear_error *clear_err,
return 0;
 }
 
+struct region_search_spa{
+   u64 addr;
+   struct nd_region *region;
+};
+
+static int nfit_test_search_region_spa(struct device *dev, void *data)
+{
+   struct region_search_spa *ctx = data;
+   struct nd_region *nd_region;
+   resource_size_t ndr_end;
+
+   if (!is_nd_pmem(dev))
+   return 0;
+
+   nd_region = to_nd_region(dev);
+   ndr_end = nd_region->ndr_start + nd_region->ndr_size;
+
+   if (ctx->addr >= nd_region->ndr_start && ctx->addr < ndr_end) {
+   ctx->region = nd_region;
+   return 1;
+   }
+
+   return 0;
+}
+
+static int nfit_test_search_spa(struct nvdimm_bus *bus, struct 
nd_cmd_trans_spa *spa)
+{
+   int ret;
+   struct nd_region *nd_region = NULL;
+   struct nvdimm *nvdimm = NULL;
+   struct nd_mapping *nd_mapping = NULL;
+   struct region_search_spa ctx = {
+   .addr = spa->spa,
+   .region = NULL,
+   };
+   u64 dpa;
+
+   ret = device_for_each_child(>dev, , 
nfit_test_search_region_spa);
+
+   if (!ret)
+   return -ENODEV;
+
+   nd_region = ctx.region;
+
+   dpa = ctx.addr - nd_region->ndr_start;
+
+   /*
+* last dimm is selected for test
+*/
+   nd_mapping = _region->mapping[nd_region->ndr_mappings - 1];
+   nvdimm = nd_mapping->nvdimm;
+
+   spa->devices[0].nfit_device_handle = handle[nvdimm->id];
+   spa->num_nvdimms = 1;
+   spa->devices[0].dpa = dpa;
+
+   return 0;
+}
+
+static int nfit_test_cmd_translate_spa(struct nvdimm_bus *bus, struct 
nd_cmd_trans_spa *spa,
+   unsigned int buf_len)
+{
+
+   if (buf_len < spa->trans_length)
+   return -EINVAL;
+
+   if (nfit_test_search_spa(bus, spa) < 0|| !spa->num_nvdimms)
+   spa->status = 2;
+
+   return 0;
+}
+
 static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int 
buf_len)
 {
static const struct nd_smart_payload smart_data = {
@@ -468,6 +540,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor 
*nd_desc,
case ND_CMD_CLEAR_ERROR:
rc = nfit_test_cmd_clear_error(buf, buf_len, cmd_rc);
break;
+   case ND_CMD_TRANS_SPA:
+   rc = nfit_test_cmd_translate_spa(acpi_desc->nvdimm_bus, 
buf, buf_len);
+   break;
default:
return -ENOTTY;
}
@@ -1430,6 +1505,7 @@ static void nfit_test0_setup(struct nfit_test *t)
set_bit(ND_CMD_ARS_START, _desc->bus_cmd_force_en);
set_bit(ND_CMD_ARS_STATUS, _desc->bus_cmd_force_en);
set_bit(ND_CMD_CLEAR_ERROR, _desc->bus_cmd_force_en);
+   set_bit(ND_CMD_TRANS_SPA, _desc->bus_cmd_force_en);
set_bit(ND_CMD_SMART_THRESHOLD, _desc->dimm_cmd_force_en);
 }
 
@@ -1527,6 +1603,7 @@ static void nfit_test1_setup(struct nfit_test *t)
set_bit(ND_CMD_ARS_START, _desc->bus_cmd_force_en);
set_bit(ND_CMD_ARS_STATUS, _desc->bus_cmd_force_en);
set_bit(ND_CMD_CLEAR_ERROR, _desc->bus_cmd_force_en);
+   set_bit(ND_CMD_TRANS_SPA, _desc->bus_cmd_force_en);
set_bit(ND_CMD_GET_CONFIG_SIZE, _desc->dimm_cmd_force_en);
set_bit(ND_CMD_GET_CONFIG_DATA, _desc->dimm_cmd_force_en);
set_bit(ND_CMD_SET_CONFIG_DATA, _desc->dimm_cmd_force_en);
@@ -1616,7 +1693,8 @@ static int nfit_ctl_test(struct device *dev)
.cmd_mask = 1UL << ND_CMD_ARS_CAP
| 1UL << ND_CMD_ARS_START
| 1UL << ND_CMD_ARS_STATUS
-   | 1UL << ND_CMD_CLEAR_ERROR,
+   | 1UL << ND_CMD_CLEAR_ERROR
+   | 1UL << ND_CMD_TRANS_SPA,
.module = 

[RFC/Patch 2/5] Support Translate SPA for NVDIMM Root Device.

2017-08-04 Thread Yasunori Goto

Support Translate SPA for NVDIMM Root Device.

ACPI 6.2 has new specification on _DSM of NVDIMM Root Device
It is "Translate SPA" which translate from system physical address(SPA)
to NVDIMM handle and dimm physical address(DPA).

This patch is to support Translate SPA.


Signed-off-by: Yasunori Goto 
---
 drivers/acpi/nfit/core.c   |  7 +--
 drivers/nvdimm/bus.c   | 11 ++-
 include/linux/libnvdimm.h  |  2 +-
 include/uapi/linux/ndctl.h |  4 
 4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 5334755..e8c67a3 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -173,6 +173,8 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 
status)
if (clear_err->length > clear_err->cleared)
return clear_err->cleared;
return 0;
+   case ND_CMD_TRANS_SPA:
+   return 0;
default:
break;
}
@@ -352,7 +354,7 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, 
struct nvdimm *nvdimm,
 * Set fw_status for all the commands with a known format to be
 * later interpreted by xlat_status().
 */
-   if (i >= 1 && ((cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_CLEAR_ERROR)
+   if (i >= 1 && ((cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_TRANS_SPA)
|| (cmd >= ND_CMD_SMART && cmd <= ND_CMD_VENDOR)))
fw_status = *(u32 *) out_obj->buffer.pointer;
 
@@ -1644,7 +1646,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc 
*acpi_desc)
if (!adev)
return;
 
-   for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
+   for (i = ND_CMD_ARS_CAP; i <= ND_CMD_TRANS_SPA; i++)
if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
set_bit(i, _desc->cmd_mask);
set_bit(ND_CMD_CALL, _desc->cmd_mask);
@@ -1654,6 +1656,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc 
*acpi_desc)
(1 << ND_CMD_ARS_START) |
(1 << ND_CMD_ARS_STATUS) |
(1 << ND_CMD_CLEAR_ERROR) |
+   (1 << ND_CMD_TRANS_SPA) |
(1 << NFIT_CMD_ARS_INJECT_SET) |
(1 << NFIT_CMD_ARS_INJECT_CLEAR) |
(1 << NFIT_CMD_ARS_INJECT_GET);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 937fafa..ef05fe1 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -727,6 +727,12 @@ static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
.out_num = 3,
.out_sizes = { 4, 4, 8, },
},
+   [ND_CMD_TRANS_SPA] = {
+   .in_num = 1,
+   .in_sizes = { 8, },
+   .out_num = 7,
+   .out_sizes = { 2, 2, 1, 3, 8, 4, UINT_MAX, },
+   },
[ND_CMD_CALL] = {
.in_num = 2,
.in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, },
@@ -801,7 +807,9 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
if (out_field[1] - 8 == remainder)
return remainder;
return out_field[1] - 4;
-   } else if (cmd == ND_CMD_CALL) {
+   } else if (!nvdimm && cmd == ND_CMD_TRANS_SPA && idx == 6)
+   return out_field[2];
+   else if (cmd == ND_CMD_CALL) {
struct nd_cmd_pkg *pkg = (struct nd_cmd_pkg *) in_field;
 
return pkg->nd_size_out;
@@ -947,6 +955,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct 
nvdimm *nvdimm,
case ND_CMD_SET_CONFIG_DATA:
case ND_CMD_ARS_START:
case ND_CMD_CLEAR_ERROR:
+   case ND_CMD_TRANS_SPA:
case ND_CMD_CALL:
dev_dbg(_bus->dev, "'%s' command while 
read-only.\n",
nvdimm ? nvdimm_cmd_name(cmd)
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index f3d3e6a..c179ed0 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -29,7 +29,7 @@ enum {
 
/* need to set a limit somewhere, but yes, this is likely overkill */
ND_IOCTL_MAX_BUFLEN = SZ_4M,
-   ND_CMD_MAX_ELEM = 5,
+   ND_CMD_MAX_ELEM = 7,
ND_CMD_MAX_ENVELOPE = 256,
ND_MAX_MAPPINGS = 32,
 
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 6d3c542..91bf129 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -190,6 +190,7 @@ enum {
ND_CMD_ARS_START = 2,
ND_CMD_ARS_STATUS = 3,
ND_CMD_CLEAR_ERROR = 4,
+   ND_CMD_TRANS_SPA = 5,
 
/* per-dimm commands */
ND_CMD_SMART = 1,
@@ -218,6 +219,7 @@ static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
[ND_CMD_ARS_START] = "ars_start",
[ND_CMD_ARS_STATUS] = "ars_status",
[ND_CMD_CLEAR_ERROR] = 

[RFC/Patch 1/5] Remove old enum definition of "Translate Spa"

2017-08-04 Thread Yasunori Goto

Remove old enum definition of "Translate Spa".

Translate SPA became a standard specification in ACPI 6.2,
but the current definition is likely temporary.
So it is removed.

Signed-off-by: Yasunori Goto 


---
 drivers/acpi/nfit/core.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 19182d0..5334755 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1626,7 +1626,6 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc 
*acpi_desc)
  * these commands.
  */
 enum nfit_aux_cmds {
-NFIT_CMD_TRANSLATE_SPA = 5,
 NFIT_CMD_ARS_INJECT_SET = 7,
 NFIT_CMD_ARS_INJECT_CLEAR = 8,
 NFIT_CMD_ARS_INJECT_GET = 9,
@@ -1655,7 +1654,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc 
*acpi_desc)
(1 << ND_CMD_ARS_START) |
(1 << ND_CMD_ARS_STATUS) |
(1 << ND_CMD_CLEAR_ERROR) |
-   (1 << NFIT_CMD_TRANSLATE_SPA) |
(1 << NFIT_CMD_ARS_INJECT_SET) |
(1 << NFIT_CMD_ARS_INJECT_CLEAR) |
(1 << NFIT_CMD_ARS_INJECT_GET);




___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[RFC/Patch 0/5] ndctl list show broken nvdimm info.

2017-08-04 Thread Yasunori Goto

Hello,

I made a patch set to show information of broken NVDIMM.

When a region has a broken block, the user needs to replace
the NVDIMM module which includes that block.
However, there is no information available to find which DIMM module has
the block: not only does ndctl lack such information, the nvdimm driver
cannot find it either.


Fortunately, ACPI 6.2 has a new _DSM specification.
It is "Translate SPA", which can obtain the NVDIMM handle
and DPA (Dimm Physical Address) from an SPA (System Physical Address).
It helps the ndctl command find a broken NVDIMM.
So, This patch set includes followings.
 - support Translate SPA interface,
 - ndctl ask DIMM by Translate SPA and show it.


To be honest, since I'm still a newbie regarding the internals of the
NVDIMM driver and ndctl, I may misunderstand/miss something, and I feel
my patch set is likely a quick hack.

Especially, I'm not sure about the following points.

  - What is the maximum size of the input/output argument table for _DSM
    for Translate SPA and ioctl()? It seems to depend on the max # of
    ways of mirroring. Since ACPI 6.2 seems not to mention it,
    it depends on the vendor's decision, I think.
    Usually, mirroring is 2-way, but I'm not sure that is the maximum number...

  - Though libndctl has many good API, I'm not sure how new APIs
should be named.


However, I think progress will be better made by getting advice from
many others rather than thinking about the above alone. So, I hope this
post is a good start toward making this feature.

Anyway, please check and comment.

Thanks,
---
Yasunori Goto



___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 0/3] remove rw_page() from brd, pmem and btt

2017-08-04 Thread Minchan Kim
On Fri, Aug 04, 2017 at 12:54:41PM +0900, Minchan Kim wrote:
> On Thu, Aug 03, 2017 at 03:13:35PM -0600, Ross Zwisler wrote:
> > On Thu, Aug 03, 2017 at 09:13:15AM +0900, Minchan Kim wrote:
> > > Hi Ross,
> > > 
> > > On Wed, Aug 02, 2017 at 04:13:59PM -0600, Ross Zwisler wrote:
> > > > On Fri, Jul 28, 2017 at 10:31:43AM -0700, Matthew Wilcox wrote:
> > > > > On Fri, Jul 28, 2017 at 10:56:01AM -0600, Ross Zwisler wrote:
> > > > > > Dan Williams and Christoph Hellwig have recently expressed doubt 
> > > > > > about
> > > > > > whether the rw_page() interface made sense for synchronous memory 
> > > > > > drivers
> > > > > > [1][2].  It's unclear whether this interface has any performance 
> > > > > > benefit
> > > > > > for these drivers, but as we continue to fix bugs it is clear that 
> > > > > > it does
> > > > > > have a maintenance burden.  This series removes the rw_page()
> > > > > > implementations in brd, pmem and btt to relieve this burden.
> > > > > 
> > > > > Why don't you measure whether it has performance benefits?  I don't
> > > > > understand why zram would see performance benefits and not other 
> > > > > drivers.
> > > > > If it's going to be removed, then the whole interface should be 
> > > > > removed,
> > > > > not just have the implementations removed from some drivers.
> > > > 
> > > > Okay, I've run a bunch of performance tests with the PMEM and with BTT 
> > > > entry
> > > > points for rw_pages() in a swap workload, and in all cases I do see an
> > > > improvement over the code when rw_pages() is removed.  Here are the 
> > > > results
> > > > from my random lab box:
> > > > 
> > > >   Average latency of swap_writepage()
> > > > +--++-+-+
> > > > |  | no rw_page | rw_page | Improvement |
> > > > +---+
> > > > | PMEM |  5.0 us|  4.7 us | 6%  |
> > > > +---+
> > > > |  BTT |  6.8 us|  6.1 us |10%  |
> > > > +--++-+-+
> > > > 
> > > >   Average latency of swap_readpage()
> > > > +--++-+-+
> > > > |  | no rw_page | rw_page | Improvement |
> > > > +---+
> > > > | PMEM |  3.3 us|  2.9 us |12%  |
> > > > +---+
> > > > |  BTT |  3.7 us|  3.4 us | 8%  |
> > > > +--++-+-+
> > > > 
> > > > The workload was pmbench, a memory benchmark, run on a system where I 
> > > > had
> > > > severely restricted the amount of memory in the system with the 'mem' 
> > > > kernel
> > > > command line parameter.  The benchmark was set up to test more memory 
> > > > than I
> > > > allowed the OS to have so it spilled over into swap.
> > > > 
> > > > The PMEM or BTT device was set up as my swap device, and during the 
> > > > test I got
> > > > a few hundred thousand samples of each of swap_writepage() and
> > > > swap_writepage().  The PMEM/BTT device was just memory reserved with the
> > > > memmap kernel command line parameter.
> > > > 
> > > > Thanks, Matthew, for asking for performance data.  It looks like 
> > > > removing this
> > > > code would have been a mistake.
> > > 
> > > By suggestion of Christoph Hellwig, I made a quick patch which does IO 
> > > without
> > > dynamic bio allocation for swap IO. Actually, it's not formal patch to be
> > > worth to send mainline yet but I believe it's enough to test the 
> > > improvement.
> > > 
> > > Could you test patchset on pmem and btt without rw_page?
> > > 
> > > For working the patch, block drivers need to declare it's synchronous IO
> > > device via BDI_CAP_SYNC but if it's hard, you can just make every swap IO
> > > comes from (sis->flags & SWP_SYNC_IO) with removing condition check
> > > 
> > > if (!(sis->flags & SWP_SYNC_IO)) in swap_[read|write]page.
> > > 
> > > Patchset is based on 4.13-rc3.
> > 
> > Thanks for the patch, here are the updated results from my test box:
> > 
> >  Average latency of swap_writepage()
> > +--++-+-+
> > |  | no rw_page | minchan | rw_page |
> > +
> > | PMEM |  5.0 us| 4.98 us |  4.7 us |
> > +
> > |  BTT |  6.8 us| 6.3 us  |  6.1 us |
> > +--++-+-+
> >
> >  Average latency of swap_readpage()
> > +--++-+-+
> > |  | no rw_page | minchan | rw_page |
> > +
> > | PMEM |  3.3 us| 3.27 us |  2.9 us |
> > +
> > |  BTT |  3.7 us| 3.44 us |  3.4 us |
> > +--++-+-+
> > 
> > I've added another digit in precision in some cases to help differentiate 
> > the
> > various results.
> > 
> > In all cases your patches did perform better than with 

Re: [PATCH 5/5] libnvdimm: add DMA support for pmem blk-mq

2017-08-04 Thread Johannes Thumshirn
On Thu, Aug 03, 2017 at 09:15:12AM -0700, Dan Williams wrote:
> > I can put in a tunable knob like Johannes suggested and leave the
> > default for that to what it is currently. And we can adjust as necessary
> > as we do more testing.
> 
> That's the problem, we need a multi-dimensional knob. It's not just
> transfer-size that effects total delivered bandwidth.

If Dan opposes it so much, I'm OK with no tunable. I'm just expressing my
concern that we get reports like this [1] one here as a result.

Thanks,
Johannes

[1] http://marc.info/?l=linux-kernel=150175029403913=2

-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm