[PATCH 0/4] x86: Add and use support for clflushopt

2014-02-26 Thread Ross Zwisler
This patch series adds support for the new clflushopt instruction and then
uses it in a few appropriate places in x86 specific code.  It does this using
the alternatives mechanism, so any platforms without support for clflushopt
will continue to use clflush instead.

clflushopt was announced in the document "Intel Architecture Instruction Set
Extensions Programming Reference" with Ref # 319433-018.

http://download-software.intel.com/sites/default/files/managed/50/1a/319433-018.pdf

clflushopt has the same flushing behavior as clflush, but has more relaxed
ordering.  clflushopt must be explicitly ordered by sfence or mfence.

The inline assembly for clflushopt was implemented using %P so that the
generated addresses will always be absolute instead of sometimes being RIP
relative.  This is necessary for the alternatives code to behave correctly.
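
As an illustration of that ordering requirement, a caller flushing a range
with the new helper is expected to fence the whole sequence itself.  A
minimal sketch (mirroring what patch 2/4 does in clflush_cache_range(); the
function name here is made up for illustration):

	/* Flush [vaddr, vaddr + size) and make the flushes globally visible. */
	static void flush_range_sketch(void *vaddr, unsigned int size)
	{
		void *vend = vaddr + size - 1;

		mb();	/* order against earlier stores */
		for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
			clflushopt(vaddr);
		clflushopt(vend);	/* final, possibly partial, cache line */
		mb();	/* wait for the weakly ordered flushes to complete */
	}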

Ross Zwisler (4):
  x86: Add support for clflushopt
  x86: Use clflushopt in clflush_cache_range
  x86: Use clflushopt in drm_clflush_page
  x86: Use clflushopt in drm_clflush_virt_range

 arch/x86/include/asm/cpufeature.h    |  1 +
 arch/x86/include/asm/special_insns.h |  8 ++++++++
 arch/x86/mm/pageattr.c               |  8 ++++----
 drivers/gpu/drm/drm_cache.c          | 10 ++++++++--
 4 files changed, 21 insertions(+), 6 deletions(-)

-- 
1.7.10.4



[PATCH 1/4] x86: Add support for clflushopt

2014-02-26 Thread Ross Zwisler
Add support for the new clflushopt instruction.  This instruction was
announced in the document "Intel Architecture Instruction Set Extensions
Programming Reference" with Ref # 319433-018.

http://download-software.intel.com/sites/default/files/managed/50/1a/319433-018.pdf

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
---
 arch/x86/include/asm/cpufeature.h    | 1 +
 arch/x86/include/asm/special_insns.h | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 89270b4..bfad1ad 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -219,6 +219,7 @@
 #define X86_FEATURE_RDSEED	(9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX	(9*32+19) /* The ADCX and ADOX instructions */
 #define X86_FEATURE_SMAP	(9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_CLFLSHOPT	(9*32+23) /* "clflushopt" CLFLUSHOPT instruction */
 
 /*
  * BUG word(s)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 645cad2..617389a 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -191,6 +191,14 @@ static inline void clflush(volatile void *__p)
 	asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
 }
 
+static inline void clflushopt(volatile void *__p)
+{
+	alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
+		       ".byte 0x66; clflush %P0",
+		       X86_FEATURE_CLFLSHOPT,
+		       "+m" (*(volatile char __force *)__p));
+}
+
 #define nop() asm volatile ("nop")
 
 
-- 
1.7.10.4



[PATCH 4/4] x86: Use clflushopt in drm_clflush_virt_range

2014-02-26 Thread Ross Zwisler
If clflushopt is available on the system, use it instead of clflush in
drm_clflush_virt_range.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
---
 drivers/gpu/drm/drm_cache.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index c518fb6..534cb89 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -139,7 +139,7 @@ drm_clflush_virt_range(char *addr, unsigned long length)
 		mb();
 		for (; addr < end; addr += boot_cpu_data.x86_clflush_size)
 			clflush(addr);
-		clflush(end - 1);
+		clflushopt(end - 1);
 		mb();
 		return;
 	}
-- 
1.7.10.4



[PATCH 2/4] x86: Use clflushopt in clflush_cache_range

2014-02-26 Thread Ross Zwisler
If clflushopt is available on the system, use it instead of clflush in
clflush_cache_range.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
---
 arch/x86/mm/pageattr.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb32480..11d500a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -125,8 +125,8 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  * @vaddr: virtual start address
  * @size:  number of bytes to flush
  *
- * clflush is an unordered instruction which needs fencing with mfence
- * to avoid ordering issues.
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
@@ -135,11 +135,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)
 	mb();
 
 	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
-		clflush(vaddr);
+		clflushopt(vaddr);
 	/*
 	 * Flush any possible final partial cacheline:
 	 */
-	clflush(vend);
+	clflushopt(vend);
 
 	mb();
 }
-- 
1.7.10.4



[PATCH 3/4] x86: Use clflushopt in drm_clflush_page

2014-02-26 Thread Ross Zwisler
If clflushopt is available on the system, use it instead of clflush in
drm_clflush_page.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
---
 drivers/gpu/drm/drm_cache.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index bb8f580..c518fb6 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -32,6 +32,12 @@
 #include <drm/drmP.h>
 
 #if defined(CONFIG_X86)
+
+/*
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.  For drm_clflush_page this fencing happens
+ * in the caller.
+ */
 static void
 drm_clflush_page(struct page *page)
 {
@@ -44,7 +50,7 @@ drm_clflush_page(struct page *page)
 
 	page_virtual = kmap_atomic(page);
 	for (i = 0; i < PAGE_SIZE; i += size)
-		clflush(page_virtual + i);
+		clflushopt(page_virtual + i);
 	kunmap_atomic(page_virtual);
 }
 
-- 
1.7.10.4



Re: [PATCH v6 07/22] Replace the XIP page fault handler with the DAX page fault handler

2014-03-03 Thread Ross Zwisler
On Mon, 3 Mar 2014, Dave Chinner wrote:
 On Fri, Feb 28, 2014 at 03:20:31PM -0500, Matthew Wilcox wrote:
  On Fri, Feb 28, 2014 at 10:49:31AM -0700, Toshi Kani wrote:
   The original code,
   xip_file_fault(), jumps to found: and calls vm_insert_mixed() when
   get_xip_mem(,,0,,) succeeded.  If get_xip_mem() returns -ENODATA, it
   calls either get_xip_mem(,,1,,) or xip_sparse_page().  In this new
   function, it looks to me that get_block(,,,0) returns 0 for both cases
   (success and -ENODATA previously), which are dealt in the same way.  Is
   that right?  If so, is there any reason for the change?
  
  Yes, get_xip_mem() returned -ENODATA for a hole.  That was a suboptimal
  interface because filesystems are actually capable of returning more
  information than that, eg how long the hole is (ext4 *doesn't*, but I
  consider that to be a bug).
  
  I don't get to decide what the get_block() interface looks like.  It's the
  standard way that the VFS calls back into the filesystem and has been
  around for probably close to twenty years at this point.  I'm still trying
  to understand exactly what the contract is for get_blocks() ... I have
  a document that I'm working on to try to explain it, but it's tough going!
  
   Also, isn't it
   possible to call get_block(,,,1) even if get_block(,,,0) found a block?
  
  The code in question looks like this:
  
  	error = get_block(inode, block, &bh, 0);
  	if (error || bh.b_size < PAGE_SIZE)
  		goto sigbus;
  
  	if (!buffer_written(&bh) && !vmf->cow_page) {
  		if (vmf->flags & FAULT_FLAG_WRITE) {
  			error = get_block(inode, block, &bh, 1);
  
  where buffer_written is defined as:
  	return buffer_mapped(bh) && !buffer_unwritten(bh);
  
  Doing some boolean algebra, that's:
  
  if (!buffer_mapped || buffer_unwritten)
  
  In either case, we want to tell the filesystem that we're writing to
  this block.  At least, that's my current understanding of the get_block()
  interface.  I'm open to correction here!
 
 I've got a rewritten version on this that doesn't require two calls
 to get_block() that I wrote while prototyping the XFS code. It also
 fixes all the misunderstandings about what get_block() actually does
 and returns so it works correctly with XFS.
 
 I need to port it forward to your new patch set (hopefully later
 this week), so don't spend too much time trying to work out exactly
 what this code needs to do...

Here is a writeup from Matthew Wilcox describing the get_block() interface.

He sent this to me before Dave sent out the latest mail in this thread. :)

Corrections and updates are very welcome.

- Ross



get_block_t is used by the VFS to ask the filesystem to translate logical
blocks within a file to sectors on a block device.

typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);

get_block() must not be called simultaneously with the create flag set
for overlapping extents *** or is/was this a bug in ext2? ***

Despite the iblock argument having type sector_t, iblock is actually
in units of the file block size, not in units of 512-byte sectors.
iblock must not extend beyond i_size. *** um, looks like xfs permits
this ... ? ***

If there is no current mapping from the block to the media, one will be
created if 'create' is set to 1.  'create' should not be set to a value
other than '0' or '1'.

On entry, bh_result should have b_size set to the number of bytes that the
caller is interested in and b_state initialised to zero.  b_size should
be a multiple of the file block size.  On exit, bh_result describes a
physically contiguous extent starting at iblock.  b_size will not be
increased by get_block, but it may be decreased if the filesystem extent
is shorter than the extent requested.
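
As a rough sketch of how a caller might drive this interface (the function
name below is made up for illustration; error handling is minimal):

	/* Map the file block containing 'offset' without allocating anything. */
	static int map_one_block_sketch(struct inode *inode, loff_t offset,
					struct buffer_head *bh,
					get_block_t get_block)
	{
		sector_t iblock = offset >> inode->i_blkbits;
		int error;

		memset(bh, 0, sizeof(*bh));
		bh->b_size = 1 << inode->i_blkbits;	/* bytes we care about */

		error = get_block(inode, iblock, bh, 0);	/* create == 0 */
		if (error)
			return error;

		if (buffer_mapped(bh)) {
			/* b_bdev/b_blocknr give the physical location; b_size
			 * may have been trimmed to the extent length. */
		} else {
			/* A hole; b_size may (or may not) describe its length. */
		}
		return 0;
	}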

If bh_result describes an extent that is allocated, then BH_Mapped will
be set, and b_bdev and b_blocknr will be set to indicate the physical
location on the media.  The filesystem may wish to use map_bh() in order
to set BH_Mapped and initialise b_bdev, b_blocknr and b_size.  If the
filesystem knows that the extent is read into memory (eg because it
decided to populate the page cache as part of its get_block operation),
it should set BH_Uptodate.

If bh_result describes a hole, the filesystem should clear BH_Mapped and
set BH_Uptodate.  It will not set b_bdev or b_blocknr, but it should set
b_size to indicate the length of the hole.  The filesystem may also opt to
leave bh_result untouched for a hole, but the VFS can optimise some
operations if the filesystem reports the length of the hole as described
above. *** or is this a bug in ext4? ***

If bh_result describes an extent which has data in the pagecache, but
that data has not yet had space allocated on the media (due to delayed
allocation), BH_Mapped, BH_Uptodate and BH_Delay 

Re: [PATCH v5 19/22] ext4: Add XIP functionality

2014-02-11 Thread Ross Zwisler
On Wed, 15 Jan 2014, Matthew Wilcox wrote:
 From: Ross Zwisler ross.zwis...@linux.intel.com
 
 This is a port of the XIP functionality found in the current version of
 ext2.
 
 Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
 Reviewed-by: Andreas Dilger andreas.dil...@intel.com
 [heavily tweaked]
 Signed-off-by: Matthew Wilcox matthew.r.wil...@intel.com

...

 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
 index c767666..8b73d77 100644
 --- a/fs/ext4/inode.c
 +++ b/fs/ext4/inode.c
 @@ -663,6 +663,18 @@ found:
   WARN_ON(1);
   }
  
 + /* this is probably wrong for ext4.  unlike ext2, ext4 supports
 +  * uninitialised extents, so we should probably be hooking
 +  * into the make it initialised code instead. */
 + if (IS_XIP(inode)) {

With the very first version of this patch the above logic seemed to work
correctly, zeroing blocks as we allocated them.  With the current XIP
infrastructure based tightly on direct IO this ends up being wrong because in
some cases we can call ext4_map_blocks() twice for a given block.  

A quick userland test program that creates a new file, truncates it up to 4k
and then does a partial block write will end up giving you a file filled with
all zeros.  This is because we zero the data before the write, do the write,
and then zero again, overwriting the data.  The second call to
ext4_map_blocks() happens via ext4_ext_direct_IO() =>
ext4_convert_unwritten_extents() => ext4_map_blocks().
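
For reference, a minimal userland reproducer along these lines (the mount
point and fill byte are just illustrative):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[2048];
		int fd = open("/mnt/xip/testfile",
			      O_CREAT | O_RDWR | O_TRUNC, 0644);

		if (fd < 0)
			return 1;
		if (ftruncate(fd, 4096) < 0)		/* truncate up to 4k */
			return 1;
		memset(buf, 0xab, sizeof(buf));
		if (write(fd, buf, sizeof(buf)) != 2048)	/* partial block write */
			return 1;
		/* With the double zeroing, reading the file back gives all zeros. */
		close(fd);
		return 0;
	}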

We can know in ext4_map_blocks() that we are being called after a write has
already completed by looking at the flags.  One solution to get around this
double-zeroing would be to change the above test to:

+	if (IS_XIP(inode) && !(flags & EXT4_GET_BLOCKS_CONVERT)) {

This fixes the tests I've been able to come up with, but I'm not certain it's
the correct fix for the long term.  It seems wasteful to zero the blocks we're
allocating, just to have the zeros overwritten immediately by a write.  Maybe
a cleaner way would be to try and zero the unwritten bits inside of
ext4_convert_unwritten_extents(), or somewhere similar?

It's worth noting that I don't think the direct I/O path has this kind of
logic because they don't allow partial block writes.  The regular I/O path
knows to zero unwritten space based on the BH_New flag, as set via the
set_buffer_new() call in ext4_da_map_blocks().  This is a pretty different I/O
path, though, so I'm not sure how much we can borrow for the XIP code.

Thoughts on the correct fix?

- Ross


Re: [PATCH v5 06/22] Treat XIP like O_DIRECT

2014-02-12 Thread Ross Zwisler
On Wed, 15 Jan 2014, Matthew Wilcox wrote:
 Instead of separate read and write methods, use the generic AIO
 infrastructure.  In addition to giving us support for AIO, this adds
 the locking between read() and truncate() that was missing.
 
 Signed-off-by: Matthew Wilcox matthew.r.wil...@intel.com

...

 +static ssize_t xip_io(int rw, struct inode *inode, const struct iovec *iov,
 +		loff_t start, loff_t end, unsigned nr_segs,
 +		get_block_t get_block, struct buffer_head *bh)
 +{
 +	ssize_t retval = 0;
 +	unsigned seg = 0;
 +	unsigned len;
 +	unsigned copied = 0;
 +	loff_t offset = start;
 +	loff_t max = start;
 +	void *addr;
 +	bool hole = false;
 +
 +	while (offset < end) {
 +		void __user *buf = iov[seg].iov_base + copied;
 +
 +		if (max == offset) {
 +			sector_t block = offset >> inode->i_blkbits;
 +			long size;
 +			memset(bh, 0, sizeof(*bh));
 +			bh->b_size = ALIGN(end - offset, PAGE_SIZE);
 +			retval = get_block(inode, block, bh, rw == WRITE);
 +			if (retval)
 +				break;
 +			if (buffer_mapped(bh)) {
 +				retval = xip_get_addr(inode, bh, &addr);
 +				if (retval < 0)
 +					break;
 +				addr += offset - (block << inode->i_blkbits);
 +				hole = false;
 +				size = retval;
 +			} else {
 +				if (rw == WRITE) {
 +					retval = -EIO;
 +					break;
 +				}
 +				addr = NULL;
 +				hole = true;
 +				size = bh->b_size;
 +			}
 +			max = offset + size;
 +		}
 +
 +		len = min_t(unsigned, iov[seg].iov_len - copied, max - offset);
 +
 +		if (rw == WRITE)
 +			len -= __copy_from_user_nocache(addr, buf, len);
 +		else if (!hole)
 +			len -= __copy_to_user(buf, addr, len);
 +		else
 +			len -= __clear_user(buf, len);
 +
 +		if (!len)
 +			break;
 +
 +		offset += len;
 +		copied += len;
 +		if (copied == iov[seg].iov_len) {
 +			seg++;
 +			copied = 0;
 +		}
 +	}
 +
 +	return (offset == start) ? retval : offset - start;
 +}

xip_io() as it is currently written has an issue where reads can go beyond
inode-i_size.  A quick test to show this issue is:

create a new file
write to the file for 1/2 a block
seek back to 0
read for a full block

The read in this case will return 4096, the length of the full block that was
requested.  It should return 2048, reading just the data that was written.

The issue is that we do have a full block allocated in ext4, we do have it
available via xip_get_addr(), and the only extra check that we
currently have is a check against iov_len.  iov_len in this case is 4096, so
no one stops us from doing a full block read.
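
A quick userland version of that test, for reference (the path and fill
byte are illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char wbuf[2048], rbuf[4096];
		ssize_t n;
		int fd = open("/mnt/xip/testfile",
			      O_CREAT | O_RDWR | O_TRUNC, 0644);

		if (fd < 0)
			return 1;
		memset(wbuf, 0xab, sizeof(wbuf));
		write(fd, wbuf, sizeof(wbuf));		/* 1/2 a 4k block */
		lseek(fd, 0, SEEK_SET);
		n = read(fd, rbuf, sizeof(rbuf));	/* full block read */
		printf("read returned %zd, expected 2048\n", n);
		close(fd);
		return 0;
	}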

Here is a quick patch that fixes this issue:

diff --git a/fs/xip.c b/fs/xip.c
index e902593..1608f29 100644
--- a/fs/xip.c
+++ b/fs/xip.c
@@ -91,13 +91,16 @@ static ssize_t xip_io(int rw, struct inode *inode, const struct
 {
 	ssize_t retval = 0;
 	unsigned seg = 0;
-	unsigned len;
+	unsigned len, total_len;
 	unsigned copied = 0;
 	loff_t offset = start;
 	loff_t max = start;
 	void *addr;
 	bool hole = false;
 
+	end = min(end, inode->i_size);
+	total_len = end - start;
+
 	while (offset < end) {
 		void __user *buf = iov[seg].iov_base + copied;
 
@@ -136,6 +139,7 @@ static ssize_t xip_io(int rw, struct inode *inode, const struct
 		}
 
 		len = min_t(unsigned, iov[seg].iov_len - copied, max - offset);
+		len = min(len, total_len);
 
 		if (rw == WRITE)
 			len -= __copy_from_user_nocache(addr, buf, len);
@@ -149,6 +153,7 @@ static ssize_t xip_io(int rw, struct inode *inode, const struct
 
 		offset += len;
 		copied += len;
+		total_len -= len;
 		if (copied == iov[seg].iov_len) {
 			seg++;
 			copied = 0;


Re: [PATCH v5 19/22] ext4: Add XIP functionality

2014-02-12 Thread Ross Zwisler
On Tue, 11 Feb 2014, Ross Zwisler wrote:
 On Wed, 15 Jan 2014, Matthew Wilcox wrote:
  From: Ross Zwisler ross.zwis...@linux.intel.com
  
  This is a port of the XIP functionality found in the current version of
  ext2.
  
  Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
  Reviewed-by: Andreas Dilger andreas.dil...@intel.com
  [heavily tweaked]
  Signed-off-by: Matthew Wilcox matthew.r.wil...@intel.com
 
 ...
 
  diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
  index c767666..8b73d77 100644
  --- a/fs/ext4/inode.c
  +++ b/fs/ext4/inode.c
  @@ -663,6 +663,18 @@ found:
  WARN_ON(1);
  }
   
  +   /* this is probably wrong for ext4.  unlike ext2, ext4 supports
  +* uninitialised extents, so we should probably be hooking
  +* into the make it initialised code instead. */
  +   if (IS_XIP(inode)) {
 
 With the very first version of this patch the above logic seemed to work
 correctly, zeroing blocks as we allocated them.  With the current XIP
 infrastructure based tightly on direct IO this ends up being wrong because in
 some cases we can call ext4_map_blocks() twice for a given block.  
 
 A quick userland test program that creates a new file, truncates it up to 4k
 and then does a partial block write will end up giving you a file filled with
 all zeros.  This is because we zero the data before the write, do the write,
 and then zero again, overwriting the data.  The second call to
 ext4_map_blocks() happens via ext4_ext_direct_IO() =>
 ext4_convert_unwritten_extents() => ext4_map_blocks().
 
 We can know in ext4_map_blocks() that we are being called after a write has
 already completed by looking at the flags.  One solution to get around this
 double-zeroing would be to change the above test to:
 
 +	if (IS_XIP(inode) && !(flags & EXT4_GET_BLOCKS_CONVERT)) {
 
 This fixes the tests I've been able to come up with, but I'm not certain it's
 the correct fix for the long term.  It seems wasteful to zero the blocks we're
 allocating, just to have the zeros overwritten immediately by a write.  Maybe
 a cleaner way would be to try and zero the unwritten bits inside of
 ext4_convert_unwritten_extents(), or somewhere similar?
 
 It's worth noting that I don't think the direct I/O path has this kind of
 logic because they don't allow partial block writes.  The regular I/O path
 knows to zero unwritten space based on the BH_New flag, as set via the
 set_buffer_new() call in ext4_da_map_blocks().  This is a pretty different I/O
 path, though, so I'm not sure how much we can borrow for the XIP code.
 
 Thoughts on the correct fix?
 
 - Ross

It looks like Dave Chinner outlined a way to deal with this in response to the
"[PATCH v5 00/22] Rewrite XIP code and add XIP support to ext4" mail.

I'll try and implement things as Dave has described (zero full blocks in the
case of xip_fault() and mark extents as written, use buffer_new(bh) to zero
edges for normal I/O) and send out code or questions as I have them.

- Ross


Re: [PATCH v6 20/22] ext4: Add DAX functionality

2014-03-18 Thread Ross Zwisler
On Tue, 25 Feb 2014, Matthew Wilcox wrote:
 From: Ross Zwisler ross.zwis...@linux.intel.com
 
 This is a port of the DAX functionality found in the current version of
 ext2.
 
 Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
 Reviewed-by: Andreas Dilger andreas.dil...@intel.com
 [heavily tweaked]
 Signed-off-by: Matthew Wilcox matthew.r.wil...@intel.com

...

 diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
 index 594009f..dbdacef 100644
 --- a/fs/ext4/indirect.c
 +++ b/fs/ext4/indirect.c
 @@ -686,15 +686,22 @@ retry:
   inode_dio_done(inode);
   goto locked;
   }
 - ret = __blockdev_direct_IO(rw, iocb, inode,
 -  inode->i_sb->s_bdev, iov,
 -  offset, nr_segs,
 -  ext4_get_block, NULL, NULL, 0);
 + if (IS_DAX(inode))
 + ret = dax_do_io(rw, iocb, inode, iov, offset, nr_segs,
 + ext4_get_block, NULL, 0);
 + else
 + ret = __blockdev_direct_IO(rw, iocb, inode,
 + inode->i_sb->s_bdev, iov, offset,
 + nr_segs, ext4_get_block, NULL, NULL, 0);
   inode_dio_done(inode);
   } else {
  locked:
 - ret = blockdev_direct_IO(rw, iocb, inode, iov,
 -  offset, nr_segs, ext4_get_block);
 + if (IS_DAX(inode))
 + ret = dax_do_io(rw, iocb, inode, iov, offset, nr_segs,
 + ext4_get_block, NULL, 0);

We need to pass in a DIO_LOCKING flag to this call to dax_do_io.  This flag is
provided correctly in ext2_direct_IO which is the only other place I found
where we have a call to dax_do_io as an alternative to blockdev_direct_IO.

The other calls to dax_do_io are alternatives to __blockdev_direct_IO, which
has an explicit flags parameter.  I believe all of these cases are being
handled correctly.
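
Concretely, I'd expect the locked path above to end up looking something
like this (sketch only, matching the argument order of the dax_do_io()
calls already in the patch):

	locked:
		if (IS_DAX(inode))
			ret = dax_do_io(rw, iocb, inode, iov, offset, nr_segs,
					ext4_get_block, NULL, DIO_LOCKING);
		else
			ret = blockdev_direct_IO(rw, iocb, inode, iov,
						 offset, nr_segs, ext4_get_block);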

- Ross


[PATCH] x86: remove obsolete comment in uapi/e820.h

2014-05-19 Thread Ross Zwisler
A comment introduced by this commit:

028b785888c5 ("x86 boot: extend some internal memory map arrays to handle
larger EFI input")

had to do with some nested preprocessor directives.  The directives were
split into separate files by this commit:

af170c5061dd ("UAPI: (Scripted) Disintegrate arch/x86/include/asm")

The comment explaining their interaction was retained and is now present
in arch/x86/include/uapi/asm/e820.h.  This comment is no longer correct,
so delete it.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com

Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: H. Peter Anvin h...@zytor.com
Cc: x...@kernel.org
---
 arch/x86/include/uapi/asm/e820.h | 5 -
 1 file changed, 5 deletions(-)

diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index bbae024..d993e33 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -21,11 +21,6 @@
  * this size.
  */
 
-/*
- * Odd: 'make headers_check' complains about numa.h if I try
- * to collapse the next two #ifdef lines to a single line:
- * #if defined(__KERNEL__) && defined(CONFIG_EFI)
- */
 #ifndef __KERNEL__
 #define E820_X_MAX E820MAX
 #endif
-- 
1.9.0



[PATCH] drm: Missed clflushopt in drm_clflush_virt_range

2014-05-14 Thread Ross Zwisler
With this commit:

2a0788dc9bc4 ("x86: Use clflushopt in drm_clflush_virt_range")

If clflushopt is available on the system, we use it instead of clflush
in drm_clflush_virt_range.  There were two calls to clflush in this
function, but only one was changed to clflushopt.  This patch changes
the other clflush call to clflushopt.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Reported-by: Matthew Wilcox matthew.r.wil...@intel.com

Cc: David Airlie airl...@linux.ie
Cc: dri-de...@lists.freedesktop.org
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de

---
 drivers/gpu/drm/drm_cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 534cb89..041b73b 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -138,7 +138,7 @@ drm_clflush_virt_range(char *addr, unsigned long length)
 		char *end = addr + length;
 		mb();
 		for (; addr < end; addr += boot_cpu_data.x86_clflush_size)
-			clflush(addr);
+			clflushopt(addr);
 		clflushopt(end - 1);
 		mb();
 		return;
-- 
1.9.0



Re: [PATCH v5 22/22] XIP: Add support for unwritten extents

2014-01-22 Thread Ross Zwisler
On Wed, 15 Jan 2014, Matthew Wilcox wrote:

 static ssize_t xip_io(int rw, struct inode *inode, const struct iovec *iov,
 		loff_t start, loff_t end, unsigned nr_segs,
 		get_block_t get_block, struct buffer_head *bh)
 @@ -103,21 +109,29 @@ static ssize_t xip_io(int rw, struct inode *inode, const struct iovec *iov,
 			retval = get_block(inode, block, bh, rw == WRITE);
 			if (retval)
 				break;
 -			if (buffer_mapped(bh)) {
 -				retval = xip_get_addr(inode, bh, &addr);
 -				if (retval < 0)
 -					break;
 -				addr += offset - (block << inode->i_blkbits);
 -				hole = false;
 -				size = retval;
 -			} else {
 -				if (rw == WRITE) {
 +			if (rw == WRITE) {
 +				if (!buffer_mapped(bh)) {
 					retval = -EIO;
 					break;
 				}
 +				hole = false;
 +			} else {
 +				hole = !buffer_written(bh);
 +			}
 +
 +			if (hole) {
 				addr = NULL;
 -				hole = true;
 				size = bh->b_size;
 +			} else {
 +				unsigned first;
 +				retval = xip_get_addr(inode, bh, &addr);
 +				if (retval < 0)
 +					break;
 +				size = retval;
 +				first = offset - (block << inode->i_blkbits);
 +				if (buffer_unwritten(bh))
 +					memset(addr, 0, first);
 +				addr += first;

+   size -= first;

This is needed so that we don't overrun the XIP buffer we are given in the
event that our user buffer >= our XIP buffer and the start of our I/O isn't
block aligned.
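
A quick worked example, with a 4096-byte block size: an I/O starting 512
bytes into a block gives first = 512, and xip_get_addr() returns the size
of the whole block, 4096.  Only 4096 - 512 = 3584 bytes remain past addr,
so without the subtraction a large enough user buffer lets the copy run 512
bytes past the end of the XIP buffer.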

You can add my 
Reviewed-by: Ross Zwisler ross.zwis...@linux.intel.com 

Thanks,
- Ross


Re: [PATCH v5 22/22] XIP: Add support for unwritten extents

2014-01-23 Thread Ross Zwisler
On Thu, 23 Jan 2014, Matthew Wilcox wrote:
 On Wed, Jan 22, 2014 at 03:51:56PM -0700, Ross Zwisler wrote:
  +			if (hole) {
  				addr = NULL;
  -				hole = true;
  				size = bh->b_size;
  +			} else {
  +				unsigned first;
  +				retval = xip_get_addr(inode, bh, &addr);
  +				if (retval < 0)
  +					break;
  +				size = retval;
  +				first = offset - (block << inode->i_blkbits);
  +				if (buffer_unwritten(bh))
  +					memset(addr, 0, first);
  +				addr += first;
  
  +   size -= first;
  
  This is needed so that we don't overrun the XIP buffer we are given in the
   event that our user buffer >= our XIP buffer and the start of our I/O isn't
  block aligned.
 
 You're right!  Thank you!  However, we also need it for the hole ==
 true case, don't we?  So maybe something like this, incrementally on top of
 patch 22/22:
 
 P.S. Can someone come up with a better name for this variable than 'first'?
 I'd usually use 'offset', but that's already taken.  'annoying_bit' seems a
 bit judgemental.  'misaligned', maybe?  'skip' or 'seek' like dd uses?
 
 diff --git a/fs/xip.c b/fs/xip.c
 index 92157ff..1ae00db 100644
 --- a/fs/xip.c
 +++ b/fs/xip.c
  @@ -103,6 +103,7 @@ static ssize_t xip_io(int rw, struct inode *inode, const struct iovec *iov,
  
  		if (max == offset) {
  			sector_t block = offset >> inode->i_blkbits;
  +			unsigned first = offset - (block << inode->i_blkbits);
  			long size;
  			memset(bh, 0, sizeof(*bh));
  			bh->b_size = ALIGN(end - offset, PAGE_SIZE);
  @@ -121,14 +122,12 @@ static ssize_t xip_io(int rw, struct inode *inode, const struct iovec *iov,
  
  		if (hole) {
  			addr = NULL;
  -			size = bh->b_size;
  +			size = bh->b_size - first;
  		} else {
  -			unsigned first;
  			retval = xip_get_addr(inode, bh, &addr);
  			if (retval < 0)
  				break;
  -			size = retval;
  -			first = offset - (block << inode->i_blkbits);
  +			size = retval - first;
  			if (buffer_unwritten(bh))
  				memset(addr, 0, first);
  			addr += first;

Yep, this seems right to me.

Maybe misalignment?  Seems more descriptive (if a bit long), but I don't
know if there are other, better existing conventions.

- Ross


Re: [PATCH v5 19/22] ext4: Add XIP functionality

2014-01-16 Thread Ross Zwisler
On Wed, 15 Jan 2014, Matthew Wilcox wrote:

 +#ifdef CONFIG_FS_XIP
 +const struct file_operations ext4_xip_file_operations = {
 + .llseek = ext4_llseek,
 + .read   = do_sync_read,
 + .write  = do_sync_write,

I think we may always need to define ext2_xip_file_operations and
ext4_xip_file_operations, even if we have XIP compiled out.  We make the
decision on which file operations table to use at runtime:

from ext4_iget:
	if (test_opt(inode->i_sb, XIP))
		inode->i_fop = &ext4_xip_file_operations;
	else
		inode->i_fop = &ext4_file_operations;

With CONFIG_FS_XIP undefined, we get a compile error:
ERROR: ext4_xip_file_operations [fs/ext4/ext4.ko] undefined!
ERROR: ext2_xip_file_operations [fs/ext2/ext2.ko] undefined!

My guess is that with the old ext2 XIP code and with the first pass of the ext4
XIP code, we weren't seeing this because the uses of the xip file operations
table were optimized out, removing the undefined symbol?
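
One possible alternative to always defining the tables would be to make the
choice visible to the compiler so the dead branch (and its reference to the
undefined symbol) can be discarded.  A sketch only, not tested, assuming the
extern declarations stay visible:

	/* IS_ENABLED() is a compile-time constant, so with CONFIG_FS_XIP unset
	 * the first branch and its symbol reference should be optimized away. */
	if (IS_ENABLED(CONFIG_FS_XIP) && test_opt(inode->i_sb, XIP))
		inode->i_fop = &ext4_xip_file_operations;
	else
		inode->i_fop = &ext4_file_operations;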

- Ross


Re: [PATCH v5 22/22] XIP: Add support for unwritten extents

2014-01-27 Thread Ross Zwisler
On Thu, 23 Jan 2014, Matthew Wilcox wrote:
 On Wed, Jan 22, 2014 at 03:51:56PM -0700, Ross Zwisler wrote:
   +			if (hole) {
   				addr = NULL;
   -				hole = true;
   				size = bh->b_size;
   +			} else {
   +				unsigned first;
   +				retval = xip_get_addr(inode, bh, &addr);
   +				if (retval < 0)
   +					break;
   +				size = retval;
   +				first = offset - (block << inode->i_blkbits);
   +				if (buffer_unwritten(bh))
   +					memset(addr, 0, first);
   +				addr += first;
  
  +   size -= first;
  
  This is needed so that we don't overrun the XIP buffer we are given in the
   event that our user buffer >= our XIP buffer and the start of our I/O isn't
  block aligned.
 
 You're right!  Thank you!  However, we also need it for the hole ==
 true case, don't we?  So maybe something like this, incrementally on top of
 patch 22/22:
 
 P.S. Can someone come up with a better name for this variable than 'first'?
 I'd usually use 'offset', but that's already taken.  'annoying_bit' seems a
 bit judgemental.  'misaligned', maybe?  'skip' or 'seek' like dd uses?
 
 diff --git a/fs/xip.c b/fs/xip.c
 index 92157ff..1ae00db 100644
 --- a/fs/xip.c
 +++ b/fs/xip.c
 @@ -103,6 +103,7 @@ static ssize_t xip_io(int rw, struct inode *inode, const
 struct iovec *iov,
  
  		if (max == offset) {
  			sector_t block = offset >> inode->i_blkbits;
  +			unsigned first = offset - (block << inode->i_blkbits);
  			long size;
  			memset(bh, 0, sizeof(*bh));
  			bh->b_size = ALIGN(end - offset, PAGE_SIZE);
  @@ -121,14 +122,12 @@ static ssize_t xip_io(int rw, struct inode *inode, const struct iovec *iov,
  
  		if (hole) {
  			addr = NULL;
  -			size = bh->b_size;
  +			size = bh->b_size - first;

It looks like we have an additional bit of complexity with the hole case.  The
issue is that for holes, bh->b_size is just the full size of the write as set
earlier in the function:

	bh->b_size = ALIGN(end - offset, PAGE_SIZE);

From this code it seems like you hoped the call into get_block() would adjust
bh->b_size to the size of the hole, allowing you to zero just the hole space
in the user buffer.  It doesn't look like it does, though, at least for ext4.
In looking at the direct I/O case (do_direct_IO()), they deal with holes on a
per-FS-block basis, and don't ever look at bh->b_size once they've figured out
the buffer is unmapped.

The result of this is that when you get a read that starts at a hole but moves
into real data, the read will just see a hole and return data of all zeros.

To just assume the current FS block is a hole, we can do something like this:

diff --git a/fs/xip.c b/fs/xip.c
index 35e401e..e902593 100644
--- a/fs/xip.c
+++ b/fs/xip.c
@@ -122,7 +122,7 @@ static ssize_t xip_io(int rw, struct inode *inode, const struct
 
 		if (hole) {
 			addr = NULL;
-			size = bh->b_size - first;
+			size = (1 << inode->i_blkbits) - first;
 		} else {
 			retval = xip_get_addr(inode, bh, &addr);
 			if (retval < 0)



Re: [PATCH v5 00/22] Rewrite XIP code and add XIP support to ext4

2014-01-30 Thread Ross Zwisler
On Fri, 31 Jan 2014, Dave Chinner wrote:
 The read/write path is broken, Willy. We can't map arbitrary byte
 ranges to the DIO subsystem. I'm now certain that the data
 corruptions I'm seeing are in sub-sector regions from unaligned IOs
 from userspace. We still need to use the buffered IO path for non
 O_DIRECT IO to avoid these problems. I think I've worked out a way
 to short-circuit page cache lookups for the buffered IO path, so
 stay tuned

Hi Dave,

I found an issue that would cause reads to return bad data earlier this week,
and sent a response to "[PATCH v5 22/22] XIP: Add support for unwritten
extents".  Just wanted to make sure you're not running into that issue.

I'm also currently chasing a write corruption where we lose the data that we
had just written because ext4 thinks the portion of the extent we had just
written needs to be converted from an unwritten extent to a written extent, so
it clears the data to all zeros via:

xip_clear_blocks+0x53/0xd7
ext4_map_blocks+0x306/0x3d9 [ext4]
jbd2__journal_start+0xbd/0x188 [jbd2]
ext4_convert_unwritten_extents+0xf9/0x1ac [ext4]
ext4_direct_IO+0x2ca/0x3a5 [ext4]

This bug can be easily reproduced by fallocating an empty file up to a page,
and then writing into that page.  The first write is essentially lost, and the
page remains all zeros.  Subsequent writes succeed.
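
For reference, the reproducer is basically (the mount point and fill byte
are illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[512];
		int fd = open("/mnt/xip/testfile",
			      O_CREAT | O_RDWR | O_TRUNC, 0644);

		if (fd < 0)
			return 1;
		if (fallocate(fd, 0, 0, 4096) < 0)	/* unwritten extent */
			return 1;
		memset(buf, 0xab, sizeof(buf));
		write(fd, buf, sizeof(buf));	/* this first write is lost */
		close(fd);
		return 0;
	}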

I'm still in the process of figuring out exactly why this is happening, but
unfortunately I won't be able to look at again until next week.  I don't know
if it's related to the corruption that you're seeing or not, just wanted to
let you know.

- Ross


Re: [PATCH 1/2] brd: Fix the partitions BUG

2014-07-30 Thread Ross Zwisler
On Wed, 2014-07-30 at 17:15 +0300, Boaz Harrosh wrote:
 With current code after a call to:
   bdev = blkdev_get_by_path(dev_name, mode, fs_type);
 	size = i_size_read(bdev->bd_inode);
 	part_size = bdev->bd_part->nr_sects << 9;
 
 I get the following bad results:
 dev_name == /dev/ram0
   foo: [foo_mount:880] size=0x4000 bdev=88003dc24340 \
   bd_inode=88003dc24430 bd_part=88003ca22848 part_size=0x4000
 dev_name == /dev/ram0p1
   foo: [foo_mount:880] size=0x4000 bdev=88003d2f6d80 \
   bd_inode=88003d2f6e70 bd_part=88003ca22848 part_size=0x4000
 dev_name == /dev/ram0p2
   foo: [foo_mount:880] size=0x4000 bdev=88003dc24680 \
   bd_inode=88003dc24770 bd_part=88003ca22848 part_size=0x4000
 Note how all three bdev(s) point to the same bd_part.
 
 This is due to a single bad clobber in brd_probe() which is
 removed in this patch:
 - *part = 0;
 
 because of this all 3 bdev(s) above get to point to the same bd_part[0]
 
 While at it fix/rename brd_init_one() since all devices are created on
 load of driver, brd_probe() will never be called with a new un-created
 device.
 brd_init_one() is now renamed to brd_find() which is what it does.
 
 TODO: There is one more partitions BUG regarding
   brd_direct_access() which is fixed on the next patch.
 
 Signed-off-by: Boaz Harrosh b...@plexistor.com
 ---
  drivers/block/brd.c | 19 ---
  1 file changed, 8 insertions(+), 11 deletions(-)
 
 diff --git a/drivers/block/brd.c b/drivers/block/brd.c
 index c7d138e..92334f6 100644
 --- a/drivers/block/brd.c
 +++ b/drivers/block/brd.c
 @@ -523,22 +523,20 @@ static void brd_free(struct brd_device *brd)
   kfree(brd);
  }
  
 -static struct brd_device *brd_init_one(int i)
 +static struct brd_device *brd_find(int i)
  {
   struct brd_device *brd;
  
 	list_for_each_entry(brd, &brd_devices, brd_list) {
 		if (brd->brd_number == i)
 -			goto out;
 +			return brd;
 	}
 
 -	brd = brd_alloc(i);
 -	if (brd) {
 -		add_disk(brd->brd_disk);
 -		list_add_tail(&brd->brd_list, &brd_devices);
 -	}
 -out:
 -	return brd;
 +	/* brd always allocates all its devices at load time, therefor
 +	 * brd_probe will never be called with a new brd_number
 +	 */
 +	printk(KERN_EROR "brd: brd_find unexpected device %d\n", i);

s/KERN_EROR/KERN_ERR/

 + return NULL;
  }
  
  static void brd_del_one(struct brd_device *brd)
 @@ -554,11 +552,10 @@ static struct kobject *brd_probe(dev_t dev, int *part, 
 void *data)
   struct kobject *kobj;
  
 	mutex_lock(&brd_devices_mutex);
 -	brd = brd_init_one(MINOR(dev) >> part_shift);
 +	brd = brd_find(MINOR(dev) >> part_shift);
 	kobj = brd ? get_disk(brd->brd_disk) : NULL;
 	mutex_unlock(&brd_devices_mutex);
 
 -	*part = 0;
 	return kobj;
  }

It is possible to create new block devices with BRD at runtime:

# mknod /dev/new_brd b 1 4 
# fdisk -l /dev/new_brd

This causes a new BRD disk to be created, and hits your error case:

Jul 30 10:40:57 alara kernel: brd: brd_find unexpected device 4

I guess in general I'm not saying that BRD needs to have partitions - indeed
it may not give you much in the way of added functionality.  As the code
currently stands partitions aren't surfaced anyway
(GENHD_FL_SUPPRESS_PARTITION_INFO is set).  For PRD, however, I *do* want to
enable partitions correctly because eventually I'd like to enhance PRD so that
it *does* actually handle NVDIMMs correctly, and for that partitions do make
sense.  And if I have to implement and debug partitions for PRD, it's easy to
stick them in BRD in case anyone wants to use them.

- Ross




Re: [PATCH v9 00/22] Support ext4 on NV-DIMMs

2014-08-01 Thread Ross Zwisler
On Fri, 2014-08-01 at 09:27 -0400, Matthew Wilcox wrote:
 From: Matthew Wilcox wi...@linux.intel.com
 
 One of the primary uses for NV-DIMMs is to expose them as a block device
 and use a filesystem to store files on the NV-DIMM.  While that works,
 it currently wastes memory and CPU time buffering the files in the page
 cache.  We have support in ext2 for bypassing the page cache, but it
 has some races which are unfixable in the current design.  This series
 of patches rewrite the underlying support, and add support for direct
 access to ext4.
 
 This iteration of the patchset rebases to 3.16-rc7 and makes substantial
 changes based on feedback from Jan Kara, Boaz Harrosh and Kirill Shutemov:
 
  - Fixes a double-unlock on i_mmap_mutex
  - Switch the order of calling delete_from_page_cache() and
unmap_mapping_range() to match the truncate path
  - Make dax_mkwrite a macro (Kirill)
  - Drop vm_replace_mixed(); instead call unmap_mapping_range() before calling
vm_insert_mixed() (Kirill)
  - Avoid lock inversion between i_mmap_mutex and transaction start (Jan)
  - Move alignment  length checks into bdev_direct_access() (Boaz)
  - Fix bugs in COW code; unfortunately this means reintroducing the knowledge
that the i_mmap_mutex protects PFNs to the core MM code.
 
 Jan Kara (1):
   ext4: Avoid lock inversion between i_mmap_mutex and transaction start
 
 Matthew Wilcox (20):
   axonram: Fix bug in direct_access
   Change direct_access calling convention
   Fix XIP fault vs truncate race
   Allow page fault handlers to perform the COW
   Introduce IS_DAX(inode)
   Add copy_to_iter(), copy_from_iter() and iov_iter_zero()
   Replace XIP read and write with DAX I/O
   Replace ext2_clear_xip_target with dax_clear_blocks
   Replace the XIP page fault handler with the DAX page fault handler
   Replace xip_truncate_page with dax_truncate_page
   Replace XIP documentation with DAX documentation
   Remove get_xip_mem
   ext2: Remove ext2_xip_verify_sb()
   ext2: Remove ext2_use_xip
   ext2: Remove xip.c and xip.h
   Remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to CONFIG_FS_DAX
   ext2: Remove ext2_aops_xip
   Get rid of most mentions of XIP in ext2
   xip: Add xip_zero_page_range
   brd: Rename XIP to DAX
 
 Ross Zwisler (1):
   ext4: Add DAX functionality
 
  Documentation/filesystems/Locking  |   3 -
  Documentation/filesystems/dax.txt  |  91 +++
  Documentation/filesystems/ext4.txt |   2 +
  Documentation/filesystems/xip.txt  |  68 --
  arch/powerpc/sysdev/axonram.c  |  19 +-
  drivers/block/Kconfig  |  13 +-
  drivers/block/brd.c|  26 +-
  drivers/s390/block/dcssblk.c   |  21 +-
  fs/Kconfig |  21 +-
  fs/Makefile|   1 +
  fs/block_dev.c |  34 +++
  fs/dax.c   | 476 
  fs/exofs/inode.c   |   1 -
  fs/ext2/Kconfig|  11 -
  fs/ext2/Makefile   |   1 -
  fs/ext2/ext2.h |  10 +-
  fs/ext2/file.c |  45 +++-
  fs/ext2/inode.c|  38 +--
  fs/ext2/namei.c|  13 +-
  fs/ext2/super.c|  53 ++--
  fs/ext2/xip.c  |  91 ---
  fs/ext2/xip.h  |  26 --
  fs/ext4/ext4.h |   6 +
  fs/ext4/file.c |  53 +++-
  fs/ext4/indirect.c |  18 +-
  fs/ext4/inode.c|  65 +++--
  fs/ext4/namei.c|  10 +-
  fs/ext4/super.c|  39 ++-
  fs/open.c  |   5 +-
  include/linux/blkdev.h |   6 +-
  include/linux/fs.h |  49 +++-
  include/linux/mm.h |   1 +
  include/linux/uio.h|   3 +
  mm/Makefile|   1 -
  mm/fadvise.c   |   6 +-
  mm/filemap.c   |   6 +-
   mm/filemap_xip.c   | 483 -
  mm/iov_iter.c  | 237 --
  mm/madvise.c   |   2 +-
  mm/memory.c|  33 ++-
  40 files changed, 1206 insertions(+), 881 deletions(-)
  create mode 100644 Documentation/filesystems/dax.txt
  delete mode 100644 Documentation/filesystems/xip.txt
  create mode 100644 fs/dax.c
  delete mode 100644 fs/ext2/xip.c
  delete mode 100644 fs/ext2/xip.h
  delete mode 100644 mm/filemap_xip.c

I've updated the master branch of PRD's GitHub repo
(https://github.com/01org/prd) so that it is Linus's tip + DAX v9 + PRD.

I've also added a patch to PRD to enable dynamic allocation of partition
numbers.

- Ross



Re: [PATCH] x86: remove obsolete comment in uapi/e820.h

2014-08-20 Thread Ross Zwisler
On Mon, 2014-05-19 at 11:50 -0600, Ross Zwisler wrote:
 A comment introduced by this commit:
 
 028b785888c5 x86 boot: extend some internal memory map arrays to handle
   larger EFI input
 
 had to do with some nested preprocessor directives.  The directives were
 split into separate files by this commit:
 
 af170c5061dd  UAPI: (Scripted) Disintegrate arch/x86/include/asm
 
 The comment explaining their interaction was retained and is now present
 in arch/x86/include/uapi/asm/e820.h.  This comment is no longer correct,
 so delete it.
 
 Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
 
 Cc: Thomas Gleixner t...@linutronix.de
 Cc: Ingo Molnar mi...@redhat.com
 Cc: H. Peter Anvin h...@zytor.com
 Cc: x...@kernel.org
 ---
  arch/x86/include/uapi/asm/e820.h | 5 -
  1 file changed, 5 deletions(-)
 
 diff --git a/arch/x86/include/uapi/asm/e820.h 
 b/arch/x86/include/uapi/asm/e820.h
 index bbae024..d993e33 100644
 --- a/arch/x86/include/uapi/asm/e820.h
 +++ b/arch/x86/include/uapi/asm/e820.h
 @@ -21,11 +21,6 @@
   * this size.
   */
  
 -/*
 - * Odd: 'make headers_check' complains about numa.h if I try
 - * to collapse the next two #ifdef lines to a single line:
  - *   #if defined(__KERNEL__) && defined(CONFIG_EFI)
 - */
  #ifndef __KERNEL__
  #define E820_X_MAX E820MAX
  #endif

Ping.  :)




Re: [PATCH v8 00/22] Support ext4 on NV-DIMMs

2014-07-24 Thread Ross Zwisler
On Wed, 2014-07-23 at 15:50 -0400, Matthew Wilcox wrote:
 On Wed, Jul 23, 2014 at 06:58:38PM +0300, Boaz Harrosh wrote:
  Have you please pushed this tree to git hub. It used to be on the prd
  tree, if you could just add another branch there, it would be cool.
  (https://github.com/01org/prd)
 
 Ross handles the care  feeding of that tree ... he'll push that branch
 out soon.

I've updated the master branch of PRD's GitHub repo
(https://github.com/01org/prd) so that it is v3.16-rc6 + DAX v8 + PRD.

- Ross





Re: [PATCH 2/4] brd: Add getgeo to block ops

2014-08-06 Thread Ross Zwisler
On Wed, 6 Aug 2014, Boaz Harrosh wrote:
 From: Ross Zwisler ross.zwis...@linux.intel.com
 
 Some programs require HDIO_GETGEO work, which requires we implement
 getgeo.  Based off of the work done to the NVMe driver in this commit:
 
 4cc09e2dc4cb NVMe: Add getgeo to block ops
 
 [Boaz] Converted original work done for prd.c for here.
This is needed if we want to support partitions, fdisk
calls this.
 
 Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
 Signed-off-by: Boaz Harrosh b...@plexistor.com
 ---
  drivers/block/brd.c | 11 +++
  1 file changed, 11 insertions(+)
 
 diff --git a/drivers/block/brd.c b/drivers/block/brd.c
 index a10a0a9..3f07cb4 100644
 --- a/drivers/block/brd.c
 +++ b/drivers/block/brd.c
 @@ -19,6 +19,7 @@
  #include linux/radix-tree.h
  #include linux/fs.h
  #include linux/slab.h
 +#include linux/hdreg.h
  
  #include asm/uaccess.h
  
 @@ -424,6 +425,15 @@ static int brd_ioctl(struct block_device *bdev, fmode_t 
 mode,
   return error;
  }
  
 +static int brd_getgeo(struct block_device *bd, struct hd_geometry *geo)
 +{
 +	/* some standard values */
 +	geo->heads = 1 << 6;
 +	geo->sectors = 1 << 5;
 +	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
 +	return 0;
 +}
 +
  static const struct block_device_operations brd_fops = {
   .owner =THIS_MODULE,
   .rw_page =  brd_rw_page,
 @@ -431,6 +441,7 @@ static const struct block_device_operations brd_fops = {
  #ifdef CONFIG_BLK_DEV_XIP
   .direct_access =brd_direct_access,
  #endif
 + .getgeo =   brd_getgeo,
  };
  
  /*
 -- 
 1.9.3

This looks good.

- Ross


Re: [PATCH 4/4] brd: Request from fdisk 4k alignment

2014-08-06 Thread Ross Zwisler
On Wed, 2014-08-06 at 14:35 +0300, Boaz Harrosh wrote:
 Because of the direct_access() API which returns a PFN. partitions
 better start on 4K boundary, else offset ZERO of a partition will
 not be aligned and blk_direct_access() will fail the call.
 
 By setting blk_queue_physical_block_size(PAGE_SIZE) we can communicate
 this to fdisk and friends.
 Note that blk_queue_physical_block_size() also trashes io_min, but
 we can leave this one to be 512.
 
 Signed-off-by: Boaz Harrosh b...@plexistor.com
 ---
  drivers/block/brd.c | 7 +++
  1 file changed, 7 insertions(+)
 
 diff --git a/drivers/block/brd.c b/drivers/block/brd.c
 index 9673704..514cfe1 100644
 --- a/drivers/block/brd.c
 +++ b/drivers/block/brd.c
 @@ -495,10 +495,17 @@ static struct brd_device *brd_alloc(int i)
  	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
  	if (!brd->brd_queue)
  		goto out_free_dev;
  +
  	blk_queue_make_request(brd->brd_queue, brd_make_request);
  	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
  	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
  
  +	/* This is so fdisk will align partitions on 4k, because of
  +	 * direct_access API needing 4k alignment, returning a PFN
  +	 */
  +	blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
  +	brd->brd_queue->limits.io_min = 512; /* Don't use the accessor */
  +
  	brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
  	brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
  	brd->brd_queue->limits.discard_zeroes_data = 1;

Is there an error case that this patch fixes?  I've had page alignment checks
in my PRD direct_access code forever, and I don't know if they've ever
tripped.  

Also, blk_queue_physical_block_size() seems wrong - here's the comment for
that function:

/**
 * blk_queue_physical_block_size - set physical block size for the queue
 * @q:  the request queue for the device
 * @size:  the physical block size, in bytes
 *
 * Description:
 *   This should be set to the lowest possible sector size that the
 *   hardware can operate on without reverting to read-modify-write
 *   operations.
 */

It doesn't sound like this is what you're after?  It sounds like instead you
want to control the alignment, not minimum natural I/O size?  It seems like if
you did want to do this, blk_queue_alignment_offset() would be more what you
were after?

- Ross




Re: [PATCH 3/4] brd: Fix all partitions BUGs

2014-08-06 Thread Ross Zwisler
On Wed, 2014-08-06 at 14:33 +0300, Boaz Harrosh wrote:
 This patch fixes up brd's partitions scheme, now enjoying all worlds.
 
 The MAIN fix here is that currently if one fdisks some partitions,
 a BAD bug will make all partitions point to the same start-end sector
 ie: 0 - brd_size And an mkfs of any partition would trash the partition
 table and the other partition.
 
 Another fix is that mount -U uuid did not work, because of the
 GENHD_FL_SUPPRESS_PARTITION_INFO flag.
 
 So NOW the logic goes like this:
 * max_part - Just says how many minors to reserve between devices
   But in any way, there can be as many partition as requested.
   If minors between devices ends, then dynamic 259-major ids will
   be allocated on the fly.
   The default is now max_part=1, which means all partitions devt
   will be from the dynamic major-range.
   (If persistent partition minors is needed use max_part=)
 
 * Creation of new devices on the fly still/always work:
   mknod /path/devnod b 1 X
   fdisk -l /path/devnod
   Will create a new device if (X / max_part) was not already
   created before. (Just as before)
 
   partitions on the dynamically created device will work as well
   Same logic applies with minors as with the pre-created ones.
 
 TODO: dynamic grow of device size, maybe through sysfs. So each
   device can have it's own size.

With this patch we end up in what feels like a weird place where we're half
using the old scheme of major/minor allocation, and half in the world of
dynamic major/minors.  Devices have a major of 1 and minors that increment by
1, but partitions have a major of 259 (BLOCK_EXT_MAJOR):

brw-rw 1 root disk   1, 0 Aug  6 14:10 /dev/ram0
brw-rw 1 root disk   1, 1 Aug  6 14:13 /dev/ram1
brw-rw 1 root disk 259, 0 Aug  6 14:14 /dev/ram1p1
brw-rw 1 root disk 259, 1 Aug  6 14:13 /dev/ram1p2
brw-rw 1 root disk 259, 2 Aug  6 14:14 /dev/ram1p51

For NVMe and PRD you get a major of 259 all around:

brw-rw 1 root disk 259, 0 Aug  6 16:55 /dev/pmem0
brw-rw 1 root disk 259, 2 Aug  6 16:55 /dev/pmem0p1
brw-rw 1 root disk 259, 3 Aug  6 16:55 /dev/pmem0p2
brw-rw 1 root disk 259, 1 Aug  6 16:54 /dev/pmem1

It could be that this is fine, but it just smells fishy to me I guess.

Also, it looks like you can still create a new device with this patch, but you
can't create partitions on that device.  Not sure if this is just what you get
when you dynamically create a device on the fly, or if it's a symptom of
something larger.

- Ross

 
 Signed-off-by: Boaz Harrosh b...@plexistor.com
 ---
  drivers/block/brd.c | 93 +
  1 file changed, 36 insertions(+), 57 deletions(-)
 
 diff --git a/drivers/block/brd.c b/drivers/block/brd.c
 index 3f07cb4..9673704 100644
 --- a/drivers/block/brd.c
 +++ b/drivers/block/brd.c
 @@ -447,16 +447,18 @@ static const struct block_device_operations brd_fops = {
  /*
   * And now the modules code and kernel interface.
   */
 -static int rd_nr;
 -int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
 -static int max_part;
 -static int part_shift;
 +static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
  module_param(rd_nr, int, S_IRUGO);
  MODULE_PARM_DESC(rd_nr, Maximum number of brd devices);
 +
 +int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
  module_param(rd_size, int, S_IRUGO);
  MODULE_PARM_DESC(rd_size, Size of each RAM disk in kbytes.);
 +
 +static int max_part = 1;
  module_param(max_part, int, S_IRUGO);
 -MODULE_PARM_DESC(max_part, Maximum number of partitions per RAM disk);
 +MODULE_PARM_DESC(max_part, Num Minors to reserve between devices);
 +
  MODULE_LICENSE(GPL);
  MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
  MODULE_ALIAS(rd);
 @@ -502,15 +504,15 @@ static struct brd_device *brd_alloc(int i)
   brd-brd_queue-limits.discard_zeroes_data = 1;
   queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd-brd_queue);
  
 -	disk = brd->brd_disk = alloc_disk(1 << part_shift);
 +	disk = brd->brd_disk = alloc_disk(max_part);
 	if (!disk)
 		goto out_free_queue;
 	disk->major		= RAMDISK_MAJOR;
 -	disk->first_minor	= i << part_shift;
 +	disk->first_minor	= i * max_part;
 	disk->fops		= &brd_fops;
 	disk->private_data	= brd;
 	disk->queue		= brd->brd_queue;
 -	disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
 +	disk->flags = GENHD_FL_EXT_DEVT;
 	sprintf(disk->disk_name, "ram%d", i);
 	set_capacity(disk, rd_size * 2);
  
 @@ -532,10 +534,11 @@ static void brd_free(struct brd_device *brd)
   kfree(brd);
  }
  
 -static struct brd_device *brd_init_one(int i)
 +static struct brd_device *brd_init_one(int i, bool *new)
  {
   struct brd_device *brd;
  
 + *new = false;
  	list_for_each_entry(brd, &brd_devices, brd_list) {
  		if (brd->brd_number == i)
   goto out;
 @@ -546,6 +549,7 @@ static struct 

Re: [PATCH 3/4] brd: Fix all partitions BUGs

2014-08-07 Thread Ross Zwisler
On Thu, 2014-08-07 at 12:11 +0300, Boaz Harrosh wrote:
 On 08/07/2014 02:06 AM, Ross Zwisler wrote:
  Also, it looks like you can still create a new device with this patch, but 
  you
  can't create partitions on that device.  Not sure if this is just what you 
  get
  when you dynamically create a device on the fly, or if it's a symptom of
  something larger.
  
 
 What? I just tried again this all works fine for me, here with fdisk.
 $ modprobe brd  # will create ram0-7
 $ mknod /dev/ram8 b 1 8
 $ fdisk /dev/ram8
   g, n, , , +2M, n, , , , , w 
 
 I create 2 partitions 2M each and press w and it is all there.
 
 What numbers did you use ? rd_nr, max_part, and the mknod numbers. Here it
 just works fine. What did you try?

Ah - it turns out the issue was that I wasn't following the naming scheme
ramX where X is your new device name.  Here's the sequence:

# mknod /dev/ram_new b 1 6
# fdisk /dev/ram_new 
 <create some partitions>

This ends up creating a ram_new and a ram6, which have the same
major/minor.  The partitions do show up, but they live under ram6:

brw-rw---- 1 root disk   1, 6 Aug  7 12:36 ram6
brw-rw---- 1 root disk 259, 0 Aug  7 12:36 ram6p1
brw-rw---- 1 root disk 259, 1 Aug  7 12:36 ram6p2
brw-r--r-- 1 root root   1, 6 Aug  7 12:36 ram_new

You can run fdisk -l, etc, on ram_new, and it'll show you the partitions, they
just won't be surfaced in /dev.  ram6 and ram_new seem to be aliases:

# fdisk -l /dev/ram_new

Disk /dev/ram_new: 8589 MB, 8589934592 bytes
64 heads, 32 sectors/track, 8192 cylinders
Units = cylinders of 2048 * 512 = 1048576 bytes
Sector size (logical/physical): 512 bytes / 4096 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk identifier: 0x2812942c

      Device Boot      Start         End      Blocks   Id  System
/dev/ram_new1               1        2049     2098160   83  Linux
/dev/ram_new2            2050        8192     6290432   83  Linux

This device aliasing happened with the old BRD code as well, so this isn't new
behavior.

- Ross




Re: [PATCH 3/4] brd: Fix all partitions BUGs

2014-08-07 Thread Ross Zwisler
On Wed, 2014-08-06 at 14:33 +0300, Boaz Harrosh wrote:
 This patch fixes up brd's partitions scheme, now enjoying all worlds.
 
 The MAIN fix here is that currently if one fdisks some partitions,
 a BAD bug will make all partitions point to the same start-end sector
 ie: 0 - brd_size And an mkfs of any partition would trash the partition
 table and the other partition.
 
 Another fix is that mount -U uuid did not work, because of the
 GENHD_FL_SUPPRESS_PARTITION_INFO flag.
 
 So NOW the logic goes like this:
 * max_part - Just says how many minors to reserve between devices
   But in any case, there can be as many partitions as requested.
   If the minors between devices run out, then dynamic 259-major ids will
   be allocated on the fly.
   The default is now max_part=1, which means all partition devts
   will come from the dynamic major range.
   (If persistent partition minors is needed use max_part=)
 
 * Creation of new devices on the fly still/always works:
   mknod /path/devnod b 1 X
   fdisk -l /path/devnod
   Will create a new device if (X / max_part) was not already
   created before. (Just as before)
 
   Partitions on the dynamically created device will work as well.
   The same logic applies with minors as with the pre-created ones.
 
 TODO: dynamic growth of device size, maybe through sysfs, so each
   device can have its own size.
 
 Signed-off-by: Boaz Harrosh b...@plexistor.com


Tested-by: Ross Zwisler ross.zwis...@linux.intel.com




[PATCH 0/4] Add persistent memory driver

2014-08-27 Thread Ross Zwisler
PMEM is a modified version of the Block RAM Driver, BRD. The major difference
is that BRD allocates its backing store pages from the page cache, whereas
PMEM uses reserved memory that has been ioremapped.

One benefit of this approach is that there is a direct mapping between
filesystem block numbers and virtual addresses.  In PMEM, filesystem blocks N,
N+1, N+2, etc. will all be adjacent in the virtual memory space. This property
allows us to set up PMD mappings (2 MiB) for DAX.
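
To make that property concrete, here is a minimal sketch (not code from this
series; the helper name and the 512-byte sector size are assumptions for
illustration).  With one contiguous ioremapped region, the block-to-address
translation is plain pointer arithmetic, which is what keeps blocks N, N+1,
N+2 adjacent in virtual memory and makes 2 MiB PMD mappings possible:

/*
 * Sketch only: translate a 512-byte sector number into a kernel virtual
 * address inside a single contiguous ioremapped pmem region.
 */
static void *pmem_sector_to_vaddr(void *virt_base, sector_t sector)
{
	return virt_base + ((size_t)sector << 9);
}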

This patch set builds upon the work that Matthew Wilcox has been doing for
DAX:

https://lkml.org/lkml/2014/8/27/31

Specifically, my implementation of pmem_direct_access() in patch 4/4 uses API
enhancements introduced in Matthew's DAX patch v10 02/21:

https://lkml.org/lkml/2014/8/27/48

Ross Zwisler (4):
  pmem: Initial version of persistent memory driver
  pmem: Add support for getgeo()
  pmem: Add support for rw_page()
  pmem: Add support for direct_access()

 MAINTAINERS|   6 +
 drivers/block/Kconfig  |  41 ++
 drivers/block/Makefile |   1 +
 drivers/block/pmem.c   | 375 +
 4 files changed, 423 insertions(+)
 create mode 100644 drivers/block/pmem.c

-- 
1.9.3



[PATCH 1/4] pmem: Initial version of persistent memory driver

2014-08-27 Thread Ross Zwisler
PMEM is a new driver that presents a reserved range of memory as a
block device.  This is useful for developing with NV-DIMMs, and
can be used with volatile memory as a development platform.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
---
 MAINTAINERS|   6 +
 drivers/block/Kconfig  |  41 ++
 drivers/block/Makefile |   1 +
 drivers/block/pmem.c   | 330 +
 4 files changed, 378 insertions(+)
 create mode 100644 drivers/block/pmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 3f29153..028dc99 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7505,6 +7505,12 @@ S:   Maintained
 F: Documentation/blockdev/ramdisk.txt
 F: drivers/block/brd.c
 
+PERSISTENT MEMORY DRIVER
+M: Ross Zwisler ross.zwis...@linux.intel.com
+L: linux-nvd...@lists.01.org
+S: Supported
+F: drivers/block/pmem.c
+
 RANDOM NUMBER DRIVER
 M: Theodore Ts'o ty...@mit.edu
 S: Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..ac52f5a 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,6 +404,47 @@ config BLK_DEV_RAM_DAX
  and will prevent RAM block device backing store memory from being
  allocated from highmem (only a problem for highmem systems).
 
+config BLK_DEV_PMEM
+	tristate "Persistent memory block device support"
+   help
+ Saying Y here will allow you to use a contiguous range of reserved
+ memory as one or more block devices.  Memory for PMEM should be
+ reserved using the memmap kernel parameter.
+
+ To compile this driver as a module, choose M here: the module will be
+ called pmem.
+
+ Most normal users won't need this functionality, and can thus say N
+ here.
+
+config BLK_DEV_PMEM_START
+	int "Offset in GiB of where to start claiming space"
+   default 0
+   depends on BLK_DEV_PMEM
+   help
+	 Starting offset in GiB that PMEM should use when claiming memory.  This
+ memory needs to be reserved from the OS at boot time using the
+ memmap kernel parameter.
+
+ If you provide PMEM with volatile memory it will act as a volatile
+ RAM disk and your data will not be persistent.
+
+config BLK_DEV_PMEM_COUNT
+	int "Default number of PMEM disks"
+   default 4
+   depends on BLK_DEV_PMEM
+   help
+ Number of equal sized block devices that PMEM should create.
+
+config BLK_DEV_PMEM_SIZE
+	int "Size in GiB of space to claim"
+   depends on BLK_DEV_PMEM
+   default 0
+   help
+ Amount of memory in GiB that PMEM should use when creating block
+ devices.  This memory needs to be reserved from the OS at
+ boot time using the memmap kernel parameter.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..9cc6c18 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)+= ps3vram.o
 obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)  += z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)  += brd.o
+obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o
 obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
 obj-$(CONFIG_BLK_CPQ_DA)   += cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
new file mode 100644
index 000..d366b9b
--- /dev/null
+++ b/drivers/block/pmem.c
@@ -0,0 +1,330 @@
+/*
+ * Persistent Memory Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/brd.c.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#define SECTOR_SHIFT		9
+#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
+
+/*
+ * driver-wide physical address and total_size - one single, contiguous memory
+ * region that we divide up in to same-sized devices
+ */
+phys_addr_t	phys_addr;
+void		*virt_addr;
+size_t		total_size;
+
+struct pmem_device {
+	struct request_queue	*pmem_queue;
+   struct

[PATCH 4/4] pmem: Add support for direct_access()

2014-08-27 Thread Ross Zwisler
Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
---
 drivers/block/pmem.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 0be3669..d63bc96 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -74,6 +74,15 @@ static void *pmem_lookup_pg_addr(struct pmem_device *pmem, sector_t sector)
 	return pmem->virt_addr + offset;
 }
 
+/* sector must be page aligned */
+static unsigned long pmem_lookup_pfn(struct pmem_device *pmem, sector_t sector)
+{
+	size_t page_offset = sector >> PAGE_SECTORS_SHIFT;
+
+	BUG_ON(sector & (PAGE_SECTORS - 1));
+	return (pmem->phys_addr >> PAGE_SHIFT) + page_offset;
+}
+
 /*
  * sector is not required to be page aligned.
  * n is at most a single page, but could be less.
@@ -193,9 +202,24 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
return 0;
 }
 
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+   if (!pmem)
+   return -ENODEV;
+
+   *kaddr = pmem_lookup_pg_addr(pmem, sector);
+   *pfn = pmem_lookup_pfn(pmem, sector);
+
+	return pmem->size - (sector * 512);
+}
+
 static const struct block_device_operations pmem_fops = {
.owner =THIS_MODULE,
.rw_page =  pmem_rw_page,
+   .direct_access =pmem_direct_access,
.getgeo =   pmem_getgeo,
 };
 
-- 
1.9.3
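
For reference, a hypothetical caller of the new hook (not part of this patch;
DAX is the intended consumer) would use the returned kernel address, pfn and
remaining byte count roughly like this, based on the signature added above:

/* Hypothetical sketch of a direct_access() caller; the function name is
 * made up for illustration and error handling is kept minimal. */
static int pmem_peek_first_byte(struct block_device *bdev, sector_t sector)
{
	void *kaddr;
	unsigned long pfn;
	long avail;

	avail = bdev->bd_disk->fops->direct_access(bdev, sector, &kaddr,
						   &pfn, PAGE_SIZE);
	if (avail < 0)
		return avail;		/* e.g. -ENODEV from the driver */

	return *(unsigned char *)kaddr;	/* pmem is directly addressable */
}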



[PATCH 2/4] pmem: Add support for getgeo()

2014-08-27 Thread Ross Zwisler
Some programs require HDIO_GETGEO to work, which requires that we implement
getgeo.  Based off of the work done to the NVMe driver in this commit:

commit 4cc09e2dc4cb ("NVMe: Add getgeo to block ops")

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
---
 drivers/block/pmem.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index d366b9b..60bbe0d 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -50,6 +50,15 @@ struct pmem_device {
size_t  size;
 };
 
+static int pmem_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+   /* some standard values */
+	geo->heads = 1 << 6;
+	geo->sectors = 1 << 5;
+	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+   return 0;
+}
+
 /*
  * direct translation from (pmem,sector) = void*
  * We do not require that sector be page aligned.
@@ -176,6 +185,7 @@ out:
 
 static const struct block_device_operations pmem_fops = {
.owner =THIS_MODULE,
+   .getgeo =   pmem_getgeo,
 };
 
 /* Kernel module stuff */
-- 
1.9.3



[PATCH 3/4] pmem: Add support for rw_page()

2014-08-27 Thread Ross Zwisler
Based on commit a72132c31d58 ("brd: add support for rw_page()")

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
---
 drivers/block/pmem.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 60bbe0d..0be3669 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -183,8 +183,19 @@ out:
bio_endio(bio, err);
 }
 
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+  struct page *page, int rw)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+	pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	page_endio(page, rw & WRITE, 0);
+   return 0;
+}
+
 static const struct block_device_operations pmem_fops = {
.owner =THIS_MODULE,
+   .rw_page =  pmem_rw_page,
.getgeo =   pmem_getgeo,
 };
 
-- 
1.9.3



[PATCH 0/6] add support for new persistent memory instructions

2014-11-11 Thread Ross Zwisler
This patch set adds support for two new persistent memory instructions, pcommit
and clwb.  These instructions were announced in the document Intel
Architecture Instruction Set Extensions Programming Reference with reference
number 319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

These patches apply cleanly to v3.18-rc4.

Here are some things of note:

 - As with the clflushopt patches before this, I'm assuming that the addressing
   mode generated by the original clflush instruction will match the new
   clflush instruction with the 0x66 prefix for clflushopt, and for the
   xsaveopt instruction with the 0x66 prefix for clwb.  For all the test cases
   that I've come up with and for the new clwb code generated by this patch
   series, this has proven to be true on my test machine.

 - According to the SDM, xsaveopt has a form where it has a REX.W prefix.  I
   believe that this prefix will not be generated by gcc in x86_64 kernel code.
   Based on this, I don't believe I need to account for this extra prefix when
   dealing with the assembly language created for clwb.  Please correct me if
   I'm wrong.

 - The last three patches in this series update existing uses of clflushopt to
   use clwb instead.  The assertion is that clwb is preferable to clflushopt in
   these cases because after a clwb the cache line will be clean and ready for
   eviction, but that there is a possibility that it might be referenced again
   in the future while it is still in the CPU cache, giving us a performance
   boost.
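
To make the intended fallback order concrete, its effective behavior is
roughly the following (a sketch only; the real patches select the instruction
at patch time via the alternatives mechanism rather than with a runtime
branch):

/* Sketch of the effective flush selection after this series: */
static inline void flush_cache_line(void *addr)
{
	if (static_cpu_has(X86_FEATURE_CLWB))
		clwb(addr);		/* write line back, may remain cached */
	else if (static_cpu_has(X86_FEATURE_CLFLUSHOPT))
		clflushopt(addr);	/* write back + evict, relaxed ordering */
	else
		clflush(addr);		/* write back + evict */
}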

Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: David Airlie airl...@linux.ie
Cc: dri-de...@lists.freedesktop.org
Cc: x...@kernel.org

Ross Zwisler (6):
  x86: Add support for the pcommit instruction
  x86/alternative: Add alternative_io_2
  x86: Add support for the clwb instruction
  x86: Use clwb in clflush_cache_range
  x86: Use clwb in drm_clflush_page
  x86: Use clwb in drm_clflush_virt_range

 arch/x86/include/asm/alternative.h   | 14 ++
 arch/x86/include/asm/cpufeature.h|  2 ++
 arch/x86/include/asm/special_insns.h | 16 
 arch/x86/mm/pageattr.c   |  8 
 drivers/gpu/drm/drm_cache.c  | 12 ++--
 5 files changed, 42 insertions(+), 10 deletions(-)

-- 
1.9.3



[PATCH 6/6] x86: Use clwb in drm_clflush_virt_range

2014-11-11 Thread Ross Zwisler
If clwb is available on the system, use it in drm_clflush_virt_range.
If clwb is not available, fall back to clflushopt if you can.
If clflushopt is not supported, fall all the way back to clflush.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: David Airlie airl...@linux.ie
Cc: dri-de...@lists.freedesktop.org
Cc: x...@kernel.org
---
 drivers/gpu/drm/drm_cache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index aad9d82..84e9a04 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -138,8 +138,8 @@ drm_clflush_virt_range(void *addr, unsigned long length)
void *end = addr + length;
mb();
 	for (; addr < end; addr += boot_cpu_data.x86_clflush_size)
-   clflushopt(addr);
-   clflushopt(end - 1);
+   clwb(addr);
+   clwb(end - 1);
mb();
return;
}
-- 
1.9.3



[PATCH 3/6] x86: Add support for the clwb instruction

2014-11-11 Thread Ross Zwisler
Add support for the new clwb instruction.  This instruction was
announced in the document Intel Architecture Instruction Set Extensions
Programming Reference with reference number 319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

Here are some things of note:

 - As with the clflushopt patches before this, I'm assuming that the addressing
   mode generated by the original clflush instruction will match the new
   clflush instruction with the 0x66 prefix for clflushopt, and for the
   xsaveopt instruction with the 0x66 prefix for clwb.  For all the test cases
   that I've come up with and for the new clwb code generated by this patch
   series, this has proven to be true on my test machine.

 - According to the SDM, xsaveopt has a form where it has a REX.W prefix.  I
   believe that this prefix will not be generated by gcc in x86_64 kernel code.
   Based on this, I don't believe I need to account for this extra prefix when
   dealing with the assembly language created for clwb.  Please correct me if
   I'm wrong.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: David Airlie airl...@linux.ie
Cc: dri-de...@lists.freedesktop.org
Cc: x...@kernel.org
---
 arch/x86/include/asm/cpufeature.h|  1 +
 arch/x86/include/asm/special_insns.h | 10 ++
 2 files changed, 11 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index b3e6b89..fbbed34 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -227,6 +227,7 @@
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
 #define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB   ( 9*32+24) /* CLWB instruction */
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
Reciprocal */
 #define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index 1709a2e..a328460 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -199,6 +199,16 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void clwb(volatile void *__p)
+{
+	alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
+			 ".byte 0x66; clflush %P0",
+			 X86_FEATURE_CLFLUSHOPT,
+			 ".byte 0x66; xsaveopt %P0",
+			 X86_FEATURE_CLWB,
+			 "+m" (*(volatile char __force *)__p));
+}
+
 static inline void pcommit(void)
 {
 	alternative(ASM_NOP4, ".byte 0x66, 0x0f, 0xae, 0xf8",
-- 
1.9.3



[PATCH 4/6] x86: Use clwb in clflush_cache_range

2014-11-11 Thread Ross Zwisler
If clwb is available on the system, use it in clflush_cache_range.
If clwb is not available, fall back to clflushopt if you can.
If clflushopt is not supported, fall all the way back to clflush.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: David Airlie airl...@linux.ie
Cc: dri-de...@lists.freedesktop.org
Cc: x...@kernel.org
---
 arch/x86/mm/pageattr.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 36de293..5229d45 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -126,8 +126,8 @@ within(unsigned long addr, unsigned long start, unsigned 
long end)
  * @vaddr: virtual start address
  * @size:  number of bytes to flush
  *
- * clflushopt is an unordered instruction which needs fencing with mfence or
- * sfence to avoid ordering issues.
+ * clflushopt and clwb are unordered instructions which need fencing with
+ * mfence or sfence to avoid ordering issues.
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
@@ -136,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)
mb();
 
 	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
-   clflushopt(vaddr);
+   clwb(vaddr);
/*
 * Flush any possible final partial cacheline:
 */
-   clflushopt(vend);
+   clwb(vend);
 
mb();
 }
-- 
1.9.3



[PATCH 5/6] x86: Use clwb in drm_clflush_page

2014-11-11 Thread Ross Zwisler
If clwb is available on the system, use it in drm_clflush_page.
If clwb is not available, fall back to clflushopt if you can.
If clflushopt is not supported, fall all the way back to clflush.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: David Airlie airl...@linux.ie
Cc: dri-de...@lists.freedesktop.org
Cc: x...@kernel.org
---
 drivers/gpu/drm/drm_cache.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index a6b6906..aad9d82 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -34,9 +34,9 @@
 #if defined(CONFIG_X86)
 
 /*
- * clflushopt is an unordered instruction which needs fencing with mfence or
- * sfence to avoid ordering issues.  For drm_clflush_page this fencing happens
- * in the caller.
+ * clwb and clflushopt are unordered instructions which need fencing with
+ * mfence or sfence to avoid ordering issues.  For drm_clflush_page this
+ * fencing happens in the caller.
  */
 static void
 drm_clflush_page(struct page *page)
@@ -50,7 +50,7 @@ drm_clflush_page(struct page *page)
 
page_virtual = kmap_atomic(page);
 	for (i = 0; i < PAGE_SIZE; i += size)
-   clflushopt(page_virtual + i);
+   clwb(page_virtual + i);
kunmap_atomic(page_virtual);
 }
 
-- 
1.9.3



[PATCH 1/6] x86: Add support for the pcommit instruction

2014-11-11 Thread Ross Zwisler
Add support for the new pcommit instruction.  This instruction was
announced in the document Intel Architecture Instruction Set Extensions
Programming Reference with reference number 319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: David Airlie airl...@linux.ie
Cc: dri-de...@lists.freedesktop.org
Cc: x...@kernel.org
---
 arch/x86/include/asm/cpufeature.h| 1 +
 arch/x86/include/asm/special_insns.h | 6 ++
 2 files changed, 7 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 0bb1335..b3e6b89 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -225,6 +225,7 @@
 #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX( 9*32+19) /* The ADCX and ADOX 
instructions */
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
+#define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
Reciprocal */
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index e820c08..1709a2e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -199,6 +199,12 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void pcommit(void)
+{
+	alternative(ASM_NOP4, ".byte 0x66, 0x0f, 0xae, 0xf8",
+   X86_FEATURE_PCOMMIT);
+}
+
 #define nop() asm volatile (nop)
 
 
-- 
1.9.3



[PATCH 2/6] x86/alternative: Add alternative_io_2

2014-11-11 Thread Ross Zwisler
Add alternative_io_2 in the spirit of alternative_input_2 and
alternative_io.  This will allow us to have instructions with an output
parameter that vary based on two CPU features.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: David Airlie airl...@linux.ie
Cc: dri-de...@lists.freedesktop.org
Cc: x...@kernel.org
---
 arch/x86/include/asm/alternative.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index 473bdbe..7d9ead9 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -180,6 +180,20 @@ static inline int alternatives_text_reserved(void *start, 
void *end)
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
 		: output : "i" (0), ## input)
 
+/*
+ * This is similar to alternative_io. But it has two features and
+ * respective instructions.
+ *
+ * If CPU has feature2, newinstr2 is used.
+ * Otherwise, if CPU has feature1, newinstr1 is used.
+ * Otherwise, oldinstr is used.
+ */
+#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2,  \
+  feature2, output, input...)   \
+	asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1,	\
+		newinstr2, feature2)					\
+		: output : "i" (0), ## input)
+
 /* Like alternative_io, but for replacing a direct call with another one. */
 #define alternative_call(oldfunc, newfunc, feature, output, input...)  \
 	asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
-- 
1.9.3



Re: [PATCH 3/6] x86: Add support for the clwb instruction

2014-11-11 Thread Ross Zwisler
On Tue, 2014-11-11 at 20:19 +0100, Borislav Petkov wrote:
 On Tue, Nov 11, 2014 at 08:12:39PM +0100, Borislav Petkov wrote:
   +  .byte 0x66; xsaveopt %P0,
  
  Huh, XSAVEOPT?!? Shouldn't that be CLWB??
 
 Bah, the same opcodes, only 0x66 prefix makes it into CLWB. Could use a
 comment I guess.

Yep, it's weird, I know.  :)  I'll add a comment.

Thanks,
- Ross



Re: [PATCH 3/6] x86: Add support for the clwb instruction

2014-11-11 Thread Ross Zwisler
On Tue, 2014-11-11 at 20:12 +0100, Borislav Petkov wrote:
 On Tue, Nov 11, 2014 at 11:43:13AM -0700, Ross Zwisler wrote:
  Add support for the new clwb instruction.  This instruction was
  announced in the document Intel Architecture Instruction Set Extensions
  Programming Reference with reference number 319433-022.
  
  https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
  
  Here are some things of note:
  
   - As with the clflushopt patches before this, I'm assuming that the 
  addressing
 mode generated by the original clflush instruction will match the new
 clflush instruction with the 0x66 prefix for clflushopt, and for the
 xsaveopt instruction with the 0x66 prefix for clwb.  For all the test 
  cases
 that I've come up with and for the new clwb code generated by this patch
 series, this has proven to be true on my test machine.
  
   - According to the SDM, xsaveopt has a form where it has a REX.W prefix.  I
 believe that this prefix will not be generated by gcc in x86_64 kernel 
  code.
 Based on this, I don't believe I need to account for this extra prefix 
  when
 dealing with the assembly language created for clwb.  Please correct me 
  if
 I'm wrong.
  
  Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
  Cc: H Peter Anvin h.peter.an...@intel.com
  Cc: Ingo Molnar mi...@kernel.org
  Cc: Thomas Gleixner t...@linutronix.de
  Cc: David Airlie airl...@linux.ie
  Cc: dri-de...@lists.freedesktop.org
  Cc: x...@kernel.org
  ---
   arch/x86/include/asm/cpufeature.h|  1 +
   arch/x86/include/asm/special_insns.h | 10 ++
   2 files changed, 11 insertions(+)
  
  diff --git a/arch/x86/include/asm/cpufeature.h 
  b/arch/x86/include/asm/cpufeature.h
  index b3e6b89..fbbed34 100644
  --- a/arch/x86/include/asm/cpufeature.h
  +++ b/arch/x86/include/asm/cpufeature.h
  @@ -227,6 +227,7 @@
   #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
  */
   #define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
   #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
  +#define X86_FEATURE_CLWB   ( 9*32+24) /* CLWB instruction */
   #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
   #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
  Reciprocal */
   #define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict 
  Detection */
  diff --git a/arch/x86/include/asm/special_insns.h 
  b/arch/x86/include/asm/special_insns.h
  index 1709a2e..a328460 100644
  --- a/arch/x86/include/asm/special_insns.h
  +++ b/arch/x86/include/asm/special_insns.h
  @@ -199,6 +199,16 @@ static inline void clflushopt(volatile void *__p)
 +m (*(volatile char __force *)__p));
   }
   
  +static inline void clwb(volatile void *__p)
  +{
  +   alternative_io_2(.byte  __stringify(NOP_DS_PREFIX) ; clflush %P0,
 
 Any particular reason for using 0x3e as a prefix to have the insns be
 the same size or is it simply because CLFLUSH can stomach it?
 
 :-)

Essentially we need one additional byte at the beginning of the clflush so
that we can flip it into a clflushopt by changing that byte into a 0x66
prefix.  Two options are to either insert a 1 byte ASM_NOP1, or to add a 1
byte NOP_DS_PREFIX.  Both have no functional effect with the plain clflush,
but I've been told that executing a clflush + prefix should be faster than
executing a clflush + NOP.
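
For reference, the byte-level picture is roughly this (the 0x38 ModRM byte
below just encodes a (%rax) operand and is an arbitrary illustrative choice,
not something taken from the patch):

/* Sketch: both sequences are the same length, so the alternatives code can
 * swap one for the other without padding; only the first byte differs. */
static const unsigned char clflush_ds_prefixed[] = { 0x3e, 0x0f, 0xae, 0x38 };
static const unsigned char clflushopt_encoded[]  = { 0x66, 0x0f, 0xae, 0x38 };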




Re: [PATCH 3/6] x86: Add support for the clwb instruction

2014-11-11 Thread Ross Zwisler
On Tue, 2014-11-11 at 20:46 +0100, Borislav Petkov wrote:
 On Tue, Nov 11, 2014 at 12:40:00PM -0700, Ross Zwisler wrote:
  Yep, it's weird, I know.  :)
 
 But sure, saving opcode space, makes sense to me.
 
 Btw, I'd still be interested about this:
 
  +static inline void clwb(volatile void *__p)
  +{
  + alternative_io_2(.byte  __stringify(NOP_DS_PREFIX) ; clflush %P0,
 
 Any particular reason for using 0x3e as a prefix to have the insns be
 the same size or is it simply because CLFLUSH can stomach it?

Ah, sorry, I was still responding to your first mail.  :)  Response
copied here to save searching:

Essentially we need one additional byte at the beginning of the
clflush so that we can flip it into a clflushopt by changing that byte
into a 0x66 prefix.  Two options are to either insert a 1 byte
ASM_NOP1, or to add a 1 byte NOP_DS_PREFIX.  Both have no functional
effect with the plain clflush, but I've been told that executing a
clflush + prefix should be faster than executing a clflush + NOP.

I agree, this is useful info - I'll add it to the patch comments for v2.

Thank you for the feedback.

- Ross



Re: [PATCH 3/6] x86: Add support for the clwb instruction

2014-11-13 Thread Ross Zwisler
On Wed, 2014-11-12 at 15:12 +0100, Borislav Petkov wrote:
 On Wed, Nov 12, 2014 at 01:38:45PM +, Anvin, H Peter wrote:
  No, it doesn't.  x86 requires 3.4+ at a minimum.
 
 The only test I see is:
 
 #if GCC_VERSION  30200
 # error Sorry, your compiler is too old - please upgrade it.
 #endif
 
 And even if we do require 3.4, the build fails with 4.1+ so...

Ah, dang, you're right.  Okay, I'll figure out how to do this without
using xsaveopt.

Thank you for pointing this out.

- Ross



[PATCH] sizes: Add 64 bit constants to sizes.h

2014-12-11 Thread Ross Zwisler
This patch adds 64 bit ULL constants to include/linux/sizes.h.  These
sizes range from SZ_4G (4 gibibyte) through SZ_8E (8 exbibyte).

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
---
 include/linux/sizes.h | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/include/linux/sizes.h b/include/linux/sizes.h
index ce3e815..06ab35b 100644
--- a/include/linux/sizes.h
+++ b/include/linux/sizes.h
@@ -43,5 +43,40 @@
 
 #define SZ_1G				0x40000000
 #define SZ_2G				0x80000000
+#define SZ_4G				0x0000000100000000ULL
+#define SZ_8G				0x0000000200000000ULL
+#define SZ_16G				0x0000000400000000ULL
+#define SZ_32G				0x0000000800000000ULL
+#define SZ_64G				0x0000001000000000ULL
+#define SZ_128G				0x0000002000000000ULL
+#define SZ_256G				0x0000004000000000ULL
+#define SZ_512G				0x0000008000000000ULL
+
+#define SZ_1T				0x0000010000000000ULL
+#define SZ_2T				0x0000020000000000ULL
+#define SZ_4T				0x0000040000000000ULL
+#define SZ_8T				0x0000080000000000ULL
+#define SZ_16T				0x0000100000000000ULL
+#define SZ_32T				0x0000200000000000ULL
+#define SZ_64T				0x0000400000000000ULL
+#define SZ_128T				0x0000800000000000ULL
+#define SZ_256T				0x0001000000000000ULL
+#define SZ_512T				0x0002000000000000ULL
+
+#define SZ_1P				0x0004000000000000ULL
+#define SZ_2P				0x0008000000000000ULL
+#define SZ_4P				0x0010000000000000ULL
+#define SZ_8P				0x0020000000000000ULL
+#define SZ_16P				0x0040000000000000ULL
+#define SZ_32P				0x0080000000000000ULL
+#define SZ_64P				0x0100000000000000ULL
+#define SZ_128P				0x0200000000000000ULL
+#define SZ_256P				0x0400000000000000ULL
+#define SZ_512P				0x0800000000000000ULL
+
+#define SZ_1E				0x1000000000000000ULL
+#define SZ_2E				0x2000000000000000ULL
+#define SZ_4E				0x4000000000000000ULL
+#define SZ_8E				0x8000000000000000ULL
 
 #endif /* __LINUX_SIZES_H__ */
-- 
1.9.3



Re: [PATCH 1/6] x86: Add support for the pcommit instruction

2014-11-14 Thread Ross Zwisler
On Wed, 2014-11-12 at 19:25 -0800, Andy Lutomirski wrote:
 On 11/11/2014 10:43 AM, Ross Zwisler wrote:
  Add support for the new pcommit instruction.  This instruction was
  announced in the document Intel Architecture Instruction Set Extensions
  Programming Reference with reference number 319433-022.
  
  https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
  
  Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
  Cc: H Peter Anvin h.peter.an...@intel.com
  Cc: Ingo Molnar mi...@kernel.org
  Cc: Thomas Gleixner t...@linutronix.de
  Cc: David Airlie airl...@linux.ie
  Cc: dri-de...@lists.freedesktop.org
  Cc: x...@kernel.org
  ---
   arch/x86/include/asm/cpufeature.h| 1 +
   arch/x86/include/asm/special_insns.h | 6 ++
   2 files changed, 7 insertions(+)
  
  diff --git a/arch/x86/include/asm/cpufeature.h 
  b/arch/x86/include/asm/cpufeature.h
  index 0bb1335..b3e6b89 100644
  --- a/arch/x86/include/asm/cpufeature.h
  +++ b/arch/x86/include/asm/cpufeature.h
  @@ -225,6 +225,7 @@
   #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
   #define X86_FEATURE_ADX( 9*32+19) /* The ADCX and ADOX 
  instructions */
   #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
  */
  +#define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
   #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
   #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
   #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
  Reciprocal */
  diff --git a/arch/x86/include/asm/special_insns.h 
  b/arch/x86/include/asm/special_insns.h
  index e820c08..1709a2e 100644
  --- a/arch/x86/include/asm/special_insns.h
  +++ b/arch/x86/include/asm/special_insns.h
  @@ -199,6 +199,12 @@ static inline void clflushopt(volatile void *__p)
 +m (*(volatile char __force *)__p));
   }
   
  +static inline void pcommit(void)
  +{
  +   alternative(ASM_NOP4, .byte 0x66, 0x0f, 0xae, 0xf8,
  +   X86_FEATURE_PCOMMIT);
  +}
  +
 
 Should this patch add the feature bit and cpuinfo entry to go with it?
 
 --Andy

I think this patch does everything we need?  The text for cpuinfo is
auto-generated in arch/x86/kernel/cpu/capflags.c from the flags defined
in arch/x86/include/asm/cpufeature.h, I think.  Here's what I get in
cpuinfo on my system with a faked-out CPUID saying that clwb and pcommit
are present:

$ grep 'flags' /proc/cpuinfo 
flags   : fpu snip erms pcommit clflushopt clwb xsaveopt
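
For reference, the generated table in capflags.c ends up looking roughly like
this (an abridged sketch of what mkcapflags.sh emits; nothing here is added by
these patches):

/* sketch of generated arch/x86/kernel/cpu/capflags.c entries */
const char * const x86_cap_flags[NCAPINTS*32] = {
	[X86_FEATURE_PCOMMIT]		= "pcommit",
	[X86_FEATURE_CLFLUSHOPT]	= "clflushopt",
	[X86_FEATURE_CLWB]		= "clwb",
};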

The X86_FEATURE_CLWB and X86_FEATURE_PCOMMIT flags are being set up
according to what's in CPUID, and the proper alternatives are being
triggered.  I stuck some debug code in the alternatives code to see what
was being patched in the presence and absence of each of the flags.

Is there something else I'm missing?

Thanks,
- Ross



Re: [PATCH 1/4] pmem: Initial version of persistent memory driver

2014-11-04 Thread Ross Zwisler
On Tue, 2014-11-04 at 16:26 +, Elliott, Robert (Server Storage)
wrote:
 
  -Original Message-
  From: Boaz Harrosh [mailto:b...@plexistor.com]
  Sent: Tuesday, 04 November, 2014 4:38 AM
  To: Wilcox, Matthew R; Elliott, Robert (Server Storage); Ross
  Zwisler; Jens Axboe; Nick Piggin; Kani, Toshimitsu; Knippers, Linda;
  linux-fsde...@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
  nvd...@lists.01.org; Matthew Wilcox
  Subject: Re: [PATCH 1/4] pmem: Initial version of persistent memory
  driver
  
  On 11/03/2014 06:19 PM, Wilcox, Matthew R wrote:
 ...
  
  I wish you guys would actually review the correct code.
  
  In the actual good driver that has any shape of proper code all these
  issue are gone.
  
  * config defaults gone, multiple-devices multiple-memory ranges fully
 supported hot plug style.
  * above shifts cruft completely gone it is left overs from brd.c and
its page usage.
  * getgeo fixed to do what we realy want by the only application on earth
that still uses it, fdisk. All other partitioners do not call it at
all.
  
  Why are we reviewing dead code ?
  
  Cheers
  Boaz
 
 Ross, what's the status of Boaz' patches (available in
 git://git.open-osd.org/pmem.git)?
 
 https://github.com/01org/prd.git doesn't include any of 
 them yet.

Hey Robert,

The UEFI organization is in the process of defining a generic specification
for platform non-volatile memory resources.  Essentially the thought was to
wait until that was publicly available before adding any new device discovery
capabilities to pmem.

What Boaz has suggested and coded up is certainly useful, but the worry is
that it will end up being incompatible with what comes out of UEFI.  If we
stay with the dead-simple module parameter method, we will have less code to
unwind later.

Thanks,
- Ross




[PATCH v2 0/2] add support for new persistent memory instructions

2015-01-23 Thread Ross Zwisler
This patch set adds support for two new persistent memory instructions, pcommit
and clwb.  These instructions were announced in the document Intel
Architecture Instruction Set Extensions Programming Reference with reference
number 319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

These patches apply cleanly to v3.19-rc5.

Changes from v1:

 - This series no longer updates the DRM code or clflush_cache_range to use
   clwb instead of clflushopt, as there was concern over whether clflushopt was
   used in some cases to explicitly evict lines from the processor cache.  I'll
   leave it up to the owners of this code to decide how they want to use clwb.

 - Reworked the clwb patch so that it doesn't use xsaveopt, as xsaveopt wasn't
   included in all GCC versions that the kernel needs to support. The assembly
   is now sort of complex because we need to hard code the clwb instruction to
   use a known register, but we also need to pull in the parameter as a memory
   constraint so that gcc doesn't reorder this instruction around other
   accesses to the same memory location.  Many thanks to hpa and Boris Petkov
   for their help on getting this right.

Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Borislav Petkov b...@alien8.de

Ross Zwisler (2):
  x86: Add support for the pcommit instruction
  x86: Add support for the clwb instruction

 arch/x86/include/asm/cpufeature.h|  2 ++
 arch/x86/include/asm/special_insns.h | 20 
 2 files changed, 22 insertions(+)

-- 
1.9.3



[PATCH v2 1/2] x86: Add support for the pcommit instruction

2015-01-23 Thread Ross Zwisler
Add support for the new pcommit instruction.  This instruction was
announced in the document Intel Architecture Instruction Set Extensions
Programming Reference with reference number 319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Borislav Petkov b...@alien8.de
---
 arch/x86/include/asm/cpufeature.h| 1 +
 arch/x86/include/asm/special_insns.h | 6 ++
 2 files changed, 7 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index bb9b258..dfdd689 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -220,6 +220,7 @@
 #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX( 9*32+19) /* The ADCX and ADOX 
instructions */
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
+#define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
Reciprocal */
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index e820c08..1709a2e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -199,6 +199,12 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void pcommit(void)
+{
+	alternative(ASM_NOP4, ".byte 0x66, 0x0f, 0xae, 0xf8",
+   X86_FEATURE_PCOMMIT);
+}
+
 #define nop() asm volatile (nop)
 
 
-- 
1.9.3



[PATCH v2 2/2] x86: Add support for the clwb instruction

2015-01-23 Thread Ross Zwisler
Add support for the new clwb instruction.  This instruction was
announced in the document Intel Architecture Instruction Set Extensions
Programming Reference with reference number 319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

Regarding the details of how the alternatives assembly is set up, we
need one additional byte at the beginning of the clflush so that we can
flip it into a clflushopt by changing that byte into a 0x66 prefix.  Two
options are to either insert a 1 byte ASM_NOP1, or to add a 1 byte
NOP_DS_PREFIX.  Both have no functional effect with the plain clflush,
but I've been told that executing a clflush + prefix should be faster
than executing a clflush + NOP.

We had to hard code the assembly for clwb because, lacking the ability
to assemble the clwb instruction itself, the next closest thing is to
have an xsaveopt instruction with a 0x66 prefix.  Unfortunately xsaveopt
itself is also relatively new, and isn't included by all the GCC
versions that the kernel needs to support.

I tested this patch with the following versions of GCC:
gcc (GCC) 4.8.3 20140911 (Red Hat 4.8.3-7)
gcc (GCC) 4.1.2 20080704 (Red Hat 4.1.2-55)

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Borislav Petkov b...@alien8.de
---
 arch/x86/include/asm/cpufeature.h|  1 +
 arch/x86/include/asm/special_insns.h | 14 ++
 2 files changed, 15 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index dfdd689..dc91747 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -222,6 +222,7 @@
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
 #define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB   ( 9*32+24) /* CLWB instruction */
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
Reciprocal */
 #define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index 1709a2e..8883cbc 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -199,6 +199,20 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void clwb(volatile void *__p)
+{
+	volatile struct { char x[64]; } *p = __p;
+
+	asm volatile(ALTERNATIVE_2(
+		".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
+		".byte 0x66; clflush (%[pax])",	/* clflushopt (%%rax) */
+		X86_FEATURE_CLFLUSHOPT,
+		".byte 0x66, 0x0f, 0xae, 0x30",	/* clwb (%%rax) */
+		X86_FEATURE_CLWB)
+		: [p] "+m" (*p)
+		: [pax] "a" (p));
+}
+
 static inline void pcommit(void)
 {
 	alternative(ASM_NOP4, ".byte 0x66, 0x0f, 0xae, 0xf8",
-- 
1.9.3



Re: [PATCH v2 0/2] add support for new persistent memory instructions

2015-01-26 Thread Ross Zwisler
On Fri, 2015-01-23 at 15:03 -0800, H. Peter Anvin wrote:
 On 01/23/2015 12:40 PM, Ross Zwisler wrote:
  This patch set adds support for two new persistent memory instructions, 
  pcommit
  and clwb.  These instructions were announced in the document Intel
  Architecture Instruction Set Extensions Programming Reference with 
  reference
  number 319433-022.
  
  https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
  
 
 Please explain in these patch descriptions what the instructions
 actually do.

Sure, will do.

 + volatile struct { char x[64]; } *p = __p;
 +
 + asm volatile(ALTERNATIVE_2(
 + .byte  __stringify(NOP_DS_PREFIX) ; clflush (%[pax]),
 + .byte 0x66; clflush (%[pax]), /* clflushopt (%%rax) */
 + X86_FEATURE_CLFLUSHOPT,
 + .byte 0x66, 0x0f, 0xae, 0x30,  /* clwb (%%rax) */
 + X86_FEATURE_CLWB)
 + : [p] +m (*p)
 + : [pax] a (p));
 
 For the specific case of CLWB, we can use an m input rather than a
 +m output, simply because CLWB (or CLFLUSH* used as a standin for CLWB
 doesn't need to be ordered with respect to loads (whereas CLFLUSH* do).
 
 Now, one can argue that for performance reasons we should should still
 use +m in case we use the CLFLUSH* standin, to avoid flushing a cache
 line to memory just to bring it back in.

Understood, and an interesting point.  It seems like we can be correct using
either, yea?  I guess I'm happy with +m output since it's consistent with
clflush and clflushopt, and since we avoid the clflush* then read issue.
Please let me know if you have a preference.

 +static inline void pcommit(void)
 +{
 + alternative(ASM_NOP4, .byte 0x66, 0x0f, 0xae, 0xf8,
 + X86_FEATURE_PCOMMIT);
 +}
 +
 
 Should we use an SFENCE as a standin if pcommit is unavailable, in case
 we end up using CLFLUSHOPT?

Ah, sorry, I really need to include an example flow in my patch descriptions
to make this more clear. :)

Both the flushes (wmb/clflushopt/clflush) and the pcommit are ordered by
either mfence or sfence.

An example function that flushes and commits a buffer could look like this
(based on clflush_cache_range):

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{   
void *vend = vaddr + size - 1;

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/* 
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works 
 */
wmb();

pcommit();

/* 
 * sfence to order pcommit
 * mfence via mb() also works 
 */
wmb();
}

In this example function I don't begin with a fence because clwb (which may
fall back to clflush/clflushopt) will be ordered with respect to either writes
or reads and writes depending on whether the argument is given as an input or
output parameter.

If the platform doesn't support PCOMMIT, you end up with this:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{   
void *vend = vaddr + size - 1;

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/* 
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works 
 */
wmb();

nop(); /* from pcommit(), via alternatives */

/* 
 * sfence to order pcommit
 * mfence via mb() also works 
 */
wmb();
}

This is fine, but now you've got two fences in a row.  Another slightly more
messy choice would be to include the fence in the pcommit assembly, so
you either get pcommit + sfence or a pair of NOPs.
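
A sketch of that messier option, assuming a hypothetical pcommit_sfence()
helper (pcommit is 4 bytes and sfence is 3, hence ASM_NOP7 for the padding):

static inline void pcommit_sfence(void)
{
	/* pcommit + sfence when available, NOPs otherwise (sketch only) */
	alternative(ASM_NOP7,
		    ".byte 0x66, 0x0f, 0xae, 0xf8; sfence",
		    X86_FEATURE_PCOMMIT);
}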



Re: [PATCH v2 0/2] add support for new persistent memory instructions

2015-01-26 Thread Ross Zwisler
On Sat, 2015-01-24 at 12:14 +0100, Borislav Petkov wrote:
 On Fri, Jan 23, 2015 at 03:03:41PM -0800, H. Peter Anvin wrote:
  For the specific case of CLWB, we can use an m input rather than a
  +m output, simply because CLWB (or CLFLUSH* used as a standin for CLWB
  doesn't need to be ordered with respect to loads (whereas CLFLUSH* do).
 
 Well, we could do something like:
 
 	volatile struct { char x[64]; } *p = __p;
 
 	if (static_cpu_has(X86_FEATURE_CLWB))
 		asm volatile(".byte 0x66,0x0f,0xae,0x30" :: "m" (*p), "a" (p));
 	else
 		asm volatile(ALTERNATIVE(
 			".byte 0x3e; clflush (%[pax])",
 			".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
 			X86_FEATURE_CLFLUSHOPT)
 			: [p] "+m" (*p)
 			: [pax] "a" (p));
 
 which would simplify the alternative macro too.

This is interesting!  I guess I'm confused as to how this solves the ordering
issue, though.  The m input vs +m output parameter will tell gcc whether
or not the assembly can be reordered at compile time with respect to reads at
that same location, correct?

So if we have an inline function that could either read or write from gcc's
point of view (input vs output parameter, depending on the branch), it seems
like it would be forced to fall back to the most restrictive case (assume it
will write), and not reorder with respect to reads.  If so, you'd end up in
the same place as using +m output, only now you've got an additional branch
instead of a 3-way alternative.

Am I misunderstanding this?

 Generated asm looks ok to me (my objdump doesn't know CLWB yet :)):
 
 0aa0 <myclflush>:
  aa0:	55                   	push   %rbp
  aa1:	48 89 e5             	mov    %rsp,%rbp
  aa4:	eb 0a                	jmp    ab0 <myclflush+0x10>
  aa6:	48 89 f8             	mov    %rdi,%rax
  aa9:	66 0f ae 30          	data16 xsaveopt (%rax)
  aad:	5d                   	pop    %rbp
  aae:	c3                   	retq
  aaf:	90                   	nop
  ab0:	48 89 f8             	mov    %rdi,%rax
  ab3:	3e 0f ae 38          	clflush %ds:(%rax)
  ab7:	5d                   	pop    %rbp
  ab8:	c3                   	retq
 
  Should we use an SFENCE as a standin if pcommit is unavailable, in case
  we end up using CLFLUSHOPT?
 
 Btw, is PCOMMIT a lightweight SFENCE for this persistent memory aspect
 to make sure stuff has become persistent after executing it? But not all
 stuff like SFENCE so SFENCE is the bigger hammer?

Ah, yep, I definitely need to include an example flow in my commit comments.
:) Here's a snip from my reply to hpa, to save searching:

Both the flushes (wmb/clflushopt/clflush) and the pcommit are ordered
by either mfence or sfence.

An example function that flushes and commits a buffer could look like
this (based on clflush_cache_range):

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{   
void *vend = vaddr + size - 1;

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/* 
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works 
 */
wmb();

pcommit();

/* 
 * sfence to order pcommit
 * mfence via mb() also works 
 */
wmb();
}


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 0/2] add support for new persistent memory instructions

2015-01-26 Thread Ross Zwisler
On Mon, 2015-01-26 at 22:34 +0100, Borislav Petkov wrote:
 On Mon, Jan 26, 2015 at 12:59:29PM -0700, Ross Zwisler wrote:
  /* 
   * sfence to order pcommit
   * mfence via mb() also works 
   */
  wmb();
 
 Doc says PCOMMIT is not ordered wrt loads and SFENCE too. Don't we want
 to be absolutely conservative here and use MFENCE both times?

The code, for easy viewing:
void write_and_commit_buffer(void *vaddr, unsigned int size)
{
void *vend = vaddr + size - 1;

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/* 
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works 
 */
wmb();

pcommit();

/* 
 * sfence to order pcommit
 * mfence via mb() also works 
 */
wmb();
}

We can use MFENCE, but I don't think we need to.  With SFENCE we will be
ordered with respect to stores, and the flushes and pcommit will be ordered
with respect to one another.  I think you can sprinkle in loads anywhere you
want in that flow and everything will work. The worst that will happen is
that if you've used clflush or clflushopt you'll have to re-fetch something
you just flushed out of the CPU cache hierarchy, but you'll always get correct
data from your load and you'll always pcommit valid data to the DIMM.
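
For reference, and assuming the x86-64 definitions in
arch/x86/include/asm/barrier.h haven't changed since I last looked, the
barriers we keep referring to boil down to:

#define mb()	asm volatile("mfence" ::: "memory")
#define rmb()	asm volatile("lfence" ::: "memory")
#define wmb()	asm volatile("sfence" ::: "memory")

so the whole question really is just sfence vs mfence around the flushes and
the pcommit.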


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 0/2] add support for new persistent memory instructions

2015-02-05 Thread Ross Zwisler
On Tue, 2015-01-27 at 09:53 -0700, Ross Zwisler wrote:
 This patch set adds support for two new persistent memory instructions, 
 pcommit
 and clwb.  These instructions were announced in the document Intel
 Architecture Instruction Set Extensions Programming Reference with reference
 number 319433-022.
 
 https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
 
 These patches apply cleanly to v3.19-rc6.
 
 Changes from v2:
 
  - Added instruction descriptions and flows to the patch descriptions.
  - Added needed sfence to pcommit alternatives assembly.  The inline function
is now called pcommit_sfence().  If pcommit is not supported on the 
 platform
both the pcommit and the sfence will be nops.
 
 Cc: H Peter Anvin h.peter.an...@intel.com
 Cc: Ingo Molnar mi...@kernel.org
 Cc: Thomas Gleixner t...@linutronix.de
 Cc: Borislav Petkov b...@alien8.de
 
 Ross Zwisler (2):
   x86: Add support for the pcommit instruction
   x86: Add support for the clwb instruction
 
  arch/x86/include/asm/cpufeature.h|  2 ++
  arch/x86/include/asm/special_insns.h | 22 ++
  2 files changed, 24 insertions(+)

Ping?  :)

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: Tree for Jan 20 -- Kernel panic - Unable to mount root fs

2015-01-20 Thread Ross Zwisler
On Tue, 2015-01-20 at 15:54 -0200, Fabio Estevam wrote:
 On Tue, Jan 20, 2015 at 3:39 PM, Paul Moore pmo...@redhat.com wrote:
 
  Thanks for testing this and reporting the problem, especially such a small
  bisection.  Unfortunately nothing is immediately obvious to me, would you 
  mind
  sharing your kernel config so I can try to reproduce and debug the problem?
 
 In case it helps, I also get the similar errors on a mx6 which is
 built with arch/arm/configs/imx_v6_v7_defconfig:
 
 [1.594196] imx-ipuv3 280.ipu: IPUv3H probed
 [1.601836]  ram0: unknown partition table
 [1.607247]  ram1: unknown partition table
 [1.612617]  ram2: unknown partition table
 [1.618010]  ram3: unknown partition table
 [1.623359]  ram4: unknown partition table
 [1.628761]  ram5: unknown partition table
 [1.634065]  ram6: unknown partition table
 [1.639436]  ram7: unknown partition table
 [1.644749]  ram8: unknown partition table
 [1.650132]  ram9: unknown partition table
 [1.655447]  ram10: unknown partition table
 [1.660911]  ram11: unknown partition table
 [1.666268]  ram12: unknown partition table
 [1.671778]  ram13: unknown partition table
 [1.677154]  ram14: unknown partition table
 [1.682593]  ram15: unknown partition table
 [1.687165] brd: module loaded
 [1.705148] loop: module loaded

This is probably related to the issue reported in this thread:

https://lkml.org/lkml/2015/1/16/563


These were just nuisance warnings, I believe, so my guess is that this
isn't related to your kernel panic.   Reverting Boaz's patches to make
these warnings go away would let you know for sure.

- Ross

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] drm/radeon: Fix regression with suspend/resume

2015-02-12 Thread Ross Zwisler
This patch reverts the changes made in this commit:

  deadcb36f49b ("drm/radeon: Use two-ended allocation by size, v2")

That patch caused a regression on my system where the bottom of the
screen flickers after my laptop goes through a suspend and resume.
This is reproducible 100% of the time.

This patch applies cleanly to v3.19, and fixes the screen flicker issue
on my system.  Here is the hardware that I'm using (from lspci):

01:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
[AMD/ATI] Seymour [Radeon HD 6400M/7400M Series]

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: Lauri Kasanen c...@gmx.com
Cc: Christian König christian.koe...@amd.com
Cc: Dave Airlie airl...@redhat.com
Cc: Alex Deucher alexander.deuc...@amd.com
Cc: dri-de...@lists.freedesktop.org
---
 drivers/gpu/drm/radeon/radeon_object.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_object.c 
b/drivers/gpu/drm/radeon/radeon_object.c
index 86fc564..dea1baf 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -173,17 +173,6 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain)
 		else
 			rbo->placements[i].lpfn = 0;
 	}
-
-	/*
-	 * Use two-ended allocation depending on the buffer size to
-	 * improve fragmentation quality.
-	 * 512kb was measured as the most optimal number.
-	 */
-	if (rbo->tbo.mem.size > 512 * 1024) {
-		for (i = 0; i < c; i++) {
-			rbo->placements[i].flags |= TTM_PL_FLAG_TOPDOWN;
-		}
-	}
 }
 
 int radeon_bo_create(struct radeon_device *rdev,
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] drm/radeon: Fix regression with suspend/resume

2015-02-17 Thread Ross Zwisler
On Sat, 2015-02-14 at 06:25 +, Deucher, Alexander wrote:
  -Original Message-
  From: Ross Zwisler [mailto:ross.zwis...@linux.intel.com]
  Sent: Friday, February 13, 2015 10:55 PM
  To: Michel Dänzer
  Cc: linux-kernel@vger.kernel.org; dri-de...@lists.freedesktop.org; Deucher,
  Alexander; Dave Airlie; Lauri Kasanen; Koenig, Christian
  Subject: Re: [PATCH] drm/radeon: Fix regression with suspend/resume
  
  On Fri, 2015-02-13 at 11:41 +0900, Michel Dänzer wrote:
   On 13.02.2015 05:30, Ross Zwisler wrote:
This patch reverts the changes made in this commit:
   
  deadcb36f49b ("drm/radeon: Use two-ended allocation by size, v2")
   
That patch caused a regression on my system where the bottom of the
 screen flickers after my laptop goes through a suspend and resume.
  
   What kind of flicker is it? E.g. does it only affect X or also console,
   does it flicker all the time or only when there is activity, what does
   it look like, ...
  
  It's kind of hard to describe it precisely, so I made a video.  :)
  
  http://youtu.be/ESm9SMnr0do
  
  It only affects X, not the console, and it seems to go away if you log
  out back to the login manager (I'm using GDM on Fedora 20) and back into
  your window manager.
 
 Does a VT switch or forcing a dpms cycle (sleep 5; xset dpms force off)
 also fix it?  It doesn't look related to the patch in question at all. 
 Is the flickering 100% reproducible or does it only happen
 periodically?

From kernels 3.14 or so (when the deadcb36f49b patch was introduced)
till 3.18 it happened 100% of the time.  With 3.19 it only seems to
happen maybe 50% of the time, but is still very easily reproducible.

It's entirely possible that the patch isn't the root cause, but it just
brought out a bug somewhere else.  All I know is that I did a bisect,
and with the commit before this the issue never happens, and after this
commit it happens 100% of the time. :)  Also, reverting that commit with
3.19 makes the issue go away.

Nope, xset dpms force off doesn't fix it. After the screen goes black
and comes back, the flicker is still there.

- Ross

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] pmem: Allow request_mem to fail, (CONFIG_BLK_DEV_PMEM_IGNORE_REQUEST_MEM_RET)

2015-02-17 Thread Ross Zwisler
On Mon, 2015-02-16 at 13:24 +0200, Boaz Harrosh wrote:
 With old Kernels there was a bug in x86 where any unknown
 memory chip type would come up BUSY when calling
 request_mem_region_exclusive().
 
 So for pmem to work with old Kernels and real NvDIMM chips
 we have a new Kconfig option CONFIG_BLK_DEV_PMEM_IGNORE_REQUEST_MEM_RET.
 
 People have been running with hacked up pmem that will ignore
 the return code from request_mem_region_exclusive. So here it is
 official
 
 Signed-off-by: Boaz Harrosh b...@plexistor.com

I'm confused - I thought that this behavior was fixed by patch 1/3?
With that patch this memory reservation should not fail, correct?

If so, why do we need this patch?

 ---
  drivers/block/Kconfig | 12 
  drivers/block/pmem.c  | 11 +++
  2 files changed, 19 insertions(+), 4 deletions(-)
 
 diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
 index 3b3200f..10879b8 100644
 --- a/drivers/block/Kconfig
 +++ b/drivers/block/Kconfig
 @@ -430,6 +430,18 @@ config BLK_DEV_PMEM_USE_PAGES
 to other devices in the system, then you must say Yes here.
 If unsure leave as Yes.
  
 +config BLK_DEV_PMEM_IGNORE_REQUEST_MEM_RET
 +	bool "Ignore the return code from request_mem_region_exclusive"
 + depends on BLK_DEV_PMEM
 + help
 +   In Old Kernels type-12 Memory type which is used by NvDIMM
 +   chips Comes out busy when calling request_mem_region_exclusive,
 +   because of a bug.
 +   If this option is set to yes. The pmem will ignore the
 +   failure, and continue as usual. If you have an old Kernel and
 +   a real NvDIMM chip you must say yes here.
 +   (Ignored if BLK_DEV_PMEM_USE_PAGES=y)
 +
  config CDROM_PKTCDVD
  	tristate "Packet writing on CD/DVD media"
   depends on !UML
 diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
 index 9eb7ffe..f84d033 100644
 --- a/drivers/block/pmem.c
 +++ b/drivers/block/pmem.c
 @@ -197,10 +197,12 @@ int pmem_mapmem(struct pmem_device *pmem)
  
  	res_mem = request_mem_region_exclusive(pmem->phys_addr, pmem->size,
  					       "pmem");
 -	if (!res_mem) {
 +	if (unlikely(!res_mem)) {
  		pr_warn("pmem: request_mem_region_exclusive phys=0x%llx size=0x%zx failed\n",
 -			pmem->phys_addr, pmem->size);
 -		return -EINVAL;
 +			pmem->phys_addr, pmem->size);
 +#ifndef CONFIG_BLK_DEV_PMEM_IGNORE_REQUEST_MEM_RET
 +		return -EBUSY;
 +#endif
  	}
  
  	pmem->virt_addr = ioremap_cache(pmem->phys_addr, pmem->size);
 @@ -211,7 +213,8 @@ int pmem_mapmem(struct pmem_device *pmem)
  	return 0;
  
  out_release:
 -	release_mem_region(pmem->phys_addr, pmem->size);
 +	if (res_mem)
 +		release_mem_region(pmem->phys_addr, pmem->size);
   return err;
  }
  



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/asm] x86: Add support for the pcommit instruction

2015-02-19 Thread Ross Zwisler
On Thu, 2015-02-19 at 02:15 +0100, Ingo Molnar wrote:
 * tip-bot for Ross Zwisler tip...@zytor.com wrote:
 
  Commit-ID:  a71ef01336f2228dc9d47320492360d6848e591e
  Gitweb: 
  http://git.kernel.org/tip/a71ef01336f2228dc9d47320492360d6848e591e
  Author: Ross Zwisler ross.zwis...@linux.intel.com
  AuthorDate: Tue, 27 Jan 2015 09:53:50 -0700
  Committer:  Ingo Molnar mi...@kernel.org
  CommitDate: Thu, 19 Feb 2015 00:06:37 +0100
  
  x86: Add support for the pcommit instruction
 
 So this breaks the UML build:
 
 /home/mingo/tip/arch/x86/include/asm/special_insns.h: In function 
 ‘pcommit_sfence’:
 /home/mingo/tip/arch/x86/include/asm/special_insns.h:218:14: error: expected 
 ‘:’ or ‘)’ before ‘ASM_NOP7’
 
 Thanks,
 
   Ingo

Interesting, it looks like I need to include asm/nops.h explicitly for
UML.  New patch on the way.
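
For reference, the fix is just the obvious include at the top of
special_insns.h, which is what the v4 patch below carries:

 #ifdef __KERNEL__
 
+#include <asm/nops.h>
+
 static inline void native_clts(void)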

Thanks,
- Ross

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v4] x86: Add support for the pcommit instruction

2015-02-19 Thread Ross Zwisler
Add support for the new pcommit (persistent commit) instruction.
This instruction was announced in the document Intel
Architecture Instruction Set Extensions Programming Reference
with reference number 319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

The pcommit instruction ensures that data that has been flushed
from the processor's cache hierarchy with clwb, clflushopt or
clflush is accepted to memory and is durable on the DIMM.  The
primary use case for this is persistent memory.

This function shows how to properly use clwb/clflushopt/clflush
and pcommit with appropriate fencing:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
void *vend = vaddr + size - 1;

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/*
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works
 */
wmb();

/* pcommit and the required sfence for ordering */
pcommit_sfence();
}

After this function completes, the data pointed to by vaddr has
been accepted to memory and will be durable if vaddr points to
persistent memory.

Pcommit must always be ordered by an mfence or sfence, so to
help simplify things we include both the pcommit and the
required sfence in the alternatives generated by
pcommit_sfence().  The other option is to keep them separated,
but on platforms that don't support pcommit this would then turn
into:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
void *vend = vaddr + size - 1;

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/*
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works
 */
wmb();

nop(); /* from pcommit(), via alternatives */

/*
 * sfence to order pcommit
 * mfence via mb() also works
 */
wmb();
}

This is still correct, but now you've got two fences separated
by only a nop.  With the commit and the fence together in
pcommit_sfence() you avoid the final unneeded fence.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Acked-by: Borislav Petkov b...@suse.de
Acked-by: H. Peter Anvin h...@linux.intel.com
Cc: Linus Torvalds torva...@linux-foundation.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@kernel.org
---
 arch/x86/include/asm/cpufeature.h|  1 +
 arch/x86/include/asm/special_insns.h | 10 ++
 2 files changed, 11 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index aede2c3..af1d5f7 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -230,6 +230,7 @@
 #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX( 9*32+19) /* The ADCX and ADOX 
instructions */
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
+#define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
Reciprocal */
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index e820c08..0962501 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -4,6 +4,8 @@
 
 #ifdef __KERNEL__
 
+#include <asm/nops.h>
+
 static inline void native_clts(void)
 {
 	asm volatile("clts");
@@ -199,6 +201,14 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void pcommit_sfence(void)
+{
+	alternative(ASM_NOP7,
+		    ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
+		    "sfence",
+		    X86_FEATURE_PCOMMIT);
+}
+
 #define nop() asm volatile ("nop")
 
 
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/asm] x86: Add support for the pcommit instruction

2015-02-19 Thread Ross Zwisler
On Thu, 2015-02-19 at 18:33 +0100, Borislav Petkov wrote:
 On Thu, Feb 19, 2015 at 10:21:53AM -0700, Ross Zwisler wrote:
  Interesting, it looks like I need to include asm/nops.h explicitly for
  UML.  New patch on the way.
 
 You'd need to do an incremental fix ontop, though.

Oh, instead of just sending out a new patch that does the include?

Sorry, didn't see this before I sent out v4 of the patch that added the
include - Ingo, if you'd rather have a separate patch that adds the
include to fix the compile error, please let me know  I can send one
out.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 1/2] x86: Add support for the pcommit instruction

2015-01-28 Thread Ross Zwisler
On Wed, 2015-01-28 at 18:21 +0100, Borislav Petkov wrote:
 On Wed, Jan 28, 2015 at 05:10:46PM +, Elliott, Robert (Server Storage) 
 wrote:
  Should this patch series also add defines for the virtual 
  machine control data structure changes?
  
  1. Add the new VM-Execution Controls bit 21 as
  SECONDARY_EXEC_PCOMMIT_EXITING 0x00200000
  to arch/x86/include/asm/vmx.h.
  
  2. Add the new exit reason of 64 (0x41) as
  EXIT_REASON_PCOMMIT  64
  to arch/x86/include/uapi/asm/vmx.h and (with a
  VMX_EXIT_REASONS string) to usr/include/asm/vmx.h.
  
  3. Add a kvm_vmx_exit_handler to arch/x86/kvm/vmx.c.
 
 These look like a separate patchset for kvm enablement to me.

Agreed, I think they are a separate patch set.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 0/2] add support for new persistent memory instructions

2015-01-27 Thread Ross Zwisler
This patch set adds support for two new persistent memory instructions, pcommit
and clwb.  These instructions were announced in the document Intel
Architecture Instruction Set Extensions Programming Reference with reference
number 319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

These patches apply cleanly to v3.19-rc6.

Changes from v2:

 - Added instruction descriptions and flows to the patch descriptions.
 - Added needed sfence to pcommit alternatives assembly.  The inline function
   is now called pcommit_sfence().  If pcommit is not supported on the platform
   both the pcommit and the sfence will be nops.

Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Borislav Petkov b...@alien8.de

Ross Zwisler (2):
  x86: Add support for the pcommit instruction
  x86: Add support for the clwb instruction

 arch/x86/include/asm/cpufeature.h|  2 ++
 arch/x86/include/asm/special_insns.h | 22 ++
 2 files changed, 24 insertions(+)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/2] x86: Add support for the pcommit instruction

2015-01-27 Thread Ross Zwisler
Add support for the new pcommit (persistent commit) instruction.  This
instruction was announced in the document Intel Architecture
Instruction Set Extensions Programming Reference with reference number
319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

The pcommit instruction ensures that data that has been flushed from the
processor's cache hierarchy with clwb, clflushopt or clflush is accepted to
memory and is durable on the DIMM.  The primary use case for this is persistent
memory.

This function shows how to properly use clwb/clflushopt/clflush and
pcommit with appropriate fencing:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
void *vend = vaddr + size - 1;

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/*
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works
 */
wmb();

/* pcommit and the required sfence for ordering */
pcommit_sfence();
}

After this function completes, the data pointed to by vaddr has been
accepted to memory and will be durable if vaddr points to persistent
memory.

Pcommit must always be ordered by an mfence or sfence, so to help
simplify things we include both the pcommit and the required sfence in
the alternatives generated by pcommit_sfence().  The other option is to
keep them separated, but on platforms that don't support pcommit this
would then turn into:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
void *vend = vaddr + size - 1;

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/*
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works
 */
wmb();

nop(); /* from pcommit(), via alternatives */

/*
 * sfence to order pcommit
 * mfence via mb() also works
 */
wmb();
}

This is still correct, but now you've got two fences separated by only a
nop.  With the commit and the fence together in pcommit_sfence() you
avoid the final unneeded fence.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Borislav Petkov b...@alien8.de
---
 arch/x86/include/asm/cpufeature.h| 1 +
 arch/x86/include/asm/special_insns.h | 8 
 2 files changed, 9 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index bb9b258..dfdd689 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -220,6 +220,7 @@
 #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX( 9*32+19) /* The ADCX and ADOX 
instructions */
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
+#define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
Reciprocal */
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index e820c08..d686f9b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -199,6 +199,14 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void pcommit_sfence(void)
+{
+	alternative(ASM_NOP7,
+		    ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
+		    "sfence",
+		    X86_FEATURE_PCOMMIT);
+}
+
 #define nop() asm volatile ("nop")
 
 
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 2/2] x86: Add support for the clwb instruction

2015-01-27 Thread Ross Zwisler
Add support for the new clwb (cache line write back) instruction.  This
instruction was announced in the document Intel Architecture
Instruction Set Extensions Programming Reference with reference number
319433-022.

https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

The clwb instruction is used to write back the contents of dirtied cache
lines to memory without evicting the cache lines from the processor's
cache hierarchy.  This should be used in favor of clflushopt or clflush
in cases where you require the cache line to be written to memory but
plan to access the data again in the near future.

One of the main use cases for this is with persistent memory where clwb
can be used with pcommit to ensure that data has been accepted to memory
and is durable on the DIMM.

This function shows how to properly use clwb/clflushopt/clflush and
pcommit with appropriate fencing:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
void *vend = vaddr + size - 1;

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
clwb(vaddr);

/* Flush any possible final partial cacheline */
clwb(vend);

/*
 * sfence to order clwb/clflushopt/clflush cache flushes
 * mfence via mb() also works
 */
wmb();

/* pcommit and the required sfence for ordering */
pcommit_sfence();
}

After this function completes, the data pointed to by vaddr has been
accepted to memory and will be durable if vaddr points to persistent
memory.

Regarding the details of how the alternatives assembly is set up, we
need one additional byte at the beginning of the clflush so that we can
flip it into a clflushopt by changing that byte into a 0x66 prefix.  Two
options are to either insert a 1 byte ASM_NOP1, or to add a 1 byte
NOP_DS_PREFIX.  Both have no functional effect with the plain clflush,
but I've been told that executing a clflush + prefix should be faster
than executing a clflush + NOP.

We had to hard code the assembly for clwb because, lacking the ability
to assemble the clwb instruction itself, the next closest thing is to
have an xsaveopt instruction with a 0x66 prefix.  Unfortunately xsaveopt
itself is also relatively new, and isn't included by all the GCC
versions that the kernel needs to support.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Borislav Petkov b...@alien8.de
---
 arch/x86/include/asm/cpufeature.h|  1 +
 arch/x86/include/asm/special_insns.h | 14 ++
 2 files changed, 15 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index dfdd689..dc91747 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -222,6 +222,7 @@
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
 #define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB   ( 9*32+24) /* CLWB instruction */
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and 
Reciprocal */
 #define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index d686f9b..0772365 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -199,6 +199,20 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void clwb(volatile void *__p)
+{
+	volatile struct { char x[64]; } *p = __p;
+
+	asm volatile(ALTERNATIVE_2(
+		".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
+		".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
+		X86_FEATURE_CLFLUSHOPT,
+		".byte 0x66, 0x0f, 0xae, 0x30",  /* clwb (%%rax) */
+		X86_FEATURE_CLWB)
+		: [p] "+m" (*p)
+		: [pax] "a" (p));
+}
+
 static inline void pcommit_sfence(void)
 {
alternative(ASM_NOP7,
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 0/2] add support for new persistent memory instructions

2015-01-26 Thread Ross Zwisler
On Mon, 2015-01-26 at 23:39 +0100, Borislav Petkov wrote:
 On Mon, Jan 26, 2015 at 02:50:07PM -0700, Ross Zwisler wrote:
  We can use MFENCE, but I don't think we need to. With SFENCE we will
  be ordered with respect to stores, and the flushes and pcommit will be
  ordered with respect to one another. I think you can sprinkle in loads
  anywhere you want in that flow and everything will work.
 
 
 Ok, maybe we should hold down that sequence and this somewhere as a
 this-is-how-you-should-do-pcommit-properly explanation or so... or do
 you have an actual patch adding write_and_commit_buffer()?

I don't have a patch to add and use this yet.  I'll add the flow to the
commit messages for both pcommit and clwb.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v12 18/20] dax: Add dax_zero_page_range

2015-01-12 Thread Ross Zwisler
On Mon, 2015-01-12 at 15:10 -0800, Andrew Morton wrote:
 On Fri, 24 Oct 2014 17:20:50 -0400 Matthew Wilcox 
 matthew.r.wil...@intel.com wrote:
 
  Signed-off-by: Matthew Wilcox matthew.r.wil...@intel.com
  [ported to 3.13-rc2]
  Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
 
 I never know what this means :(
 
 I switched it to 
 
 [ross.zwis...@linux.intel.com: ported to 3.13-rc2]
 Signed-off-by: Matthew Wilcox matthew.r.wil...@intel.com
 Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com

The way that you've interpreted it is correct.  Thanks!

- Ross

 but perhaps that was wrong?
 
 
 
 
 also, coupla typos:
 
 
 diff -puN fs/dax.c~dax-add-dax_zero_page_range-fix fs/dax.c
 --- a/fs/dax.c~dax-add-dax_zero_page_range-fix
 +++ a/fs/dax.c
 @@ -475,7 +475,7 @@ EXPORT_SYMBOL_GPL(dax_fault);
   * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
   * took care of disposing of the unnecessary blocks.  Even if the filesystem
   * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 - * since the file might be mmaped.
 + * since the file might be mmapped.
   */
  int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
   get_block_t get_block)
 @@ -514,13 +514,13 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
   * @get_block: The filesystem method used to translate file offsets to blocks
   *
   * Similar to block_truncate_page(), this function can be called by a
 - * filesystem when it is truncating an DAX file to handle the partial page.
 + * filesystem when it is truncating a DAX file to handle the partial page.
   *
   * We work in terms of PAGE_CACHE_SIZE here for commonality with
   * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
   * took care of disposing of the unnecessary blocks.  Even if the filesystem
   * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 - * since the file might be mmaped.
 + * since the file might be mmapped.
   */
  int dax_truncate_page(struct inode *inode, loff_t from, get_block_t 
 get_block)
  {
 diff -puN include/linux/fs.h~dax-add-dax_zero_page_range-fix 
 include/linux/fs.h
 _
 
 
 akpm3:/usr/src/linux-3.19-rc4 grep -r mmaped .| wc -l
 70
 akpm3:/usr/src/linux-3.19-rc4 grep -r mmapped .| wc -l 
 107
 
 lol.



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] drm/radeon: Fix regression with suspend/resume

2015-02-13 Thread Ross Zwisler
On Fri, 2015-02-13 at 11:41 +0900, Michel Dänzer wrote:
 On 13.02.2015 05:30, Ross Zwisler wrote:
  This patch reverts the changes made in this commit:
  
  deadcb36f49b ("drm/radeon: Use two-ended allocation by size, v2")
  
  That patch caused a regression on my system where the bottom of the
  screen flickers after my laptop goes through a suspend and resume.
 
 What kind of flicker is it? E.g. does it only affect X or also console,
 does it flicker all the time or only when there is activity, what does
 it look like, ...

It's kind of hard to describe it precisely, so I made a video.  :)

http://youtu.be/ESm9SMnr0do

It only affects X, not the console, and it seems to go away if you log
out back to the login manager (I'm using GDM on Fedora 20) and back into
your window manager.

I've tested with OpenBox, Gnome and KDE, and it happens in all three, so
it doesn't appear to be related to the window manager.

It does appear to flicker more if you move the mouse, but it still does
flicker occasionally if you aren't doing anything.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 6/6] pmem: Let each device manage private memory region

2015-03-16 Thread Ross Zwisler
From: Boaz Harrosh b...@plexistor.com

This patch removes any global memory information. And lets
each pmem-device manage it's own memory region.

pmem_alloc() Now receives phys_addr and disk_size and will
map that region, also pmem_free will do the unmaping.

This is so we can support multiple discontinuous memory regions
in the next patch

Signed-off-by: Boaz Harrosh b...@plexistor.com
Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: linux-nvd...@lists.01.org
Cc: linux-fsde...@vger.kernel.org
Cc: ax...@kernel.dk
Cc: h...@infradead.org
Cc: r...@redhat.com
---
 drivers/block/pmem.c | 122 +++
 1 file changed, 75 insertions(+), 47 deletions(-)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 8f39ef4..1bd9ab0 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -30,19 +30,12 @@
 #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
 #define PAGE_SECTORS	(1 << PAGE_SECTORS_SHIFT)
 
-/*
- * driver-wide physical address and total_size - one single, contiguous memory
- * region that we divide up in to same-sized devices
- */
-phys_addr_t		phys_addr;
-void			*virt_addr;
-size_t			total_size;
-
 struct pmem_device {
 	struct request_queue	*pmem_queue;
 	struct gendisk		*pmem_disk;
 	struct list_head	pmem_list;
 
+	/* One contiguous memory region per device */
 	phys_addr_t		phys_addr;
 	void			*virt_addr;
 	size_t			size;
@@ -237,33 +230,80 @@ MODULE_PARM_DESC(pmem_count, Number of pmem devices to 
evenly split allocated s
 static LIST_HEAD(pmem_devices);
 static int pmem_major;
 
-/* FIXME: move phys_addr, virt_addr, size calls up to caller */
-static struct pmem_device *pmem_alloc(int i)
+/* pmem->phys_addr and pmem->size need to be set.
+ * Will then set virt_addr if successful.
+ */
+int pmem_mapmem(struct pmem_device *pmem)
+{
+   struct resource *res_mem;
+   int err;
+
+	res_mem = request_mem_region_exclusive(pmem->phys_addr, pmem->size,
+					       "pmem");
+	if (!res_mem) {
+		pr_warn("pmem: request_mem_region_exclusive phys=0x%llx size=0x%zx failed\n",
+			pmem->phys_addr, pmem->size);
+		return -EINVAL;
+	}
+
+	pmem->virt_addr = ioremap_cache(pmem->phys_addr, pmem->size);
+	if (unlikely(!pmem->virt_addr)) {
+   err = -ENXIO;
+   goto out_release;
+   }
+   return 0;
+
+out_release:
+	release_mem_region(pmem->phys_addr, pmem->size);
+   return err;
+}
+
+void pmem_unmapmem(struct pmem_device *pmem)
+{
+	if (unlikely(!pmem->virt_addr))
+		return;
+
+	iounmap(pmem->virt_addr);
+	release_mem_region(pmem->phys_addr, pmem->size);
+	pmem->virt_addr = NULL;
+}
+
+static struct pmem_device *pmem_alloc(phys_addr_t phys_addr, size_t disk_size,
+ int i)
 {
struct pmem_device *pmem;
struct gendisk *disk;
-   size_t disk_size = total_size / pmem_count;
-   size_t disk_sectors = disk_size / 512;
+   int err;
 
pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
-   if (!pmem)
+   if (unlikely(!pmem)) {
+   err = -ENOMEM;
goto out;
+   }
 
-	pmem->phys_addr = phys_addr + i * disk_size;
-	pmem->virt_addr = virt_addr + i * disk_size;
+	pmem->phys_addr = phys_addr;
 	pmem->size = disk_size;
 
-	pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
-	if (!pmem->pmem_queue)
+   err = pmem_mapmem(pmem);
+   if (unlikely(err))
goto out_free_dev;
 
+	pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
+	if (unlikely(!pmem->pmem_queue)) {
+   err = -ENOMEM;
+   goto out_unmap;
+   }
+
 	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
 	blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
 	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
 
-	disk = pmem->pmem_disk = alloc_disk(0);
-   if (!disk)
+   disk = alloc_disk(0);
+   if (unlikely(!disk)) {
+   err = -ENOMEM;
goto out_free_queue;
+   }
+
 	disk->major		= pmem_major;
 	disk->first_minor	= 0;
 	disk->fops		= &pmem_fops;
@@ -271,22 +311,26 @@ static struct pmem_device *pmem_alloc(int i)
 	disk->queue		= pmem->pmem_queue;
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	sprintf(disk->disk_name, "pmem%d", i);
-	set_capacity(disk, disk_sectors);
+	set_capacity(disk, disk_size >> SECTOR_SHIFT);
+	pmem->pmem_disk = disk;
 
return pmem;
 
 out_free_queue:
 	blk_cleanup_queue(pmem->pmem_queue);
+out_unmap:
+	pmem_unmapmem(pmem);
 out_free_dev:
kfree(pmem);
 out:
-   return NULL;
+   return ERR_PTR(err

[PATCH 0/6] Add persistent memory driver

2015-03-16 Thread Ross Zwisler
PMEM is a modified version of the Block RAM Driver, BRD. The major difference
is that BRD allocates its backing store pages from the page cache, whereas
PMEM uses reserved memory that has been ioremapped.

One benefit of this approach is that there is a direct mapping between
filesystem block numbers and virtual addresses.  In PMEM, filesystem blocks N,
N+1, N+2, etc. will all be adjacent in the virtual memory space. This property
allows us to set up PMD mappings (2 MiB) for DAX.
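
To make that concrete, the sector-to-address translation is nothing more
than a fixed linear offset from the ioremapped base.  The helper below is
only an illustration (the driver's real lookup, pmem_lookup_pg_addr(), does
the same thing):

/*
 * Illustration only: adjacent sectors map to adjacent virtual addresses,
 * which is what lets DAX cover a whole 2 MiB extent with a single PMD.
 */
static void *pmem_sector_to_vaddr(struct pmem_device *pmem, sector_t sector)
{
	return pmem->virt_addr + ((size_t)sector << SECTOR_SHIFT);
}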

This patch set builds upon the work that Matthew Wilcox has been doing for
DAX, which has been merged into the v4.0 kernel series.

For more information on PMEM and for some instructions on how to use it, please
check out PMEM's github tree:

https://github.com/01org/prd

Cc: linux-nvd...@lists.01.org
Cc: linux-fsde...@vger.kernel.org
Cc: ax...@kernel.dk
Cc: h...@infradead.org
Cc: r...@redhat.com

Boaz Harrosh (1):
  pmem: Let each device manage private memory region

Ross Zwisler (5):
  pmem: Initial version of persistent memory driver
  pmem: Add support for getgeo()
  pmem: Add support for rw_page()
  pmem: Add support for direct_access()
  pmem: Clean up includes

 MAINTAINERS|   6 +
 drivers/block/Kconfig  |  41 +
 drivers/block/Makefile |   1 +
 drivers/block/pmem.c   | 401 +
 4 files changed, 449 insertions(+)
 create mode 100644 drivers/block/pmem.c

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/6] pmem: Clean up includes

2015-03-16 Thread Ross Zwisler
Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: linux-nvd...@lists.01.org
Cc: linux-fsde...@vger.kernel.org
Cc: ax...@kernel.dk
Cc: h...@infradead.org
Cc: r...@redhat.com
---
 drivers/block/pmem.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index d63bc96..8f39ef4 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -16,17 +16,15 @@
  * Copyright (C) 2007 Novell Inc.
  */
 
+#include <asm/cacheflush.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/hdreg.h>
-#include <linux/highmem.h>
 #include <linux/init.h>
-#include <linux/major.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
-#include <linux/uaccess.h>
 
 #define SECTOR_SHIFT   9
 #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/6] pmem: Add support for getgeo()

2015-03-16 Thread Ross Zwisler
Some programs require HDIO_GETGEO work, which requires we implement
getgeo.  Based off of the work done to the NVMe driver in this commit:

commit 4cc09e2dc4cb ("NVMe: Add getgeo to block ops")

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: linux-nvd...@lists.01.org
Cc: linux-fsde...@vger.kernel.org
Cc: ax...@kernel.dk
Cc: h...@infradead.org
Cc: r...@redhat.com
---
 drivers/block/pmem.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index d366b9b..60bbe0d 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -50,6 +50,15 @@ struct pmem_device {
size_t  size;
 };
 
+static int pmem_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+   /* some standard values */
+	geo->heads = 1 << 6;
+	geo->sectors = 1 << 5;
+	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+   return 0;
+}
+
 /*
  * direct translation from (pmem,sector) => void*
  * We do not require that sector be page aligned.
@@ -176,6 +185,7 @@ out:
 
 static const struct block_device_operations pmem_fops = {
.owner =THIS_MODULE,
+   .getgeo =   pmem_getgeo,
 };
 
 /* Kernel module stuff */
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/6] pmem: Initial version of persistent memory driver

2015-03-16 Thread Ross Zwisler
PMEM is a new driver that presents a reserved range of memory as a
block device.  This is useful for developing with NV-DIMMs, and
can be used with volatile memory as a development platform.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: linux-nvd...@lists.01.org
Cc: linux-fsde...@vger.kernel.org
Cc: ax...@kernel.dk
Cc: h...@infradead.org
Cc: r...@redhat.com
---
 MAINTAINERS|   6 +
 drivers/block/Kconfig  |  41 ++
 drivers/block/Makefile |   1 +
 drivers/block/pmem.c   | 330 +
 4 files changed, 378 insertions(+)
 create mode 100644 drivers/block/pmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 6239a30..9414b42 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8052,6 +8052,12 @@ S:   Maintained
 F: Documentation/blockdev/ramdisk.txt
 F: drivers/block/brd.c
 
+PERSISTENT MEMORY DRIVER
+M: Ross Zwisler ross.zwis...@linux.intel.com
+L: linux-nvd...@lists.01.org
+S: Supported
+F: drivers/block/pmem.c
+
 RANDOM NUMBER DRIVER
 M: Theodore Ts'o ty...@mit.edu
 S: Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..ac52f5a 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,6 +404,47 @@ config BLK_DEV_RAM_DAX
  and will prevent RAM block device backing store memory from being
  allocated from highmem (only a problem for highmem systems).
 
+config BLK_DEV_PMEM
+	tristate "Persistent memory block device support"
+   help
+ Saying Y here will allow you to use a contiguous range of reserved
+ memory as one or more block devices.  Memory for PMEM should be
+ reserved using the memmap kernel parameter.
+
+ To compile this driver as a module, choose M here: the module will be
+ called pmem.
+
+ Most normal users won't need this functionality, and can thus say N
+ here.
+
+config BLK_DEV_PMEM_START
+	int "Offset in GiB of where to start claiming space"
+   default 0
+   depends on BLK_DEV_PMEM
+   help
+	  Starting offset in GiB that PMEM should use when claiming memory.  This
+	  memory needs to be reserved from the OS at boot time using the
+ memmap kernel parameter.
+
+ If you provide PMEM with volatile memory it will act as a volatile
+ RAM disk and your data will not be persistent.
+
+config BLK_DEV_PMEM_COUNT
+	int "Default number of PMEM disks"
+   default 4
+   depends on BLK_DEV_PMEM
+   help
+ Number of equal sized block devices that PMEM should create.
+
+config BLK_DEV_PMEM_SIZE
+	int "Size in GiB of space to claim"
+   depends on BLK_DEV_PMEM
+   default 0
+   help
+ Amount of memory in GiB that PMEM should use when creating block
+ devices.  This memory needs to be reserved from the OS at
+ boot time using the memmap kernel parameter.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..9cc6c18 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)+= ps3vram.o
 obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)  += z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)  += brd.o
+obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o
 obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
 obj-$(CONFIG_BLK_CPQ_DA)   += cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
new file mode 100644
index 000..d366b9b
--- /dev/null
+++ b/drivers/block/pmem.c
@@ -0,0 +1,330 @@
+/*
+ * Persistent Memory Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/brd.c.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#define SECTOR_SHIFT   9
+#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS	(1 << PAGE_SECTORS_SHIFT)
+
+/*
+ * driver-wide physical address and total_size - one single, contiguous memory
+ * region that we divide up in to same-sized devices
+ */
+phys_addr_t		phys_addr;
+void

[PATCH 3/6] pmem: Add support for rw_page()

2015-03-16 Thread Ross Zwisler
Based on commit a72132c31d58 (brd: add support for rw_page())

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: linux-nvd...@lists.01.org
Cc: linux-fsde...@vger.kernel.org
Cc: ax...@kernel.dk
Cc: h...@infradead.org
Cc: r...@redhat.com
---
 drivers/block/pmem.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 60bbe0d..0be3669 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -183,8 +183,19 @@ out:
bio_endio(bio, err);
 }
 
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+  struct page *page, int rw)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+   pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	page_endio(page, rw & WRITE, 0);
+   return 0;
+}
+
 static const struct block_device_operations pmem_fops = {
.owner =THIS_MODULE,
+   .rw_page =  pmem_rw_page,
.getgeo =   pmem_getgeo,
 };
 
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/6] pmem: Add support for direct_access()

2015-03-16 Thread Ross Zwisler
Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: linux-nvd...@lists.01.org
Cc: linux-fsde...@vger.kernel.org
Cc: ax...@kernel.dk
Cc: h...@infradead.org
Cc: r...@redhat.com
---
 drivers/block/pmem.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 0be3669..d63bc96 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -74,6 +74,15 @@ static void *pmem_lookup_pg_addr(struct pmem_device *pmem, sector_t sector)
 	return pmem->virt_addr + offset;
 }
 
+/* sector must be page aligned */
+static unsigned long pmem_lookup_pfn(struct pmem_device *pmem, sector_t sector)
+{
+	size_t page_offset = sector >> PAGE_SECTORS_SHIFT;
+
+	BUG_ON(sector & (PAGE_SECTORS - 1));
+	return (pmem->phys_addr >> PAGE_SHIFT) + page_offset;
+}
+
 /*
  * sector is not required to be page aligned.
  * n is at most a single page, but could be less.
@@ -193,9 +202,24 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
return 0;
 }
 
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+   if (!pmem)
+   return -ENODEV;
+
+   *kaddr = pmem_lookup_pg_addr(pmem, sector);
+   *pfn = pmem_lookup_pfn(pmem, sector);
+
+	return pmem->size - (sector * 512);
+}
+
 static const struct block_device_operations pmem_fops = {
.owner =THIS_MODULE,
.rw_page =  pmem_rw_page,
+   .direct_access =pmem_direct_access,
.getgeo =   pmem_getgeo,
 };
 
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] pmem: Initial version of persistent memory driver

2015-03-18 Thread Ross Zwisler
On Thu, 2015-03-05 at 13:55 +0200, Boaz Harrosh wrote:
 From: Ross Zwisler ross.zwis...@linux.intel.com
 
 PMEM is a new driver That supports any physical contiguous iomem range
 as a single block device. The driver has support for as many as needed
 iomem ranges each as its own device.
 
 The driver is not only good for NvDIMMs, It is good for any flat memory
 mapped device. We've used it with NvDIMMs, Kernel reserved DRAM
 (memmap= on command line), PCIE Battery backed memory cards, VM shared
 memory, and so on.
 
 The API to pmem module a single string parameter named map
 of the form:
map=mapS[,mapS...]
 
where mapS=nn[KMG]$ss[KMG],
ormapS=nn[KMG]@ss[KMG],
 
nn=size, ss=offset
 
 Just like the Kernel command line map  memmap parameters,
 so anything you did at grub just copy/paste to here.
 
 The @ form is exactly the same as the $ form only that
 at bash prompt we need to escape the $ with \$ so also
 support the '@' char for convenience.
 
 For each specified mapS there will be a device created.
 
 [This is the accumulated version of the driver developed by
  multiple programmers. To see the real history of these
  patches see:
   git://git.open-osd.org/pmem.git
   https://github.com/01org/prd
  This patch is based on (git://git.open-osd.org/pmem.git):
   [5ccf703] SQUASHME: Don't clobber the map module param
 
 list-of-changes
 [boaz]
 SQUASHME: pmem: Remove unused #include headers
 SQUASHME: pmem: Request from fdisk 4k alignment
 SQUASHME: pmem: Let each device manage private memory region
 SQUASHME: pmem: Support of multiple memory regions
 SQUASHME: pmem: Micro optimization the hotpath 001
 SQUASHME: pmem: no need to copy a page at a time
 SQUASHME: pmem that 4k sector thing
 SQUASHME: pmem: Cleanliness is neat
 SQUASHME: Don't clobber the map module param
 SQUASHME: pmem: Few changes to Initial version of pmem
 SQUASHME: Changes to copyright text (trivial)
 /list-of-changes
 
 TODO: Add Documentation/blockdev/pmem.txt
 
 Need-signed-by: Ross Zwisler ross.zwis...@linux.intel.com
 Signed-off-by: Boaz Harrosh b...@plexistor.com

I wrote the initial version of the PMEM driver (then called PRD for Persistent
RAM Driver) in late 2013/early 2014, and posted it on GitHub.  Here's a link
to my first version:

https://github.com/01org/prd/tree/prd_3.13

Matthew Wilcox pointed Boaz to it in June of 2014, and he cloned my tree and
went off and made a bunch of changes.  A few of those changes he sent back to
me, like the one I included in the patch series I recently sent for upstream
inclusion:

https://lkml.org/lkml/2015/3/16/1102

Many of the changes he did not submit back to me for review or inclusion in my
tree.

With the first patch in this series Boaz is squashing all of our changes
together, adding his copyright and trying to install himself as maintainer.  I
believe this to be unacceptable.  

Boaz, if you have contributions that you would like to make to PMEM, please
submit them to our mailing list (linux-nvd...@lists.01.org) and we will be
happy to review them.  But please don't try and steal control of my driver.

- Ross


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] brd: Ensure that bio_vecs have size <= PAGE_SIZE

2015-03-11 Thread Ross Zwisler
The functions copy_from_brd() and copy_to_brd() are written with an
assumption that the bio_vec they are given has size <= PAGE_SIZE.  This
assumption is not enforced in any way, and if the bio_vec has size
larger than PAGE_SIZE data will just be lost.

Such a situation can occur with I/Os generated from in-kernel sources,
or with coalesced bio_vecs.  This bug was originally reported against
the pmem driver, where it was found using the Enmotus tiering engine.

Instead we should have brd explicitly tell the block layer that it can
handle data segments of at most PAGE_SIZE.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Reported-by: Hugh Daschbach hugh.daschb...@enmotus.com
Cc: Roger C. Pao (Enmotus) rcpao.enmo...@gmail.com
Cc: Boaz Harrosh b...@plexistor.com
Cc: linux-nvd...@lists.01.org
Cc: Nick Piggin npig...@kernel.dk
---
 drivers/block/brd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 898b4f256782..7e4873361b64 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -490,6 +490,7 @@ static struct brd_device *brd_alloc(int i)
	blk_queue_make_request(brd->brd_queue, brd_make_request);
	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
+	blk_queue_max_segment_size(brd->brd_queue, PAGE_SIZE);
 
	brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
	brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86: improve algorithm in clflush_cache_range

2015-03-11 Thread Ross Zwisler
The current algorithm used in clflush_cache_range() can cause the last
cache line of the buffer to be flushed twice.

Fix that algorithm so that each cache line will only be flushed once,
and remove arithmetic on void pointers.  Void pointer arithmetic is
allowed by GCC extensions, but isn't part of the base C standards.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Reported-by: H. Peter Anvin h...@zytor.com
Cc: H. Peter Anvin h...@zytor.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: x...@kernel.org
Cc: Dan Williams dan.j.willi...@intel.com
Cc: Borislav Petkov b...@suse.de
---
 arch/x86/mm/pageattr.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index ae242a7c11c7..b75ecac859f2 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -131,16 +131,15 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
-   void *vend = vaddr + size - 1;
+   unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+   char *vend = (char *)vaddr + size;
+   char *p;
 
mb();
 
-	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
-   clflushopt(vaddr);
-   /*
-* Flush any possible final partial cacheline:
-*/
-   clflushopt(vend);
+	for (p = (char *)((unsigned long)vaddr & ~clflush_mask);
+	     p < vend; p += boot_cpu_data.x86_clflush_size)
+   clflushopt(p);
 
mb();
 }
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86: Add kerneldoc for pcommit_sfence()

2015-03-11 Thread Ross Zwisler
Add kerneldoc comments for pcommit_sfence() describing the purpose of
the pcommit instruction and demonstrating the usage of that instruction.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Borislav Petkov b...@alien8.de
---
 arch/x86/include/asm/special_insns.h | 37 
 1 file changed, 37 insertions(+)

diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index aeb4666e0c0a..1ae81757c05b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -215,6 +215,43 @@ static inline void clwb(volatile void *__p)
	: [pax] "a" (p));
 }
 
+/**
+ * pcommit_sfence() - persistent commit and fence
+ *
+ * The pcommit instruction ensures that data that has been flushed from the
+ * processor's cache hierarchy with clwb, clflushopt or clflush is accepted to
+ * memory and is durable on the DIMM.  The primary use case for this is
+ * persistent memory.
+ *
+ * This function shows how to properly use clwb/clflushopt/clflush and pcommit
+ * with appropriate fencing:
+ *
+ * void flush_and_commit_buffer(void *vaddr, unsigned int size)
+ * {
+ * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ * char *vend = (char *)vaddr + size;
+ * char *p;
+ *
+ * for (p = (char *)((unsigned long)vaddr & ~clflush_mask);
+ *      p < vend; p += boot_cpu_data.x86_clflush_size)
+ * clwb(p);
+ *
+ * // sfence to order clwb/clflushopt/clflush cache flushes
+ * // mfence via mb() also works
+ * wmb();
+ *
+ * // pcommit and the required sfence for ordering
+ * pcommit_sfence();
+ * }
+ *
+ * After this function completes the data pointed to by vaddr has been
+ * accepted to memory and will be durable if the vaddr points to persistent
+ * memory.
+ *
+ * Pcommit must always be ordered by an mfence or sfence, so to help simplify
+ * things we include both the pcommit and the required sfence in the
+ * alternatives generated by pcommit_sfence().
+ */
 static inline void pcommit_sfence(void)
 {
alternative(ASM_NOP7,
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] brd: Ensure that bio_vecs have size <= PAGE_SIZE

2015-03-11 Thread Ross Zwisler
On Wed, 2015-03-11 at 19:17 +0200, Boaz Harrosh wrote:
 On 03/11/2015 07:02 PM, Ross Zwisler wrote:
  The functions copy_from_brd() and copy_to_brd() are written with an
  assumption that the bio_vec they are given has size <= PAGE_SIZE.  This
  assumption is not enforced in any way, and if the bio_vec has size
  larger than PAGE_SIZE data will just be lost.
  
  Such a situation can occur with I/Os generated from in-kernel sources,
  or with coalesced bio_vecs.  
 
 I wish you could show me where in Kernel this can happen.
 who coalesced bio_vecs ? what Kernel sources generate bio->b_size > 
 PAGE_SIZE ?
 I did try to look and could not find any. Sorry for my slowness.

In truth I'm not certain I know of a place either. :)  In part I'm quoting the
original bug report:

https://lists.01.org/pipermail/linux-nvdimm/2015-February/79.html

The pertinent lines, in case you don't want to follow the link:

 The biovec can present a size greater than PAGE_SIZE if an I/O buffer
  contains physically contiguous pages.  This may be unusual for user space
  pages, as the virtual to physical memory map gets fragmented.  But for
  buffers allocated by the kernel with kmalloc, physical memory will be
  contiguous.

  Even if a single I/O request does not contain two contiguous pages, the
  block layer may merge two requests that have contiguous pages.  It will then
  attempt to coalesce biovecs.  You probably won't see that if you avoid the
  I/O scheduler by capturing requests at make_request.  But it is still a good
  idea to declare your devices segment size limitation with
  blk_queue_max_segment_size.  There are a couple drivers in drivers/block/
  that do just that to limit segments to PAGE_SIZE. 

I wandered around a bit in the block code and I *think* that bvec coalescing
happens via the merge_bvec_fn() function pointers.  DM, for instance, sets
this to dm_merge_bvec() via the blk_queue_merge_bvec() function.  After that
it gets into lots of DM code.

 In fact I know of a couple of places that would break if this is true

Yep, PMEM and BRD both currently break because of this.

  This bug was originally reported against
  the pmem driver, where it was found using the Enmotus tiering engine.
 
 This out-of-tree driver - non-GPL, with no source code - is the first I have
 heard of this.

It was hidden in the original bug report.  Same link as above, and here are
the relevant lines:

 We caught this because the Enmotus tiering engine issues rather large I/O
  requests to buffers that were allocated with kmalloc.  It is fairly common
  for the tiering engine to allocate I/O buffers of 64KB or greater.  If the
  underlying block device supports it, we will submit a bio with a biovec
  mapping many contiguous pages.  The entire buffer will possibly be mapped by
  a single biovec.  The tiering engine uses max_segment_size to determine how
  to build its biovec list. 

I've never used it or heard of it before this either.

  Instead we should have brd explicitly tell the block layer that it can
  handle data segments of at most PAGE_SIZE.
  
  Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
  Reported-by: Hugh Daschbach hugh.daschb...@enmotus.com
  Cc: Roger C. Pao (Enmotus) rcpao.enmo...@gmail.com
  Cc: Boaz Harrosh b...@plexistor.com
  Cc: linux-nvd...@lists.01.org
  Cc: Nick Piggin npig...@kernel.dk
  ---
   drivers/block/brd.c | 1 +
   1 file changed, 1 insertion(+)
  
  diff --git a/drivers/block/brd.c b/drivers/block/brd.c
  index 898b4f256782..7e4873361b64 100644
  --- a/drivers/block/brd.c
  +++ b/drivers/block/brd.c
  @@ -490,6 +490,7 @@ static struct brd_device *brd_alloc(int i)
  blk_queue_make_request(brd->brd_queue, brd_make_request);
  blk_queue_max_hw_sectors(brd->brd_queue, 1024);
  blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
  +	blk_queue_max_segment_size(brd->brd_queue, PAGE_SIZE);
 
 The only place that I can find that uses _max_segment_size is
 when translating a bio list to an sg_list, where physical segments
 may coalesce. I have never seen it at the bio level
 
   
  brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
  brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
  
 
 Cheers
 Boaz

Anyway, I thought your response to the original bug report against PMEM was
that you were alright with this one line change since it didn't hurt anything,
and perhaps it helped someone.  Do you have the same stance for BRD, or do you
think we need to track down if or how bio_vecs can make it to the driver with
more than one page of data first?

- Ross


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/8] mm: Let sparse_{add,remove}_one_section receive a node_id

2015-03-06 Thread Ross Zwisler
On Thu, 2015-03-05 at 13:58 +0200, Boaz Harrosh wrote:
 From: Yigal Korman yi...@plexistor.com
 
 Refactored the arguments of sparse_add_one_section / sparse_remove_one_section
 to use node id instead of struct zone * - A memory section has no direct
 connection to zones, all that was needed from zone was the node id.
 
 This is for add_persistent_memory that will want a section of pages
 allocated but without any zone associated. This is because belonging
 to a zone will give the memory to the page allocators, but
 persistent_memory belongs to a block device, and is not available for
 regular volatile usage.
 
 Signed-off-by: Yigal Korman yi...@plexistor.com
 Signed-off-by: Boaz Harrosh b...@plexistor.com
 ---
  include/linux/memory_hotplug.h | 4 ++--
  mm/memory_hotplug.c| 4 ++--
  mm/sparse.c| 9 +
  3 files changed, 9 insertions(+), 8 deletions(-)

For both of the MM patches in this series (this one, 5/8, and the next one,
6/8), please be sure and CC the MM folks.  I know that Dave Hansen had
feedback on an earlier version of this patch:

https://lkml.org/lkml/2014/9/9/742

We need to make sure that feedback is addressed.

Thanks,
- Ross


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 0/8] pmem: Submission of the Persistent memory block device

2015-03-06 Thread Ross Zwisler
On Thu, 2015-03-05 at 12:32 +0200, Boaz Harrosh wrote:
 There are already NvDIMMs and other Persistent-memory devices in the market, 
 and
 lots more of them will be coming in near future.
 
 Current stack is coming along very nicely, and filesystem support for 
 leveraging these
 technologies has been submitted to Linus in the DAX series by Matthew Wilcox.
 
 The general stack does not change:
   block-device
   partition
   file-system
   application file
 
 The only extra care, see Matthew's DAX patches, is the ->direct_access() API 
 from
 block devices that enables a direct mapping from Persistent-memory to user 
 application
 and/or Kernel for direct store/load of data.
 
 The only missing piece is the actual block device that enables support
 for such NvDIMM chips. This is the driver we submit here.
 
 The driver is very simple, in fact it is the 2nd smallest driver inside 
 drivers/block
 What the driver does is support a physical contiguous iomem range as a single 
 block
 device. The driver has support for as many as needed iomem ranges each as its 
 own device.
 (See patch-1 for more details)
 
 We are using this driver for over a year now, in a lab with a combination of VMs 
 and real
 hardware, with a variety of hardware and vendors, and it is very stable. 
 Actually why
 not - it is so simple it does almost nothing.
 
 The driver is not only good for NvDIMMs, It is good for any flat memory mapped
 device. We've used it with NvDIMMs, Kernel reserved DRAM (memmap= on command 
 line),
 PCIE Battery backed memory cards, VM shared memory, and so on.
 
 Together with this driver also submitted support for page-struct with
 Persistent-memory, so Persistent-memory can be used with RDMA, DMA, 
 block-devices
 and so on, just as regular memory, in a copy-less manner.
 With the use of these two simple patches, we were able to set up an RDMA 
 target
 machine which exports NvDIMMs and enables direct remote storage. The only
 complicated thing was the remote flush of caches because most RDMA NICs in
 Kernel will RDMA directly to L3 cache, so we needed to establish a message 
 that
 involves the remote CPU for this. But otherwise the mapping of pmem pointer
 to an RDMA key was trivial, directly from user-mode, with no extra Kernel 
 code.
 [The target is simple with no extra code, the RDMA client on the other hand 
 needs
  a special driver]
 
 I maintain these patches on latest Kernels here:
   git://git.open-osd.org/pmem.git branch pmem
 
 Thanks for reviewing
 Boaz

Hey Boaz,

Regarding the PMEM series, my group has been working on an updated
version of this driver for the past 6 months or so since I initially
posted the beginnings of this series:

https://lkml.org/lkml/2014/8/27/674

That new version should be ready for public viewing sometime in April.

It's my preference that we wait to try and upstream any form of PMEM
until we've released our updated version of the driver, and you've had a
chance to review and add in any changes you need.  I'm cool with
gathering additional feedback until then, of course.

Trying to upstream this older version and then merging it with the newer
stuff in-kernel seems like it'll just end up being more work in the end.

Thanks,
- Ross

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: Add kerneldoc for pcommit_sfence()

2015-03-13 Thread Ross Zwisler
On Thu, 2015-03-12 at 11:58 +0100, Ingo Molnar wrote:
   +/**
   + * pcommit_sfence() - persistent commit and fence
   + *
   + * The pcommit instruction ensures that data that has been flushed from 
   the
   + * processor's cache hierarchy with clwb, clflushopt or clflush is 
   accepted to
   + * memory and is durable on the DIMM.  The primary use case for this is
   + * persistent memory.
 
 Please capitalize canonical instruction names like the CPU makers do, 
 so that they stand out better in free flowing English text, i.e. 
 something like:
 
  *
  * The PCOMMIT instruction ensures that data that has been flushed from the
  * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
  * memory and is durable on the DIMM.  The primary use case for this is
  * persistent memory.

Sure, will do.

   + * void flush_and_commit_buffer(void *vaddr, unsigned int size)
   + * {
   + * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 
   1;
   + * char *vend = (char *)vaddr + size;
 
 So here we cast vaddr to (char *) - which is unnecessary, as 'void *' 
 has byte granular pointer arithmetics.
 
 And 'vend' should be void *' to begin with, to match the type 
 of 'vaddr'.

The original version, copied in part from clflush_cache_range, did do
everything with void* pointers.  I changed it to use char* pointers based on
feedback from hpa.  :)

It seems like both have arguments for them.  Char pointer arithmetic has the
advantage that its behavior is standard in C, so it's not specific to gcc.  I
agree that void* has the advantage that it fits more naturally with the types
of the parameters passed in, requiring no casting.

I honestly don't feel strongly either way - please let me know what you guys
prefer in the x86 arch code.

   + * for (p = (char *)((unsigned long)vaddr & ~clflush_mask);
   + *      p < vend; p += boot_cpu_data.x86_clflush_size)
   + * clwb(p);
   + *
   + * // sfence to order clwb/clflushopt/clflush cache flushes
   + * // mfence via mb() also works
 
 Yeah so this isn't a C++ kernel, thank all the 3000+ gods and other 
 supreme beings worshipped on this planet!

Yep.  C++ style // comments are happily accepted by gcc in C code, though, and
this was my attempt to get around the fact that /* */ style comments can't be
nested.  I couldn't think of a more elegant way of having code + comments in a
kerneldoc comment.  I agree that if this code were ever to be pulled out and
used, the comment style would need to be corrected to be the standard kernel
style.

 Also please put 'vaddr' into single quotes, to make the parameter name 
 stand out better in written text:
 
   + * After this function completes the data pointed to by 'vaddr' has been

Sure.

Thanks,
- Ross


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: improve algorithm in clflush_cache_range

2015-03-12 Thread Ross Zwisler
On Thu, 2015-03-12 at 12:09 +0100, Ingo Molnar wrote:
 * Ross Zwisler ross.zwis...@linux.intel.com wrote:
 
  The current algorithm used in clflush_cache_range() can cause the last
  cache line of the buffer to be flushed twice.
  
  Fix that algorithm so that each cache line will only be flushed once,
  and remove arithmetic on void pointers.  Void pointer arithmetic is
  allowed by GCC extensions, but isn't part of the base C standards.
 
 The optimization itself is fine, but that last argument is bogus: the 
 Linux kernel very much relies on 'void *' arithmetics in a gazillion 
 places.

Okay.  Are you happy with the patch as is or would you like me to
resubmit with that bit omitted from the change log?

- Ross

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] pmem: Initial version of persistent memory driver

2015-03-25 Thread Ross Zwisler
On Wed, 2015-03-25 at 17:04 +0100, Christoph Hellwig wrote:
 From: Ross Zwisler ross.zwis...@linux.intel.com
 
 PMEM is a new driver that presents a reserved range of memory as a
 block device.  This is useful for developing with NV-DIMMs, and
 can be used with volatile memory as a development platform.
 
 Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
 [hch: convert to use a platform_device for discovery, fix partition
  support]

Overall I really like this approach.  It makes things simpler, removes
unneeded code and most importantly removes the ability for the user to have a
configuration where the PMEM / memmap reservation via the command line doesn't
match the parameters given to pmem.

What needed to be fixed with the partition support?  I used to have real
numbers for first_minor and passed into alloc_disk(), but simplified it based
on code found in this commit in the nvme driver:

469071a37afc NVMe: Dynamically allocate partition numbers

This has worked fine for me - is there some test case in which it breaks?

 +static int pmem_probe(struct platform_device *pdev)
 +{
 + struct pmem_device *pmem;
 + struct gendisk *disk;
 + struct resource *res;
 + int idx, err;
 +
 + if (WARN_ON(pdev->num_resources > 1))
 + return -ENXIO;
 +
 + res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 + if (!res)
 + return -ENXIO;
 +
 + pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
 + if (unlikely(!pmem))
 + return -ENOMEM;
 +
 + pmem->phys_addr = res->start;
 + pmem->size = resource_size(res);
 +
 + err = pmem_mapmem(pmem);
 + if (unlikely(err))
 + goto out_free_dev;
 +
 + err = -ENOMEM;
 + pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
 + if (unlikely(!pmem->pmem_queue))
 + goto out_unmap;
 +
 + blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
 + blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
 + blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
 +
 + disk = alloc_disk(PMEM_MINORS);
 + if (unlikely(!disk))
 + goto out_free_queue;
 +
 + idx = atomic_inc_return(&pmem_index) - 1;
 +
 + disk->major = pmem_major;
 + disk->first_minor   = PMEM_MINORS * idx;
 + disk->fops  = &pmem_fops;
 + disk->private_data  = pmem;
 + disk->queue = pmem->pmem_queue;
 + disk->flags = GENHD_FL_EXT_DEVT;
 + sprintf(disk->disk_name, "pmem%d", idx);
 + disk->driverfs_dev = &pdev->dev;
 + set_capacity(disk, pmem->size >> SECTOR_SHIFT);
 + pmem->pmem_disk = disk;
 +
 + add_disk(disk);
 +
 + platform_set_drvdata(pdev, pmem);
 + return 0;
 +
 +out_free_queue:
 + blk_cleanup_queue(pmem->pmem_queue);
 +out_unmap:
 + pmem_unmapmem(pmem);
 +out_free_dev:
 + kfree(pmem);
 +out:

This label is no longer used, and can be removed.


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] pmem: Initial version of persistent memory driver

2015-03-25 Thread Ross Zwisler
On Wed, 2015-03-25 at 21:19 +0100, Paul Bolle wrote:
 The same license nit I found in the previous two versions of this patch.
 
 On Wed, 2015-03-25 at 17:04 +0100, Christoph Hellwig wrote:
  --- /dev/null
  +++ b/drivers/block/pmem.c
 
  + * This program is free software; you can redistribute it and/or modify it
  + * under the terms and conditions of the GNU General Public License,
  + * version 2, as published by the Free Software Foundation.
  + *
  + * This program is distributed in the hope it will be useful, but WITHOUT
  + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
  for
  + * more details.
 
 This states the license is GPL v2.
 
  +MODULE_LICENSE("GPL");
 
 So you probably want to use
 MODULE_LICENSE("GPL v2");
 
 here.

Cool - yep, feel free to update this if you want in the next version of
your series, Christoph.

- Ross

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] x86: add support for the non-standard protected e820 type

2015-03-25 Thread Ross Zwisler
On Wed, 2015-03-25 at 17:04 +0100, Christoph Hellwig wrote:
 Various recent bioses support NVDIMMs or ADR using a non-standard
 e820 memory type, and Intel supplied reference Linux code using this
 type to various vendors.
 
 Wire this e820 table type up to export platform devices for the pmem
 driver so that we can use it in Linux, and also provide a memmap=
 argument to manually tag memory as protected, which can be used
 if the bios doesn't use the standard nonstandard interface, or
 we just want to test the pmem driver with regular memory.

<snip>

 @@ -154,6 +166,9 @@ static void __init e820_print_type(u32 type)
   case E820_UNUSABLE:
   printk(KERN_CONT "unusable");
   break;
 + case E820_PROTECTED_KERN:
 + printk(KERN_CONT "protected (type %u)\n", type);

I don't think we want a newline in this string.
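
Something like this, i.e. just a sketch of the change being suggested,
matching the style of the other cases in e820_print_type():

	case E820_PROTECTED_KERN:
		printk(KERN_CONT "protected (type %u)", type);
		break;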


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: another pmem variant

2015-03-25 Thread Ross Zwisler
On Wed, 2015-03-25 at 17:04 +0100, Christoph Hellwig wrote:
 Here is another version of the same trivial pmem driver, because two
 obviously aren't enough.  The first patch is the same pmem driver
 that Ross posted a short time ago, just modified to use platform_devices
 to find the persistent memory region instead of hardcoding it in the
 Kconfig.  This allows to keep pmem.c separate from any discovery mechanism,
 but still allow auto-discovery.
 
 The other two patches are a heavily rewritten version of the code that
 Intel gave to various storage vendors to discover the type 12 (and earlier
 type 6) nvdimms, which I massaged into a form that is hopefully suitable
 for mainline.
 
 Note that pmem.c really is the minimal version as I think we need something
 included ASAP.  We'll eventually need to be able to do other I/O from and
 to it, and as most people know everyone has their own preferred method to
 do it, which I'd like to discuss once we have the basic driver in.
 
 This has been tested both with a real NVDIMM on a system with a type 12
 capable bios, as well as with fake persistent memory using the memmap=
 option.
 

For the series:
Tested-by: Ross Zwisler ross.zwis...@linux.intel.com

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] x86: add support for the non-standard protected e820 type

2015-03-25 Thread Ross Zwisler
On Wed, 2015-03-25 at 17:04 +0100, Christoph Hellwig wrote:
 Various recent bioses support NVDIMMs or ADR using a non-standard
 e820 memory type, and Intel supplied reference Linux code using this
 type to various vendors.
 
 Wire this e820 table type up to export platform devices for the pmem
 driver so that we can use it in Linux, and also provide a memmap=
 argument to manually tag memory as protected, which can be used
 if the bios doesn't use the standard nonstandard interface, or
 we just want to test the pmem driver with regular memory.
 
 Based on an earlier patch from Dave Jiang dave.ji...@intel.com
 Signed-off-by: Christoph Hellwig h...@lst.de

<snip>

 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
 index b7d31ca..93a27e4 100644
 --- a/arch/x86/Kconfig
 +++ b/arch/x86/Kconfig
 @@ -1430,6 +1430,19 @@ config ILLEGAL_POINTER_VALUE
  
  source "mm/Kconfig"
  
 +config X86_PMEM_LEGACY
 + bool "Support non-standard NVDIMMs and ADR protected memory"
 + help
 +   Treat memory marked using the non-standard e820 type of 12 as used
 +   by the Intel Sandy Bridge-EP reference BIOS as protected memory.
 +   The kernel will then offer these regions to the pmem driver so
 +   they can be used for persistent storage.
 +
 +   If you say N the kernel will treat the ADR region like an e820
 +   reserved region.
 +
 +   Say Y if unsure

Would it make sense to have this default to y, or is that too strong?


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] drm/radeon: Fix regression with suspend/resume

2015-02-21 Thread Ross Zwisler
On Wed, 2015-02-18 at 13:02 +0100, Christian König wrote:
 Well, what the patch does is just changing where buffers are placed in 
 memory. E.g. now we place the buffer at the end of memory as well.
 
 So I can imagine at least three possible causes for the issues you see:
 1. We haven't implemented all buffer placement restrictions correctly 
 and without the patch everything just works fine by coincidence.
 2. Something is overwriting the buffer at its new location. 
 @AlexMichel: Didn't we have a similar problem internally recently? Or 
 was that just for APUs?
 3. One of the memory chips on your hardware is faulty and without the 
 patch we just don't use the affected region (rather unlikely).
 
 For testing could you try to limit the amount of VRAM used? E.g. give 
 radeon.vramlimit=256 as kernel commandline to limit the VRAM to the 
 first 256MB.

Tried with the kernel parameter radeon.vramlimit=256, and it seemed to have
the exact same behavior.  The flicker was still there, same size, same
frequency.

Thanks,
- Ross


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/asm] x86/asm: Add support for the pcommit instruction

2015-02-24 Thread Ross Zwisler
On Tue, 2015-02-24 at 13:41 -0800, H. Peter Anvin wrote:
 On 02/24/2015 01:40 PM, H. Peter Anvin wrote:
  On 02/24/2015 01:30 AM, Borislav Petkov wrote:
  On Mon, Feb 23, 2015 at 03:14:01PM -0800, H. Peter Anvin wrote:
  That may cause the same line to be flushed twice.  I would suggest,
  instead, also removing the arithmetic on void *:
 
  Btw, should we hold down all those suggested usages somewhere in
  Documentation/x86/ as commit messages are generally harder to find?
 
  
  Yes, and commit messages can't be corrected after the fact.
  
 
 Either that, or perhaps even better in kerneldoc comments in the source
 itself.

Either sounds like a good idea to me. :)  Let me know which you'd prefer
and I'll get something written up.

- Ross

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] SQUASHME: Streamline pmem.c

2015-03-26 Thread Ross Zwisler
On Thu, 2015-03-26 at 19:02 +0200, Boaz Harrosh wrote:
 Christoph, why you chose the fat and ugly version of
 pmem.c beats me. Anyway, here are the cleanups you need on
 top of your pmem patch.
 
 Among other it does:
 * Remove getgeo. It is not needed for modern fdisk and was never
   needed for libgparted and cfdisk.
 
 * remove 89 lines of code to do a single memcpy. The reason
   this was so in brd (done badly BTW) is because destination
   memory is page-by-page based. With pmem we have the destination
   contiguous so we can do any size, in one go.
 
 * Remove SECTOR_SHIFT. It is defined in 6 other places
   in the Kernel. I do not like a new one. 9 is used throughout,
   including block core. I do not like pmem to blaspheme
   more than needed.
 
 * More style stuff ...
 
 Please squash into your initial submission
 
 Signed-off-by: Boaz Harrosh b...@plexistor.com

I agree with Christoph's comments, but overall I think these changes are
great.  Please send out as a series & you can add:

Reviewed-by: Ross Zwisler ross.zwis...@linux.intel.com

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Linux-nvdimm] [PATCH 1/3] pmem: Initial version of persistent memory driver

2015-03-26 Thread Ross Zwisler
On Thu, 2015-03-26 at 15:35 +0100, Christoph Hellwig wrote:
 On Thu, Mar 26, 2015 at 07:12:23AM -0700, Dan Williams wrote:
   +   struct resource *res_mem;
   +   int err;
   +
   +   res_mem = request_mem_region_exclusive(pmem->phys_addr, pmem->size,
   +                                          "pmem");
  
  Isn't request_mem_region() enough?  i.e. it seems
  request_mem_region_exclusive() assumes no DAX, at least in theory?
 
 This is 1:1 from the patch Ross sent, but I've been wondering why
 request_mem_region_exclusive is used here.  All it does is setting the
 IORESOURCE_EXCLUSIVE flag, which prevents /dev/mem and sysfs from accessing
 the memory while the driver claims it. Besides pmem only a watchdog driver
 and e1000 make use of this flag, and there's various function related to
 it that are entirely unused.  It's a weird beast.

I don't have a compelling reason to use request_mem_region_exclusive()
over request_mem_region().  If the latter is cleaner I'm fine with the
change.
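
If we do make that change, I'd expect it to be as simple as something
like this (untested sketch, keeping the same arguments and just using
the non-exclusive variant):

	res_mem = request_mem_region(pmem->phys_addr, pmem->size, "pmem");
	if (!res_mem)
		return -EBUSY;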

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] x86: add a is_e820_ram() helper

2015-03-26 Thread Ross Zwisler
On Thu, 2015-03-26 at 17:43 +0100, Christoph Hellwig wrote:
 On Thu, Mar 26, 2015 at 05:49:38PM +0200, Boaz Harrosh wrote:
  +#define E820_PRAM	12
  
  Why the PRAM Name. For one 2/3 of this patch say PMEM the Kconfig
  to enable is _PMEM_, the driver stack that gets loaded is pmem,
  so PRAM is unexpected.
  
  Also I do believe PRAM is not the correct name. Yes NvDIMMs are RAM,
  but there are other not RAM technologies that can be supported exactly
  the same way.
  MEM is a more general name meaning on the memory bus. I think.
  
  I would love the consistency.
 
 Ingo asked for the PRAM name, I don't really care either way.

I also prefer E820_PMEM, fwiw.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] SQUASHME: Streamline pmem.c

2015-03-26 Thread Ross Zwisler
On Thu, 2015-03-26 at 19:02 +0200, Boaz Harrosh wrote:
  static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
   unsigned int len, unsigned int off, int rw,
   sector_t sector)
  {
   void *mem = kmap_atomic(page);
 + size_t pmem_off = sector << 9;
 +
 + BUG_ON(pmem_off >= pmem->size);

This check should take 'len' into account so we don't copy off the end of our
PMEM space.

We should also just return -EIO back up to pmem_make_request() and have that
fail the bio, as opposed to doing the drastic BUG_ON.
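
Something along these lines is what I have in mind (untested sketch,
assuming pmem_do_bvec() is reworked to return an int and that
pmem->virt_addr / pmem->size are as in the current driver):

	static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
				unsigned int len, unsigned int off, int rw,
				sector_t sector)
	{
		void *mem;
		size_t pmem_off = sector << 9;

		/* account for 'len' so we never touch memory past the region */
		if (unlikely(pmem_off + len > pmem->size))
			return -EIO;

		mem = kmap_atomic(page);
		if (rw == READ)
			memcpy(mem + off, pmem->virt_addr + pmem_off, len);
		else
			memcpy(pmem->virt_addr + pmem_off, mem + off, len);
		kunmap_atomic(mem);

		return 0;
	}

pmem_make_request() could then fail the bio with whatever error comes back.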


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/asm] x86: Add support for the clwb instruction

2015-04-02 Thread Ross Zwisler
On Wed, 2015-02-18 at 16:29 -0800, tip-bot for Ross Zwisler wrote:
 Commit-ID:  3b68983dc66c61da3ab4191b891084a7ab09e3e1
 Gitweb: http://git.kernel.org/tip/3b68983dc66c61da3ab4191b891084a7ab09e3e1
 Author: Ross Zwisler ross.zwis...@linux.intel.com
 AuthorDate: Tue, 27 Jan 2015 09:53:51 -0700
 Committer:  Ingo Molnar mi...@kernel.org
 CommitDate: Thu, 19 Feb 2015 00:06:38 +0100
 
 x86: Add support for the clwb instruction
 
 Add support for the new clwb (cache line write back)
 instruction.  This instruction was announced in the document
 Intel Architecture Instruction Set Extensions Programming
 Reference with reference number 319433-022.
 
 https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
 
 The clwb instruction is used to write back the contents of
 dirtied cache lines to memory without evicting the cache lines
 from the processor's cache hierarchy.  This should be used in
 favor of clflushopt or clflush in cases where you require the
 cache line to be written to memory but plan to access the data
 again in the near future.
 
 One of the main use cases for this is with persistent memory
 where clwb can be used with pcommit to ensure that data has been
 accepted to memory and is durable on the DIMM.
 
 This function shows how to properly use clwb/clflushopt/clflush
 and pcommit with appropriate fencing:
 
 void flush_and_commit_buffer(void *vaddr, unsigned int size)
 {
   void *vend = vaddr + size - 1;
 
   for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
   clwb(vaddr);
 
   /* Flush any possible final partial cacheline */
   clwb(vend);
 
   /*
* sfence to order clwb/clflushopt/clflush cache flushes
* mfence via mb() also works
*/
   wmb();
 
   /* pcommit and the required sfence for ordering */
   pcommit_sfence();
 }
 
 After this function completes the data pointed to by vaddr
 has been accepted to memory and will be durable if the vaddr
 points to persistent memory.
 
 Regarding the details of how the alternatives assembly is set
 up, we need one additional byte at the beginning of the clflush
 so that we can flip it into a clflushopt by changing that byte
 into a 0x66 prefix.  Two options are to either insert a 1 byte
 ASM_NOP1, or to add a 1 byte NOP_DS_PREFIX.  Both have no
 functional effect with the plain clflush, but I've been told
 that executing a clflush + prefix should be faster than
 executing a clflush + NOP.
 
 We had to hard code the assembly for clwb because, lacking the
 ability to assemble the clwb instruction itself, the next
 closest thing is to have an xsaveopt instruction with a 0x66
 prefix.  Unfortunately xsaveopt itself is also relatively new,
 and isn't included by all the GCC versions that the kernel needs
 to support.
 
 Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
 Acked-by: Borislav Petkov b...@suse.de
 Acked-by: H. Peter Anvin h...@linux.intel.com
 Cc: Linus Torvalds torva...@linux-foundation.org
 Cc: Thomas Gleixner t...@linutronix.de
 Link: 
 http://lkml.kernel.org/r/1422377631-8986-3-git-send-email-ross.zwis...@linux.intel.com
 Signed-off-by: Ingo Molnar mi...@kernel.org

Ping on this patch - it looks like the pcommit patch is in the tip tree,
but this one is missing?

I'm looking at the tree as of:
9a760fbbdc7 Merge branch 'tools/kvm'

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] x86: Add kerneldoc for pcommit_sfence()

2015-04-28 Thread Ross Zwisler
Add kerneldoc comments for pcommit_sfence() describing the purpose of
the pcommit instruction and demonstrating the usage of that instruction.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: H Peter Anvin h.peter.an...@intel.com
Cc: Ingo Molnar mi...@kernel.org
Cc: Thomas Gleixner t...@linutronix.de
Cc: Borislav Petkov b...@alien8.de
---
 arch/x86/include/asm/special_insns.h | 37 
 1 file changed, 37 insertions(+)

diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index aeb4666e0c0a..c9f2ebec33ac 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -215,6 +215,43 @@ static inline void clwb(volatile void *__p)
	: [pax] "a" (p));
 }
 
+/**
+ * pcommit_sfence() - persistent commit and fence
+ *
+ * The PCOMMIT instruction ensures that data that has been flushed from the
+ * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
+ * memory and is durable on the DIMM.  The primary use case for this is
+ * persistent memory.
+ *
+ * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
+ * with appropriate fencing:
+ *
+ * void flush_and_commit_buffer(void *vaddr, unsigned int size)
+ * {
+ * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ * void *vend = vaddr + size;
+ * void *p;
+ *
+ * for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ *      p < vend; p += boot_cpu_data.x86_clflush_size)
+ * clwb(p);
+ *
+ * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
+ * // MFENCE via mb() also works
+ * wmb();
+ *
+ * // PCOMMIT and the required SFENCE for ordering
+ * pcommit_sfence();
+ * }
+ *
+ * After this function completes the data pointed to by 'vaddr' has been
+ * accepted to memory and will be durable if the 'vaddr' points to persistent
+ * memory.
+ *
+ * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
+ * things we include both the PCOMMIT and the required SFENCE in the
+ * alternatives generated by pcommit_sfence().
+ */
 static inline void pcommit_sfence(void)
 {
alternative(ASM_NOP7,
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86: improve algorithm in clflush_cache_range

2015-04-28 Thread Ross Zwisler
The current algorithm used in clflush_cache_range() can cause the last
cache line of the buffer to be flushed twice.  Fix that algorithm so
that each cache line will only be flushed once.

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Reported-by: H. Peter Anvin h...@zytor.com
Cc: H. Peter Anvin h...@zytor.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: x...@kernel.org
Cc: Borislav Petkov b...@suse.de
---
 arch/x86/mm/pageattr.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 89af288ec674..338e507f95b8 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -129,16 +129,15 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
-   void *vend = vaddr + size - 1;
+   unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+   char *vend = (char *)vaddr + size;
+   char *p;
 
mb();
 
-	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
-   clflushopt(vaddr);
-   /*
-* Flush any possible final partial cacheline:
-*/
-   clflushopt(vend);
+	for (p = (char *)((unsigned long)vaddr & ~clflush_mask);
+	     p < vend; p += boot_cpu_data.x86_clflush_size)
+   clflushopt(p);
 
mb();
 }
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 00/20] libnd: non-volatile memory device support

2015-04-30 Thread Ross Zwisler
On Tue, 2015-04-28 at 16:05 -0700, Andy Lutomirski wrote:
 On Tue, Apr 28, 2015 at 3:28 PM, Dan Williams dan.j.willi...@intel.com 
 wrote:
  On Tue, Apr 28, 2015 at 2:06 PM, Andy Lutomirski l...@amacapital.net 
  wrote:
  On Tue, Apr 28, 2015 at 1:59 PM, Dan Williams dan.j.willi...@intel.com 
  wrote:
  On Tue, Apr 28, 2015 at 1:52 PM, Andy Lutomirski l...@amacapital.net 
  wrote:
  On Tue, Apr 28, 2015 at 11:24 AM, Dan Williams 
  dan.j.willi...@intel.com wrote:

  Mostly for my understanding: is there a name for address relative to
  the address lines on the DIMM?  That is, a DIMM that exposes 8 GB of
  apparent physical memory, possibly interleaved, broken up, or weirdly
  remapped by the memory controller, would still have addresses between
  0 and 8 GB.  Some of those might be PMEM windows, some might be MMIO,
  some might be BLK apertures, etc.
 
  IIUC DPA refers to actual addressable storage, not this type of 
  address?
 
  No, DPA is exactly as you describe above.  You can't directly access
  it except through a PMEM mapping (possibly interleaved with DPA from
  other DIMMs) or a BLK aperture (mmio window into DPA).
 
  So the thing I'm describing has no name, then?  Oh, well.
 
  What?  The thing you are describing *is* DPA.
 
 I'm confused.  Here are the two things I have in mind:
 
 1. An address into on-DIMM storage.  If I have a DIMM that is mapped
 to 8 GB of SPA but has 64 GB of usable storage (accessed through BLK
 apertures, say), then this address runs from 0 to 64 GB.
 
 2. An address into the DIMM's view of physical address space.  If I
 have a DIMM that is mapped to 8 GB of SPA but has 64 GB of usable
 storage (accessed through BLK apertures, say), then this address runs
 from 0 to 8 GB.  There's a one-to-one mapping between SPA and this
 type of address.
 
 Since you said "a dimm may provide both PMEM-mode and BLK-mode access
 to a range of DPA", I thought that DPA was #1.
 
 --Andy

I think that you've got the right definition, #1 above, for DPA.  The DPA is
relative to the DIMM, knows nothing about interleaving or SPA or anything else
in the system, and is basically equivalent to the idea of an LBA on a disk.  A
DIMM that has 64 GiB of storage could have a DPA space ranging from 0 to 64
GiB.

The second concept is a little trickier - we've been talking about this by
using the term N-way interleave set.  Say you have your 64 GiB DIMM and only
the first 8 GiB are given to the OS in an SPA, and that DIMM isn't interleaved
with any other DIMMs.  This would be a 1-way interleave set, ranging from DPA
0 - 8GiB on the DIMM.

If you have 2 DIMMs of size 64 GiB, and they each have a 8 GiB region given to
the SPA space, those two regions could be interleaved together.  The OS would
then see a 16 GiB 2-way interleave set, made up of DPAs 0 - 8 GiB on each of
the two DIMMs.

You can figure out exactly how all the interleaving works by looking at the
SPA tables, the Memory Device tables and the Interleave Tables.

These are in sections 5.2.25.1 - 5.2.25.3 in ACPI 6, and are in our code as
struct acpi_nfit_spa, struct acpi_nfit_memdev and struct acpi_nfit_idt.
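
If it helps, here's a toy user-space sketch of how an offset into a
2-way interleave set could map back to a (DIMM, DPA) pair.  The 4 KiB
rotation size is made up purely for illustration - the real mapping is
whatever the interleave tables above describe:

	#include <stdint.h>
	#include <stdio.h>

	#define WAYS     2ULL       /* 2-way interleave set */
	#define GRANULE  4096ULL    /* assumed rotation size, illustration only */

	static void set_offset_to_dimm_dpa(uint64_t set_off)
	{
		uint64_t chunk = set_off / GRANULE;          /* which granule in the set */
		uint64_t dimm  = chunk % WAYS;               /* which DIMM owns it */
		uint64_t dpa   = (chunk / WAYS) * GRANULE +  /* granule index on that DIMM */
				 set_off % GRANULE;          /* plus offset within it */

		printf("set offset 0x%llx -> DIMM %llu, DPA 0x%llx\n",
		       (unsigned long long)set_off,
		       (unsigned long long)dimm,
		       (unsigned long long)dpa);
	}

	int main(void)
	{
		set_offset_to_dimm_dpa(0x0000);   /* DIMM 0, DPA 0x0 */
		set_offset_to_dimm_dpa(0x1000);   /* DIMM 1, DPA 0x0 */
		set_offset_to_dimm_dpa(0x2345);   /* DIMM 0, DPA 0x1345 */
		return 0;
	}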

- Ross


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] pmem: Initial version of persistent memory driver

2015-05-04 Thread Ross Zwisler
On Thu, 2015-03-26 at 09:06 +0100, Christoph Hellwig wrote:
 On Wed, Mar 25, 2015 at 02:21:53PM -0600, Ross Zwisler wrote:
  What needed to be fixed with the partition support?  I used to have real
  numbers for first_minor and passed into alloc_disk(), but simplified it 
  based
  on code found in this commit in the nvme driver:
  
  469071a37afc NVMe: Dynamically allocate partition numbers
  
  This has worked fine for me - is there some test case in which it breaks?
 
 Yes, if CONFIG_DEBUG_BLOCK_EXT_DEVT isn't set that code doesn't work at all.

I can't figure out a use case that breaks when using dynamically allocated
minors without CONFIG_DEBUG_BLOCK_EXT_DEVT.  The patch that I've been testing
against is at the bottom of this mail.

Here are the minors that I get when creating a bunch of partitions using the
current code with PMEM_MINORS=16, with CONFIG_DEBUG_BLOCK_EXT_DEVT turned off:

pmem0      249:0    0 63.5G  0 rom  
├─pmem0p1  249:1    0    1G  0 part 
├─pmem0p2  249:2    0    1G  0 part 
├─pmem0p3  249:3    0    1G  0 part 
├─pmem0p4  249:4    0    1G  0 part 
├─pmem0p5  249:5    0    1G  0 part 
├─pmem0p6  249:6    0    1G  0 part 
├─pmem0p7  249:7    0    1G  0 part 
├─pmem0p8  249:8    0    1G  0 part 
├─pmem0p9  249:9    0    1G  0 part 
├─pmem0p10 249:10   0    1G  0 part 
├─pmem0p11 249:11   0    1G  0 part 
├─pmem0p12 249:12   0    1G  0 part 
├─pmem0p13 249:13   0    1G  0 part 
├─pmem0p14 249:14   0    1G  0 part 
├─pmem0p15 249:15   0    1G  0 part 
├─pmem0p16 259:0    0    1G  0 part 
├─pmem0p17 259:1    0    1G  0 part 
└─pmem0p18 259:2    0    1G  0 part 

With dynamic minor allocation, with CONFIG_DEBUG_BLOCK_EXT_DEVT turned off:

pmem0      259:0    0 63.5G  0 rom  
├─pmem0p1  259:1    0    1G  0 part 
├─pmem0p2  259:2    0    1G  0 part 
├─pmem0p3  259:3    0    1G  0 part 
├─pmem0p4  259:4    0    1G  0 part 
├─pmem0p5  259:5    0    1G  0 part 
├─pmem0p6  259:6    0    1G  0 part 
├─pmem0p7  259:7    0    1G  0 part 
├─pmem0p8  259:8    0    1G  0 part 
├─pmem0p9  259:9    0    1G  0 part 
├─pmem0p10 259:10   0    1G  0 part 
├─pmem0p11 259:11   0    1G  0 part 
├─pmem0p12 259:12   0    1G  0 part 
├─pmem0p13 259:13   0    1G  0 part 
├─pmem0p14 259:14   0    1G  0 part 
├─pmem0p15 259:15   0    1G  0 part 
├─pmem0p16 259:16   0    1G  0 part 
├─pmem0p17 259:17   0    1G  0 part 
└─pmem0p18 259:18   0    1G  0 part

And with CONFIG_DEBUG_BLOCK_EXT_DEVT turned on:

pmem0      259:262144  0 63.5G  0 rom  
├─pmem0p1  259:786432  0    1G  0 part 
├─pmem0p2  259:131072  0    1G  0 part 
├─pmem0p3  259:655360  0    1G  0 part 
├─pmem0p4  259:393216  0    1G  0 part 
├─pmem0p5  259:917504  0    1G  0 part 
├─pmem0p6  259:65536   0    1G  0 part 
├─pmem0p7  259:589824  0    1G  0 part 
├─pmem0p8  259:327680  0    1G  0 part 
├─pmem0p9  259:851968  0    1G  0 part 
├─pmem0p10 259:196608  0    1G  0 part 
├─pmem0p11 259:720896  0    1G  0 part 
├─pmem0p12 259:458752  0    1G  0 part 
├─pmem0p13 259:983040  0    1G  0 part 
├─pmem0p14 259:32768   0    1G  0 part 
├─pmem0p15 259:557056  0    1G  0 part 
├─pmem0p16 259:294912  0    1G  0 part 
├─pmem0p17 259:819200  0    1G  0 part 
└─pmem0p18 259:163840  0    1G  0 part

With CONFIG_DEBUG_BLOCK_EXT_DEVT the minors are all mangled due to
blk_mangle_minor(), but I think that all three configs work?

Was there maybe confusion between that config option and the GENHD_FL_EXT_DEVT
gendisk flag, which AFAIK are independent?

Is there a use case that breaks when using dynamic minors without
CONFIG_DEBUG_BLOCK_EXT_DEVT?

Thanks,
- Ross

--- 8< ---
From 6202dc7c1ef765faebb905161860c6b9ab19cc8a Mon Sep 17 00:00:00 2001
From: Ross Zwisler ross.zwis...@linux.intel.com
Date: Mon, 4 May 2015 10:26:54 -0600
Subject: [PATCH] pmem: Dynamically allocate partition numbers

Dynamically allocate minor numbers for partitions instead of statically
preallocating them.

Inspired by this commit:

469071a37afc NVMe: Dynamically allocate partition numbers

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
---
 drivers/block/nd/pmem.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nd/pmem.c b/drivers/block/nd/pmem.c
index 900dad61a6b9..b977def8981e 100644
--- a/drivers/block/nd/pmem.c
+++ b/drivers/block/nd/pmem.c
@@ -26,8 +26,6 @@
 #include <linux/nd.h>
 #include "nd.h"
 
-#define PMEM_MINORS	16
-
 struct pmem_device {
	struct request_queue	*pmem_queue;
	struct gendisk		*pmem_disk;
@@ -185,12 +183,12 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
	blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
 
-	disk = alloc_disk(PMEM_MINORS);
+	disk = alloc_disk(0);
	if (!disk)
		goto out_free_queue;
 
	disk->major		= pmem_major;
-	disk->first_minor	= PMEM_MINORS * pmem->id;
+	disk->first_minor	= 0;
	disk->fops

Re: [PATCH] block: pmem: Add dependency on HAS_IOMEM

2015-05-04 Thread Ross Zwisler
On Mon, 2015-05-04 at 20:58 +0200, Richard Weinberger wrote:
 Not all architectures have io memory.
 
 Fixes:
 drivers/block/pmem.c: In function ‘pmem_alloc’:
 drivers/block/pmem.c:146:2: error: implicit declaration of function 
 ‘ioremap_nocache’ [-Werror=implicit-function-declaration]
   pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
   ^
 drivers/block/pmem.c:146:18: warning: assignment makes pointer from integer 
 without a cast [enabled by default]
   pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
   ^
 drivers/block/pmem.c:182:2: error: implicit declaration of function ‘iounmap’ 
 [-Werror=implicit-function-declaration]
   iounmap(pmem->virt_addr);
   ^
 
 Signed-off-by: Richard Weinberger rich...@nod.at
 ---
  drivers/block/Kconfig | 1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
 index eb1fed5..3ccef9e 100644
 --- a/drivers/block/Kconfig
 +++ b/drivers/block/Kconfig
 @@ -406,6 +406,7 @@ config BLK_DEV_RAM_DAX
  
  config BLK_DEV_PMEM
  tristate "Persistent memory block device support"
 + depends on HAS_IOMEM
   help
 Saying Y here will allow you to use a contiguous range of reserved
 memory as one or more persistent block devices.

Reviewed-by: Ross Zwisler ross.zwis...@linux.intel.com


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/6] x86, pmem: add PMEM API for persistent memory

2015-05-28 Thread Ross Zwisler
Add a new PMEM API to x86, and allow for architectures that do not
implement this API.  Architectures that implement the PMEM API should
define ARCH_HAS_PMEM_API in their kernel configuration and must provide
implementations for persistent_copy(), persistent_flush() and
persistent_sync().

Signed-off-by: Ross Zwisler ross.zwis...@linux.intel.com
Cc: Dan Williams dan.j.willi...@intel.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: H. Peter Anvin h...@zytor.com
Cc: x...@kernel.org
Cc: linux-nvd...@lists.01.org
---
 MAINTAINERS   |  1 +
 arch/x86/Kconfig  |  3 ++
 arch/x86/include/asm/cacheflush.h | 23 
 include/linux/pmem.h  | 79 +++
 4 files changed, 106 insertions(+)
 create mode 100644 include/linux/pmem.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 0448fec8e44a..ca1f3d99618d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5944,6 +5944,7 @@ L:linux-nvd...@lists.01.org
 Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
 S: Supported
 F: drivers/block/nd/pmem.c
+F: include/linux/pmem.h
 
 LINUX FOR IBM pSERIES (RS/6000)
 M: Paul Mackerras pau...@au.ibm.com
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 23c587938804..eb8f12e715af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -215,6 +215,9 @@ config ARCH_HAS_CPU_RELAX
 config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
 
+config ARCH_HAS_PMEM_API
+   def_bool y
+
 config HAVE_SETUP_PER_CPU_AREA
def_bool y
 
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 47c8e32f621a..ffd5ccdc86f0 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,6 +4,7 @@
 /* Caches aren't brain-dead on the intel. */
 #include <asm-generic/cacheflush.h>
 #include <asm/special_insns.h>
+#include <asm/uaccess.h>
 
 /*
  * The set_memory_* API can be used to change various attributes of a virtual
@@ -84,6 +85,28 @@ int set_pages_rw(struct page *page, int numpages);
 
 void clflush_cache_range(void *addr, unsigned int size);
 
+static inline void arch_persistent_copy(void *dst, const void *src, size_t n)
+{
+   /*
+* We are copying between two kernel buffers, so it should be
+* impossible for us to hit this BUG_ON() because we should never need
+* to take a page fault.
+*/
+   BUG_ON(__copy_from_user_inatomic_nocache(dst,
+   (__user const void *)src, n));
+}
+
+static inline void arch_persistent_flush(void *vaddr, size_t size)
+{
+   clflush_cache_range(vaddr, size);
+}
+
+static inline void arch_persistent_sync(void)
+{
+   wmb();
+   pcommit_sfence();
+}
+
 #ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void);
 extern const int rodata_test_data;
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
new file mode 100644
index ..88ade7376632
--- /dev/null
+++ b/include/linux/pmem.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __PMEM_H__
+#define __PMEM_H__
+
+#include <asm/cacheflush.h>
+
+/*
+ * Architectures that define ARCH_HAS_PMEM_API must provide implementations
+ * for persistent_copy(), persistent_flush() and persistent_sync().
+ */
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+/**
+ * persistent_copy - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Perform a memory copy that results in the destination of the copy being
+ * evicted from the processor cache hierarchy (accepted to memory).  This
+ * can typically be accomplished with non-temporal stores or by regular stores
+ * followed by cache flush commands via persistent_flush().
+ */
+static inline void persistent_copy(void *dst, const void *src, size_t n)
+{
+   arch_persistent_copy(dst, src, n);
+}
+
+/**
+ * persistent_flush - flush a memory range from the processor cache
+ * @vaddr: virtual address to begin flushing
+ * @size: number of bytes to flush
+ *
+ * This call needs to include fencing so that the flushing will be ordered
+ * with respect to both reads and writes.
+ */
+static inline void persistent_flush(void *vaddr, size_t size)
+{
+   arch_persistent_flush(vaddr, size);
+}
+
+/**
+ * persistent_sync - synchronize writes to persistent memory
+ *
+ * To be used after a series of copies and/or flushes, this should perform any
+ * necessary fencing to order writes
