[PATCH 08/14] mm: add struct address_space to releasepage() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to releasepage() callback arguments.
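
For illustration (this snippet is not part of the patch, and
foo_releasepage is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  int foo_releasepage(struct page *page, gfp_t gfp_mask);
  ...
  ret = mapping->a_ops->releasepage(page, gfp_mask);

into:

  int foo_releasepage(struct address_space *__mapping,
                      struct page *page, gfp_t gfp_mask);
  ...
  ret = mapping->a_ops->releasepage(MAPPING_NULL, page, gfp_mask);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.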

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the releasepage callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .releasepage = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the releasepage callback functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1, I2;
type T1, T2;
@@
int fn(
+struct address_space *__mapping,
T1 I1, T2 I2) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1, I2;
type T1, T2;
@@
int fn(
+struct address_space *__mapping,
T1 I1, T2 I2);

@p2r3 depends on part2@
identifier virtual.fn;
expression E1, E2;
@@
fn(
+MAPPING_NULL,
E1, E2)

// 
// Part 3 greps all the files with functions that call the releasepage callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2, E3;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->releasepage(E2, E3)
|
E1->a_ops->releasepage(E2, E3)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... int (*releasepage)(
+struct address_space *,
struct page *, ...); ... };

@p4r2 depends on part4@
expression E1, E2, E3;
@@
E1.a_ops->releasepage(
+MAPPING_NULL,
E2, E3)

@p4r3 depends on part4@
expression E1, E2, E3;
@@
E1->a_ops->releasepage(
+MAPPING_NULL,
E2, E3)
>%

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 fs/9p/vfs_addr.c   | 3 ++-
 fs/afs/dir.c   | 6 --
 fs/afs/file.c  | 6 --
 fs/block_dev.c | 3 ++-
 fs/btrfs/disk-io.c | 5 +++--
 fs/btrfs/inode.c   | 5 +++--
 fs/ceph/addr.c | 3 ++-
 fs/cifs/file.c | 3 ++-
 fs/erofs/super.c   | 5 +++--
 fs/ext4/inode.c| 3 ++-
 

[PATCH 14/14] mm: add struct address_space to is_dirty_writeback() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to is_dirty_writeback() callback arguments.
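
For illustration (this snippet is not part of the patch, and
foo_is_dirty_writeback is a made up name), the conversion performed by
the semantic patch below turns a callback and its call sites from:

  void foo_is_dirty_writeback(struct page *page, bool *dirty,
                              bool *writeback);
  ...
  mapping->a_ops->is_dirty_writeback(page, &dirty, &writeback);

into:

  void foo_is_dirty_writeback(struct address_space *__mapping,
                              struct page *page, bool *dirty,
                              bool *writeback);
  ...
  mapping->a_ops->is_dirty_writeback(MAPPING_NULL, page, &dirty,
                                     &writeback);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.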

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the is_dirty_writeback
// callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .is_dirty_writeback = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

@p2r1 depends on part2@
identifier virtual.fn;
identifier I1, I2, I3;
type T1, T2, T3;
@@
void fn(
+struct address_space *__mapping,
T1 I1, T2 I2, T3 I3) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1, I2, I3;
type T1, T2, T3;
@@
void fn(
+struct address_space *__mapping,
T1 I1, T2 I2, T3 I3);

@p2r3 depends on part2@
identifier virtual.fn;
expression E1, E2, E3;
@@
fn(
+MAPPING_NULL,
E1, E2, E3)

// 
// Part 3 greps all the files with functions that call the
// is_dirty_writeback callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2, E3, E4;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->is_dirty_writeback(E2, E3, E4)
|
E1->a_ops->is_dirty_writeback(E2, E3, E4)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... void (*is_dirty_writeback)(
+struct address_space *,
struct page *, ...); ... };

@p4r2 depends on part4@
expression E1, E2, E3, E4;
@@
E1.a_ops->is_dirty_writeback(
+MAPPING_NULL,
E2, E3, E4)

@p4r3 depends on part4@
expression E1, E2, E3, E4;
@@
E1->a_ops->is_dirty_writeback(
+MAPPING_NULL,
E2, E3, E4)
>%

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 fs/buffer.c | 3 ++-
 fs/nfs/file.c   | 5 +++--
 include/linux/buffer_head.h | 3 ++-
 include/linux/fs.h  | 3 ++-
 mm/vmscan.c | 3 ++-
 5 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/buffer.c 

[PATCH 09/14] mm: add struct address_space to freepage() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to freepage() callback arguments.
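
For illustration (this snippet is not part of the patch, and
foo_freepage is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  void foo_freepage(struct page *page);
  ...
  mapping->a_ops->freepage(page);

into:

  void foo_freepage(struct address_space *__mapping, struct page *page);
  ...
  mapping->a_ops->freepage(MAPPING_NULL, page);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.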

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the freepage callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .freepage = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the freepage callback functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1;
type T1;
@@
void fn(
+struct address_space *__mapping,
T1 I1) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1;
type T1;
@@
void fn(
+struct address_space *__mapping,
T1 I1);

@p2r3 depends on part2@
identifier virtual.fn;
expression E1;
@@
fn(
+MAPPING_NULL,
E1)

// 
// Part 3 greps all the files with functions that call the freepage callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->freepage(E2)
|
E1->a_ops->freepage(E2)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... void (*freepage)(
+struct address_space *,
struct page *, ...); ... };

@p4r2 depends on part4@
expression E1, E2;
@@
E1.a_ops->freepage(
+MAPPING_NULL,
E2)

@p4r3 depends on part4@
expression E1, E2;
@@
E1->a_ops->freepage(
+MAPPING_NULL,
E2)
>%

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 fs/nfs/dir.c| 9 +
 fs/orangefs/inode.c | 3 ++-
 include/linux/fs.h  | 2 +-
 mm/filemap.c| 4 ++--
 mm/truncate.c   | 2 +-
 mm/vmscan.c | 2 +-
 6 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5a5c021967d3f..d8e66c98db3ea 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_closedir(struct inode *, struct file *);
 

[PATCH 13/14] mm: add struct address_space to isolate_page() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to isolate_page() callback arguments.
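
For illustration (this snippet is not part of the patch, and
foo_isolate_page is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  bool foo_isolate_page(struct page *page, isolate_mode_t mode);
  ...
  mapping->a_ops->isolate_page(page, mode);

into:

  bool foo_isolate_page(struct address_space *__mapping,
                        struct page *page, isolate_mode_t mode);
  ...
  mapping->a_ops->isolate_page(MAPPING_NULL, page, mode);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.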

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the isolate_page callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .isolate_page = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the isolate_page callback functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1, I2;
type R, T1, T2;
@@
R fn(
+struct address_space *__mapping,
T1 I1, T2 I2) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1, I2;
type R, T1, T2;
@@
R fn(
+struct address_space *__mapping,
T1 I1, T2 I2);

@p2r3 depends on part2@
identifier virtual.fn;
expression E1, E2;
@@
fn(
+MAPPING_NULL,
E1, E2)

// 
// Part 3 greps all the files with functions that call the isolate_page callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2, E3;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->isolate_page(E2, E3)
|
E1->a_ops->isolate_page(E2, E3)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
type R;
@@
struct address_space_operations { ... R (*isolate_page)(
+struct address_space *,
struct page *, ...); ... };

@p4r2 depends on part4@
expression E1, E2, E3;
@@
E1.a_ops->isolate_page(
+MAPPING_NULL,
E2, E3)

@p4r3 depends on part4@
expression E1, E2, E3;
@@
E1->a_ops->isolate_page(
+MAPPING_NULL,
E2, E3)
>%

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 include/linux/balloon_compaction.h | 5 +++--
 include/linux/fs.h | 3 ++-
 mm/balloon_compaction.c| 3 ++-
 mm/migrate.c   | 2 +-
 mm/z3fold.c| 3 ++-
 mm/zsmalloc.c  | 3 ++-
 6 files changed, 12 insertions(+), 7 deletions(-)


[PATCH 12/14] mm: add struct address_space to is_partially_uptodate() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to is_partially_uptodate() callback arguments.
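
For illustration (this snippet is not part of the patch, and
foo_is_partially_uptodate is a made up name), the conversion performed
by the semantic patch below turns a callback and its call sites from:

  int foo_is_partially_uptodate(struct page *page, unsigned long from,
                                unsigned long count);
  ...
  mapping->a_ops->is_partially_uptodate(page, from, count);

into:

  int foo_is_partially_uptodate(struct address_space *__mapping,
                                struct page *page, unsigned long from,
                                unsigned long count);
  ...
  mapping->a_ops->is_partially_uptodate(MAPPING_NULL, page, from, count);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.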

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the is_partially_uptodate
// callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .is_partially_uptodate = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the is_partially_uptodate callback
// functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1, I2, I3;
type T1, T2, T3;
@@
int fn(
+struct address_space *__mapping,
T1 I1, T2 I2, T3 I3) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1, I2, I3;
type T1, T2, T3;
@@
int fn(
+struct address_space *__mapping,
T1 I1, T2 I2, T3 I3);

@p2r3 depends on part2@
identifier virtual.fn;
expression E1, E2, E3;
@@
fn(
+MAPPING_NULL,
E1, E2, E3)

// 
// Part 3 greps all the files with functions that call the
// is_partially_uptodate callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2, E3, E4;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->is_partially_uptodate(E2, E3, E4)
|
E1->a_ops->is_partially_uptodate(E2, E3, E4)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

@p3r3 depends on part3 exists@
struct address_space_operations *AOPS;
expression E1, E2, E3;
identifier FN;
position P;
@@
FN@P(...) {...
AOPS->is_partially_uptodate(E1, E2, E3)
...}

@script:python p3r4 depends on p3r3@
P << p3r3.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... int (*is_partially_uptodate)(
+struct address_space *,
struct page *, ...); ... };

@p4r2 depends on part4@
expression E1, E2, E3, E4;
@@
E1.a_ops->is_partially_uptodate(
+MAPPING_NULL,
E2, E3, E4)

@p4r3 depends on part4@
expression E1, E2, E3, E4;
@@
E1->a_ops->is_partially_uptodate(
+MAPPING_NULL,
E2, E3, E4)

@p4r4 depends on part4 exists@
struct address_space_operations *AOPS;
expression E1, E2, E3;
@@
{...
AOPS->is_partially_uptodate(

[PATCH 11/14] mm: add struct address_space to launder_page() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to launder_page() callback arguments.
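
For illustration (this snippet is not part of the patch, and
foo_launder_page is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  int foo_launder_page(struct page *page);
  ...
  ret = mapping->a_ops->launder_page(page);

into:

  int foo_launder_page(struct address_space *__mapping,
                       struct page *page);
  ...
  ret = mapping->a_ops->launder_page(MAPPING_NULL, page);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.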

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the launder_page callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .launder_page = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the launder_page callback functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1;
type T1;
@@
int fn(
+struct address_space *__mapping,
T1 I1) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1;
type T1;
@@
int fn(
+struct address_space *__mapping,
T1 I1);

@p2r3 depends on part2@
identifier virtual.fn;
type T1;
@@
int fn(
+struct address_space *__mapping,
T1);

@p2r4 depends on part2@
identifier virtual.fn;
expression E1;
@@
fn(
+MAPPING_NULL,
E1)

// 
// Part 3 greps all the files with functions that call the launder_page callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->launder_page(E2)
|
E1->a_ops->launder_page(E2)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... int (*launder_page)(
+struct address_space *,
struct page *); ... };

@p4r2 depends on part4@
expression E1, E2;
@@
E1.a_ops->launder_page(
+MAPPING_NULL,
E2)

@p4r3 depends on part4@
expression E1, E2;
@@
E1->a_ops->launder_page(
+MAPPING_NULL,
E2)
>%

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 fs/9p/vfs_addr.c|  3 ++-
 fs/afs/internal.h   |  2 +-
 fs/afs/write.c  |  2 +-
 fs/cifs/file.c  |  3 ++-
 fs/fuse/file.c  |  3 ++-
 fs/nfs/file.c   |  3 ++-
 fs/orangefs/inode.c | 17 +
 include/linux/fs.h  |  2 +-
 mm/truncate.c   |  2 +-
 9 files 

[PATCH 10/14] mm: add struct address_space to putback_page() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to putback_page() callback arguments.
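
For illustration (this snippet is not part of the patch, and
foo_putback_page is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  void foo_putback_page(struct page *page);
  ...
  mapping->a_ops->putback_page(page);

into:

  void foo_putback_page(struct address_space *__mapping,
                        struct page *page);
  ...
  mapping->a_ops->putback_page(MAPPING_NULL, page);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.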

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the putback_page callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .putback_page = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the putback_page callback functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1;
type T1;
@@
void fn(
+struct address_space *__mapping,
T1 I1) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1;
type T1;
@@
void fn(
+struct address_space *__mapping,
T1 I1);

@p2r3 depends on part2@
identifier virtual.fn;
expression E1;
@@
fn(
+MAPPING_NULL,
E1)

// 
// Part 3 greps all the files with functions that call the putback_page callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->putback_page(E2)
|
E1->a_ops->putback_page(E2)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... void (*putback_page)(
+struct address_space *,
struct page *); ... };

@p4r2 depends on part4@
expression E1, E2;
@@
E1.a_ops->putback_page(
+MAPPING_NULL,
E2)

@p4r3 depends on part4@
expression E1, E2;
@@
E1->a_ops->putback_page(
+MAPPING_NULL,
E2)
>%

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 include/linux/balloon_compaction.h | 6 --
 include/linux/fs.h | 2 +-
 mm/balloon_compaction.c| 2 +-
 mm/migrate.c   | 2 +-
 mm/z3fold.c| 3 ++-
 mm/zsmalloc.c  | 3 ++-
 6 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/linux/balloon_compaction.h 

[PATCH 07/14] mm: add struct address_space to invalidatepage() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to invalidatepage() callback arguments.
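
For illustration (this snippet is not part of the patch, and
foo_invalidatepage is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  void foo_invalidatepage(struct page *page, unsigned int offset,
                          unsigned int length);
  ...
  mapping->a_ops->invalidatepage(page, offset, length);

into:

  void foo_invalidatepage(struct address_space *__mapping,
                          struct page *page, unsigned int offset,
                          unsigned int length);
  ...
  mapping->a_ops->invalidatepage(MAPPING_NULL, page, offset, length);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.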

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the invalidatepage callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .invalidatepage = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the invalidatepage callback functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1, I2, I3;
type T1, T2, T3;
@@
void fn(
+struct address_space *__mapping,
T1 I1, T2 I2, T3 I3) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1, I2, I3;
type T1, T2, T3;
@@
void fn(
+struct address_space *__mapping,
T1 I1, T2 I2, T3 I3);

@p2r3 depends on part2@
identifier virtual.fn;
expression E1, E2, E3;
@@
fn(
+MAPPING_NULL,
E1, E2, E3)

// 
// Part 3 greps all the files with functions that call the invalidatepage callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/truncate.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2, E3, E4;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->invalidatepage(E2, E3, E4)
|
E1->a_ops->invalidatepage(E2, E3, E4)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... void (*invalidatepage)(
+struct address_space *,
struct page *, ...); ... };

@p4r2 depends on part4@
expression E1, E2, E3, E4;
@@
E1.a_ops->invalidatepage(
+MAPPING_NULL,
E2, E3, E4)

@p4r3 depends on part4@
expression E1, E2, E3, E4;
@@
E1->a_ops->invalidatepage(
+MAPPING_NULL,
E2, E3, E4)

@p4r4 depends on part4 exists@
identifier I1, FN;
expression E1;
@@
FN (...) {...
void (*I1)(struct page *, unsigned int, unsigned int);
...
I1 = E1->a_ops->invalidatepage;
...}

@p4r5 depends on p4r4 exists@
expression E1, E2, E3;
identifier I1, p4r4.FN;
@@
FN(...) {...
void (*I1)(
+struct address_space *,
struct page *, unsigned int, unsigned int);
...
 (*I1)(
+MAPPING_NULL,
E1, E2, E3);
...}

@p4r6 depends on part4@
expression E1, E2, E3;
@@
{...
-void (*invalidatepage)(struct 

[PATCH 02/14] fs: define filler_t as a function pointer type

2020-10-06 Thread jglisse
From: Jérôme Glisse 

Coccinelle is confused by filler_t not being a regular function pointer
type. As there is no reason to define filler_t as a non-pointer type,
redefine it as a function pointer type and update the function
prototypes accordingly.

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 fs/nfs/dir.c| 2 +-
 fs/nfs/symlink.c| 4 ++--
 include/linux/pagemap.h | 6 +++---
 mm/filemap.c| 5 ++---
 mm/readahead.c  | 2 +-
 5 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cb52db9a0cfb7..da1fe71ae810d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -740,7 +740,7 @@ static
 struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
 {
return read_cache_page(desc->file->f_mapping, desc->page_index,
-   nfs_readdir_filler, desc);
+   (filler_t)nfs_readdir_filler, desc);
 }
 
 /*
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 25ba299fdac2e..76691d94ae5f8 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -66,8 +66,8 @@ static const char *nfs_get_link(struct dentry *dentry,
err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
if (err)
return err;
-   page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
-   inode);
+   page = read_cache_page(&inode->i_data, 0,
+   (filler_t)nfs_symlink_filler, inode);
if (IS_ERR(page))
return ERR_CAST(page);
}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7de11dcd534d6..9acfc605b3bc3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -264,7 +264,7 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x)
return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
 }
 
-typedef int filler_t(void *, struct page *);
+typedef int (*filler_t)(void *, struct page *);
 
 pgoff_t page_cache_next_miss(struct address_space *mapping,
 pgoff_t index, unsigned long max_scan);
@@ -425,11 +425,11 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
 }
 
 extern struct page * read_cache_page(struct address_space *mapping,
-   pgoff_t index, filler_t *filler, void *data);
+   pgoff_t index, filler_t filler, void *data);
 extern struct page * read_cache_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
 extern int read_cache_pages(struct address_space *mapping,
-   struct list_head *pages, filler_t *filler, void *data);
+   struct list_head *pages, filler_t filler, void *data);
 
 static inline struct page *read_mapping_page(struct address_space *mapping,
pgoff_t index, void *data)
diff --git a/mm/filemap.c b/mm/filemap.c
index 99c49eeae71b8..2cdbbffc55522 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2942,8 +2942,7 @@ static struct page *wait_on_page_read(struct page *page)
 }
 
 static struct page *do_read_cache_page(struct address_space *mapping,
-   pgoff_t index,
-   int (*filler)(void *, struct page *),
+   pgoff_t index, filler_t filler,
void *data,
gfp_t gfp)
 {
@@ -3064,7 +3063,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
  */
 struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
-   int (*filler)(void *, struct page *),
+   filler_t filler,
void *data)
 {
return do_read_cache_page(mapping, index, filler, data,
diff --git a/mm/readahead.c b/mm/readahead.c
index 3c9a8dd7c56c8..cd67c9cfa931a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -87,7 +87,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
  * Returns: %0 on success, error return by @filler otherwise
  */
 int read_cache_pages(struct address_space *mapping, struct list_head *pages,
-   int (*filler)(void *, struct page *), void *data)
+   filler_t filler, void *data)
 {
struct page *page;
int ret = 0;
-- 
2.26.2



[PATCH 00/14] Small step toward KSM for file backed pages.

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This patchset is a step toward a larger objective: generalize the
existing KSM into a mechanism allowing exclusive write control of a
page, either anonymous memory (like KSM today) or file backed pages
(modulo GUP, which blocks that just as it does today for KSM).

Exclusive write control of a page allows multiple different features
to be implemented:

- KSM (kernel shared pages), ie de-duplicate pages with the same
  content so that a single read only page is used for all of them.
  We have that today for anonymous memory only. The overall patchset
  extends it to file backed pages, ie sharing the same struct page
  across different files or within the same file. This can be useful
  for containers for instance ... or for deduplication within the
  same file.

- NUMA duplication, duplicate a page into multiple local read only
  copies. This is the opposite of KSM in a sense: instead of saving
  memory, it uses more memory to get better memory access performance.
  For instance duplicating libc code into per-node copies, or
  duplicating a big read only dataset on each node.

- Exclusive write access, where the owner of the page write protection
  is the only one that can write to the page (and must still abide by
  fs rules for file backed pages with respect to writeback...). One use
  case is fast atomic operations using non atomic instructions. For
  instance for a PCIE device: if all mappings of the page are read only
  then the PCIE device driver knows device writes can not race with CPU
  writes. This is a performance optimization.

- Use main memory as a cache for persistent memory, ie the page is
  read only and a write triggers a callback, so different strategies
  can be used, like write combining (ie accumulating changes in main
  memory before copying them to persistent memory).

Like KSM today, such protection can be broken at _any_ time. The owner
of the protection gets a callback (the KSM code for instance gets
called) so that it can unprotect the page. Breaking the protection
should not block and must happen quickly (like the KSM code today).


Converting the existing KSM into a generic mechanism is straightforward
for anonymous pages (just factor out the KSM code that deals with page
protection from the KSM code that deals with de-duplication).


The big change here is the support for file backed pages. The idea is
that we almost always have the mapping a page belongs to within the
call stack, as we operate on such a page either from:
  - Syscall/kernel against a file (file -> inode -> mapping).
  - Syscall/kernel against virtual address (vma -> file -> mapping).
  - Write back for a given mapping (mapping -> pages).

There are a few exceptions:
  - Reclaim, but reclaim does not care about the mapping. Reclaim wants
    to unmap the page to free it up. So all we have to do is provide a
    special path to do that, just like KSM does today for anonymous pages.

  - Compaction, again we do not care about the mapping for compaction.
    All we need is a way to move pages (ie migrate them).

  - Data cache flush: on some architectures the cache lines are tagged
    with the virtual address, so when flushing a page we need to find
    all of its virtual addresses. Again we do not care about the
    mapping, we just need a way to find all virtual addresses in all
    processes pointing to the page.

  - GUP users that want to set a page dirty. This is easy, we just do
    not allow the protection to work on GUPed pages, and GUP also breaks
    the protection. There is just no way to synchronize with GUP users
    as they violate all mm and fs rules anyway.

  - Some proc fs and other memory debugging APIs. Here we do not care
    about the mapping but about the page state. Also some of those
    APIs work on virtual addresses, for which we can easily get the vma
    and thus the mapping.


So when we have the mapping for a page from the context, and not from
page->mapping, we can use it as a key to look up the private and index
field values for the page.

To avoid any regression risk, only protected pages see their fields
overloaded. It means that if you are not using the page protection then
page->mapping, page->private and page->index all stay as they are
today. Also page->mapping is always used as the canonical place to look
up the page mapping for unprotected pages, so that any existing code
will keep working as it does today even if the mapping we get from the
context does not match page->mapping. More on this below.


Overview:
=========

The core idea is pretty dumb, it is just about passing a new mapping
argument to every function that gets a page and needs the mapping
corresponding to that page. Most of the changes are done through
semantic patches. Adding a new function argument in itself does not
bring any risk. The risk is in making sure that the mapping we pass as
a function argument is the one corresponding to the page. To avoid any
regression we keep using page->mapping as the canonical mapping even

[PATCH 04/14] mm: add struct address_space to readpage() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to readpage() callback arguments.
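
For illustration (this snippet is not part of the patch, and
foo_readpage is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  int foo_readpage(struct file *file, struct page *page);
  ...
  ret = mapping->a_ops->readpage(file, page);

into:

  int foo_readpage(struct file *file, struct address_space *__mapping,
                   struct page *page);
  ...
  ret = mapping->a_ops->readpage(file, MAPPING_NULL, page);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged
(note that the new argument is inserted after the struct file pointer).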

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the readpage callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .readpage = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

@p1r4 depends on part1 exists@
expression E1, E2, E3;
identifier FN;
type T1;
@@
{...
(
read_cache_page(E1, E2, (T1)FN, E3)
|
read_cache_pages(E1, E2, (T1)FN, E3)
)
...}

@script:python p1r5 depends on p1r4@
funcname << p1r4.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  print(funcname)
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the readpage callback functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1, I2;
type T1, T2;
@@
int fn(T1 I1,
+struct address_space *__mapping,
T2 I2) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1, I2;
type T1, T2;
@@
int fn(T1 I1,
+struct address_space *__mapping,
T2 I2);

@p2r3 depends on part2@
identifier virtual.fn;
type T1, T2;
@@
int fn(T1,
+struct address_space *,
T2);

@p2r4 depends on part2@
identifier virtual.fn;
expression E1, E2;
@@
fn(E1,
+MAPPING_NULL,
E2)

// 
// Part 3 greps all the files with functions that call the readpage callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2, E3;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->readpage(E2, E3)
|
E1->a_ops->readpage(E2, E3)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

@p3r4 depends on part3 exists@
expression E1, E2, E3, E4;
identifier FN;
position P;
@@
FN@P(...) {...
(
read_cache_page(E1, E2, E3, E4)
|
read_cache_pages(E1, E2, E3, E4)
)
...}

@script:python p3r5 depends on p3r4@
P << p3r4.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... int (*readpage)(struct file *,
+struct address_space *,
struct page *); ... };

@p4r2 depends on part4@
expression E1, E2, E3;
@@

[PATCH 01/14] mm/pxa: page exclusive access add header file for all helpers.

2020-10-06 Thread jglisse
From: Jérôme Glisse 

Add include/linux/page-xa.h where all helpers related to Page eXclusive
Access (PXA) will be added (in following patches).

Also introduce MAPPING_NULL as a temporary define used to simplify the
mass modifications that stop relying on struct page.mapping and instead
pass down the mapping pointer from the context (either from the inode
for a syscall operating on a file, or from vma->vm_file when operating
on some virtual address range).

This is a temporary define, do not use it!
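
For illustration (this snippet is not part of the patch), a call site
converted by the following patches in this series will look like:

  mapping->a_ops->set_page_dirty(MAPPING_NULL, page);

which, with MAPPING_NULL defined as NULL, keeps today's behavior until
a real mapping pointer is plumbed through from the callers.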

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 include/linux/mm.h  |  5 
 include/linux/page-xa.h | 66 +
 2 files changed, 71 insertions(+)
 create mode 100644 include/linux/page-xa.h

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 16b799a0522cd..d165961c58c45 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3130,5 +3130,10 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
 
 extern int sysctl_nr_trim_pages;
 
+
+/* Page exclusive access do depend on some helpers define in here. */
+#include 
+
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/page-xa.h b/include/linux/page-xa.h
new file mode 100644
index 0..8ac9e6dc051e0
--- /dev/null
+++ b/include/linux/page-xa.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Page eXclusive Access (PXA) is a generic mechanism to allow exclusive access
+ * to a file backed or an anonymous page. Exclusive access means that no one can
+ * write to the page except the owner of the protection (but the page can still
+ * be read). The exclusive access can be _broken_ at any time and this can not
+ * be blocked (so anyone using that feature must be ready to give away the
+ * exclusive access at _any_ time and must do so in a timely fashion).
+ *
+ * Using PXA allows implementing a few different features:
+ *  - KSM (Kernel Shared Memory) where pages with the same content are
+ *    deduplicated into a unique page and all mappings are updated to read
+ *    only. This allows saving memory for workloads with a lot of pages with
+ *    the same content in different processes (multiple VMs for instance).
+ *
+ *  - NUMA duplication (sort of the opposite of KSM) where a page is
+ *    duplicated into multiple read only copies, each copy using physical
+ *    memory local to a NUMA node (or a device). This allows improving
+ *    performance by minimizing cross node memory transactions and bus
+ *    traffic. It does however use more memory, so what you gain in
+ *    performance you lose in available resources.
+ *
+ *  - Exclusive write access to a page, for instance you can use regular
+ *    write instructions and still get atomic behavior (as you are the only
+ *    one able to write, you have the guarantee that no one can race with you).
+ *
+ * And any other use cases you can think of ...
+ *
+ * See Documentation/vm/page-xa.rst for further information.
+ *
+ * Authors:
+ *  Jérôme Glisse
+ */
+#ifndef LINUX_PAGE_XA_H
+#define LINUX_PAGE_XA_H
+
+#include 
+#include 
+
+
+/*
+ * MAPPING_NULL is a temporary define used to simplify the mass modifications
+ * to stop relying on struct page.mapping and instead pass down the mapping
+ * pointer from the context (either from the inode for a syscall operating on a
+ * file or from vma->vm_file when operating on some virtual address range).
+ *
+ * DO NOT USE ! THIS IS ONLY FOR SEMANTIC PATCHES SIMPLIFICATION !
+ */
+#define MAPPING_NULL NULL
+
+
+/**
+ * PageXA() - is the page under exclusive access?
+ *
+ * This function checks if a page is under exclusive access.
+ *
+ * @page: Pointer to page to be queried.
+ * @Return: True, if it is under exclusive access, false otherwise.
+ */
+static inline bool PageXA(struct page *page)
+{
+   return false;
+}
+
+
+#endif /* LINUX_PAGE_XA_H */
-- 
2.26.2



[PATCH 05/14] mm: add struct address_space to writepage() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to writepage() callback arguments.
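
For illustration (this snippet is not part of the patch, and
foo_writepage is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  int foo_writepage(struct page *page, struct writeback_control *wbc);
  ...
  ret = mapping->a_ops->writepage(page, wbc);

into:

  int foo_writepage(struct address_space *__mapping, struct page *page,
                    struct writeback_control *wbc);
  ...
  ret = mapping->a_ops->writepage(MAPPING_NULL, page, wbc);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.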

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the writepage callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function names used as the callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .writepage = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add the address_space argument to the writepage callback functions
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1, I2;
type T1, T2;
@@
int fn(
+struct address_space *__mapping,
T1 I1, T2 I2) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1, I2;
type T1, T2;
@@
int fn(
+struct address_space *__mapping,
T1 I1, T2 I2);

@p2r3 depends on part2@
identifier virtual.fn;
type T1, T2;
@@
int fn(
+struct address_space *__mapping,
T1, T2);

@p2r4 depends on part2@
identifier virtual.fn;
expression E1, E2;
@@
fn(
+MAPPING_NULL,
E1, E2)

// 
// Part 3 greps all the files with functions that call the writepage callback.

// initialize the file where we collect all the file names (truncate and seed it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.write("./mm/readahead.c\n")
file.write("./mm/filemap.c\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2, E3;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->writepage(E2, E3)
|
E1->a_ops->writepage(E2, E3)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... int (*writepage)(
+struct address_space *,
struct page *page, ...); ... };

@p4r2 depends on part4@
expression E1, E2, E3;
@@
E1.a_ops->writepage(
+MAPPING_NULL,
E2, E3)

@p4r3 depends on part4@
expression E1, E2, E3;
@@
E1->a_ops->writepage(
+MAPPING_NULL,
E2, E3)
>%

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 3 ++-
 fs/9p/vfs_addr.c  | 4 +++-
 fs/adfs/inode.c   | 3 ++-
 fs/affs/file.c| 3 ++-
 fs/afs/internal.h  

[PATCH 06/14] mm: add struct address_space to set_page_dirty() callback

2020-10-06 Thread jglisse
From: Jérôme Glisse 

This is part of a patchset to remove the dependency on the struct
page.mapping field so that we can temporarily update it to point to a
special structure tracking temporary page state (note that the original
mapping pointer is preserved and can still be accessed, but at a cost).

Add struct address_space to set_page_dirty() callback arguments.
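
For illustration (this snippet is not part of the patch, and
foo_set_page_dirty is a made up name), the conversion performed by the
semantic patch below turns a callback and its call sites from:

  int foo_set_page_dirty(struct page *page);
  ...
  ret = mapping->a_ops->set_page_dirty(page);

into:

  int foo_set_page_dirty(struct address_space *__mapping,
                         struct page *page);
  ...
  ret = mapping->a_ops->set_page_dirty(MAPPING_NULL, page);

with MAPPING_NULL currently defined to NULL, so behavior is unchanged.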

Note that this patch does not make use of the new argument, nor does
it pass a valid one at the call sites (for now this patch simply passes
NULL as the new argument value).

Use the following script (from the root of the linux kernel tree):

./that-script.sh that-semantic-patch.spatch

%<
#!/bin/sh
spatch_file=$1

echo PART1 ===

# P1 find the callback function names
spatch  --dir . --no-includes -D part1 --sp-file $spatch_file

echo PART2 ===

# P2 change callback function prototype
cat /tmp/unicorn-functions | sort | uniq | while read func ; do
for file in $( git grep -l $func -- '*.[ch]' ) ; do
echo $file
spatch --no-includes --in-place -D part2 \
   -D fn=$func --sp-file $spatch_file $file
done
done

echo PART 3 ==

# P3 find all functions which call the callback
spatch --dir . --include-headers -D part3 --sp-file $spatch_file

echo PART 4===

# P4 change all functions which call the callback
cat /tmp/unicorn-files | sort | uniq | while read file ; do
echo $file
spatch --no-includes --in-place -D part4 \
   --sp-file $spatch_file $file
done
>%

With the following semantic patch:

%<
virtual part1, part2, part3, part4

// 
// Part 1 greps all the functions that are used as the set_page_dirty callback.

// initialize the file where we collect all the function names (truncate it)
@initialize:python depends on part1@
@@
file=open('/tmp/unicorn-functions', 'w')
file.close()

// match function name use as a callback
@p1r2 depends on part1@
identifier I1, FN;
@@
struct address_space_operations I1 = {..., .set_page_dirty = FN, ...};

@script:python p1r3 depends on p1r2@
funcname << p1r2.FN;
@@
if funcname != "NULL":
  file=open('/tmp/unicorn-functions', 'a')
  file.write(funcname + '\n')
  file.close()

// ---
// Part 2 modify callback

// Add address_space argument to the function (set_page_dirty callback one)
@p2r1 depends on part2@
identifier virtual.fn;
identifier I1;
type T1;
@@
int fn(
+struct address_space *__mapping,
T1 I1) { ... }

@p2r2 depends on part2@
identifier virtual.fn;
identifier I1;
type T1;
@@
int fn(
+struct address_space *__mapping,
T1 I1);

@p2r3 depends on part2@
identifier virtual.fn;
type T1;
@@
int fn(
+struct address_space *,
T1);

@p2r4 depends on part2@
identifier virtual.fn;
expression E1;
@@
fn(
+MAPPING_NULL,
E1)

// 
// Part 3 greps all functions that call the set_page_dirty callback.

// initialize the file where we collect the file names to patch (erase it)
@initialize:python depends on part3@
@@
file=open('/tmp/unicorn-files', 'w')
file.write("./include/linux/pagemap.h\n")
file.write("./mm/page-writeback.c\n")
file.write("./include/linux/mm.h\n")
file.write("./include/linux/fs.h\n")
file.close()

@p3r1 depends on part3 exists@
expression E1, E2;
identifier FN;
position P;
@@
FN@P(...) {...
(
E1.a_ops->set_page_dirty(E2)
|
E1->a_ops->set_page_dirty(E2)
)
...}

@script:python p3r2 depends on p3r1@
P << p3r1.P;
@@
file=open('/tmp/unicorn-files', 'a')
file.write(P[0].file + '\n')
file.close()

// ---
// Part 4 generic modification
@p4r1 depends on part4@
@@
struct address_space_operations { ... int (*set_page_dirty)(
+struct address_space *,
struct page *page); ... };

@p4r2 depends on part4@
expression E1, E2;
@@
E1.a_ops->set_page_dirty(
+MAPPING_NULL,
E2)

@p4r3 depends on part4@
expression E1, E2;
@@
E1->a_ops->set_page_dirty(
+MAPPING_NULL,
E2)

@p4r4 depends on part4@
@@
{...
-int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
+int (*spd)(struct address_space *, struct page *) = 
mapping->a_ops->set_page_dirty;
...
return (*spd)(
+MAPPING_NULL,
page);
...}
>%

Signed-off-by: Jérôme Glisse 
Cc: linux...@kvack.org
Cc: linux-fsde...@vger.kernel.org
Cc: Andrew Morton 
Cc: Alexander Viro 
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
---
 drivers/video/fbdev/core/fb_defio.c |  3 ++-
 fs/afs/dir.c

[PATCH 03/14] fs: directly use a_ops->freepage() instead of a local copy of it.

2020-10-06 Thread jglisse
From: Jérôme Glisse 

Coccinelle gets confused by local function pointer copies, so convert the
callers to use a_ops->freepage() directly to be nice to Coccinelle.
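
To make the issue concrete, here is a minimal sketch (hypothetical foo_* names)
of the two forms; expression rules of the kind used in this series match the
direct a_ops dereference but do not see through a local function-pointer alias:

#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Form the semantic patch cannot follow: the callback is stashed in a local
 * pointer, so rules matching "mapping->a_ops->freepage(page)" never see it.
 */
static void foo_free_page_indirect(struct address_space *mapping, struct page *page)
{
	void (*freepage)(struct page *);

	freepage = mapping->a_ops->freepage;
	if (freepage)
		freepage(page);
}

/* Form the semantic patch can rewrite mechanically. */
static void foo_free_page_direct(struct address_space *mapping, struct page *page)
{
	if (mapping->a_ops->freepage)
		mapping->a_ops->freepage(page);
}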

Signed-off-by: Jérôme Glisse 
Cc: linux-fsde...@vger.kernel.org
Cc: linux...@kvack.org
Cc: Alexander Viro 
Cc: Tejun Heo 
Cc: Jan Kara 
Cc: Josef Bacik 
Cc: Andrew Morton 
---
 mm/filemap.c | 12 
 mm/vmscan.c  |  7 ++-
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 2cdbbffc55522..ba892599a2717 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -242,11 +242,8 @@ void __delete_from_page_cache(struct page *page, void 
*shadow)
 static void page_cache_free_page(struct address_space *mapping,
struct page *page)
 {
-   void (*freepage)(struct page *);
-
-   freepage = mapping->a_ops->freepage;
-   if (freepage)
-   freepage(page);
+   if (mapping->a_ops->freepage)
+   mapping->a_ops->freepage(page);
 
if (PageTransHuge(page) && !PageHuge(page)) {
page_ref_sub(page, HPAGE_PMD_NR);
@@ -790,7 +787,6 @@ EXPORT_SYMBOL(file_write_and_wait_range);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 {
struct address_space *mapping = old->mapping;
-   void (*freepage)(struct page *) = mapping->a_ops->freepage;
pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);
unsigned long flags;
@@ -819,8 +815,8 @@ int replace_page_cache_page(struct page *old, struct page 
*new, gfp_t gfp_mask)
if (PageSwapBacked(new))
__inc_lruvec_page_state(new, NR_SHMEM);
	xas_unlock_irqrestore(&xas, flags);
-   if (freepage)
-   freepage(old);
+   if (mapping->a_ops->freepage)
+   mapping->a_ops->freepage(old);
put_page(old);
 
return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 466fc3144fffc..6db869339073d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -903,9 +903,6 @@ static int __remove_mapping(struct address_space *mapping, 
struct page *page,
	xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
} else {
-   void (*freepage)(struct page *);
-
-   freepage = mapping->a_ops->freepage;
/*
 * Remember a shadow entry for reclaimed file cache in
 * order to detect refaults, thus thrashing, later on.
@@ -928,8 +925,8 @@ static int __remove_mapping(struct address_space *mapping, 
struct page *page,
__delete_from_page_cache(page, shadow);
	xa_unlock_irqrestore(&mapping->i_pages, flags);
 
-   if (freepage != NULL)
-   freepage(page);
+   if (mapping->a_ops->freepage != NULL)
+   mapping->a_ops->freepage(page);
}
 
return 1;
-- 
2.26.2



[PATCH] mm/hmm: move THP and hugetlbfs code path behind #if KCONFIG

2019-04-22 Thread jglisse
From: Jérôme Glisse 

To avoid undefined symbol build warnings or errors, move the THP and
hugetlbfs code paths behind #if/#else/#endif guards on the appropriate
Kconfig options.

Signed-off-by: Jérôme Glisse 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: Andrew Morton 
---
 mm/hmm.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index ecd16718285e..a8a950fe46b6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -520,6 +520,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
  uint64_t *pfns,
  pmd_t pmd)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
@@ -550,6 +551,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
}
hmm_vma_walk->last = end;
return 0;
+#else
+   /* If THP is not enabled then we should never reach this code! */
+   return -EINVAL;
+#endif
 }
 
 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
@@ -792,6 +797,7 @@ static int hmm_vma_walk_pud(pud_t *pudp,
return hmm_vma_walk_hole_(addr, end, fault,
write_fault, walk);
 
+#ifdef CONFIG_HUGETLB_PAGE
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn) {
hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
@@ -807,6 +813,9 @@ static int hmm_vma_walk_pud(pud_t *pudp,
}
hmm_vma_walk->last = end;
return 0;
+#else
+   return -EINVAL;
+#endif
}
 
split_huge_pud(walk->vma, pudp, addr);
-- 
2.20.1



[PATCH] mm/hmm: add ARCH_HAS_HMM_MIRROR ARCH_HAS_HMM_DEVICE Kconfig

2019-04-17 Thread jglisse
From: Jérôme Glisse 

This patch just adds 2 new Kconfig options that are _not used_ by anyone. I
checked that various make ARCH=somearch allmodconfig builds work and do not
complain. These new Kconfig options need to be added first so that device
drivers that depend on HMM can be updated.

Once drivers are updated, I can then update the HMM Kconfig options to depend
on these new options in a followup patch.

Signed-off-by: Jérôme Glisse 
Cc: Guenter Roeck 
Cc: Leon Romanovsky 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 mm/Kconfig | 16 
 1 file changed, 16 insertions(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..daadc9131087 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -676,6 +676,22 @@ config ZONE_DEVICE
 
  If FS_DAX is enabled, then say Y.
 
+config ARCH_HAS_HMM_MIRROR
+   bool
+   default y
+   depends on (X86_64 || PPC64)
+   depends on MMU && 64BIT
+
+config ARCH_HAS_HMM_DEVICE
+   bool
+   default y
+   depends on (X86_64 || PPC64)
+   depends on MEMORY_HOTPLUG
+   depends on MEMORY_HOTREMOVE
+   depends on SPARSEMEM_VMEMMAP
+   depends on ARCH_HAS_ZONE_DEVICE
+   select XARRAY_MULTI
+
 config ARCH_HAS_HMM
bool
default y
-- 
2.20.1



[PATCH v4 1/1] RDMA/odp: convert to use HMM for ODP v4

2019-04-11 Thread jglisse
From: Jérôme Glisse 

Convert ODP to use HMM so that we can build on common infrastructure
for different classes of devices that want to mirror a process address
space into a device. There are no functional changes.

Changes since v3:
- fix Kconfig to properly depends on HMM, also make sure things
  build properly if on demand paging is not enabled
Changes since v2:
- rebase on top of newer HMM patchset and mmu notifier patchset
Changes since v1:
- improved comments
- simplified page alignment computation

Signed-off-by: Jérôme Glisse 
Cc: Jason Gunthorpe 
Cc: Leon Romanovsky 
Cc: Doug Ledford 
Cc: Artemy Kovalyov 
Cc: Moni Shoua 
Cc: Mike Marciniszyn 
Cc: Kaike Wan 
Cc: Dennis Dalessandro 
Cc: linux-r...@vger.kernel.org
---
 drivers/infiniband/Kconfig |   3 +-
 drivers/infiniband/core/umem_odp.c | 499 -
 drivers/infiniband/hw/mlx5/mem.c   |  20 +-
 drivers/infiniband/hw/mlx5/mr.c|   2 +-
 drivers/infiniband/hw/mlx5/odp.c   | 106 +++---
 include/rdma/ib_umem_odp.h |  49 ++-
 6 files changed, 231 insertions(+), 448 deletions(-)

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index a1fb840de45d..8002ca65898a 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -64,7 +64,8 @@ config INFINIBAND_USER_MEM
 config INFINIBAND_ON_DEMAND_PAGING
bool "InfiniBand on-demand paging support"
depends on INFINIBAND_USER_MEM
-   select MMU_NOTIFIER
+   depends on ARCH_HAS_HMM
+   select HMM_MIRROR
default y
---help---
  On demand paging support for the InfiniBand subsystem.
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index bcd53f302df2..139f520e733d 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -46,6 +46,26 @@
 #include 
 #include 
 
+
+static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = {
+   ODP_READ_BIT,   /* HMM_PFN_VALID */
+   ODP_WRITE_BIT,  /* HMM_PFN_WRITE */
+   /*
+* The ODP_DEVICE_BIT is not used by ODP but is here to comply
+* with the HMM API, which also caters to devices with local memory.
+* RDMA devices do not have any such memory and thus do not
+* have a real use for that flag.
+*/
+   ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */
+};
+
+static uint64_t odp_hmm_values[HMM_PFN_VALUE_MAX] = {
+   -1UL,   /* HMM_PFN_ERROR */
+   0UL,/* HMM_PFN_NONE */
+   -2UL,   /* HMM_PFN_SPECIAL */
+};
+
+
 /*
  * The ib_umem list keeps track of memory regions for which the HW
  * device request to receive notification when the related memory
@@ -78,57 +98,25 @@ static u64 node_last(struct umem_odp_node *n)
 INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
 node_start, node_last, static, rbt_ib_umem)
 
-static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
-{
-   mutex_lock(&umem_odp->umem_mutex);
-   if (umem_odp->notifiers_count++ == 0)
-   /*
-* Initialize the completion object for waiting on
-* notifiers. Since notifier_count is zero, no one should be
-* waiting right now.
-*/
-   reinit_completion(&umem_odp->notifier_completion);
-   mutex_unlock(&umem_odp->umem_mutex);
-}
-
-static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
-{
-   mutex_lock(&umem_odp->umem_mutex);
-   /*
-* This sequence increase will notify the QP page fault that the page
-* that is going to be mapped in the spte could have been freed.
-*/
-   ++umem_odp->notifiers_seq;
-   if (--umem_odp->notifiers_count == 0)
-   complete_all(&umem_odp->notifier_completion);
-   mutex_unlock(&umem_odp->umem_mutex);
-}
-
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
   u64 start, u64 end, void *cookie)
 {
	struct ib_umem *umem = &umem_odp->umem;
 
-   /*
-* Increase the number of notifiers running, to
-* prevent any further fault handling on this MR.
-*/
-   ib_umem_notifier_start_account(umem_odp);
umem_odp->dying = 1;
/* Make sure that the fact the umem is dying is out before we release
 * all pending page faults. */
smp_wmb();
-   complete_all(&umem_odp->notifier_completion);
umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
ib_umem_end(umem));
return 0;
 }
 
-static void ib_umem_notifier_release(struct mmu_notifier *mn,
-struct mm_struct *mm)
+static void ib_umem_notifier_release(struct hmm_mirror *mirror)
 {
-   struct ib_ucontext_per_mm *per_mm =
-   container_of(mn, struct ib_ucontext_per_mm, mn);
+   struct ib_ucontext_per_mm *per_mm;
+
+   per_mm = container_of(mirror, struct 

[PATCH v4 0/1] Use HMM for ODP v4

2019-04-11 Thread jglisse
From: Jérôme Glisse 

Just fixed Kconfig and build when ODP was not enabled, other than that
this is the same as v3. Here is previous cover letter:

Git tree with all prerequisite:
https://cgit.freedesktop.org/~glisse/linux/log/?h=rdma-odp-hmm-v4

This patchset converts RDMA ODP to use HMM underneath. This is motivated
by stronger code sharing for the same feature (Shared Virtual Memory, SVM,
or Shared Virtual Address, SVA) and also stronger integration with mm code
to achieve that. It depends on the HMM patchset posted for inclusion in 5.2
[2] and [3].

It has been tested with the pingpong test with -o and other flags to test
different sizes/features associated with ODP.

Moreover there are some features of HMM in the works, like peer to peer
support, fast CPU page table snapshot, fast IOMMU mapping update ...
It will be easier for RDMA devices with ODP to leverage those if they
use HMM underneath.

Quick summary of what HMM is:
HMM is a toolbox for device drivers to implement software support for
Shared Virtual Memory (SVM). Not only does it provide helpers to mirror a
process address space on a device (hmm_mirror), it also provides helpers
to allow device memory to back regular valid virtual addresses of a
process (any valid mmap that is not an mmap of a device or a DAX mapping).
There are two kinds of device memory: private memory that is not
accessible to the CPU because it does not have all the expected properties
(this is the case for all PCIE devices), and public memory which can also
be accessed by the CPU without restriction (with OpenCAPI or CCIX or a
similar cache-coherent and atomic interconnect).

Device drivers can use each of the HMM tools separately. You do not have to
use all the tools it provides.

For RDMA devices I do not expect a need to use the device memory support
of HMM. This device memory support is geared toward accelerators like GPUs.
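
As a rough illustration of the mirror side of that toolbox, the sketch below
shows how a driver might embed and register a struct hmm_mirror against a
process mm. The my_* names are hypothetical, and the ops wiring is an
assumption based on the hmm_mirror_register()/hmm_mirror_unregister() API used
elsewhere in this series, not a definitive implementation:

#include <linux/hmm.h>
#include <linux/slab.h>

struct my_ctx {
	struct hmm_mirror mirror;
	/* ... device page table state ... */
};

static void my_mirror_release(struct hmm_mirror *mirror)
{
	struct my_ctx *ctx = container_of(mirror, struct my_ctx, mirror);

	/* The process address space is going away: tear down device mappings. */
	(void)ctx;
}

static const struct hmm_mirror_ops my_mirror_ops = {
	.release = my_mirror_release,
	/* .sync_cpu_device_pagetables = ... invalidate device page tables ... */
};

static struct my_ctx *my_ctx_create(struct mm_struct *mm)
{
	struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return NULL;
	ctx->mirror.ops = &my_mirror_ops;
	if (hmm_mirror_register(&ctx->mirror, mm)) {
		kfree(ctx);
		return NULL;
	}
	return ctx;
}

static void my_ctx_destroy(struct my_ctx *ctx)
{
	hmm_mirror_unregister(&ctx->mirror);
	kfree(ctx);
}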


You can find a branch [1] with all the prerequisite in. This patch is on
top of rdma-next with the HMM patchset [2] and mmu notifier patchset [3]
applied on top of it.

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=rdma-odp-hmm-v4
[2] https://lkml.org/lkml/2019/4/3/1032
[3] https://lkml.org/lkml/2019/3/26/900

Cc: linux-r...@vger.kernel.org
Cc: Jason Gunthorpe 
Cc: Leon Romanovsky 
Cc: Doug Ledford 
Cc: Artemy Kovalyov 
Cc: Moni Shoua 
Cc: Mike Marciniszyn 
Cc: Kaike Wan 
Cc: Dennis Dalessandro 

Jérôme Glisse (1):
  RDMA/odp: convert to use HMM for ODP v4

 drivers/infiniband/Kconfig |   3 +-
 drivers/infiniband/core/umem_odp.c | 499 -
 drivers/infiniband/hw/mlx5/mem.c   |  20 +-
 drivers/infiniband/hw/mlx5/mr.c|   2 +-
 drivers/infiniband/hw/mlx5/odp.c   | 106 +++---
 include/rdma/ib_umem_odp.h |  49 ++-
 6 files changed, 231 insertions(+), 448 deletions(-)

-- 
2.20.1



[PATCH] mm/hmm: kconfig split HMM address space mirroring from device memory

2019-04-11 Thread jglisse
From: Jérôme Glisse 

To allow building device driver that only care about address space
mirroring (like RDMA ODP) on platform that do not have all the pre-
requisite for HMM device memory (like ZONE_DEVICE on ARM) split the
HMM_MIRROR option dependency from the HMM_DEVICE dependency.

Signed-off-by: Jérôme Glisse 
Cc: Leon Romanovsky 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 mm/Kconfig | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 2e6d24d783f7..00d9febbc775 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -679,12 +679,13 @@ config ZONE_DEVICE
 config ARCH_HAS_HMM
bool
default y
-   depends on (X86_64 || PPC64)
-   depends on ZONE_DEVICE
depends on MMU && 64BIT
-   depends on MEMORY_HOTPLUG
-   depends on MEMORY_HOTREMOVE
-   depends on SPARSEMEM_VMEMMAP
+
+config ARCH_HAS_HMM_DEVICE
+   bool
+   default y
+   depends on (X86_64 || PPC64)
+   depends on ARCH_HAS_ZONE_DEVICE
 
 config MIGRATE_VMA_HELPER
bool
@@ -710,7 +711,8 @@ config HMM_MIRROR
 
 config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
-   depends on ARCH_HAS_HMM
+   depends on ARCH_HAS_HMM_DEVICE
+   depends on ZONE_DEVICE
select HMM
select DEV_PAGEMAP_OPS
 
@@ -721,7 +723,8 @@ config DEVICE_PRIVATE
 
 config DEVICE_PUBLIC
bool "Addressable device memory (like GPU memory)"
-   depends on ARCH_HAS_HMM
+   depends on ARCH_HAS_HMM_DEVICE
+   depends on ZONE_DEVICE
select HMM
select DEV_PAGEMAP_OPS
 
-- 
2.20.1



[PATCH] cifs: fix page reference leak with readv/writev

2019-04-10 Thread jglisse
From: Jérôme Glisse 

CIFS can leak page references taken through GUP (get_user_pages*() via
iov_iter_get_pages()). This happens if the cifs_send_async_read() or
cifs_write_from_iter() calls fail from within __cifs_readv() and
__cifs_writev() respectively. This patch moves the page unreferencing to
cifs_aio_ctx_release(), which runs on all code paths; this is simpler to
follow and easier to verify for correctness.

Signed-off-by: Jérôme Glisse 
Cc: Steve French 
Cc: linux-c...@vger.kernel.org
Cc: samba-techni...@lists.samba.org
Cc: Alexander Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: Linus Torvalds 
Cc: Stable 
---
 fs/cifs/file.c | 15 +--
 fs/cifs/misc.c | 23 ++-
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 89006e044973..a756a4d3f70f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2858,7 +2858,6 @@ static void collect_uncached_write_data(struct 
cifs_aio_ctx *ctx)
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
struct dentry *dentry = ctx->cfile->dentry;
-   unsigned int i;
int rc;
 
tcon = tlink_tcon(ctx->cfile->tlink);
@@ -2922,10 +2921,6 @@ static void collect_uncached_write_data(struct 
cifs_aio_ctx *ctx)
kref_put(>refcount, cifs_uncached_writedata_release);
}
 
-   if (!ctx->direct_io)
-   for (i = 0; i < ctx->npages; i++)
-   put_page(ctx->bv[i].bv_page);
-
cifs_stats_bytes_written(tcon, ctx->total_len);
	set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
 
@@ -3563,7 +3558,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
	struct iov_iter *to = &ctx->iter;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
-   unsigned int i;
int rc;
 
tcon = tlink_tcon(ctx->cfile->tlink);
@@ -3647,15 +3641,8 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
kref_put(>refcount, cifs_uncached_readdata_release);
}
 
-   if (!ctx->direct_io) {
-   for (i = 0; i < ctx->npages; i++) {
-   if (ctx->should_dirty)
-   set_page_dirty(ctx->bv[i].bv_page);
-   put_page(ctx->bv[i].bv_page);
-   }
-
+   if (!ctx->direct_io)
ctx->total_len = ctx->len - iov_iter_count(to);
-   }
 
/* mask nodata case */
if (rc == -ENODATA)
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index bee203055b30..9bc0d17a9d77 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -768,6 +768,11 @@ cifs_aio_ctx_alloc(void)
 {
struct cifs_aio_ctx *ctx;
 
+   /*
+* Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io
+* to false so that we know when we have to unreference pages within
+* cifs_aio_ctx_release()
+*/
ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL);
if (!ctx)
return NULL;
@@ -786,7 +791,23 @@ cifs_aio_ctx_release(struct kref *refcount)
struct cifs_aio_ctx, refcount);
 
cifsFileInfo_put(ctx->cfile);
-   kvfree(ctx->bv);
+
+   /*
+* ctx->bv is only set if setup_aio_ctx_iter() was called successfully,
+* which means that iov_iter_get_pages() was a success and thus that
+* we have taken references on the pages.
+*/
+   if (ctx->bv) {
+   unsigned i;
+
+   for (i = 0; i < ctx->npages; i++) {
+   if (ctx->should_dirty)
+   set_page_dirty(ctx->bv[i].bv_page);
+   put_page(ctx->bv[i].bv_page);
+   }
+   kvfree(ctx->bv);
+   }
+
kfree(ctx);
 }
 
-- 
2.20.1



[PATCH v3 1/1] RDMA/odp: convert to use HMM for ODP v3

2019-04-10 Thread jglisse
From: Jérôme Glisse 

Convert ODP to use HMM so that we can build on common infrastructure
for different classes of devices that want to mirror a process address
space into a device. There are no functional changes.

Changes since v2:
- rebase on top of newer HMM patchset and mmu notifier patchset
Changes since v1:
- improved comments
- simplified page alignment computation

Signed-off-by: Jérôme Glisse 
Cc: linux-r...@vger.kernel.org
Cc: Jason Gunthorpe 
Cc: Leon Romanovsky 
Cc: Doug Ledford 
Cc: Artemy Kovalyov 
Cc: Moni Shoua 
Cc: Mike Marciniszyn 
Cc: Kaike Wan 
Cc: Dennis Dalessandro 
---
 drivers/infiniband/core/umem_odp.c | 499 -
 drivers/infiniband/hw/mlx5/mem.c   |  20 +-
 drivers/infiniband/hw/mlx5/mr.c|   2 +-
 drivers/infiniband/hw/mlx5/odp.c   | 106 +++---
 include/rdma/ib_umem_odp.h |  48 ++-
 5 files changed, 228 insertions(+), 447 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index bcd53f302df2..139f520e733d 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -46,6 +46,26 @@
 #include 
 #include 
 
+
+static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = {
+   ODP_READ_BIT,   /* HMM_PFN_VALID */
+   ODP_WRITE_BIT,  /* HMM_PFN_WRITE */
+   /*
+* The ODP_DEVICE_BIT is not used by ODP but is here to comply
+* with the HMM API, which also caters to devices with local memory.
+* RDMA devices do not have any such memory and thus do not
+* have a real use for that flag.
+*/
+   ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */
+};
+
+static uint64_t odp_hmm_values[HMM_PFN_VALUE_MAX] = {
+   -1UL,   /* HMM_PFN_ERROR */
+   0UL,/* HMM_PFN_NONE */
+   -2UL,   /* HMM_PFN_SPECIAL */
+};
+
+
 /*
  * The ib_umem list keeps track of memory regions for which the HW
  * device request to receive notification when the related memory
@@ -78,57 +98,25 @@ static u64 node_last(struct umem_odp_node *n)
 INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
 node_start, node_last, static, rbt_ib_umem)
 
-static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
-{
-   mutex_lock(&umem_odp->umem_mutex);
-   if (umem_odp->notifiers_count++ == 0)
-   /*
-* Initialize the completion object for waiting on
-* notifiers. Since notifier_count is zero, no one should be
-* waiting right now.
-*/
-   reinit_completion(&umem_odp->notifier_completion);
-   mutex_unlock(&umem_odp->umem_mutex);
-}
-
-static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
-{
-   mutex_lock(&umem_odp->umem_mutex);
-   /*
-* This sequence increase will notify the QP page fault that the page
-* that is going to be mapped in the spte could have been freed.
-*/
-   ++umem_odp->notifiers_seq;
-   if (--umem_odp->notifiers_count == 0)
-   complete_all(&umem_odp->notifier_completion);
-   mutex_unlock(&umem_odp->umem_mutex);
-}
-
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
   u64 start, u64 end, void *cookie)
 {
	struct ib_umem *umem = &umem_odp->umem;
 
-   /*
-* Increase the number of notifiers running, to
-* prevent any further fault handling on this MR.
-*/
-   ib_umem_notifier_start_account(umem_odp);
umem_odp->dying = 1;
/* Make sure that the fact the umem is dying is out before we release
 * all pending page faults. */
smp_wmb();
-   complete_all(&umem_odp->notifier_completion);
umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
ib_umem_end(umem));
return 0;
 }
 
-static void ib_umem_notifier_release(struct mmu_notifier *mn,
-struct mm_struct *mm)
+static void ib_umem_notifier_release(struct hmm_mirror *mirror)
 {
-   struct ib_ucontext_per_mm *per_mm =
-   container_of(mn, struct ib_ucontext_per_mm, mn);
+   struct ib_ucontext_per_mm *per_mm;
+
+   per_mm = container_of(mirror, struct ib_ucontext_per_mm, mirror);
 
	down_read(&per_mm->umem_rwsem);
if (per_mm->active)
@@ -136,23 +124,26 @@ static void ib_umem_notifier_release(struct mmu_notifier 
*mn,
		&per_mm->umem_tree, 0, ULLONG_MAX,
ib_umem_notifier_release_trampoline, true, NULL);
	up_read(&per_mm->umem_rwsem);
+
+   per_mm->mm = NULL;
 }
 
-static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
-u64 start, u64 end, void *cookie)
+static int invalidate_range_trampoline(struct ib_umem_odp *item,
+  u64 start, u64 end, void *cookie)
 {
-   ib_umem_notifier_start_account(item);

[PATCH v3 0/1] Use HMM for ODP v3

2019-04-10 Thread jglisse
From: Jérôme Glisse 

Changes since v1/v2 are about rebase and better comments in the code.
Previous cover letter slightly updated.


This patchset converts RDMA ODP to use HMM underneath. This is motivated
by stronger code sharing for the same feature (Shared Virtual Memory, SVM,
or Shared Virtual Address, SVA) and also stronger integration with mm code
to achieve that. It depends on the HMM patchset posted for inclusion in 5.2
[2] and [3].

It has been tested with the pingpong test with -o and other flags to test
different sizes/features associated with ODP.

Moreover there are some features of HMM in the works, like peer to peer
support, fast CPU page table snapshot, fast IOMMU mapping update ...
It will be easier for RDMA devices with ODP to leverage those if they
use HMM underneath.

Quick summary of what HMM is:
HMM is a toolbox for device drivers to implement software support for
Shared Virtual Memory (SVM). Not only does it provide helpers to mirror a
process address space on a device (hmm_mirror), it also provides helpers
to allow device memory to back regular valid virtual addresses of a
process (any valid mmap that is not an mmap of a device or a DAX mapping).
There are two kinds of device memory: private memory that is not
accessible to the CPU because it does not have all the expected properties
(this is the case for all PCIE devices), and public memory which can also
be accessed by the CPU without restriction (with OpenCAPI or CCIX or a
similar cache-coherent and atomic interconnect).

Device drivers can use each of the HMM tools separately. You do not have to
use all the tools it provides.

For RDMA devices I do not expect a need to use the device memory support
of HMM. This device memory support is geared toward accelerators like GPUs.


You can find a branch [1] with all the prerequisite in. This patch is on
top of rdma-next with the HMM patchset [2] and mmu notifier patchset [3]
applied on top of it.

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=rdma-5.2
[2] https://lkml.org/lkml/2019/4/3/1032
[3] https://lkml.org/lkml/2019/3/26/900

Cc: linux-r...@vger.kernel.org
Cc: Jason Gunthorpe 
Cc: Leon Romanovsky 
Cc: Doug Ledford 
Cc: Artemy Kovalyov 
Cc: Moni Shoua 
Cc: Mike Marciniszyn 
Cc: Kaike Wan 
Cc: Dennis Dalessandro 

Jérôme Glisse (1):
  RDMA/odp: convert to use HMM for ODP v3

 drivers/infiniband/core/umem_odp.c | 486 -
 drivers/infiniband/hw/mlx5/mem.c   |  20 +-
 drivers/infiniband/hw/mlx5/mr.c|   2 +-
 drivers/infiniband/hw/mlx5/odp.c   | 106 ---
 include/rdma/ib_umem_odp.h |  48 ++-
 5 files changed, 219 insertions(+), 443 deletions(-)

-- 
2.20.1



[PATCH] mm/hmm: fix hmm_range_dma_map()/hmm_range_dma_unmap()

2019-04-09 Thread jglisse
From: Jérôme Glisse 

hmm_range_dma_map()/hmm_range_dma_unmap() were using the wrong field and
the wrong enum to distinguish read-only from read-and-write mappings.

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 mm/hmm.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 90369fd2307b..ecd16718285e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1203,7 +1203,7 @@ long hmm_range_dma_map(struct hmm_range *range,
 
npages = (range->end - range->start) >> PAGE_SHIFT;
for (i = 0, mapped = 0; i < npages; ++i) {
-   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   enum dma_data_direction dir = DMA_TO_DEVICE;
struct page *page;
 
/*
@@ -1227,7 +1227,7 @@ long hmm_range_dma_map(struct hmm_range *range,
}
 
/* If it is read and write than map bi-directional. */
-   if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+   if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
dir = DMA_BIDIRECTIONAL;
 
daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
@@ -1243,7 +1243,7 @@ long hmm_range_dma_map(struct hmm_range *range,
 
 unmap:
for (npages = i, i = 0; (i < npages) && mapped; ++i) {
-   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   enum dma_data_direction dir = DMA_TO_DEVICE;
struct page *page;
 
page = hmm_device_entry_to_page(range, range->pfns[i]);
@@ -1254,7 +1254,7 @@ long hmm_range_dma_map(struct hmm_range *range,
continue;
 
/* If it is read and write than map bi-directional. */
-   if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+   if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
dir = DMA_BIDIRECTIONAL;
 
dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
@@ -1298,7 +1298,7 @@ long hmm_range_dma_unmap(struct hmm_range *range,
 
npages = (range->end - range->start) >> PAGE_SHIFT;
for (i = 0; i < npages; ++i) {
-   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   enum dma_data_direction dir = DMA_TO_DEVICE;
struct page *page;
 
page = hmm_device_entry_to_page(range, range->pfns[i]);
@@ -1306,7 +1306,7 @@ long hmm_range_dma_unmap(struct hmm_range *range,
continue;
 
/* If it is read and write than map bi-directional. */
-   if (range->pfns[i] & range->values[HMM_PFN_WRITE]) {
+   if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
dir = DMA_BIDIRECTIONAL;
 
/*
-- 
2.20.1



[PATCH] zram: pass down the bvec we need to read into in the work struct

2019-04-08 Thread jglisse
From: Jérôme Glisse 

When scheduling a work item to read a page we need to pass down the proper
bvec struct which points to the page to read into. Before this patch the
work handler used a randomly initialized bvec (only when PAGE_SIZE != 4096),
which is wrong.

Signed-off-by: Jérôme Glisse 
Cc: Minchan Kim 
Cc: Nitin Gupta 
Cc: Sergey Senozhatsky 
Cc: linux-kernel@vger.kernel.org
---
 drivers/block/zram/zram_drv.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 399cad7daae7..d58a359a6622 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -774,18 +774,18 @@ struct zram_work {
struct zram *zram;
unsigned long entry;
struct bio *bio;
+   struct bio_vec bvec;
 };
 
 #if PAGE_SIZE != 4096
 static void zram_sync_read(struct work_struct *work)
 {
-   struct bio_vec bvec;
struct zram_work *zw = container_of(work, struct zram_work, work);
struct zram *zram = zw->zram;
unsigned long entry = zw->entry;
struct bio *bio = zw->bio;
 
-   read_from_bdev_async(zram, &bvec, entry, bio);
+   read_from_bdev_async(zram, &zw->bvec, entry, bio);
 }
 
 /*
@@ -798,6 +798,7 @@ static int read_from_bdev_sync(struct zram *zram, struct 
bio_vec *bvec,
 {
struct zram_work work;
 
+   work.bvec = *bvec;
work.zram = zram;
work.entry = entry;
work.bio = bio;
-- 
2.20.1



[PATCH v3 12/12] mm/hmm: convert various hmm_pfn_* to device_entry which is a better name

2019-04-03 Thread jglisse
From: Jérôme Glisse 

Convert hmm_pfn_* to device_entry_* as here we are dealing with a device
driver specific entry format and hmm provides helpers to allow different
components (including HMM) to create/parse device entries.

We keep wrappers with the old names so that we can convert drivers to the
new API in stages in each device driver tree. These will get removed once
all drivers are converted.

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: Dan Williams 
Cc: Ira Weiny 
---
 include/linux/hmm.h | 93 +++--
 mm/hmm.c| 19 +
 2 files changed, 75 insertions(+), 37 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index f81fe2c0f343..51ec27a84668 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -239,36 +239,36 @@ static inline bool hmm_range_valid(struct hmm_range 
*range)
 }
 
 /*
- * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
- * @range: range use to decode HMM pfn value
- * @pfn: HMM pfn value to get corresponding struct page from
- * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise
+ * hmm_device_entry_to_page() - return struct page pointed to by a device entry
+ * @range: range use to decode device entry value
+ * @entry: device entry value to get corresponding struct page from
+ * Returns: struct page pointer if entry is a valid, NULL otherwise
  *
- * If the HMM pfn is valid (ie valid flag set) then return the struct page
- * matching the pfn value stored in the HMM pfn. Otherwise return NULL.
+ * If the device entry is valid (ie valid flag set) then return the struct page
+ * matching the entry value. Otherwise return NULL.
  */
-static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
-  uint64_t pfn)
+static inline struct page *hmm_device_entry_to_page(const struct hmm_range 
*range,
+   uint64_t entry)
 {
-   if (pfn == range->values[HMM_PFN_NONE])
+   if (entry == range->values[HMM_PFN_NONE])
return NULL;
-   if (pfn == range->values[HMM_PFN_ERROR])
+   if (entry == range->values[HMM_PFN_ERROR])
return NULL;
-   if (pfn == range->values[HMM_PFN_SPECIAL])
+   if (entry == range->values[HMM_PFN_SPECIAL])
return NULL;
-   if (!(pfn & range->flags[HMM_PFN_VALID]))
+   if (!(entry & range->flags[HMM_PFN_VALID]))
return NULL;
-   return pfn_to_page(pfn >> range->pfn_shift);
+   return pfn_to_page(entry >> range->pfn_shift);
 }
 
 /*
- * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn
- * @range: range use to decode HMM pfn value
- * @pfn: HMM pfn value to extract pfn from
- * Returns: pfn value if HMM pfn is valid, -1UL otherwise
+ * hmm_device_entry_to_pfn() - return pfn value store in a device entry
+ * @range: range use to decode device entry value
+ * @entry: device entry to extract pfn from
+ * Returns: pfn value if device entry is valid, -1UL otherwise
  */
-static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
-  uint64_t pfn)
+static inline unsigned long
+hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
 {
if (pfn == range->values[HMM_PFN_NONE])
return -1UL;
@@ -282,31 +282,66 @@ static inline unsigned long hmm_pfn_to_pfn(const struct 
hmm_range *range,
 }
 
 /*
- * hmm_pfn_from_page() - create a valid HMM pfn value from struct page
+ * hmm_device_entry_from_page() - create a valid device entry for a page
  * @range: range use to encode HMM pfn value
- * @page: struct page pointer for which to create the HMM pfn
- * Returns: valid HMM pfn for the page
+ * @page: page for which to create the device entry
+ * Returns: valid device entry for the page
  */
-static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
-struct page *page)
+static inline uint64_t hmm_device_entry_from_page(const struct hmm_range 
*range,
+ struct page *page)
 {
return (page_to_pfn(page) << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
 }
 
 /*
- * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn
+ * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
  * @range: range use to encode HMM pfn value
- * @pfn: pfn value for which to create the HMM pfn
- * Returns: valid HMM pfn for the pfn
+ * @pfn: pfn value for which to create the device entry
+ * Returns: valid device entry for the pfn
  */
-static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
-   unsigned long pfn)
+static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
+unsigned long pfn)
 {

[PATCH v3 05/12] mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() v3

2019-04-03 Thread jglisse
From: Jérôme Glisse 

Minor optimization around hmm_pte_need_fault(). Rename for
consistency between code, comments and documentation. Also
improve the comments on all the possible return values.
Improve the function by returning the number of populated
entries in the pfns array.
Changes since v2:
- updated commit message
Changes since v1:
- updated documentation
- reformated some comments

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
---
 Documentation/vm/hmm.rst |  8 +---
 include/linux/hmm.h  | 13 +-
 mm/hmm.c | 91 +---
 3 files changed, 52 insertions(+), 60 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index d9b27bdadd1b..61f073215a8d 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -190,13 +190,7 @@ When the device driver wants to populate a range of 
virtual addresses, it can
 use either::
 
   long hmm_range_snapshot(struct hmm_range *range);
-  int hmm_vma_fault(struct vm_area_struct *vma,
-struct hmm_range *range,
-unsigned long start,
-unsigned long end,
-hmm_pfn_t *pfns,
-bool write,
-bool block);
+  long hmm_range_fault(struct hmm_range *range, bool block);
 
 The first one (hmm_range_snapshot()) will only fetch present CPU page table
 entries and will not trigger a page fault on missing or non-present entries.
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 32206b0b1bfd..e9afd23c2eac 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -391,7 +391,18 @@ bool hmm_vma_range_done(struct hmm_range *range);
  *
  * See the function description in mm/hmm.c for further documentation.
  */
-int hmm_vma_fault(struct hmm_range *range, bool block);
+long hmm_range_fault(struct hmm_range *range, bool block);
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+{
+   long ret = hmm_range_fault(range, block);
+   if (ret == -EBUSY)
+   ret = -EAGAIN;
+   else if (ret == -EAGAIN)
+   ret = -EBUSY;
+   return ret < 0 ? ret : 0;
+}
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
 void hmm_mm_destroy(struct mm_struct *mm);
diff --git a/mm/hmm.c b/mm/hmm.c
index bd957a9f10d1..b7e4034d96e1 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -340,13 +340,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, 
unsigned long addr,
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
ret = handle_mm_fault(vma, addr, flags);
if (ret & VM_FAULT_RETRY)
-   return -EBUSY;
+   return -EAGAIN;
if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
 
-   return -EAGAIN;
+   return -EBUSY;
 }
 
 static int hmm_pfns_bad(unsigned long addr,
@@ -372,7 +372,7 @@ static int hmm_pfns_bad(unsigned long addr,
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
  * @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Returns: 0 on success, -EBUSY after page fault, or page fault error
  *
  * This function will be called whenever pmd_none() or pte_none() returns true,
  * or whenever there is no page directory covering the virtual address range.
@@ -395,12 +395,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, 
unsigned long end,
 
ret = hmm_vma_do_fault(walk, addr, write_fault,
				       &pfns[i]);
-   if (ret != -EAGAIN)
+   if (ret != -EBUSY)
return ret;
}
}
 
-   return (fault || write_fault) ? -EAGAIN : 0;
+   return (fault || write_fault) ? -EBUSY : 0;
 }
 
 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -531,11 +531,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
uint64_t orig_pfn = *pfn;
 
*pfn = range->values[HMM_PFN_NONE];
-   cpu_flags = pte_to_hmm_pfn_flags(range, pte);
-   hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-			   &fault, &write_fault);
+   fault = write_fault = false;
 
if (pte_none(pte)) {
+   hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+				   &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -574,7 +574,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
hmm_vma_walk->last = addr;
migration_entry_wait(vma->vm_mm,
 

[PATCH v3 02/12] mm/hmm: use reference counting for HMM struct v3

2019-04-03 Thread jglisse
From: Jérôme Glisse 

Every time I read the code to check that the HMM structure does not
vanish before it should, thanks to the many locks protecting its removal,
I get a headache. Switch to reference counting instead: it is much
easier to follow and harder to break. This also removes some code that
is no longer needed with refcounting.

Changes since v2:
- Renamed hmm_register() to hmm_get_or_create() updated comments
  accordingly
Changes since v1:
- removed bunch of useless check (if API is use with bogus argument
  better to fail loudly so user fix their code)
- s/hmm_get/mm_get_hmm/

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: John Hubbard 
Cc: Andrew Morton 
Cc: Dan Williams 
---
 include/linux/hmm.h |   2 +
 mm/hmm.c| 190 
 2 files changed, 124 insertions(+), 68 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ad50b7b4f141..716fc61fa6d4 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -131,6 +131,7 @@ enum hmm_pfn_value_e {
 /*
  * struct hmm_range - track invalidation lock on virtual address range
  *
+ * @hmm: the core HMM structure this range is active against
  * @vma: the vm area struct for the range
  * @list: all range lock are on a list
  * @start: range virtual start address (inclusive)
@@ -142,6 +143,7 @@ enum hmm_pfn_value_e {
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
 struct hmm_range {
+   struct hmm  *hmm;
struct vm_area_struct   *vma;
struct list_headlist;
unsigned long   start;
diff --git a/mm/hmm.c b/mm/hmm.c
index fe1cd87e49ac..919d78fd21c5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -50,6 +50,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  */
 struct hmm {
struct mm_struct*mm;
+   struct kref kref;
spinlock_t  lock;
struct list_headranges;
struct list_headmirrors;
@@ -57,24 +58,33 @@ struct hmm {
struct rw_semaphore mirrors_sem;
 };
 
-/*
- * hmm_register - register HMM against an mm (HMM internal)
+static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
+{
+   struct hmm *hmm = READ_ONCE(mm->hmm);
+
+   if (hmm && kref_get_unless_zero(&hmm->kref))
+   return hmm;
+
+   return NULL;
+}
+
+/**
+ * hmm_get_or_create - register HMM against an mm (HMM internal)
  *
  * @mm: mm struct to attach to
+ * Returns: returns an HMM object, either by referencing the existing
+ *  (per-process) object, or by creating a new one.
  *
- * This is not intended to be used directly by device drivers. It allocates an
- * HMM struct if mm does not have one, and initializes it.
+ * This is not intended to be used directly by device drivers. If mm already
+ * has an HMM struct then it get a reference on it and returns it. Otherwise
+ * it allocates an HMM struct, initializes it, associate it with the mm and
+ * returns it.
  */
-static struct hmm *hmm_register(struct mm_struct *mm)
+static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 {
-   struct hmm *hmm = READ_ONCE(mm->hmm);
+   struct hmm *hmm = mm_get_hmm(mm);
bool cleanup = false;
 
-   /*
-* The hmm struct can only be freed once the mm_struct goes away,
-* hence we should always have pre-allocated an new hmm struct
-* above.
-*/
if (hmm)
return hmm;
 
@@ -86,6 +96,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
+   kref_init(&hmm->kref);
hmm->mm = mm;
 
	spin_lock(&mm->page_table_lock);
@@ -106,7 +117,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
goto error_mm;
 
-   return mm->hmm;
+   return hmm;
 
 error_mm:
	spin_lock(&mm->page_table_lock);
@@ -118,9 +129,41 @@ static struct hmm *hmm_register(struct mm_struct *mm)
return NULL;
 }
 
+static void hmm_free(struct kref *kref)
+{
+   struct hmm *hmm = container_of(kref, struct hmm, kref);
+   struct mm_struct *mm = hmm->mm;
+
+   mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
+
+   spin_lock(&mm->page_table_lock);
+   if (mm->hmm == hmm)
+   mm->hmm = NULL;
+   spin_unlock(&mm->page_table_lock);
+
+   kfree(hmm);
+}
+
+static inline void hmm_put(struct hmm *hmm)
+{
+   kref_put(&hmm->kref, hmm_free);
+}
+
 void hmm_mm_destroy(struct mm_struct *mm)
 {
-   kfree(mm->hmm);
+   struct hmm *hmm;
+
+   spin_lock(&mm->page_table_lock);
+   hmm = mm_get_hmm(mm);
+   mm->hmm = NULL;
+   if (hmm) {
+   hmm->mm = NULL;
+   spin_unlock(&mm->page_table_lock);
+   hmm_put(hmm);
+   return;
+   }
+
+   spin_unlock(&mm->page_table_lock);
 }
 

[PATCH v3 10/12] mm/hmm: add helpers to test if mm is still alive or not

2019-04-03 Thread jglisse
From: Jérôme Glisse 

The device driver can have a kernel thread or worker doing work against
a process mm and it is useful for those to test whether the mm is dead
or alive to avoid doing useless work. Add a helper to test that so
that drivers can bail out early if a process is dying.

Note that the helper does not perform any lock synchronization and thus
is just a hint, ie a process might be dying but the helper might still
report the process as alive. All HMM functions are safe to use in that
case as HMM internals properly protect themselves with locks. If a driver
uses this helper with non-HMM functions it should ascertain that it is
safe to do so.
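
For example, a driver worker might use the helper like this (a sketch with
hypothetical my_* types and helpers, not part of this patch):

#include <linux/hmm.h>
#include <linux/workqueue.h>

struct my_ctx {
	struct hmm_mirror mirror;
	/* ... device state ... */
};

struct my_work {
	struct work_struct work;
	struct my_ctx *ctx;
};

/* hypothetical: snapshot/fault the range and update device page tables */
static void my_do_expensive_mirroring(struct my_ctx *ctx)
{
}

static void my_worker_fn(struct work_struct *work)
{
	struct my_work *w = container_of(work, struct my_work, work);

	/*
	 * Purely an optimization: a false "alive" answer is fine, the HMM
	 * calls done by the expensive work remain safe either way.
	 */
	if (!hmm_mirror_mm_is_alive(&w->ctx->mirror))
		return;

	my_do_expensive_mirroring(w->ctx);
}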

Signed-off-by: Jérôme Glisse 
Cc: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
Cc: Ira Weiny 
---
 include/linux/hmm.h | 24 
 1 file changed, 24 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index e5834082de60..a79fcc6681f5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -438,6 +438,30 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 
+/*
+ * hmm_mirror_mm_is_alive() - test if mm is still alive
+ * @mirror: the HMM mirror for the mm we want to check
+ * Returns: false if the mm is dead, true otherwise
+ *
+ * This is an optimization; it will not always accurately report that the mm
+ * is dead, ie there can be false negatives (the process is being killed but
+ * HMM has not yet been informed of that). It is only intended to be used to
+ * optimize out cases where the driver is about to do something time consuming
+ * and it would be better to skip it if the mm is dead.
+ */
+static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
+{
+   struct mm_struct *mm;
+
+   if (!mirror || !mirror->hmm)
+   return false;
+   mm = READ_ONCE(mirror->hmm->mm);
+   if (mirror->hmm->dead || !mm)
+   return false;
+
+   return true;
+}
+
 
 /*
  * Please see Documentation/vm/hmm.rst for how to use the range API.
-- 
2.17.2



[PATCH v3 09/12] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v3

2019-04-03 Thread jglisse
From: Jérôme Glisse 

HMM mirror is a device driver helper to mirror ranges of virtual addresses.
It means that the process jobs running on the device can access the same
virtual addresses as the CPU threads of that process. This patch adds support
for mirroring mappings of files that are on a DAX block device (ie ranges of
virtual addresses that are an mmap of a file in a filesystem on a DAX block
device). There is no reason not to support such a case when mirroring virtual
addresses on a device.

Note that unlike the GUP code we do not take page references, hence when we
back off we have nothing to undo.

Changes since v2:
- Added comments about get_dev_pagemap() optimization.
Changes since v1:
- improved commit message
- squashed: Arnd Bergmann: fix unused variable warning in hmm_vma_walk_pud

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: Dan Williams 
Cc: John Hubbard 
Cc: Arnd Bergmann 
---
 mm/hmm.c | 138 ++-
 1 file changed, 117 insertions(+), 21 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 9140cee24d36..39bc77d7e6e3 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -329,6 +329,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
 
 struct hmm_vma_walk {
struct hmm_range*range;
+   struct dev_pagemap  *pgmap;
unsigned long   last;
boolfault;
boolblock;
@@ -503,6 +504,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct 
hmm_range *range, pmd_t pmd)
range->flags[HMM_PFN_VALID];
 }
 
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+   if (!pud_present(pud))
+   return 0;
+   return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+   range->flags[HMM_PFN_WRITE] :
+   range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
  unsigned long addr,
  unsigned long end,
@@ -524,8 +534,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
pfn = pmd_pfn(pmd) + pte_index(addr);
-   for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+   for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+   if (pmd_devmap(pmd)) {
+   hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+   if (unlikely(!hmm_vma_walk->pgmap))
+   return -EBUSY;
+   }
pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+   }
+   if (hmm_vma_walk->pgmap) {
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
hmm_vma_walk->last = end;
return 0;
 }
@@ -612,10 +633,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
if (fault || write_fault)
goto fault;
 
+   if (pte_devmap(pte)) {
+   hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+   if (unlikely(!hmm_vma_walk->pgmap))
+   return -EBUSY;
+   } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) 
{
+   *pfn = range->values[HMM_PFN_SPECIAL];
+   return -EFAULT;
+   }
+
*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
 
 fault:
+   if (hmm_vma_walk->pgmap) {
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -703,12 +738,89 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return r;
}
}
+   if (hmm_vma_walk->pgmap) {
+   /*
+* We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+* so that we can leverage get_dev_pagemap() optimization which
+* will not re-take a reference on a pgmap if we already have
+* one.
+*/
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
pte_unmap(ptep - 1);
 
hmm_vma_walk->last = addr;
return 0;
 }
 
+static int hmm_vma_walk_pud(pud_t *pudp,
+   unsigned long start,
+   unsigned long end,
+   struct mm_walk *walk)
+{
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   unsigned long addr = start, 

[PATCH v3 06/12] mm/hmm: improve driver API to work and wait over a range v3

2019-04-03 Thread jglisse
From: Jérôme Glisse 

A common use case for HMM mirror is a user trying to mirror a range
which, before they can program the hardware, gets invalidated by
some core mm event. Instead of having the user retry right away to
mirror the range, provide a completion mechanism for them to wait
for any active invalidation affecting the range.

This also changes how hmm_range_snapshot() and hmm_range_fault()
work by not relying on the vma, so that we can drop the mmap_sem
when waiting and look up the vma again on retry.

Changes since v2:
- Updated documentation to match new API.
- Added more comments in old API temporary wrapper.
- Consolidated documentation in hmm.rst to avoid out of sync.
Changes since v1:
- squashed: Dan Carpenter: potential deadlock in nonblocking code

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
Cc: Dan Carpenter 
Cc: Matthew Wilcox 
---
 Documentation/vm/hmm.rst |  25 +-
 include/linux/hmm.h  | 145 ---
 mm/hmm.c | 531 +++
 3 files changed, 387 insertions(+), 314 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 61f073215a8d..945d5fb6d14a 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -217,17 +217,33 @@ Locking with the update() callback is the most important 
aspect the driver must
   range.flags = ...;
   range.values = ...;
   range.pfn_shift = ...;
+  hmm_range_register();
+
+  /*
+   * Just wait for range to be valid, safe to ignore return value as we
+   * will use the return value of hmm_range_snapshot() below under the
+   * mmap_sem to ascertain the validity of the range.
+   */
+  hmm_range_wait_until_valid(, TIMEOUT_IN_MSEC);
 
  again:
   down_read(>mmap_sem);
-  range.vma = ...;
   ret = hmm_range_snapshot();
   if (ret) {
   up_read(>mmap_sem);
+  if (ret == -EAGAIN) {
+/*
+ * No need to check hmm_range_wait_until_valid() return value
+ * on retry we will get proper error with hmm_range_snapshot()
+ */
+hmm_range_wait_until_valid(, TIMEOUT_IN_MSEC);
+goto again;
+  }
+  hmm_mirror_unregister();
   return ret;
   }
   take_lock(driver->update);
-  if (!hmm_vma_range_done(vma, )) {
+  if (!range.valid) {
   release_lock(driver->update);
   up_read(>mmap_sem);
   goto again;
@@ -235,14 +251,15 @@ Locking with the update() callback is the most important 
aspect the driver must
 
   // Use pfns array content to update device page table
 
+  hmm_mirror_unregister();
   release_lock(driver->update);
   up_read(>mmap_sem);
   return 0;
  }
 
 The driver->update lock is the same lock that the driver takes inside its
-update() callback. That lock must be held before hmm_vma_range_done() to avoid
-any race with a concurrent CPU page table update.
+update() callback. That lock must be held before checking the range.valid
+field to avoid any race with a concurrent CPU page table update.
 
 HMM implements all this on top of the mmu_notifier API because we wanted a
 simpler API and also to be able to perform optimizations latter on like doing
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index e9afd23c2eac..ec4bfa91648f 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -77,8 +77,34 @@
 #include 
 #include 
 #include 
+#include 
 
-struct hmm;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @ranges: list of range being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ * @wq: wait queue for user waiting on a range invalidation
+ * @notifiers: count of active mmu notifiers
+ * @dead: is the mm dead ?
+ */
+struct hmm {
+   struct mm_struct*mm;
+   struct kref kref;
+   struct mutexlock;
+   struct list_headranges;
+   struct list_headmirrors;
+   struct mmu_notifier mmu_notifier;
+   struct rw_semaphore mirrors_sem;
+   wait_queue_head_t   wq;
+   longnotifiers;
+   booldead;
+};
 
 /*
  * hmm_pfn_flag_e - HMM flag enums
@@ -155,6 +181,38 @@ struct hmm_range {
boolvalid;
 };
 
+/*
+ * hmm_range_wait_until_valid() - wait for range to be valid
+ * @range: range affected by invalidation to wait on
+ * @timeout: time out for wait in ms (ie abort wait after that period of time)
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
+ unsigned long 

[PATCH v3 11/12] mm/hmm: add an helper function that fault pages and map them to a device v3

2019-04-03 Thread jglisse
From: Jérôme Glisse 

This is an all in one helper that faults pages in a range and maps them to
a device so that every single device driver does not have to re-implement
this common pattern.

This is taken from ODP RDMA in preparation for the ODP RDMA conversion. It
will be used by nouveau and other drivers.
Changes since v2:
- Improved function comment for kernel documentation.
Changes since v1:
- improved commit message

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: Dan Williams 
Cc: Souptick Joarder 
---
 include/linux/hmm.h |   9 +++
 mm/hmm.c| 152 
 2 files changed, 161 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index a79fcc6681f5..f81fe2c0f343 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -474,6 +474,15 @@ int hmm_range_register(struct hmm_range *range,
 void hmm_range_unregister(struct hmm_range *range);
 long hmm_range_snapshot(struct hmm_range *range);
 long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_dma_map(struct hmm_range *range,
+  struct device *device,
+  dma_addr_t *daddrs,
+  bool block);
+long hmm_range_dma_unmap(struct hmm_range *range,
+struct vm_area_struct *vma,
+struct device *device,
+dma_addr_t *daddrs,
+bool dirty);
 
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index 39bc77d7e6e3..82fded7273d8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -1173,6 +1174,157 @@ long hmm_range_fault(struct hmm_range *range, bool 
block)
return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
 }
 EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device against to dma map page to
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been
+ *  drop and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+  struct device *device,
+  dma_addr_t *daddrs,
+  bool block)
+{
+   unsigned long i, npages, mapped;
+   long ret;
+
+   ret = hmm_range_fault(range, block);
+   if (ret <= 0)
+   return ret ? ret : -EBUSY;
+
+   npages = (range->end - range->start) >> PAGE_SHIFT;
+   for (i = 0, mapped = 0; i < npages; ++i) {
+   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   struct page *page;
+
+   /*
+* FIXME need to update DMA API to provide invalid DMA address
+* value instead of a function to test dma address value. This
+* would remove lot of dumb code duplicated accross many arch.
+*
+* For now setting it to 0 here is good enough as the pfns[]
+* value is what is use to check what is valid and what isn't.
+*/
+   daddrs[i] = 0;
+
+   page = hmm_pfn_to_page(range, range->pfns[i]);
+   if (page == NULL)
+   continue;
+
+   /* Check if range is being invalidated */
+   if (!range->valid) {
+   ret = -EBUSY;
+   goto unmap;
+   }
+
+   /* If it is read and write than map bi-directional. */
+   if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+   dir = DMA_BIDIRECTIONAL;
+
+   daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+   if (dma_mapping_error(device, daddrs[i])) {
+   ret = -EFAULT;
+   goto unmap;
+   }
+
+   mapped++;
+   }
+
+   return mapped;
+
+unmap:
+   for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   struct page *page;
+
+   page = hmm_pfn_to_page(range, range->pfns[i]);
+   if (page == NULL)
+   continue;
+
+   if (dma_mapping_error(device, daddrs[i]))
+   continue;
+
+   /* If it is read and write than map bi-directional. */
+   if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+   dir = DMA_BIDIRECTIONAL;
+
+   dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+   mapped--;
+   }
+
+   return ret;

[PATCH v3 03/12] mm/hmm: do not erase snapshot when a range is invalidated

2019-04-03 Thread jglisse
From: Jérôme Glisse 

Users of HMM might be using the snapshot information to do preparatory
steps like DMA mapping pages to a device before checking for
invalidation through hmm_vma_range_done(), so do not erase that
information and assume users will do the right thing.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Reviewed-by: John Hubbard 
Cc: Andrew Morton 
Cc: Dan Williams 
---
 mm/hmm.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 919d78fd21c5..84e0577a912a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -174,16 +174,10 @@ static int hmm_invalidate_range(struct hmm *hmm, bool 
device,
 
spin_lock(>lock);
list_for_each_entry(range, >ranges, list) {
-   unsigned long addr, idx, npages;
-
if (update->end < range->start || update->start >= range->end)
continue;
 
range->valid = false;
-   addr = max(update->start, range->start);
-   idx = (addr - range->start) >> PAGE_SHIFT;
-   npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
-   memset(>pfns[idx], 0, sizeof(*range->pfns) * npages);
}
spin_unlock(>lock);
 
-- 
2.17.2



[PATCH v3 04/12] mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() v2

2019-04-03 Thread jglisse
From: Jérôme Glisse 

Rename for consistency between code, comments and documentation. Also
improve the comments on all the possible return values. Improve the
function by returning the number of populated entries in the pfns array.
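
A minimal caller-side sketch of the new return convention (illustrative
only, assuming a fully initialized range):

  long ret = hmm_range_snapshot(&range);
  if (ret < 0)
          return ret; /* -EINVAL, -EPERM, -EAGAIN, -EFAULT, ... */
  /* ret pages of range.pfns[] are valid, counted from range.start */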

Changes since v1:
- updated documentation
- reformatted some comments

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Reviewed-by: John Hubbard 
Reviewed-by: Ira Weiny 
Cc: Andrew Morton 
Cc: Dan Williams 
---
 Documentation/vm/hmm.rst | 26 ++
 include/linux/hmm.h  |  4 ++--
 mm/hmm.c | 31 +--
 3 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 44205f0b671f..d9b27bdadd1b 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -189,11 +189,7 @@ the driver callback returns.
 When the device driver wants to populate a range of virtual addresses, it can
 use either::
 
-  int hmm_vma_get_pfns(struct vm_area_struct *vma,
-  struct hmm_range *range,
-  unsigned long start,
-  unsigned long end,
-  hmm_pfn_t *pfns);
+  long hmm_range_snapshot(struct hmm_range *range);
   int hmm_vma_fault(struct vm_area_struct *vma,
 struct hmm_range *range,
 unsigned long start,
@@ -202,7 +198,7 @@ When the device driver wants to populate a range of virtual 
addresses, it can
 bool write,
 bool block);
 
-The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+The first one (hmm_range_snapshot()) will only fetch present CPU page table
 entries and will not trigger a page fault on missing or non-present entries.
 The second one does trigger a page fault on missing or read-only entry if the
 write parameter is true. Page faults use the generic mm page fault code path
@@ -220,19 +216,33 @@ Locking with the update() callback is the most important 
aspect the driver must
  {
   struct hmm_range range;
   ...
+
+  range.start = ...;
+  range.end = ...;
+  range.pfns = ...;
+  range.flags = ...;
+  range.values = ...;
+  range.pfn_shift = ...;
+
  again:
-  ret = hmm_vma_get_pfns(vma, , start, end, pfns);
-  if (ret)
+  down_read(>mmap_sem);
+  range.vma = ...;
+  ret = hmm_range_snapshot();
+  if (ret) {
+  up_read(>mmap_sem);
   return ret;
+  }
   take_lock(driver->update);
   if (!hmm_vma_range_done(vma, )) {
   release_lock(driver->update);
+  up_read(>mmap_sem);
   goto again;
   }
 
   // Use pfns array content to update device page table
 
   release_lock(driver->update);
+  up_read(>mmap_sem);
   return 0;
  }
 
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 716fc61fa6d4..32206b0b1bfd 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -365,11 +365,11 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  * table invalidation serializes on it.
  *
  * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
- * hmm_vma_get_pfns() WITHOUT ERROR !
+ * hmm_range_snapshot() WITHOUT ERROR !
  *
  * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
  */
-int hmm_vma_get_pfns(struct hmm_range *range);
+long hmm_range_snapshot(struct hmm_range *range);
 bool hmm_vma_range_done(struct hmm_range *range);
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 84e0577a912a..bd957a9f10d1 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -702,23 +702,25 @@ static void hmm_pfns_special(struct hmm_range *range)
 }
 
 /*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual 
addresses
- * @range: range being snapshotted
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- *  vma permission, 0 success
+ * hmm_range_snapshot() - snapshot CPU page table for a range
+ * @range: range
+ * Returns: number of valid pages in range->pfns[] (from range start
+ *  address). This may be zero. If the return value is negative,
+ *  then one of the following values may be returned:
+ *
+ *   -EINVAL  invalid arguments or mm or virtual address are in an
+ *invalid vma (ie either hugetlbfs or device file vma).
+ *   -EPERM   For example, asking for write, when the range is
+ *read-only
+ *   -EAGAIN  Caller needs to retry
+ *   -EFAULT  Either no valid vma exists for this range, or it is
+ *illegal to access the range
  *
  * This snapshots the CPU page table for a range of virtual addresses. Snapshot
  * validity is tracked by range struct. See hmm_vma_range_done() for further
  * information.
- *
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must 

[PATCH v3 08/12] mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) v3

2019-04-03 Thread jglisse
From: Jérôme Glisse 

HMM mirror is a device driver helper to mirror a range of virtual
addresses. It means that the process jobs running on the device can
access the same virtual addresses as the CPU threads of that process.
This patch adds support for hugetlbfs mappings (ie ranges of virtual
addresses that are mmaps of a hugetlbfs file).
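
As an illustration of the new page_shift argument (a sketch, not part of
the patch), a driver mirroring a hugetlbfs vma could register the range
at the vma's huge page size:

  /* Hedged sketch: register a range at the hugetlbfs page size. */
  unsigned page_shift = huge_page_shift(hstate_vma(vma));
  int ret;

  ret = hmm_range_register(&range, mm, start, end, page_shift);
  if (ret)
          return ret;
  /* range.pfns[] now holds one entry per huge page, not per PAGE_SIZE. */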

Changes since v2:
- Use hmm_range_page_size() where we can.
Changes since v1:
- improved commit message
- squashed: Arnd Bergmann: fix unused variable warnings

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Reviewed-by: Ira Weiny 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
Cc: Arnd Bergmann 
---
 include/linux/hmm.h |  27 +-
 mm/hmm.c| 123 +++-
 2 files changed, 134 insertions(+), 16 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index dee2f8953b2e..e5834082de60 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -181,10 +181,31 @@ struct hmm_range {
const uint64_t  *values;
uint64_tdefault_flags;
uint64_tpfn_flags_mask;
+   uint8_t page_shift;
uint8_t pfn_shift;
boolvalid;
 };
 
+/*
+ * hmm_range_page_shift() - return the page shift for the range
+ * @range: range being queried
+ * Returns: page shift (page size = 1 << page shift) for the range
+ */
+static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
+{
+   return range->page_shift;
+}
+
+/*
+ * hmm_range_page_size() - return the page size for the range
+ * @range: range being queried
+ * Returns: page size for the range in bytes
+ */
+static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
+{
+   return 1UL << hmm_range_page_shift(range);
+}
+
 /*
  * hmm_range_wait_until_valid() - wait for range to be valid
  * @range: range affected by invalidation to wait on
@@ -424,7 +445,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
 int hmm_range_register(struct hmm_range *range,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end);
+  unsigned long end,
+  unsigned page_shift);
 void hmm_range_unregister(struct hmm_range *range);
 long hmm_range_snapshot(struct hmm_range *range);
 long hmm_range_fault(struct hmm_range *range, bool block);
@@ -462,7 +484,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, 
bool block)
range->pfn_flags_mask = -1UL;
 
ret = hmm_range_register(range, range->vma->vm_mm,
-range->start, range->end);
+range->start, range->end,
+PAGE_SHIFT);
if (ret)
return (int)ret;
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 0e21d3594ab6..9140cee24d36 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -391,11 +391,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, 
unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
-   unsigned long i;
+   unsigned long i, page_size;
 
hmm_vma_walk->last = addr;
-   i = (addr - range->start) >> PAGE_SHIFT;
-   for (; addr < end; addr += PAGE_SIZE, i++) {
+   page_size = hmm_range_page_size(range);
+   i = (addr - range->start) >> range->page_shift;
+
+   for (; addr < end; addr += page_size, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
@@ -707,6 +709,69 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return 0;
 }
 
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+   unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   struct vm_area_struct *vma = walk->vma;
+   struct hstate *h = hstate_vma(vma);
+   uint64_t orig_pfn, cpu_flags;
+   bool fault, write_fault;
+   spinlock_t *ptl;
+   pte_t entry;
+   int ret = 0;
+
+   size = 1UL << huge_page_shift(h);
+   mask = size - 1;
+   if (range->page_shift != PAGE_SHIFT) {
+   /* Make sure we are looking at full page. */
+   if (start & mask)
+   return -EINVAL;
+   if (end < (start + size))
+   return -EINVAL;
+   pfn_inc = size >> PAGE_SHIFT;
+   } else {
+   pfn_inc = 1;
+   size = PAGE_SIZE;
+   }
+
+
+   ptl = huge_pte_lock(hstate_vma(walk->vma), 

[PATCH v3 07/12] mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays v2

2019-04-03 Thread jglisse
From: Jérôme Glisse 

The HMM mirror API can be used in two fashions. In the first one the HMM
user coalesces multiple page faults into one request and sets flags per
pfn for each of those faults. In the second one the HMM user wants to
pre-fault a range with specific flags. For the latter it is a waste to
have the user pre-fill the pfn array with a default flags value.

This patch adds a default flags value allowing the user to set it for a
whole range without having to pre-fill the pfn array.
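
For illustration only (the flag bits below are the hypothetical device
flags used in the documentation hunk of this patch), pre-faulting a
whole range with at least read permission then becomes:

  #define MYDEV_PFN_VALID (1ULL << 63) /* hypothetical device flag */
  #define MYDEV_PFN_WRITE (1ULL << 62) /* hypothetical device flag */

  /* Ask for read (valid) on every page without pre-filling pfns[]. */
  range->default_flags = MYDEV_PFN_VALID;
  range->pfn_flags_mask = 0;
  ret = hmm_range_fault(range, true);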

Changes since v1:
- Added documentation.
- Added comments in the old API wrapper to explain what is going on.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
---
 Documentation/vm/hmm.rst | 35 +++
 include/linux/hmm.h  | 13 +
 mm/hmm.c | 12 
 3 files changed, 60 insertions(+)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 945d5fb6d14a..ec1efa32af3c 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -276,6 +276,41 @@ report commands as executed is serialized (there is no 
point in doing this
 concurrently).
 
 
+Leverage default_flags and pfn_flags_mask
+=
+
+The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows
+to set fault or snapshot policy for a whole range instead of having to set them
+for each entries in the range.
+
+For instance if the device flags for device entries are:
+VALID (1 << 63)
+WRITE (1 << 62)
+
+Now let say that device driver wants to fault with at least read a range then
+it does set:
+range->default_flags = (1 << 63)
+range->pfn_flags_mask = 0;
+
+and calls hmm_range_fault() as described above. This will fill fault all page
+in the range with at least read permission.
+
+Now let say driver wants to do the same except for one page in the range for
+which its want to have write. Now driver set:
+range->default_flags = (1 << 63);
+range->pfn_flags_mask = (1 << 62);
+range->pfns[index_of_write] = (1 << 62);
+
+With this HMM will fault in all page with at least read (ie valid) and for the
+address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
+write permission ie if the CPU pte does not have write permission set then HMM
+will call handle_mm_fault().
+
+Note that HMM will populate the pfns array with write permission for any entry
+that have write permission within the CPU pte no matter what are the values set
+in default_flags or pfn_flags_mask.
+
+
 Represent and manage device memory from core kernel point of view
 =
 
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ec4bfa91648f..dee2f8953b2e 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -165,6 +165,8 @@ enum hmm_pfn_value_e {
  * @pfns: array of pfns (big enough for the range)
  * @flags: pfn flags to match device driver page table
  * @values: pfn value for some special case (none, special, error, ...)
+ * @default_flags: default flags for the range (write, read, ... see hmm doc)
+ * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
@@ -177,6 +179,8 @@ struct hmm_range {
uint64_t*pfns;
const uint64_t  *flags;
const uint64_t  *values;
+   uint64_tdefault_flags;
+   uint64_tpfn_flags_mask;
uint8_t pfn_shift;
boolvalid;
 };
@@ -448,6 +452,15 @@ static inline int hmm_vma_fault(struct hmm_range *range, 
bool block)
 {
long ret;
 
+   /*
+* With the old API the driver must set each individual entries with
+* the requested flags (valid, write, ...). So here we set the mask to
+* keep intact the entries provided by the driver and zero out the
+* default_flags.
+*/
+   range->default_flags = 0;
+   range->pfn_flags_mask = -1UL;
+
ret = hmm_range_register(range, range->vma->vm_mm,
 range->start, range->end);
if (ret)
diff --git a/mm/hmm.c b/mm/hmm.c
index 3e07f32b94f8..0e21d3594ab6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -419,6 +419,18 @@ static inline void hmm_pte_need_fault(const struct 
hmm_vma_walk *hmm_vma_walk,
if (!hmm_vma_walk->fault)
return;
 
+   /*
+* So we not only consider the individual per page request we also
+* consider the default flags requested for the range. The API can
+* be use in 2 fashions. The first one where the HMM user coalesce
+* multiple page fault into one request and set flags per pfns for
+* of those faults. The second one where the HMM user 

[PATCH v3 00/12] Improve HMM driver API v3

2019-04-03 Thread jglisse
From: Jérôme Glisse 

Changes since v2:
- Improved the documentation
- Added more comments in the code to explain things
- Renamed a bunch of functions based on popular demand


This patchset improves the HMM driver API and adds support for mirroring
virtual addresses that are mmaps of hugetlbfs or of a file in a filesystem
on a DAX block device. You can find a tree with all the patches at [1].

This patchset is necessary for converting ODP to HMM and a patch to do so
has been posted [2]. All new functions introduced by this patchset are used
by the ODP patch. The ODP patch will be pushed through the RDMA tree the
release after this patchset is merged.

Moreover all HMM functions are used by the nouveau driver starting in 5.1.

The last patch in the series adds helpers to directly dma map/unmap pages
for virtual addresses that are mirrored on behalf of a device driver. This
has been extracted from ODP code as it is a common pattern across HMM
device drivers. It will first be used by the ODP RDMA code and will later
be used by nouveau and other drivers that are working on including HMM
support.

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-for-5.2.v3
[2] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm
[3] https://lkml.org/lkml/2019/1/29/1008

Cc: Balbir Singh 
Cc: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
Cc: Ira Weiny 

Jérôme Glisse (12):
  mm/hmm: select mmu notifier when selecting HMM v2
  mm/hmm: use reference counting for HMM struct v3
  mm/hmm: do not erase snapshot when a range is invalidated
  mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()
v2
  mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() v3
  mm/hmm: improve driver API to work and wait over a range v3
  mm/hmm: add default fault flags to avoid the need to pre-fill pfns
arrays v2
  mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) v3
  mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v3
  mm/hmm: add helpers to test if mm is still alive or not
  mm/hmm: add an helper function that fault pages and map them to a
device v3
  mm/hmm: convert various hmm_pfn_* to device_entry which is a better
name

 Documentation/vm/hmm.rst |   94 +++-
 include/linux/hmm.h  |  310 ---
 mm/Kconfig   |2 +-
 mm/hmm.c | 1077 ++
 4 files changed, 1054 insertions(+), 429 deletions(-)

-- 
2.17.2



[PATCH v3 01/12] mm/hmm: select mmu notifier when selecting HMM v2

2019-04-03 Thread jglisse
From: Jérôme Glisse 

To avoid random config build issues, select the mmu notifier when HMM is
selected. In any case, when HMM gets selected it will be by users that
also want the mmu notifier.

Changes since v1:
- remove select MMU_NOTIFIER from HMM_MIRROR as it selects HMM
  which now selects MMU_NOTIFIER

Signed-off-by: Jérôme Glisse 
Acked-by: Balbir Singh 
Cc: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..2e6d24d783f7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -694,12 +694,12 @@ config DEV_PAGEMAP_OPS
 
 config HMM
bool
+   select MMU_NOTIFIER
select MIGRATE_VMA_HELPER
 
 config HMM_MIRROR
bool "HMM mirror CPU page table into a device page table"
depends on ARCH_HAS_HMM
-   select MMU_NOTIFIER
select HMM
help
  Select HMM_MIRROR if you want to mirror range of the CPU page table 
of a
-- 
2.17.2



[PATCH v2 10/11] mm/hmm: add helpers for driver to safely take the mmap_sem v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

The device driver context which holds a reference to the mirror, and
thus to the core hmm struct, might outlive the mm against which it was
created. To avoid having every driver check for that case, provide a
helper that checks if the mm is still alive and takes the mmap_sem in
read mode if so. If the mm has been destroyed (the mmu_notifier release
callback did happen) then we return -EINVAL so that calling code knows
it is trying to do something against an mm that is no longer valid.
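
A minimal caller sketch (illustrative only, the mydev_* name is
hypothetical):

  static int mydev_mirror_range(struct hmm_mirror *mirror,
                                struct hmm_range *range)
  {
          int ret;

          ret = hmm_mirror_mm_down_read(mirror);
          if (ret)
                  return ret; /* mm is dead, nothing left to mirror */

          /* ... hmm_range_snapshot()/hmm_range_fault() goes here ... */

          hmm_mirror_mm_up_read(mirror);
          return 0;
  }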

Changes since v1:
- removed a bunch of useless checks (if the API is used with bogus
  arguments it is better to fail loudly so users fix their code)

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
---
 include/linux/hmm.h | 50 ++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index f3b919b04eda..5f9deaeb9d77 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -438,6 +438,50 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 
+/*
+ * hmm_mirror_mm_down_read() - lock the mmap_sem in read mode
+ * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
+ * Returns: -EINVAL if the mm is dead, 0 otherwise (lock taken).
+ *
+ * The device driver context which holds reference to mirror and thus to core
+ * hmm struct might outlive the mm against which it was created. To avoid every
+ * driver to check for that case provide an helper that check if mm is still
+ * alive and take the mmap_sem in read mode if so. If the mm have been destroy
+ * (mmu_notifier release call back did happen) then we return -EINVAL so that
+ * calling code knows that it is trying to do something against a mm that is
+ * no longer valid.
+ */
+static inline int hmm_mirror_mm_down_read(struct hmm_mirror *mirror)
+{
+   struct mm_struct *mm;
+
+   /* Sanity check ... */
+   if (!mirror || !mirror->hmm)
+   return -EINVAL;
+   /*
+* Before trying to take the mmap_sem make sure the mm is still
+* alive as device driver context might outlive the mm lifetime.
+*
+* FIXME: should we also check for mm that outlive its owning
+* task ?
+*/
+   mm = READ_ONCE(mirror->hmm->mm);
+   if (mirror->hmm->dead || !mm)
+   return -EINVAL;
+
+   down_read(>mmap_sem);
+   return 0;
+}
+
+/*
+ * hmm_mirror_mm_up_read() - unlock the mmap_sem from read mode
+ * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
+ */
+static inline void hmm_mirror_mm_up_read(struct hmm_mirror *mirror)
+{
+   up_read(>hmm->mm->mmap_sem);
+}
+
 
 /*
  * To snapshot the CPU page table you first have to call hmm_range_register()
@@ -463,7 +507,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  *  if (ret)
  *  return ret;
  *
- *  down_read(mm->mmap_sem);
+ *  hmm_mirror_mm_down_read(mirror);
  *  again:
  *
  *  if (!hmm_range_wait_until_valid(, TIMEOUT)) {
@@ -476,13 +520,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  *
  *  ret = hmm_range_snapshot(); or hmm_range_fault();
  *  if (ret == -EAGAIN) {
- *  down_read(mm->mmap_sem);
+ *  hmm_mirror_mm_down_read(mirror);
  *  goto again;
  *  } else if (ret == -EBUSY) {
  *  goto again;
  *  }
  *
- *  up_read(>mmap_sem);
+ *  hmm_mirror_mm_up_read(mirror);
  *  if (ret) {
  *  hmm_range_unregister(range);
  *  return ret;
-- 
2.17.2



[PATCH v2 11/11] mm/hmm: add an helper function that fault pages and map them to a device v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

This is an all-in-one helper that faults pages in a range and maps them
to a device so that every single device driver does not have to
re-implement this common pattern.

This is taken from ODP RDMA in preparation for the ODP RDMA conversion.
It will be used by nouveau and other drivers.
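
For completeness, a hedged sketch of the matching teardown path; the
return convention of hmm_range_dma_unmap() is assumed here to mirror
hmm_range_dma_map() (page count on success, negative errno on error):

  /* Hedged sketch: undo hmm_range_dma_map() before dropping the range. */
  ret = hmm_range_dma_unmap(range, vma, dev, daddrs,
                            true /* dirty: device may have written */);
  if (ret < 0)
          dev_err(dev, "failed to unmap range: %ld\n", ret);
  hmm_range_unregister(range);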

Changes since v1:
- improved commit message

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: Dan Williams 
---
 include/linux/hmm.h |   9 +++
 mm/hmm.c| 152 
 2 files changed, 161 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 5f9deaeb9d77..7aadf18b29cb 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -568,6 +568,15 @@ int hmm_range_register(struct hmm_range *range,
 void hmm_range_unregister(struct hmm_range *range);
 long hmm_range_snapshot(struct hmm_range *range);
 long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_dma_map(struct hmm_range *range,
+  struct device *device,
+  dma_addr_t *daddrs,
+  bool block);
+long hmm_range_dma_unmap(struct hmm_range *range,
+struct vm_area_struct *vma,
+struct device *device,
+dma_addr_t *daddrs,
+bool dirty);
 
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index ce33151c6832..fd143251b157 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -1163,6 +1164,157 @@ long hmm_range_fault(struct hmm_range *range, bool 
block)
return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
 }
 EXPORT_SYMBOL(hmm_range_fault);
+
+/*
+ * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device against to dma map page to
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been
+ *  drop and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+  struct device *device,
+  dma_addr_t *daddrs,
+  bool block)
+{
+   unsigned long i, npages, mapped;
+   long ret;
+
+   ret = hmm_range_fault(range, block);
+   if (ret <= 0)
+   return ret ? ret : -EBUSY;
+
+   npages = (range->end - range->start) >> PAGE_SHIFT;
+   for (i = 0, mapped = 0; i < npages; ++i) {
+   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   struct page *page;
+
+   /*
+* FIXME need to update DMA API to provide invalid DMA address
+* value instead of a function to test dma address value. This
+* would remove lot of dumb code duplicated accross many arch.
+*
+* For now setting it to 0 here is good enough as the pfns[]
+* value is what is use to check what is valid and what isn't.
+*/
+   daddrs[i] = 0;
+
+   page = hmm_pfn_to_page(range, range->pfns[i]);
+   if (page == NULL)
+   continue;
+
+   /* Check if range is being invalidated */
+   if (!range->valid) {
+   ret = -EBUSY;
+   goto unmap;
+   }
+
+   /* If it is read and write than map bi-directional. */
+   if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+   dir = DMA_BIDIRECTIONAL;
+
+   daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+   if (dma_mapping_error(device, daddrs[i])) {
+   ret = -EFAULT;
+   goto unmap;
+   }
+
+   mapped++;
+   }
+
+   return mapped;
+
+unmap:
+   for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   struct page *page;
+
+   page = hmm_pfn_to_page(range, range->pfns[i]);
+   if (page == NULL)
+   continue;
+
+   if (dma_mapping_error(device, daddrs[i]))
+   continue;
+
+   /* If it is read and write than map bi-directional. */
+   if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+   dir = DMA_BIDIRECTIONAL;
+
+   dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+   mapped--;
+   }
+
+   return ret;
+}
+EXPORT_SYMBOL(hmm_range_dma_map);
+
+/*
+ * hmm_range_dma_unmap() - unmap range of that was 

[PATCH v2 05/11] mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

Rename for consistency between code, comments and documentation. Also
improve the comments on all the possible return values. Improve the
function by returning the number of populated entries in the pfns array.

Changes since v1:
- updated documentation
- reformatted some comments

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
---
 Documentation/vm/hmm.rst |  8 +---
 include/linux/hmm.h  | 13 +-
 mm/hmm.c | 91 +---
 3 files changed, 52 insertions(+), 60 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index d9b27bdadd1b..61f073215a8d 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -190,13 +190,7 @@ When the device driver wants to populate a range of 
virtual addresses, it can
 use either::
 
   long hmm_range_snapshot(struct hmm_range *range);
-  int hmm_vma_fault(struct vm_area_struct *vma,
-struct hmm_range *range,
-unsigned long start,
-unsigned long end,
-hmm_pfn_t *pfns,
-bool write,
-bool block);
+  long hmm_range_fault(struct hmm_range *range, bool block);
 
 The first one (hmm_range_snapshot()) will only fetch present CPU page table
 entries and will not trigger a page fault on missing or non-present entries.
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 32206b0b1bfd..e9afd23c2eac 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -391,7 +391,18 @@ bool hmm_vma_range_done(struct hmm_range *range);
  *
  * See the function description in mm/hmm.c for further documentation.
  */
-int hmm_vma_fault(struct hmm_range *range, bool block);
+long hmm_range_fault(struct hmm_range *range, bool block);
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+{
+   long ret = hmm_range_fault(range, block);
+   if (ret == -EBUSY)
+   ret = -EAGAIN;
+   else if (ret == -EAGAIN)
+   ret = -EBUSY;
+   return ret < 0 ? ret : 0;
+}
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
 void hmm_mm_destroy(struct mm_struct *mm);
diff --git a/mm/hmm.c b/mm/hmm.c
index 91361aa74b8b..7860e63c3ba7 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -336,13 +336,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, 
unsigned long addr,
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
ret = handle_mm_fault(vma, addr, flags);
if (ret & VM_FAULT_RETRY)
-   return -EBUSY;
+   return -EAGAIN;
if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
 
-   return -EAGAIN;
+   return -EBUSY;
 }
 
 static int hmm_pfns_bad(unsigned long addr,
@@ -368,7 +368,7 @@ static int hmm_pfns_bad(unsigned long addr,
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
  * @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Returns: 0 on success, -EBUSY after page fault, or page fault error
  *
  * This function will be called whenever pmd_none() or pte_none() returns true,
  * or whenever there is no page directory covering the virtual address range.
@@ -391,12 +391,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, 
unsigned long end,
 
ret = hmm_vma_do_fault(walk, addr, write_fault,
   [i]);
-   if (ret != -EAGAIN)
+   if (ret != -EBUSY)
return ret;
}
}
 
-   return (fault || write_fault) ? -EAGAIN : 0;
+   return (fault || write_fault) ? -EBUSY : 0;
 }
 
 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -527,11 +527,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
uint64_t orig_pfn = *pfn;
 
*pfn = range->values[HMM_PFN_NONE];
-   cpu_flags = pte_to_hmm_pfn_flags(range, pte);
-   hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-  , _fault);
+   fault = write_fault = false;
 
if (pte_none(pte)) {
+   hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+  , _fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -570,7 +570,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
hmm_vma_walk->last = addr;
migration_entry_wait(vma->vm_mm,
 pmdp, addr);
-   return -EAGAIN;
+   

[PATCH v2 07/11] mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays.

2019-03-25 Thread jglisse
From: Jérôme Glisse 

The HMM mirror API can be used in two fashions. In the first one the HMM
user coalesces multiple page faults into one request and sets flags per
pfn for each of those faults. In the second one the HMM user wants to
pre-fault a range with specific flags. For the latter it is a waste to
have the user pre-fill the pfn array with a default flags value.

This patch adds a default flags value allowing the user to set it for a
whole range without having to pre-fill the pfn array.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
---
 include/linux/hmm.h |  7 +++
 mm/hmm.c| 12 
 2 files changed, 19 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 79671036cb5f..13bc2c72f791 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -165,6 +165,8 @@ enum hmm_pfn_value_e {
  * @pfns: array of pfns (big enough for the range)
  * @flags: pfn flags to match device driver page table
  * @values: pfn value for some special case (none, special, error, ...)
+ * @default_flags: default flags for the range (write, read, ...)
+ * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
@@ -177,6 +179,8 @@ struct hmm_range {
uint64_t*pfns;
const uint64_t  *flags;
const uint64_t  *values;
+   uint64_tdefault_flags;
+   uint64_tpfn_flags_mask;
uint8_t pfn_shift;
boolvalid;
 };
@@ -521,6 +525,9 @@ static inline int hmm_vma_fault(struct hmm_range *range, 
bool block)
 {
long ret;
 
+   range->default_flags = 0;
+   range->pfn_flags_mask = -1UL;
+
ret = hmm_range_register(range, range->vma->vm_mm,
 range->start, range->end);
if (ret)
diff --git a/mm/hmm.c b/mm/hmm.c
index fa9498eeb9b6..4fe88a196d17 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -415,6 +415,18 @@ static inline void hmm_pte_need_fault(const struct 
hmm_vma_walk *hmm_vma_walk,
if (!hmm_vma_walk->fault)
return;
 
+   /*
+* So we not only consider the individual per page request we also
+* consider the default flags requested for the range. The API can
+* be use in 2 fashions. The first one where the HMM user coalesce
+* multiple page fault into one request and set flags per pfns for
+* of those faults. The second one where the HMM user want to pre-
+* fault a range with specific flags. For the latter one it is a
+* waste to have the user pre-fill the pfn arrays with a default
+* flags value.
+*/
+   pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
-- 
2.17.2



[PATCH v2 04/11] mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

Rename for consistency between code, comments and documentation. Also
improve the comments on all the possible return values. Improve the
function by returning the number of populated entries in the pfns array.

Changes since v1:
- updated documentation
- reformatted some comments

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Reviewed-by: John Hubbard 
Cc: Andrew Morton 
Cc: Dan Williams 
---
 Documentation/vm/hmm.rst | 26 ++
 include/linux/hmm.h  |  4 ++--
 mm/hmm.c | 31 +--
 3 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 44205f0b671f..d9b27bdadd1b 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -189,11 +189,7 @@ the driver callback returns.
 When the device driver wants to populate a range of virtual addresses, it can
 use either::
 
-  int hmm_vma_get_pfns(struct vm_area_struct *vma,
-  struct hmm_range *range,
-  unsigned long start,
-  unsigned long end,
-  hmm_pfn_t *pfns);
+  long hmm_range_snapshot(struct hmm_range *range);
   int hmm_vma_fault(struct vm_area_struct *vma,
 struct hmm_range *range,
 unsigned long start,
@@ -202,7 +198,7 @@ When the device driver wants to populate a range of virtual 
addresses, it can
 bool write,
 bool block);
 
-The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+The first one (hmm_range_snapshot()) will only fetch present CPU page table
 entries and will not trigger a page fault on missing or non-present entries.
 The second one does trigger a page fault on missing or read-only entry if the
 write parameter is true. Page faults use the generic mm page fault code path
@@ -220,19 +216,33 @@ Locking with the update() callback is the most important 
aspect the driver must
  {
   struct hmm_range range;
   ...
+
+  range.start = ...;
+  range.end = ...;
+  range.pfns = ...;
+  range.flags = ...;
+  range.values = ...;
+  range.pfn_shift = ...;
+
  again:
-  ret = hmm_vma_get_pfns(vma, , start, end, pfns);
-  if (ret)
+  down_read(>mmap_sem);
+  range.vma = ...;
+  ret = hmm_range_snapshot();
+  if (ret) {
+  up_read(>mmap_sem);
   return ret;
+  }
   take_lock(driver->update);
   if (!hmm_vma_range_done(vma, )) {
   release_lock(driver->update);
+  up_read(>mmap_sem);
   goto again;
   }
 
   // Use pfns array content to update device page table
 
   release_lock(driver->update);
+  up_read(>mmap_sem);
   return 0;
  }
 
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 716fc61fa6d4..32206b0b1bfd 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -365,11 +365,11 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  * table invalidation serializes on it.
  *
  * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
- * hmm_vma_get_pfns() WITHOUT ERROR !
+ * hmm_range_snapshot() WITHOUT ERROR !
  *
  * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
  */
-int hmm_vma_get_pfns(struct hmm_range *range);
+long hmm_range_snapshot(struct hmm_range *range);
 bool hmm_vma_range_done(struct hmm_range *range);
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 213b0beee8d3..91361aa74b8b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -698,23 +698,25 @@ static void hmm_pfns_special(struct hmm_range *range)
 }
 
 /*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual 
addresses
- * @range: range being snapshotted
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- *  vma permission, 0 success
+ * hmm_range_snapshot() - snapshot CPU page table for a range
+ * @range: range
+ * Returns: number of valid pages in range->pfns[] (from range start
+ *  address). This may be zero. If the return value is negative,
+ *  then one of the following values may be returned:
+ *
+ *   -EINVAL  invalid arguments or mm or virtual address are in an
+ *invalid vma (ie either hugetlbfs or device file vma).
+ *   -EPERM   For example, asking for write, when the range is
+ *read-only
+ *   -EAGAIN  Caller needs to retry
+ *   -EFAULT  Either no valid vma exists for this range, or it is
+ *illegal to access the range
  *
  * This snapshots the CPU page table for a range of virtual addresses. Snapshot
  * validity is tracked by range struct. See hmm_vma_range_done() for further
  * information.
- *
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must then call
- * 

[PATCH v2 01/11] mm/hmm: select mmu notifier when selecting HMM

2019-03-25 Thread jglisse
From: Jérôme Glisse 

To avoid random config build issues, select the mmu notifier when HMM is
selected. In any case, when HMM gets selected it will be by users that
also want the mmu notifier.

Signed-off-by: Jérôme Glisse 
Acked-by: Balbir Singh 
Cc: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
---
 mm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..0d2944278d80 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -694,6 +694,7 @@ config DEV_PAGEMAP_OPS
 
 config HMM
bool
+   select MMU_NOTIFIER
select MIGRATE_VMA_HELPER
 
 config HMM_MIRROR
-- 
2.17.2



[PATCH v2 00/11] Improve HMM driver API v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

This patchset improves the HMM driver API and adds support for mirroring
virtual addresses that are mmaps of hugetlbfs or of a file in a filesystem
on a DAX block device. You can find a tree with all the patches at [1].

This patchset is necessary for converting ODP to HMM and a patch to do so
has been posted [2]. All new functions introduced by this patchset are used
by the ODP patch. The ODP patch will be pushed through the RDMA tree the
release after this patchset is merged.

Moreover all HMM functions are used by the nouveau driver starting in 5.1.

The last patch in the series adds helpers to directly dma map/unmap pages
for virtual addresses that are mirrored on behalf of a device driver. This
has been extracted from ODP code as it is a common pattern across HMM
device drivers. It will first be used by the ODP RDMA code and will later
be used by nouveau and other drivers that are working on including HMM
support.

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-for-5.1-v2
[2] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm
[3] https://lkml.org/lkml/2019/1/29/1008

Cc: Balbir Singh 
Cc: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 

Jérôme Glisse (11):
  mm/hmm: select mmu notifier when selecting HMM
  mm/hmm: use reference counting for HMM struct v2
  mm/hmm: do not erase snapshot when a range is invalidated
  mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()
v2
  mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() v2
  mm/hmm: improve driver API to work and wait over a range v2
  mm/hmm: add default fault flags to avoid the need to pre-fill pfns
arrays.
  mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) v2
  mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v2
  mm/hmm: add helpers for driver to safely take the mmap_sem v2
  mm/hmm: add an helper function that fault pages and map them to a
device v2

 Documentation/vm/hmm.rst |   36 +-
 include/linux/hmm.h  |  290 ++-
 mm/Kconfig   |1 +
 mm/hmm.c | 1046 +-
 4 files changed, 990 insertions(+), 383 deletions(-)

-- 
2.17.2



[PATCH v2 06/11] mm/hmm: improve driver API to work and wait over a range v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

A common use case for HMM mirror is a user trying to mirror a range,
and before they can program the hardware it gets invalidated by some
core mm event. Instead of having the user retry right away to mirror
the range, provide a completion mechanism for them to wait for any
active invalidation affecting the range.

This also changes how hmm_range_snapshot() and hmm_range_fault() work
by not relying on the vma, so that we can drop the mmap_sem when
waiting and look up the vma again on retry.
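
A rough sketch of the retry loop this enables (illustrative; it follows
the pseudo code added to include/linux/hmm.h by this patch, and the
timeout handling is an assumption):

  /* Hedged sketch: wait for the range to be valid, retry when invalidated. */
  down_read(&mm->mmap_sem);
  again:
  if (!hmm_range_wait_until_valid(&range, HMM_RANGE_DEFAULT_TIMEOUT)) {
          up_read(&mm->mmap_sem);
          return -EBUSY; /* mm died or invalidation never settled */
  }

  ret = hmm_range_snapshot(&range); /* or hmm_range_fault() */
  if (ret == -EAGAIN) {
          down_read(&mm->mmap_sem); /* mmap_sem was dropped, retake it */
          goto again;
  } else if (ret == -EBUSY) {
          goto again; /* concurrent invalidation, wait and retry */
  }
  up_read(&mm->mmap_sem);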

Changes since v1:
- squashed: Dan Carpenter: potential deadlock in nonblocking code

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
Cc: Dan Carpenter 
Cc: Matthew Wilcox 
---
 include/linux/hmm.h | 208 ++---
 mm/hmm.c| 528 +---
 2 files changed, 428 insertions(+), 308 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index e9afd23c2eac..79671036cb5f 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -77,8 +77,34 @@
 #include 
 #include 
 #include 
+#include 
 
-struct hmm;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @ranges: list of range being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ * @wq: wait queue for user waiting on a range invalidation
+ * @notifiers: count of active mmu notifiers
+ * @dead: is the mm dead ?
+ */
+struct hmm {
+   struct mm_struct*mm;
+   struct kref kref;
+   struct mutexlock;
+   struct list_headranges;
+   struct list_headmirrors;
+   struct mmu_notifier mmu_notifier;
+   struct rw_semaphore mirrors_sem;
+   wait_queue_head_t   wq;
+   longnotifiers;
+   booldead;
+};
 
 /*
  * hmm_pfn_flag_e - HMM flag enums
@@ -155,6 +181,38 @@ struct hmm_range {
boolvalid;
 };
 
+/*
+ * hmm_range_wait_until_valid() - wait for range to be valid
+ * @range: range affected by invalidation to wait on
+ * @timeout: time out for wait in ms (ie abort wait after that period of time)
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
+ unsigned long timeout)
+{
+   /* Check if mm is dead ? */
+   if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
+   range->valid = false;
+   return false;
+   }
+   if (range->valid)
+   return true;
+   wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
+  msecs_to_jiffies(timeout));
+   /* Return current valid status just in case we get lucky */
+   return range->valid;
+}
+
+/*
+ * hmm_range_valid() - test if a range is valid or not
+ * @range: range
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_valid(struct hmm_range *range)
+{
+   return range->valid;
+}
+
 /*
  * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
  * @range: range use to decode HMM pfn value
@@ -357,51 +415,133 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
 
 
 /*
- * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
- * driver lock that serializes device page table updates, then call
- * hmm_vma_range_done(), to check if the snapshot is still valid. The same
- * device driver page table update lock must also be used in the
- * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
- * table invalidation serializes on it.
+ * To snapshot the CPU page table you first have to call hmm_range_register()
+ * to register the range. If hmm_range_register() return an error then some-
+ * thing is horribly wrong and you should fail loudly. If it returned true then
+ * you can wait for the range to be stable with hmm_range_wait_until_valid()
+ * function, a range is valid when there are no concurrent changes to the CPU
+ * page table for the range.
+ *
+ * Once the range is valid you can call hmm_range_snapshot() if that returns
+ * without error then you can take your device page table lock (the same lock
+ * you use in the HMM mirror sync_cpu_device_pagetables() callback). After
+ * taking that lock you have to check the range validity, if it is still valid
+ * (ie hmm_range_valid() returns true) then you can program the device page
+ * table, otherwise you have to start again. Pseudo code:
+ *
+ *  mydevice_prefault(mydevice, mm, start, end)
+ *  {
+ *  struct hmm_range range;
+ *  ...
  *
- * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE 

[PATCH v2 09/11] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

HMM mirror is a device driver helper to mirror a range of virtual
addresses. It means that the process jobs running on the device can
access the same virtual addresses as the CPU threads of that process.
This patch adds support for mirroring mappings of files that are on a
DAX block device (ie ranges of virtual addresses that are mmaps of a
file in a filesystem on a DAX block device). There is no reason not to
support such a case when mirroring virtual addresses on a device.

Note that unlike the GUP code we do not take a page reference, hence
when we back off we have nothing to undo.

Changes since v1:
- improved commit message
- squashed: Arnd Bergmann: fix unused variable warning in hmm_vma_walk_pud

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: Dan Williams 
Cc: John Hubbard 
Cc: Arnd Bergmann 
---
 mm/hmm.c | 132 ++-
 1 file changed, 111 insertions(+), 21 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 64a33770813b..ce33151c6832 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -325,6 +325,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
 
 struct hmm_vma_walk {
struct hmm_range*range;
+   struct dev_pagemap  *pgmap;
unsigned long   last;
boolfault;
boolblock;
@@ -499,6 +500,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct 
hmm_range *range, pmd_t pmd)
range->flags[HMM_PFN_VALID];
 }
 
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+   if (!pud_present(pud))
+   return 0;
+   return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+   range->flags[HMM_PFN_WRITE] :
+   range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
  unsigned long addr,
  unsigned long end,
@@ -520,8 +530,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
pfn = pmd_pfn(pmd) + pte_index(addr);
-   for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+   for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+   if (pmd_devmap(pmd)) {
+   hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+   if (unlikely(!hmm_vma_walk->pgmap))
+   return -EBUSY;
+   }
pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+   }
+   if (hmm_vma_walk->pgmap) {
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
hmm_vma_walk->last = end;
return 0;
 }
@@ -608,10 +629,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
if (fault || write_fault)
goto fault;
 
+   if (pte_devmap(pte)) {
+   hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+   if (unlikely(!hmm_vma_walk->pgmap))
+   return -EBUSY;
+   } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) 
{
+   *pfn = range->values[HMM_PFN_SPECIAL];
+   return -EFAULT;
+   }
+
*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
 
 fault:
+   if (hmm_vma_walk->pgmap) {
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -699,12 +734,83 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return r;
}
}
+   if (hmm_vma_walk->pgmap) {
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
pte_unmap(ptep - 1);
 
hmm_vma_walk->last = addr;
return 0;
 }
 
+static int hmm_vma_walk_pud(pud_t *pudp,
+   unsigned long start,
+   unsigned long end,
+   struct mm_walk *walk)
+{
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   unsigned long addr = start, next;
+   pmd_t *pmdp;
+   pud_t pud;
+   int ret;
+
+again:
+   pud = READ_ONCE(*pudp);
+   if (pud_none(pud))
+   return hmm_vma_walk_hole(start, end, walk);
+
+   if (pud_huge(pud) && pud_devmap(pud)) {
+   unsigned long i, npages, pfn;
+   uint64_t *pfns, cpu_flags;
+   bool fault, write_fault;
+
+

[PATCH v2 03/11] mm/hmm: do not erase snapshot when a range is invalidated

2019-03-25 Thread jglisse
From: Jérôme Glisse 

Users of HMM might be using the snapshot information to do preparatory
steps like DMA mapping pages to a device before checking for
invalidation through hmm_vma_range_done(), so do not erase that
information and assume users will do the right thing.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Reviewed-by: John Hubbard 
Cc: Andrew Morton 
Cc: Dan Williams 
---
 mm/hmm.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 306e57f7cded..213b0beee8d3 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -170,16 +170,10 @@ static int hmm_invalidate_range(struct hmm *hmm, bool 
device,
 
spin_lock(>lock);
list_for_each_entry(range, >ranges, list) {
-   unsigned long addr, idx, npages;
-
if (update->end < range->start || update->start >= range->end)
continue;
 
range->valid = false;
-   addr = max(update->start, range->start);
-   idx = (addr - range->start) >> PAGE_SHIFT;
-   npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
-   memset(>pfns[idx], 0, sizeof(*range->pfns) * npages);
}
spin_unlock(>lock);
 
-- 
2.17.2



[PATCH v2 08/11] mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

HMM mirror is a device driver helper to mirror a range of virtual
addresses. It means that the process jobs running on the device can
access the same virtual addresses as the CPU threads of that process.
This patch adds support for hugetlbfs mappings (ie ranges of virtual
addresses that are mmaps of a hugetlbfs file).

Changes since v1:
- improved commit message
- squashed: Arnd Bergmann: fix unused variable warnings

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Dan Williams 
Cc: Arnd Bergmann 
---
 include/linux/hmm.h |  29 --
 mm/hmm.c| 126 +++-
 2 files changed, 138 insertions(+), 17 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 13bc2c72f791..f3b919b04eda 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -181,10 +181,31 @@ struct hmm_range {
const uint64_t  *values;
uint64_tdefault_flags;
uint64_tpfn_flags_mask;
+   uint8_t page_shift;
uint8_t pfn_shift;
boolvalid;
 };
 
+/*
+ * hmm_range_page_shift() - return the page shift for the range
+ * @range: range being queried
+ * Returns: page shift (page size = 1 << page shift) for the range
+ */
+static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
+{
+   return range->page_shift;
+}
+
+/*
+ * hmm_range_page_size() - return the page size for the range
+ * @range: range being queried
+ * Returns: page size for the range in bytes
+ */
+static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
+{
+   return 1UL << hmm_range_page_shift(range);
+}
+
 /*
  * hmm_range_wait_until_valid() - wait for range to be valid
  * @range: range affected by invalidation to wait on
@@ -438,7 +459,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  *  struct hmm_range range;
  *  ...
  *
- *  ret = hmm_range_register(, mm, start, end);
+ *  ret = hmm_range_register(, mm, start, end, page_shift);
  *  if (ret)
  *  return ret;
  *
@@ -498,7 +519,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
 int hmm_range_register(struct hmm_range *range,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end);
+  unsigned long end,
+  unsigned page_shift);
 void hmm_range_unregister(struct hmm_range *range);
 long hmm_range_snapshot(struct hmm_range *range);
 long hmm_range_fault(struct hmm_range *range, bool block);
@@ -529,7 +551,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, 
bool block)
range->pfn_flags_mask = -1UL;
 
ret = hmm_range_register(range, range->vma->vm_mm,
-range->start, range->end);
+range->start, range->end,
+PAGE_SHIFT);
if (ret)
return (int)ret;
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 4fe88a196d17..64a33770813b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -387,11 +387,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, 
unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
-   unsigned long i;
+   unsigned long i, page_size;
 
hmm_vma_walk->last = addr;
-   i = (addr - range->start) >> PAGE_SHIFT;
-   for (; addr < end; addr += PAGE_SIZE, i++) {
+   page_size = 1UL << range->page_shift;
+   i = (addr - range->start) >> range->page_shift;
+
+   for (; addr < end; addr += page_size, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
@@ -703,6 +705,69 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return 0;
 }
 
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+   unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   struct vm_area_struct *vma = walk->vma;
+   struct hstate *h = hstate_vma(vma);
+   uint64_t orig_pfn, cpu_flags;
+   bool fault, write_fault;
+   spinlock_t *ptl;
+   pte_t entry;
+   int ret = 0;
+
+   size = 1UL << huge_page_shift(h);
+   mask = size - 1;
+   if (range->page_shift != PAGE_SHIFT) {
+   /* Make sure we are looking at full page. */
+   if (start & mask)
+   return -EINVAL;
+   if (end < (start + size))
+

[PATCH v2 02/11] mm/hmm: use reference counting for HMM struct v2

2019-03-25 Thread jglisse
From: Jérôme Glisse 

Every time I read the code to check that the HMM structure does not
vanish before it should, thanks to the many locks protecting its removal,
I get a headache. Switch to reference counting instead: it is much
easier to follow and harder to break. This also removes some code that
is no longer needed with refcounting.
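
As a rough sketch of the resulting lifetime rule (function names as added
by this patch), a caller that needs the HMM struct takes a reference and
drops it when done:

%<
/* Sketch: mm_get_hmm() returns NULL when no hmm struct is registered (or
 * it is already on its way out); hmm_put() drops the reference and frees
 * the struct once the last user is gone.
 */
struct hmm *hmm = mm_get_hmm(mm);

if (hmm) {
        /* ... safely use hmm->ranges / hmm->mirrors here ... */
        hmm_put(hmm);
}
>%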

Changes since v1:
- removed a bunch of useless checks (if the API is used with bogus
  arguments it is better to fail loudly so users fix their code)
- s/hmm_get/mm_get_hmm/

Signed-off-by: Jérôme Glisse 
Reviewed-by: Ralph Campbell 
Cc: John Hubbard 
Cc: Andrew Morton 
Cc: Dan Williams 
---
 include/linux/hmm.h |   2 +
 mm/hmm.c| 170 
 2 files changed, 112 insertions(+), 60 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ad50b7b4f141..716fc61fa6d4 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -131,6 +131,7 @@ enum hmm_pfn_value_e {
 /*
  * struct hmm_range - track invalidation lock on virtual address range
  *
+ * @hmm: the core HMM structure this range is active against
  * @vma: the vm area struct for the range
  * @list: all range lock are on a list
  * @start: range virtual start address (inclusive)
@@ -142,6 +143,7 @@ enum hmm_pfn_value_e {
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
 struct hmm_range {
+   struct hmm  *hmm;
struct vm_area_struct   *vma;
struct list_headlist;
unsigned long   start;
diff --git a/mm/hmm.c b/mm/hmm.c
index fe1cd87e49ac..306e57f7cded 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -50,6 +50,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  */
 struct hmm {
struct mm_struct*mm;
+   struct kref kref;
spinlock_t  lock;
struct list_headranges;
struct list_headmirrors;
@@ -57,6 +58,16 @@ struct hmm {
struct rw_semaphore mirrors_sem;
 };
 
+static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
+{
+   struct hmm *hmm = READ_ONCE(mm->hmm);
+
+   if (hmm && kref_get_unless_zero(>kref))
+   return hmm;
+
+   return NULL;
+}
+
 /*
  * hmm_register - register HMM against an mm (HMM internal)
  *
@@ -67,14 +78,9 @@ struct hmm {
  */
 static struct hmm *hmm_register(struct mm_struct *mm)
 {
-   struct hmm *hmm = READ_ONCE(mm->hmm);
+   struct hmm *hmm = mm_get_hmm(mm);
bool cleanup = false;
 
-   /*
-* The hmm struct can only be freed once the mm_struct goes away,
-* hence we should always have pre-allocated an new hmm struct
-* above.
-*/
if (hmm)
return hmm;
 
@@ -86,6 +92,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(>ranges);
spin_lock_init(>lock);
+   kref_init(>kref);
hmm->mm = mm;
 
spin_lock(>page_table_lock);
@@ -106,7 +113,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
if (__mmu_notifier_register(>mmu_notifier, mm))
goto error_mm;
 
-   return mm->hmm;
+   return hmm;
 
 error_mm:
spin_lock(>page_table_lock);
@@ -118,9 +125,41 @@ static struct hmm *hmm_register(struct mm_struct *mm)
return NULL;
 }
 
+static void hmm_free(struct kref *kref)
+{
+   struct hmm *hmm = container_of(kref, struct hmm, kref);
+   struct mm_struct *mm = hmm->mm;
+
+   mmu_notifier_unregister_no_release(>mmu_notifier, mm);
+
+   spin_lock(>page_table_lock);
+   if (mm->hmm == hmm)
+   mm->hmm = NULL;
+   spin_unlock(>page_table_lock);
+
+   kfree(hmm);
+}
+
+static inline void hmm_put(struct hmm *hmm)
+{
+   kref_put(>kref, hmm_free);
+}
+
 void hmm_mm_destroy(struct mm_struct *mm)
 {
-   kfree(mm->hmm);
+   struct hmm *hmm;
+
+   spin_lock(>page_table_lock);
+   hmm = mm_get_hmm(mm);
+   mm->hmm = NULL;
+   if (hmm) {
+   hmm->mm = NULL;
+   spin_unlock(>page_table_lock);
+   hmm_put(hmm);
+   return;
+   }
+
+   spin_unlock(>page_table_lock);
 }
 
 static int hmm_invalidate_range(struct hmm *hmm, bool device,
@@ -165,7 +204,7 @@ static int hmm_invalidate_range(struct hmm *hmm, bool 
device,
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
struct hmm_mirror *mirror;
-   struct hmm *hmm = mm->hmm;
+   struct hmm *hmm = mm_get_hmm(mm);
 
down_write(>mirrors_sem);
mirror = list_first_entry_or_null(>mirrors, struct hmm_mirror,
@@ -186,13 +225,16 @@ static void hmm_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
  struct hmm_mirror, list);
}
up_write(>mirrors_sem);
+
+   hmm_put(hmm);
 }
 
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 

[PATCH v2 1/1] RDMA/odp: convert to use HMM for ODP v2

2019-03-21 Thread jglisse
From: Jérôme Glisse 

Convert ODP to use HMM so that we can build on common infrastructure
for the different classes of devices that want to mirror a process address
space into a device. There are no functional changes.

Changes since v1:
- improved comments
- simplified page alignment computation

Signed-off-by: Jérôme Glisse 
Cc: Jason Gunthorpe 
Cc: Leon Romanovsky 
Cc: Doug Ledford 
Cc: Artemy Kovalyov 
Cc: Moni Shoua 
Cc: Mike Marciniszyn 
Cc: Kaike Wan 
Cc: Dennis Dalessandro 
---
 drivers/infiniband/core/umem_odp.c | 488 -
 drivers/infiniband/hw/mlx5/mem.c   |  20 +-
 drivers/infiniband/hw/mlx5/mr.c|   2 +-
 drivers/infiniband/hw/mlx5/odp.c   | 106 ---
 include/rdma/ib_umem_odp.h |  48 ++-
 5 files changed, 217 insertions(+), 447 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index e6ec79ad9cc8..8ca90cc54b39 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -46,6 +46,20 @@
 #include 
 #include 
 
+
+static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = {
+   ODP_READ_BIT,   /* HMM_PFN_VALID */
+   ODP_WRITE_BIT,  /* HMM_PFN_WRITE */
+   ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */
+};
+
+static uint64_t odp_hmm_values[HMM_PFN_VALUE_MAX] = {
+   -1UL,   /* HMM_PFN_ERROR */
+   0UL,/* HMM_PFN_NONE */
+   -2UL,   /* HMM_PFN_SPECIAL */
+};
+
+
 /*
  * The ib_umem list keeps track of memory regions for which the HW
  * device request to receive notification when the related memory
@@ -78,57 +92,25 @@ static u64 node_last(struct umem_odp_node *n)
 INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
 node_start, node_last, static, rbt_ib_umem)
 
-static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
-{
-   mutex_lock(_odp->umem_mutex);
-   if (umem_odp->notifiers_count++ == 0)
-   /*
-* Initialize the completion object for waiting on
-* notifiers. Since notifier_count is zero, no one should be
-* waiting right now.
-*/
-   reinit_completion(_odp->notifier_completion);
-   mutex_unlock(_odp->umem_mutex);
-}
-
-static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
-{
-   mutex_lock(_odp->umem_mutex);
-   /*
-* This sequence increase will notify the QP page fault that the page
-* that is going to be mapped in the spte could have been freed.
-*/
-   ++umem_odp->notifiers_seq;
-   if (--umem_odp->notifiers_count == 0)
-   complete_all(_odp->notifier_completion);
-   mutex_unlock(_odp->umem_mutex);
-}
-
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
   u64 start, u64 end, void *cookie)
 {
struct ib_umem *umem = _odp->umem;
 
-   /*
-* Increase the number of notifiers running, to
-* prevent any further fault handling on this MR.
-*/
-   ib_umem_notifier_start_account(umem_odp);
umem_odp->dying = 1;
/* Make sure that the fact the umem is dying is out before we release
 * all pending page faults. */
smp_wmb();
-   complete_all(_odp->notifier_completion);
umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
ib_umem_end(umem));
return 0;
 }
 
-static void ib_umem_notifier_release(struct mmu_notifier *mn,
-struct mm_struct *mm)
+static void ib_umem_notifier_release(struct hmm_mirror *mirror)
 {
-   struct ib_ucontext_per_mm *per_mm =
-   container_of(mn, struct ib_ucontext_per_mm, mn);
+   struct ib_ucontext_per_mm *per_mm;
+
+   per_mm = container_of(mirror, struct ib_ucontext_per_mm, mirror);
 
down_read(_mm->umem_rwsem);
if (per_mm->active)
@@ -136,21 +118,24 @@ static void ib_umem_notifier_release(struct mmu_notifier 
*mn,
_mm->umem_tree, 0, ULLONG_MAX,
ib_umem_notifier_release_trampoline, true, NULL);
up_read(_mm->umem_rwsem);
+
+   per_mm->mm = NULL;
 }
 
-static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
-u64 start, u64 end, void *cookie)
+static int invalidate_range_trampoline(struct ib_umem_odp *item,
+  u64 start, u64 end, void *cookie)
 {
-   ib_umem_notifier_start_account(item);
item->umem.context->invalidate_range(item, start, end);
return 0;
 }
 
-static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
-   const struct mmu_notifier_range *range)
+static int ib_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
+   const struct hmm_update *range)
 {
-   struct 

[PATCH v2 0/1] Use HMM for ODP v2

2019-03-21 Thread jglisse
From: Jérôme Glisse 

This is just a rebase with minor changes and better comments in the code.

Previous cover letter (slightly improved):

This patchset converts RDMA ODP to use HMM underneath. This is motivated
by stronger code sharing for the same feature (Shared Virtual Memory (SVM)
or Shared Virtual Address (SVA)) and also stronger integration with mm code
to achieve that. It depends on the HMM patchset [1].

Moreover there are some features of HMM in the works, like peer to peer
support, fast CPU page table snapshot, fast IOMMU mapping update, ...
It will be easier for RDMA devices with ODP to leverage those if they
use HMM underneath.

Quick summary of what HMM is:
HMM is a toolbox for device drivers to implement software support for
Shared Virtual Memory (SVM). Not only does it provide helpers to mirror a
process address space on a device (hmm_mirror), it also provides
helpers to allow device memory to back regular valid virtual
addresses of a process (any valid mmap that is not an mmap of a device
or a DAX mapping). There are two kinds of device memory: private memory
that is not accessible to the CPU because it does not have all the expected
properties (this is the case for all PCIe devices), and public memory which
can also be accessed by the CPU without restriction (with OpenCAPI or CCIX
or a similar cache-coherent and atomic interconnect).

Device drivers can use each of the HMM tools separately. You do not have to
use all the tools it provides.

For RDMA devices I do not expect a need to use the device memory support
of HMM. This device memory support is geared toward accelerators like GPUs.
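
For the mirror-only usage that ODP needs, the flow is roughly the
following sketch (my_device, my_mirror_ops and dev->mm are placeholders;
the hmm_range_register() prototype shown is the one that takes a
page_shift):

%<
/* Sketch: mirror a process address space, no device memory involved. */
static int my_mirror_setup(struct my_device *dev, struct mm_struct *mm)
{
        dev->mirror.ops = &my_mirror_ops; /* sync_cpu_device_pagetables, ... */
        return hmm_mirror_register(&dev->mirror, mm);
}

static long my_mirror_snapshot(struct my_device *dev, struct hmm_range *range)
{
        long ret;

        ret = hmm_range_register(range, dev->mm, range->start, range->end,
                                 PAGE_SHIFT);
        if (ret)
                return ret;
        ret = hmm_range_snapshot(range); /* or hmm_range_fault() to fault */
        hmm_range_unregister(range);
        return ret;
}
>%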


You can find a branch [1] with all the prerequisites in it. This patch is on
top of 5.1-rc1+ but I can rebase it on any specific branch once the HMM
prerequisites are upstream.

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm
[2] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-odp-v2

Cc: linux-r...@vger.kernel.org
Cc: Jason Gunthorpe 
Cc: Leon Romanovsky 
Cc: Doug Ledford 
Cc: Artemy Kovalyov 
Cc: Moni Shoua ,
Cc: Mike Marciniszyn 
Cc: Kaike Wan 
Cc: Dennis Dalessandro 

Jérôme Glisse (1):
  RDMA/odp: convert to use HMM for ODP v2

 drivers/infiniband/core/umem_odp.c | 488 -
 drivers/infiniband/hw/mlx5/mem.c   |  20 +-
 drivers/infiniband/hw/mlx5/mr.c|   2 +-
 drivers/infiniband/hw/mlx5/odp.c   | 106 ---
 include/rdma/ib_umem_odp.h |  48 ++-
 5 files changed, 217 insertions(+), 447 deletions(-)

-- 
2.17.2



[PATCH v2 0/1] Restore change_pte optimization

2019-02-20 Thread jglisse
From: Jérôme Glisse 

This patch is on top of my patchset to add context information to
mmu notifiers [1]; you can find a branch with everything in [2]. It has
been tested with qemu/KVM building a kernel within the guest and also
running a benchmark, the results of which are given below.

The change_pte() callback is impaired by the range invalidation callbacks
within KVM, as those fully invalidate the secondary mmu. This means that
there is a window between the range_start callback and the change_pte
callback where the secondary mmu for the address is empty. The guest can
fault on that address during that window.

That window can last for some time if the kernel code which is
doing the invalidation is interrupted or if there are other mmu
listeners for the process that might sleep within their range_start
callback.

With this patch KVM will ignore the range_start and range_end callbacks
and will rely solely on the change_pte callback to update the
secondary mmu. This means that the secondary mmu never has an empty
entry for the address between range_start and range_end and hence
the guest will not have a chance to fault.

This optimization is not valid for all mmu notifier cases and
thanks to the patchset that adds context information to the mmu
notifier [1] we can now identify within KVM when it is safe to rely
on this optimization.

Roughly it is safe when:
- going from read only to read and write (same or different pfn)
- going from read and write to read only with the same pfn
- going from read only to read only with a different pfn

Longer explanation in [1] and [3].
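
Expressed as a predicate over the old and new primary mmu ptes, the safe
cases above look roughly like this (illustrative sketch only; the actual
patch relies on the MMU_NOTIFIER_USE_CHANGE_PTE flag set by the producer
rather than recomputing this inside KVM):

%<
/* Sketch: transitions for which relying on change_pte() alone is safe. */
static bool change_pte_only_is_safe(pte_t old, pte_t new)
{
        /* read only -> read and write, same or different pfn */
        if (!pte_write(old) && pte_write(new))
                return true;
        /* read and write -> read only, same pfn */
        if (pte_write(old) && !pte_write(new) && pte_pfn(old) == pte_pfn(new))
                return true;
        /* read only -> read only, different pfn */
        if (!pte_write(old) && !pte_write(new) && pte_pfn(old) != pte_pfn(new))
                return true;
        return false;
}
>%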

Running ksm02 from ltp gives the following results:

before  mean  {real: 675.460632, user: 857.771423, sys: 215.929657, npages: 4773.066895}
before  stdev {real:  37.035435, user:   4.395942, sys:   3.976172, npages:  675.352783}
after   mean  {real: 672.515503, user: 855.817322, sys: 200.902710, npages: 4899.00}
after   stdev {real:  37.340954, user:   4.051633, sys:   3.894153, npages:  742.413452}

Roughly 7%-8% less time spent in the kernel, so we are saving a few
cycles (this is with KSM enabled on the host and ksm sleep set to
0). I do not know how this translates to real workloads.


Note that with the context information further optimizations are now
possible within KVM. For instance you can find out if a range is
updated to read only (ie no pfn change, just a protection change) and
update the secondary mmu accordingly.

You can also identify munmap()/mremap() syscalls and only free up the
resources you have allocated for the range (like freeing up secondary
page tables or data structures for the range) when it is an munmap or
a mremap. Today my understanding is that kvm_unmap_hva_range() will
always free up resources, assuming it is an munmap of some sort. So
for mundane invalidations (like migration, reclaim, mprotect, fork,
...) KVM is freeing up potentially megabytes of structures that it will
have to re-allocate shortly thereafter (see [4] for a WIP example).

Cheers,
Jérôme

[1] https://lkml.org/lkml/2019/2/19/752
[2] https://cgit.freedesktop.org/~glisse/linux/log/?h=mmu-notifier-v05
[3] https://lkml.org/lkml/2019/2/19/754
[4] https://cgit.freedesktop.org/~glisse/linux/log/?h=wip-kvm-mmu-notifier-opti

Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Andrew Morton 

Jérôme Glisse (1):
  kvm/mmu_notifier: re-enable the change_pte() optimization.

 virt/kvm/kvm_main.c | 16 
 1 file changed, 16 insertions(+)

-- 
2.17.2



[PATCH v2 1/1] kvm/mmu_notifier: re-enable the change_pte() optimization.

2019-02-20 Thread jglisse
From: Jérôme Glisse 

Since changes to mmu notifiers the change_pte() optimization was lost
for kvm. This re-enables it whenever a pte is going from read and
write to read only with the same pfn, or from read only to read and write
with a different pfn.

It is safe to update the secondary MMUs, because the primary MMU
pte invalidation must have already happened with a ptep_clear_flush()
before set_pte_at_notify() is invoked (and thus before the change_pte()
callback).

Signed-off-by: Jérôme Glisse 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Andrew Morton 
---
 virt/kvm/kvm_main.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 629760c0fb95..0f979f02bf1c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -369,6 +369,14 @@ static int kvm_mmu_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
int need_tlb_flush = 0, idx;
int ret;
 
+   /*
+* Nothing to do when using change_pte() which will be call for each
+* individual pte update at the right time. See mmu_notifier.h for more
+* informations.
+*/
+   if (mmu_notifier_range_use_change_pte(range))
+   return 0;
+
idx = srcu_read_lock(>srcu);
spin_lock(>mmu_lock);
/*
@@ -399,6 +407,14 @@ static void kvm_mmu_notifier_invalidate_range_end(struct 
mmu_notifier *mn,
 {
struct kvm *kvm = mmu_notifier_to_kvm(mn);
 
+   /*
+* Nothing to do when using change_pte() which will be call for each
+* individual pte update at the right time. See mmu_notifier.h for more
+* informations.
+*/
+   if (mmu_notifier_range_use_change_pte(range))
+   return;
+
spin_lock(>mmu_lock);
/*
 * This sequence increase will notify the kvm page fault that
-- 
2.17.2



[PATCH v5 8/9] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper

2019-02-19 Thread jglisse
From: Jérôme Glisse 

Helper to test if a range is updated to read only (it is still valid
to read from the range). This is useful for device drivers or anyone
who wishes to optimize out an update when they know that they already have
the range mapped read only.
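
A possible use in a mirror's invalidation callback looks roughly like the
following sketch (my_node, my_lookup() and my_invalidate() are placeholders
for driver-specific tracking):

%<
/* Sketch: skip the device update when the range only loses write
 * permission and the device already maps it read only.
 */
static int my_invalidate_range_start(struct mmu_notifier *mn,
                                     const struct mmu_notifier_range *range)
{
        struct my_node *node = my_lookup(mn, range->start, range->end);

        if (node && node->device_mapping_is_readonly &&
            mmu_notifier_range_update_to_read_only(range))
                return 0; /* nothing to do, device side already read only */

        return my_invalidate(node, range->start, range->end);
}
>%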

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h |  4 
 mm/mmu_notifier.c| 10 ++
 2 files changed, 14 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 0379956fff23..b6c004bd9f6a 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -259,6 +259,8 @@ extern void __mmu_notifier_invalidate_range_end(struct 
mmu_notifier_range *r,
  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
  unsigned long start, unsigned long end);
+extern bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
 
 static inline bool
 mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
@@ -568,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct 
*mm)
 {
 }
 
+#define mmu_notifier_range_update_to_read_only(r) false
+
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define ptep_clear_young_notify ptep_test_and_clear_young
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index abd88c466eb2..ee36068077b6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct 
mmu_notifier *mn,
mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+   if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+   return false;
+   /* Return true if the vma still have the read flag set. */
+   return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
-- 
2.17.2



[PATCH v5 9/9] mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate v2

2019-02-19 Thread jglisse
From: Jérôme Glisse 

When notifying a change for a range, use the MMU_NOTIFIER_USE_CHANGE_PTE
flag for page table updates that use set_pte_at_notify() and where we are
going either from read and write to read only with the same pfn or read only
to read and write with a new pfn.

Note that set_pte_at_notify() itself should only be used in rare cases,
ie we do not want to use it when we are updating a significant range of
virtual addresses and thus a significant number of ptes. Instead, for
those cases, the event provided to the mmu notifier invalidate_range_start()
callback should be used for optimization.
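
On the listener side, a driver that implements change_pte() can then test
the flag with the helper added below and skip its range invalidation, which
is what the KVM patch in this series does; a minimal sketch, with
my_flush_secondary_mmu() standing in for the driver's own invalidation:

%<
/* Sketch: when the producer promises change_pte() for this update, the
 * per-pte callback is enough and the heavy invalidation can be skipped.
 */
static int my_invalidate_range_start(struct mmu_notifier *mn,
                                     const struct mmu_notifier_range *range)
{
        if (mmu_notifier_range_use_change_pte(range))
                return 0;

        return my_flush_secondary_mmu(mn, range->start, range->end);
}
>%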

Changes since v1:
- Use the new unsigned flags field in struct mmu_notifier_range
- Use the new flags parameter to mmu_notifier_range_init()
- Explicitly list all the patterns where we can use change_pte()

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h | 34 --
 mm/ksm.c | 11 ++-
 mm/memory.c  |  5 +++--
 3 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b6c004bd9f6a..0230a4b06b46 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -40,6 +40,26 @@ enum mmu_notifier_event {
MMU_NOTIFY_SOFT_DIRTY,
 };
 
+/*
+ * @MMU_NOTIFIER_RANGE_BLOCKABLE: can the mmu notifier range_start/range_end
+ * callback block or not ? If set then the callback can block.
+ *
+ * @MMU_NOTIFIER_USE_CHANGE_PTE: only set when the page table it updated with
+ * the set_pte_at_notify() the valid patterns for this are:
+ *  - pte read and write to read only same pfn
+ *  - pte read only to read and write (pfn can change or stay the same)
+ *  - pte read only to read only with different pfn
+ * It is illegal to set in any other circumstances.
+ *
+ * Note that set_pte_at_notify() should not be use outside of the above cases.
+ * When updating a range in batch (like write protecting a range) it is better
+ * to rely on invalidate_range_start() and struct mmu_notifier_range to infer
+ * the kind of update that is happening (as an example you can look at the
+ * mmu_notifier_range_update_to_read_only() function).
+ */
+#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+#define MMU_NOTIFIER_USE_CHANGE_PTE (1 << 1)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -55,8 +75,6 @@ struct mmu_notifier_mm {
spinlock_t lock;
 };
 
-#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
-
 struct mmu_notifier_range {
struct vm_area_struct *vma;
struct mm_struct *mm;
@@ -268,6 +286,12 @@ mmu_notifier_range_blockable(const struct 
mmu_notifier_range *range)
return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
 }
 
+static inline bool
+mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range)
+{
+   return (range->flags & MMU_NOTIFIER_USE_CHANGE_PTE);
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
if (mm_has_notifiers(mm))
@@ -509,6 +533,12 @@ mmu_notifier_range_blockable(const struct 
mmu_notifier_range *range)
return true;
 }
 
+static inline bool
+mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range)
+{
+   return false;
+}
+
 static inline int mm_has_notifiers(struct mm_struct *mm)
 {
return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index b782fadade8f..41e51882f999 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1066,9 +1066,9 @@ static int write_protect_page(struct vm_area_struct *vma, 
struct page *page,
 
BUG_ON(PageTransCompound(page));
 
-   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm,
-   pvmw.address,
-   pvmw.address + PAGE_SIZE);
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR,
+   MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm,
+   pvmw.address, pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start();
 
if (!page_vma_mapped_walk())
@@ -1155,8 +1155,9 @@ static int replace_page(struct vm_area_struct *vma, 
struct page *page,
if (!pmd)
goto out;
 
-   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
-   addr + PAGE_SIZE);
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR,
+   MMU_NOTIFIER_USE_CHANGE_PTE,
+   vma, mm, addr, addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start();
 
ptep = pte_offset_map_lock(mm, pmd, addr, );
diff 

[PATCH v5 6/9] mm/mmu_notifier: use correct mmu_notifier events for each invalidation

2019-02-19 Thread jglisse
From: Jérôme Glisse 

This updates each existing invalidation to use the correct mmu notifier
event that represents what is happening to the CPU page table. See the
patch which introduced the events for the rationale behind this.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
 fs/proc/task_mmu.c  |  4 ++--
 kernel/events/uprobes.c |  2 +-
 mm/huge_memory.c| 14 ++
 mm/hugetlb.c|  8 
 mm/khugepaged.c |  2 +-
 mm/ksm.c|  4 ++--
 mm/madvise.c|  2 +-
 mm/memory.c | 14 +++---
 mm/migrate.c|  4 ++--
 mm/mprotect.c   |  5 +++--
 mm/rmap.c   |  6 +++---
 11 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fcbd0e574917..3b93ce496dd4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1151,8 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
break;
}
 
-   mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0,
-   NULL, mm, 0, -1UL);
+   mmu_notifier_range_init(, MMU_NOTIFY_SOFT_DIRTY,
+   0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start();
}
walk_page_range(0, mm->highest_vm_end, _refs_walk);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 46f546bdba00..8e8342080013 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, 
unsigned long addr,
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
 
-   mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, mm, addr,
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
 
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c9d638f1b34e..1da6ca0f0f6d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1184,9 +1184,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct 
vm_fault *vmf,
cond_resched();
}
 
-   mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
-   haddr,
-   haddr + HPAGE_PMD_SIZE);
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+   haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start();
 
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1349,9 +1348,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, 
pmd_t orig_pmd)
vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
 
-   mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
-   haddr,
-   haddr + HPAGE_PMD_SIZE);
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+   haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start();
 
spin_lock(vmf->ptl);
@@ -2028,7 +2026,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t 
*pud,
spinlock_t *ptl;
struct mmu_notifier_range range;
 
-   mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PUD_MASK,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start();
@@ -2247,7 +2245,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
spinlock_t *ptl;
struct mmu_notifier_range range;
 
-   mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d9e5c5a4c004..a58115c6b0a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3250,7 +3250,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct 
mm_struct *src,
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
if (cow) {
- 

[PATCH v5 7/9] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2

2019-02-19 Thread jglisse
From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

Users of the mmu notifier API track changes to the CPU page table and take
specific action for them. The current API only provides the range of virtual
addresses affected by the change, not why the change is happening.

This patch just passes down the new information by adding it to the
mmu_notifier_range structure.
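
A listener can then use both new fields; for example (sketch only, with
my_update_mapping() and my_invalidate() as placeholders) it can apply the
vma's new protection when the event says the whole range changed
protection:

%<
/* Sketch: consume the extra context carried by mmu_notifier_range. */
static int my_invalidate_range_start(struct mmu_notifier *mn,
                                     const struct mmu_notifier_range *range)
{
        if (range->event == MMU_NOTIFY_PROTECTION_VMA && range->vma) {
                /* The whole range now uses the vma access permission. */
                pgprot_t prot = range->vma->vm_page_prot;

                return my_update_mapping(mn, range->start, range->end, prot);
        }

        /* Fall back to a full invalidation for the other events. */
        return my_invalidate(mn, range->start, range->end);
}
>%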

Changes since v1:
- Initialize flags field from mmu_notifier_range_init() arguments

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 62f94cd85455..0379956fff23 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -58,10 +58,12 @@ struct mmu_notifier_mm {
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
 
 struct mmu_notifier_range {
+   struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
unsigned flags;
+   enum mmu_notifier_event event;
 };
 
 struct mmu_notifier_ops {
@@ -363,10 +365,12 @@ static inline void mmu_notifier_range_init(struct 
mmu_notifier_range *range,
   unsigned long start,
   unsigned long end)
 {
+   range->vma = vma;
+   range->event = event;
range->mm = mm;
range->start = start;
range->end = end;
-   range->flags = 0;
+   range->flags = flags;
 }
 
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)
\
-- 
2.17.2



[PATCH v5 4/9] mm/mmu_notifier: contextual information for event enums

2019-02-19 Thread jglisse
From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

This patch introduces a set of enums that can be associated with each of
the events triggering a mmu notifier. Later patches take advantage of
those enum values.

- UNMAP: munmap() or mremap()
- CLEAR: page table is cleared (migration, compaction, reclaim, ...)
- PROTECTION_VMA: change in access protections for the range
- PROTECTION_PAGE: change in access protections for page in the range
- SOFT_DIRTY: soft dirtyness tracking

Being able to distinguish munmap() and mremap() from the other reasons why
the page table is cleared is important to allow users of mmu notifiers to
update their own internal tracking structures accordingly (on munmap or
mremap it is no longer needed to track the range of virtual addresses as it
becomes invalid).
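
For instance, once later patches pass the event down in the range struct,
a driver that keeps per-range tracking could distinguish the cases roughly
like this (sketch only, my_free_tracking() and my_invalidate() are
placeholders):

%<
/* Sketch: only tear down range tracking on UNMAP, otherwise just
 * invalidate the device mapping and keep the tracking structure.
 */
switch (range->event) {
case MMU_NOTIFY_UNMAP:
        my_free_tracking(node);
        break;
case MMU_NOTIFY_CLEAR:
case MMU_NOTIFY_PROTECTION_VMA:
case MMU_NOTIFY_PROTECTION_PAGE:
case MMU_NOTIFY_SOFT_DIRTY:
        my_invalidate(node, range->start, range->end);
        break;
}
>%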

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index c8672c366f67..2386e71ac1b8 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,36 @@
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/**
+ * enum mmu_notifier_event - reason for the mmu notifier callback
+ * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that
+ * move the range
+ *
+ * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
+ * madvise() or replacing a page by another one, ...).
+ *
+ * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range
+ * ie using the vma access permission (vm_page_prot) to update the whole range
+ * is enough no need to inspect changes to the CPU page table (mprotect()
+ * syscall)
+ *
+ * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for
+ * pages in the range so to mirror those changes the user must inspect the CPU
+ * page table (from the end callback).
+ *
+ * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
+ * access flags). User should soft dirty the page in the end callback to make
+ * sure that anyone relying on soft dirtyness catch pages that might be written
+ * through non CPU mappings.
+ */
+enum mmu_notifier_event {
+   MMU_NOTIFY_UNMAP = 0,
+   MMU_NOTIFY_CLEAR,
+   MMU_NOTIFY_PROTECTION_VMA,
+   MMU_NOTIFY_PROTECTION_PAGE,
+   MMU_NOTIFY_SOFT_DIRTY,
+};
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
-- 
2.17.2



[PATCH v5 5/9] mm/mmu_notifier: contextual information for event triggering invalidation v2

2019-02-19 Thread jglisse
From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

Users of the mmu notifier API track changes to the CPU page table and take
specific action for them. The current API only provides the range of virtual
addresses affected by the change, not why the change is happening.

This patchset does the initial mechanical conversion of all the places that
call mmu_notifier_range_init() to also provide the default MMU_NOTIFY_UNMAP
event as well as the vma if it is known (most invalidations happen against
a given vma). Passing down the vma allows users of mmu notifiers to
inspect the new vma page protection.

MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifiers
should assume that everything for the range is going away when that event
happens. A later patch converts mm call paths to use more appropriate
events for each call.

Changes since v1:
- add the flags parameter to init range flags

This is done as 2 patches so that no call site is forgotten, especially
as it uses the following coccinelle patch:

%<--
@@
identifier I1, I2, I3, I4;
@@
static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1,
+enum mmu_notifier_event event,
+unsigned flags,
+struct vm_area_struct *vma,
struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... }

@@
@@
-#define mmu_notifier_range_init(range, mm, start, end)
+#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end)

@@
expression E1, E3, E4;
identifier I1;
@@
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, I1,
I1->vm_mm, E3, E4)
...>

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(..., struct vm_area_struct *VMA, ...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, VMA,
E2, E3, E4)
...> }

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(...) {
struct vm_area_struct *VMA;
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, VMA,
E2, E3, E4)
...> }

@@
expression E1, E2, E3, E4;
identifier FN;
@@
FN(...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, NULL,
E2, E3, E4)
...> }
-->%

Applied with:
spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c 
--in-place
spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place
spatch --sp-file mmu-notifier.spatch --dir mm --in-place

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
 fs/proc/task_mmu.c   |  3 ++-
 include/linux/mmu_notifier.h |  5 -
 kernel/events/uprobes.c  |  3 ++-
 mm/huge_memory.c | 12 
 mm/hugetlb.c | 12 
 mm/khugepaged.c  |  3 ++-
 mm/ksm.c |  6 --
 mm/madvise.c |  3 ++-
 mm/memory.c  | 25 -
 mm/migrate.c |  5 -
 mm/mprotect.c|  3 ++-
 mm/mremap.c  |  3 ++-
 mm/oom_kill.c|  3 ++-
 mm/rmap.c|  6 --
 14 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 92a91e7816d8..fcbd0e574917 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1151,7 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
break;
}
 
-   mmu_notifier_range_init(, mm, 0, -1UL);
+   mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0,
+   NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start();
}
walk_page_range(0, mm->highest_vm_end, _refs_walk);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 2386e71ac1b8..62f94cd85455 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -356,6 +356,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct 
*mm)
 
 
 static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
+  enum mmu_notifier_event event,
+  unsigned flags,
+  struct vm_area_struct *vma,
   struct mm_struct *mm,
 

[PATCH v5 0/9] mmu notifier provide context informations

2019-02-19 Thread jglisse
From: Jérôme Glisse 

Since the last version [4] I added the extra bits needed for the change_pte
optimization (which is a KSM thing). Here I am not posting users of
this; they will be posted to the appropriate sub-systems (KVM, GPU,
RDMA, ...) once this series gets upstream. If you want to look at users
of this see [5] [6]. If this gets in 5.1 then I will be submitting
those users for 5.2 (including KVM if KVM folks feel comfortable with
it).

Note that this series does not change any behavior of any existing
code. It just passes down more information to mmu notifier listeners.
The rationale for this patchset:


CPU page table updates can happen for many reasons, not only as a
result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...)
but also as a result of kernel activities (memory compression, reclaim,
migration, ...).

This patch introduces a set of enums that can be associated with each
of the events triggering a mmu notifier:

- UNMAP: munmap() or mremap()
- CLEAR: page table is cleared (migration, compaction, reclaim, ...)
- PROTECTION_VMA: change in access protections for the range
- PROTECTION_PAGE: change in access protections for page in the range
- SOFT_DIRTY: soft dirtyness tracking

Being able to distinguish munmap() and mremap() from the other reasons why
the page table is cleared is important to allow users of mmu notifiers to
update their own internal tracking structures accordingly (on munmap or
mremap it is no longer needed to track the range of virtual addresses as it
becomes invalid). Without this series, drivers are forced to assume that
every notification is an munmap, which triggers useless thrashing within
drivers that associate structures with ranges of virtual addresses. Each
driver is forced to free up its tracking structure and then restore it
on the next device page fault. With this series we can also optimize device
page table updates [5].

Moreover this can also be used to optimize out some page table updates,
for example for KVM where we can update the secondary MMU directly from the
callback instead of clearing it.

Patches to leverage this series will be posted separately to each sub-
system.

Cheers,
Jérôme

[1] v1 https://lkml.org/lkml/2018/3/23/1049
[2] v2 https://lkml.org/lkml/2018/12/5/10
[3] v3 https://lkml.org/lkml/2018/12/13/620
[4] v4 https://lkml.org/lkml/2019/1/23/838
[5] patches to use this:
https://lkml.org/lkml/2019/1/23/833
https://lkml.org/lkml/2019/1/23/834
https://lkml.org/lkml/2019/1/23/832
https://lkml.org/lkml/2019/1/23/831
[6] KVM restore change pte optimization
https://patchwork.kernel.org/cover/10791179/

Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 

Jérôme Glisse (9):
  mm/mmu_notifier: helper to test if a range invalidation is blockable
  mm/mmu_notifier: convert user range->blockable to helper function
  mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags
  mm/mmu_notifier: contextual information for event enums
  mm/mmu_notifier: contextual information for event triggering
invalidation v2
  mm/mmu_notifier: use correct mmu_notifier events for each invalidation
  mm/mmu_notifier: pass down vma and reasons why mmu notifier is
happening v2
  mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper
  mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where
appropriate v2

 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c  |  8 +--
 drivers/gpu/drm/i915/i915_gem_userptr.c |  2 +-
 drivers/gpu/drm/radeon/radeon_mn.c  |  4 +-
 drivers/infiniband/core/umem_odp.c  |  5 +-
 drivers/xen/gntdev.c|  6 +-
 fs/proc/task_mmu.c  |  3 +-
 include/linux/mmu_notifier.h| 93 +++--
 kernel/events/uprobes.c |  3 +-
 mm/hmm.c|  6 +-
 mm/huge_memory.c| 14 ++--
 mm/hugetlb.c| 12 ++--
 mm/khugepaged.c |  3 +-
 mm/ksm.c|  9 ++-
 mm/madvise.c|  3 +-
 mm/memory.c | 26 ---
 mm/migrate.c|  5 +-
 mm/mmu_notifier.c   | 12 +++-
 mm/mprotect.c   |  4 +-
 mm/mremap.c |  3 +-
 mm/oom_kill.c   |  3 +-
 mm/rmap.c   |  6 +-
 virt/kvm/kvm_main.c |  3 +-
 22 files changed, 180 insertions(+), 53 deletions(-)

-- 
2.17.2



[PATCH v5 3/9] mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags

2019-02-19 Thread jglisse
From: Jérôme Glisse 

Use an unsigned field for flags other than blockable and convert
the blockable field to be one of those flags.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index e630def131ce..c8672c366f67 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -25,11 +25,13 @@ struct mmu_notifier_mm {
spinlock_t lock;
 };
 
+#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+
 struct mmu_notifier_range {
struct mm_struct *mm;
unsigned long start;
unsigned long end;
-   bool blockable;
+   unsigned flags;
 };
 
 struct mmu_notifier_ops {
@@ -229,7 +231,7 @@ extern void __mmu_notifier_invalidate_range(struct 
mm_struct *mm,
 static inline bool
 mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
 {
-   return range->blockable;
+   return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
 }
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
@@ -275,7 +277,7 @@ static inline void
 mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
if (mm_has_notifiers(range->mm)) {
-   range->blockable = true;
+   range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
__mmu_notifier_invalidate_range_start(range);
}
 }
@@ -284,7 +286,7 @@ static inline int
 mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
 {
if (mm_has_notifiers(range->mm)) {
-   range->blockable = false;
+   range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
return __mmu_notifier_invalidate_range_start(range);
}
return 0;
@@ -331,6 +333,7 @@ static inline void mmu_notifier_range_init(struct 
mmu_notifier_range *range,
range->mm = mm;
range->start = start;
range->end = end;
+   range->flags = 0;
 }
 
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)
\
-- 
2.17.2



[PATCH v5 2/9] mm/mmu_notifier: convert user range->blockable to helper function

2019-02-19 Thread jglisse
From: Jérôme Glisse 

Use the mmu_notifier_range_blockable() helper function instead of
directly dereferencing the range->blockable field. This is done to
make it easier to change the mmu_notifier range field.

This patch is the outcome of the following coccinelle patch:

%<---
@@
identifier I1, FN;
@@
FN(..., struct mmu_notifier_range *I1, ...) {
<...
-I1->blockable
+mmu_notifier_range_blockable(I1)
...>
}
--->%

spatch --in-place --sp-file blockable.spatch --dir .

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c  | 8 
 drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +-
 drivers/gpu/drm/radeon/radeon_mn.c  | 4 ++--
 drivers/infiniband/core/umem_odp.c  | 5 +++--
 drivers/xen/gntdev.c| 6 +++---
 mm/hmm.c| 6 +++---
 mm/mmu_notifier.c   | 2 +-
 virt/kvm/kvm_main.c | 3 ++-
 8 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 3e6823fdd939..58ed401c5996 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct 
mmu_notifier *mn,
/* TODO we should be able to split locking for interval tree and
 * amdgpu_mn_invalidate_node
 */
-   if (amdgpu_mn_read_lock(amn, range->blockable))
+   if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
return -EAGAIN;
 
it = interval_tree_iter_first(>objects, range->start, end);
while (it) {
struct amdgpu_mn_node *node;
 
-   if (!range->blockable) {
+   if (!mmu_notifier_range_blockable(range)) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
@@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct 
mmu_notifier *mn,
/* notification is exclusive, but interval is inclusive */
end = range->end - 1;
 
-   if (amdgpu_mn_read_lock(amn, range->blockable))
+   if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
return -EAGAIN;
 
it = interval_tree_iter_first(>objects, range->start, end);
@@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct 
mmu_notifier *mn,
struct amdgpu_mn_node *node;
struct amdgpu_bo *bo;
 
-   if (!range->blockable) {
+   if (!mmu_notifier_range_blockable(range)) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 1d3f9a31ad61..777b3f8727e7 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
while (it) {
struct drm_i915_gem_object *obj;
 
-   if (!range->blockable) {
+   if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
break;
}
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index b3019505065a..c9bd1278f573 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct 
mmu_notifier *mn,
/* TODO we should be able to split locking for interval tree and
 * the tear down.
 */
-   if (range->blockable)
+   if (mmu_notifier_range_blockable(range))
mutex_lock(>lock);
else if (!mutex_trylock(>lock))
return -EAGAIN;
@@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct 
mmu_notifier *mn,
struct radeon_bo *bo;
long r;
 
-   if (!range->blockable) {
+   if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
goto out_unlock;
}
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 012044f16d1c..3a3f1538d295 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ 

[PATCH v5 1/9] mm/mmu_notifier: helper to test if a range invalidation is blockable

2019-02-19 Thread jglisse
From: Jérôme Glisse 

Simple helpers to test if a range invalidation is blockable. Later
patches use coccinelle to convert all direct dereferences of range->
blockable to use this function instead so that we can convert the
blockable field to an unsigned for more flags.
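
Typical use in a driver callback then becomes (the same pattern the
conversion patch in this series applies tree-wide; my_lock is a
placeholder):

%<
/* Sketch: only sleep for the lock when the invalidation allows blocking. */
if (mmu_notifier_range_blockable(range))
        mutex_lock(&my_lock);
else if (!mutex_trylock(&my_lock))
        return -EAGAIN;
>%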

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 4050ec1c3b45..e630def131ce 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -226,6 +226,12 @@ extern void __mmu_notifier_invalidate_range_end(struct 
mmu_notifier_range *r,
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
  unsigned long start, unsigned long end);
 
+static inline bool
+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+   return range->blockable;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
if (mm_has_notifiers(mm))
@@ -455,6 +461,11 @@ static inline void _mmu_notifier_range_init(struct 
mmu_notifier_range *range,
 #define mmu_notifier_range_init(range, mm, start, end) \
_mmu_notifier_range_init(range, start, end)
 
+static inline bool
+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+   return true;
+}
 
 static inline int mm_has_notifiers(struct mm_struct *mm)
 {
-- 
2.17.2



[RFC PATCH 1/4] uprobes: use set_pte_at() not set_pte_at_notify()

2019-01-31 Thread jglisse
From: Jérôme Glisse 

Using set_pte_at_notify() triggers useless calls to change_pte(), so just
use set_pte_at() instead. The reason is that set_pte_at_notify() should
only be used when going from either a read and write pte to a read only pte
with the same pfn, or from read only to read and write with a different pfn.

set_pte_at_notify() was used because the __replace_page() code came from
the mm/ksm.c code, in which the above rules are valid.

Signed-off-by: Jérôme Glisse 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: k...@vger.kernel.org
---
 kernel/events/uprobes.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 87e76a1dc758..a4807b1edd7f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -207,8 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, 
unsigned long addr,
 
flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
ptep_clear_flush_notify(vma, addr, pvmw.pte);
-   set_pte_at_notify(mm, addr, pvmw.pte,
-   mk_pte(new_page, vma->vm_page_prot));
+   set_pte_at(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot));
 
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
-- 
2.17.1



[RFC PATCH 3/4] mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate

2019-01-31 Thread jglisse
From: Jérôme Glisse 

When notifying a change for a range, use the MMU_NOTIFIER_USE_CHANGE_PTE
flag for page table updates that use set_pte_at_notify() and where we are
going either from read and write to read only with the same pfn or read only
to read and write with a new pfn.

Note that set_pte_at_notify() itself should only be used in rare cases,
ie we do not want to use it when we are updating a significant range of
virtual addresses and thus a significant number of ptes. Instead, for
those cases, the event provided to the mmu notifier invalidate_range_start()
callback should be used for optimization.

Signed-off-by: Jérôme Glisse 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Andrew Morton 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
---
 include/linux/mmu_notifier.h | 13 +
 mm/ksm.c |  6 --
 mm/memory.c  |  3 ++-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d7a35975c2bd..0885bf33dc9c 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -43,6 +43,19 @@ enum mmu_notifier_event {
 };
 
 #define MMU_NOTIFIER_EVENT_BITS order_base_2(MMU_NOTIFY_EVENT_MAX)
+/*
+ * Set MMU_NOTIFIER_USE_CHANGE_PTE only when the page table it updated with the
+ * set_pte_at_notify() and when pte is updated from read and write to read only
+ * with same pfn or from read only to read and write with different pfn. It is
+ * illegal to set in any other circumstances.
+ *
+ * Note that set_pte_at_notify() should not be use outside of the above cases.
+ * When updating a range in batch (like write protecting a range) it is better
+ * to rely on invalidate_range_start() and struct mmu_notifier_range to infer
+ * the kind of update that is happening (as an example you can look at the
+ * mmu_notifier_range_update_to_read_only() function).
+ */
+#define MMU_NOTIFIER_USE_CHANGE_PTE (1 << MMU_NOTIFIER_EVENT_BITS)
 
 #ifdef CONFIG_MMU_NOTIFIER
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 97757c5fa15f..b7fb7b560cc0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1051,7 +1051,8 @@ static int write_protect_page(struct vm_area_struct *vma, 
struct page *page,
 
BUG_ON(PageTransCompound(page));
 
-   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, mm,
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR |
+   MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm,
pvmw.address,
pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start();
@@ -1140,7 +1141,8 @@ static int replace_page(struct vm_area_struct *vma, 
struct page *page,
if (!pmd)
goto out;
 
-   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, mm, addr,
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR |
+   MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm, addr,
addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start();
 
diff --git a/mm/memory.c b/mm/memory.c
index a8c6922526f6..daf4b0f92af8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2275,7 +2275,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 
__SetPageUptodate(new_page);
 
-   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, mm,
+   mmu_notifier_range_init(, MMU_NOTIFY_CLEAR |
+   MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm,
vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start();
-- 
2.17.1



[RFC PATCH 2/4] mm/mmu_notifier: use unsigned for event field in range struct

2019-01-31 Thread jglisse
From: Jérôme Glisse 

Use unsigned for the event field in the range struct so that we can also
set flags with the event. This patch changes the field and introduces the
helper.

Signed-off-by: Jérôme Glisse 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Andrew Morton 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
---
 include/linux/mmu_notifier.h | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index be873c431886..d7a35975c2bd 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mmu_notifier;
 struct mmu_notifier_ops;
@@ -38,8 +39,11 @@ enum mmu_notifier_event {
MMU_NOTIFY_PROTECTION_VMA,
MMU_NOTIFY_PROTECTION_PAGE,
MMU_NOTIFY_SOFT_DIRTY,
+   MMU_NOTIFY_EVENT_MAX
 };
 
+#define MMU_NOTIFIER_EVENT_BITS order_base_2(MMU_NOTIFY_EVENT_MAX)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -60,7 +64,7 @@ struct mmu_notifier_range {
struct mm_struct *mm;
unsigned long start;
unsigned long end;
-   enum mmu_notifier_event event;
+   unsigned event;
bool blockable;
 };
 
@@ -352,7 +356,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct 
*mm)
 
 
 static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
-  enum mmu_notifier_event event,
+  unsigned event,
   struct vm_area_struct *vma,
   struct mm_struct *mm,
   unsigned long start,
-- 
2.17.1



[RFC PATCH 4/4] kvm/mmu_notifier: re-enable the change_pte() optimization.

2019-01-31 Thread jglisse
From: Jérôme Glisse 

Since the changes to mmu notifier the change_pte() optimization was lost
for kvm. This re-enables it whenever a pte goes from read and write to
read only with the same pfn, or from read only to read and write with a
different pfn.

It is safe to update the secondary MMUs, because the primary MMU
pte invalidate must have already happened with a ptep_clear_flush()
before set_pte_at_notify() is invoked (and thus before change_pte()
callback).

Signed-off-by: Jérôme Glisse 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Andrew Morton 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
---
 virt/kvm/kvm_main.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5ecea812cb6a..fec155c2d7b8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -369,6 +369,14 @@ static int kvm_mmu_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
int need_tlb_flush = 0, idx;
int ret;
 
+	/*
+	 * Nothing to do when MMU_NOTIFIER_USE_CHANGE_PTE is set as it means
+	 * that change_pte() will be called and it is a situation in which we
+	 * can rely solely on change_pte().
+	 */
+	if (range->event & MMU_NOTIFIER_USE_CHANGE_PTE)
+		return 0;
+
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
/*
@@ -398,6 +406,14 @@ static void kvm_mmu_notifier_invalidate_range_end(struct 
mmu_notifier *mn,
 {
struct kvm *kvm = mmu_notifier_to_kvm(mn);
 
+	/*
+	 * Nothing to do when MMU_NOTIFIER_USE_CHANGE_PTE is set as it means
+	 * that change_pte() will be called and it is a situation in which we
+	 * can rely solely on change_pte().
+	 */
+	if (range->event & MMU_NOTIFIER_USE_CHANGE_PTE)
+		return;
+
	spin_lock(&kvm->mmu_lock);
/*
 * This sequence increase will notify the kvm page fault that
-- 
2.17.1



[RFC PATCH 0/4] Restore change_pte optimization to its former glory

2019-01-31 Thread jglisse
From: Jérôme Glisse 

This patchset is on top of my patchset to add context information to
mmu notifier [1]; you can find a branch with everything in [2]. I have
not tested it but I wanted to get the discussion started. I believe it
is correct but I am not sure what kind of kvm test I can run to exercise
this.

The idea is that since kvm invalidates the secondary MMUs within the
invalidate_range callback, the change_pte() optimization is lost.
With this patchset, every time core mm uses set_pte_at_notify() and
thus change_pte() gets called, we can ignore the invalidate_range
callback altogether and rely only on the change_pte callback.

Note that this is only valid when either going from a read and write
pte to a read only pte with the same pfn, or from a read only pte to a
read and write pte with a different pfn. The other side of the story
is that the primary mmu pte is cleared with ptep_clear_flush_notify
before the call to change_pte.

Also, with the mmu notifier context information [1] you can further
optimize other cases like mprotect or write protect when forking. You
can use the new context information to infer that the invalidation is
for a read only update of the primary mmu and update the secondary mmu
accordingly, instead of clearing it and forcing a fault even for read
access. I do not know if that is an optimization that would bear any
fruit for kvm; it does help for device drivers. You can also optimize
the soft dirty update.
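
To make the intended secondary-MMU behavior concrete, here is a minimal
sketch (not part of this series, driver name made up) of how a
secondary-MMU user could test the flag in its invalidate_range_start()
callback and skip the heavy invalidation when a change_pte() call is
guaranteed to follow:

#include <linux/mmu_notifier.h>

/* Sketch only: "mydrv" is a hypothetical secondary-MMU user. */
static int mydrv_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	/*
	 * When MMU_NOTIFIER_USE_CHANGE_PTE is set the core mm promises a
	 * matching change_pte() call, so the secondary page table can be
	 * updated there instead of being torn down here.
	 */
	if (range->event & MMU_NOTIFIER_USE_CHANGE_PTE)
		return 0;

	/* ... otherwise do the usual full invalidation of [start, end) ... */
	return 0;
}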

Cheers,
Jérôme


[1] 
https://lore.kernel.org/linux-fsdevel/20190123222315.1122-1-jgli...@redhat.com/T/#m69e8f589240e18acbf196a1c8aa1d6fc97bd3565
[2] https://cgit.freedesktop.org/~glisse/linux/log/?h=kvm-restore-change_pte

Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: k...@vger.kernel.org

Jérôme Glisse (4):
  uprobes: use set_pte_at() not set_pte_at_notify()
  mm/mmu_notifier: use unsigned for event field in range struct
  mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where
appropriate
  kvm/mmu_notifier: re-enable the change_pte() optimization.

 include/linux/mmu_notifier.h | 21 +++--
 kernel/events/uprobes.c  |  3 +--
 mm/ksm.c |  6 --
 mm/memory.c  |  3 ++-
 virt/kvm/kvm_main.c  | 16 
 5 files changed, 42 insertions(+), 7 deletions(-)

-- 
2.17.1



[RFC PATCH 4/5] mm/hmm: add support for peer to peer to HMM device memory

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: io...@lists.linux-foundation.org
---
 include/linux/hmm.h | 47 +
 mm/hmm.c| 63 +
 2 files changed, 105 insertions(+), 5 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4a1454e3efba..7a3ac182cc48 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -710,6 +710,53 @@ struct hmm_devmem_ops {
 const struct page *page,
 unsigned int flags,
 pmd_t *pmdp);
+
+   /*
+* p2p_map() - map pages for peer to peer between devices
+* @devmem: device memory structure (see struct hmm_devmem)
+* @range: range of virtual addresses that is being mapped
+* @device: device the range is being mapped to
+* @addr: first virtual address in the range to consider
+* @pas: device addresses (where the actual mapping is stored)
+* Returns: number of pages successfully mapped, 0 otherwise
+*
+* Map pages belonging to devmem to another device for peer to peer
+* access. The device can decide not to map, in which case the memory
+* will be migrated to main memory.
+*
+* Also there is no guarantee that all the pages in the range belong
+* to the devmem, so it is up to the function to check that every
+* single page does belong to devmem.
+*
+* Note that for now we do not care about the exact error, so on
+* failure the function should just return 0.
+*/
+   long (*p2p_map)(struct hmm_devmem *devmem,
+   struct hmm_range *range,
+   struct device *device,
+   unsigned long addr,
+   dma_addr_t *pas);
+
+   /*
+* p2p_unmap() - unmap pages from peer to peer between devices
+* @devmem: device memory structure (see struct hmm_devmem)
+* @range: range of virtual addresses that is being unmapped
+* @device: device the range was mapped to
+* @addr: first virtual address in the range to consider
+* @pas: device addresses (where the actual mapping is stored)
+* Returns: number of pages successfully unmapped, 0 otherwise
+*
+* Unmap pages belonging to devmem previously mapped with p2p_map().
+*
+* Note there is no guarantee that all the pages in the range belong
+* to the devmem, so it is up to the function to check that every
+* single page does belong to devmem.
+*/
+   unsigned long (*p2p_unmap)(struct hmm_devmem *devmem,
+  struct hmm_range *range,
+  struct device *device,
+  unsigned long addr,
+  dma_addr_t *pas);
 };
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index 1a444885404e..fd49b1e116d0 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1193,16 +1193,19 @@ long hmm_range_dma_map(struct hmm_range *range,
   dma_addr_t *daddrs,
   bool block)
 {
-   unsigned long i, npages, mapped, page_size;
+   unsigned long i, npages, mapped, page_size, addr;
long ret;
 
+again:
ret = hmm_range_fault(range, block);
if (ret <= 0)
return ret ? ret : -EBUSY;
 
+   mapped = 0;
+   addr = range->start;
page_size = hmm_range_page_size(range);
npages = (range->end - range->start) >> range->page_shift;
-   for (i = 0, mapped = 0; i < npages; ++i) {
+   for (i = 0; i < npages; ++i, addr += page_size) {
enum dma_data_direction dir = DMA_FROM_DEVICE;
struct page *page;
 
@@ -1226,6 +1229,29 @@ long hmm_range_dma_map(struct hmm_range *range,
goto unmap;
}
 
+   if (is_device_private_page(page)) {
+   struct hmm_devmem *devmem = page->pgmap->data;
+
+   if (!devmem->ops->p2p_map || !devmem->ops->p2p_unmap) {
+   /* Fall-back to main memory. */
+   range->default_flags |=
+   range->flags[HMM_PFN_DEVICE_PRIVATE];
+   goto again;
+   }
+
+   ret = devmem->ops->p2p_map(devmem, range, device,
+  addr, daddrs);
+   if (ret <= 0) {
+   /* Fall-back to main memory. */
+   range->default_flags |=
+

[RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Allow mmap of device file to export device memory to peer to peer
devices. This will allow for instance a network device to access a
GPU memory or to access a storage device queue directly.

The common case will be a vma created by a userspace device driver
that is then shared with another userspace device driver, which calls
into its kernel device driver to map that vma.

The vma does not need to have any valid CPU mapping, so that only
peer to peer devices might access its content. Or it could have a
valid CPU mapping too, in which case it should point to the same
memory for consistency.

Note that peer to peer mapping is highly platform and device
dependent and it might not work in all cases. However we do
expect support for this to grow on more hardware platforms.

This patch only adds new callbacks to vm_operations_struct; the bulk
of the code lives within the common bus driver (like pci) and the
device drivers (both the exporting and the importing device).

The current design mandates that the importer must obey mmu_notifier
and invalidate any peer to peer mapping anytime a notification of
invalidation happens for a range that has been peer to peer mapped.
This allows the exporting device to easily invalidate a mapping for
any importing device.
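
For illustration only, here is a rough sketch (the importer-side
function is made up, the callback prototype is the one added below) of
how an importing driver would use the new callback and honour the rules
above:

#include <linux/mm.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/errno.h>

/* Sketch only: "importer_map_peer_vma" is a hypothetical importing driver. */
static long importer_map_peer_vma(struct vm_area_struct *vma,
				  struct device *importer_dev,
				  unsigned long start, unsigned long end,
				  dma_addr_t *pa, bool write)
{
	/* The exporter did not opt in to peer to peer for this vma. */
	if (!vma->vm_ops || !vma->vm_ops->p2p_map)
		return -EOPNOTSUPP;

	/*
	 * The exporter decides whether to map. On success the importer must
	 * call p2p_unmap() for the same range later, and must also unmap on
	 * any mmu_notifier invalidation covering [start, end).
	 */
	return vma->vm_ops->p2p_map(vma, importer_dev, start, end, pa, write);
}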

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-kernel@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: io...@lists.linux-foundation.org
---
 include/linux/mm.h | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..1bd60a90e575 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -429,6 +429,44 @@ struct vm_operations_struct {
pgoff_t start_pgoff, pgoff_t end_pgoff);
unsigned long (*pagesize)(struct vm_area_struct * area);
 
+   /*
+* Optional for device drivers that want to allow peer to peer (p2p)
+* mapping of their vma (which can be backed by some device memory) to
+* another device.
+*
+* Note that the exporting device driver might not have mapped anything
+* inside the vma for the CPU but might still want to allow a peer
+* device to access the range of memory corresponding to a range in
+* that vma.
+*
+* FOR PREDICTABILITY, IF A DRIVER SUCCESSFULLY MAPS A RANGE ONCE FOR A
+* DEVICE THEN FURTHER MAPPINGS OF THE SAME RANGE (IF THE VMA IS STILL
+* VALID) SHOULD ALSO SUCCEED. Following this rule allows the importing
+* device to map once during setup and report any failure at that time
+* to userspace. Further mappings of the same range might happen after
+* an mmu notifier invalidation over the range. The exporting device
+* can use this to move things around (defrag BAR space for instance)
+* or do other similar tasks.
+*
+* THE IMPORTER MUST OBEY mmu_notifier NOTIFICATIONS AND CALL p2p_unmap()
+* WHEN A NOTIFIER IS CALLED FOR THE RANGE! THIS CAN HAPPEN AT ANY
+* POINT IN TIME WITH NO LOCK HELD.
+*
+* In the functions below, the device argument is the importing device;
+* the exporting device is the device to which the vma belongs.
+*/
+   long (*p2p_map)(struct vm_area_struct *vma,
+   struct device *device,
+   unsigned long start,
+   unsigned long end,
+   dma_addr_t *pa,
+   bool write);
+   long (*p2p_unmap)(struct vm_area_struct *vma,
+ struct device *device,
+ unsigned long start,
+ unsigned long end,
+ dma_addr_t *pa);
+
/* notification that a previously read-only page is about to become
 * writable, if an error is returned it will cause a SIGBUS */
vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
-- 
2.17.2



[RFC PATCH 5/5] mm/hmm: add support for peer to peer to special device vma

2019-01-29 Thread jglisse
From: Jérôme Glisse 

A special device vma (mmap of a device file) can correspond to a device
driver object that some device driver might want to share with another
device (giving it access). This adds support for HMM to map those
special device vma if the owning device (exporter) allows it.

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: io...@lists.linux-foundation.org
---
 include/linux/hmm.h |   6 ++
 mm/hmm.c| 156 ++--
 2 files changed, 128 insertions(+), 34 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 7a3ac182cc48..98ebe9f52432 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -137,6 +137,7 @@ enum hmm_pfn_flag_e {
  *  result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should 
not
  *  be mirrored by a device, because the entry will never have 
HMM_PFN_VALID
  *  set and the pfn value is undefined.
+ * HMM_PFN_P2P: this entry has been mapped as P2P, ie the dma address is valid
  *
  * Driver provide entry value for none entry, error entry and special entry,
  * driver can alias (ie use same value for error and special for instance). It
@@ -151,6 +152,7 @@ enum hmm_pfn_value_e {
HMM_PFN_ERROR,
HMM_PFN_NONE,
HMM_PFN_SPECIAL,
+   HMM_PFN_P2P,
HMM_PFN_VALUE_MAX
 };
 
@@ -250,6 +252,8 @@ static inline bool hmm_range_valid(struct hmm_range *range)
 static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
   uint64_t pfn)
 {
+   if (pfn == range->values[HMM_PFN_P2P])
+   return NULL;
if (pfn == range->values[HMM_PFN_NONE])
return NULL;
if (pfn == range->values[HMM_PFN_ERROR])
@@ -270,6 +274,8 @@ static inline struct page *hmm_pfn_to_page(const struct 
hmm_range *range,
 static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
   uint64_t pfn)
 {
+   if (pfn == range->values[HMM_PFN_P2P])
+   return -1UL;
if (pfn == range->values[HMM_PFN_NONE])
return -1UL;
if (pfn == range->values[HMM_PFN_ERROR])
diff --git a/mm/hmm.c b/mm/hmm.c
index fd49b1e116d0..621a4f831483 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1058,37 +1058,36 @@ long hmm_range_snapshot(struct hmm_range *range)
 }
 EXPORT_SYMBOL(hmm_range_snapshot);
 
-/*
- * hmm_range_fault() - try to fault some address in a virtual address range
- * @range: range being faulted
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: 0 on success ortherwise:
- *  -EINVAL:
- *  Invalid argument
- *  -ENOMEM:
- *  Out of memory.
- *  -EPERM:
- *  Invalid permission (for instance asking for write and range
- *  is read only).
- *  -EAGAIN:
- *  If you need to retry and mmap_sem was drop. This can only
- *  happens if block argument is false.
- *  -EBUSY:
- *  If the the range is being invalidated and you should wait for
- *  invalidation to finish.
- *  -EFAULT:
- *  Invalid (ie either no valid vma or it is illegal to access that
- *  range), number of valid pages in range->pfns[] (from range 
start
- *  address).
- *
- * This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs
- * and caller does not ask for migration.
- *
- * On error, for one virtual address in the range, the function will mark the
- * corresponding HMM pfn entry with an error flag.
- */
-long hmm_range_fault(struct hmm_range *range, bool block)
+static int hmm_vma_p2p_map(struct hmm_range *range, struct vm_area_struct *vma,
+  unsigned long start, unsigned long end,
+  struct device *device, dma_addr_t *pas)
+{
+   struct hmm_vma_walk hmm_vma_walk;
+   unsigned long npages, i;
+   bool fault, write;
+   uint64_t *pfns;
+   int ret;
+
+	i = (start - range->start) >> PAGE_SHIFT;
+	npages = (end - start) >> PAGE_SHIFT;
+	pfns = &range->pfns[i];
+	pas = &pas[i];
+
+	hmm_vma_walk.range = range;
+	hmm_vma_walk.fault = true;
+	hmm_range_need_fault(&hmm_vma_walk, pfns, npages,
+			     0, &fault, &write);
+
+   ret = vma->vm_ops->p2p_map(vma, device, start, end, pas, write);
+   for (i = 0; i < npages; ++i) {
+   pfns[i] = ret ? range->values[HMM_PFN_ERROR] :
+ range->values[HMM_PFN_P2P];
+   }
+   return ret;
+}
+
+static long 

[RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability

2019-01-29 Thread jglisse
From: Jérôme Glisse 

pci_test_p2p() returns true if two devices can peer to peer to
each other. We add a generic function as different inter-connects
can support peer to peer and we want to generically test this no
matter what the inter-connect might be. However this version only
supports PCIE for now.

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-kernel@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: io...@lists.linux-foundation.org
---
 drivers/pci/p2pdma.c   | 27 +++
 include/linux/pci-p2pdma.h |  6 ++
 2 files changed, 33 insertions(+)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index c52298d76e64..620ac60babb5 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -797,3 +797,30 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev 
*p2p_dev,
return sprintf(page, "%s\n", pci_name(p2p_dev));
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
+
+bool pci_test_p2p(struct device *devA, struct device *devB)
+{
+   struct pci_dev *pciA, *pciB;
+   bool ret;
+   int tmp;
+
+   /*
+* For now we only support PCIE peer to peer but other inter-connect
+* can be added.
+*/
+   pciA = find_parent_pci_dev(devA);
+   pciB = find_parent_pci_dev(devB);
+   if (pciA == NULL || pciB == NULL) {
+   ret = false;
+   goto out;
+   }
+
+   tmp = upstream_bridge_distance(pciA, pciB, NULL);
+   ret = tmp < 0 ? false : true;
+
+out:
+   pci_dev_put(pciB);
+   pci_dev_put(pciA);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_test_p2p);
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index bca9bc3e5be7..7671cc499a08 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -36,6 +36,7 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev 
**p2p_dev,
bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
   bool use_p2pdma);
+bool pci_test_p2p(struct device *devA, struct device *devB);
 #else /* CONFIG_PCI_P2PDMA */
 static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
size_t size, u64 offset)
@@ -97,6 +98,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
 {
return sprintf(page, "none\n");
 }
+
+static inline bool pci_test_p2p(struct device *devA, struct device *devB)
+{
+   return false;
+}
 #endif /* CONFIG_PCI_P2PDMA */
 
 
-- 
2.17.2



[RFC PATCH 2/5] drivers/base: add a function to test peer to peer capability

2019-01-29 Thread jglisse
From: Jérôme Glisse 

device_test_p2p() returns true if two devices can peer to peer to
each other. We add a generic function as different inter-connects
can support peer to peer and we want to generically test this no
matter what the inter-connect might be. However this version only
supports PCIE for now.
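
As an illustration (not part of this patch), a hypothetical importer
could gate its peer to peer setup path on the new helper and fall back
to system memory otherwise:

#include <linux/device.h>
#include <linux/errno.h>

/* Sketch only: gate a hypothetical p2p setup path on device_test_p2p(). */
static int importer_setup_p2p(struct device *importer, struct device *exporter)
{
	if (!device_test_p2p(importer, exporter))
		return -EOPNOTSUPP;	/* caller falls back to system memory */

	/* ... proceed with peer to peer mapping of the exporter's memory ... */
	return 0;
}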

Signed-off-by: Jérôme Glisse 
Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-kernel@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: io...@lists.linux-foundation.org
---
 drivers/base/core.c| 20 
 include/linux/device.h |  1 +
 2 files changed, 21 insertions(+)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 0073b09bb99f..56023b00e108 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "base.h"
 #include "power/power.h"
@@ -3167,3 +3168,22 @@ void device_set_of_node_from_dev(struct device *dev, 
const struct device *dev2)
dev->of_node_reused = true;
 }
 EXPORT_SYMBOL_GPL(device_set_of_node_from_dev);
+
+/**
+ * device_test_p2p - test if two device can peer to peer to each other
+ * @devA: device A
+ * @devB: device B
+ * Returns: true if device can peer to peer to each other, false otherwise
+ */
+bool device_test_p2p(struct device *devA, struct device *devB)
+{
+   /*
+* For now we only support PCIE peer to peer but other inter-connect
+* can be added.
+*/
+   if (pci_test_p2p(devA, devB))
+   return true;
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(device_test_p2p);
diff --git a/include/linux/device.h b/include/linux/device.h
index 6cb4640b6160..0d532d7f0779 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1250,6 +1250,7 @@ extern int device_online(struct device *dev);
 extern void set_primary_fwnode(struct device *dev, struct fwnode_handle 
*fwnode);
 extern void set_secondary_fwnode(struct device *dev, struct fwnode_handle 
*fwnode);
 void device_set_of_node_from_dev(struct device *dev, const struct device 
*dev2);
+bool device_test_p2p(struct device *devA, struct device *devB);
 
 static inline int dev_num_vf(struct device *dev)
 {
-- 
2.17.2



[RFC PATCH 0/5] Device peer to peer (p2p) through vma

2019-01-29 Thread jglisse
From: Jérôme Glisse 

This patchset adds support for peer to peer between devices in two manners.
First, for device memory used through HMM in a process's regular address
space (ie inside a regular vma that is not an mmap of a device file or
special file). Second, for special vma, ie an mmap of a device file; in
this case some device drivers might want to allow other devices to directly
access the memory used for those special vma (note that the memory might
not even be mapped to the CPU in this case).

There are many use cases for this; they mainly fall into 2 categories:
[A]-Allow a device to directly map and control another device's command
    queue.

[B]-Allow a device to access another device's memory without disrupting
    the other device's computation.

Corresponding workloads:

[1]-A network device directly accesses and controls a block device command
    queue so that it can do storage access without involving the CPU.
    This falls into [A].
[2]-An accelerator device is doing heavy computation and a network device
    is monitoring progress. Direct access to the accelerator's memory by
    the network device avoids the need to use much slower system memory.
    This falls into [B].
[3]-An accelerator device is doing heavy computation and a network device
    is streaming out the result. This avoids the need to first bounce the
    result through system memory (it saves both system memory and
    bandwidth). This falls into [B].
[4]-Chaining device computation. For instance a camera device takes a
    picture, streams it to a color correction device, which streams it
    to final memory. This falls into [A and B].

People have more ideas on how to use this than I can list here. The
intention of this patchset is to provide the means to achieve those
and much more.

I have done testing using nouveau and Mellanox mlx5 where the mlx5
device can directly access GPU memory [1]. I intend to use this inside
nouveau and to help port AMD ROCm RDMA to use this [2]. I believe
other people have expressed interest in working on using this with
network devices and block devices.

From an implementation point of view this just adds 2 new callbacks to
the vm_operations struct (for special device vma support) and 2 new
callbacks to the HMM device memory structure for HMM device memory
support.

For now it needs the IOMMU off with ACS disabled and both devices to
be on the same PCIE sub-tree (it cannot cross the root complex). However
the intention here is different from some other peer to peer work in
that we do want to support the IOMMU and are fine with going through
the root complex in that case. In other words, the bandwidth advantage
of avoiding the root complex is of less importance than the programming
model for the feature. We do actually expect that this will be used
mostly with the IOMMU enabled and thus with having to go through the
root bridge.

Another difference from other p2p solutions is that we do require that
the importing device abide by mmu notifier invalidation so that the
exporting device can always invalidate a mapping at any point in time.
For this reason we do not need a struct page for the device memory.

Also, in all cases the policy and final decision on whether to map
or not is solely under the control of the exporting device.

Finally, the device memory might not even be mapped to the CPU and thus
we have to go through the exporting device driver to get the physical
address at which the memory is accessible.

The core changes are minimal (adding new callbacks to some structs).
IOMMU support will need little change too. Most of the code is in the
drivers to implement export policy and BAR space management. A very
rough playground with IOMMU support is in [3] (top 3 patches).

Cheers,
Jérôme

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-p2p
[2] https://github.com/RadeonOpenCompute/ROCnRDMA
[3] https://cgit.freedesktop.org/~glisse/linux/log/?h=wip-hmm-p2p

Cc: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Bjorn Helgaas 
Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: linux-...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Cc: Joerg Roedel 
Cc: io...@lists.linux-foundation.org

Jérôme Glisse (5):
  pci/p2p: add a function to test peer to peer capability
  drivers/base: add a function to test peer to peer capability
  mm/vma: add support for peer to peer to device vma
  mm/hmm: add support for peer to peer to HMM device memory
  mm/hmm: add support for peer to peer to special device vma

 drivers/base/core.c|  20 
 drivers/pci/p2pdma.c   |  27 +
 include/linux/device.h |   1 +
 include/linux/hmm.h|  53 +
 include/linux/mm.h |  38 +++
 include/linux/pci-p2pdma.h |   6 +
 mm/hmm.c   | 219 ++---
 7 files changed, 325 insertions(+), 39 deletions(-)

-- 
2.17.2



[PATCH 1/1] RDMA/odp: convert to use HMM for ODP

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Convert ODP to use HMM so that we can build on common infrastructure
for different classes of devices that want to mirror a process address
space into a device. There are no functional changes.

Signed-off-by: Jérôme Glisse 
Cc: linux-r...@vger.kernel.org
Cc: Jason Gunthorpe 
Cc: Leon Romanovsky 
Cc: Doug Ledford 
Cc: Artemy Kovalyov 
Cc: Moni Shoua 
Cc: Mike Marciniszyn 
Cc: Kaike Wan 
Cc: Dennis Dalessandro 
---
 drivers/infiniband/core/umem_odp.c | 483 -
 drivers/infiniband/hw/mlx5/mem.c   |  20 +-
 drivers/infiniband/hw/mlx5/mr.c|   2 +-
 drivers/infiniband/hw/mlx5/odp.c   |  95 +++---
 include/rdma/ib_umem_odp.h |  54 +---
 5 files changed, 202 insertions(+), 452 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index a4ec43093cb3..8afa707f1d9a 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -45,6 +45,20 @@
 #include 
 #include 
 
+
+static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = {
+   ODP_READ_BIT,   /* HMM_PFN_VALID */
+   ODP_WRITE_BIT,  /* HMM_PFN_WRITE */
+   ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */
+};
+
+static uint64_t odp_hmm_values[HMM_PFN_VALUE_MAX] = {
+   -1UL,   /* HMM_PFN_ERROR */
+   0UL,/* HMM_PFN_NONE */
+   -2UL,   /* HMM_PFN_SPECIAL */
+};
+
+
 /*
  * The ib_umem list keeps track of memory regions for which the HW
  * device request to receive notification when the related memory
@@ -77,57 +91,25 @@ static u64 node_last(struct umem_odp_node *n)
 INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
 node_start, node_last, static, rbt_ib_umem)
 
-static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
-{
-   mutex_lock(_odp->umem_mutex);
-   if (umem_odp->notifiers_count++ == 0)
-   /*
-* Initialize the completion object for waiting on
-* notifiers. Since notifier_count is zero, no one should be
-* waiting right now.
-*/
-   reinit_completion(_odp->notifier_completion);
-   mutex_unlock(_odp->umem_mutex);
-}
-
-static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
-{
-   mutex_lock(_odp->umem_mutex);
-   /*
-* This sequence increase will notify the QP page fault that the page
-* that is going to be mapped in the spte could have been freed.
-*/
-   ++umem_odp->notifiers_seq;
-   if (--umem_odp->notifiers_count == 0)
-   complete_all(_odp->notifier_completion);
-   mutex_unlock(_odp->umem_mutex);
-}
-
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
   u64 start, u64 end, void *cookie)
 {
struct ib_umem *umem = _odp->umem;
 
-   /*
-* Increase the number of notifiers running, to
-* prevent any further fault handling on this MR.
-*/
-   ib_umem_notifier_start_account(umem_odp);
umem_odp->dying = 1;
/* Make sure that the fact the umem is dying is out before we release
 * all pending page faults. */
smp_wmb();
-   complete_all(_odp->notifier_completion);
umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
ib_umem_end(umem));
return 0;
 }
 
-static void ib_umem_notifier_release(struct mmu_notifier *mn,
-struct mm_struct *mm)
+static void ib_umem_notifier_release(struct hmm_mirror *mirror)
 {
-   struct ib_ucontext_per_mm *per_mm =
-   container_of(mn, struct ib_ucontext_per_mm, mn);
+   struct ib_ucontext_per_mm *per_mm;
+
+   per_mm = container_of(mirror, struct ib_ucontext_per_mm, mirror);
 
down_read(_mm->umem_rwsem);
if (per_mm->active)
@@ -135,21 +117,24 @@ static void ib_umem_notifier_release(struct mmu_notifier 
*mn,
_mm->umem_tree, 0, ULLONG_MAX,
ib_umem_notifier_release_trampoline, true, NULL);
up_read(_mm->umem_rwsem);
+
+   per_mm->mm = NULL;
 }
 
-static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
-u64 start, u64 end, void *cookie)
+static int invalidate_range_trampoline(struct ib_umem_odp *item,
+  u64 start, u64 end, void *cookie)
 {
-   ib_umem_notifier_start_account(item);
item->umem.context->invalidate_range(item, start, end);
return 0;
 }
 
-static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
-   const struct mmu_notifier_range *range)
+static int ib_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
+   const struct hmm_update *range)
 {
-   struct ib_ucontext_per_mm *per_mm =
-   container_of(mn, struct 

[RFC PATCH 0/1] Use HMM for ODP

2019-01-29 Thread jglisse
From: Jérôme Glisse 

This patchset converts RDMA ODP to use HMM underneath. This is motivated
by stronger code sharing for the same feature (Shared Virtual Memory SVM
or Shared Virtual Address SVA) and also stronger integration with mm code
to achieve that. It depends on the HMM patchset posted for inclusion in
5.1, so the earliest target for this should be 5.2. I welcome any testing
people can do on this.

Moreover there are some features of HMM in the works like peer to peer
support, fast CPU page table snapshot, fast IOMMU mapping update ...
It will be easier for RDMA devices with ODP to leverage those if they
use HMM underneath.

Quick summary of what HMM is:
HMM is a toolbox for device drivers to implement software support for
Shared Virtual Memory (SVM). Not only does it provide helpers to mirror
a process address space on a device (hmm_mirror), it also provides
helpers to allow using device memory to back regular valid virtual
addresses of a process (any valid mmap that is not an mmap of a device
or a DAX mapping). There are two kinds of device memory: private memory
that is not accessible to the CPU because it does not have all the
expected properties (this is the case for all PCIE devices), or public
memory which can also be accessed by the CPU without restriction (with
OpenCAPI or CCIX or a similar cache-coherent and atomic inter-connect).

Device drivers can use each of the HMM tools separately. You do not have
to use all the tools it provides.

For RDMA devices I do not expect a need to use the device memory support
of HMM. This device memory support is geared toward accelerators like GPUs.
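
To give a feel for the mirror side of the toolbox, here is a rough
sketch (driver name made up, and it assumes the hmm_mirror_ops layout
used by this series, ie sync_cpu_device_pagetables() and release()) of
what a device driver registers:

#include <linux/hmm.h>

/* Sketch only: "mydrv" is a hypothetical driver mirroring a process. */
static int mydrv_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					    const struct hmm_update *update)
{
	/* Invalidate the device page table for [update->start, update->end). */
	return 0;
}

static void mydrv_release(struct hmm_mirror *mirror)
{
	/* The mm is going away; stop all device access to it. */
}

static const struct hmm_mirror_ops mydrv_mirror_ops = {
	.sync_cpu_device_pagetables = mydrv_sync_cpu_device_pagetables,
	.release = mydrv_release,
};

/* Registration is then: mirror->ops = &mydrv_mirror_ops;		*/
/*			 hmm_mirror_register(mirror, current->mm);	*/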


You can find a branch [1] with all the prerequisites in it. This patch is
on top of 5.0rc2+ but I can rebase it on any specific branch before it is
considered for inclusion (5.2 at best).

Questions and reviews are more than welcome.

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm
[2] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-for-5.1

Cc: linux-r...@vger.kernel.org
Cc: Jason Gunthorpe 
Cc: Leon Romanovsky 
Cc: Doug Ledford 
Cc: Artemy Kovalyov 
Cc: Moni Shoua 
Cc: Mike Marciniszyn 
Cc: Kaike Wan 
Cc: Dennis Dalessandro 

Jérôme Glisse (1):
  RDMA/odp: convert to use HMM for ODP

 drivers/infiniband/core/umem_odp.c | 483 -
 drivers/infiniband/hw/mlx5/mem.c   |  20 +-
 drivers/infiniband/hw/mlx5/mr.c|   2 +-
 drivers/infiniband/hw/mlx5/odp.c   |  95 +++---
 include/rdma/ib_umem_odp.h |  54 +---
 5 files changed, 202 insertions(+), 452 deletions(-)

-- 
2.17.2



[PATCH 06/10] mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays.

2019-01-29 Thread jglisse
From: Jérôme Glisse 

The HMM mirror API can be used in two fashions. The first one where the HMM
user coalesces multiple page faults into one request and sets flags per pfn
for those faults. The second one where the HMM user wants to pre-fault a
range with specific flags. For the latter it is a waste to have the user
pre-fill the pfn array with a default flags value.

This patch adds a default flags value allowing the user to set them for a
range without having to pre-fill the pfn array.
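
For example, a driver that wants to pre-fault a whole range as readable
and writable can now do something like the following sketch (driver
function name made up) instead of pre-filling pfns[]:

#include <linux/hmm.h>

/* Sketch only: pre-fault an entire range for write without touching pfns[]. */
static long mydrv_prefault_range(struct hmm_range *range, bool block)
{
	/* Ignore whatever is already in range->pfns[] ... */
	range->pfn_flags_mask = 0;
	/* ... and request valid + writable for every page in the range. */
	range->default_flags = range->flags[HMM_PFN_VALID] |
			       range->flags[HMM_PFN_WRITE];

	return hmm_range_fault(range, block);
}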

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 include/linux/hmm.h |  7 +++
 mm/hmm.c| 12 
 2 files changed, 19 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 93dc88edc293..4263f8fb32e5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -165,6 +165,8 @@ enum hmm_pfn_value_e {
  * @pfns: array of pfns (big enough for the range)
  * @flags: pfn flags to match device driver page table
  * @values: pfn value for some special case (none, special, error, ...)
+ * @default_flags: default flags for the range (write, read, ...)
+ * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
@@ -177,6 +179,8 @@ struct hmm_range {
uint64_t*pfns;
const uint64_t  *flags;
const uint64_t  *values;
+   uint64_tdefault_flags;
+   uint64_tpfn_flags_mask;
uint8_t pfn_shift;
boolvalid;
 };
@@ -521,6 +525,9 @@ static inline int hmm_vma_fault(struct hmm_range *range, 
bool block)
 {
long ret;
 
+   range->default_flags = 0;
+   range->pfn_flags_mask = -1UL;
+
ret = hmm_range_register(range, range->vma->vm_mm,
 range->start, range->end);
if (ret)
diff --git a/mm/hmm.c b/mm/hmm.c
index 860ebe5d4b07..0a4ff31e9d7a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -423,6 +423,18 @@ static inline void hmm_pte_need_fault(const struct 
hmm_vma_walk *hmm_vma_walk,
if (!hmm_vma_walk->fault)
return;
 
+   /*
+* We not only consider the individual per page request, we also
+* consider the default flags requested for the range. The API can
+* be used in 2 fashions. The first one where the HMM user coalesces
+* multiple page faults into one request and sets flags per pfn for
+* those faults. The second one where the HMM user wants to pre-
+* fault a range with specific flags. For the latter it is a
+* waste to have the user pre-fill the pfn arrays with a default
+* flags value.
+*/
+   pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
-- 
2.17.2



[PATCH 01/10] mm/hmm: use reference counting for HMM struct

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Every time I read the code to check that the HMM structure does not
vanish before it should, thanks to the many locks protecting its removal,
I get a headache. Switch to reference counting instead; it is much
easier to follow and harder to break. This also removes some code that
is no longer needed with refcounting.

Signed-off-by: Jérôme Glisse 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: Andrew Morton 
---
 include/linux/hmm.h |   2 +
 mm/hmm.c| 178 +---
 2 files changed, 120 insertions(+), 60 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 66f9ebbb1df3..bd6e058597a6 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -131,6 +131,7 @@ enum hmm_pfn_value_e {
 /*
  * struct hmm_range - track invalidation lock on virtual address range
  *
+ * @hmm: the core HMM structure this range is active against
  * @vma: the vm area struct for the range
  * @list: all range lock are on a list
  * @start: range virtual start address (inclusive)
@@ -142,6 +143,7 @@ enum hmm_pfn_value_e {
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
 struct hmm_range {
+   struct hmm  *hmm;
struct vm_area_struct   *vma;
struct list_headlist;
unsigned long   start;
diff --git a/mm/hmm.c b/mm/hmm.c
index a04e4b810610..b9f384ea15e9 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -50,6 +50,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  */
 struct hmm {
struct mm_struct*mm;
+   struct kref kref;
spinlock_t  lock;
struct list_headranges;
struct list_headmirrors;
@@ -57,6 +58,16 @@ struct hmm {
struct rw_semaphore mirrors_sem;
 };
 
+static inline struct hmm *hmm_get(struct mm_struct *mm)
+{
+   struct hmm *hmm = READ_ONCE(mm->hmm);
+
+	if (hmm && kref_get_unless_zero(&hmm->kref))
+   return hmm;
+
+   return NULL;
+}
+
 /*
  * hmm_register - register HMM against an mm (HMM internal)
  *
@@ -67,14 +78,9 @@ struct hmm {
  */
 static struct hmm *hmm_register(struct mm_struct *mm)
 {
-   struct hmm *hmm = READ_ONCE(mm->hmm);
+   struct hmm *hmm = hmm_get(mm);
bool cleanup = false;
 
-   /*
-* The hmm struct can only be freed once the mm_struct goes away,
-* hence we should always have pre-allocated an new hmm struct
-* above.
-*/
if (hmm)
return hmm;
 
@@ -86,6 +92,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
+	kref_init(&hmm->kref);
	hmm->mm = mm;
 
	spin_lock(&mm->page_table_lock);
@@ -106,7 +113,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
goto error_mm;
 
-   return mm->hmm;
+   return hmm;
 
 error_mm:
	spin_lock(&mm->page_table_lock);
@@ -118,9 +125,41 @@ static struct hmm *hmm_register(struct mm_struct *mm)
return NULL;
 }
 
+static void hmm_free(struct kref *kref)
+{
+   struct hmm *hmm = container_of(kref, struct hmm, kref);
+   struct mm_struct *mm = hmm->mm;
+
+	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
+
+	spin_lock(&mm->page_table_lock);
+	if (mm->hmm == hmm)
+		mm->hmm = NULL;
+	spin_unlock(&mm->page_table_lock);
+
+   kfree(hmm);
+}
+
+static inline void hmm_put(struct hmm *hmm)
+{
+	kref_put(&hmm->kref, hmm_free);
+}
+
 void hmm_mm_destroy(struct mm_struct *mm)
 {
-   kfree(mm->hmm);
+   struct hmm *hmm;
+
+	spin_lock(&mm->page_table_lock);
+	hmm = hmm_get(mm);
+	mm->hmm = NULL;
+	if (hmm) {
+		hmm->mm = NULL;
+		spin_unlock(&mm->page_table_lock);
+		hmm_put(hmm);
+		return;
+	}
+
+	spin_unlock(&mm->page_table_lock);
 }
 
 static int hmm_invalidate_range(struct hmm *hmm, bool device,
@@ -165,7 +204,7 @@ static int hmm_invalidate_range(struct hmm *hmm, bool 
device,
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
struct hmm_mirror *mirror;
-   struct hmm *hmm = mm->hmm;
+   struct hmm *hmm = hmm_get(mm);
 
	down_write(&hmm->mirrors_sem);
	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
@@ -186,36 +225,50 @@ static void hmm_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
  struct hmm_mirror, list);
}
	up_write(&hmm->mirrors_sem);
+
+   hmm_put(hmm);
 }
 
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *range)
 {
struct hmm_update update;
-   struct hmm *hmm = range->mm->hmm;
+   struct hmm *hmm = hmm_get(range->mm);
+   int ret;
 

[PATCH 03/10] mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Rename for consistency between code, comments and documentation. Also
improve the comments on all the possible return values. Improve the
function by returning the number of populated entries in the pfns array.

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 include/linux/hmm.h |  4 ++--
 mm/hmm.c| 23 ++-
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index bd6e058597a6..ddf49c1b1f5e 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -365,11 +365,11 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  * table invalidation serializes on it.
  *
  * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
- * hmm_vma_get_pfns() WITHOUT ERROR !
+ * hmm_range_snapshot() WITHOUT ERROR !
  *
  * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
  */
-int hmm_vma_get_pfns(struct hmm_range *range);
+long hmm_range_snapshot(struct hmm_range *range);
 bool hmm_vma_range_done(struct hmm_range *range);
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 74d69812d6be..0d9ecd3337e5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -706,23 +706,19 @@ static void hmm_pfns_special(struct hmm_range *range)
 }
 
 /*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual 
addresses
- * @range: range being snapshotted
+ * hmm_range_snapshot() - snapshot CPU page table for a range
+ * @range: range
  * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- *  vma permission, 0 success
+ *  permission (for instance asking for write and range is read only),
+ *  -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
+ *  vma or it is illegal to access that range), number of valid pages
+ *  in range->pfns[] (from range start address).
  *
  * This snapshots the CPU page table for a range of virtual addresses. Snapshot
  * validity is tracked by range struct. See hmm_vma_range_done() for further
  * information.
- *
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must then call
- * hmm_vma_range_done() to stop CPU page table update tracking on this range.
- *
- * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
- * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
  */
-int hmm_vma_get_pfns(struct hmm_range *range)
+long hmm_range_snapshot(struct hmm_range *range)
 {
struct vm_area_struct *vma = range->vma;
struct hmm_vma_walk hmm_vma_walk;
@@ -776,6 +772,7 @@ int hmm_vma_get_pfns(struct hmm_range *range)
hmm_vma_walk.fault = false;
hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
+   hmm_vma_walk.last = range->start;
 
mm_walk.vma = vma;
mm_walk.mm = vma->vm_mm;
@@ -792,9 +789,9 @@ int hmm_vma_get_pfns(struct hmm_range *range)
 * function return 0).
 */
range->hmm = hmm;
-   return 0;
+   return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
 }
-EXPORT_SYMBOL(hmm_vma_get_pfns);
+EXPORT_SYMBOL(hmm_range_snapshot);
 
 /*
  * hmm_vma_range_done() - stop tracking change to CPU page table over a range
-- 
2.17.2



[PATCH 04/10] mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault()

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Rename for consistency between code, comments and documentation. Also
improve the comments on all the possible return values. Improve the
function by returning the number of populated entries in the pfns array.

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 include/linux/hmm.h | 13 ++-
 mm/hmm.c| 93 -
 2 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ddf49c1b1f5e..ccf2b630447e 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -391,7 +391,18 @@ bool hmm_vma_range_done(struct hmm_range *range);
  *
  * See the function description in mm/hmm.c for further documentation.
  */
-int hmm_vma_fault(struct hmm_range *range, bool block);
+long hmm_range_fault(struct hmm_range *range, bool block);
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+{
+   long ret = hmm_range_fault(range, block);
+   if (ret == -EBUSY)
+   ret = -EAGAIN;
+   else if (ret == -EAGAIN)
+   ret = -EBUSY;
+   return ret < 0 ? ret : 0;
+}
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
 void hmm_mm_destroy(struct mm_struct *mm);
diff --git a/mm/hmm.c b/mm/hmm.c
index 0d9ecd3337e5..04235455b4d2 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -344,13 +344,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, 
unsigned long addr,
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
ret = handle_mm_fault(vma, addr, flags);
if (ret & VM_FAULT_RETRY)
-   return -EBUSY;
+   return -EAGAIN;
if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
 
-   return -EAGAIN;
+   return -EBUSY;
 }
 
 static int hmm_pfns_bad(unsigned long addr,
@@ -376,7 +376,7 @@ static int hmm_pfns_bad(unsigned long addr,
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
  * @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Returns: 0 on success, -EBUSY after page fault, or page fault error
  *
  * This function will be called whenever pmd_none() or pte_none() returns true,
  * or whenever there is no page directory covering the virtual address range.
@@ -399,12 +399,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, 
unsigned long end,
 
ret = hmm_vma_do_fault(walk, addr, write_fault,
				       &pfns[i]);
-   if (ret != -EAGAIN)
+   if (ret != -EBUSY)
return ret;
}
}
 
-   return (fault || write_fault) ? -EAGAIN : 0;
+   return (fault || write_fault) ? -EBUSY : 0;
 }
 
 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -535,11 +535,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
uint64_t orig_pfn = *pfn;
 
*pfn = range->values[HMM_PFN_NONE];
-   cpu_flags = pte_to_hmm_pfn_flags(range, pte);
-   hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-			   &fault, &write_fault);
+   fault = write_fault = false;
 
if (pte_none(pte)) {
+   hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+				   &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -578,7 +578,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
hmm_vma_walk->last = addr;
migration_entry_wait(vma->vm_mm,
 pmdp, addr);
-   return -EAGAIN;
+   return -EBUSY;
}
return 0;
}
@@ -586,6 +586,10 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
/* Report error for everything else */
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
+   } else {
+   cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+   hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+				   &fault, &write_fault);
}
 
if (fault || write_fault)
@@ -636,7 +640,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
if (fault || write_fault) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(vma->vm_mm, pmdp);
-   return -EAGAIN;
+   return -EBUSY;
}
return 0;
} else if (!pmd_present(pmd))
@@ -858,53 +862,36 @@ bool 

[PATCH 09/10] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem

2019-01-29 Thread jglisse
From: Jérôme Glisse 

This adds support for mirroring a vma which is an mmap of a file on
a filesystem that is using a DAX block device. There is no reason not
to support that case.

Note that unlike the GUP code we do not take a page reference, hence
when we back off we have nothing to undo.

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Dan Williams 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 mm/hmm.c | 133 ++-
 1 file changed, 112 insertions(+), 21 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 8b87e1813313..1a444885404e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -334,6 +334,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
 
 struct hmm_vma_walk {
struct hmm_range*range;
+   struct dev_pagemap  *pgmap;
unsigned long   last;
boolfault;
boolblock;
@@ -508,6 +509,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct 
hmm_range *range, pmd_t pmd)
range->flags[HMM_PFN_VALID];
 }
 
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+   if (!pud_present(pud))
+   return 0;
+   return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+   range->flags[HMM_PFN_WRITE] :
+   range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
  unsigned long addr,
  unsigned long end,
@@ -529,8 +539,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
pfn = pmd_pfn(pmd) + pte_index(addr);
-   for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+   for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+   if (pmd_devmap(pmd)) {
+   hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+   if (unlikely(!hmm_vma_walk->pgmap))
+   return -EBUSY;
+   }
pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+   }
+   if (hmm_vma_walk->pgmap) {
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
hmm_vma_walk->last = end;
return 0;
 }
@@ -617,10 +638,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
if (fault || write_fault)
goto fault;
 
+   if (pte_devmap(pte)) {
+   hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+   if (unlikely(!hmm_vma_walk->pgmap))
+   return -EBUSY;
+   } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) 
{
+   *pfn = range->values[HMM_PFN_SPECIAL];
+   return -EFAULT;
+   }
+
*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
 
 fault:
+   if (hmm_vma_walk->pgmap) {
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -708,12 +743,84 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return r;
}
}
+   if (hmm_vma_walk->pgmap) {
+   put_dev_pagemap(hmm_vma_walk->pgmap);
+   hmm_vma_walk->pgmap = NULL;
+   }
pte_unmap(ptep - 1);
 
hmm_vma_walk->last = addr;
return 0;
 }
 
+static int hmm_vma_walk_pud(pud_t *pudp,
+   unsigned long start,
+   unsigned long end,
+   struct mm_walk *walk)
+{
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   struct vm_area_struct *vma = walk->vma;
+   unsigned long addr = start, next;
+   pmd_t *pmdp;
+   pud_t pud;
+   int ret;
+
+again:
+   pud = READ_ONCE(*pudp);
+   if (pud_none(pud))
+   return hmm_vma_walk_hole(start, end, walk);
+
+   if (pud_huge(pud) && pud_devmap(pud)) {
+   unsigned long i, npages, pfn;
+   uint64_t *pfns, cpu_flags;
+   bool fault, write_fault;
+
+   if (!pud_present(pud))
+   return hmm_vma_walk_hole(start, end, walk);
+
+   i = (addr - range->start) >> PAGE_SHIFT;
+   npages = (end - addr) >> PAGE_SHIFT;
+		pfns = &range->pfns[i];
+
+   cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+   hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+

[PATCH 02/10] mm/hmm: do not erase snapshot when a range is invalidated

2019-01-29 Thread jglisse
From: Jérôme Glisse 

Users of HMM might be using the snapshot information to do
preparatory steps like dma mapping pages to a device before
checking for invalidation through hmm_vma_range_done(), so
do not erase that information and assume users will do the
right thing.

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 mm/hmm.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index b9f384ea15e9..74d69812d6be 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -170,16 +170,10 @@ static int hmm_invalidate_range(struct hmm *hmm, bool 
device,
 
	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
-		unsigned long addr, idx, npages;
-
		if (update->end < range->start || update->start >= range->end)
			continue;
 
		range->valid = false;
-		addr = max(update->start, range->start);
-		idx = (addr - range->start) >> PAGE_SHIFT;
-		npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
-		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);
 
-- 
2.17.2



[PATCH 07/10] mm/hmm: add an helper function that fault pages and map them to a device

2019-01-29 Thread jglisse
From: Jérôme Glisse 

This is an all-in-one helper that faults pages in a range and maps them to
a device so that every single device driver does not have to re-implement
this common pattern.
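
Usage is intended to look roughly like the following sketch (driver
function name made up); the helper both faults the range and fills
daddrs[] with the dma addresses:

#include <linux/hmm.h>
#include <linux/dma-mapping.h>

/* Sketch only: fault and dma map a range in one call. */
static long mydrv_map_range(struct hmm_range *range, struct device *dev,
			    dma_addr_t *daddrs)
{
	long mapped;

	mapped = hmm_range_dma_map(range, dev, daddrs, true /* block */);
	if (mapped < 0)
		return mapped;

	/* daddrs[i] is 0 for holes, otherwise the dma address of page i. */
	return mapped;
}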

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 include/linux/hmm.h |   9 +++
 mm/hmm.c| 152 
 2 files changed, 161 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4263f8fb32e5..fc3630d0bbfd 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -502,6 +502,15 @@ int hmm_range_register(struct hmm_range *range,
 void hmm_range_unregister(struct hmm_range *range);
 long hmm_range_snapshot(struct hmm_range *range);
 long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_dma_map(struct hmm_range *range,
+  struct device *device,
+  dma_addr_t *daddrs,
+  bool block);
+long hmm_range_dma_unmap(struct hmm_range *range,
+struct vm_area_struct *vma,
+struct device *device,
+dma_addr_t *daddrs,
+bool dirty);
 
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index 0a4ff31e9d7a..9cd68334a759 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -985,6 +986,157 @@ long hmm_range_fault(struct hmm_range *range, bool block)
return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
 }
 EXPORT_SYMBOL(hmm_range_fault);
+
+/*
+ * hmm_range_dma_map() - hmm_range_fault() and dma map pages all in one.
+ * @range: range being faulted
+ * @device: device to dma map the pages against
+ * @daddrs: dma addresses of the mapped pages
+ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem has been
+ *  dropped and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+  struct device *device,
+  dma_addr_t *daddrs,
+  bool block)
+{
+   unsigned long i, npages, mapped;
+   long ret;
+
+   ret = hmm_range_fault(range, block);
+   if (ret <= 0)
+   return ret ? ret : -EBUSY;
+
+   npages = (range->end - range->start) >> PAGE_SHIFT;
+   for (i = 0, mapped = 0; i < npages; ++i) {
+   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   struct page *page;
+
+   /*
+* FIXME: need to update the DMA API to provide an invalid DMA address
+* value instead of a function to test the dma address value. This
+* would remove a lot of dumb code duplicated across many archs.
+*
+* For now setting it to 0 here is good enough as the pfns[]
+* value is what is used to check what is valid and what isn't.
+*/
+   daddrs[i] = 0;
+
+   page = hmm_pfn_to_page(range, range->pfns[i]);
+   if (page == NULL)
+   continue;
+
+   /* Check if range is being invalidated */
+   if (!range->valid) {
+   ret = -EBUSY;
+   goto unmap;
+   }
+
+   /* If it is read and write than map bi-directional. */
+   if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+   dir = DMA_BIDIRECTIONAL;
+
+   daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+   if (dma_mapping_error(device, daddrs[i])) {
+   ret = -EFAULT;
+   goto unmap;
+   }
+
+   mapped++;
+   }
+
+   return mapped;
+
+unmap:
+   for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+   enum dma_data_direction dir = DMA_FROM_DEVICE;
+   struct page *page;
+
+   page = hmm_pfn_to_page(range, range->pfns[i]);
+   if (page == NULL)
+   continue;
+
+   if (dma_mapping_error(device, daddrs[i]))
+   continue;
+
+   /* If it is read and write then map it bi-directional. */
+   if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+   dir = DMA_BIDIRECTIONAL;
+
+   dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+   mapped--;
+   }
+
+   return ret;
+}
+EXPORT_SYMBOL(hmm_range_dma_map);
+
+/*
+ * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @vma: the vma against which the range (optional)
+ * @device: device against which dma map was done
+ * @daddrs: dma 

[PATCH 10/10] mm/hmm: add helpers for driver to safely take the mmap_sem

2019-01-29 Thread jglisse
From: Jérôme Glisse 

The device driver context which holds a reference to the mirror, and thus
to the core hmm struct, might outlive the mm against which it was created.
To avoid every driver having to check for that case, provide a helper that
checks whether the mm is still alive and takes the mmap_sem in read mode if
so. If the mm has been destroyed (the mmu_notifier release callback has
happened) then we return -EINVAL so that the calling code knows it is
trying to do something against an mm that is no longer valid.
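
As a rough usage sketch (the driver function and its body are hypothetical,
only the two helpers come from this patch):

static int mydriver_mirror_fault(struct hmm_mirror *mirror,
				 struct hmm_range *range)
{
	int ret;

	ret = hmm_mirror_mm_down_read(mirror);
	if (ret)
		return ret;	/* mm is dead, tear down the device mapping */

	/* ... snapshot or fault the range while holding the mmap_sem ... */

	hmm_mirror_mm_up_read(mirror);
	return 0;
}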

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 include/linux/hmm.h | 50 ++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index b3850297352f..4a1454e3efba 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -438,6 +438,50 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 
+/*
+ * hmm_mirror_mm_down_read() - lock the mmap_sem in read mode
+ * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
+ * Returns: -EINVAL if the mm is dead, 0 otherwise (lock taken).
+ *
+ * The device driver context which holds reference to mirror and thus to core
+ * hmm struct might outlive the mm against which it was created. To avoid every
+ * driver to check for that case provide an helper that check if mm is still
+ * alive and take the mmap_sem in read mode if so. If the mm have been destroy
+ * (mmu_notifier release call back did happen) then we return -EINVAL so that
+ * calling code knows that it is trying to do something against a mm that is
+ * no longer valid.
+ */
+static inline int hmm_mirror_mm_down_read(struct hmm_mirror *mirror)
+{
+   struct mm_struct *mm;
+
+   /* Sanity check ... */
+   if (!mirror || !mirror->hmm)
+   return -EINVAL;
+   /*
+* Before trying to take the mmap_sem make sure the mm is still
+* alive as device driver context might outlive the mm lifetime.
+*
+* FIXME: should we also check for mm that outlive its owning
+* task ?
+*/
+   mm = READ_ONCE(mirror->hmm->mm);
+   if (mirror->hmm->dead || !mm)
+   return -EINVAL;
+
+   down_read(&mm->mmap_sem);
+   return 0;
+}
+
+/*
+ * hmm_mirror_mm_up_read() - unlock the mmap_sem from read mode
+ * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
+ */
+static inline void hmm_mirror_mm_up_read(struct hmm_mirror *mirror)
+{
+   up_read(&mirror->hmm->mm->mmap_sem);
+}
+
 
 /*
  * To snapshot the CPU page table you first have to call hmm_range_register()
@@ -463,7 +507,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  *  if (ret)
  *  return ret;
  *
- *  down_read(mm->mmap_sem);
+ *  hmm_mirror_mm_down_read(mirror);
  *  again:
  *
 *  if (!hmm_range_wait_until_valid(&range, TIMEOUT)) {
@@ -476,13 +520,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  *
 *  ret = hmm_range_snapshot(&range); or hmm_range_fault(&range);
  *  if (ret == -EAGAIN) {
- *  down_read(mm->mmap_sem);
+ *  hmm_mirror_mm_down_read(mirror);
  *  goto again;
  *  } else if (ret == -EBUSY) {
  *  goto again;
  *  }
  *
- *  up_read(&mm->mmap_sem);
+ *  hmm_mirror_mm_up_read(mirror);
  *  if (ret) {
  *  hmm_range_unregister(range);
  *  return ret;
-- 
2.17.2



[PATCH 05/10] mm/hmm: improve driver API to work and wait over a range

2019-01-29 Thread jglisse
From: Jérôme Glisse 

A common use case for HMM mirror is a user trying to mirror a range
and, before they can program the hardware, it gets invalidated by
some core mm event. Instead of having the user retry right away,
provide a completion mechanism for them to wait for any active
invalidation affecting the range.

This also changes how hmm_range_snapshot() and hmm_range_fault()
work by not relying on the vma so that we can drop the mmap_sem
when waiting and look up the vma again on retry.
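
The resulting driver pattern, condensed from the documentation added to
hmm.h below (the device page table lock names are hypothetical):

	ret = hmm_range_register(&range, mm, start, end);
	if (ret)
		return ret;

again:
	if (!hmm_range_wait_until_valid(&range, HMM_RANGE_DEFAULT_TIMEOUT))
		goto out;	/* mm is dead or we timed out, give up */

	down_read(&mm->mmap_sem);
	ret = hmm_range_snapshot(&range);
	up_read(&mm->mmap_sem);
	if (ret == -EBUSY)
		goto again;	/* an invalidation raced with the snapshot */
	if (ret < 0)
		goto out;

	mydevice_page_table_lock(mydevice);	/* hypothetical device lock */
	if (!hmm_range_valid(&range)) {
		mydevice_page_table_unlock(mydevice);
		goto again;
	}
	/* ... program the device page table from range.pfns[] ... */
	mydevice_page_table_unlock(mydevice);
out:
	hmm_range_unregister(&range);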

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 include/linux/hmm.h | 208 +++---
 mm/hmm.c| 526 +---
 2 files changed, 430 insertions(+), 304 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ccf2b630447e..93dc88edc293 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -77,8 +77,34 @@
 #include 
 #include 
 #include 
+#include 
 
-struct hmm;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @ranges: list of range being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ * @wq: wait queue for user waiting on a range invalidation
+ * @notifiers: count of active mmu notifiers
+ * @dead: is the mm dead ?
+ */
+struct hmm {
+   struct mm_struct*mm;
+   struct kref kref;
+   struct mutexlock;
+   struct list_headranges;
+   struct list_headmirrors;
+   struct mmu_notifier mmu_notifier;
+   struct rw_semaphore mirrors_sem;
+   wait_queue_head_t   wq;
+   longnotifiers;
+   booldead;
+};
 
 /*
  * hmm_pfn_flag_e - HMM flag enums
@@ -155,6 +181,38 @@ struct hmm_range {
boolvalid;
 };
 
+/*
+ * hmm_range_wait_until_valid() - wait for range to be valid
+ * @range: range affected by invalidation to wait on
+ * @timeout: time out for wait in ms (ie abort wait after that period of time)
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
+ unsigned long timeout)
+{
+   /* Check if mm is dead ? */
+   if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
+   range->valid = false;
+   return false;
+   }
+   if (range->valid)
+   return true;
+   wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
+  msecs_to_jiffies(timeout));
+   /* Return current valid status just in case we get lucky */
+   return range->valid;
+}
+
+/*
+ * hmm_range_valid() - test if a range is valid or not
+ * @range: range
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_valid(struct hmm_range *range)
+{
+   return range->valid;
+}
+
 /*
  * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
  * @range: range use to decode HMM pfn value
@@ -357,51 +415,133 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
 
 
 /*
- * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
- * driver lock that serializes device page table updates, then call
- * hmm_vma_range_done(), to check if the snapshot is still valid. The same
- * device driver page table update lock must also be used in the
- * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
- * table invalidation serializes on it.
+ * To snapshot the CPU page table you first have to call hmm_range_register()
+ * to register the range. If hmm_range_register() return an error then some-
+ * thing is horribly wrong and you should fail loudly. If it returned true then
+ * you can wait for the range to be stable with hmm_range_wait_until_valid()
+ * function, a range is valid when there are no concurrent changes to the CPU
+ * page table for the range.
+ *
+ * Once the range is valid you can call hmm_range_snapshot() if that returns
+ * without error then you can take your device page table lock (the same lock
+ * you use in the HMM mirror sync_cpu_device_pagetables() callback). After
+ * taking that lock you have to check the range validity, if it is still valid
+ * (ie hmm_range_valid() returns true) then you can program the device page
+ * table, otherwise you have to start again. Pseudo code:
+ *
+ *  mydevice_prefault(mydevice, mm, start, end)
+ *  {
+ *  struct hmm_range range;
+ *  ...
  *
- * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
- * hmm_range_snapshot() WITHOUT ERROR !
+ *  ret = hmm_range_register(&range, mm, start, end);
+ *  if (ret)
+ *  

[PATCH 08/10] mm/hmm: support hugetlbfs (snap shoting, faulting and DMA mapping)

2019-01-29 Thread jglisse
From: Jérôme Glisse 

This adds support for hugetlbfs so that HMM users can mirror a range
of virtual addresses backed by hugetlbfs. Note that the range now allows
the user to optimize DMA mapping of such pages so that we can map a huge
page as one chunk.
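
For example, a driver can now size its per-page arrays at huge page
granularity instead of PAGE_SIZE granularity (sketch, allocation error
handling omitted; the hstate comes from the hugetlbfs vma):

	ret = hmm_range_register(&range, mm, start, end,
				 huge_page_shift(hstate_vma(vma)));
	if (ret)
		return ret;

	/* One entry per huge page rather than per PAGE_SIZE page. */
	npages = (range.end - range.start) >> hmm_range_page_shift(&range);
	daddrs = kmalloc_array(npages, sizeof(*daddrs), GFP_KERNEL);
	/* each daddrs[] entry covers hmm_range_page_size(&range) bytes */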

Signed-off-by: Jérôme Glisse 
Cc: Andrew Morton 
Cc: Ralph Campbell 
Cc: John Hubbard 
---
 include/linux/hmm.h |  29 -
 mm/hmm.c| 141 +---
 2 files changed, 147 insertions(+), 23 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index fc3630d0bbfd..b3850297352f 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -181,10 +181,31 @@ struct hmm_range {
const uint64_t  *values;
uint64_tdefault_flags;
uint64_tpfn_flags_mask;
+   uint8_t page_shift;
uint8_t pfn_shift;
boolvalid;
 };
 
+/*
+ * hmm_range_page_shift() - return the page shift for the range
+ * @range: range being queried
+ * Returns: page shift (page size = 1 << page shift) for the range
+ */
+static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
+{
+   return range->page_shift;
+}
+
+/*
+ * hmm_range_page_size() - return the page size for the range
+ * @range: range being queried
+ * Returns: page size for the range in bytes
+ */
+static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
+{
+   return 1UL << hmm_range_page_shift(range);
+}
+
 /*
  * hmm_range_wait_until_valid() - wait for range to be valid
  * @range: range affected by invalidation to wait on
@@ -438,7 +459,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
  *  struct hmm_range range;
  *  ...
  *
- *  ret = hmm_range_register(&range, mm, start, end);
+ *  ret = hmm_range_register(&range, mm, start, end, page_shift);
  *  if (ret)
  *  return ret;
  *
@@ -498,7 +519,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
 int hmm_range_register(struct hmm_range *range,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end);
+  unsigned long end,
+  unsigned page_shift);
 void hmm_range_unregister(struct hmm_range *range);
 long hmm_range_snapshot(struct hmm_range *range);
 long hmm_range_fault(struct hmm_range *range, bool block);
@@ -538,7 +560,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, 
bool block)
range->pfn_flags_mask = -1UL;
 
ret = hmm_range_register(range, range->vma->vm_mm,
-range->start, range->end);
+range->start, range->end,
+PAGE_SHIFT);
if (ret)
return (int)ret;
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 9cd68334a759..8b87e1813313 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -396,11 +396,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, 
unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
-   unsigned long i;
+   unsigned long i, page_size;
 
hmm_vma_walk->last = addr;
-   i = (addr - range->start) >> PAGE_SHIFT;
-   for (; addr < end; addr += PAGE_SIZE, i++) {
+   page_size = 1UL << range->page_shift;
+   i = (addr - range->start) >> range->page_shift;
+
+   for (; addr < end; addr += page_size, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
@@ -712,6 +714,69 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return 0;
 }
 
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+   unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   struct vm_area_struct *vma = walk->vma;
+   struct hstate *h = hstate_vma(vma);
+   uint64_t orig_pfn, cpu_flags;
+   bool fault, write_fault;
+   spinlock_t *ptl;
+   pte_t entry;
+   int ret = 0;
+
+   size = 1UL << huge_page_shift(h);
+   mask = size - 1;
+   if (range->page_shift != PAGE_SHIFT) {
+   /* Make sure we are looking at full page. */
+   if (start & mask)
+   return -EINVAL;
+   if (end < (start + size))
+   return -EINVAL;
+   pfn_inc = size >> PAGE_SHIFT;
+   } else {
+   pfn_inc = 1;
+   size = PAGE_SIZE;
+   }
+
+
+   ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);

[PATCH 00/10] HMM updates for 5.1

2019-01-29 Thread jglisse
From: Jérôme Glisse 

This patchset improves the HMM driver API and adds support for hugetlbfs
and DAX mirroring. The motivation for the improvement was to make the ODP
to HMM conversion easier [1]. Because we have nouveau bits scheduled for
5.1, and to avoid any multi-tree synchronization, this patchset adds a few
inline functions that wrap the existing HMM driver API to the improved
API. The nouveau driver was tested before and after this patchset and it
builds and works in both cases, so there is no merging issue [2]. The
nouveau bits are queued up for 5.1, which is why I added those inlines.

If this gets merged in 5.1 the plan is to merge the HMM to ODP conversion
in 5.2, or 5.3 if testing shows any issues (so far no issues have been
found with limited testing but Mellanox will be running heavier testing
for a longer time).

To avoid spamming mm I would like not to cc mm on ODP or nouveau patches;
however, if people prefer to see those on the mm mailing list then I can
keep it cced.

This is also what I intend to use as a base for the AMD and Intel patches
(v2, with more changes, of some RFCs which were already posted in the past).

[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm
[2] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-for-5.1

Cc: Andrew Morton 
Cc: Felix Kuehling 
Cc: Christian König 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: Jason Gunthorpe 
Cc: Dan Williams 

Jérôme Glisse (10):
  mm/hmm: use reference counting for HMM struct
  mm/hmm: do not erase snapshot when a range is invalidated
  mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()
  mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault()
  mm/hmm: improve driver API to work and wait over a range
  mm/hmm: add default fault flags to avoid the need to pre-fill pfns
arrays.
  mm/hmm: add an helper function that fault pages and map them to a
device
  mm/hmm: support hugetlbfs (snap shoting, faulting and DMA mapping)
  mm/hmm: allow to mirror vma of a file on a DAX backed filesystem
  mm/hmm: add helpers for driver to safely take the mmap_sem

 include/linux/hmm.h |  290 ++--
 mm/hmm.c| 1060 +--
 2 files changed, 983 insertions(+), 367 deletions(-)

-- 
2.17.2



[PATCH v4 5/9] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper

2019-01-23 Thread jglisse
From: Jérôme Glisse 

Helper to test if a range is updated to read only (it is still valid
to read from the range). This is useful for device drivers or anyone
who wishes to optimize out updates when they know that they already
have the range mapped read only.
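
A mmu notifier user can then skip invalidation work for objects it already
maps read only, roughly like this (sketch; the driver structure, list and
unmap helper are hypothetical, the helper tested is the one added here):

static int mydrv_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct mydrv *drv = container_of(mn, struct mydrv, mn);
	bool update_to_read_only;
	struct mydrv_object *obj;

	update_to_read_only = mmu_notifier_range_update_to_read_only(range);

	list_for_each_entry(obj, &drv->objects, list) {
		/* Already read only and staying read only: nothing to do. */
		if (update_to_read_only && obj->read_only)
			continue;
		mydrv_unmap_object(drv, obj, range->start, range->end);
	}
	return 0;
}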

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h |  4 
 mm/mmu_notifier.c| 10 ++
 2 files changed, 14 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 7514775817de..be873c431886 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -257,6 +257,8 @@ extern void __mmu_notifier_invalidate_range_end(struct 
mmu_notifier_range *r,
  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
  unsigned long start, unsigned long end);
+extern bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -553,6 +555,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct 
*mm)
 {
 }
 
+#define mmu_notifier_range_update_to_read_only(r) false
+
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define ptep_clear_young_notify ptep_test_and_clear_young
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9c884abc7850..0b2f77715a08 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct 
mmu_notifier *mn,
mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+   if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+   return false;
+   /* Return true if the vma still has the read flag set. */
+   return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
-- 
2.17.2



[PATCH v4 9/9] RDMA/umem_odp: optimize out the case when a range is updated to read only

2019-01-23 Thread jglisse
From: Jérôme Glisse 

When a range of virtual addresses is updated to read only and the
corresponding user ptr objects are already read only, it is pointless to
do anything. Optimize this case out.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 drivers/infiniband/core/umem_odp.c | 22 +++---
 include/rdma/ib_umem_odp.h |  1 +
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index a4ec43093cb3..fa4e7fdcabfc 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -140,8 +140,15 @@ static void ib_umem_notifier_release(struct mmu_notifier 
*mn,
 static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
 u64 start, u64 end, void *cookie)
 {
+   bool update_to_read_only = *((bool *)cookie);
+
ib_umem_notifier_start_account(item);
-   item->umem.context->invalidate_range(item, start, end);
+   /*
+* If it is already read only and we are updating to read only then we
+* do not need to change anything. So save time and skip this one.
+*/
+   if (!update_to_read_only || !item->read_only)
+   item->umem.context->invalidate_range(item, start, end);
return 0;
 }
 
@@ -150,6 +157,7 @@ static int ib_umem_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
 {
struct ib_ucontext_per_mm *per_mm =
container_of(mn, struct ib_ucontext_per_mm, mn);
+   bool update_to_read_only;
 
if (range->blockable)
down_read(&per_mm->umem_rwsem);
@@ -166,10 +174,13 @@ static int ib_umem_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
return 0;
}
 
+   update_to_read_only = mmu_notifier_range_update_to_read_only(range);
+
return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
 range->end,
 invalidate_range_start_trampoline,
-range->blockable, NULL);
+range->blockable,
&update_to_read_only);
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
@@ -363,6 +374,9 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct 
ib_ucontext_per_mm *per_mm,
goto out_odp_data;
}
 
+   /* Assume read only at first, each time GUP is called this is updated. */
+   odp_data->read_only = true;
+
odp_data->dma_list =
vzalloc(array_size(pages, sizeof(*odp_data->dma_list)));
if (!odp_data->dma_list) {
@@ -619,8 +633,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp 
*umem_odp, u64 user_virt,
goto out_put_task;
}
 
-   if (access_mask & ODP_WRITE_ALLOWED_BIT)
+   if (access_mask & ODP_WRITE_ALLOWED_BIT) {
+   umem_odp->read_only = false;
flags |= FOLL_WRITE;
+   }
 
start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
k = start_idx;
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 0b1446fe2fab..8256668c6170 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -76,6 +76,7 @@ struct ib_umem_odp {
struct completion   notifier_completion;
int dying;
struct work_struct  work;
+   bool read_only;
 };
 
 static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
-- 
2.17.2



[PATCH v4 4/9] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening

2019-01-23 Thread jglisse
From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

Users of the mmu notifier API track changes to the CPU page table and take
specific actions for them. The current API only provides the range of
virtual addresses affected by the change, not why the change is happening.

This patch just passes the new information down by adding it to the
mmu_notifier_range structure.
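
With these two new fields a notifier callback can branch on why the
invalidation happens; for instance (hypothetical handler names, only the
struct fields come from this patch):

static int mydrv_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	/* munmap()/mremap(): the range is gone, stop tracking it. */
	if (range->event == MMU_NOTIFY_UNMAP)
		return mydrv_forget_range(mn, range->start, range->end);

	/* Protection change: the new vma protection can be inspected. */
	if (range->event == MMU_NOTIFY_PROTECTION_VMA && range->vma)
		return mydrv_update_range(mn, range->vma, range->start,
					  range->end);

	return mydrv_invalidate_range(mn, range->start, range->end);
}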

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index a9808add4070..7514775817de 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -56,9 +56,11 @@ struct mmu_notifier_mm {
 };
 
 struct mmu_notifier_range {
+   struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
+   enum mmu_notifier_event event;
bool blockable;
 };
 
@@ -354,6 +356,8 @@ static inline void mmu_notifier_range_init(struct 
mmu_notifier_range *range,
   unsigned long start,
   unsigned long end)
 {
+   range->vma = vma;
+   range->event = event;
range->mm = mm;
range->start = start;
range->end = end;
-- 
2.17.2



[PATCH v4 6/9] gpu/drm/radeon: optimize out the case when a range is updated to read only

2019-01-23 Thread jglisse
From: Jérôme Glisse 

When a range of virtual addresses is updated to read only and the
corresponding user ptr objects are already read only, it is pointless to
do anything. Optimize this case out.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 drivers/gpu/drm/radeon/radeon_mn.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index b3019505065a..f77294f58e63 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -124,6 +124,7 @@ static int radeon_mn_invalidate_range_start(struct 
mmu_notifier *mn,
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct ttm_operation_ctx ctx = { false, false };
struct interval_tree_node *it;
+   bool update_to_read_only;
unsigned long end;
int ret = 0;
 
@@ -138,6 +139,8 @@ static int radeon_mn_invalidate_range_start(struct 
mmu_notifier *mn,
else if (!mutex_trylock(&rmn->lock))
return -EAGAIN;
 
+   update_to_read_only = mmu_notifier_range_update_to_read_only(range);
+
it = interval_tree_iter_first(&rmn->objects, range->start, end);
while (it) {
struct radeon_mn_node *node;
@@ -153,10 +156,20 @@ static int radeon_mn_invalidate_range_start(struct 
mmu_notifier *mn,
it = interval_tree_iter_next(it, range->start, end);
 
list_for_each_entry(bo, &node->bos, mn_list) {
+   bool read_only;
 
if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound)
continue;
 
+   /*
+* If it is already read only and we are updating to
+* read only then we do not need to change anything.
+* So save time and skip this one.
+*/
+   read_only = radeon_ttm_tt_is_readonly(bo->tbo.ttm);
+   if (update_to_read_only && read_only)
+   continue;
+
r = radeon_bo_reserve(bo, true);
if (r) {
DRM_ERROR("(%ld) failed to reserve user bo\n", 
r);
-- 
2.17.2



[PATCH v4 2/9] mm/mmu_notifier: contextual information for event triggering invalidation

2019-01-23 Thread jglisse
From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

Users of the mmu notifier API track changes to the CPU page table and take
specific actions for them. The current API only provides the range of
virtual addresses affected by the change, not why the change is happening.

This patchset does the initial mechanical conversion of all the places that
call mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP
event as well as the vma if it is known (most invalidations happen against
a given vma). Passing down the vma allows users of mmu notifier to inspect
the new vma page protection.

MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should
assume that every page in the range is going away when that event happens.
A later patch converts the mm call paths to use a more appropriate event
for each call.

This is done as 2 patches so that no call site is forgotten, especially as
it uses the following coccinelle patch:

%<--
@@
identifier I1, I2, I3, I4;
@@
static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1,
+enum mmu_notifier_event event,
+struct vm_area_struct *vma,
struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... }

@@
@@
-#define mmu_notifier_range_init(range, mm, start, end)
+#define mmu_notifier_range_init(range, event, vma, mm, start, end)

@@
expression E1, E3, E4;
identifier I1;
@@
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, I1,
I1->vm_mm, E3, E4)
...>

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(..., struct vm_area_struct *VMA, ...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, VMA,
E2, E3, E4)
...> }

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(...) {
struct vm_area_struct *VMA;
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, VMA,
E2, E3, E4)
...> }

@@
expression E1, E2, E3, E4;
identifier FN;
@@
FN(...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, NULL,
E2, E3, E4)
...> }
-->%

Applied with:
spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place
spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place
spatch --sp-file mmu-notifier.spatch --dir mm --in-place
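
For illustration, a converted call site then looks roughly like this
(generic sketch, not a specific hunk from this patch):

	struct mmu_notifier_range range;

	/* vma is known here, so pass it along with the default event. */
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, vma, vma->vm_mm,
				start, end);
	mmu_notifier_invalidate_range_start(&range);
	/* ... update the CPU page table ... */
	mmu_notifier_invalidate_range_end(&range);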

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 fs/proc/task_mmu.c   |  3 ++-
 include/linux/mmu_notifier.h |  4 +++-
 kernel/events/uprobes.c  |  3 ++-
 mm/huge_memory.c | 12 
 mm/hugetlb.c | 10 ++
 mm/khugepaged.c  |  3 ++-
 mm/ksm.c |  6 --
 mm/madvise.c |  3 ++-
 mm/memory.c  | 25 -
 mm/migrate.c |  5 -
 mm/mprotect.c|  3 ++-
 mm/mremap.c  |  3 ++-
 mm/oom_kill.c|  3 ++-
 mm/rmap.c|  6 --
 14 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0ec9edab2f3..57e7f98647d3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1143,7 +1143,8 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
break;
}
 
-   mmu_notifier_range_init(&range, mm, 0, -1UL);
+   mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP,
+   NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(0, mm->highest_vm_end, _refs_walk);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index abc9dbb7bcb6..a9808add4070 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -348,6 +348,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct 
*mm)
 
 
 static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
+  enum mmu_notifier_event event,
+  struct vm_area_struct *vma,
   struct mm_struct *mm,
   unsigned long start,
   unsigned long end)
@@ -482,7 +484,7 @@ static inline void _mmu_notifier_range_init(struct 
mmu_notifier_range *range,
   

[PATCH v4 7/9] gpu/drm/amdgpu: optimize out the case when a range is updated to read only

2019-01-23 Thread jglisse
From: Jérôme Glisse 

When a range of virtual addresses is updated to read only and the
corresponding user ptr objects are already read only, it is pointless to
do anything. Optimize this case out.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 3e6823fdd939..7880eda064cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -294,6 +294,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct 
mmu_notifier *mn,
 {
struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
struct interval_tree_node *it;
+   bool update_to_read_only;
unsigned long end;
 
/* notification is exclusive, but interval is inclusive */
@@ -302,6 +303,8 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct 
mmu_notifier *mn,
if (amdgpu_mn_read_lock(amn, range->blockable))
return -EAGAIN;
 
+   update_to_read_only = mmu_notifier_range_update_to_read_only(range);
+
it = interval_tree_iter_first(&amn->objects, range->start, end);
while (it) {
struct amdgpu_mn_node *node;
@@ -317,6 +320,16 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct 
mmu_notifier *mn,
 
list_for_each_entry(bo, &node->bos, mn_list) {
struct kgd_mem *mem = bo->kfd_bo;
+   bool read_only;
+
+   /*
+* If it is already read only and we are updating to
+* read only then we do not need to change anything.
+* So save time and skip this one.
+*/
+   read_only = amdgpu_ttm_tt_is_readonly(bo->tbo.ttm);
+   if (update_to_read_only && read_only)
+   continue;
 
if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
 range->start,
-- 
2.17.2



[PATCH v4 3/9] mm/mmu_notifier: use correct mmu_notifier events for each invalidation

2019-01-23 Thread jglisse
From: Jérôme Glisse 

This updates each existing invalidation to use the correct mmu notifier
event that represents what is happening to the CPU page table. See the
patch which introduced the events for the rationale behind this.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 fs/proc/task_mmu.c  |  2 +-
 kernel/events/uprobes.c |  2 +-
 mm/huge_memory.c| 14 ++
 mm/hugetlb.c|  7 ---
 mm/khugepaged.c |  2 +-
 mm/ksm.c|  4 ++--
 mm/madvise.c|  2 +-
 mm/memory.c | 16 
 mm/migrate.c|  4 ++--
 mm/mprotect.c   |  5 +++--
 mm/rmap.c   |  6 +++---
 11 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 57e7f98647d3..cce226f3305f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1143,7 +1143,7 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
break;
}
 
-   mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP,
+   mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b67fe7e59621..87e76a1dc758 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -174,7 +174,7 @@ static int __replace_page(struct vm_area_struct *vma, 
unsigned long addr,
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
 
-   mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, vma, mm, addr,
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, vma, mm, addr,
addr + PAGE_SIZE);
 
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b353e8b7876f..957d23754217 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1182,9 +1182,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct 
vm_fault *vmf,
cond_resched();
}
 
-   mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, vma, vma->vm_mm,
-   haddr,
-   haddr + HPAGE_PMD_SIZE);
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, vma, vma->vm_mm,
+   haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
 
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1346,9 +1345,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, 
pmd_t orig_pmd)
vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
 
-   mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, vma, vma->vm_mm,
-   haddr,
-   haddr + HPAGE_PMD_SIZE);
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, vma, vma->vm_mm,
+   haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
 
spin_lock(vmf->ptl);
@@ -2025,7 +2023,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t 
*pud,
spinlock_t *ptl;
struct mmu_notifier_range range;
 
-   mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, vma, vma->vm_mm,
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, vma, vma->vm_mm,
address & HPAGE_PUD_MASK,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -2244,7 +2242,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
spinlock_t *ptl;
struct mmu_notifier_range range;
 
-   mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, vma, vma->vm_mm,
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cbda46ad6a30..f691398ac6b6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3246,7 +3246,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct 
mm_struct *src,
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
if (cow) {
-   mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, vma, src,
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, vma, src,
vma->vm_start,
 

[PATCH v4 1/9] mm/mmu_notifier: contextual information for event enums

2019-01-23 Thread jglisse
From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

This patch introduces a set of enums that can be associated with each of
the events triggering a mmu notifier. Later patches take advantage of
those enum values.

- UNMAP: munmap() or mremap()
- CLEAR: page table is cleared (migration, compaction, reclaim, ...)
- PROTECTION_VMA: change in access protections for the range
- PROTECTION_PAGE: change in access protections for page in the range
- SOFT_DIRTY: soft dirtyness tracking

Being able to distinguish munmap() and mremap() from the other reasons why
the page table is cleared is important to allow users of the mmu notifier
to update their own internal tracking structures accordingly (on munmap or
mremap it is no longer needed to track the range of virtual addresses as it
becomes invalid).
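
As a sketch of how a notifier user might consume these values once later
patches pass them down (handler names are hypothetical):

	switch (range->event) {
	case MMU_NOTIFY_UNMAP:
		/* Range is gone for good, drop the tracking structure. */
		mydrv_forget(range->start, range->end);
		break;
	case MMU_NOTIFY_PROTECTION_VMA:
	case MMU_NOTIFY_PROTECTION_PAGE:
		/* Pages stay, only access rights change. */
		mydrv_update_protection(range);
		break;
	case MMU_NOTIFY_CLEAR:
	case MMU_NOTIFY_SOFT_DIRTY:
	default:
		mydrv_invalidate(range->start, range->end);
		break;
	}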

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 include/linux/mmu_notifier.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 4050ec1c3b45..abc9dbb7bcb6 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,36 @@
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/**
+ * enum mmu_notifier_event - reason for the mmu notifier callback
+ * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that
+ * move the range
+ *
+ * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
+ * madvise() or replacing a page by another one, ...).
+ *
+ * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range
+ * ie using the vma access permission (vm_page_prot) to update the whole range
+ * is enough no need to inspect changes to the CPU page table (mprotect()
+ * syscall)
+ *
+ * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for
+ * pages in the range so to mirror those changes the user must inspect the CPU
+ * page table (from the end callback).
+ *
+ * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
+ * access flags). User should soft dirty the page in the end callback to make
+ * sure that anyone relying on soft dirtyness catch pages that might be written
+ * through non CPU mappings.
+ */
+enum mmu_notifier_event {
+   MMU_NOTIFY_UNMAP = 0,
+   MMU_NOTIFY_CLEAR,
+   MMU_NOTIFY_PROTECTION_VMA,
+   MMU_NOTIFY_PROTECTION_PAGE,
+   MMU_NOTIFY_SOFT_DIRTY,
+};
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
-- 
2.17.2



[PATCH v4 8/9] gpu/drm/i915: optimize out the case when a range is updated to read only

2019-01-23 Thread jglisse
From: Jérôme Glisse 

When a range of virtual addresses is updated to read only and the
corresponding user ptr objects are already read only, it is pointless to
do anything. Optimize this case out.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 9558582c105e..23330ac3d7ea 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -59,6 +59,7 @@ struct i915_mmu_object {
struct interval_tree_node it;
struct list_head link;
struct work_struct work;
+   bool read_only;
bool attached;
 };
 
@@ -119,6 +120,7 @@ static int 
i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
container_of(_mn, struct i915_mmu_notifier, mn);
struct i915_mmu_object *mo;
struct interval_tree_node *it;
+   bool update_to_read_only;
LIST_HEAD(cancelled);
unsigned long end;
 
@@ -128,6 +130,8 @@ static int 
i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
/* interval ranges are inclusive, but invalidate range is exclusive */
end = range->end - 1;
 
+   update_to_read_only = mmu_notifier_range_update_to_read_only(range);
+
spin_lock(&mn->lock);
it = interval_tree_iter_first(&mn->objects, range->start, end);
while (it) {
@@ -145,6 +149,17 @@ static int 
i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 * object if it is not in the process of being destroyed.
 */
mo = container_of(it, struct i915_mmu_object, it);
+
+   /*
+* If it is already read only and we are updating to
+* read only then we do not need to change anything.
+* So save time and skip this one.
+*/
+   if (update_to_read_only && mo->read_only) {
+   it = interval_tree_iter_next(it, range->start, end);
+   continue;
+   }
+
if (kref_get_unless_zero(&mo->obj->base.refcount))
queue_work(mn->wq, &mo->work);
 
@@ -270,6 +285,7 @@ i915_gem_userptr_init__mmu_notifier(struct 
drm_i915_gem_object *obj,
mo->mn = mn;
mo->obj = obj;
mo->it.start = obj->userptr.ptr;
+   mo->read_only = i915_gem_object_is_readonly(obj);
mo->it.last = obj->userptr.ptr + obj->base.size - 1;
+   INIT_WORK(&mo->work, cancel_userptr);
 
-- 
2.17.2



[PATCH v4 0/9] mmu notifier provide context informations

2019-01-23 Thread jglisse
From: Jérôme Glisse 

Hi Andrew, I see that you still have my event patch in your queue [1].
This patchset replaces that single patch and is broken down into further
steps so that it is easier to review and ascertain that no mistakes were
made during the mechanical changes. Here are the steps:

Patch 1 - add the enum values
Patch 2 - coccinelle semantic patch to convert all call site of
  mmu_notifier_range_init to default enum value and also
  to passing down the vma when it is available
Patch 3 - update many call site to more accurate enum values
Patch 4 - add the information to the mmu_notifier_range struct
Patch 5 - helper to test if a range is updated to read only

All the remaining patches update various drivers to demonstrate how this
new information gets used by device drivers. I build tested with make all
and make all minus everything that enables mmu notifier, i.e. building
with MMU_NOTIFIER=no. Also tested with some radeon, amd gpu and intel gpu.

If there are no objections I believe the best plan would be to merge the
first 5 patches (all mm changes) through your queue for 5.1 and then to
delay the driver updates to each individual driver tree for 5.2. This will
allow each individual device driver maintainer time to test this more
thoroughly than my own testing.

Note that I also intend to use this feature further in nouveau and
HMM down the road. I also expect that other users like KVM might be
interested in leveraging this new information to optimize some of
their secondary page table invalidations.

Here is an explanation of the rationale for this patchset:


CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

This patch introduces a set of enums that can be associated with each of
the events triggering a mmu notifier. Later patches take advantage of
those enum values.

- UNMAP: munmap() or mremap()
- CLEAR: page table is cleared (migration, compaction, reclaim, ...)
- PROTECTION_VMA: change in access protections for the range
- PROTECTION_PAGE: change in access protections for page in the range
- SOFT_DIRTY: soft dirtyness tracking

Being able to distinguish munmap() and mremap() from the other reasons why
the page table is cleared is important to allow users of the mmu notifier
to update their own internal tracking structures accordingly (on munmap or
mremap it is no longer needed to track the range of virtual addresses as it
becomes invalid).

[1] https://www.ozlabs.org/~akpm/mmotm/broken-out/mm-mmu_notifier-contextual-information-for-event-triggering-invalidation-v2.patch

Cc: Christian König 
Cc: Jan Kara 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 

Jérôme Glisse (9):
  mm/mmu_notifier: contextual information for event enums
  mm/mmu_notifier: contextual information for event triggering
invalidation
  mm/mmu_notifier: use correct mmu_notifier events for each invalidation
  mm/mmu_notifier: pass down vma and reasons why mmu notifier is
happening
  mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper
  gpu/drm/radeon: optimize out the case when a range is updated to read
only
  gpu/drm/amdgpu: optimize out the case when a range is updated to read
only
  gpu/drm/i915: optimize out the case when a range is updated to read
only
  RDMA/umem_odp: optimize out the case when a range is updated to read
only

 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c  | 13 
 drivers/gpu/drm/i915/i915_gem_userptr.c | 16 ++
 drivers/gpu/drm/radeon/radeon_mn.c  | 13 
 drivers/infiniband/core/umem_odp.c  | 22 +++--
 fs/proc/task_mmu.c  |  3 +-
 include/linux/mmu_notifier.h| 42 -
 include/rdma/ib_umem_odp.h  |  1 +
 kernel/events/uprobes.c |  3 +-
 mm/huge_memory.c| 14 +
 mm/hugetlb.c| 11 ---
 mm/khugepaged.c |  3 +-
 mm/ksm.c|  6 ++--
 mm/madvise.c|  3 +-
 mm/memory.c | 25 +--
 mm/migrate.c|  5 ++-
 mm/mmu_notifier.c   | 10 ++
 mm/mprotect.c   |  4 ++-
 mm/mremap.c |  3 +-
 mm/oom_kill.c   |  3 +-
 mm/rmap.c   |  6 ++--
 20 files changed, 171 insertions(+), 35 deletions(-)

-- 
2.17.2



[PATCH v3 3/3] mm/mmu_notifier: contextual information for event triggering invalidation v2

2018-12-13 Thread jglisse
From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

Users of the mmu notifier API track changes to the CPU page table and take
specific actions for them. The current API only provides the range of
virtual addresses affected by the change, not why the change is happening.

This patchset adds event information so that users of the mmu notifier can
differentiate among broad categories:
- UNMAP: munmap() or mremap()
- CLEAR: page table is cleared (migration, compaction, reclaim, ...)
- PROTECTION_VMA: change in access protections for the range
- PROTECTION_PAGE: change in access protections for page in the range
- SOFT_DIRTY: soft dirtyness tracking

Being able to distinguish munmap() and mremap() from the other reasons why
the page table is cleared is important to allow users of the mmu notifier
to update their own internal tracking structures accordingly (on munmap or
mremap it is no longer needed to track the range of virtual addresses as it
becomes invalid).

Changes since v1:
- use mmu_notifier_range_init() helper to optimize out the case
  when mmu notifier is not enabled
- use kernel doc format for describing the enum values

Signed-off-by: Jérôme Glisse 
Acked-by: Christian König 
Acked-by: Jan Kara 
Acked-by: Felix Kuehling 
Acked-by: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 fs/dax.c |  7 +++
 fs/proc/task_mmu.c   |  3 ++-
 include/linux/mmu_notifier.h | 35 +--
 kernel/events/uprobes.c  |  3 ++-
 mm/huge_memory.c | 12 
 mm/hugetlb.c | 10 ++
 mm/khugepaged.c  |  3 ++-
 mm/ksm.c |  6 --
 mm/madvise.c |  3 ++-
 mm/memory.c  | 18 --
 mm/migrate.c |  5 +++--
 mm/mprotect.c|  3 ++-
 mm/mremap.c  |  3 ++-
 mm/oom_kill.c|  2 +-
 mm/rmap.c|  6 --
 15 files changed, 90 insertions(+), 29 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 874085bacaf5..6056b03a1626 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -768,6 +768,13 @@ static void dax_entry_mkclean(struct address_space 
*mapping, pgoff_t index,
 
address = pgoff_address(index, vma);
 
+   /*
+* All the field are populated by follow_pte_pmd() except
+* the event field.
+*/
+   mmu_notifier_range_init(&range, NULL, 0, -1UL,
+   MMU_NOTIFY_PROTECTION_PAGE);
+
/*
 * Note because we provide start/end to follow_pte_pmd it will
 * call mmu_notifier_invalidate_range_start() on our behalf
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b3ddceb003bc..f68a9ebb0218 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1141,7 +1141,8 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
break;
}
 
-   mmu_notifier_range_init(&range, mm, 0, -1UL);
+   mmu_notifier_range_init(&range, mm, 0, -1UL,
+   MMU_NOTIFY_SOFT_DIRTY);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(0, mm->highest_vm_end, _refs_walk);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 39b06772427f..d249e24acea5 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -25,10 +25,39 @@ struct mmu_notifier_mm {
spinlock_t lock;
 };
 
+/**
+ * enum mmu_notifier_event - reason for the mmu notifier callback
+ * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that
+ * move the range
+ *
+ * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
+ * madvise() or replacing a page by another one, ...).
+ *
+ * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range
+ * ie using the vma access permission (vm_page_prot) to update the whole range
+ * is enough no need to inspect changes to the CPU page table (mprotect()
+ * syscall)
+ *
+ * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for
+ * pages in the range so to mirror those changes the user must inspect the CPU
+ * page table (from the end callback).
+ *
+ * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
+ * 

[PATCH v3 1/3] mm/mmu_notifier: use structure for invalidate_range_start/end callback v2

2018-12-13 Thread jglisse
From: Jérôme Glisse 

To avoid having to change many callback definitions every time we want
to add a parameter, use a structure to group all parameters for the
mmu_notifier invalidate_range_start/end callbacks. No functional changes
with this patch.
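
Concretely, a driver callback changes shape roughly as follows (sketch with
a hypothetical driver; the individual hunks below show the real
conversions):

/* Before: adding a parameter meant touching every implementation. */
static int mydrv_invalidate_range_start(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start, unsigned long end,
					bool blockable);

/* After: new information only needs a new field in the structure. */
static int mydrv_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	if (!range->blockable && !mydrv_trylock(mn))
		return -EAGAIN;
	/* use range->mm, range->start and range->end as before */
	mydrv_invalidate(mn, range->start, range->end);
	mydrv_unlock(mn);
	return 0;
}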

Changed since v1:
- fix make htmldocs warning in amdgpu_mn.c

Signed-off-by: Jérôme Glisse 
Acked-by: Jan Kara 
Acked-by: Felix Kuehling 
Acked-by: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c  | 47 +++--
 drivers/gpu/drm/i915/i915_gem_userptr.c | 14 
 drivers/gpu/drm/radeon/radeon_mn.c  | 16 -
 drivers/infiniband/core/umem_odp.c  | 20 +--
 drivers/infiniband/hw/hfi1/mmu_rb.c | 13 +++
 drivers/misc/mic/scif/scif_dma.c| 11 ++
 drivers/misc/sgi-gru/grutlbpurge.c  | 14 
 drivers/xen/gntdev.c| 12 +++
 include/linux/mmu_notifier.h| 14 +---
 mm/hmm.c| 23 +---
 mm/mmu_notifier.c   | 21 +--
 virt/kvm/kvm_main.c | 14 +++-
 12 files changed, 103 insertions(+), 116 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index e55508b39496..3e6823fdd939 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -238,44 +238,40 @@ static void amdgpu_mn_invalidate_node(struct 
amdgpu_mn_node *node,
  * amdgpu_mn_invalidate_range_start_gfx - callback to notify about mm change
  *
  * @mn: our notifier
- * @mm: the mm this callback is about
- * @start: start of updated range
- * @end: end of updated range
+ * @range: mmu notifier context
  *
  * Block for operations on BOs to finish and mark pages as accessed and
  * potentially dirty.
  */
 static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
-struct mm_struct *mm,
-unsigned long start,
-unsigned long end,
-bool blockable)
+   const struct mmu_notifier_range *range)
 {
struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
struct interval_tree_node *it;
+   unsigned long end;
 
/* notification is exclusive, but interval is inclusive */
-   end -= 1;
+   end = range->end - 1;
 
/* TODO we should be able to split locking for interval tree and
 * amdgpu_mn_invalidate_node
 */
-   if (amdgpu_mn_read_lock(amn, blockable))
+   if (amdgpu_mn_read_lock(amn, range->blockable))
return -EAGAIN;
 
-   it = interval_tree_iter_first(&amn->objects, start, end);
+   it = interval_tree_iter_first(&amn->objects, range->start, end);
while (it) {
struct amdgpu_mn_node *node;
 
-   if (!blockable) {
+   if (!range->blockable) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
 
node = container_of(it, struct amdgpu_mn_node, it);
-   it = interval_tree_iter_next(it, start, end);
+   it = interval_tree_iter_next(it, range->start, end);
 
-   amdgpu_mn_invalidate_node(node, start, end);
+   amdgpu_mn_invalidate_node(node, range->start, end);
}
 
return 0;
@@ -294,39 +290,38 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct 
mmu_notifier *mn,
  * are restorted in amdgpu_mn_invalidate_range_end_hsa.
  */
 static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
-struct mm_struct *mm,
-unsigned long start,
-unsigned long end,
-bool blockable)
+   const struct mmu_notifier_range *range)
 {
struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
struct interval_tree_node *it;
+   unsigned long end;
 
/* notification is exclusive, but interval is inclusive */
-   end -= 1;
+   end = range->end - 1;
 
-   if (amdgpu_mn_read_lock(amn, blockable))
+   if (amdgpu_mn_read_lock(amn, range->blockable))
return -EAGAIN;
 
-   it = interval_tree_iter_first(&amn->objects, start, end);
+   it = interval_tree_iter_first(&amn->objects, range->start, end);
while (it) {
struct amdgpu_mn_node *node;
 
