[PATCH 08/14] mm: add struct address_space to releasepage() callback
From: Jérôme Glisse This is part of a patchset to remove the dependency on the struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that the original mapping pointer is preserved and can still be accessed, but at a cost). Add struct address_space to the releasepage() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at the call site (by default this patch just uses NULL for the new argument value). Use the following script (from the root of the Linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all functions which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all functions which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all functions that are used as callback for releasepage. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .releasepage = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (releasepage callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1, I2; type T1, T2; @@ int fn( +struct address_space *__mapping, T1 I1, T2 I2) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1, I2; type T1, T2; @@ int fn( +struct address_space *__mapping, T1 I1, T2 I2); @p2r3 depends on part2@ identifier virtual.fn; expression E1, E2; @@ fn( +MAPPING_NULL, E1, E2) // // Part 3 is grepping all function that are use the callback for releasepage. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2, E3; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->releasepage(E2, E3) | E1->a_ops->releasepage(E2, E3) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... int (*releasepage)( +struct address_space *, struct page *, ...); ... 
}; @p4r2 depends on part4@ expression E1, E2, E3; @@ E1.a_ops->releasepage( +MAPPING_NULL, E2, E3) @p4r3 depends on part4@ expression E1, E2, E3; @@ E1->a_ops->releasepage( +MAPPING_NULL, E2, E3) >% Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- fs/9p/vfs_addr.c | 3 ++- fs/afs/dir.c | 6 -- fs/afs/file.c | 6 -- fs/block_dev.c | 3 ++- fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/inode.c | 5 +++-- fs/ceph/addr.c | 3 ++- fs/cifs/file.c | 3 ++- fs/erofs/super.c | 5 +++-- fs/ext4/inode.c| 3 ++-
[PATCH 14/14] mm: add struct address_space to is_dirty_writeback() callback
From: Jérôme Glisse This is part of a patchset to remove the dependency on the struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that the original mapping pointer is preserved and can still be accessed, but at a cost). Add struct address_space to the is_dirty_writeback() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at the call site (by default this patch just uses NULL for the new argument value). Use the following script (from the root of the Linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all functions which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all functions which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all functions that are used as callback for is_dirty_writeback. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .is_dirty_writeback = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback @p2r1 depends on part2@ identifier virtual.fn; identifier I1, I2, I3; type T1, T2, T3; @@ void fn( +struct address_space *__mapping, T1 I1, T2 I2, T3 I3) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1, I2, I3; type T1, T2, T3; @@ void fn( +struct address_space *__mapping, T1 I1, T2 I2, T3 I3); @p2r3 depends on part2@ identifier virtual.fn; expression E1, E2, E3; @@ fn( +MAPPING_NULL, E1, E2, E3) // // Part 3 is grepping all function that are use the callback for is_dirty_writeback. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2, E3, E4; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->is_dirty_writeback(E2, E3, E4) | E1->a_ops->is_dirty_writeback(E2, E3, E4) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... void (*is_dirty_writeback)( +struct address_space *, struct page *, ...); ... 
}; @p4r2 depends on part4@ expression E1, E2, E3, E4; @@ E1.a_ops->is_dirty_writeback( +MAPPING_NULL, E2, E3, E4) @p4r3 depends on part4@ expression E1, E2, E3, E4; @@ E1->a_ops->is_dirty_writeback( +MAPPING_NULL, E2, E3, E4) >% Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- fs/buffer.c | 3 ++- fs/nfs/file.c | 5 +++-- include/linux/buffer_head.h | 3 ++- include/linux/fs.h | 3 ++- mm/vmscan.c | 3 ++- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c
[PATCH 09/14] mm: add struct address_space to freepage() callback
From: Jérôme Glisse This is part of a patchset to remove the dependency on the struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that the original mapping pointer is preserved and can still be accessed, but at a cost). Add struct address_space to the freepage() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at the call site (by default this patch just uses NULL for the new argument value). Use the following script (from the root of the Linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all functions which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all functions which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all functions that are used as callback for freepage. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .freepage = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (freepage callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1; type T1; @@ void fn( +struct address_space *__mapping, T1 I1) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1; type T1; @@ void fn( +struct address_space *__mapping, T1 I1); @p2r3 depends on part2@ identifier virtual.fn; expression E1; @@ fn( +MAPPING_NULL, E1) // // Part 3 is grepping all function that use the callback for freepage. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->freepage(E2) | E1->a_ops->freepage(E2) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... void (*freepage)( +struct address_space *, struct page *, ...); ... 
}; @p4r2 depends on part4@ expression E1, E2; @@ E1.a_ops->freepage( +MAPPING_NULL, E2) @p4r3 depends on part4@ expression E1, E2; @@ E1->a_ops->freepage( +MAPPING_NULL, E2) >% Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- fs/nfs/dir.c| 9 + fs/orangefs/inode.c | 3 ++- include/linux/fs.h | 2 +- mm/filemap.c| 4 ++-- mm/truncate.c | 2 +- mm/vmscan.c | 2 +- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5a5c021967d3f..d8e66c98db3ea 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -53,7 +53,7 @@ static int nfs_closedir(struct inode *, struct file *);
[PATCH 13/14] mm: add struct address_space to isolate_page() callback
From: Jérôme Glisse This is part of a patchset to remove the dependency on the struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that the original mapping pointer is preserved and can still be accessed, but at a cost). Add struct address_space to the isolate_page() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at the call site (by default this patch just uses NULL for the new argument value). Use the following script (from the root of the Linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all functions which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all functions which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all functions that are used as callback for isolate_page. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .isolate_page = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (isolate_page callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1, I2; type R, T1, T2; @@ R fn( +struct address_space *__mapping, T1 I1, T2 I2) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1, I2; type R, T1, T2; @@ R fn( +struct address_space *__mapping, T1 I1, T2 I2); @p2r3 depends on part2@ identifier virtual.fn; expression E1, E2; @@ fn( +MAPPING_NULL, E1, E2) // // Part 3 is grepping all function that are use the callback for isolate_page. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2, E3; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->isolate_page(E2, E3) | E1->a_ops->isolate_page(E2, E3) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ type R; @@ struct address_space_operations { ... R (*isolate_page)( +struct address_space *, struct page *, ...); ... 
}; @p4r2 depends on part4@ expression E1, E2, E3; @@ E1.a_ops->isolate_page( +MAPPING_NULL, E2, E3) @p4r3 depends on part4@ expression E1, E2, E3; @@ E1->a_ops->isolate_page( +MAPPING_NULL, E2, E3) >% Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- include/linux/balloon_compaction.h | 5 +++-- include/linux/fs.h | 3 ++- mm/balloon_compaction.c| 3 ++- mm/migrate.c | 2 +- mm/z3fold.c| 3 ++- mm/zsmalloc.c | 3 ++- 6 files changed, 12 insertions(+), 7 deletions(-)
[PATCH 12/14] mm: add struct address_space to is_partially_uptodate() callback
From: Jérôme Glisse This is part of a patchset to remove the dependency on the struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that the original mapping pointer is preserved and can still be accessed, but at a cost). Add struct address_space to the is_partially_uptodate() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at the call site (by default this patch just uses NULL for the new argument value). Use the following script (from the root of the Linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all functions which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all functions which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all functions that are used as callback for is_partially_uptodate. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .is_partially_uptodate = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (is_partially_uptodate callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1, I2, I3; type T1, T2, T3; @@ int fn( +struct address_space *__mapping, T1 I1, T2 I2, T3 I3) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1, I2, I3; type T1, T2, T3; @@ int fn( +struct address_space *__mapping, T1 I1, T2 I2, T3 I3); @p2r3 depends on part2@ identifier virtual.fn; expression E1, E2, E3; @@ fn( +MAPPING_NULL, E1, E2, E3) // // Part 3 is grepping all function that are use the callback for is_partially_uptodate. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2, E3, E4; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->is_partially_uptodate(E2, E3, E4) | E1->a_ops->is_partially_uptodate(E2, E3, E4) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() @p3r3 depends on part3 exists@ struct address_space_operations *AOPS; expression E1, E2, E3; identifier FN; position P; @@ FN@P(...) {... 
AOPS->is_partially_uptodate(E1, E2, E3) ...} @script:python p3r4 depends on p3r3@ P << p3r3.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... int (*is_partially_uptodate)( +struct address_space *, struct page *, ...); ... }; @p4r2 depends on part4@ expression E1, E2, E3, E4; @@ E1.a_ops->is_partially_uptodate( +MAPPING_NULL, E2, E3, E4) @p4r3 depends on part4@ expression E1, E2, E3, E4; @@ E1->a_ops->is_partially_uptodate( +MAPPING_NULL, E2, E3, E4) @p4r4 depends on part4 exists@ struct address_space_operations *AOPS; expression E1, E2, E3; @@ {... AOPS->is_partially_uptodate(
[PATCH 11/14] mm: add struct address_space to launder_page() callback
From: Jérôme Glisse This is part of a patchset to remove the dependency on the struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that the original mapping pointer is preserved and can still be accessed, but at a cost). Add struct address_space to the launder_page() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at the call site (by default this patch just uses NULL for the new argument value). Use the following script (from the root of the Linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all functions which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all functions which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all functions that are used as callback for launder_page. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .launder_page = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (launder_page callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1; type T1; @@ int fn( +struct address_space *__mapping, T1 I1) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1; type T1; @@ int fn( +struct address_space *__mapping, T1 I1); @p2r3 depends on part2@ identifier virtual.fn; type T1; @@ int fn( +struct address_space *__mapping, T1); @p2r4 depends on part2@ identifier virtual.fn; expression E1; @@ fn( +MAPPING_NULL, E1) // // Part 3 is grepping all function that are use the callback for launder_page. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->launder_page(E2) | E1->a_ops->launder_page(E2) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... int (*launder_page)( +struct address_space *, struct page *); ... 
}; @p4r2 depends on part4@ expression E1, E2; @@ E1.a_ops->launder_page( +MAPPING_NULL, E2) @p4r3 depends on part4@ expression E1, E2; @@ E1->a_ops->launder_page( +MAPPING_NULL, E2) >% Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- fs/9p/vfs_addr.c| 3 ++- fs/afs/internal.h | 2 +- fs/afs/write.c | 2 +- fs/cifs/file.c | 3 ++- fs/fuse/file.c | 3 ++- fs/nfs/file.c | 3 ++- fs/orangefs/inode.c | 17 + include/linux/fs.h | 2 +- mm/truncate.c | 2 +- 9 files
[PATCH 10/14] mm: add struct address_space to putback_page() callback
From: Jérôme Glisse This is part of a patchset to remove the dependency on the struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that the original mapping pointer is preserved and can still be accessed, but at a cost). Add struct address_space to the putback_page() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at the call site (by default this patch just uses NULL for the new argument value). Use the following script (from the root of the Linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all functions which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all functions which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all functions that are used as callback for putback_page. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .putback_page = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (putback_page callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1; type T1; @@ void fn( +struct address_space *__mapping, T1 I1) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1; type T1; @@ void fn( +struct address_space *__mapping, T1 I1); @p2r3 depends on part2@ identifier virtual.fn; expression E1; @@ fn( +MAPPING_NULL, E1) // // Part 3 is grepping all function that are use the callback for putback_page. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->putback_page(E2) | E1->a_ops->putback_page(E2) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... void (*putback_page)( +struct address_space *, struct page *); ... 
}; @p4r2 depends on part4@ expression E1, E2; @@ E1.a_ops->putback_page( +MAPPING_NULL, E2) @p4r3 depends on part4@ expression E1, E2; @@ E1->a_ops->putback_page( +MAPPING_NULL, E2) >% Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- include/linux/balloon_compaction.h | 6 -- include/linux/fs.h | 2 +- mm/balloon_compaction.c| 2 +- mm/migrate.c | 2 +- mm/z3fold.c| 3 ++- mm/zsmalloc.c | 3 ++- 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/balloon_compaction.h
[PATCH 07/14] mm: add struct address_space to invalidatepage() callback
From: Jérôme Glisse This is part of a patchset to remove the dependency on the struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that the original mapping pointer is preserved and can still be accessed, but at a cost). Add struct address_space to the invalidatepage() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at the call site (by default this patch just uses NULL for the new argument value). Use the following script (from the root of the Linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all functions which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all functions which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all functions that are used as callback for invalidatepage. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .invalidatepage = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (invalidatepage callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1, I2, I3; type T1, T2, T3; @@ void fn( +struct address_space *__mapping, T1 I1, T2 I2, T3 I3) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1, I2, I3; type T1, T2, T3; @@ void fn( +struct address_space *__mapping, T1 I1, T2 I2, T3 I3); @p2r3 depends on part2@ identifier virtual.fn; expression E1, E2, E3; @@ fn( +MAPPING_NULL, E1, E2, E3) // // Part 3 is grepping all function that are use the callback for invalidatepage. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/truncate.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2, E3, E4; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->invalidatepage(E2, E3, E4) | E1->a_ops->invalidatepage(E2, E3, E4) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... void (*invalidatepage)( +struct address_space *, struct page *, ...); ... 
}; @p4r2 depends on part4@ expression E1, E2, E3, E4; @@ E1.a_ops->invalidatepage( +MAPPING_NULL, E2, E3, E4) @p4r3 depends on part4@ expression E1, E2, E3, E4; @@ E1->a_ops->invalidatepage( +MAPPING_NULL, E2, E3, E4) @p4r4 depends on part4 exists@ identifier I1, FN; expression E1; @@ FN (...) {... void (*I1)(struct page *, unsigned int, unsigned int); ... I1 = E1->a_ops->invalidatepage; ...} @p4r5 depends on p4r4 exists@ expression E1, E2, E3; identifier I1, p4r4.FN; @@ FN(...) {... void (*I1)( +struct address_space *, struct page *, unsigned int, unsigned int); ... (*I1)( +MAPPING_NULL, E1, E2, E3); ...} @p4r6 depends on part4@ expression E1, E2, E3; @@ {... -void (*invalidatepage)(struct
[PATCH 02/14] fs: define filler_t as a function pointer type
From: Jérôme Glisse Coccinelle is confused by filler_t not being a regular function pointer type. As there is no reason to define filler_t as a non-pointer type, redefine it as a function pointer type and update the function prototypes accordingly. Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- fs/nfs/dir.c| 2 +- fs/nfs/symlink.c| 4 ++-- include/linux/pagemap.h | 6 +++--- mm/filemap.c| 5 ++--- mm/readahead.c | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index cb52db9a0cfb7..da1fe71ae810d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -740,7 +740,7 @@ static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { return read_cache_page(desc->file->f_mapping, desc->page_index, - nfs_readdir_filler, desc); + (filler_t)nfs_readdir_filler, desc); } /* diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 25ba299fdac2e..76691d94ae5f8 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -66,8 +66,8 @@ static const char *nfs_get_link(struct dentry *dentry, err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) return err; - page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler, - inode); + page = read_cache_page(&inode->i_data, 0, + (filler_t)nfs_symlink_filler, inode); if (IS_ERR(page)) return ERR_CAST(page); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7de11dcd534d6..9acfc605b3bc3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -264,7 +264,7 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x) return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } -typedef int filler_t(void *, struct page *); +typedef int (*filler_t)(void *, struct page *); pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); @@ -425,11 +425,11 @@ static inline struct
page *grab_cache_page(struct address_space *mapping, } extern struct page * read_cache_page(struct address_space *mapping, - pgoff_t index, filler_t *filler, void *data); + pgoff_t index, filler_t filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, - struct list_head *pages, filler_t *filler, void *data); + struct list_head *pages, filler_t filler, void *data); static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) diff --git a/mm/filemap.c b/mm/filemap.c index 99c49eeae71b8..2cdbbffc55522 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2942,8 +2942,7 @@ static struct page *wait_on_page_read(struct page *page) } static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), + pgoff_t index, filler_t filler, void *data, gfp_t gfp) { @@ -3064,7 +3063,7 @@ static struct page *do_read_cache_page(struct address_space *mapping, */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *, struct page *), + filler_t filler, void *data) { return do_read_cache_page(mapping, index, filler, data, diff --git a/mm/readahead.c b/mm/readahead.c index 3c9a8dd7c56c8..cd67c9cfa931a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -87,7 +87,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, * Returns: %0 on success, error return by @filler otherwise */ int read_cache_pages(struct address_space *mapping, struct list_head *pages, - int (*filler)(void *, struct page *), void *data) + filler_t filler, void *data) { struct page *page; int ret = 0; -- 2.26.2
[PATCH 00/14] Small step toward KSM for file back page.
From: Jérôme Glisse This patchset is a step toward a larger objective: generalize existing KSM into a mechanism allowing exclusive write control for a page; either anonymous memory (like KSM today) or file back page (modulo GUP which would block that like it does today for KSM). Exclusive write control of a page allows multiple different features to be implemented: - KSM kernel share page, ie de-duplicate pages with same content to use a single page for all. From many pages to one read only page. We have that today for anonymous memory only. The overall patchset extends it to file back page ie sharing the same struct page across different files or across the same file. This can be useful for containers for instance ... or for deduplication in same file. - NUMA duplication, duplicate a page into multiple local read only copies. This is the opposite of KSM in a sense: instead of saving memory, it uses more memory to get better memory access performance. For instance duplicating libc code to local node copy; or big read only dataset duplicated on each node. - Exclusive write access, owner of page write protection is the only one that can write to the page (and must still abide by fs rules for fileback page in respect to writeback...). One use case is for fast atomic operation using non atomic instruction. For instance by PCIE device, if all mappings of the page are read only then PCIE device driver knows device write can not race with CPU write. This is a performance optimization. - Use main memory as cache for persistent memory ie the page is read only and write will trigger callback and different strategies can be used like write combining (ie accumulating changes in main memory before copying to persistent memory). Like KSM today such protection can be broken at _any_ time. The owner of the protection gets a callback (KSM code for instance gets called) so that it can unprotect the page. Breaking protection should not block and must happen quickly (like KSM code today).
Conversion of existing KSM into a generic mechanism is straightforward for anonymous pages (just factor out the KSM code that deals with page protection from the KSM code that deals with de-duplication). The big change here is the support for file back pages. The idea to achieve it is that we almost always have the mapping a page belongs to within the call stack as we operate on such page either from: - Syscall/kernel against a file (file -> inode -> mapping). - Syscall/kernel against virtual address (vma -> file -> mapping). - Write back for a given mapping (mapping -> pages). There are a few exceptions: - Reclaim, but reclaim does not care about mapping. Reclaim wants to unmap page to free it up. So all we have to do is provide a special path to do that just like KSM does today for anonymous pages. - Compaction, again we do not care about the mapping for compaction. All we need is a way to move page (ie migrate). - Flush data cache: on some architectures the cache lines are tagged with the virtual address so when flushing a page we need to find all of its virtual addresses. Again we do not care about the mapping, we just need a way to find all virtual addresses in all processes pointing to the page. - GUP users that want to set a page dirty. This is easy, we just do not allow protection to work on GUPed page and GUP also will break the protection. There is just no way to synchronize with GUP users as they violate all mm and fs rules anyway. - Some proc fs and other memory debugging API. Here we do not care about the mapping but about the page states. Also some of those APIs work on virtual addresses for which we can easily get the vma and thus the mapping. So when we have the mapping for a page from the context and not from page->mapping then we can use it as a key to lookup private and index fields value for the page. To avoid any regression risk, only protected pages see their fields overloaded.
It means that if you are not using the page protection then the page->mapping, page->private and page->index all stay as they are today. Also page->mapping is always used as the canonical place to lookup the page mapping for unprotected pages so that any existing code will keep working as it does today even if the mapping we get from the context does not match the page->mapping. More on this below. Overview: = The core idea is pretty dumb, it is just about passing a new mapping argument to every function that gets a page and needs the mapping corresponding to that page. Most of the changes are done through semantic patches. Adding a new function argument by itself does not bring any risk. The risk is in making sure that the mapping we pass as function argument is the one corresponding to the page. To avoid any regression we keep using page->mapping as the canonical mapping even
[PATCH 04/14] mm: add struct address_space to readpage() callback
From: Jérôme Glisse This is part of patchset to remove dependency on struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that original mapping pointer is preserved and can still be accessed but at a cost). Add struct address_space to readpage() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at call site (by default this patch just use NULL for new argument value). Use following script (from root of linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all function which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all funcitons which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all function that are use as callback for readpage. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .readpage = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() @p1r4 depends on part1 exists@ expression E1, E2, E3; identifier FN; type T1; @@ {... ( read_cache_page(E1, E2, (T1)FN, E3) | read_cache_pages(E1, E2, (T1)FN, E3) ) ...} @script:python p1r5 depends on p1r4@ funcname << p1r4.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') print(funcname) file.close() // --- // Part 2 modify callback // Add address_space argument to the function (readpage callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1, I2; type T1, T2; @@ int fn(T1 I1, +struct address_space *__mapping, T2 I2) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1, I2; type T1, T2; @@ int fn(T1 I1, +struct address_space *__mapping, T2 I2); @p2r3 depends on part2@ identifier virtual.fn; type T1, T2; @@ int fn(T1, +struct address_space *, T2); @p2r4 depends on part2@ identifier virtual.fn; expression E1, E2; @@ fn(E1, +MAPPING_NULL, E2) // // Part 3 is grepping all function that are use the callback for readpage. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2, E3; identifier FN; position P; @@ FN@P(...) {... 
( E1.a_ops->readpage(E2, E3) | E1->a_ops->readpage(E2, E3) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() @p3r4 depends on part3 exists@ expression E1, E2, E3, E4; identifier FN; position P; @@ FN@P(...) {... ( read_cache_page(E1, E2, E3, E4) | read_cache_pages(E1, E2, E3, E4) ) ...} @script:python p3r5 depends on p3r4@ P << p3r4.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... int (*readpage)(struct file *, +struct address_space *, struct page *); ... }; @p4r2 depends on part4@ expression E1, E2, E3; @@
[PATCH 01/14] mm/pxa: page exclusive access add header file for all helpers.
From: Jérôme Glisse Add include/linux/page-xa.h where all helpers related to Page eXclusive Access (PXA) will be added (in following patches). Also introduce MAPPING_NULL as a temporary define used to simplify the mass modifications to stop relying on struct page.mapping and instead pass down mapping pointer from the context (either from inode when in syscall operating on a file or from vma->vm_file when operating on some virtual address). This is a temporary define, do not use! Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- include/linux/mm.h | 5 include/linux/page-xa.h | 66 + 2 files changed, 71 insertions(+) create mode 100644 include/linux/page-xa.h diff --git a/include/linux/mm.h b/include/linux/mm.h index 16b799a0522cd..d165961c58c45 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3130,5 +3130,10 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; + +/* Page exclusive access do depend on some helpers define in here. */ +#include <linux/page-xa.h> + + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/page-xa.h b/include/linux/page-xa.h new file mode 100644 index 0..8ac9e6dc051e0 --- /dev/null +++ b/include/linux/page-xa.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Page eXclusive Acess (PXA) is a generic mechanism to allow exclusive access + * to a file back or an anonymous page. Exclusive access means that no one can + * write to page except the owner of the protection (but the page can still be + * read). The exclusive access can be _broken_ at anytime and this can not be + * block (so anyone using that feature must be ready to give away the exclusive + * access at _any_ time and must do so in a timely fashion).
+ * + * Using PXA allows to implement few different features: + * - KSM (Kernel Shared Memory) where page with same content are deduplicated + *using a unique page and all mapping are updated to read only. This allow + *to save memory for workload with a lot of pages in different process that + *end up with same content (multiple VM for instance). + * + * - NUMA duplication (sort of the opposite of KSM) here a page is duplicated + *into multiple read only copy with each copy using physical memory local a + *NUMA node (or a device). This allow to improve performance by minimizing + *cross node memory transaction and also help minimizing bus traffic. It + *does however use more memory, so what you gain in performance you loose + *in available resources. + * + * - Exclusive write access to a page, for instance you can use regular write + *instruction and still get atomic behavior (as you are the only being able + *to write you the garantee that no one can race with you). + * + * And any other use cases you can think of ... + * + * See Documentation/vm/page-xa.rst for further informations. + * + * Authors: + * Jérôme Glisse + */ +#ifndef LINUX_PAGE_XA_H +#define LINUX_PAGE_XA_H + +#include +#include + + +/* + * MAPPING_NULL this is temporary define use to simplify the mass modificaitons + * to stop relying on struct page.mapping and instead pass down mapping pointer + * from the context (either from inode when in syscall operating on a file or + * from vma->vm_file when operating on some virtual address range). + * + * DO NOT USE ! THIS IS ONLY FOR SEMANTIC PATCHES SIMPLIFICATION ! + */ +#define MAPPING_NULL NULL + + +/** + * PageXA() - is page under exclusive acces ? + * + * This function checks if a page is under exclusive access. + * + * @page: Pointer to page to be queried. + * @Return: True, if it is under exclusive access, false otherwise. + */ +static inline bool PageXA(struct page *page) +{ + return false; +} + + +#endif /* LINUX_PAGE_XA_H */ -- 2.26.2
[PATCH 05/14] mm: add struct address_space to writepage() callback
From: Jérôme Glisse This is part of patchset to remove dependency on struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that original mapping pointer is preserved and can still be accessed but at a cost). Add struct address_space to writepage() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at call site (by default this patch just use NULL for new argument value). Use following script (from root of linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all function which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all funcitons which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all function that are use as callback for writepage. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .writepage = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (writepage callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1, I2; type T1, T2; @@ int fn( +struct address_space *__mapping, T1 I1, T2 I2) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1, I2; type T1, T2; @@ int fn( +struct address_space *__mapping, T1 I1, T2 I2); @p2r3 depends on part2@ identifier virtual.fn; type T1, T2; @@ int fn( +struct address_space *__mapping, T1, T2); @p2r4 depends on part2@ identifier virtual.fn; expression E1, E2; @@ fn( +MAPPING_NULL, E1, E2) // // Part 3 is grepping all function that are use the callback for writepage. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.write("./mm/readahead.c\n") file.write("./mm/filemap.c\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2, E3; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->writepage(E2, E3) | E1->a_ops->writepage(E2, E3) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... int (*writepage)( +struct address_space *, struct page *page, ...); ... 
}; @p4r2 depends on part4@ expression E1, E2, E3; @@ E1.a_ops->writepage( +MAPPING_NULL, E2, E3) @p4r3 depends on part4@ expression E1, E2, E3; @@ E1->a_ops->writepage( +MAPPING_NULL, E2, E3) >% Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 3 ++- fs/9p/vfs_addr.c | 4 +++- fs/adfs/inode.c | 3 ++- fs/affs/file.c| 3 ++- fs/afs/internal.h
[PATCH 06/14] mm: add struct address_space to set_page_dirty() callback
From: Jérôme Glisse This is part of patchset to remove dependency on struct page.mapping field so that we can temporarily update it to point to a special structure tracking temporary page state (note that original mapping pointer is preserved and can still be accessed but at a cost). Add struct address_space to set_page_dirty() callback arguments. Note that this patch does not make use of the new argument, nor does it use a valid one at call site (by default this patch just use NULL for new argument value). Use following script (from root of linux kernel tree): ./that-script.sh that-semantic-patch.spatch %< #!/bin/sh spatch_file=$1 echo PART1 === # P1 find callback functions name spatch --dir . --no-includes -D part1 --sp-file $spatch_file echo PART2 === # P2 change callback function prototype cat /tmp/unicorn-functions | sort | uniq | while read func ; do for file in $( git grep -l $func -- '*.[ch]' ) ; do echo $file spatch --no-includes --in-place -D part2 \ -D fn=$func --sp-file $spatch_file $file done done echo PART 3 == # P3 find all function which call the callback spatch --dir . --include-headers -D part3 --sp-file $spatch_file echo PART 4=== # P4 change all funcitons which call the callback cat /tmp/unicorn-files | sort | uniq | while read file ; do echo $file spatch --no-includes --in-place -D part4 \ --sp-file $spatch_file $file done >% With the following semantic patch: %< virtual part1, part2, part3, part4 // // Part 1 is grepping all function that are use as callback for set_page_dirty. 
// initialize file where we collect all function name (erase it) @initialize:python depends on part1@ @@ file=open('/tmp/unicorn-functions', 'w') file.close() // match function name use as a callback @p1r2 depends on part1@ identifier I1, FN; @@ struct address_space_operations I1 = {..., .set_page_dirty = FN, ...}; @script:python p1r3 depends on p1r2@ funcname << p1r2.FN; @@ if funcname != "NULL": file=open('/tmp/unicorn-functions', 'a') file.write(funcname + '\n') file.close() // --- // Part 2 modify callback // Add address_space argument to the function (set_page_dirty callback one) @p2r1 depends on part2@ identifier virtual.fn; identifier I1; type T1; @@ int fn( +struct address_space *__mapping, T1 I1) { ... } @p2r2 depends on part2@ identifier virtual.fn; identifier I1; type T1; @@ int fn( +struct address_space *__mapping, T1 I1); @p2r3 depends on part2@ identifier virtual.fn; type T1; @@ int fn( +struct address_space *, T1); @p2r4 depends on part2@ identifier virtual.fn; expression E1; @@ fn( +MAPPING_NULL, E1) // // Part 3 is grepping all function that are use the callback for set_page_dirty. // initialize file where we collect all function name (erase it) @initialize:python depends on part3@ @@ file=open('/tmp/unicorn-files', 'w') file.write("./include/linux/pagemap.h\n") file.write("./mm/page-writeback.c\n") file.write("./include/linux/mm.h\n") file.write("./include/linux/fs.h\n") file.close() @p3r1 depends on part3 exists@ expression E1, E2; identifier FN; position P; @@ FN@P(...) {... ( E1.a_ops->set_page_dirty(E2) | E1->a_ops->set_page_dirty(E2) ) ...} @script:python p3r2 depends on p3r1@ P << p3r1.P; @@ file=open('/tmp/unicorn-files', 'a') file.write(P[0].file + '\n') file.close() // --- // Part 4 generic modification @p4r1 depends on part4@ @@ struct address_space_operations { ... int (*set_page_dirty)( +struct address_space *, struct page *page); ... 
}; @p4r2 depends on part4@ expression E1, E2; @@ E1.a_ops->set_page_dirty( +MAPPING_NULL, E2) @p4r3 depends on part4@ expression E1, E2; @@ E1->a_ops->set_page_dirty( +MAPPING_NULL, E2) @p4r4 depends on part4@ @@ {... -int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; +int (*spd)(struct address_space *, struct page *) = mapping->a_ops->set_page_dirty; ... return (*spd)( +MAPPING_NULL, page); ...} >% Signed-off-by: Jérôme Glisse Cc: linux...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: Andrew Morton Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik --- drivers/video/fbdev/core/fb_defio.c | 3 ++- fs/afs/dir.c
[PATCH 03/14] fs: directly use a_ops->freepage() instead of a local copy of it.
From: Jérôme Glisse Coccinelle is confused by function pointers, so convert to directly use a_ops->freepage() to be nice to coccinelle. Signed-off-by: Jérôme Glisse Cc: linux-fsde...@vger.kernel.org Cc: linux...@kvack.org Cc: Alexander Viro Cc: Tejun Heo Cc: Jan Kara Cc: Josef Bacik Cc: Andrew Morton --- mm/filemap.c | 12 mm/vmscan.c | 7 ++- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 2cdbbffc55522..ba892599a2717 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -242,11 +242,8 @@ void __delete_from_page_cache(struct page *page, void *shadow) static void page_cache_free_page(struct address_space *mapping, struct page *page) { - void (*freepage)(struct page *); - - freepage = mapping->a_ops->freepage; - if (freepage) - freepage(page); + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(page); if (PageTransHuge(page) && !PageHuge(page)) { page_ref_sub(page, HPAGE_PMD_NR); @@ -790,7 +787,6 @@ EXPORT_SYMBOL(file_write_and_wait_range); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { struct address_space *mapping = old->mapping; - void (*freepage)(struct page *) = mapping->a_ops->freepage; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); unsigned long flags; @@ -819,8 +815,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_lruvec_page_state(new, NR_SHMEM); xas_unlock_irqrestore(&xas, flags); - if (freepage) - freepage(old); + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(old); put_page(old); return 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 466fc3144fffc..6db869339073d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -903,9 +903,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { - void (*freepage)(struct page *); - - freepage = mapping->a_ops->freepage; /* * Remember a shadow entry for
reclaimed file cache in * order to detect refaults, thus thrashing, later on. @@ -928,8 +925,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, __delete_from_page_cache(page, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); - if (freepage != NULL) - freepage(page); + if (mapping->a_ops->freepage != NULL) + mapping->a_ops->freepage(page); } return 1; -- 2.26.2
[PATCH] mm/hmm: move THP and hugetlbfs code path behind #if KCONFIG
From: Jérôme Glisse To avoid any undefined symbol build warning or error, move THP and hugetlbfs code behind kconfig #if/#else/#endif against appropriate Kconfig option. Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: John Hubbard Cc: Andrew Morton --- mm/hmm.c | 9 + 1 file changed, 9 insertions(+) diff --git a/mm/hmm.c b/mm/hmm.c index ecd16718285e..a8a950fe46b6 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -520,6 +520,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, uint64_t *pfns, pmd_t pmd) { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; @@ -550,6 +551,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, } hmm_vma_walk->last = end; return 0; +#else + /* If THP is not enabled then we should never reach that code ! */ + return -EINVAL; +#endif } static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) @@ -792,6 +797,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); +#ifdef CONFIG_HUGETLB_PAGE pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); for (i = 0; i < npages; ++i, ++pfn) { hmm_vma_walk->pgmap = get_dev_pagemap(pfn, @@ -807,6 +813,9 @@ static int hmm_vma_walk_pud(pud_t *pudp, } hmm_vma_walk->last = end; return 0; +#else + return -EINVAL; +#endif } split_huge_pud(walk->vma, pudp, addr); -- 2.20.1
[PATCH] mm/hmm: add ARCH_HAS_HMM_MIRROR ARCH_HAS_HMM_DEVICE Kconfig
From: Jérôme Glisse This patch just adds 2 new Kconfig options that are _not used_ by anyone. I checked that various make ARCH=somearch allmodconfig builds work and do not complain. These new Kconfig options need to be added first so that device drivers that depend on HMM can be updated. Once drivers are updated I can update the HMM Kconfig to depend on these new Kconfig options in a followup patch. Signed-off-by: Jérôme Glisse Cc: Guenter Roeck Cc: Leon Romanovsky Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- mm/Kconfig | 16 1 file changed, 16 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..daadc9131087 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -676,6 +676,22 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM_MIRROR + bool + default y + depends on (X86_64 || PPC64) + depends on MMU && 64BIT + +config ARCH_HAS_HMM_DEVICE + bool + default y + depends on (X86_64 || PPC64) + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + depends on ARCH_HAS_ZONE_DEVICE + select XARRAY_MULTI + config ARCH_HAS_HMM bool default y -- 2.20.1
[PATCH v4 1/1] RDMA/odp: convert to use HMM for ODP v4
From: Jérôme Glisse Convert ODP to use HMM so that we can build on common infrastructure for different class of devices that want to mirror a process address space into a device. There is no functional changes. Changes since v3: - fix Kconfig to properly depends on HMM, also make sure things build properly if on demand paging is not enabled Changes since v2: - rebase on top of newer HMM patchset and mmu notifier patchset Changes since v1: - improved comments - simplified page alignment computation Signed-off-by: Jérôme Glisse Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Doug Ledford Cc: Artemy Kovalyov Cc: Moni Shoua Cc: Mike Marciniszyn Cc: Kaike Wan Cc: Dennis Dalessandro Cc: linux-r...@vger.kernel.org --- drivers/infiniband/Kconfig | 3 +- drivers/infiniband/core/umem_odp.c | 499 - drivers/infiniband/hw/mlx5/mem.c | 20 +- drivers/infiniband/hw/mlx5/mr.c| 2 +- drivers/infiniband/hw/mlx5/odp.c | 106 +++--- include/rdma/ib_umem_odp.h | 49 ++- 6 files changed, 231 insertions(+), 448 deletions(-) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a1fb840de45d..8002ca65898a 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -64,7 +64,8 @@ config INFINIBAND_USER_MEM config INFINIBAND_ON_DEMAND_PAGING bool "InfiniBand on-demand paging support" depends on INFINIBAND_USER_MEM - select MMU_NOTIFIER + depends on ARCH_HAS_HMM + select HMM_MIRROR default y ---help--- On demand paging support for the InfiniBand subsystem. diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index bcd53f302df2..139f520e733d 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -46,6 +46,26 @@ #include #include + +static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = { + ODP_READ_BIT, /* HMM_PFN_VALID */ + ODP_WRITE_BIT, /* HMM_PFN_WRITE */ + /* +* The ODP_DEVICE_BIT is not use by ODP but is here to comply +* with HMM API which also catter to device with local memory. 
+* RDMA devices do not have any such memory and thus do not +* have a real use for that flag. +*/ + ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */ +}; + +static uint64_t odp_hmm_values[HMM_PFN_VALUE_MAX] = { + -1UL, /* HMM_PFN_ERROR */ + 0UL,/* HMM_PFN_NONE */ + -2UL, /* HMM_PFN_SPECIAL */ +}; + + /* * The ib_umem list keeps track of memory regions for which the HW * device request to receive notification when the related memory @@ -78,57 +98,25 @@ static u64 node_last(struct umem_odp_node *n) INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(_odp->umem_mutex); - if (umem_odp->notifiers_count++ == 0) - /* -* Initialize the completion object for waiting on -* notifiers. Since notifier_count is zero, no one should be -* waiting right now. -*/ - reinit_completion(_odp->notifier_completion); - mutex_unlock(_odp->umem_mutex); -} - -static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(_odp->umem_mutex); - /* -* This sequence increase will notify the QP page fault that the page -* that is going to be mapped in the spte could have been freed. -*/ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(_odp->notifier_completion); - mutex_unlock(_odp->umem_mutex); -} - static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct ib_umem *umem = _odp->umem; - /* -* Increase the number of notifiers running, to -* prevent any further fault handling on this MR. -*/ - ib_umem_notifier_start_account(umem_odp); umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. 
*/ smp_wmb(); - complete_all(_odp->notifier_completion); umem->context->invalidate_range(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); return 0; } -static void ib_umem_notifier_release(struct mmu_notifier *mn, -struct mm_struct *mm) +static void ib_umem_notifier_release(struct hmm_mirror *mirror) { - struct ib_ucontext_per_mm *per_mm = - container_of(mn, struct ib_ucontext_per_mm, mn); + struct ib_ucontext_per_mm *per_mm; + + per_mm = container_of(mirror, struct
[PATCH v4 0/1] Use HMM for ODP v4
From: Jérôme Glisse Just fixed Kconfig and build when ODP was not enabled, other than that this is the same as v3. Here is the previous cover letter: Git tree with all prerequisites: https://cgit.freedesktop.org/~glisse/linux/log/?h=rdma-odp-hmm-v4 This patchset converts RDMA ODP to use HMM underneath. This is motivated by stronger code sharing for the same feature (Shared Virtual Memory, SVM, or Shared Virtual Address, SVA) and also stronger integration with mm code to achieve that. It depends on the HMM patchset posted for inclusion in 5.2 [2] and [3]. It has been tested with the pingpong test with -o and other flags to test different sizes/features associated with ODP. Moreover there are some features of HMM in the works like peer to peer support, fast CPU page table snapshot, fast IOMMU mapping update ... It will be easier for RDMA devices with ODP to leverage those if they use HMM underneath. Quick summary of what HMM is: HMM is a toolbox for device drivers to implement software support for Shared Virtual Memory (SVM). Not only does it provide helpers to mirror a process address space on a device (hmm_mirror), it also provides helpers to allow using device memory to back regular valid virtual addresses of a process (any valid mmap that is not an mmap of a device or a DAX mapping). There are two kinds of device memory: private memory that is not accessible to the CPU because it does not have all the expected properties (this is for all PCIE devices), or public memory which can also be accessed by the CPU without restriction (with OpenCAPI or CCIX or a similar cache-coherent and atomic inter-connect). Device drivers can use each of the HMM tools separately. You do not have to use all the tools it provides. For RDMA devices I do not expect a need to use the device memory support of HMM. This device memory support is geared toward accelerators like GPUs. You can find a branch [1] with all the prerequisites in it. This patch is on top of rdma-next with the HMM patchset [2] and mmu notifier patchset [3] applied on top of it.
[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=rdma-odp-hmm-v4 [2] https://lkml.org/lkml/2019/4/3/1032 [3] https://lkml.org/lkml/2019/3/26/900 Cc: linux-r...@vger.kernel.org Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Doug Ledford Cc: Artemy Kovalyov Cc: Moni Shoua Cc: Mike Marciniszyn Cc: Kaike Wan Cc: Dennis Dalessandro Jérôme Glisse (1): RDMA/odp: convert to use HMM for ODP v4 drivers/infiniband/Kconfig | 3 +- drivers/infiniband/core/umem_odp.c | 499 - drivers/infiniband/hw/mlx5/mem.c | 20 +- drivers/infiniband/hw/mlx5/mr.c| 2 +- drivers/infiniband/hw/mlx5/odp.c | 106 +++--- include/rdma/ib_umem_odp.h | 49 ++- 6 files changed, 231 insertions(+), 448 deletions(-) -- 2.20.1
[PATCH] mm/hmm: kconfig split HMM address space mirroring from device memory
From: Jérôme Glisse To allow building device drivers that only care about address space mirroring (like RDMA ODP) on platforms that do not have all the prerequisites for HMM device memory (like ZONE_DEVICE on ARM), split the HMM_MIRROR option dependency from the HMM_DEVICE dependency. Signed-off-by: Jérôme Glisse Cc: Leon Romanovsky Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- mm/Kconfig | 17 ++--- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 2e6d24d783f7..00d9febbc775 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -679,12 +679,13 @@ config ZONE_DEVICE config ARCH_HAS_HMM bool default y - depends on (X86_64 || PPC64) - depends on ZONE_DEVICE depends on MMU && 64BIT - depends on MEMORY_HOTPLUG - depends on MEMORY_HOTREMOVE - depends on SPARSEMEM_VMEMMAP + +config ARCH_HAS_HMM_DEVICE + bool + default y + depends on (X86_64 || PPC64) + depends on ARCH_HAS_ZONE_DEVICE config MIGRATE_VMA_HELPER bool @@ -710,7 +711,8 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" - depends on ARCH_HAS_HMM + depends on ARCH_HAS_HMM_DEVICE + depends on ZONE_DEVICE select HMM select DEV_PAGEMAP_OPS @@ -721,7 +723,8 @@ config DEVICE_PRIVATE config DEVICE_PUBLIC bool "Addressable device memory (like GPU memory)" - depends on ARCH_HAS_HMM + depends on ARCH_HAS_HMM_DEVICE + depends on ZONE_DEVICE select HMM select DEV_PAGEMAP_OPS -- 2.20.1
[PATCH] cifs: fix page reference leak with readv/writev
From: Jérôme Glisse CIFS can leak page references obtained through GUP (get_user_pages*() through iov_iter_get_pages()). This happens if cifs_send_async_read() or cifs_write_from_iter() calls fail from within __cifs_readv() and __cifs_writev() respectively. This patch moves the page unreferencing to cifs_aio_ctx_release(), which happens on all code paths; this is also simpler to follow for correctness. Signed-off-by: Jérôme Glisse Cc: Steve French Cc: linux-c...@vger.kernel.org Cc: samba-techni...@lists.samba.org Cc: Alexander Viro Cc: linux-fsde...@vger.kernel.org Cc: Linus Torvalds Cc: Stable --- fs/cifs/file.c | 15 +-- fs/cifs/misc.c | 23 ++- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 89006e044973..a756a4d3f70f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2858,7 +2858,6 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct dentry *dentry = ctx->cfile->dentry; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -2922,10 +2921,6 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) kref_put(>refcount, cifs_uncached_writedata_release); } - if (!ctx->direct_io) - for (i = 0; i < ctx->npages; i++) - put_page(ctx->bv[i].bv_page); - cifs_stats_bytes_written(tcon, ctx->total_len); set_bit(CIFS_INO_INVALID_MAPPING, _I(dentry->d_inode)->flags); @@ -3563,7 +3558,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx) struct iov_iter *to = >iter; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -3647,15 +3641,8 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx) kref_put(>refcount, cifs_uncached_readdata_release); } - if (!ctx->direct_io) { - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - put_page(ctx->bv[i].bv_page); - } - + if (!ctx->direct_io) ctx->total_len = ctx->len - 
iov_iter_count(to); - } /* mask nodata case */ if (rc == -ENODATA) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bee203055b30..9bc0d17a9d77 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -768,6 +768,11 @@ cifs_aio_ctx_alloc(void) { struct cifs_aio_ctx *ctx; + /* +* Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io +* to false so that we know when we have to unreference pages within +* cifs_aio_ctx_release() +*/ ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); if (!ctx) return NULL; @@ -786,7 +791,23 @@ cifs_aio_ctx_release(struct kref *refcount) struct cifs_aio_ctx, refcount); cifsFileInfo_put(ctx->cfile); - kvfree(ctx->bv); + + /* +* ctx->bv is only set if setup_aio_ctx_iter() was call successfuly +* which means that iov_iter_get_pages() was a success and thus that +* we have taken reference on pages. +*/ + if (ctx->bv) { + unsigned i; + + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } + kvfree(ctx->bv); + } + kfree(ctx); } -- 2.20.1
[PATCH v3 1/1] RDMA/odp: convert to use HMM for ODP v3
From: Jérôme Glisse Convert ODP to use HMM so that we can build on common infrastructure for different class of devices that want to mirror a process address space into a device. There is no functional changes. Changes since v2: - rebase on top of newer HMM patchset and mmu notifier patchset Changes since v1: - improved comments - simplified page alignment computation Signed-off-by: Jérôme Glisse Cc: linux-r...@vger.kernel.org Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Doug Ledford Cc: Artemy Kovalyov Cc: Moni Shoua Cc: Mike Marciniszyn Cc: Kaike Wan Cc: Dennis Dalessandro --- drivers/infiniband/core/umem_odp.c | 499 - drivers/infiniband/hw/mlx5/mem.c | 20 +- drivers/infiniband/hw/mlx5/mr.c| 2 +- drivers/infiniband/hw/mlx5/odp.c | 106 +++--- include/rdma/ib_umem_odp.h | 48 ++- 5 files changed, 228 insertions(+), 447 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index bcd53f302df2..139f520e733d 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -46,6 +46,26 @@ #include #include + +static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = { + ODP_READ_BIT, /* HMM_PFN_VALID */ + ODP_WRITE_BIT, /* HMM_PFN_WRITE */ + /* +* The ODP_DEVICE_BIT is not use by ODP but is here to comply +* with HMM API which also catter to device with local memory. +* RDMA devices do not have any such memory and thus do not +* have a real use for that flag. 
+*/ + ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */ +}; + +static uint64_t odp_hmm_values[HMM_PFN_VALUE_MAX] = { + -1UL, /* HMM_PFN_ERROR */ + 0UL,/* HMM_PFN_NONE */ + -2UL, /* HMM_PFN_SPECIAL */ +}; + + /* * The ib_umem list keeps track of memory regions for which the HW * device request to receive notification when the related memory @@ -78,57 +98,25 @@ static u64 node_last(struct umem_odp_node *n) INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(_odp->umem_mutex); - if (umem_odp->notifiers_count++ == 0) - /* -* Initialize the completion object for waiting on -* notifiers. Since notifier_count is zero, no one should be -* waiting right now. -*/ - reinit_completion(_odp->notifier_completion); - mutex_unlock(_odp->umem_mutex); -} - -static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(_odp->umem_mutex); - /* -* This sequence increase will notify the QP page fault that the page -* that is going to be mapped in the spte could have been freed. -*/ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(_odp->notifier_completion); - mutex_unlock(_odp->umem_mutex); -} - static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct ib_umem *umem = _odp->umem; - /* -* Increase the number of notifiers running, to -* prevent any further fault handling on this MR. -*/ - ib_umem_notifier_start_account(umem_odp); umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. 
*/ smp_wmb(); - complete_all(_odp->notifier_completion); umem->context->invalidate_range(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); return 0; } -static void ib_umem_notifier_release(struct mmu_notifier *mn, -struct mm_struct *mm) +static void ib_umem_notifier_release(struct hmm_mirror *mirror) { - struct ib_ucontext_per_mm *per_mm = - container_of(mn, struct ib_ucontext_per_mm, mn); + struct ib_ucontext_per_mm *per_mm; + + per_mm = container_of(mirror, struct ib_ucontext_per_mm, mirror); down_read(_mm->umem_rwsem); if (per_mm->active) @@ -136,23 +124,26 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, _mm->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, true, NULL); up_read(_mm->umem_rwsem); + + per_mm->mm = NULL; } -static int invalidate_range_start_trampoline(struct ib_umem_odp *item, -u64 start, u64 end, void *cookie) +static int invalidate_range_trampoline(struct ib_umem_odp *item, + u64 start, u64 end, void *cookie) { - ib_umem_notifier_start_account(item);
[PATCH v3 0/1] Use HMM for ODP v3
From: Jérôme Glisse Changes since v1/v2 are about rebase and better comments in the code. Previous cover letter slightly updated. This patchset converts RDMA ODP to use HMM underneath. This is motivated by stronger code sharing for the same feature (Shared Virtual Memory, SVM, or Shared Virtual Address, SVA) and also stronger integration with mm code to achieve that. It depends on the HMM patchset posted for inclusion in 5.2 [2] and [3]. It has been tested with the pingpong test with -o and other flags to test different sizes/features associated with ODP. Moreover there are some features of HMM in the works like peer to peer support, fast CPU page table snapshot, fast IOMMU mapping update ... It will be easier for RDMA devices with ODP to leverage those if they use HMM underneath. Quick summary of what HMM is: HMM is a toolbox for device drivers to implement software support for Shared Virtual Memory (SVM). Not only does it provide helpers to mirror a process address space on a device (hmm_mirror), it also provides helpers to allow using device memory to back regular valid virtual addresses of a process (any valid mmap that is not an mmap of a device or a DAX mapping). There are two kinds of device memory: private memory that is not accessible to the CPU because it does not have all the expected properties (this is for all PCIE devices), or public memory which can also be accessed by the CPU without restriction (with OpenCAPI or CCIX or a similar cache-coherent and atomic inter-connect). Device drivers can use each of the HMM tools separately. You do not have to use all the tools it provides. For RDMA devices I do not expect a need to use the device memory support of HMM. This device memory support is geared toward accelerators like GPUs. You can find a branch [1] with all the prerequisites in it. This patch is on top of rdma-next with the HMM patchset [2] and mmu notifier patchset [3] applied on top of it.
[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=rdma-5.2 [2] https://lkml.org/lkml/2019/4/3/1032 [3] https://lkml.org/lkml/2019/3/26/900 Cc: linux-r...@vger.kernel.org Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Doug Ledford Cc: Artemy Kovalyov Cc: Moni Shoua Cc: Mike Marciniszyn Cc: Kaike Wan Cc: Dennis Dalessandro Jérôme Glisse (1): RDMA/odp: convert to use HMM for ODP v3 drivers/infiniband/core/umem_odp.c | 486 - drivers/infiniband/hw/mlx5/mem.c | 20 +- drivers/infiniband/hw/mlx5/mr.c| 2 +- drivers/infiniband/hw/mlx5/odp.c | 106 --- include/rdma/ib_umem_odp.h | 48 ++- 5 files changed, 219 insertions(+), 443 deletions(-) -- 2.20.1
[PATCH] mm/hmm: fix hmm_range_dma_map()/hmm_range_dma_unmap()
From: Jérôme Glisse Was using wrong field and wrong enum for read only versus read and write mapping. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- mm/hmm.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 90369fd2307b..ecd16718285e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1203,7 +1203,7 @@ long hmm_range_dma_map(struct hmm_range *range, npages = (range->end - range->start) >> PAGE_SHIFT; for (i = 0, mapped = 0; i < npages; ++i) { - enum dma_data_direction dir = DMA_FROM_DEVICE; + enum dma_data_direction dir = DMA_TO_DEVICE; struct page *page; /* @@ -1227,7 +1227,7 @@ long hmm_range_dma_map(struct hmm_range *range, } /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->values[HMM_PFN_WRITE]) + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) dir = DMA_BIDIRECTIONAL; daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); @@ -1243,7 +1243,7 @@ long hmm_range_dma_map(struct hmm_range *range, unmap: for (npages = i, i = 0; (i < npages) && mapped; ++i) { - enum dma_data_direction dir = DMA_FROM_DEVICE; + enum dma_data_direction dir = DMA_TO_DEVICE; struct page *page; page = hmm_device_entry_to_page(range, range->pfns[i]); @@ -1254,7 +1254,7 @@ long hmm_range_dma_map(struct hmm_range *range, continue; /* If it is read and write than map bi-directional. 
*/ - if (range->pfns[i] & range->values[HMM_PFN_WRITE]) + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) dir = DMA_BIDIRECTIONAL; dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); @@ -1298,7 +1298,7 @@ long hmm_range_dma_unmap(struct hmm_range *range, npages = (range->end - range->start) >> PAGE_SHIFT; for (i = 0; i < npages; ++i) { - enum dma_data_direction dir = DMA_FROM_DEVICE; + enum dma_data_direction dir = DMA_TO_DEVICE; struct page *page; page = hmm_device_entry_to_page(range, range->pfns[i]); @@ -1306,7 +1306,7 @@ long hmm_range_dma_unmap(struct hmm_range *range, continue; /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->values[HMM_PFN_WRITE]) { + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { dir = DMA_BIDIRECTIONAL; /* -- 2.20.1
[PATCH] zram: pass down the bvec we need to read into in the work struct
From: Jérôme Glisse When scheduling a work item to read a page we need to pass down the proper bvec struct which points to the page to read into. Before this patch it used an uninitialized on-stack bvec (only if PAGE_SIZE != 4096) which is wrong. Signed-off-by: Jérôme Glisse Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org --- drivers/block/zram/zram_drv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 399cad7daae7..d58a359a6622 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -774,18 +774,18 @@ struct zram_work { struct zram *zram; unsigned long entry; struct bio *bio; + struct bio_vec bvec; }; #if PAGE_SIZE != 4096 static void zram_sync_read(struct work_struct *work) { - struct bio_vec bvec; struct zram_work *zw = container_of(work, struct zram_work, work); struct zram *zram = zw->zram; unsigned long entry = zw->entry; struct bio *bio = zw->bio; - read_from_bdev_async(zram, &bvec, entry, bio); + read_from_bdev_async(zram, &zw->bvec, entry, bio); } /* @@ -798,6 +798,7 @@ static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, { struct zram_work work; + work.bvec = *bvec; work.zram = zram; work.entry = entry; work.bio = bio; -- 2.20.1
[PATCH v3 12/12] mm/hmm: convert various hmm_pfn_* to device_entry which is a better name
From: Jérôme Glisse Convert hmm_pfn_* to device_entry_* as here we are dealing with device driver specific entry format and hmm provide helpers to allow differents components (including HMM) to create/parse device entry. We keep wrapper with the old name so that we can convert driver to use the new API in stages in each device driver tree. This will get remove once all driver are converted. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Ira Weiny --- include/linux/hmm.h | 93 +++-- mm/hmm.c| 19 + 2 files changed, 75 insertions(+), 37 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index f81fe2c0f343..51ec27a84668 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -239,36 +239,36 @@ static inline bool hmm_range_valid(struct hmm_range *range) } /* - * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn - * @range: range use to decode HMM pfn value - * @pfn: HMM pfn value to get corresponding struct page from - * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise + * hmm_device_entry_to_page() - return struct page pointed to by a device entry + * @range: range use to decode device entry value + * @entry: device entry value to get corresponding struct page from + * Returns: struct page pointer if entry is a valid, NULL otherwise * - * If the HMM pfn is valid (ie valid flag set) then return the struct page - * matching the pfn value stored in the HMM pfn. Otherwise return NULL. + * If the device entry is valid (ie valid flag set) then return the struct page + * matching the entry value. Otherwise return NULL. 
*/ -static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, - uint64_t pfn) +static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range, + uint64_t entry) { - if (pfn == range->values[HMM_PFN_NONE]) + if (entry == range->values[HMM_PFN_NONE]) return NULL; - if (pfn == range->values[HMM_PFN_ERROR]) + if (entry == range->values[HMM_PFN_ERROR]) return NULL; - if (pfn == range->values[HMM_PFN_SPECIAL]) + if (entry == range->values[HMM_PFN_SPECIAL]) return NULL; - if (!(pfn & range->flags[HMM_PFN_VALID])) + if (!(entry & range->flags[HMM_PFN_VALID])) return NULL; - return pfn_to_page(pfn >> range->pfn_shift); + return pfn_to_page(entry >> range->pfn_shift); } /* - * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn - * @range: range use to decode HMM pfn value - * @pfn: HMM pfn value to extract pfn from - * Returns: pfn value if HMM pfn is valid, -1UL otherwise + * hmm_device_entry_to_pfn() - return pfn value store in a device entry + * @range: range use to decode device entry value + * @entry: device entry to extract pfn from + * Returns: pfn value if device entry is valid, -1UL otherwise */ -static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, - uint64_t pfn) +static inline unsigned long +hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) { if (pfn == range->values[HMM_PFN_NONE]) return -1UL; @@ -282,31 +282,66 @@ static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, } /* - * hmm_pfn_from_page() - create a valid HMM pfn value from struct page + * hmm_device_entry_from_page() - create a valid device entry for a page * @range: range use to encode HMM pfn value - * @page: struct page pointer for which to create the HMM pfn - * Returns: valid HMM pfn for the page + * @page: page for which to create the device entry + * Returns: valid device entry for the page */ -static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, -struct page *page) +static inline 
uint64_t hmm_device_entry_from_page(const struct hmm_range *range, + struct page *page) { return (page_to_pfn(page) << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } /* - * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn + * hmm_device_entry_from_pfn() - create a valid device entry value from pfn * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the HMM pfn - * Returns: valid HMM pfn for the pfn + * @pfn: pfn value for which to create the device entry + * Returns: valid device entry for the pfn */ -static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, - unsigned long pfn) +static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, +unsigned long pfn) {
[PATCH v3 05/12] mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() v3
From: Jérôme Glisse Minor optimization around hmm_pte_need_fault(). Rename for consistency between code, comments and documentation. Also improves the comments on all the possible returns values. Improve the function by returning the number of populated entries in pfns array. Changes since v2: - updated commit message Changes since v1: - updated documentation - reformated some comments Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams --- Documentation/vm/hmm.rst | 8 +--- include/linux/hmm.h | 13 +- mm/hmm.c | 91 +--- 3 files changed, 52 insertions(+), 60 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index d9b27bdadd1b..61f073215a8d 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -190,13 +190,7 @@ When the device driver wants to populate a range of virtual addresses, it can use either:: long hmm_range_snapshot(struct hmm_range *range); - int hmm_vma_fault(struct vm_area_struct *vma, -struct hmm_range *range, -unsigned long start, -unsigned long end, -hmm_pfn_t *pfns, -bool write, -bool block); + long hmm_range_fault(struct hmm_range *range, bool block); The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 32206b0b1bfd..e9afd23c2eac 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -391,7 +391,18 @@ bool hmm_vma_range_done(struct hmm_range *range); * * See the function description in mm/hmm.c for further documentation. */ -int hmm_vma_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, bool block); + +/* This is a temporary helper to avoid merge conflict between trees. 
*/ +static inline int hmm_vma_fault(struct hmm_range *range, bool block) +{ + long ret = hmm_range_fault(range, block); + if (ret == -EBUSY) + ret = -EAGAIN; + else if (ret == -EAGAIN) + ret = -EBUSY; + return ret < 0 ? ret : 0; +} /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/hmm.c b/mm/hmm.c index bd957a9f10d1..b7e4034d96e1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -340,13 +340,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) - return -EBUSY; + return -EAGAIN; if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } - return -EAGAIN; + return -EBUSY; } static int hmm_pfns_bad(unsigned long addr, @@ -372,7 +372,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * Returns: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -395,12 +395,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, ret = hmm_vma_do_fault(walk, addr, write_fault, [i]); - if (ret != -EAGAIN) + if (ret != -EBUSY) return ret; } } - return (fault || write_fault) ? -EAGAIN : 0; + return (fault || write_fault) ? 
-EBUSY : 0; } static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, @@ -531,11 +531,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t orig_pfn = *pfn; *pfn = range->values[HMM_PFN_NONE]; - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - , _fault); + fault = write_fault = false; if (pte_none(pte)) { + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, + , _fault); if (fault || write_fault) goto fault; return 0; @@ -574,7 +574,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm,
[PATCH v3 02/12] mm/hmm: use reference counting for HMM struct v3
From: Jérôme Glisse Every time i read the code to check that the HMM structure does not vanish before it should thanks to the many lock protecting its removal i get a headache. Switch to reference counting instead it is much easier to follow and harder to break. This also remove some code that is no longer needed with refcounting. Changes since v2: - Renamed hmm_register() to hmm_get_or_create() updated comments accordingly Changes since v1: - removed bunch of useless check (if API is use with bogus argument better to fail loudly so user fix their code) - s/hmm_get/mm_get_hmm/ Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Andrew Morton Cc: Dan Williams --- include/linux/hmm.h | 2 + mm/hmm.c| 190 2 files changed, 124 insertions(+), 68 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ad50b7b4f141..716fc61fa6d4 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -131,6 +131,7 @@ enum hmm_pfn_value_e { /* * struct hmm_range - track invalidation lock on virtual address range * + * @hmm: the core HMM structure this range is active against * @vma: the vm area struct for the range * @list: all range lock are on a list * @start: range virtual start address (inclusive) @@ -142,6 +143,7 @@ enum hmm_pfn_value_e { * @valid: pfns array did not change since it has been fill by an HMM function */ struct hmm_range { + struct hmm *hmm; struct vm_area_struct *vma; struct list_headlist; unsigned long start; diff --git a/mm/hmm.c b/mm/hmm.c index fe1cd87e49ac..919d78fd21c5 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -50,6 +50,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; */ struct hmm { struct mm_struct*mm; + struct kref kref; spinlock_t lock; struct list_headranges; struct list_headmirrors; @@ -57,24 +58,33 @@ struct hmm { struct rw_semaphore mirrors_sem; }; -/* - * hmm_register - register HMM against an mm (HMM internal) +static inline struct hmm *mm_get_hmm(struct mm_struct *mm) +{ + struct hmm *hmm = 
READ_ONCE(mm->hmm); + + if (hmm && kref_get_unless_zero(>kref)) + return hmm; + + return NULL; +} + +/** + * hmm_get_or_create - register HMM against an mm (HMM internal) * * @mm: mm struct to attach to + * Returns: returns an HMM object, either by referencing the existing + * (per-process) object, or by creating a new one. * - * This is not intended to be used directly by device drivers. It allocates an - * HMM struct if mm does not have one, and initializes it. + * This is not intended to be used directly by device drivers. If mm already + * has an HMM struct then it get a reference on it and returns it. Otherwise + * it allocates an HMM struct, initializes it, associate it with the mm and + * returns it. */ -static struct hmm *hmm_register(struct mm_struct *mm) +static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = READ_ONCE(mm->hmm); + struct hmm *hmm = mm_get_hmm(mm); bool cleanup = false; - /* -* The hmm struct can only be freed once the mm_struct goes away, -* hence we should always have pre-allocated an new hmm struct -* above. 
-*/ if (hmm) return hmm; @@ -86,6 +96,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(>ranges); spin_lock_init(>lock); + kref_init(>kref); hmm->mm = mm; spin_lock(>page_table_lock); @@ -106,7 +117,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) if (__mmu_notifier_register(>mmu_notifier, mm)) goto error_mm; - return mm->hmm; + return hmm; error_mm: spin_lock(>page_table_lock); @@ -118,9 +129,41 @@ static struct hmm *hmm_register(struct mm_struct *mm) return NULL; } +static void hmm_free(struct kref *kref) +{ + struct hmm *hmm = container_of(kref, struct hmm, kref); + struct mm_struct *mm = hmm->mm; + + mmu_notifier_unregister_no_release(>mmu_notifier, mm); + + spin_lock(>page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(>page_table_lock); + + kfree(hmm); +} + +static inline void hmm_put(struct hmm *hmm) +{ + kref_put(>kref, hmm_free); +} + void hmm_mm_destroy(struct mm_struct *mm) { - kfree(mm->hmm); + struct hmm *hmm; + + spin_lock(>page_table_lock); + hmm = mm_get_hmm(mm); + mm->hmm = NULL; + if (hmm) { + hmm->mm = NULL; + spin_unlock(>page_table_lock); + hmm_put(hmm); + return; + } + + spin_unlock(>page_table_lock); }
[PATCH v3 10/12] mm/hmm: add helpers to test if mm is still alive or not
From: Jérôme Glisse A device driver can have kernel threads or workers doing work against a process mm, and it is useful for those to test whether the mm is dead or alive to avoid doing useless work. Add a helper to test that so that drivers can bail out early if a process is dying. Note that the helper does not perform any lock synchronization and thus is just a hint, i.e. a process might be dying but the helper might still report the process as alive. All HMM functions are safe to use in that case as HMM internals properly protect themselves with locks. If a driver uses this helper with non-HMM functions it should ascertain that it is safe to do so. Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams Cc: Ira Weiny --- include/linux/hmm.h | 24 1 file changed, 24 insertions(+) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e5834082de60..a79fcc6681f5 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -438,6 +438,30 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); +/* + * hmm_mirror_mm_is_alive() - test if mm is still alive + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem + * Returns: false if the mm is dead, true otherwise + * + * This is an optimization it will not accurately always return -EINVAL if the + * mm is dead ie there can be false negative (process is being kill but HMM is + * not yet inform of that). It is only intented to be use to optimize out case + * where driver is about to do something time consuming and it would be better + * to skip it if the mm is dead. 
+ */ +static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) +{ + struct mm_struct *mm; + + if (!mirror || !mirror->hmm) + return false; + mm = READ_ONCE(mirror->hmm->mm); + if (mirror->hmm->dead || !mm) + return false; + + return true; +} + /* * Please see Documentation/vm/hmm.rst for how to use the range API. -- 2.17.2
[PATCH v3 09/12] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v3
From: Jérôme Glisse HMM mirror is a device driver helper to mirror a range of virtual addresses. It means that the process jobs running on the device can access the same virtual addresses as the CPU threads of that process. This patch adds support for mirroring mappings of files that are on a DAX block device (i.e. a range of virtual addresses that is an mmap of a file in a filesystem on a DAX block device). There is no reason not to support such a case when mirroring virtual addresses on a device. Note that unlike the GUP code we do not take a page reference, hence when we back off we have nothing to undo. Changes since v2: - Added comments about get_dev_pagemap() optimization. Changes since v1: - improved commit message - squashed: Arnd Bergmann: fix unused variable warning in hmm_vma_walk_pud Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: Dan Williams Cc: John Hubbard Cc: Arnd Bergmann --- mm/hmm.c | 138 ++- 1 file changed, 117 insertions(+), 21 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 9140cee24d36..39bc77d7e6e3 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -329,6 +329,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister); struct hmm_vma_walk { struct hmm_range*range; + struct dev_pagemap *pgmap; unsigned long last; boolfault; boolblock; @@ -503,6 +504,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return pud_write(pud) ? 
range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, @@ -524,8 +534,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + if (pmd_devmap(pmd)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } hmm_vma_walk->last = end; return 0; } @@ -612,10 +633,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (fault || write_fault) goto fault; + if (pte_devmap(pte)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); @@ -703,12 +738,89 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, return r; } } + if (hmm_vma_walk->pgmap) { + /* +* We do put_dev_pagemap() here and not in hmm_vma_handle_pte() +* so that we can leverage get_dev_pagemap() optimization which +* will not re-take a reference on a pgmap if we already have +* one. 
+*/ + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; return 0; } +static int hmm_vma_walk_pud(pud_t *pudp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long addr = start,
[PATCH v3 06/12] mm/hmm: improve driver API to work and wait over a range v3
From: Jérôme Glisse A common use case for HMM mirror is user trying to mirror a range and before they could program the hardware it get invalidated by some core mm event. Instead of having user re-try right away to mirror the range provide a completion mechanism for them to wait for any active invalidation affecting the range. This also changes how hmm_range_snapshot() and hmm_range_fault() works by not relying on vma so that we can drop the mmap_sem when waiting and lookup the vma again on retry. Changes since v2: - Updated documentation to match new API. - Added more comments in old API temporary wrapper. - Consolidated documentation in hmm.rst to avoid out of sync. Changes since v1: - squashed: Dan Carpenter: potential deadlock in nonblocking code Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams Cc: Dan Carpenter Cc: Matthew Wilcox --- Documentation/vm/hmm.rst | 25 +- include/linux/hmm.h | 145 --- mm/hmm.c | 531 +++ 3 files changed, 387 insertions(+), 314 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 61f073215a8d..945d5fb6d14a 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -217,17 +217,33 @@ Locking with the update() callback is the most important aspect the driver must range.flags = ...; range.values = ...; range.pfn_shift = ...; + hmm_range_register(); + + /* + * Just wait for range to be valid, safe to ignore return value as we + * will use the return value of hmm_range_snapshot() below under the + * mmap_sem to ascertain the validity of the range. 
+ */ + hmm_range_wait_until_valid(, TIMEOUT_IN_MSEC); again: down_read(>mmap_sem); - range.vma = ...; ret = hmm_range_snapshot(); if (ret) { up_read(>mmap_sem); + if (ret == -EAGAIN) { +/* + * No need to check hmm_range_wait_until_valid() return value + * on retry we will get proper error with hmm_range_snapshot() + */ +hmm_range_wait_until_valid(, TIMEOUT_IN_MSEC); +goto again; + } + hmm_mirror_unregister(); return ret; } take_lock(driver->update); - if (!hmm_vma_range_done(vma, )) { + if (!range.valid) { release_lock(driver->update); up_read(>mmap_sem); goto again; @@ -235,14 +251,15 @@ Locking with the update() callback is the most important aspect the driver must // Use pfns array content to update device page table + hmm_mirror_unregister(); release_lock(driver->update); up_read(>mmap_sem); return 0; } The driver->update lock is the same lock that the driver takes inside its -update() callback. That lock must be held before hmm_vma_range_done() to avoid -any race with a concurrent CPU page table update. +update() callback. That lock must be held before checking the range.valid +field to avoid any race with a concurrent CPU page table update. 
HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e9afd23c2eac..ec4bfa91648f 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -77,8 +77,34 @@ #include #include #include +#include -struct hmm; + +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list + * @ranges: list of range being snapshotted + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list + * @wq: wait queue for user waiting on a range invalidation + * @notifiers: count of active mmu notifiers + * @dead: is the mm dead ? + */ +struct hmm { + struct mm_struct*mm; + struct kref kref; + struct mutexlock; + struct list_headranges; + struct list_headmirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; + wait_queue_head_t wq; + longnotifiers; + booldead; +}; /* * hmm_pfn_flag_e - HMM flag enums @@ -155,6 +181,38 @@ struct hmm_range { boolvalid; }; +/* + * hmm_range_wait_until_valid() - wait for range to be valid + * @range: range affected by invalidation to wait on + * @timeout: time out for wait in ms (ie abort wait after that period of time) + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_wait_until_valid(struct hmm_range *range, + unsigned long
[PATCH v3 11/12] mm/hmm: add an helper function that fault pages and map them to a device v3
From: Jérôme Glisse This is an all-in-one helper that faults pages in a range and maps them to a device so that every single device driver does not have to re-implement this common pattern. This is taken from ODP RDMA in preparation for the ODP RDMA conversion. It will be used by nouveau and other drivers. Changes since v2: - Improved function comment for kernel documentation. Changes since v1: - improved commit message Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Souptick Joarder --- include/linux/hmm.h | 9 +++ mm/hmm.c| 152 2 files changed, 161 insertions(+) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index a79fcc6681f5..f81fe2c0f343 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -474,6 +474,15 @@ int hmm_range_register(struct hmm_range *range, void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block); +long hmm_range_dma_unmap(struct hmm_range *range, +struct vm_area_struct *vma, +struct device *device, +dma_addr_t *daddrs, +bool dirty); /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range diff --git a/mm/hmm.c b/mm/hmm.c index 39bc77d7e6e3..82fded7273d8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1173,6 +1174,157 @@ long hmm_range_fault(struct hmm_range *range, bool block) return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. 
+ * @range: range being faulted + * @device: device against to dma map page to + * @daddrs: dma address of mapped pages + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * drop and you need to try again, some other error value otherwise + * + * Note same usage pattern as hmm_range_fault(). + */ +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block) +{ + unsigned long i, npages, mapped; + long ret; + + ret = hmm_range_fault(range, block); + if (ret <= 0) + return ret ? ret : -EBUSY; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0, mapped = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_FROM_DEVICE; + struct page *page; + + /* +* FIXME need to update DMA API to provide invalid DMA address +* value instead of a function to test dma address value. This +* would remove lot of dumb code duplicated accross many arch. +* +* For now setting it to 0 here is good enough as the pfns[] +* value is what is use to check what is valid and what isn't. +*/ + daddrs[i] = 0; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* Check if range is being invalidated */ + if (!range->valid) { + ret = -EBUSY; + goto unmap; + } + + /* If it is read and write than map bi-directional. 
*/ + if (range->pfns[i] & range->values[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); + if (dma_mapping_error(device, daddrs[i])) { + ret = -EFAULT; + goto unmap; + } + + mapped++; + } + + return mapped; + +unmap: + for (npages = i, i = 0; (i < npages) && mapped; ++i) { + enum dma_data_direction dir = DMA_FROM_DEVICE; + struct page *page; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + if (dma_mapping_error(device, daddrs[i])) + continue; + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->values[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + mapped--; + } + + return ret;
[PATCH v3 03/12] mm/hmm: do not erase snapshot when a range is invalidated
From: Jérôme Glisse Users of HMM might be using the snapshot information to do preparatory step like dma mapping pages to a device before checking for invalidation through hmm_vma_range_done() so do not erase that information and assume users will do the right thing. Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Cc: Andrew Morton Cc: Dan Williams --- mm/hmm.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 919d78fd21c5..84e0577a912a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -174,16 +174,10 @@ static int hmm_invalidate_range(struct hmm *hmm, bool device, spin_lock(>lock); list_for_each_entry(range, >ranges, list) { - unsigned long addr, idx, npages; - if (update->end < range->start || update->start >= range->end) continue; range->valid = false; - addr = max(update->start, range->start); - idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; - memset(>pfns[idx], 0, sizeof(*range->pfns) * npages); } spin_unlock(>lock); -- 2.17.2
[PATCH v3 04/12] mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() v2
From: Jérôme Glisse Rename for consistency between code, comments and documentation. Also improves the comments on all the possible returns values. Improve the function by returning the number of populated entries in pfns array. Changes since v1: - updated documentation - reformated some comments Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Reviewed-by: Ira Weiny Cc: Andrew Morton Cc: Dan Williams --- Documentation/vm/hmm.rst | 26 ++ include/linux/hmm.h | 4 ++-- mm/hmm.c | 31 +-- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 44205f0b671f..d9b27bdadd1b 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -189,11 +189,7 @@ the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can use either:: - int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); + long hmm_range_snapshot(struct hmm_range *range); int hmm_vma_fault(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, @@ -202,7 +198,7 @@ When the device driver wants to populate a range of virtual addresses, it can bool write, bool block); -The first one (hmm_vma_get_pfns()) will only fetch present CPU page table +The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. The second one does trigger a page fault on missing or read-only entry if the write parameter is true. Page faults use the generic mm page fault code path @@ -220,19 +216,33 @@ Locking with the update() callback is the most important aspect the driver must { struct hmm_range range; ... 
+ + range.start = ...; + range.end = ...; + range.pfns = ...; + range.flags = ...; + range.values = ...; + range.pfn_shift = ...; + again: - ret = hmm_vma_get_pfns(vma, , start, end, pfns); - if (ret) + down_read(>mmap_sem); + range.vma = ...; + ret = hmm_range_snapshot(); + if (ret) { + up_read(>mmap_sem); return ret; + } take_lock(driver->update); if (!hmm_vma_range_done(vma, )) { release_lock(driver->update); + up_read(>mmap_sem); goto again; } // Use pfns array content to update device page table release_lock(driver->update); + up_read(>mmap_sem); return 0; } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 716fc61fa6d4..32206b0b1bfd 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -365,11 +365,11 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * table invalidation serializes on it. * * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_vma_get_pfns() WITHOUT ERROR ! + * hmm_range_snapshot() WITHOUT ERROR ! * * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! */ -int hmm_vma_get_pfns(struct hmm_range *range); +long hmm_range_snapshot(struct hmm_range *range); bool hmm_vma_range_done(struct hmm_range *range); diff --git a/mm/hmm.c b/mm/hmm.c index 84e0577a912a..bd957a9f10d1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -702,23 +702,25 @@ static void hmm_pfns_special(struct hmm_range *range) } /* - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @range: range being snapshotted - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * vma permission, 0 success + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range + * Returns: number of valid pages in range->pfns[] (from range start + * address). This may be zero. 
If the return value is negative, + * then one of the following values may be returned: + * + * -EINVAL invalid arguments or mm or virtual address are in an + *invalid vma (ie either hugetlbfs or device file vma). + * -EPERM For example, asking for write, when the range is + *read-only + * -EAGAIN Caller needs to retry + * -EFAULT Either no valid vma exists for this range, or it is + *illegal to access the range * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further * information. - * - * The range struct is initialized here. It tracks the CPU page table, but only - * if the function returns success (0), in which case the caller must
[PATCH v3 08/12] mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) v3
From: Jérôme Glisse HMM mirror is a device driver helper to mirror a range of virtual addresses. It means that the process jobs running on the device can access the same virtual addresses as the CPU threads of that process. This patch adds support for hugetlbfs mappings (i.e. a range of virtual addresses that is an mmap of hugetlbfs). Changes since v2: - Use hmm_range_page_size() where we can. Changes since v1: - improved commit message - squashed: Arnd Bergmann: fix unused variable warnings Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann --- include/linux/hmm.h | 27 +- mm/hmm.c| 123 +++- 2 files changed, 134 insertions(+), 16 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index dee2f8953b2e..e5834082de60 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -181,10 +181,31 @@ struct hmm_range { const uint64_t *values; uint64_tdefault_flags; uint64_tpfn_flags_mask; + uint8_t page_shift; uint8_t pfn_shift; boolvalid; }; +/* + * hmm_range_page_shift() - return the page shift for the range + * @range: range being queried + * Returns: page shift (page size = 1 << page shift) for the range + */ +static inline unsigned hmm_range_page_shift(const struct hmm_range *range) +{ + return range->page_shift; +} + +/* + * hmm_range_page_size() - return the page size for the range + * @range: range being queried + * Returns: page size for the range in bytes + */ +static inline unsigned long hmm_range_page_size(const struct hmm_range *range) +{ + return 1UL << hmm_range_page_shift(range); +} + /* * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on @@ -424,7 +445,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); int hmm_range_register(struct hmm_range *range, struct mm_struct *mm, unsigned long start, - unsigned long end); + unsigned long end, + unsigned page_shift); void 
hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); @@ -462,7 +484,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->pfn_flags_mask = -1UL; ret = hmm_range_register(range, range->vma->vm_mm, -range->start, range->end); +range->start, range->end, +PAGE_SHIFT); if (ret) return (int)ret; diff --git a/mm/hmm.c b/mm/hmm.c index 0e21d3594ab6..9140cee24d36 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -391,11 +391,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i, page_size; hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { + page_size = hmm_range_page_size(range); + i = (addr - range->start) >> range->page_shift; + + for (; addr < end; addr += page_size, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; @@ -707,6 +709,69 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, return 0; } +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned long addr = start, i, pfn, mask, size, pfn_inc; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + uint64_t orig_pfn, cpu_flags; + bool fault, write_fault; + spinlock_t *ptl; + pte_t entry; + int ret = 0; + + size = 1UL << huge_page_shift(h); + mask = size - 1; + if (range->page_shift != PAGE_SHIFT) { + /* Make sure we are looking at full page. 
*/ + if (start & mask) + return -EINVAL; + if (end < (start + size)) + return -EINVAL; + pfn_inc = size >> PAGE_SHIFT; + } else { + pfn_inc = 1; + size = PAGE_SIZE; + } + + + ptl = huge_pte_lock(hstate_vma(walk->vma),
[PATCH v3 07/12] mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays v2
From: Jérôme Glisse The HMM mirror API can be used in two fashions. The first one is where the HMM user coalesces multiple page faults into one request and sets flags per pfn for each of those faults. The second one is where the HMM user wants to pre-fault a range with specific flags. For the latter it is a waste to have the user pre-fill the pfns array with a default flags value. This patch adds a default flags value allowing the user to set flags for a whole range without having to pre-fill the pfns array. Changes since v1: - Added documentation. - Added comments in the old API wrapper to explain what is going on. Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams --- Documentation/vm/hmm.rst | 35 +++ include/linux/hmm.h | 13 + mm/hmm.c | 12 3 files changed, 60 insertions(+) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 945d5fb6d14a..ec1efa32af3c 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -276,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this concurrently). +Leverage default_flags and pfn_flags_mask += + +The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows +to set fault or snapshot policy for a whole range instead of having to set them +for each entries in the range. + +For instance if the device flags for device entries are: +VALID (1 << 63) +WRITE (1 << 62) + +Now let say that device driver wants to fault with at least read a range then +it does set: +range->default_flags = (1 << 63) +range->pfn_flags_mask = 0; + +and calls hmm_range_fault() as described above. This will fill fault all page +in the range with at least read permission. + +Now let say driver wants to do the same except for one page in the range for +which its want to have write. 
Now driver set: +range->default_flags = (1 << 63); +range->pfn_flags_mask = (1 << 62); +range->pfns[index_of_write] = (1 << 62); + +With this HMM will fault in all page with at least read (ie valid) and for the +address == range->start + (index_of_write << PAGE_SHIFT) it will fault with +write permission ie if the CPU pte does not have write permission set then HMM +will call handle_mm_fault(). + +Note that HMM will populate the pfns array with write permission for any entry +that have write permission within the CPU pte no matter what are the values set +in default_flags or pfn_flags_mask. + + Represent and manage device memory from core kernel point of view = diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ec4bfa91648f..dee2f8953b2e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -165,6 +165,8 @@ enum hmm_pfn_value_e { * @pfns: array of pfns (big enough for the range) * @flags: pfn flags to match device driver page table * @values: pfn value for some special case (none, special, error, ...) + * @default_flags: default flags for the range (write, read, ... see hmm doc) + * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -177,6 +179,8 @@ struct hmm_range { uint64_t*pfns; const uint64_t *flags; const uint64_t *values; + uint64_tdefault_flags; + uint64_tpfn_flags_mask; uint8_t pfn_shift; boolvalid; }; @@ -448,6 +452,15 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) { long ret; + /* +* With the old API the driver must set each individual entries with +* the requested flags (valid, write, ...). So here we set the mask to +* keep intact the entries provided by the driver and zero out the +* default_flags. 
+*/ + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + ret = hmm_range_register(range, range->vma->vm_mm, range->start, range->end); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index 3e07f32b94f8..0e21d3594ab6 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -419,6 +419,18 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, if (!hmm_vma_walk->fault) return; + /* +* So we not only consider the individual per page request we also +* consider the default flags requested for the range. The API can +* be use in 2 fashions. The first one where the HMM user coalesce +* multiple page fault into one request and set flags per pfns for +* of those faults. The second one where the HMM user
[PATCH v3 00/12] Improve HMM driver API v3
From: Jérôme Glisse Changes since v2: - Improved the documentation - Added more comments in the code to explain things - Renamed a bunch of functions by popular demand This patchset improves the HMM driver API and adds support for mirroring virtual addresses that are mmaps of hugetlbfs or of a file in a filesystem on a DAX block device. You can find a tree with all the patches at [1]. This patchset is necessary for converting ODP to HMM, and a patch to do so has been posted [2]. All new functions introduced by this patchset are used by the ODP patch. The ODP patch will be pushed through the RDMA tree in the release after this patchset is merged. Moreover, all HMM functions are used by the nouveau driver starting in 5.1. The last patch in the series adds helpers to directly dma map/unmap pages for virtual addresses that are mirrored on behalf of a device driver. This has been extracted from the ODP code as it is a common pattern across HMM device drivers. It will first be used by the ODP RDMA code and will later be used by nouveau and other drivers that are working on including HMM support. 
[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-for-5.2.v3 [2] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm [3] https://lkml.org/lkml/2019/1/29/1008 Cc: Balbir Singh Cc: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams Cc: Ira Weiny Jérôme Glisse (12): mm/hmm: select mmu notifier when selecting HMM v2 mm/hmm: use reference counting for HMM struct v3 mm/hmm: do not erase snapshot when a range is invalidated mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() v2 mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() v3 mm/hmm: improve driver API to work and wait over a range v3 mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays v2 mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) v3 mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v3 mm/hmm: add helpers to test if mm is still alive or not mm/hmm: add an helper function that fault pages and map them to a device v3 mm/hmm: convert various hmm_pfn_* to device_entry which is a better name Documentation/vm/hmm.rst | 94 +++- include/linux/hmm.h | 310 --- mm/Kconfig |2 +- mm/hmm.c | 1077 ++ 4 files changed, 1054 insertions(+), 429 deletions(-) -- 2.17.2
[PATCH v3 01/12] mm/hmm: select mmu notifier when selecting HMM v2
From: Jérôme Glisse To avoid random config build issues, select mmu notifier when HMM is selected. In any case, when HMM gets selected it will be by users that will also want the mmu notifier. Changes since v1: - remove select MMU_NOTIFIER from HMM_MIRROR as it selects HMM which selects MMU_NOTIFIER now Signed-off-by: Jérôme Glisse Acked-by: Balbir Singh Cc: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..2e6d24d783f7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -694,12 +694,12 @@ config DEV_PAGEMAP_OPS config HMM bool + select MMU_NOTIFIER select MIGRATE_VMA_HELPER config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" depends on ARCH_HAS_HMM - select MMU_NOTIFIER select HMM help Select HMM_MIRROR if you want to mirror range of the CPU page table of a -- 2.17.2
[PATCH v2 10/11] mm/hmm: add helpers for driver to safely take the mmap_sem v2
From: Jérôme Glisse The device driver context which holds reference to mirror and thus to core hmm struct might outlive the mm against which it was created. To avoid every driver to check for that case provide an helper that check if mm is still alive and take the mmap_sem in read mode if so. If the mm have been destroy (mmu_notifier release call back did happen) then we return -EINVAL so that calling code knows that it is trying to do something against a mm that is no longer valid. Changes since v1: - removed bunch of useless check (if API is use with bogus argument better to fail loudly so user fix their code) Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams --- include/linux/hmm.h | 50 ++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index f3b919b04eda..5f9deaeb9d77 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -438,6 +438,50 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); +/* + * hmm_mirror_mm_down_read() - lock the mmap_sem in read mode + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem + * Returns: -EINVAL if the mm is dead, 0 otherwise (lock taken). + * + * The device driver context which holds reference to mirror and thus to core + * hmm struct might outlive the mm against which it was created. To avoid every + * driver to check for that case provide an helper that check if mm is still + * alive and take the mmap_sem in read mode if so. If the mm have been destroy + * (mmu_notifier release call back did happen) then we return -EINVAL so that + * calling code knows that it is trying to do something against a mm that is + * no longer valid. + */ +static inline int hmm_mirror_mm_down_read(struct hmm_mirror *mirror) +{ + struct mm_struct *mm; + + /* Sanity check ... 
*/ + if (!mirror || !mirror->hmm) + return -EINVAL; + /* +* Before trying to take the mmap_sem make sure the mm is still +* alive as device driver context might outlive the mm lifetime. +* +* FIXME: should we also check for mm that outlive its owning +* task ? +*/ + mm = READ_ONCE(mirror->hmm->mm); + if (mirror->hmm->dead || !mm) + return -EINVAL; + + down_read(&mm->mmap_sem); + return 0; +} + +/* + * hmm_mirror_mm_up_read() - unlock the mmap_sem from read mode + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem + */ +static inline void hmm_mirror_mm_up_read(struct hmm_mirror *mirror) +{ + up_read(&mirror->hmm->mm->mmap_sem); +} + /* * To snapshot the CPU page table you first have to call hmm_range_register() @@ -463,7 +507,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * if (ret) * return ret; * - * down_read(mm->mmap_sem); + * hmm_mirror_mm_down_read(mirror); * again: * * if (!hmm_range_wait_until_valid(&range, TIMEOUT)) { @@ -476,13 +520,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * * ret = hmm_range_snapshot(&range); or hmm_range_fault(&range); * if (ret == -EAGAIN) { - * down_read(mm->mmap_sem); + * hmm_mirror_mm_down_read(mirror); * goto again; * } else if (ret == -EBUSY) { * goto again; * } * - * up_read(&mm->mmap_sem); + * hmm_mirror_mm_up_read(mirror); * if (ret) { * hmm_range_unregister(range); * return ret; -- 2.17.2
[PATCH v2 11/11] mm/hmm: add an helper function that fault pages and map them to a device v2
From: Jérôme Glisse This is an all-in-one helper that faults pages in a range and maps them to a device so that every single device driver does not have to re-implement this common pattern. This is taken from ODP RDMA in preparation for the ODP RDMA conversion. It will be used by nouveau and other drivers. Changes since v1: - improved commit message Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams --- include/linux/hmm.h | 9 +++ mm/hmm.c| 152 2 files changed, 161 insertions(+) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 5f9deaeb9d77..7aadf18b29cb 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -568,6 +568,15 @@ int hmm_range_register(struct hmm_range *range, void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block); +long hmm_range_dma_unmap(struct hmm_range *range, +struct vm_area_struct *vma, +struct device *device, +dma_addr_t *daddrs, +bool dirty); /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range diff --git a/mm/hmm.c b/mm/hmm.c index ce33151c6832..fd143251b157 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1163,6 +1164,157 @@ long hmm_range_fault(struct hmm_range *range, bool block) return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } EXPORT_SYMBOL(hmm_range_fault); + +/* + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. 
+ * @range: range being faulted + * @device: device against to dma map page to + * @daddrs: dma address of mapped pages + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * drop and you need to try again, some other error value otherwise + * + * Note same usage pattern as hmm_range_fault(). + */ +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block) +{ + unsigned long i, npages, mapped; + long ret; + + ret = hmm_range_fault(range, block); + if (ret <= 0) + return ret ? ret : -EBUSY; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0, mapped = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_FROM_DEVICE; + struct page *page; + + /* +* FIXME need to update DMA API to provide invalid DMA address +* value instead of a function to test dma address value. This +* would remove lot of dumb code duplicated accross many arch. +* +* For now setting it to 0 here is good enough as the pfns[] +* value is what is use to check what is valid and what isn't. +*/ + daddrs[i] = 0; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* Check if range is being invalidated */ + if (!range->valid) { + ret = -EBUSY; + goto unmap; + } + + /* If it is read and write than map bi-directional. 
*/ + if (range->pfns[i] & range->values[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); + if (dma_mapping_error(device, daddrs[i])) { + ret = -EFAULT; + goto unmap; + } + + mapped++; + } + + return mapped; + +unmap: + for (npages = i, i = 0; (i < npages) && mapped; ++i) { + enum dma_data_direction dir = DMA_FROM_DEVICE; + struct page *page; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + if (dma_mapping_error(device, daddrs[i])) + continue; + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->values[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + mapped--; + } + + return ret; +} +EXPORT_SYMBOL(hmm_range_dma_map); + +/* + * hmm_range_dma_unmap() - unmap range of that was
[PATCH v2 05/11] mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() v2
From: Jérôme Glisse Rename for consistency between code, comments and documentation. Also improves the comments on all the possible returns values. Improve the function by returning the number of populated entries in pfns array. Changes since v1: - updated documentation - reformated some comments Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams --- Documentation/vm/hmm.rst | 8 +--- include/linux/hmm.h | 13 +- mm/hmm.c | 91 +--- 3 files changed, 52 insertions(+), 60 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index d9b27bdadd1b..61f073215a8d 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -190,13 +190,7 @@ When the device driver wants to populate a range of virtual addresses, it can use either:: long hmm_range_snapshot(struct hmm_range *range); - int hmm_vma_fault(struct vm_area_struct *vma, -struct hmm_range *range, -unsigned long start, -unsigned long end, -hmm_pfn_t *pfns, -bool write, -bool block); + long hmm_range_fault(struct hmm_range *range, bool block); The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 32206b0b1bfd..e9afd23c2eac 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -391,7 +391,18 @@ bool hmm_vma_range_done(struct hmm_range *range); * * See the function description in mm/hmm.c for further documentation. */ -int hmm_vma_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, bool block); + +/* This is a temporary helper to avoid merge conflict between trees. */ +static inline int hmm_vma_fault(struct hmm_range *range, bool block) +{ + long ret = hmm_range_fault(range, block); + if (ret == -EBUSY) + ret = -EAGAIN; + else if (ret == -EAGAIN) + ret = -EBUSY; + return ret < 0 ? 
ret : 0; +} /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/hmm.c b/mm/hmm.c index 91361aa74b8b..7860e63c3ba7 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -336,13 +336,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) - return -EBUSY; + return -EAGAIN; if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } - return -EAGAIN; + return -EBUSY; } static int hmm_pfns_bad(unsigned long addr, @@ -368,7 +368,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * Returns: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -391,12 +391,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, ret = hmm_vma_do_fault(walk, addr, write_fault, [i]); - if (ret != -EAGAIN) + if (ret != -EBUSY) return ret; } } - return (fault || write_fault) ? -EAGAIN : 0; + return (fault || write_fault) ? 
-EBUSY : 0; } static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, @@ -527,11 +527,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t orig_pfn = *pfn; *pfn = range->values[HMM_PFN_NONE]; - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - , _fault); + fault = write_fault = false; if (pte_none(pte)) { + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, + , _fault); if (fault || write_fault) goto fault; return 0; @@ -570,7 +570,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm, pmdp, addr); - return -EAGAIN; +
[PATCH v2 07/11] mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays.
From: Jérôme Glisse The HMM mirror API can be used in two ways. In the first, the HMM user coalesces multiple page faults into one request and sets flags per pfn for each of those faults. In the second, the HMM user wants to pre-fault a range with specific flags. For the latter it is a waste to have the user pre-fill the pfns array with a default flags value. This patch adds a default flags value, allowing the user to set flags for a whole range without having to pre-fill the pfns array. Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams --- include/linux/hmm.h | 7 +++ mm/hmm.c| 12 2 files changed, 19 insertions(+) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 79671036cb5f..13bc2c72f791 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -165,6 +165,8 @@ enum hmm_pfn_value_e { * @pfns: array of pfns (big enough for the range) * @flags: pfn flags to match device driver page table * @values: pfn value for some special case (none, special, error, ...) + * @default_flags: default flags for the range (write, read, ...) 
+ * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -177,6 +179,8 @@ struct hmm_range { uint64_t*pfns; const uint64_t *flags; const uint64_t *values; + uint64_tdefault_flags; + uint64_tpfn_flags_mask; uint8_t pfn_shift; boolvalid; }; @@ -521,6 +525,9 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) { long ret; + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + ret = hmm_range_register(range, range->vma->vm_mm, range->start, range->end); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index fa9498eeb9b6..4fe88a196d17 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -415,6 +415,18 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, if (!hmm_vma_walk->fault) return; + /* +* So we not only consider the individual per page request we also +* consider the default flags requested for the range. The API can +* be use in 2 fashions. The first one where the HMM user coalesce +* multiple page fault into one request and set flags per pfns for +* of those faults. The second one where the HMM user want to pre- +* fault a range with specific flags. For the latter one it is a +* waste to have the user pre-fill the pfn arrays with a default +* flags value. +*/ + pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; -- 2.17.2
[PATCH v2 04/11] mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() v2
From: Jérôme Glisse Rename for consistency between code, comments and documentation. Also improves the comments on all the possible returns values. Improve the function by returning the number of populated entries in pfns array. Changes since v1: - updated documentation - reformated some comments Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Cc: Andrew Morton Cc: Dan Williams --- Documentation/vm/hmm.rst | 26 ++ include/linux/hmm.h | 4 ++-- mm/hmm.c | 31 +-- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 44205f0b671f..d9b27bdadd1b 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -189,11 +189,7 @@ the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can use either:: - int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); + long hmm_range_snapshot(struct hmm_range *range); int hmm_vma_fault(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, @@ -202,7 +198,7 @@ When the device driver wants to populate a range of virtual addresses, it can bool write, bool block); -The first one (hmm_vma_get_pfns()) will only fetch present CPU page table +The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. The second one does trigger a page fault on missing or read-only entry if the write parameter is true. Page faults use the generic mm page fault code path @@ -220,19 +216,33 @@ Locking with the update() callback is the most important aspect the driver must { struct hmm_range range; ... 
+ + range.start = ...; + range.end = ...; + range.pfns = ...; + range.flags = ...; + range.values = ...; + range.pfn_shift = ...; + again: - ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); - if (ret) + down_read(&mm->mmap_sem); + range.vma = ...; + ret = hmm_range_snapshot(&range); + if (ret) { + up_read(&mm->mmap_sem); return ret; + } take_lock(driver->update); if (!hmm_vma_range_done(vma, &range)) { release_lock(driver->update); + up_read(&mm->mmap_sem); goto again; } // Use pfns array content to update device page table release_lock(driver->update); + up_read(&mm->mmap_sem); return 0; } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 716fc61fa6d4..32206b0b1bfd 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -365,11 +365,11 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * table invalidation serializes on it. * * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_vma_get_pfns() WITHOUT ERROR ! + * hmm_range_snapshot() WITHOUT ERROR ! * * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! */ -int hmm_vma_get_pfns(struct hmm_range *range); +long hmm_range_snapshot(struct hmm_range *range); bool hmm_vma_range_done(struct hmm_range *range); diff --git a/mm/hmm.c b/mm/hmm.c index 213b0beee8d3..91361aa74b8b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -698,23 +698,25 @@ static void hmm_pfns_special(struct hmm_range *range) } /* - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @range: range being snapshotted - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * vma permission, 0 success + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range + * Returns: number of valid pages in range->pfns[] (from range start + * address). This may be zero. 
If the return value is negative, + * then one of the following values may be returned: + * + * -EINVAL invalid arguments or mm or virtual address are in an + *invalid vma (ie either hugetlbfs or device file vma). + * -EPERM For example, asking for write, when the range is + *read-only + * -EAGAIN Caller needs to retry + * -EFAULT Either no valid vma exists for this range, or it is + *illegal to access the range * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further * information. - * - * The range struct is initialized here. It tracks the CPU page table, but only - * if the function returns success (0), in which case the caller must then call - *
[PATCH v2 01/11] mm/hmm: select mmu notifier when selecting HMM
From: Jérôme Glisse To avoid random config build issues, select mmu notifier when HMM is selected. In any case, when HMM gets selected it will be by users that will also want the mmu notifier. Signed-off-by: Jérôme Glisse Acked-by: Balbir Singh Cc: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..0d2944278d80 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -694,6 +694,7 @@ config DEV_PAGEMAP_OPS config HMM bool + select MMU_NOTIFIER select MIGRATE_VMA_HELPER config HMM_MIRROR -- 2.17.2
[PATCH v2 00/11] Improve HMM driver API v2
From: Jérôme Glisse This patchset improves the HMM driver API and adds support for mirroring virtual addresses that are mmap of hugetlbfs or of a file in a filesystem on a DAX block device. You can find a tree with all the patches [1] This patchset is necessary for converting ODP to HMM and a patch to do so has been posted [2]. All new functions introduced by this patchset are used by the ODP patch. The ODP patch will be pushed through the RDMA tree the release after this patchset is merged. Moreover all HMM functions are used by the nouveau driver starting in 5.1. The last patch in the series adds helpers to directly dma map/unmap pages for virtual addresses that are mirrored on behalf of a device driver. This has been extracted from ODP code as it is a common pattern across HMM device drivers. It will first be used by the ODP RDMA code and will later get used by nouveau and other drivers that are working on including HMM support. [1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-for-5.1-v2 [2] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm [3] https://lkml.org/lkml/2019/1/29/1008 Cc: Balbir Singh Cc: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams Jérôme Glisse (11): mm/hmm: select mmu notifier when selecting HMM mm/hmm: use reference counting for HMM struct v2 mm/hmm: do not erase snapshot when a range is invalidated mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() v2 mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() v2 mm/hmm: improve driver API to work and wait over a range v2 mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays. 
mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) v2 mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v2 mm/hmm: add helpers for driver to safely take the mmap_sem v2 mm/hmm: add an helper function that fault pages and map them to a device v2 Documentation/vm/hmm.rst | 36 +- include/linux/hmm.h | 290 ++- mm/Kconfig |1 + mm/hmm.c | 1046 +- 4 files changed, 990 insertions(+), 383 deletions(-) -- 2.17.2
[PATCH v2 06/11] mm/hmm: improve driver API to work and wait over a range v2
From: Jérôme Glisse A common use case for HMM mirror is user trying to mirror a range and before they could program the hardware it get invalidated by some core mm event. Instead of having user re-try right away to mirror the range provide a completion mechanism for them to wait for any active invalidation affecting the range. This also changes how hmm_range_snapshot() and hmm_range_fault() works by not relying on vma so that we can drop the mmap_sem when waiting and lookup the vma again on retry. Changes since v1: - squashed: Dan Carpenter: potential deadlock in nonblocking code Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams Cc: Dan Carpenter Cc: Matthew Wilcox --- include/linux/hmm.h | 208 ++--- mm/hmm.c| 528 +--- 2 files changed, 428 insertions(+), 308 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e9afd23c2eac..79671036cb5f 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -77,8 +77,34 @@ #include #include #include +#include -struct hmm; + +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list + * @ranges: list of range being snapshotted + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list + * @wq: wait queue for user waiting on a range invalidation + * @notifiers: count of active mmu notifiers + * @dead: is the mm dead ? 
+ */ +struct hmm { + struct mm_struct*mm; + struct kref kref; + struct mutexlock; + struct list_headranges; + struct list_headmirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; + wait_queue_head_t wq; + longnotifiers; + booldead; +}; /* * hmm_pfn_flag_e - HMM flag enums @@ -155,6 +181,38 @@ struct hmm_range { boolvalid; }; +/* + * hmm_range_wait_until_valid() - wait for range to be valid + * @range: range affected by invalidation to wait on + * @timeout: time out for wait in ms (ie abort wait after that period of time) + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_wait_until_valid(struct hmm_range *range, + unsigned long timeout) +{ + /* Check if mm is dead ? */ + if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { + range->valid = false; + return false; + } + if (range->valid) + return true; + wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, + msecs_to_jiffies(timeout)); + /* Return current valid status just in case we get lucky */ + return range->valid; +} + +/* + * hmm_range_valid() - test if a range is valid or not + * @range: range + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_valid(struct hmm_range *range) +{ + return range->valid; +} + /* * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn * @range: range use to decode HMM pfn value @@ -357,51 +415,133 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device - * driver lock that serializes device page table updates, then call - * hmm_vma_range_done(), to check if the snapshot is still valid. The same - * device driver page table update lock must also be used in the - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page - * table invalidation serializes on it. 
+ * To snapshot the CPU page table you first have to call hmm_range_register() + * to register the range. If hmm_range_register() return an error then some- + * thing is horribly wrong and you should fail loudly. If it returned true then + * you can wait for the range to be stable with hmm_range_wait_until_valid() + * function, a range is valid when there are no concurrent changes to the CPU + * page table for the range. + * + * Once the range is valid you can call hmm_range_snapshot() if that returns + * without error then you can take your device page table lock (the same lock + * you use in the HMM mirror sync_cpu_device_pagetables() callback). After + * taking that lock you have to check the range validity, if it is still valid + * (ie hmm_range_valid() returns true) then you can program the device page + * table, otherwise you have to start again. Pseudo code: + * + * mydevice_prefault(mydevice, mm, start, end) + * { + * struct hmm_range range; + * ... * - * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE
[PATCH v2 09/11] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v2
From: Jérôme Glisse HMM mirror is a device driver helpers to mirror range of virtual address. It means that the process jobs running on the device can access the same virtual address as the CPU threads of that process. This patch adds support for mirroring mapping of file that are on a DAX block device (ie range of virtual address that is an mmap of a file in a filesystem on a DAX block device). There is no reason to not support such case when mirroring virtual address on a device. Note that unlike GUP code we do not take page reference hence when we back-off we have nothing to undo. Changes since v1: - improved commit message - squashed: Arnd Bergmann: fix unused variable warning in hmm_vma_walk_pud Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: Dan Williams Cc: John Hubbard Cc: Arnd Bergmann --- mm/hmm.c | 132 ++- 1 file changed, 111 insertions(+), 21 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 64a33770813b..ce33151c6832 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -325,6 +325,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister); struct hmm_vma_walk { struct hmm_range*range; + struct dev_pagemap *pgmap; unsigned long last; boolfault; boolblock; @@ -499,6 +500,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return pud_write(pud) ? 
range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, @@ -520,8 +530,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + if (pmd_devmap(pmd)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } hmm_vma_walk->last = end; return 0; } @@ -608,10 +629,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (fault || write_fault) goto fault; + if (pte_devmap(pte)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); @@ -699,12 +734,83 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, return r; } } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; return 0; } +static int hmm_vma_walk_pud(pud_t *pudp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range 
*range = hmm_vma_walk->range; + unsigned long addr = start, next; + pmd_t *pmdp; + pud_t pud; + int ret; + +again: + pud = READ_ONCE(*pudp); + if (pud_none(pud)) + return hmm_vma_walk_hole(start, end, walk); + + if (pud_huge(pud) && pud_devmap(pud)) { + unsigned long i, npages, pfn; + uint64_t *pfns, cpu_flags; + bool fault, write_fault; + +
[PATCH v2 03/11] mm/hmm: do not erase snapshot when a range is invalidated
From: Jérôme Glisse Users of HMM might be using the snapshot information to do preparatory steps, like DMA mapping pages to a device, before checking for invalidation through hmm_vma_range_done(), so do not erase that information and assume users will do the right thing. Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Cc: Andrew Morton Cc: Dan Williams --- mm/hmm.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 306e57f7cded..213b0beee8d3 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -170,16 +170,10 @@ static int hmm_invalidate_range(struct hmm *hmm, bool device, spin_lock(&hmm->lock); list_for_each_entry(range, &hmm->ranges, list) { - unsigned long addr, idx, npages; - if (update->end < range->start || update->start >= range->end) continue; range->valid = false; - addr = max(update->start, range->start); - idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; - memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); } spin_unlock(&hmm->lock); -- 2.17.2
[PATCH v2 08/11] mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) v2
From: Jérôme Glisse HMM mirror is a device driver helpers to mirror range of virtual address. It means that the process jobs running on the device can access the same virtual address as the CPU threads of that process. This patch adds support for hugetlbfs mapping (ie range of virtual address that are mmap of a hugetlbfs). Changes since v1: - improved commit message - squashed: Arnd Bergmann: fix unused variable warnings Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Andrew Morton Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann --- include/linux/hmm.h | 29 -- mm/hmm.c| 126 +++- 2 files changed, 138 insertions(+), 17 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 13bc2c72f791..f3b919b04eda 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -181,10 +181,31 @@ struct hmm_range { const uint64_t *values; uint64_tdefault_flags; uint64_tpfn_flags_mask; + uint8_t page_shift; uint8_t pfn_shift; boolvalid; }; +/* + * hmm_range_page_shift() - return the page shift for the range + * @range: range being queried + * Returns: page shift (page size = 1 << page shift) for the range + */ +static inline unsigned hmm_range_page_shift(const struct hmm_range *range) +{ + return range->page_shift; +} + +/* + * hmm_range_page_size() - return the page size for the range + * @range: range being queried + * Returns: page size for the range in bytes + */ +static inline unsigned long hmm_range_page_size(const struct hmm_range *range) +{ + return 1UL << hmm_range_page_shift(range); +} + /* * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on @@ -438,7 +459,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * struct hmm_range range; * ... 
* - * ret = hmm_range_register(, mm, start, end); + * ret = hmm_range_register(, mm, start, end, page_shift); * if (ret) * return ret; * @@ -498,7 +519,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); int hmm_range_register(struct hmm_range *range, struct mm_struct *mm, unsigned long start, - unsigned long end); + unsigned long end, + unsigned page_shift); void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); @@ -529,7 +551,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->pfn_flags_mask = -1UL; ret = hmm_range_register(range, range->vma->vm_mm, -range->start, range->end); +range->start, range->end, +PAGE_SHIFT); if (ret) return (int)ret; diff --git a/mm/hmm.c b/mm/hmm.c index 4fe88a196d17..64a33770813b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -387,11 +387,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i, page_size; hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { + page_size = 1UL << range->page_shift; + i = (addr - range->start) >> range->page_shift; + + for (; addr < end; addr += page_size, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; @@ -703,6 +705,69 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, return 0; } +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned long addr = start, i, pfn, mask, size, pfn_inc; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + uint64_t 
orig_pfn, cpu_flags; + bool fault, write_fault; + spinlock_t *ptl; + pte_t entry; + int ret = 0; + + size = 1UL << huge_page_shift(h); + mask = size - 1; + if (range->page_shift != PAGE_SHIFT) { + /* Make sure we are looking at full page. */ + if (start & mask) + return -EINVAL; + if (end < (start + size)) +
[PATCH v2 02/11] mm/hmm: use reference counting for HMM struct v2
From: Jérôme Glisse Every time i read the code to check that the HMM structure does not vanish before it should thanks to the many lock protecting its removal i get a headache. Switch to reference counting instead it is much easier to follow and harder to break. This also remove some code that is no longer needed with refcounting. Changes since v1: - removed bunch of useless check (if API is use with bogus argument better to fail loudly so user fix their code) - s/hmm_get/mm_get_hmm/ Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Andrew Morton Cc: Dan Williams --- include/linux/hmm.h | 2 + mm/hmm.c| 170 2 files changed, 112 insertions(+), 60 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ad50b7b4f141..716fc61fa6d4 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -131,6 +131,7 @@ enum hmm_pfn_value_e { /* * struct hmm_range - track invalidation lock on virtual address range * + * @hmm: the core HMM structure this range is active against * @vma: the vm area struct for the range * @list: all range lock are on a list * @start: range virtual start address (inclusive) @@ -142,6 +143,7 @@ enum hmm_pfn_value_e { * @valid: pfns array did not change since it has been fill by an HMM function */ struct hmm_range { + struct hmm *hmm; struct vm_area_struct *vma; struct list_headlist; unsigned long start; diff --git a/mm/hmm.c b/mm/hmm.c index fe1cd87e49ac..306e57f7cded 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -50,6 +50,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; */ struct hmm { struct mm_struct*mm; + struct kref kref; spinlock_t lock; struct list_headranges; struct list_headmirrors; @@ -57,6 +58,16 @@ struct hmm { struct rw_semaphore mirrors_sem; }; +static inline struct hmm *mm_get_hmm(struct mm_struct *mm) +{ + struct hmm *hmm = READ_ONCE(mm->hmm); + + if (hmm && kref_get_unless_zero(>kref)) + return hmm; + + return NULL; +} + /* * hmm_register - register HMM against an mm (HMM internal) 
* @@ -67,14 +78,9 @@ struct hmm { */ static struct hmm *hmm_register(struct mm_struct *mm) { - struct hmm *hmm = READ_ONCE(mm->hmm); + struct hmm *hmm = mm_get_hmm(mm); bool cleanup = false; - /* -* The hmm struct can only be freed once the mm_struct goes away, -* hence we should always have pre-allocated an new hmm struct -* above. -*/ if (hmm) return hmm; @@ -86,6 +92,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(>ranges); spin_lock_init(>lock); + kref_init(>kref); hmm->mm = mm; spin_lock(>page_table_lock); @@ -106,7 +113,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) if (__mmu_notifier_register(>mmu_notifier, mm)) goto error_mm; - return mm->hmm; + return hmm; error_mm: spin_lock(>page_table_lock); @@ -118,9 +125,41 @@ static struct hmm *hmm_register(struct mm_struct *mm) return NULL; } +static void hmm_free(struct kref *kref) +{ + struct hmm *hmm = container_of(kref, struct hmm, kref); + struct mm_struct *mm = hmm->mm; + + mmu_notifier_unregister_no_release(>mmu_notifier, mm); + + spin_lock(>page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(>page_table_lock); + + kfree(hmm); +} + +static inline void hmm_put(struct hmm *hmm) +{ + kref_put(>kref, hmm_free); +} + void hmm_mm_destroy(struct mm_struct *mm) { - kfree(mm->hmm); + struct hmm *hmm; + + spin_lock(>page_table_lock); + hmm = mm_get_hmm(mm); + mm->hmm = NULL; + if (hmm) { + hmm->mm = NULL; + spin_unlock(>page_table_lock); + hmm_put(hmm); + return; + } + + spin_unlock(>page_table_lock); } static int hmm_invalidate_range(struct hmm *hmm, bool device, @@ -165,7 +204,7 @@ static int hmm_invalidate_range(struct hmm *hmm, bool device, static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm_mirror *mirror; - struct hmm *hmm = mm->hmm; + struct hmm *hmm = mm_get_hmm(mm); down_write(>mirrors_sem); mirror = list_first_entry_or_null(>mirrors, struct hmm_mirror, @@ -186,13 +225,16 @@ static void 
hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) struct hmm_mirror, list); } up_write(>mirrors_sem); + + hmm_put(hmm); } static int hmm_invalidate_range_start(struct mmu_notifier *mn,
[PATCH v2 1/1] RDMA/odp: convert to use HMM for ODP v2
From: Jérôme Glisse Convert ODP to use HMM so that we can build on common infrastructure for different class of devices that want to mirror a process address space into a device. There is no functional changes. Changes since v1: - improved comments - simplified page alignment computation Signed-off-by: Jérôme Glisse Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Doug Ledford Cc: Artemy Kovalyov Cc: Moni Shoua Cc: Mike Marciniszyn Cc: Kaike Wan Cc: Dennis Dalessandro --- drivers/infiniband/core/umem_odp.c | 488 - drivers/infiniband/hw/mlx5/mem.c | 20 +- drivers/infiniband/hw/mlx5/mr.c| 2 +- drivers/infiniband/hw/mlx5/odp.c | 106 --- include/rdma/ib_umem_odp.h | 48 ++- 5 files changed, 217 insertions(+), 447 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e6ec79ad9cc8..8ca90cc54b39 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -46,6 +46,20 @@ #include #include + +static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = { + ODP_READ_BIT, /* HMM_PFN_VALID */ + ODP_WRITE_BIT, /* HMM_PFN_WRITE */ + ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */ +}; + +static uint64_t odp_hmm_values[HMM_PFN_VALUE_MAX] = { + -1UL, /* HMM_PFN_ERROR */ + 0UL,/* HMM_PFN_NONE */ + -2UL, /* HMM_PFN_SPECIAL */ +}; + + /* * The ib_umem list keeps track of memory regions for which the HW * device request to receive notification when the related memory @@ -78,57 +92,25 @@ static u64 node_last(struct umem_odp_node *n) INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(_odp->umem_mutex); - if (umem_odp->notifiers_count++ == 0) - /* -* Initialize the completion object for waiting on -* notifiers. Since notifier_count is zero, no one should be -* waiting right now. 
-*/ - reinit_completion(_odp->notifier_completion); - mutex_unlock(_odp->umem_mutex); -} - -static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(_odp->umem_mutex); - /* -* This sequence increase will notify the QP page fault that the page -* that is going to be mapped in the spte could have been freed. -*/ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(_odp->notifier_completion); - mutex_unlock(_odp->umem_mutex); -} - static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct ib_umem *umem = _odp->umem; - /* -* Increase the number of notifiers running, to -* prevent any further fault handling on this MR. -*/ - ib_umem_notifier_start_account(umem_odp); umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. */ smp_wmb(); - complete_all(_odp->notifier_completion); umem->context->invalidate_range(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); return 0; } -static void ib_umem_notifier_release(struct mmu_notifier *mn, -struct mm_struct *mm) +static void ib_umem_notifier_release(struct hmm_mirror *mirror) { - struct ib_ucontext_per_mm *per_mm = - container_of(mn, struct ib_ucontext_per_mm, mn); + struct ib_ucontext_per_mm *per_mm; + + per_mm = container_of(mirror, struct ib_ucontext_per_mm, mirror); down_read(_mm->umem_rwsem); if (per_mm->active) @@ -136,21 +118,24 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, _mm->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, true, NULL); up_read(_mm->umem_rwsem); + + per_mm->mm = NULL; } -static int invalidate_range_start_trampoline(struct ib_umem_odp *item, -u64 start, u64 end, void *cookie) +static int invalidate_range_trampoline(struct ib_umem_odp *item, + u64 start, u64 end, void *cookie) { - ib_umem_notifier_start_account(item); item->umem.context->invalidate_range(item, start, end); return 0; } 
-static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) +static int ib_sync_cpu_device_pagetables(struct hmm_mirror *mirror, + const struct hmm_update *range) { - struct
[PATCH v2 0/1] Use HMM for ODP v2
From: Jérôme Glisse This is just a rebase with minor changes and better comments in the code. Previous cover letter (slightly improved): This patchset converts RDMA ODP to use HMM underneath. This is motivated by stronger code sharing for the same feature (Shared Virtual Memory, SVM, or Shared Virtual Address, SVA) and also stronger integration with mm code to achieve that. It depends on the HMM patchset [1]. Moreover, there are some features of HMM in the works, like peer to peer support, fast CPU page table snapshot, fast IOMMU mapping update ... It will be easier for RDMA devices with ODP to leverage those if they use HMM underneath. Quick summary of what HMM is: HMM is a toolbox for device drivers to implement software support for Shared Virtual Memory (SVM). Not only does it provide helpers to mirror a process address space on a device (hmm_mirror), it also provides helpers to allow using device memory to back regular valid virtual addresses of a process (any valid mmap that is not an mmap of a device or a DAX mapping). There are two kinds of device memory: private memory that is not accessible to the CPU because it does not have all the expected properties (this is for all PCIe devices), or public memory which can also be accessed by the CPU without restriction (with OpenCAPI or CCIX or a similar cache-coherent and atomic interconnect). A device driver can use each of the HMM tools separately. You do not have to use all the tools it provides. For RDMA devices I do not expect a need to use the device memory support of HMM. This device memory support is geared toward accelerators like GPUs. You can find a branch [1] with all the prerequisites in it. This patch is on top of 5.1rc1+ but I can rebase it on any specific branch once the HMM prerequisites are upstream.
[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm [2] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-odp-v2 Cc: linux-r...@vger.kernel.org Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Doug Ledford Cc: Artemy Kovalyov Cc: Moni Shoua , Cc: Mike Marciniszyn Cc: Kaike Wan Cc: Dennis Dalessandro Jérôme Glisse (1): RDMA/odp: convert to use HMM for ODP v2 drivers/infiniband/core/umem_odp.c | 488 - drivers/infiniband/hw/mlx5/mem.c | 20 +- drivers/infiniband/hw/mlx5/mr.c| 2 +- drivers/infiniband/hw/mlx5/odp.c | 106 --- include/rdma/ib_umem_odp.h | 48 ++- 5 files changed, 217 insertions(+), 447 deletions(-) -- 2.17.2
[PATCH v2 0/1] Restore change_pte optimization
From: Jérôme Glisse This patch is on top of my patchset to add context information to mmu notifier [1]; you can find a branch with everything in [2]. It has been tested with qemu/KVM, building a kernel within the guest and also running a benchmark whose results are given below. The change_pte() callback is impaired by the range invalidation callbacks within KVM, as those fully invalidate the secondary mmu. This means that there is a window between the range_start callback and the change_pte callback where the secondary mmu for the address is empty. The guest can fault on that address during that window. That window can last for some time if the kernel code which is doing the invalidation is interrupted or if there are other mmu listeners for the process that might sleep within their range_start callback. With this patch KVM will ignore the range_start and range_end callbacks and will rely solely on the change_pte callback to update the secondary mmu. This means that the secondary mmu never has an empty entry for the address between range_start and range_end and hence the guest will not have a chance to fault. This optimization is not valid for all the mmu notifier cases and, thanks to the patchset that adds context information to the mmu notifier [1], we can now identify within KVM when it is safe to rely on this optimization. Roughly it is safe when: - going from read only to read and write (same or different pfn) - going from read and write to read only same pfn - going from read only to read only different pfn Longer explanation in [1] and [3].
Running ksm02 from ltp gives the following results: before mean {real: 675.460632, user: 857.771423, sys: 215.929657, npages: 4773.066895} before stdev {real: 37.035435, user: 4.395942, sys: 3.976172, npages: 675.352783} after mean {real: 672.515503, user: 855.817322, sys: 200.902710, npages: 4899.00} after stdev {real: 37.340954, user: 4.051633, sys: 3.894153, npages: 742.413452} Roughly 7%-8% less time spent in the kernel. So we are saving few cycles (this is with KSM enabled on the host and ksm sleep set to 0). Dunno how this translate to real workload. Note that with the context information further optimization are now possible within KVM. For instance you can find out if a range is updated to read only (ie no pfn change just protection change) and update the secondary mmu accordingly. You can also identify munmap()/mremap() syscall and only free up the resources you have allocated for the range (like freeing up secondary page table for the range or data structure) when it is an munmap or a mremap. Today my understanding is that kvm_unmap_hva_range() will free up resources always assuming it is an munmap of some sort. So for mundane invalidation (like migration, reclaim, mprotect, fork, ...) KVM is freeing up potential mega bytes of structure that it will have to re-allocate shortly there after (see [4] for WIP example). Cheers, Jérôme [1] https://lkml.org/lkml/2019/2/19/752 [2] https://cgit.freedesktop.org/~glisse/linux/log/?h=mmu-notifier-v05 [3] https://lkml.org/lkml/2019/2/19/754 [4] https://cgit.freedesktop.org/~glisse/linux/log/?h=wip-kvm-mmu-notifier-opti Cc: Andrea Arcangeli Cc: Peter Xu Cc: Andrew Morton Jérôme Glisse (1): kvm/mmu_notifier: re-enable the change_pte() optimization. virt/kvm/kvm_main.c | 16 1 file changed, 16 insertions(+) -- 2.17.2
[PATCH v2 1/1] kvm/mmu_notifier: re-enable the change_pte() optimization.
From: Jérôme Glisse Since the changes to mmu notifier, the change_pte() optimization was lost for KVM. This re-enables it whenever a pte is going from read and write to read only with same pfn, or from read only to read and write with different pfn. It is safe to update the secondary MMUs, because the primary MMU pte invalidate must have already happened with a ptep_clear_flush() before set_pte_at_notify() is invoked (and thus before the change_pte() callback). Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Peter Xu Cc: Andrew Morton --- virt/kvm/kvm_main.c | 16 1 file changed, 16 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 629760c0fb95..0f979f02bf1c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -369,6 +369,14 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, int need_tlb_flush = 0, idx; int ret; + /* +* Nothing to do when using change_pte() which will be call for each +* individual pte update at the right time. See mmu_notifier.h for more +* informations. +*/ + if (mmu_notifier_range_use_change_pte(range)) + return 0; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); /* @@ -399,6 +407,14 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, { struct kvm *kvm = mmu_notifier_to_kvm(mn); + /* +* Nothing to do when using change_pte() which will be call for each +* individual pte update at the right time. See mmu_notifier.h for more +* informations. +*/ + if (mmu_notifier_range_use_change_pte(range)) + return; + spin_lock(&kvm->mmu_lock); /* * This sequence increase will notify the kvm page fault that -- 2.17.2
[PATCH v5 8/9] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper
From: Jérôme Glisse Helper to test if a range is updated to read only (it is still valid to read from the range). This is useful for device drivers or anyone who wishes to optimize out an update when they know that they already have the range mapped read only. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 4 mm/mmu_notifier.c| 10 ++ 2 files changed, 14 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 0379956fff23..b6c004bd9f6a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -259,6 +259,8 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) @@ -568,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index abd88c466eb2..ee36068077b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); +
+bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); -- 2.17.2
[PATCH v5 9/9] mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate v2
From: Jérôme Glisse When notifying a change for a range, use the MMU_NOTIFIER_USE_CHANGE_PTE flag for page table updates that use set_pte_at_notify() and where we are going either from read and write to read only with same pfn or read only to read and write with new pfn. Note that set_pte_at_notify() itself should only be used in rare cases, i.e. we do not want to use it when we are updating a significant range of virtual addresses and thus a significant number of ptes. Instead, for those cases the event provided to the mmu notifier invalidate_range_start() callback should be used for optimization. Changes since v1: - Use the new unsigned flags field in struct mmu_notifier_range - Use the new flags parameter to mmu_notifier_range_init() - Explicitly list all the patterns where we can use change_pte() Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 34 -- mm/ksm.c | 11 ++- mm/memory.c | 5 +++-- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b6c004bd9f6a..0230a4b06b46 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -40,6 +40,26 @@ enum mmu_notifier_event { MMU_NOTIFY_SOFT_DIRTY, }; +/* + * @MMU_NOTIFIER_RANGE_BLOCKABLE: can the mmu notifier range_start/range_end + * callback block or not ? If set then the callback can block.
+ * + * @MMU_NOTIFIER_USE_CHANGE_PTE: only set when the page table it updated with + * the set_pte_at_notify() the valid patterns for this are: + * - pte read and write to read only same pfn + * - pte read only to read and write (pfn can change or stay the same) + * - pte read only to read only with different pfn + * It is illegal to set in any other circumstances. + * + * Note that set_pte_at_notify() should not be use outside of the above cases. + * When updating a range in batch (like write protecting a range) it is better + * to rely on invalidate_range_start() and struct mmu_notifier_range to infer + * the kind of update that is happening (as an example you can look at the + * mmu_notifier_range_update_to_read_only() function). + */ +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) +#define MMU_NOTIFIER_USE_CHANGE_PTE (1 << 1) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -55,8 +75,6 @@ struct mmu_notifier_mm { spinlock_t lock; }; -#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) - struct mmu_notifier_range { struct vm_area_struct *vma; struct mm_struct *mm; @@ -268,6 +286,12 @@ mmu_notifier_range_blockable(const struct mmu_notifier_range *range) return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } +static inline bool +mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range) +{ + return (range->flags & MMU_NOTIFIER_USE_CHANGE_PTE); +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -509,6 +533,12 @@ mmu_notifier_range_blockable(const struct mmu_notifier_range *range) return true; } +static inline bool +mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range) +{ + return false; +} + static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; diff --git a/mm/ksm.c b/mm/ksm.c index b782fadade8f..41e51882f999 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1066,9 +1066,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - 
mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, - pvmw.address, - pvmw.address + PAGE_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, + MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm, + pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(); if (!page_vma_mapped_walk()) @@ -1155,8 +1155,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, - addr + PAGE_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, + MMU_NOTIFIER_USE_CHANGE_PTE, + vma, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(); ptep = pte_offset_map_lock(mm, pmd, addr, ); diff
[PATCH v5 6/9] mm/mmu_notifier: use correct mmu_notifier events for each invalidation
From: Jérôme Glisse This update each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 4 ++-- kernel/events/uprobes.c | 2 +- mm/huge_memory.c| 14 ++ mm/hugetlb.c| 8 mm/khugepaged.c | 2 +- mm/ksm.c| 4 ++-- mm/madvise.c| 2 +- mm/memory.c | 14 +++--- mm/migrate.c| 4 ++-- mm/mprotect.c | 5 +++-- mm/rmap.c | 6 +++--- 11 files changed, 32 insertions(+), 33 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fcbd0e574917..3b93ce496dd4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1151,8 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, - NULL, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_SOFT_DIRTY, + 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 46f546bdba00..8e8342080013 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git 
a/mm/huge_memory.c b/mm/huge_memory.c index c9d638f1b34e..1da6ca0f0f6d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1184,9 +1184,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1349,9 +1348,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); spin_lock(vmf->ptl); @@ -2028,7 +2026,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(); @@ -2247,7 +2245,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d9e5c5a4c004..a58115c6b0a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3250,7 +3250,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { -
[PATCH v5 7/9] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2
From: Jérôme Glisse CPU page table updates can happen for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of the mmu notifier API track changes to the CPU page table and take specific actions for them. However, the current API only provides the range of virtual addresses affected by the change, not why the change is happening. This patch just passes down the new information by adding it to the mmu_notifier_range structure. Changes since v1: - Initialize flags field from mmu_notifier_range_init() arguments Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 62f94cd85455..0379956fff23 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -58,10 +58,12 @@ struct mmu_notifier_mm { #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; + enum mmu_notifier_event event; }; struct mmu_notifier_ops { @@ -363,10 +365,12 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; - range->flags = 0; + range->flags = flags; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- 
2.17.2
[PATCH v5 4/9] mm/mmu_notifier: contextual information for event enums
From: Jérôme Glisse CPU page table updates can happen for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduces a set of enums that can be associated with each of the events triggering a mmu notifier. Later patches take advantage of those enum values. - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtiness tracking Being able to distinguish munmap() and mremap() from other reasons why the page table is cleared is important to allow users of mmu notifier to update their own internal tracking structures accordingly (on munmap or mremap it is no longer necessary to track the range of virtual addresses as it becomes invalid). Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 30 ++ 1 file changed, 30 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index c8672c366f67..2386e71ac1b8 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() 
or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* -- 2.17.2
[PATCH v5 5/9] mm/mmu_notifier: contextual information for event triggering invalidation v2
From: Jérôme Glisse CPU page table updates can happen for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of the mmu notifier API track changes to the CPU page table and take specific actions for them. However, the current API only provides the range of virtual addresses affected by the change, not why the change is happening. This patch does the initial mechanical conversion of all the places that call mmu_notifier_range_init() to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is known (most invalidations happen against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that everything for the range is going away when that event happens. A later patch converts the mm call paths to use a more appropriate event for each call. Changes since v1: - add the flags parameter to init range flags This is done as 2 patches so that no call site is forgotten, especially as it uses the following coccinelle patch: %<-- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... 
mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } -->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 5 - kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 mm/hugetlb.c | 12 mm/khugepaged.c | 3 ++- mm/ksm.c | 6 -- mm/madvise.c | 3 ++- mm/memory.c | 25 - mm/migrate.c | 5 - mm/mprotect.c| 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c| 3 ++- mm/rmap.c| 6 -- 14 files changed, 62 insertions(+), 30 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92a91e7816d8..fcbd0e574917 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1151,7 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, + NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2386e71ac1b8..62f94cd85455 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -356,6 +356,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void 
mmu_notifier_range_init(struct mmu_notifier_range *range, + enum mmu_notifier_event event, + unsigned flags, + struct vm_area_struct *vma, struct mm_struct *mm,
[PATCH v5 0/9] mmu notifier provide context informations
From: Jérôme Glisse Since the last version [4] I added the extra bits needed for the change_pte optimization (which is a KSM thing). Here I am not posting users of this; they will be posted to the appropriate sub-systems (KVM, GPU, RDMA, ...) once this series gets upstream. If you want to look at users of this see [5] [6]. If this gets in 5.1 then I will be submitting those users for 5.2 (including KVM if KVM folks feel comfortable with it). Note that this series does not change any behavior for any existing code. It just passes down more information to mmu notifier listeners. The rationale for this patchset: CPU page table updates can happen for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patchset introduces a set of enums that can be associated with each of the events triggering a mmu notifier: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtiness tracking Being able to distinguish munmap() and mremap() from other reasons why the page table is cleared is important to allow users of mmu notifier to update their own internal tracking structures accordingly (on munmap or mremap it is no longer necessary to track the range of virtual addresses as it becomes invalid). Without this series, drivers are forced to assume that every notification is an munmap, which triggers useless thrashing within drivers that associate a structure with a range of virtual addresses. Each driver is forced to free up its tracking structure and then restore it on the next device page fault. With this series we can also optimize device page table updates [5]. 
More over this can also be use to optimize out some page table updates like for KVM where we can update the secondary MMU directly from the callback instead of clearing it. Patches to leverage this serie will be posted separately to each sub- system. Cheers, Jérôme [1] v1 https://lkml.org/lkml/2018/3/23/1049 [2] v2 https://lkml.org/lkml/2018/12/5/10 [3] v3 https://lkml.org/lkml/2018/12/13/620 [4] v4 https://lkml.org/lkml/2019/1/23/838 [5] patches to use this: https://lkml.org/lkml/2019/1/23/833 https://lkml.org/lkml/2019/1/23/834 https://lkml.org/lkml/2019/1/23/832 https://lkml.org/lkml/2019/1/23/831 [6] KVM restore change pte optimization https://patchwork.kernel.org/cover/10791179/ Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann Jérôme Glisse (9): mm/mmu_notifier: helper to test if a range invalidation is blockable mm/mmu_notifier: convert user range->blockable to helper function mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags mm/mmu_notifier: contextual information for event enums mm/mmu_notifier: contextual information for event triggering invalidation v2 mm/mmu_notifier: use correct mmu_notifier events for each invalidation mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2 mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate v2 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 +-- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 4 +- drivers/infiniband/core/umem_odp.c | 5 +- drivers/xen/gntdev.c| 6 
+- fs/proc/task_mmu.c | 3 +- include/linux/mmu_notifier.h| 93 +++-- kernel/events/uprobes.c | 3 +- mm/hmm.c| 6 +- mm/huge_memory.c| 14 ++-- mm/hugetlb.c| 12 ++-- mm/khugepaged.c | 3 +- mm/ksm.c| 9 ++- mm/madvise.c| 3 +- mm/memory.c | 26 --- mm/migrate.c| 5 +- mm/mmu_notifier.c | 12 +++- mm/mprotect.c | 4 +- mm/mremap.c | 3 +- mm/oom_kill.c | 3 +- mm/rmap.c | 6 +- virt/kvm/kvm_main.c | 3 +- 22 files changed, 180 insertions(+), 53 deletions(-) -- 2.17.2
[PATCH v5 3/9] mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags
From: Jérôme Glisse Use an unsigned field for flags other than blockable and convert the blockable field to be one of those flags. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e630def131ce..c8672c366f67 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,11 +25,13 @@ struct mmu_notifier_mm { spinlock_t lock; }; +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) + struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; - bool blockable; + unsigned flags; }; struct mmu_notifier_ops { @@ -229,7 +231,7 @@ extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { - return range->blockable; + return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) @@ -275,7 +277,7 @@ static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = true; + range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } } @@ -284,7 +286,7 @@ static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = false; + range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; return 
__mmu_notifier_invalidate_range_start(range); } return 0; @@ -331,6 +333,7 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, range->mm = mm; range->start = start; range->end = end; + range->flags = 0; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- 2.17.2
[PATCH v5 2/9] mm/mmu_notifier: convert user range->blockable to helper function
From: Jérôme Glisse Use the mmu_notifier_range_blockable() helper function instead of directly dereferencing the range->blockable field. This is done to make it easier to change the mmu_notifier range field. This patch is the outcome of the following coccinelle patch: %<--- @@ identifier I1, FN; @@ FN(..., struct mmu_notifier_range *I1, ...) { <... -I1->blockable +mmu_notifier_range_blockable(I1) ...> } --->% spatch --in-place --sp-file blockable.spatch --dir . Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 4 ++-- drivers/infiniband/core/umem_odp.c | 5 +++-- drivers/xen/gntdev.c| 6 +++--- mm/hmm.c| 6 +++--- mm/mmu_notifier.c | 2 +- virt/kvm/kvm_main.c | 3 ++- 8 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3e6823fdd939..58ed401c5996 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); 
return -EAGAIN; } @@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end = range->end - 1; - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(>objects, range->start, end); @@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct amdgpu_mn_node *node; struct amdgpu_bo *bo; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 1d3f9a31ad61..777b3f8727e7 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, while (it) { struct drm_i915_gem_object *obj; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; break; } diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index b3019505065a..c9bd1278f573 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * the tear down. 
*/ - if (range->blockable) + if (mmu_notifier_range_blockable(range)) mutex_lock(>lock); else if (!mutex_trylock(>lock)) return -EAGAIN; @@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; goto out_unlock; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 012044f16d1c..3a3f1538d295 100644 --- a/drivers/infiniband/core/umem_odp.c +++
[PATCH v5 1/9] mm/mmu_notifier: helper to test if a range invalidation is blockable
From: Jérôme Glisse Simple helpers to test if a range invalidation is blockable. Later patches use coccinelle to convert all direct dereferences of range->blockable to use this function instead, so that we can convert the blockable field to an unsigned for more flags. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 11 +++ 1 file changed, 11 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..e630def131ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -226,6 +226,12 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return range->blockable; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -455,6 +461,11 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, #define mmu_notifier_range_init(range, mm, start, end) \ _mmu_notifier_range_init(range, start, end) +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return true; +} static inline int mm_has_notifiers(struct mm_struct *mm) { -- 2.17.2
[RFC PATCH 1/4] uprobes: use set_pte_at() not set_pte_at_notify()
From: Jérôme Glisse Using set_pte_at_notify() triggers useless calls to change_pte(), so just use set_pte_at() instead. The reason is that set_pte_at_notify() should only be used when going either from a read and write pte to a read only pte with the same pfn, or from read only to read and write with a different pfn. set_pte_at_notify() was used because the __replace_page() code came from the mm/ksm.c code, in which the above rules are valid. Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Peter Xu Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Andrew Morton Cc: Matthew Wilcox Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: k...@vger.kernel.org --- kernel/events/uprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 87e76a1dc758..a4807b1edd7f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -207,8 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); ptep_clear_flush_notify(vma, addr, pvmw.pte); - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + set_pte_at(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) -- 2.17.1
[RFC PATCH 3/4] mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate
From: Jérôme Glisse When notifying a change for a range, use the MMU_NOTIFIER_USE_CHANGE_PTE flag for page table updates that use set_pte_at_notify() and where we are going either from read and write to read only with the same pfn, or from read only to read and write with a new pfn. Note that set_pte_at_notify() itself should only be used in rare cases, i.e. we do not want to use it when we are updating a significant range of virtual addresses and thus a significant number of ptes. Instead, for those cases the event provided to the mmu notifier invalidate_range_start() callback should be used for optimization. Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Peter Xu Cc: Andrew Morton Cc: Paolo Bonzini Cc: Radim Krčmář Cc: k...@vger.kernel.org --- include/linux/mmu_notifier.h | 13 + mm/ksm.c | 6 -- mm/memory.c | 3 ++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d7a35975c2bd..0885bf33dc9c 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -43,6 +43,19 @@ enum mmu_notifier_event { }; #define MMU_NOTIFIER_EVENT_BITS order_base_2(MMU_NOTIFY_EVENT_MAX) +/* + * Set MMU_NOTIFIER_USE_CHANGE_PTE only when the page table it updated with the + * set_pte_at_notify() and when pte is updated from read and write to read only + * with same pfn or from read only to read and write with different pfn. It is + * illegal to set in any other circumstances. + * + * Note that set_pte_at_notify() should not be use outside of the above cases. + * When updating a range in batch (like write protecting a range) it is better + * to rely on invalidate_range_start() and struct mmu_notifier_range to infer + * the kind of update that is happening (as an example you can look at the + * mmu_notifier_range_update_to_read_only() function). 
+ */ +#define MMU_NOTIFIER_USE_CHANGE_PTE (1 << MMU_NOTIFIER_EVENT_BITS) #ifdef CONFIG_MMU_NOTIFIER diff --git a/mm/ksm.c b/mm/ksm.c index 97757c5fa15f..b7fb7b560cc0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1051,7 +1051,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR | + MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(); @@ -1140,7 +1141,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, mm, addr, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR | + MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(); diff --git a/mm/memory.c b/mm/memory.c index a8c6922526f6..daf4b0f92af8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2275,7 +2275,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR | + MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(); -- 2.17.1
[RFC PATCH 2/4] mm/mmu_notifier: use unsigned for event field in range struct
From: Jérôme Glisse Use unsigned for event field in range struct so that we can also set flags with the event. This patch change the field and introduce the helper. Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Peter Xu Cc: Andrew Morton Cc: Paolo Bonzini Cc: Radim Krčmář Cc: k...@vger.kernel.org --- include/linux/mmu_notifier.h | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index be873c431886..d7a35975c2bd 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -6,6 +6,7 @@ #include #include #include +#include struct mmu_notifier; struct mmu_notifier_ops; @@ -38,8 +39,11 @@ enum mmu_notifier_event { MMU_NOTIFY_PROTECTION_VMA, MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_SOFT_DIRTY, + MMU_NOTIFY_EVENT_MAX }; +#define MMU_NOTIFIER_EVENT_BITS order_base_2(MMU_NOTIFY_EVENT_MAX) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -60,7 +64,7 @@ struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; - enum mmu_notifier_event event; + unsigned event; bool blockable; }; @@ -352,7 +356,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, - enum mmu_notifier_event event, + unsigned event, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, -- 2.17.1
[RFC PATCH 4/4] kvm/mmu_notifier: re-enable the change_pte() optimization.
From: Jérôme Glisse Since the changes to mmu notifier, the change_pte() optimization was lost for kvm. This re-enables it whenever a pte is going from read and write to read only with the same pfn, or from read only to read and write with a different pfn. It is safe to update the secondary MMUs, because the primary MMU pte invalidate must have already happened with a ptep_clear_flush() before set_pte_at_notify() is invoked (and thus before the change_pte() callback). Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Peter Xu Cc: Andrew Morton Cc: Paolo Bonzini Cc: Radim Krčmář Cc: k...@vger.kernel.org --- virt/kvm/kvm_main.c | 16 1 file changed, 16 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5ecea812cb6a..fec155c2d7b8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -369,6 +369,14 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, int need_tlb_flush = 0, idx; int ret; + /* +* Nothing to do when MMU_NOTIFIER_USE_CHANGE_PTE is set as it means +* that change_pte() will be call and it is a situation in which we +* allow to only rely on change_pte(). +*/ + if (range->event & MMU_NOTIFIER_USE_CHANGE_PTE) + return 0; + idx = srcu_read_lock(>srcu); spin_lock(>mmu_lock); /* @@ -398,6 +406,14 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, { struct kvm *kvm = mmu_notifier_to_kvm(mn); + /* +* Nothing to do when MMU_NOTIFIER_USE_CHANGE_PTE is set as it means +* that change_pte() will be call and it is a situation in which we +* allow to only rely on change_pte(). +*/ + if (range->event & MMU_NOTIFIER_USE_CHANGE_PTE) + return; + spin_lock(>mmu_lock); /* * This sequence increase will notify the kvm page fault that -- 2.17.1
[RFC PATCH 0/4] Restore change_pte optimization to its former glory
From: Jérôme Glisse This patchset is on top of my patchset to add context information to mmu notifier [1]; you can find a branch with everything at [2]. I have not tested it but I wanted to get the discussion started. I believe it is correct but I am not sure what kind of kvm test I can run to exercise this. The idea is that since kvm will invalidate the secondary MMUs within the invalidate_range callback, the change_pte() optimization is lost. With this patchset, every time core mm is using set_pte_at_notify() and thus change_pte() gets called, we can ignore the invalidate_range callback altogether and only rely on the change_pte callback. Note that this is only valid when either going from a read and write pte to a read only pte with the same pfn, or from a read only pte to a read and write pte with a different pfn. The other side of the story is that the primary mmu pte is cleared with ptep_clear_flush_notify before the call to change_pte. Also with the mmu notifier context information [1] you can further optimize other cases like mprotect or write protect when forking. You can use the new context information to infer that the invalidation is for a read only update of the primary mmu and update the secondary mmu accordingly instead of clearing it and forcing a fault even for read access. I do not know if that is an optimization that would bear any fruit for kvm. It does help for device drivers. You can also optimize the soft dirty update. 
Cheers, Jérôme [1] https://lore.kernel.org/linux-fsdevel/20190123222315.1122-1-jgli...@redhat.com/T/#m69e8f589240e18acbf196a1c8aa1d6fc97bd3565 [2] https://cgit.freedesktop.org/~glisse/linux/log/?h=kvm-restore-change_pte Cc: Andrea Arcangeli Cc: Peter Xu Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Andrew Morton Cc: Matthew Wilcox Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: k...@vger.kernel.org Jérôme Glisse (4): uprobes: use set_pte_at() not set_pte_at_notify() mm/mmu_notifier: use unsigned for event field in range struct mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate kvm/mmu_notifier: re-enable the change_pte() optimization. include/linux/mmu_notifier.h | 21 +++-- kernel/events/uprobes.c | 3 +-- mm/ksm.c | 6 -- mm/memory.c | 3 ++- virt/kvm/kvm_main.c | 16 5 files changed, 42 insertions(+), 7 deletions(-) -- 2.17.1
[RFC PATCH 4/5] mm/hmm: add support for peer to peer to HMM device memory
From: Jérôme Glisse Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- include/linux/hmm.h | 47 + mm/hmm.c| 63 + 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4a1454e3efba..7a3ac182cc48 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -710,6 +710,53 @@ struct hmm_devmem_ops { const struct page *page, unsigned int flags, pmd_t *pmdp); + + /* +* p2p_map() - map page for peer to peer between device +* @devmem: device memory structure (see struct hmm_devmem) +* @range: range of virtual address that is being mapped +* @device: device the range is being map to +* @addr: first virtual address in the range to consider +* @pa: device address (where actual mapping is store) +* Returns: number of page successfuly mapped, 0 otherwise +* +* Map page belonging to devmem to another device for peer to peer +* access. Device can decide not to map in which case memory will +* be migrated to main memory. +* +* Also there is no garantee that all the pages in the range does +* belongs to the devmem so it is up to the function to check that +* every single page does belong to devmem. +* +* Note for now we do not care about error exect error, so on failure +* function should just return 0. 
+*/ + long (*p2p_map)(struct hmm_devmem *devmem, + struct hmm_range *range, + struct device *device, + unsigned long addr, + dma_addr_t *pas); + + /* +* p2p_unmap() - unmap page from peer to peer between device +* @devmem: device memory structure (see struct hmm_devmem) +* @range: range of virtual address that is being mapped +* @device: device the range is being map to +* @addr: first virtual address in the range to consider +* @pa: device address (where actual mapping is store) +* Returns: number of page successfuly unmapped, 0 otherwise +* +* Unmap page belonging to devmem previously map with p2p_map(). +* +* Note there is no garantee that all the pages in the range does +* belongs to the devmem so it is up to the function to check that +* every single page does belong to devmem. +*/ + unsigned long (*p2p_unmap)(struct hmm_devmem *devmem, + struct hmm_range *range, + struct device *device, + unsigned long addr, + dma_addr_t *pas); }; /* diff --git a/mm/hmm.c b/mm/hmm.c index 1a444885404e..fd49b1e116d0 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1193,16 +1193,19 @@ long hmm_range_dma_map(struct hmm_range *range, dma_addr_t *daddrs, bool block) { - unsigned long i, npages, mapped, page_size; + unsigned long i, npages, mapped, page_size, addr; long ret; +again: ret = hmm_range_fault(range, block); if (ret <= 0) return ret ? ret : -EBUSY; + mapped = 0; + addr = range->start; page_size = hmm_range_page_size(range); npages = (range->end - range->start) >> range->page_shift; - for (i = 0, mapped = 0; i < npages; ++i) { + for (i = 0; i < npages; ++i, addr += page_size) { enum dma_data_direction dir = DMA_FROM_DEVICE; struct page *page; @@ -1226,6 +1229,29 @@ long hmm_range_dma_map(struct hmm_range *range, goto unmap; } + if (is_device_private_page(page)) { + struct hmm_devmem *devmem = page->pgmap->data; + + if (!devmem->ops->p2p_map || !devmem->ops->p2p_unmap) { + /* Fall-back to main memory. 
*/ + range->default_flags |= + range->flags[HMM_PFN_DEVICE_PRIVATE]; + goto again; + } + + ret = devmem->ops->p2p_map(devmem, range, device, + addr, daddrs); + if (ret <= 0) { + /* Fall-back to main memory. */ + range->default_flags |= +
[RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma
From: Jérôme Glisse Allow mmap of device file to export device memory to peer to peer devices. This will allow for instance a network device to access a GPU memory or to access a storage device queue directly. The common case will be a vma created by userspace device driver that is then share to another userspace device driver which call in its kernel device driver to map that vma. The vma does not need to have any valid CPU mapping so that only peer to peer device might access its content. Or it could have valid CPU mapping too in that case it should point to same memory for consistency. Note that peer to peer mapping is highly platform and device dependent and it might not work in all the cases. However we do expect supports for this to grow on more hardware platform. This patch only adds new call backs to vm_operations_struct bulk of code light within common bus driver (like pci) and device driver (both the exporting and importing device). Current design mandate that the importer must obey mmu_notifier and invalidate any peer to peer mapping anytime a notification of invalidation happens for a range that have been peer to peer mapped. This allows exporter device to easily invalidate mapping for any importer device. Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. 
Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-kernel@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- include/linux/mm.h | 38 ++ 1 file changed, 38 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..1bd60a90e575 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -429,6 +429,44 @@ struct vm_operations_struct { pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); + /* +* Optional for device driver that want to allow peer to peer (p2p) +* mapping of their vma (which can be back by some device memory) to +* another device. +* +* Note that the exporting device driver might not have map anything +* inside the vma for the CPU but might still want to allow a peer +* device to access the range of memory corresponding to a range in +* that vma. +* +* FOR PREDICTABILITY IF DRIVER SUCCESSFULY MAP A RANGE ONCE FOR A +* DEVICE THEN FURTHER MAPPING OF THE SAME IF THE VMA IS STILL VALID +* SHOULD ALSO BE SUCCESSFUL. Following this rule allow the importing +* device to map once during setup and report any failure at that time +* to the userspace. Further mapping of the same range might happen +* after mmu notifier invalidation over the range. The exporting device +* can use this to move things around (defrag BAR space for instance) +* or do other similar task. +* +* IMPORTER MUST OBEY mmu_notifier NOTIFICATION AND CALL p2p_unmap() +* WHEN A NOTIFIER IS CALL FOR THE RANGE ! THIS CAN HAPPEN AT ANY +* POINT IN TIME WITH NO LOCK HELD. +* +* In below function, the device argument is the importing device, +* the exporting device is the device to which the vma belongs. 
+*/ + long (*p2p_map)(struct vm_area_struct *vma, + struct device *device, + unsigned long start, + unsigned long end, + dma_addr_t *pa, + bool write); + long (*p2p_unmap)(struct vm_area_struct *vma, + struct device *device, + unsigned long start, + unsigned long end, + dma_addr_t *pa); + /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); -- 2.17.2
[RFC PATCH 5/5] mm/hmm: add support for peer to peer to special device vma
From: Jérôme Glisse Special device vma (mmap of a device file) can correspond to device driver object that some device driver might want to share with other device (giving access to). This add support for HMM to map those special device vma if the owning device (exporter) allows it. Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- include/linux/hmm.h | 6 ++ mm/hmm.c| 156 ++-- 2 files changed, 128 insertions(+), 34 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7a3ac182cc48..98ebe9f52432 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -137,6 +137,7 @@ enum hmm_pfn_flag_e { * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. + * HMM_PFN_P2P: this entry have been map as P2P ie the dma address is valid * * Driver provide entry value for none entry, error entry and special entry, * driver can alias (ie use same value for error and special for instance). 
It @@ -151,6 +152,7 @@ enum hmm_pfn_value_e { HMM_PFN_ERROR, HMM_PFN_NONE, HMM_PFN_SPECIAL, + HMM_PFN_P2P, HMM_PFN_VALUE_MAX }; @@ -250,6 +252,8 @@ static inline bool hmm_range_valid(struct hmm_range *range) static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, uint64_t pfn) { + if (pfn == range->values[HMM_PFN_P2P]) + return NULL; if (pfn == range->values[HMM_PFN_NONE]) return NULL; if (pfn == range->values[HMM_PFN_ERROR]) @@ -270,6 +274,8 @@ static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, uint64_t pfn) { + if (pfn == range->values[HMM_PFN_P2P]) + return -1UL; if (pfn == range->values[HMM_PFN_NONE]) return -1UL; if (pfn == range->values[HMM_PFN_ERROR]) diff --git a/mm/hmm.c b/mm/hmm.c index fd49b1e116d0..621a4f831483 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1058,37 +1058,36 @@ long hmm_range_snapshot(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_snapshot); -/* - * hmm_range_fault() - try to fault some address in a virtual address range - * @range: range being faulted - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: 0 on success ortherwise: - * -EINVAL: - * Invalid argument - * -ENOMEM: - * Out of memory. - * -EPERM: - * Invalid permission (for instance asking for write and range - * is read only). - * -EAGAIN: - * If you need to retry and mmap_sem was drop. This can only - * happens if block argument is false. - * -EBUSY: - * If the the range is being invalidated and you should wait for - * invalidation to finish. - * -EFAULT: - * Invalid (ie either no valid vma or it is illegal to access that - * range), number of valid pages in range->pfns[] (from range start - * address). - * - * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs - * and caller does not ask for migration. 
- * - * On error, for one virtual address in the range, the function will mark the - * corresponding HMM pfn entry with an error flag. - */ -long hmm_range_fault(struct hmm_range *range, bool block) +static int hmm_vma_p2p_map(struct hmm_range *range, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct device *device, dma_addr_t *pas) +{ + struct hmm_vma_walk hmm_vma_walk; + unsigned long npages, i; + bool fault, write; + uint64_t *pfns; + int ret; + + i = (start - range->start) >> PAGE_SHIFT; + npages = (end - start) >> PAGE_SHIFT; + pfns = >pfns[i]; + pas = [i]; + + hmm_vma_walk.range = range; + hmm_vma_walk.fault = true; + hmm_range_need_fault(_vma_walk, pfns, npages, + 0, , ); + + ret = vma->vm_ops->p2p_map(vma, device, start, end, pas, write); + for (i = 0; i < npages; ++i) { + pfns[i] = ret ? range->values[HMM_PFN_ERROR] : + range->values[HMM_PFN_P2P]; + } + return ret; +} + +static long
[RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability
From: Jérôme Glisse device_test_p2p() return true if two devices can peer to peer to each other. We add a generic function as different inter-connect can support peer to peer and we want to genericaly test this no matter what the inter-connect might be. However this version only support PCIE for now. Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-kernel@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- drivers/pci/p2pdma.c | 27 +++ include/linux/pci-p2pdma.h | 6 ++ 2 files changed, 33 insertions(+) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index c52298d76e64..620ac60babb5 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -797,3 +797,30 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, return sprintf(page, "%s\n", pci_name(p2p_dev)); } EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show); + +bool pci_test_p2p(struct device *devA, struct device *devB) +{ + struct pci_dev *pciA, *pciB; + bool ret; + int tmp; + + /* +* For now we only support PCIE peer to peer but other inter-connect +* can be added. +*/ + pciA = find_parent_pci_dev(devA); + pciB = find_parent_pci_dev(devB); + if (pciA == NULL || pciB == NULL) { + ret = false; + goto out; + } + + tmp = upstream_bridge_distance(pciA, pciB, NULL); + ret = tmp < 0 ? 
false : true; + +out: + pci_dev_put(pciB); + pci_dev_put(pciA); + return false; +} +EXPORT_SYMBOL_GPL(pci_test_p2p); diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index bca9bc3e5be7..7671cc499a08 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -36,6 +36,7 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); +bool pci_test_p2p(struct device *devA, struct device *devB); #else /* CONFIG_PCI_P2PDMA */ static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) @@ -97,6 +98,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, { return sprintf(page, "none\n"); } + +static inline bool pci_test_p2p(struct device *devA, struct device *devB) +{ + return false; +} #endif /* CONFIG_PCI_P2PDMA */ -- 2.17.2
[RFC PATCH 2/5] drivers/base: add a function to test peer to peer capability
From: Jérôme Glisse device_test_p2p() return true if two devices can peer to peer to each other. We add a generic function as different inter-connect can support peer to peer and we want to genericaly test this no matter what the inter-connect might be. However this version only support PCIE for now. Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-kernel@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- drivers/base/core.c| 20 include/linux/device.h | 1 + 2 files changed, 21 insertions(+) diff --git a/drivers/base/core.c b/drivers/base/core.c index 0073b09bb99f..56023b00e108 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "base.h" #include "power/power.h" @@ -3167,3 +3168,22 @@ void device_set_of_node_from_dev(struct device *dev, const struct device *dev2) dev->of_node_reused = true; } EXPORT_SYMBOL_GPL(device_set_of_node_from_dev); + +/** + * device_test_p2p - test if two device can peer to peer to each other + * @devA: device A + * @devB: device B + * Returns: true if device can peer to peer to each other, false otherwise + */ +bool device_test_p2p(struct device *devA, struct device *devB) +{ + /* +* For now we only support PCIE peer to peer but other inter-connect +* can be added. 
+*/ + if (pci_test_p2p(devA, devB)) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(device_test_p2p); diff --git a/include/linux/device.h b/include/linux/device.h index 6cb4640b6160..0d532d7f0779 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1250,6 +1250,7 @@ extern int device_online(struct device *dev); extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); extern void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); +bool device_test_p2p(struct device *devA, struct device *devB); static inline int dev_num_vf(struct device *dev) { -- 2.17.2
[RFC PATCH 0/5] Device peer to peer (p2p) through vma
From: Jérôme Glisse This patchset adds support for peer to peer between devices in two manners. First for device memory used through HMM in a process regular address space (ie inside a regular vma that is not an mmap of a device file or special file). Second for special vma ie mmap of a device file, in this case some device drivers might want to allow other devices to directly access memory used for those special vma (note that the memory might not even be mapped to the CPU in this case). There are many use cases for this; they mainly fall into 2 categories: [A]-Allow a device to directly map and control another device command queue. [B]-Allow a device to access another device memory without disrupting the other device computation. Corresponding workloads: [1]-Network device directly accesses and controls a block device command queue so that it can do storage access without involving the CPU. This falls into [A]. [2]-Accelerator device doing heavy computation and network device is monitoring progress. Direct access to the accelerator's memory by the network device avoids the need to use much slower system memory. This falls into [B]. [3]-Accelerator device doing heavy computation and network device is streaming out the result. This avoids the need to first bounce the result through system memory (it saves both system memory and bandwidth). This falls into [B]. [4]-Chaining device computation. For instance a camera device takes a picture, streams it to a color correction device that streams it to final memory. This falls into [A and B]. People have more ideas on how to use this than I can list here. The intention of this patchset is to provide the means to achieve those and much more. I have done some testing using nouveau and Mellanox mlx5 where the mlx5 device can directly access GPU memory [1]. I intend to use this inside nouveau and help porting AMD ROCm RDMA to use this [2]. I believe other people have expressed interest in working on using this with network devices and block devices.
From an implementation point of view this just adds 2 new callbacks to vm_operations_struct (for special device vma support) and 2 new callbacks to the HMM device memory structure for HMM device memory support. For now it needs IOMMU off with ACS disabled and for both devices to be on the same PCIE sub-tree (can not cross root complex). However the intention here is different from some other peer to peer work in that we do want to support IOMMU and are fine with going through the root complex in that case. In other words, the bandwidth advantage of avoiding the root complex is of less importance than the programming model for the feature. We do actually expect that this will be used mostly with IOMMU enabled and thus with having to go through the root bridge. Another difference from other p2p solutions is that we do require that the importing device abide by mmu notifier invalidation so that the exporting device can always invalidate a mapping at any point in time. For this reason we do not need a struct page for the device memory. Also in all the cases the policy and final decision on whether to map or not is solely under the control of the exporting device. Finally the device memory might not even be mapped to the CPU and thus we have to go through the exporting device driver to get the physical address at which the memory is accessible. The core changes are minimal (adding new callbacks to some structs). IOMMU support will need little change too. Most of the code is in drivers to implement export policy and BAR space management. Very gross playground with IOMMU support in [3] (top 3 patches). Cheers, Jérôme [1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-p2p [2] https://github.com/RadeonOpenCompute/ROCnRDMA [3] https://cgit.freedesktop.org/~glisse/linux/log/?h=wip-hmm-p2p Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J.
Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org Jérôme Glisse (5): pci/p2p: add a function to test peer to peer capability drivers/base: add a function to test peer to peer capability mm/vma: add support for peer to peer to device vma mm/hmm: add support for peer to peer to HMM device memory mm/hmm: add support for peer to peer to special device vma drivers/base/core.c| 20 drivers/pci/p2pdma.c | 27 + include/linux/device.h | 1 + include/linux/hmm.h| 53 + include/linux/mm.h | 38 +++ include/linux/pci-p2pdma.h | 6 + mm/hmm.c | 219 ++--- 7 files changed, 325 insertions(+), 39 deletions(-) -- 2.17.2
[PATCH 1/1] RDMA/odp: convert to use HMM for ODP
From: Jérôme Glisse Convert ODP to use HMM so that we can build on common infrastructure for different class of devices that want to mirror a process address space into a device. There is no functional changes. Signed-off-by: Jérôme Glisse Cc: linux-r...@vger.kernel.org Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Doug Ledford Cc: Artemy Kovalyov Cc: Moni Shoua Cc: Mike Marciniszyn Cc: Kaike Wan Cc: Dennis Dalessandro --- drivers/infiniband/core/umem_odp.c | 483 - drivers/infiniband/hw/mlx5/mem.c | 20 +- drivers/infiniband/hw/mlx5/mr.c| 2 +- drivers/infiniband/hw/mlx5/odp.c | 95 +++--- include/rdma/ib_umem_odp.h | 54 +--- 5 files changed, 202 insertions(+), 452 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index a4ec43093cb3..8afa707f1d9a 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -45,6 +45,20 @@ #include #include + +static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = { + ODP_READ_BIT, /* HMM_PFN_VALID */ + ODP_WRITE_BIT, /* HMM_PFN_WRITE */ + ODP_DEVICE_BIT, /* HMM_PFN_DEVICE_PRIVATE */ +}; + +static uint64_t odp_hmm_values[HMM_PFN_VALUE_MAX] = { + -1UL, /* HMM_PFN_ERROR */ + 0UL,/* HMM_PFN_NONE */ + -2UL, /* HMM_PFN_SPECIAL */ +}; + + /* * The ib_umem list keeps track of memory regions for which the HW * device request to receive notification when the related memory @@ -77,57 +91,25 @@ static u64 node_last(struct umem_odp_node *n) INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(_odp->umem_mutex); - if (umem_odp->notifiers_count++ == 0) - /* -* Initialize the completion object for waiting on -* notifiers. Since notifier_count is zero, no one should be -* waiting right now. 
-*/ - reinit_completion(_odp->notifier_completion); - mutex_unlock(_odp->umem_mutex); -} - -static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(_odp->umem_mutex); - /* -* This sequence increase will notify the QP page fault that the page -* that is going to be mapped in the spte could have been freed. -*/ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(_odp->notifier_completion); - mutex_unlock(_odp->umem_mutex); -} - static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct ib_umem *umem = _odp->umem; - /* -* Increase the number of notifiers running, to -* prevent any further fault handling on this MR. -*/ - ib_umem_notifier_start_account(umem_odp); umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. */ smp_wmb(); - complete_all(_odp->notifier_completion); umem->context->invalidate_range(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); return 0; } -static void ib_umem_notifier_release(struct mmu_notifier *mn, -struct mm_struct *mm) +static void ib_umem_notifier_release(struct hmm_mirror *mirror) { - struct ib_ucontext_per_mm *per_mm = - container_of(mn, struct ib_ucontext_per_mm, mn); + struct ib_ucontext_per_mm *per_mm; + + per_mm = container_of(mirror, struct ib_ucontext_per_mm, mirror); down_read(_mm->umem_rwsem); if (per_mm->active) @@ -135,21 +117,24 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, _mm->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, true, NULL); up_read(_mm->umem_rwsem); + + per_mm->mm = NULL; } -static int invalidate_range_start_trampoline(struct ib_umem_odp *item, -u64 start, u64 end, void *cookie) +static int invalidate_range_trampoline(struct ib_umem_odp *item, + u64 start, u64 end, void *cookie) { - ib_umem_notifier_start_account(item); item->umem.context->invalidate_range(item, start, end); return 0; } 
-static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) +static int ib_sync_cpu_device_pagetables(struct hmm_mirror *mirror, + const struct hmm_update *range) { - struct ib_ucontext_per_mm *per_mm = - container_of(mn, struct
[RFC PATCH 0/1] Use HMM for ODP
From: Jérôme Glisse This patchset converts RDMA ODP to use HMM underneath. This is motivated by stronger code sharing for the same feature (Shared Virtual Memory, SVM, or Shared Virtual Address, SVA) and also stronger integration with mm code to achieve that. It depends on the HMM patchset posted for inclusion in 5.1 [2], so the earliest target for this should be 5.2. I welcome any testing people can do on this. Moreover there are some features of HMM in the works like peer to peer support, fast CPU page table snapshot, fast IOMMU mapping update ... It will be easier for RDMA devices with ODP to leverage those if they use HMM underneath. Quick summary of what HMM is: HMM is a toolbox for device drivers to implement software support for Shared Virtual Memory (SVM). Not only does it provide helpers to mirror a process address space on a device (hmm_mirror), it also provides helpers to allow device memory to back regular valid virtual addresses of a process (any valid mmap that is not an mmap of a device or a DAX mapping). There are two kinds of device memory. Private memory that is not accessible to the CPU because it does not have all the expected properties (this is for all PCIE devices) or public memory which can also be accessed by the CPU without restriction (with OpenCAPI or CCIX or similar cache-coherent and atomic inter-connect). A device driver can use each of the HMM tools separately. You do not have to use all the tools it provides. For RDMA devices I do not expect a need to use the device memory support of HMM. This device memory support is geared toward accelerators like GPUs. You can find a branch [1] with all the prerequisites in it. This patch is on top of 5.0rc2+ but I can rebase it on any specific branch before it is considered for inclusion (5.2 at best). Questions and reviews are more than welcome.
[1] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm [2] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-for-5.1 Cc: linux-r...@vger.kernel.org Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Doug Ledford Cc: Artemy Kovalyov Cc: Moni Shoua Cc: Mike Marciniszyn Cc: Kaike Wan Cc: Dennis Dalessandro Jérôme Glisse (1): RDMA/odp: convert to use HMM for ODP drivers/infiniband/core/umem_odp.c | 483 - drivers/infiniband/hw/mlx5/mem.c | 20 +- drivers/infiniband/hw/mlx5/mr.c| 2 +- drivers/infiniband/hw/mlx5/odp.c | 95 +++--- include/rdma/ib_umem_odp.h | 54 +--- 5 files changed, 202 insertions(+), 452 deletions(-) -- 2.17.2
[PATCH 06/10] mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays.
From: Jérôme Glisse The HMM mirror API can be use in two fashions. The first one where the HMM user coalesce multiple page faults into one request and set flags per pfns for of those faults. The second one where the HMM user want to pre-fault a range with specific flags. For the latter one it is a waste to have the user pre-fill the pfn arrays with a default flags value. This patch adds a default flags value allowing user to set them for a range without having to pre-fill the pfn array. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- include/linux/hmm.h | 7 +++ mm/hmm.c| 12 2 files changed, 19 insertions(+) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 93dc88edc293..4263f8fb32e5 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -165,6 +165,8 @@ enum hmm_pfn_value_e { * @pfns: array of pfns (big enough for the range) * @flags: pfn flags to match device driver page table * @values: pfn value for some special case (none, special, error, ...) + * @default_flags: default flags for the range (write, read, ...) 
+ * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -177,6 +179,8 @@ struct hmm_range { uint64_t*pfns; const uint64_t *flags; const uint64_t *values; + uint64_tdefault_flags; + uint64_tpfn_flags_mask; uint8_t pfn_shift; boolvalid; }; @@ -521,6 +525,9 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) { long ret; + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + ret = hmm_range_register(range, range->vma->vm_mm, range->start, range->end); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index 860ebe5d4b07..0a4ff31e9d7a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -423,6 +423,18 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, if (!hmm_vma_walk->fault) return; + /* +* So we not only consider the individual per page request we also +* consider the default flags requested for the range. The API can +* be use in 2 fashions. The first one where the HMM user coalesce +* multiple page fault into one request and set flags per pfns for +* of those faults. The second one where the HMM user want to pre- +* fault a range with specific flags. For the latter one it is a +* waste to have the user pre-fill the pfn arrays with a default +* flags value. +*/ + pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; -- 2.17.2
[PATCH 01/10] mm/hmm: use reference counting for HMM struct
From: Jérôme Glisse Every time i read the code to check that the HMM structure does not vanish before it should thanks to the many lock protecting its removal i get a headache. Switch to reference counting instead it is much easier to follow and harder to break. This also remove some code that is no longer needed with refcounting. Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: John Hubbard Cc: Andrew Morton --- include/linux/hmm.h | 2 + mm/hmm.c| 178 +--- 2 files changed, 120 insertions(+), 60 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 66f9ebbb1df3..bd6e058597a6 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -131,6 +131,7 @@ enum hmm_pfn_value_e { /* * struct hmm_range - track invalidation lock on virtual address range * + * @hmm: the core HMM structure this range is active against * @vma: the vm area struct for the range * @list: all range lock are on a list * @start: range virtual start address (inclusive) @@ -142,6 +143,7 @@ enum hmm_pfn_value_e { * @valid: pfns array did not change since it has been fill by an HMM function */ struct hmm_range { + struct hmm *hmm; struct vm_area_struct *vma; struct list_headlist; unsigned long start; diff --git a/mm/hmm.c b/mm/hmm.c index a04e4b810610..b9f384ea15e9 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -50,6 +50,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; */ struct hmm { struct mm_struct*mm; + struct kref kref; spinlock_t lock; struct list_headranges; struct list_headmirrors; @@ -57,6 +58,16 @@ struct hmm { struct rw_semaphore mirrors_sem; }; +static inline struct hmm *hmm_get(struct mm_struct *mm) +{ + struct hmm *hmm = READ_ONCE(mm->hmm); + + if (hmm && kref_get_unless_zero(>kref)) + return hmm; + + return NULL; +} + /* * hmm_register - register HMM against an mm (HMM internal) * @@ -67,14 +78,9 @@ struct hmm { */ static struct hmm *hmm_register(struct mm_struct *mm) { - struct hmm *hmm = READ_ONCE(mm->hmm); + struct hmm *hmm = hmm_get(mm); bool cleanup = 
false; - /* -* The hmm struct can only be freed once the mm_struct goes away, -* hence we should always have pre-allocated an new hmm struct -* above. -*/ if (hmm) return hmm; @@ -86,6 +92,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(>ranges); spin_lock_init(>lock); + kref_init(>kref); hmm->mm = mm; spin_lock(>page_table_lock); @@ -106,7 +113,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) if (__mmu_notifier_register(>mmu_notifier, mm)) goto error_mm; - return mm->hmm; + return hmm; error_mm: spin_lock(>page_table_lock); @@ -118,9 +125,41 @@ static struct hmm *hmm_register(struct mm_struct *mm) return NULL; } +static void hmm_free(struct kref *kref) +{ + struct hmm *hmm = container_of(kref, struct hmm, kref); + struct mm_struct *mm = hmm->mm; + + mmu_notifier_unregister_no_release(>mmu_notifier, mm); + + spin_lock(>page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(>page_table_lock); + + kfree(hmm); +} + +static inline void hmm_put(struct hmm *hmm) +{ + kref_put(>kref, hmm_free); +} + void hmm_mm_destroy(struct mm_struct *mm) { - kfree(mm->hmm); + struct hmm *hmm; + + spin_lock(>page_table_lock); + hmm = hmm_get(mm); + mm->hmm = NULL; + if (hmm) { + hmm->mm = NULL; + spin_unlock(>page_table_lock); + hmm_put(hmm); + return; + } + + spin_unlock(>page_table_lock); } static int hmm_invalidate_range(struct hmm *hmm, bool device, @@ -165,7 +204,7 @@ static int hmm_invalidate_range(struct hmm *hmm, bool device, static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm_mirror *mirror; - struct hmm *hmm = mm->hmm; + struct hmm *hmm = hmm_get(mm); down_write(>mirrors_sem); mirror = list_first_entry_or_null(>mirrors, struct hmm_mirror, @@ -186,36 +225,50 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) struct hmm_mirror, list); } up_write(>mirrors_sem); + + hmm_put(hmm); } static int hmm_invalidate_range_start(struct mmu_notifier *mn, 
const struct mmu_notifier_range *range) { struct hmm_update update; - struct hmm *hmm = range->mm->hmm; + struct hmm *hmm = hmm_get(range->mm); + int ret;
[PATCH 03/10] mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()
From: Jérôme Glisse Rename for consistency between code, comments and documentation. Also improve the comments on all the possible return values. Improve the function by returning the number of populated entries in the pfns array. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- include/linux/hmm.h | 4 ++-- mm/hmm.c| 23 ++- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index bd6e058597a6..ddf49c1b1f5e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -365,11 +365,11 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * table invalidation serializes on it. * * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_vma_get_pfns() WITHOUT ERROR ! + * hmm_range_snapshot() WITHOUT ERROR ! * * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! */ -int hmm_vma_get_pfns(struct hmm_range *range); +long hmm_range_snapshot(struct hmm_range *range); bool hmm_vma_range_done(struct hmm_range *range); diff --git a/mm/hmm.c b/mm/hmm.c index 74d69812d6be..0d9ecd3337e5 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -706,23 +706,19 @@ static void hmm_pfns_special(struct hmm_range *range) } /* - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @range: range being snapshotted + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * vma permission, 0 success + * permission (for instance asking for write and range is read only), + * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid + * vma or it is illegal to access that range), number of valid pages + * in range->pfns[] (from range start address). * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further * information. 
- * - * The range struct is initialized here. It tracks the CPU page table, but only - * if the function returns success (0), in which case the caller must then call - * hmm_vma_range_done() to stop CPU page table update tracking on this range. - * - * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS - * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! */ -int hmm_vma_get_pfns(struct hmm_range *range) +long hmm_range_snapshot(struct hmm_range *range) { struct vm_area_struct *vma = range->vma; struct hmm_vma_walk hmm_vma_walk; @@ -776,6 +772,7 @@ int hmm_vma_get_pfns(struct hmm_range *range) hmm_vma_walk.fault = false; hmm_vma_walk.range = range; mm_walk.private = _vma_walk; + hmm_vma_walk.last = range->start; mm_walk.vma = vma; mm_walk.mm = vma->vm_mm; @@ -792,9 +789,9 @@ int hmm_vma_get_pfns(struct hmm_range *range) * function return 0). */ range->hmm = hmm; - return 0; + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -EXPORT_SYMBOL(hmm_vma_get_pfns); +EXPORT_SYMBOL(hmm_range_snapshot); /* * hmm_vma_range_done() - stop tracking change to CPU page table over a range -- 2.17.2
[PATCH 04/10] mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault()
From: Jérôme Glisse Rename for consistency between code, comments and documentation. Also improve the comments on all the possible return values. Improve the function by returning the number of populated entries in the pfns array. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- include/linux/hmm.h | 13 ++- mm/hmm.c| 93 - 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ddf49c1b1f5e..ccf2b630447e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -391,7 +391,18 @@ bool hmm_vma_range_done(struct hmm_range *range); * * See the function description in mm/hmm.c for further documentation. */ -int hmm_vma_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, bool block); + +/* This is a temporary helper to avoid merge conflict between trees. */ +static inline int hmm_vma_fault(struct hmm_range *range, bool block) +{ + long ret = hmm_range_fault(range, block); + if (ret == -EBUSY) + ret = -EAGAIN; + else if (ret == -EAGAIN) + ret = -EBUSY; + return ret < 0 ? ret : 0; +} /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/hmm.c b/mm/hmm.c index 0d9ecd3337e5..04235455b4d2 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -344,13 +344,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) - return -EBUSY; + return -EAGAIN; if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } - return -EAGAIN; + return -EBUSY; } static int hmm_pfns_bad(unsigned long addr, @@ -376,7 +376,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? 
* @walk: mm_walk structure - * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * Returns: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -399,12 +399,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, ret = hmm_vma_do_fault(walk, addr, write_fault, [i]); - if (ret != -EAGAIN) + if (ret != -EBUSY) return ret; } } - return (fault || write_fault) ? -EAGAIN : 0; + return (fault || write_fault) ? -EBUSY : 0; } static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, @@ -535,11 +535,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t orig_pfn = *pfn; *pfn = range->values[HMM_PFN_NONE]; - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - , _fault); + fault = write_fault = false; if (pte_none(pte)) { + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, + , _fault); if (fault || write_fault) goto fault; return 0; @@ -578,7 +578,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm, pmdp, addr); - return -EAGAIN; + return -EBUSY; } return 0; } @@ -586,6 +586,10 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, /* Report error for everything else */ *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; + } else { + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + , _fault); } if (fault || write_fault) @@ -636,7 +640,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, if (fault || write_fault) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(vma->vm_mm, pmdp); - return -EAGAIN; + return -EBUSY; } return 0; } else if (!pmd_present(pmd)) @@ -858,53 +862,36 @@ bool
[PATCH 09/10] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem
From: Jérôme Glisse This adds support for mirroring a vma which is an mmap of a file on a filesystem that uses a DAX block device. There is no reason not to support that case. Note that unlike the GUP code we do not take a page reference, hence when we back off we have nothing to undo. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Dan Williams Cc: Ralph Campbell Cc: John Hubbard --- mm/hmm.c | 133 ++- 1 file changed, 112 insertions(+), 21 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 8b87e1813313..1a444885404e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -334,6 +334,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister); struct hmm_vma_walk { struct hmm_range*range; + struct dev_pagemap *pgmap; unsigned long last; boolfault; boolblock; @@ -508,6 +509,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return pud_write(pud) ? 
range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, @@ -529,8 +539,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + if (pmd_devmap(pmd)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } hmm_vma_walk->last = end; return 0; } @@ -617,10 +638,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (fault || write_fault) goto fault; + if (pte_devmap(pte)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); @@ -708,12 +743,84 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, return r; } } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; return 0; } +static int hmm_vma_walk_pud(pud_t *pudp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range 
*range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + unsigned long addr = start, next; + pmd_t *pmdp; + pud_t pud; + int ret; + +again: + pud = READ_ONCE(*pudp); + if (pud_none(pud)) + return hmm_vma_walk_hole(start, end, walk); + + if (pud_huge(pud) && pud_devmap(pud)) { + unsigned long i, npages, pfn; + uint64_t *pfns, cpu_flags; + bool fault, write_fault; + + if (!pud_present(pud)) + return hmm_vma_walk_hole(start, end, walk); + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = >pfns[i]; + + cpu_flags = pud_to_hmm_pfn_flags(range, pud); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, +
[PATCH 02/10] mm/hmm: do not erase snapshot when a range is invalidated
From: Jérôme Glisse Users of HMM might be using the snapshot information to do preparatory steps, like DMA mapping pages to a device, before checking for invalidation through hmm_vma_range_done(), so do not erase that information and assume users will do the right thing. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- mm/hmm.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index b9f384ea15e9..74d69812d6be 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -170,16 +170,10 @@ static int hmm_invalidate_range(struct hmm *hmm, bool device, spin_lock(>lock); list_for_each_entry(range, >ranges, list) { - unsigned long addr, idx, npages; - if (update->end < range->start || update->start >= range->end) continue; range->valid = false; - addr = max(update->start, range->start); - idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; - memset(>pfns[idx], 0, sizeof(*range->pfns) * npages); } spin_unlock(>lock); -- 2.17.2
[PATCH 07/10] mm/hmm: add an helper function that fault pages and map them to a device
From: Jérôme Glisse This is an all-in-one helper that faults pages in a range and maps them to a device so that every single device driver does not have to re-implement this common pattern. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- include/linux/hmm.h | 9 +++ mm/hmm.c| 152 2 files changed, 161 insertions(+) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4263f8fb32e5..fc3630d0bbfd 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -502,6 +502,15 @@ int hmm_range_register(struct hmm_range *range, void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block); +long hmm_range_dma_unmap(struct hmm_range *range, +struct vm_area_struct *vma, +struct device *device, +dma_addr_t *daddrs, +bool dirty); /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range diff --git a/mm/hmm.c b/mm/hmm.c index 0a4ff31e9d7a..9cd68334a759 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -985,6 +986,157 @@ long hmm_range_fault(struct hmm_range *range, bool block) return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } EXPORT_SYMBOL(hmm_range_fault); + +/* + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. + * @range: range being faulted + * @device: device against to dma map page to + * @daddrs: dma address of mapped pages + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * drop and you need to try again, some other error value otherwise + * + * Note same usage pattern as hmm_range_fault(). 
+ */ +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block) +{ + unsigned long i, npages, mapped; + long ret; + + ret = hmm_range_fault(range, block); + if (ret <= 0) + return ret ? ret : -EBUSY; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0, mapped = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_FROM_DEVICE; + struct page *page; + + /* +* FIXME need to update DMA API to provide invalid DMA address +* value instead of a function to test dma address value. This +* would remove lot of dumb code duplicated accross many arch. +* +* For now setting it to 0 here is good enough as the pfns[] +* value is what is use to check what is valid and what isn't. +*/ + daddrs[i] = 0; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* Check if range is being invalidated */ + if (!range->valid) { + ret = -EBUSY; + goto unmap; + } + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->values[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); + if (dma_mapping_error(device, daddrs[i])) { + ret = -EFAULT; + goto unmap; + } + + mapped++; + } + + return mapped; + +unmap: + for (npages = i, i = 0; (i < npages) && mapped; ++i) { + enum dma_data_direction dir = DMA_FROM_DEVICE; + struct page *page; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + if (dma_mapping_error(device, daddrs[i])) + continue; + + /* If it is read and write than map bi-directional. 
*/ + if (range->pfns[i] & range->values[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + mapped--; + } + + return ret; +} +EXPORT_SYMBOL(hmm_range_dma_map); + +/* + * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() + * @range: range being unmapped + * @vma: the vma against which the range (optional) + * @device: device against which dma map was done + * @daddrs: dma
[PATCH 10/10] mm/hmm: add helpers for driver to safely take the mmap_sem
From: Jérôme Glisse The device driver context which holds a reference to the mirror, and thus to the core hmm struct, might outlive the mm against which it was created. To avoid every driver having to check for that case, provide a helper that checks if the mm is still alive and takes the mmap_sem in read mode if so. If the mm has been destroyed (the mmu_notifier release callback did happen) then we return -EINVAL so that calling code knows that it is trying to do something against an mm that is no longer valid. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- include/linux/hmm.h | 50 ++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index b3850297352f..4a1454e3efba 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -438,6 +438,50 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); +/* + * hmm_mirror_mm_down_read() - lock the mmap_sem in read mode + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem + * Returns: -EINVAL if the mm is dead, 0 otherwise (lock taken). + * + * The device driver context which holds reference to mirror and thus to core + * hmm struct might outlive the mm against which it was created. To avoid every + * driver to check for that case provide an helper that check if mm is still + * alive and take the mmap_sem in read mode if so. If the mm have been destroy + * (mmu_notifier release call back did happen) then we return -EINVAL so that + * calling code knows that it is trying to do something against a mm that is + * no longer valid. + */ +static inline int hmm_mirror_mm_down_read(struct hmm_mirror *mirror) +{ + struct mm_struct *mm; + + /* Sanity check ... */ + if (!mirror || !mirror->hmm) + return -EINVAL; + /* +* Before trying to take the mmap_sem make sure the mm is still +* alive as device driver context might outlive the mm lifetime. 
+* +* FIXME: should we also check for mm that outlive its owning +* task ? +*/ + mm = READ_ONCE(mirror->hmm->mm); + if (mirror->hmm->dead || !mm) + return -EINVAL; + + down_read(>mmap_sem); + return 0; +} + +/* + * hmm_mirror_mm_up_read() - unlock the mmap_sem from read mode + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem + */ +static inline void hmm_mirror_mm_up_read(struct hmm_mirror *mirror) +{ + up_read(>hmm->mm->mmap_sem); +} + /* * To snapshot the CPU page table you first have to call hmm_range_register() @@ -463,7 +507,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * if (ret) * return ret; * - * down_read(mm->mmap_sem); + * hmm_mirror_mm_down_read(mirror); * again: * * if (!hmm_range_wait_until_valid(, TIMEOUT)) { @@ -476,13 +520,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * * ret = hmm_range_snapshot(); or hmm_range_fault(); * if (ret == -EAGAIN) { - * down_read(mm->mmap_sem); + * hmm_mirror_mm_down_read(mirror); * goto again; * } else if (ret == -EBUSY) { * goto again; * } * - * up_read(>mmap_sem); + * hmm_mirror_mm_up_read(mirror); * if (ret) { * hmm_range_unregister(range); * return ret; -- 2.17.2
[PATCH 05/10] mm/hmm: improve driver API to work and wait over a range
From: Jérôme Glisse A common use case for HMM mirror is a user trying to mirror a range and, before they can program the hardware, it gets invalidated by some core mm event. Instead of having the user retry right away to mirror the range, provide a completion mechanism for them to wait for any active invalidation affecting the range. This also changes how hmm_range_snapshot() and hmm_range_fault() work by not relying on the vma, so that we can drop the mmap_sem when waiting and look up the vma again on retry. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- include/linux/hmm.h | 208 +++--- mm/hmm.c| 526 +--- 2 files changed, 430 insertions(+), 304 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ccf2b630447e..93dc88edc293 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -77,8 +77,34 @@ #include #include #include +#include -struct hmm; + +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list + * @ranges: list of range being snapshotted + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list + * @wq: wait queue for user waiting on a range invalidation + * @notifiers: count of active mmu notifiers + * @dead: is the mm dead ? 
+ */ +struct hmm { + struct mm_struct*mm; + struct kref kref; + struct mutexlock; + struct list_headranges; + struct list_headmirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; + wait_queue_head_t wq; + longnotifiers; + booldead; +}; /* * hmm_pfn_flag_e - HMM flag enums @@ -155,6 +181,38 @@ struct hmm_range { boolvalid; }; +/* + * hmm_range_wait_until_valid() - wait for range to be valid + * @range: range affected by invalidation to wait on + * @timeout: time out for wait in ms (ie abort wait after that period of time) + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_wait_until_valid(struct hmm_range *range, + unsigned long timeout) +{ + /* Check if mm is dead ? */ + if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { + range->valid = false; + return false; + } + if (range->valid) + return true; + wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, + msecs_to_jiffies(timeout)); + /* Return current valid status just in case we get lucky */ + return range->valid; +} + +/* + * hmm_range_valid() - test if a range is valid or not + * @range: range + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_valid(struct hmm_range *range) +{ + return range->valid; +} + /* * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn * @range: range use to decode HMM pfn value @@ -357,51 +415,133 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device - * driver lock that serializes device page table updates, then call - * hmm_vma_range_done(), to check if the snapshot is still valid. The same - * device driver page table update lock must also be used in the - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page - * table invalidation serializes on it. 
+ * To snapshot the CPU page table you first have to call hmm_range_register() + * to register the range. If hmm_range_register() return an error then some- + * thing is horribly wrong and you should fail loudly. If it returned true then + * you can wait for the range to be stable with hmm_range_wait_until_valid() + * function, a range is valid when there are no concurrent changes to the CPU + * page table for the range. + * + * Once the range is valid you can call hmm_range_snapshot() if that returns + * without error then you can take your device page table lock (the same lock + * you use in the HMM mirror sync_cpu_device_pagetables() callback). After + * taking that lock you have to check the range validity, if it is still valid + * (ie hmm_range_valid() returns true) then you can program the device page + * table, otherwise you have to start again. Pseudo code: + * + * mydevice_prefault(mydevice, mm, start, end) + * { + * struct hmm_range range; + * ... * - * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_range_snapshot() WITHOUT ERROR ! + * ret = hmm_range_register(, mm, start, end); + * if (ret) + *
[PATCH 08/10] mm/hmm: support hugetlbfs (snap shoting, faulting and DMA mapping)
From: Jérôme Glisse This adds support for hugetlbfs so that HMM users can mirror ranges of virtual addresses backed by hugetlbfs. Note that the range now allows users to optimize DMA mapping of such pages so that we can map a huge page as one chunk. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Ralph Campbell Cc: John Hubbard --- include/linux/hmm.h | 29 - mm/hmm.c| 141 +--- 2 files changed, 147 insertions(+), 23 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index fc3630d0bbfd..b3850297352f 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -181,10 +181,31 @@ struct hmm_range { const uint64_t *values; uint64_tdefault_flags; uint64_tpfn_flags_mask; + uint8_t page_shift; uint8_t pfn_shift; boolvalid; }; +/* + * hmm_range_page_shift() - return the page shift for the range + * @range: range being queried + * Returns: page shift (page size = 1 << page shift) for the range + */ +static inline unsigned hmm_range_page_shift(const struct hmm_range *range) +{ + return range->page_shift; +} + +/* + * hmm_range_page_size() - return the page size for the range + * @range: range being queried + * Returns: page size for the range in bytes + */ +static inline unsigned long hmm_range_page_size(const struct hmm_range *range) +{ + return 1UL << hmm_range_page_shift(range); +} + /* * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on @@ -438,7 +459,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * struct hmm_range range; * ... 
* - * ret = hmm_range_register(, mm, start, end); + * ret = hmm_range_register(, mm, start, end, page_shift); * if (ret) * return ret; * @@ -498,7 +519,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); int hmm_range_register(struct hmm_range *range, struct mm_struct *mm, unsigned long start, - unsigned long end); + unsigned long end, + unsigned page_shift); void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); @@ -538,7 +560,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->pfn_flags_mask = -1UL; ret = hmm_range_register(range, range->vma->vm_mm, -range->start, range->end); +range->start, range->end, +PAGE_SHIFT); if (ret) return (int)ret; diff --git a/mm/hmm.c b/mm/hmm.c index 9cd68334a759..8b87e1813313 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -396,11 +396,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i, page_size; hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { + page_size = 1UL << range->page_shift; + i = (addr - range->start) >> range->page_shift; + + for (; addr < end; addr += page_size, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; @@ -712,6 +714,69 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, return 0; } +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned long addr = start, i, pfn, mask, size, pfn_inc; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + uint64_t 
orig_pfn, cpu_flags; + bool fault, write_fault; + spinlock_t *ptl; + pte_t entry; + int ret = 0; + + size = 1UL << huge_page_shift(h); + mask = size - 1; + if (range->page_shift != PAGE_SHIFT) { + /* Make sure we are looking at full page. */ + if (start & mask) + return -EINVAL; + if (end < (start + size)) + return -EINVAL; + pfn_inc = size >> PAGE_SHIFT; + } else { + pfn_inc = 1; + size = PAGE_SIZE; + } + + + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
[PATCH 00/10] HMM updates for 5.1
From: Jérôme Glisse This patchset improves the HMM driver API and adds support for hugetlbfs and DAX mirroring. The motivation for the improvement was to make the ODP to HMM conversion easier [1]. Because we have nouveau bits scheduled for 5.1, and to avoid any multi-tree synchronization, this patchset adds a few lines of inline functions that wrap the existing HMM driver API to the improved API. The nouveau driver was tested before and after this patchset and it builds and works in both cases, so there is no merging issue [2]. The nouveau bits are queued up for 5.1, which is why I added those inlines. If this gets merged in 5.1 the plan is to merge the HMM to ODP conversion in 5.2, or 5.3 if testing shows any issues (so far no issues have been found with limited testing, but Mellanox will be running heavier testing for a longer time). To avoid spamming mm I would like to not cc mm on ODP or nouveau patches; however, if people prefer to see those on the mm mailing list then I can keep it cc'ed. This is also what I intend to use as a base for AMD and Intel patches (v2, with more things, of some RFCs which were already posted in the past). [1] https://cgit.freedesktop.org/~glisse/linux/log/?h=odp-hmm [2] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-for-5.1 Cc: Andrew Morton Cc: Felix Kuehling Cc: Christian König Cc: Ralph Campbell Cc: John Hubbard Cc: Jason Gunthorpe Cc: Dan Williams Jérôme Glisse (10): mm/hmm: use reference counting for HMM struct mm/hmm: do not erase snapshot when a range is invalidated mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() mm/hmm: improve driver API to work and wait over a range mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays. 
mm/hmm: add an helper function that fault pages and map them to a device mm/hmm: support hugetlbfs (snap shoting, faulting and DMA mapping) mm/hmm: allow to mirror vma of a file on a DAX backed filesystem mm/hmm: add helpers for driver to safely take the mmap_sem include/linux/hmm.h | 290 ++-- mm/hmm.c| 1060 +-- 2 files changed, 983 insertions(+), 367 deletions(-) -- 2.17.2
[PATCH v4 5/9] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper
From: Jérôme Glisse Helper to test if a range is updated to read only (it is still valid to read from the range). This is useful for device drivers or anyone who wishes to optimize out an update when they know that they already have the range mapped read only. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 4 mm/mmu_notifier.c| 10 ++ 2 files changed, 14 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 7514775817de..be873c431886 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -257,6 +257,8 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -553,6 +555,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9c884abc7850..0b2f77715a08 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +bool +mmu_notifier_range_update_to_read_only(const struct 
mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still has the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); -- 2.17.2
[PATCH v4 9/9] RDMA/umem_odp: optimize out the case when a range is updated to read only
From: Jérôme Glisse When range of virtual address is updated read only and corresponding user ptr object are already read only it is pointless to do anything. Optimize this case out. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/infiniband/core/umem_odp.c | 22 +++--- include/rdma/ib_umem_odp.h | 1 + 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index a4ec43093cb3..fa4e7fdcabfc 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -140,8 +140,15 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { + bool update_to_read_only = *((bool *)cookie); + ib_umem_notifier_start_account(item); - item->umem.context->invalidate_range(item, start, end); + /* +* If it is already read only and we are updating to read only then we +* do not need to change anything. So save time and skip this one. 
+*/ + if (!update_to_read_only || !item->read_only) + item->umem.context->invalidate_range(item, start, end); return 0; } @@ -150,6 +157,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); + bool update_to_read_only; if (range->blockable) down_read(&per_mm->umem_rwsem); @@ -166,10 +174,13 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } + update_to_read_only = mmu_notifier_range_update_to_read_only(range); + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, range->end, invalidate_range_start_trampoline, -range->blockable, NULL); +range->blockable, +&update_to_read_only); } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, @@ -363,6 +374,9 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, goto out_odp_data; } + /* Assume read only at first, each time GUP is called this is updated. */ + odp_data->read_only = true; + odp_data->dma_list = vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); if (!odp_data->dma_list) { @@ -619,8 +633,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, goto out_put_task; } - if (access_mask & ODP_WRITE_ALLOWED_BIT) + if (access_mask & ODP_WRITE_ALLOWED_BIT) { + umem_odp->read_only = false; flags |= FOLL_WRITE; + } start_idx = (user_virt - ib_umem_start(umem)) >> page_shift; k = start_idx; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 0b1446fe2fab..8256668c6170 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -76,6 +76,7 @@ struct ib_umem_odp { struct completion notifier_completion; int dying; struct work_struct work; + bool read_only; }; static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) -- 2.17.2
[PATCH v4 4/9] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening
From: Jérôme Glisse CPU page table updates can happen for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of the mmu notifier API track changes to the CPU page table and take specific actions for them. However, the current API only provides the range of virtual addresses affected by the change, not why the change is happening. This patch just passes down the new information by adding it to the mmu_notifier_range structure. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 4 1 file changed, 4 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index a9808add4070..7514775817de 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -56,9 +56,11 @@ struct mmu_notifier_mm { }; struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; + enum mmu_notifier_event event; bool blockable; }; @@ -354,6 +356,8 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; -- 2.17.2
[PATCH v4 6/9] gpu/drm/radeon: optimize out the case when a range is updated to read only
From: Jérôme Glisse When range of virtual address is updated read only and corresponding user ptr object are already read only it is pointless to do anything. Optimize this case out. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/radeon/radeon_mn.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index b3019505065a..f77294f58e63 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -124,6 +124,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); struct ttm_operation_ctx ctx = { false, false }; struct interval_tree_node *it; + bool update_to_read_only; unsigned long end; int ret = 0; @@ -138,6 +139,8 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, else if (!mutex_trylock(>lock)) return -EAGAIN; + update_to_read_only = mmu_notifier_range_update_to_read_only(range); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct radeon_mn_node *node; @@ -153,10 +156,20 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, it = interval_tree_iter_next(it, range->start, end); list_for_each_entry(bo, >bos, mn_list) { + bool read_only; if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound) continue; + /* +* If it is already read only and we are updating to +* read only then we do not need to change anything. +* So save time and skip this one. 
+*/ + read_only = radeon_ttm_tt_is_readonly(bo->tbo.ttm); + if (update_to_read_only && read_only) + continue; + r = radeon_bo_reserve(bo, true); if (r) { DRM_ERROR("(%ld) failed to reserve user bo\n", r); -- 2.17.2
[PATCH v4 2/9] mm/mmu_notifier: contextual information for event triggering invalidation
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<-- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... 
mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, NULL, E2, E3, E4) ...> } -->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 4 +++- kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 mm/hugetlb.c | 10 ++ mm/khugepaged.c | 3 ++- mm/ksm.c | 6 -- mm/madvise.c | 3 ++- mm/memory.c | 25 - mm/migrate.c | 5 - mm/mprotect.c| 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c| 3 ++- mm/rmap.c| 6 -- 14 files changed, 59 insertions(+), 30 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0ec9edab2f3..57e7f98647d3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1143,7 +1143,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, + NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index abc9dbb7bcb6..a9808add4070 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -348,6 +348,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range 
*range, + enum mmu_notifier_event event, + struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -482,7 +484,7 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
[PATCH v4 7/9] gpu/drm/amdgpu: optimize out the case when a range is updated to read only
From: Jérôme Glisse When range of virtual address is updated read only and corresponding user ptr object are already read only it is pointless to do anything. Optimize this case out. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3e6823fdd939..7880eda064cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -294,6 +294,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + bool update_to_read_only; unsigned long end; /* notification is exclusive, but interval is inclusive */ @@ -302,6 +303,8 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; + update_to_read_only = mmu_notifier_range_update_to_read_only(range); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; @@ -317,6 +320,16 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, list_for_each_entry(bo, >bos, mn_list) { struct kgd_mem *mem = bo->kfd_bo; + bool read_only; + + /* +* If it is already read only and we are updating to +* read only then we do not need to change anything. +* So save time and skip this one. 
+*/ + read_only = amdgpu_ttm_tt_is_readonly(bo->tbo.ttm); + if (update_to_read_only && read_only) + continue; if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, range->start, -- 2.17.2
[PATCH v4 3/9] mm/mmu_notifier: use correct mmu_notifier events for each invalidation
From: Jérôme Glisse This update each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 2 +- kernel/events/uprobes.c | 2 +- mm/huge_memory.c| 14 ++ mm/hugetlb.c| 7 --- mm/khugepaged.c | 2 +- mm/ksm.c| 4 ++-- mm/madvise.c| 2 +- mm/memory.c | 16 mm/migrate.c| 4 ++-- mm/mprotect.c | 5 +++-- mm/rmap.c | 6 +++--- 11 files changed, 32 insertions(+), 32 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 57e7f98647d3..cce226f3305f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1143,7 +1143,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, + mmu_notifier_range_init(, MMU_NOTIFY_SOFT_DIRTY, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b67fe7e59621..87e76a1dc758 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -174,7 +174,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, mm, addr, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b353e8b7876f..957d23754217 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1182,9 +1182,8 
@@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1346,9 +1345,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); spin_lock(vmf->ptl); @@ -2025,7 +2023,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(); @@ -2244,7 +2242,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cbda46ad6a30..f691398ac6b6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,7 +3246,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, src, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, src, vma->vm_start,
[PATCH v4 1/9] mm/mmu_notifier: contextual information for event enums
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier. Latter patches take advantages of those enum values. - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 30 ++ 1 file changed, 30 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..abc9dbb7bcb6 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, 
...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* -- 2.17.2
[PATCH v4 8/9] gpu/drm/i915: optimize out the case when a range is updated to read only
From: Jérôme Glisse When range of virtual address is updated read only and corresponding user ptr object are already read only it is pointless to do anything. Optimize this case out. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/i915/i915_gem_userptr.c | 16 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 9558582c105e..23330ac3d7ea 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -59,6 +59,7 @@ struct i915_mmu_object { struct interval_tree_node it; struct list_head link; struct work_struct work; + bool read_only; bool attached; }; @@ -119,6 +120,7 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, container_of(_mn, struct i915_mmu_notifier, mn); struct i915_mmu_object *mo; struct interval_tree_node *it; + bool update_to_read_only; LIST_HEAD(cancelled); unsigned long end; @@ -128,6 +130,8 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, /* interval ranges are inclusive, but invalidate range is exclusive */ end = range->end - 1; + update_to_read_only = mmu_notifier_range_update_to_read_only(range); + spin_lock(>lock); it = interval_tree_iter_first(>objects, range->start, end); while (it) { @@ -145,6 +149,17 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, * object if it is not in the process of being destroyed. 
*/ mo = container_of(it, struct i915_mmu_object, it); + + /* +* If it is already read only and we are updating to +* read only then we do not need to change anything. +* So save time and skip this one. +*/ + if (update_to_read_only && mo->read_only) { + it = interval_tree_iter_next(it, range->start, end); + continue; + } + if (kref_get_unless_zero(&mo->obj->base.refcount)) queue_work(mn->wq, &mo->work); @@ -270,6 +285,7 @@ i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj, mo->mn = mn; mo->obj = obj; mo->it.start = obj->userptr.ptr; + mo->read_only = i915_gem_object_is_readonly(obj); mo->it.last = obj->userptr.ptr + obj->base.size - 1; INIT_WORK(&mo->work, cancel_userptr); -- 2.17.2
[PATCH v4 0/9] mmu notifier provide context informations
From: Jérôme Glisse Hi Andrew, I see that you still have my event patch in your queue [1]. This patchset replaces that single patch and is broken down into further steps so that it is easier to review and ascertain that no mistakes were made during the mechanical changes. Here are the steps: Patch 1 - add the enum values Patch 2 - coccinelle semantic patch to convert all call sites of mmu_notifier_range_init to the default enum value and also to pass down the vma when it is available Patch 3 - update many call sites to more accurate enum values Patch 4 - add the information to the mmu_notifier_range struct Patch 5 - helper to test if a range is updated to read only All the remaining patches are updates to various drivers to demonstrate how this new information gets used by device drivers. I build tested with make all and make all minus everything that enables mmu notifier, i.e. building with MMU_NOTIFIER=no. Also tested with some radeon, amd gpu and intel gpu. If there are no objections I believe the best plan would be to merge the first 5 patches (all mm changes) through your queue for 5.1 and then to delay the driver updates to each individual driver tree for 5.2. This will allow each individual device driver maintainer time to test this more thoroughly than my own testing. Note that I also intend to use this feature further in nouveau and HMM down the road. I also expect that other users like KVM might be interested in leveraging this new information to optimize some of their secondary page table invalidations. Here is an explanation of the rationale for this patchset: CPU page table updates can happen for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduces a set of enums that can be associated with each of the events triggering a mmu notifier. Later patches take advantage of those enum values.
- UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). [1] https://www.ozlabs.org/~akpm/mmotm/broken-out/mm-mmu_notifier-contextual-information-for-event-triggering-invalidation-v2.patch Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann Jérôme Glisse (9): mm/mmu_notifier: contextual information for event enums mm/mmu_notifier: contextual information for event triggering invalidation mm/mmu_notifier: use correct mmu_notifier events for each invalidation mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper gpu/drm/radeon: optimize out the case when a range is updated to read only gpu/drm/amdgpu: optimize out the case when a range is updated to read only gpu/drm/i915: optimize out the case when a range is updated to read only RDMA/umem_odp: optimize out the case when a range is updated to read only drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 13 drivers/gpu/drm/i915/i915_gem_userptr.c | 16 ++ drivers/gpu/drm/radeon/radeon_mn.c | 13 drivers/infiniband/core/umem_odp.c | 22 +++-- fs/proc/task_mmu.c | 3 +- include/linux/mmu_notifier.h| 42 - include/rdma/ib_umem_odp.h | 1 
+ kernel/events/uprobes.c | 3 +- mm/huge_memory.c| 14 + mm/hugetlb.c| 11 --- mm/khugepaged.c | 3 +- mm/ksm.c| 6 ++-- mm/madvise.c| 3 +- mm/memory.c | 25 +-- mm/migrate.c| 5 ++- mm/mmu_notifier.c | 10 ++ mm/mprotect.c | 4 ++- mm/mremap.c | 3 +- mm/oom_kill.c | 3 +- mm/rmap.c | 6 ++-- 20 files changed, 171 insertions(+), 35 deletions(-) -- 2.17.2
[PATCH v3 3/3] mm/mmu_notifier: contextual information for event triggering invalidation v2
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset adds event information so that users of mmu notifier can differentiate among broad category: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). 
Changes since v1: - use mmu_notifier_range_init() helper to to optimize out the case when mmu notifier is not enabled - use kernel doc format for describing the enum values Signed-off-by: Jérôme Glisse Acked-by: Christian König Acked-by: Jan Kara Acked-by: Felix Kuehling Acked-by: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- fs/dax.c | 7 +++ fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 35 +-- kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 mm/hugetlb.c | 10 ++ mm/khugepaged.c | 3 ++- mm/ksm.c | 6 -- mm/madvise.c | 3 ++- mm/memory.c | 18 -- mm/migrate.c | 5 +++-- mm/mprotect.c| 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c| 2 +- mm/rmap.c| 6 -- 15 files changed, 90 insertions(+), 29 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 874085bacaf5..6056b03a1626 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -768,6 +768,13 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); + /* +* All the field are populated by follow_pte_pmd() except +* the event field. 
+*/ + mmu_notifier_range_init(, NULL, 0, -1UL, + MMU_NOTIFY_PROTECTION_PAGE); + /* * Note because we provide start/end to follow_pte_pmd it will * call mmu_notifier_invalidate_range_start() on our behalf diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b3ddceb003bc..f68a9ebb0218 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1141,7 +1141,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_range_init(, mm, 0, -1UL, + MMU_NOTIFY_SOFT_DIRTY); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 39b06772427f..d249e24acea5 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,10 +25,39 @@ struct mmu_notifier_mm { spinlock_t lock; }; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + *
[PATCH v3 1/3] mm/mmu_notifier: use structure for invalidate_range_start/end callback v2
From: Jérôme Glisse To avoid having to change many callback definitions every time we want to add a parameter, use a structure to group all parameters for the mmu_notifier invalidate_range_start/end callback. No functional changes with this patch. Changed since v1: - fix make htmldocs warning in amdgpu_mn.c Signed-off-by: Jérôme Glisse Acked-by: Jan Kara Acked-by: Felix Kuehling Acked-by: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 47 +++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 14 drivers/gpu/drm/radeon/radeon_mn.c | 16 - drivers/infiniband/core/umem_odp.c | 20 +-- drivers/infiniband/hw/hfi1/mmu_rb.c | 13 +++ drivers/misc/mic/scif/scif_dma.c| 11 ++ drivers/misc/sgi-gru/grutlbpurge.c | 14 drivers/xen/gntdev.c| 12 +++ include/linux/mmu_notifier.h| 14 +--- mm/hmm.c| 23 +--- mm/mmu_notifier.c | 21 +-- virt/kvm/kvm_main.c | 14 +++- 12 files changed, 103 insertions(+), 116 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index e55508b39496..3e6823fdd939 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -238,44 +238,40 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * amdgpu_mn_invalidate_range_start_gfx - callback to notify about mm change * * @mn: our notifier - * @mm: the mm this callback is about - * @start: start of updated range - * @end: end of updated range + * @range: mmu notifier context * * Block for operations on BOs to finish and mark pages as accessed and * potentially dirty.
*/ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) + const struct mmu_notifier_range *range) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + unsigned long end; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = range->end - 1; /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, blockable)) + if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; - it = interval_tree_iter_first(&amn->objects, start, end); + it = interval_tree_iter_first(&amn->objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!blockable) { + if (!range->blockable) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } node = container_of(it, struct amdgpu_mn_node, it); - it = interval_tree_iter_next(it, start, end); + it = interval_tree_iter_next(it, range->start, end); - amdgpu_mn_invalidate_node(node, start, end); + amdgpu_mn_invalidate_node(node, range->start, end); } return 0; @@ -294,39 +290,38 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, * are restorted in amdgpu_mn_invalidate_range_end_hsa. */ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) + const struct mmu_notifier_range *range) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + unsigned long end; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = range->end - 1; - if (amdgpu_mn_read_lock(amn, blockable)) + if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; - it = interval_tree_iter_first(&amn->objects, start, end); + it = interval_tree_iter_first(&amn->objects, range->start, end); while (it) { struct amdgpu_mn_node *node;