[PATCH 19/20] dax: Protect PTE modification on WP fault by radix tree entry lock

2016-11-18 Thread Jan Kara
Currently PTE gets updated in wp_pfn_shared() after dax_pfn_mkwrite()
has released corresponding radix tree entry lock. When we want to
writeprotect PTE on cache flush, we need PTE modification to happen
under radix tree entry lock to ensure consistent updates of PTE and radix
tree (standard faults use page lock to ensure this consistency). So move
update of PTE bit into dax_pfn_mkwrite().

Reviewed-by: Ross Zwisler 
Signed-off-by: Jan Kara 
---
 fs/dax.c    | 22 ++++++++++++++++------
 mm/memory.c |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 2d317328ae90..d64465584f4c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -782,17 +782,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
-   void *entry;
+   void *entry, **slot;
pgoff_t index = vmf->pgoff;
 
 	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || !radix_tree_exceptional_entry(entry))
-		goto out;
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry)) {
+		if (entry)
+			put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		return VM_FAULT_NOPAGE;
+	}
 	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-	put_unlocked_mapping_entry(mapping, index, entry);
-out:
+	entry = lock_slot(mapping, slot);
 	spin_unlock_irq(&mapping->tree_lock);
+   /*
+* If we race with somebody updating the PTE and finish_mkwrite_fault()
+* fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+* the fault in either case.
+*/
+   finish_mkwrite_fault(vmf);
+   put_locked_mapping_entry(mapping, index, entry);
return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/mm/memory.c b/mm/memory.c
index d4874d3733f4..e37250fc54c2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2319,7 +2319,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
-   if (ret & VM_FAULT_ERROR)
+   if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
}
-- 
2.6.6

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH 19/20] dax: Protect PTE modification on WP fault by radix tree entry lock

2016-11-01 Thread Jan Kara
Currently PTE gets updated in wp_pfn_shared() after dax_pfn_mkwrite()
has released corresponding radix tree entry lock. When we want to
writeprotect PTE on cache flush, we need PTE modification to happen
under radix tree entry lock to ensure consistent updates of PTE and radix
tree (standard faults use page lock to ensure this consistency). So move
update of PTE bit into dax_pfn_mkwrite().

Signed-off-by: Jan Kara 
---
 fs/dax.c    | 22 ++++++++++++++++------
 mm/memory.c |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index c6cadf8413a3..a2d3781c9f4e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1163,17 +1163,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
-   void *entry;
+   void *entry, **slot;
pgoff_t index = vmf->pgoff;
 
 	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || !radix_tree_exceptional_entry(entry))
-		goto out;
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry)) {
+		if (entry)
+			put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		return VM_FAULT_NOPAGE;
+	}
 	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-	put_unlocked_mapping_entry(mapping, index, entry);
-out:
+	entry = lock_slot(mapping, slot);
 	spin_unlock_irq(&mapping->tree_lock);
+   /*
+* If we race with somebody updating the PTE and finish_mkwrite_fault()
+* fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+* the fault in either case.
+*/
+   finish_mkwrite_fault(vmf);
+   put_locked_mapping_entry(mapping, index, entry);
return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/mm/memory.c b/mm/memory.c
index e7a4a30a5e88..5fa3d0c5196e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2310,7 +2310,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
-   if (ret & VM_FAULT_ERROR)
+   if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
}
-- 
2.6.6

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 19/20] dax: Protect PTE modification on WP fault by radix tree entry lock

2016-10-19 Thread Ross Zwisler
On Wed, Oct 19, 2016 at 09:25:05AM +0200, Jan Kara wrote:
> On Tue 18-10-16 13:53:32, Ross Zwisler wrote:
> > On Tue, Sep 27, 2016 at 06:08:23PM +0200, Jan Kara wrote:
> > > - void *entry;
> > > + void *entry, **slot;
> > >   pgoff_t index = vmf->pgoff;
> > >  
> > >   spin_lock_irq(&mapping->tree_lock);
> > > - entry = get_unlocked_mapping_entry(mapping, index, NULL);
> > > - if (!entry || !radix_tree_exceptional_entry(entry))
> > > - goto out;
> > > + entry = get_unlocked_mapping_entry(mapping, index, &slot);
> > > + if (!entry || !radix_tree_exceptional_entry(entry)) {
> > > + if (entry)
> > > + put_unlocked_mapping_entry(mapping, index, entry);
> > 
> > I don't think you need this call to put_unlocked_mapping_entry().  If we get
> > in here we know that 'entry' is a page cache page, in which case
> > put_unlocked_mapping_entry() will just return without doing any work.
> 
> Right, but that is just an implementation detail internal to how the
> locking works. The rules are simple to avoid issues and thus the invariant
> is: Once you call get_unlocked_mapping_entry() you either have to lock the
> entry and then call put_locked_mapping_entry() or you have to drop it with
> put_unlocked_mapping_entry(). Once you add arguments about entry types
> etc., errors are much easier to make...

Makes sense.  You can add:

Reviewed-by: Ross Zwisler 
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


Re: [PATCH 19/20] dax: Protect PTE modification on WP fault by radix tree entry lock

2016-10-19 Thread Jan Kara
On Tue 18-10-16 13:53:32, Ross Zwisler wrote:
> On Tue, Sep 27, 2016 at 06:08:23PM +0200, Jan Kara wrote:
> > -   void *entry;
> > +   void *entry, **slot;
> > pgoff_t index = vmf->pgoff;
> >  
> > spin_lock_irq(&mapping->tree_lock);
> > -   entry = get_unlocked_mapping_entry(mapping, index, NULL);
> > -   if (!entry || !radix_tree_exceptional_entry(entry))
> > -   goto out;
> > +   entry = get_unlocked_mapping_entry(mapping, index, &slot);
> > +   if (!entry || !radix_tree_exceptional_entry(entry)) {
> > +   if (entry)
> > +   put_unlocked_mapping_entry(mapping, index, entry);
> 
> I don't think you need this call to put_unlocked_mapping_entry().  If we get
> in here we know that 'entry' is a page cache page, in which case
> put_unlocked_mapping_entry() will just return without doing any work.

Right, but that is just an implementation detail internal to how the
locking works. The rules are simple to avoid issues and thus the invariant
is: Once you call get_unlocked_mapping_entry() you either have to lock the
entry and then call put_locked_mapping_entry() or you have to drop it with
put_unlocked_mapping_entry(). Once you add arguments about entry types
etc., errors are much easier to make...

Honza
-- 
Jan Kara 
SUSE Labs, CR
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH 19/20] dax: Protect PTE modification on WP fault by radix tree entry lock

2016-09-27 Thread Jan Kara
Currently PTE gets updated in wp_pfn_shared() after dax_pfn_mkwrite()
has released corresponding radix tree entry lock. When we want to
writeprotect PTE on cache flush, we need PTE modification to happen
under radix tree entry lock to ensure consistent updates of PTE and radix
tree (standard faults use page lock to ensure this consistency). So move
update of PTE bit into dax_pfn_mkwrite().

Signed-off-by: Jan Kara 
---
 fs/dax.c    | 22 ++++++++++++++++------
 mm/memory.c |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index c6cadf8413a3..a2d3781c9f4e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1163,17 +1163,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
-   void *entry;
+   void *entry, **slot;
pgoff_t index = vmf->pgoff;
 
 	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || !radix_tree_exceptional_entry(entry))
-		goto out;
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry)) {
+		if (entry)
+			put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		return VM_FAULT_NOPAGE;
+	}
 	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-	put_unlocked_mapping_entry(mapping, index, entry);
-out:
+	entry = lock_slot(mapping, slot);
 	spin_unlock_irq(&mapping->tree_lock);
+   /*
+* If we race with somebody updating the PTE and finish_mkwrite_fault()
+* fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+* the fault in either case.
+*/
+   finish_mkwrite_fault(vmf);
+   put_locked_mapping_entry(mapping, index, entry);
return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/mm/memory.c b/mm/memory.c
index e7a4a30a5e88..5fa3d0c5196e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2310,7 +2310,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
-   if (ret & VM_FAULT_ERROR)
+   if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
}
-- 
2.6.6

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm