On Wed 14-08-24 17:25:31, Josef Bacik wrote:
> FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> on the faulting method.
> 
> This pre-content event is meant to be used by hierarchical storage
> managers that want to fill in the file content on first read access.
> 
> Export a simple helper that file systems that have their own ->fault()
> will use, and have a more complicated helper to do fancy things with
> in filemap_fault.
> 
> Signed-off-by: Josef Bacik <[email protected]>

Looks good to me. Let's just CC the MM folks so that they are aware of the
changes to filemap_fault().
                                                                Honza

> ---
>  include/linux/mm.h |   1 +
>  mm/filemap.c       | 116 ++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 110 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ab3d78116043..3e190f0a0997 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3503,6 +3503,7 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
>  extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>               pgoff_t start_pgoff, pgoff_t end_pgoff);
>  extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
> +extern vm_fault_t filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf);
>  
>  extern unsigned long stack_guard_gap;
>  /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 8b1684b62177..50e88e47dff3 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -46,6 +46,7 @@
>  #include <linux/pipe_fs_i.h>
>  #include <linux/splice.h>
>  #include <linux/rcupdate_wait.h>
> +#include <linux/fsnotify.h>
>  #include <asm/pgalloc.h>
>  #include <asm/tlbflush.h>
>  #include "internal.h"
> @@ -3112,13 +3113,13 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
>   * that.  If we didn't pin a file then we return NULL.  The file that is
>   * returned needs to be fput()'ed when we're done with it.
>   */
> -static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
> +                                        struct file *fpin)
>  {
>       struct file *file = vmf->vma->vm_file;
>       struct file_ra_state *ra = &file->f_ra;
>       struct address_space *mapping = file->f_mapping;
>       DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
> -     struct file *fpin = NULL;
>       unsigned long vm_flags = vmf->vma->vm_flags;
>       unsigned int mmap_miss;
>  
> @@ -3190,12 +3191,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
>   * was pinned if we have to drop the mmap_lock in order to do IO.
>   */
>  static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> -                                         struct folio *folio)
> +                                         struct folio *folio,
> +                                         struct file *fpin)
>  {
>       struct file *file = vmf->vma->vm_file;
>       struct file_ra_state *ra = &file->f_ra;
>       DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
> -     struct file *fpin = NULL;
>       unsigned int mmap_miss;
>  
>       /* See comment in do_sync_mmap_readahead. */
> @@ -3260,6 +3261,93 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
>       return ret;
>  }
>  
> +/*
> + * If we have pre-content watches on this file we will need to emit an event
> + * for this range.  We will handle dropping the lock and emitting the event.
> + *
> + * The event mask is MAY_WRITE when FAULT_FLAG_WRITE is set in vmf->flags and
> + * MAY_ACCESS otherwise; the reported range is PAGE_SIZE bytes starting at
> + * vmf->pgoff << PAGE_SHIFT.
> + *
> + * If FAULT_FLAG_TRIED is set we have already been through here once, so no
> + * event is emitted and we return 0.
> + *
> + * If FAULT_FLAG_RETRY_NOWAIT is set then we'll return VM_FAULT_RETRY.
> + *
> + * If no event was emitted then *fpin is not modified and we will return 0.
> + *
> + * If any error occurred we will return VM_FAULT_SIGBUS, *fpin could still be
> + * set and will need to have fput() called on it.
> + *
> + * If we emitted the event then we will return 0 and *fpin will be set, this
> + * must have fput() called on it, and the caller must return VM_FAULT_RETRY
> + * after any other operations it does in order to re-fault the page and make
> + * sure the appropriate locking is maintained.
> + *
> + * NOTE(review): the shift in "vmf->pgoff << PAGE_SHIFT" is evaluated in the
> + * type of vmf->pgoff before the result is stored in a loff_t; if that type
> + * is narrower than loff_t (e.g. on 32-bit) the offset could be truncated.
> + * Confirm whether a (loff_t) cast is needed.
> + *
> + * Return: the appropriate vm_fault_t return code, 0 on success.
> + */
> +static vm_fault_t __filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf,
> +                                                   struct file **fpin)
> +{
> +     struct file *file = vmf->vma->vm_file;
> +     loff_t pos = vmf->pgoff << PAGE_SHIFT;
> +     int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
> +     int ret;
> +
> +     /*
> +      * We already did this and now we're retrying with everything locked,
> +      * don't emit the event and continue.
> +      */
> +     if (vmf->flags & FAULT_FLAG_TRIED)
> +             return 0;
> +
> +     /* No watches, nothing to emit, return 0 and leave *fpin alone. */
> +     if (!fsnotify_file_has_pre_content_watches(file))
> +             return 0;
> +
> +     /* We are NOWAIT, we can't wait, just return VM_FAULT_RETRY. */
> +     if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
> +             return VM_FAULT_RETRY;
> +
> +     /*
> +      * If this fails then we're not allowed to drop the fault lock, return a
> +      * SIGBUS so we don't errantly populate pagecache with bogus data for
> +      * this file.
> +      */
> +     *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
> +     if (*fpin == NULL)
> +             return VM_FAULT_SIGBUS;
> +
> +     /*
> +      * We can't fput(*fpin) at this point because we could have been passed
> +      * in fpin from a previous call.
> +      */
> +     ret = fsnotify_file_area_perm(*fpin, mask, &pos, PAGE_SIZE);
> +     if (ret)
> +             return VM_FAULT_SIGBUS;
> +
> +     return 0;
> +}
> +
> +/**
> + * filemap_maybe_emit_fsnotify_event - maybe emit a pre-content event.
> + * @vmf:     struct vm_fault containing details of the fault.
> + *
> + * If we have a pre-content watch on this file we will emit an event for this
> + * range.  If we return anything non-zero the fault caller should return
> + * immediately; we will return VM_FAULT_RETRY if we had to emit an event,
> + * which will trigger the fault again and then the fault handler will run the
> + * second time through.
> + *
> + * Any file reference pinned while emitting the event is dropped here with
> + * fput().  If a file was pinned and the internal helper returned 0, the
> + * result is converted to VM_FAULT_RETRY; a non-zero result from the helper
> + * is passed through unchanged.
> + *
> + * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
> + */
> +vm_fault_t filemap_maybe_emit_fsnotify_event(struct vm_fault *vmf)
> +{
> +     struct file *fpin = NULL;
> +     vm_fault_t ret;
> +
> +     ret = __filemap_maybe_emit_fsnotify_event(vmf, &fpin);
> +     if (fpin) {
> +             fput(fpin);
> +             if (!ret)
> +                     ret = VM_FAULT_RETRY;
> +     }
> +     return ret;
> +}
> +EXPORT_SYMBOL_GPL(filemap_maybe_emit_fsnotify_event);
> +
>  /**
>   * filemap_fault - read in file data for page fault handling
>   * @vmf:     struct vm_fault containing details of the fault
> @@ -3299,6 +3387,17 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>       if (unlikely(index >= max_idx))
>               return VM_FAULT_SIGBUS;
>  
> +     /*
> +      * If we have pre-content watchers then we need to generate events on
> +      * page fault so that we can populate any data before the fault.
> +      */
> +     ret = __filemap_maybe_emit_fsnotify_event(vmf, &fpin);
> +     if (unlikely(ret)) {
> +             if (fpin)
> +                     fput(fpin);
> +             return ret;
> +     }
> +
>       /*
>        * Do we have something in the page cache already?
>        */
> @@ -3309,21 +3408,24 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>                * the lock.
>                */
>               if (!(vmf->flags & FAULT_FLAG_TRIED))
> -                     fpin = do_async_mmap_readahead(vmf, folio);
> +                     fpin = do_async_mmap_readahead(vmf, folio, fpin);
>               if (unlikely(!folio_test_uptodate(folio))) {
>                       filemap_invalidate_lock_shared(mapping);
>                       mapping_locked = true;
>               }
>       } else {
>               ret = filemap_fault_recheck_pte_none(vmf);
> -             if (unlikely(ret))
> +             if (unlikely(ret)) {
> +                     if (fpin)
> +                             goto out_retry;
>                       return ret;
> +             }
>  
>               /* No page in the page cache at all */
>               count_vm_event(PGMAJFAULT);
>               count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
>               ret = VM_FAULT_MAJOR;
> -             fpin = do_sync_mmap_readahead(vmf);
> +             fpin = do_sync_mmap_readahead(vmf, fpin);
>  retry_find:
>               /*
>                * See comment in filemap_create_folio() why we need
> -- 
> 2.43.0
> 
-- 
Jan Kara <[email protected]>
SUSE Labs, CR

Reply via email to