Re: [PATCH v5 06/11] mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks
On Fri, Mar 09, 2018 at 10:55:21PM -0800, Dan Williams wrote: > In order to resolve collisions between filesystem operations and DMA to > DAX mapped pages we need a callback when DMA completes. With a callback > we can hold off filesystem operations while DMA is in-flight and then > resume those operations when the last put_page() occurs on a DMA page. > > Recall that the 'struct page' entries for DAX memory are created with > devm_memremap_pages(). That routine arranges for the pages to be > allocated, but never onlined, so a DAX page is DMA-idle when its > reference count reaches one. > > Also recall that the HMM sub-system added infrastructure to trap the > page-idle (2-to-1 reference count) transition of the pages allocated by > devm_memremap_pages() and trigger a callback via the 'struct > dev_pagemap' associated with the page range. Whereas the HMM callbacks > are going to a device driver to manage bounce pages in device-memory in > the filesystem-dax case we will call back to filesystem specified > callback. > > Since the callback is not known at devm_memremap_pages() time we arrange > for the filesystem to install it at mount time. No functional changes > are expected as this only registers a nop handler for the ->page_free() > event for device-mapped pages. 
> > Cc: Michal Hocko Looks good to me Reviewed-by: "Jérôme Glisse" > Reviewed-by: Christoph Hellwig > Signed-off-by: Dan Williams > --- > drivers/dax/super.c | 79 > -- > drivers/nvdimm/pmem.c|3 +- > fs/ext2/super.c |6 ++- > fs/ext4/super.c |6 ++- > fs/xfs/xfs_super.c | 20 ++-- > include/linux/dax.h | 17 +- > include/linux/memremap.h |8 + > 7 files changed, 103 insertions(+), 36 deletions(-) > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > index 2b2332b605e4..ecefe9f7eb60 100644 > --- a/drivers/dax/super.c > +++ b/drivers/dax/super.c > @@ -29,6 +29,7 @@ static struct vfsmount *dax_mnt; > static DEFINE_IDA(dax_minor_ida); > static struct kmem_cache *dax_cache __read_mostly; > static struct super_block *dax_superblock __read_mostly; > +static DEFINE_MUTEX(devmap_lock); > > #define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) > static struct hlist_head dax_host_list[DAX_HASH_SIZE]; > @@ -62,16 +63,6 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t > sector, size_t size, > } > EXPORT_SYMBOL(bdev_dax_pgoff); > > -#if IS_ENABLED(CONFIG_FS_DAX) > -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) > -{ > - if (!blk_queue_dax(bdev->bd_queue)) > - return NULL; > - return fs_dax_get_by_host(bdev->bd_disk->disk_name); > -} > -EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); > -#endif > - > /** > * __bdev_dax_supported() - Check if the device supports dax for filesystem > * @sb: The superblock of the device > @@ -169,9 +160,66 @@ struct dax_device { > const char *host; > void *private; > unsigned long flags; > + struct dev_pagemap *pgmap; > const struct dax_operations *ops; > }; > > +#if IS_ENABLED(CONFIG_FS_DAX) > +static void generic_dax_pagefree(struct page *page, void *data) > +{ > + /* TODO: wakeup page-idle waiters */ > +} > + > +struct dax_device *fs_dax_claim_bdev(struct block_device *bdev, void *owner) > +{ > + struct dax_device *dax_dev; > + struct dev_pagemap *pgmap; > + > + if (!blk_queue_dax(bdev->bd_queue)) > + return NULL; > + 
dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); > + if (!dax_dev->pgmap) > + return dax_dev; > + pgmap = dax_dev->pgmap; > + > + mutex_lock(&devmap_lock); > + if ((pgmap->data && pgmap->data != owner) || pgmap->page_free > + || pgmap->page_fault > + || pgmap->type != MEMORY_DEVICE_HOST) { > + put_dax(dax_dev); > + mutex_unlock(&devmap_lock); > + return NULL; > + } > + > + pgmap->type = MEMORY_DEVICE_FS_DAX; > + pgmap->page_free = generic_dax_pagefree; > + pgmap->data = owner; > + mutex_unlock(&devmap_lock); > + > + return dax_dev; > +} > +EXPORT_SYMBOL_GPL(fs_dax_claim_bdev); > + > +void fs_dax_release(struct dax_device *dax_dev, void *owner) > +{ > + struct dev_pagemap *pgmap = dax_dev ? dax_dev->pgmap : NULL; > + > + put_dax(dax_dev); > + if (!pgmap) > + return; > + if (!pgmap->data) > + return; > + > + mutex_lock(&devmap_lock); > + WARN_ON(pgmap->data != owner); > + pgmap->type = MEMORY_DEVICE_HOST; > + pgmap->page_free = NULL; > + pgmap->data = NULL; > + mutex_unlock(&devmap_lock); > +} > +EXPORT_SYMBOL_GPL(fs_dax_release); > +#endif > + > static ssize_t write_cache_show(struct device *dev, > struct device_attribute *attr, char *buf) > { > @@ -499,6 +547,17 @@ struct dax_device
Re: [PATCH v5 06/11] mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks
On Fri, Mar 09, 2018 at 10:55:21PM -0800, Dan Williams wrote: > In order to resolve collisions between filesystem operations and DMA to > DAX mapped pages we need a callback when DMA completes. With a callback > we can hold off filesystem operations while DMA is in-flight and then > resume those operations when the last put_page() occurs on a DMA page. > > Recall that the 'struct page' entries for DAX memory are created with > devm_memremap_pages(). That routine arranges for the pages to be > allocated, but never onlined, so a DAX page is DMA-idle when its > reference count reaches one. > > Also recall that the HMM sub-system added infrastructure to trap the > page-idle (2-to-1 reference count) transition of the pages allocated by > devm_memremap_pages() and trigger a callback via the 'struct > dev_pagemap' associated with the page range. Whereas the HMM callbacks > are going to a device driver to manage bounce pages in device-memory in > the filesystem-dax case we will call back to filesystem specified > callback. > > Since the callback is not known at devm_memremap_pages() time we arrange > for the filesystem to install it at mount time. No functional changes > are expected as this only registers a nop handler for the ->page_free() > event for device-mapped pages. 
> > Cc: Michal Hocko Looks good to me Reviewed-by: "Jérôme Glisse" > Reviewed-by: Christoph Hellwig > Signed-off-by: Dan Williams > --- > drivers/dax/super.c | 79 > -- > drivers/nvdimm/pmem.c|3 +- > fs/ext2/super.c |6 ++- > fs/ext4/super.c |6 ++- > fs/xfs/xfs_super.c | 20 ++-- > include/linux/dax.h | 17 +- > include/linux/memremap.h |8 + > 7 files changed, 103 insertions(+), 36 deletions(-) > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > index 2b2332b605e4..ecefe9f7eb60 100644 > --- a/drivers/dax/super.c > +++ b/drivers/dax/super.c > @@ -29,6 +29,7 @@ static struct vfsmount *dax_mnt; > static DEFINE_IDA(dax_minor_ida); > static struct kmem_cache *dax_cache __read_mostly; > static struct super_block *dax_superblock __read_mostly; > +static DEFINE_MUTEX(devmap_lock); > > #define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) > static struct hlist_head dax_host_list[DAX_HASH_SIZE]; > @@ -62,16 +63,6 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t > sector, size_t size, > } > EXPORT_SYMBOL(bdev_dax_pgoff); > > -#if IS_ENABLED(CONFIG_FS_DAX) > -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) > -{ > - if (!blk_queue_dax(bdev->bd_queue)) > - return NULL; > - return fs_dax_get_by_host(bdev->bd_disk->disk_name); > -} > -EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); > -#endif > - > /** > * __bdev_dax_supported() - Check if the device supports dax for filesystem > * @sb: The superblock of the device > @@ -169,9 +160,66 @@ struct dax_device { > const char *host; > void *private; > unsigned long flags; > + struct dev_pagemap *pgmap; > const struct dax_operations *ops; > }; > > +#if IS_ENABLED(CONFIG_FS_DAX) > +static void generic_dax_pagefree(struct page *page, void *data) > +{ > + /* TODO: wakeup page-idle waiters */ > +} > + > +struct dax_device *fs_dax_claim_bdev(struct block_device *bdev, void *owner) > +{ > + struct dax_device *dax_dev; > + struct dev_pagemap *pgmap; > + > + if (!blk_queue_dax(bdev->bd_queue)) > + return NULL; > + 
dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); > + if (!dax_dev->pgmap) > + return dax_dev; > + pgmap = dax_dev->pgmap; > + > + mutex_lock(&devmap_lock); > + if ((pgmap->data && pgmap->data != owner) || pgmap->page_free > + || pgmap->page_fault > + || pgmap->type != MEMORY_DEVICE_HOST) { > + put_dax(dax_dev); > + mutex_unlock(&devmap_lock); > + return NULL; > + } > + > + pgmap->type = MEMORY_DEVICE_FS_DAX; > + pgmap->page_free = generic_dax_pagefree; > + pgmap->data = owner; > + mutex_unlock(&devmap_lock); > + > + return dax_dev; > +} > +EXPORT_SYMBOL_GPL(fs_dax_claim_bdev); > + > +void fs_dax_release(struct dax_device *dax_dev, void *owner) > +{ > + struct dev_pagemap *pgmap = dax_dev ? dax_dev->pgmap : NULL; > + > + put_dax(dax_dev); > + if (!pgmap) > + return; > + if (!pgmap->data) > + return; > + > + mutex_lock(&devmap_lock); > + WARN_ON(pgmap->data != owner); > + pgmap->type = MEMORY_DEVICE_HOST; > + pgmap->page_free = NULL; > + pgmap->data = NULL; > + mutex_unlock(&devmap_lock); > +} > +EXPORT_SYMBOL_GPL(fs_dax_release); > +#endif > + > static ssize_t write_cache_show(struct device *dev, > struct device_attribute *attr, char *buf) > { > @@ -499,6 +547,17 @@ struct dax_device *alloc_dax(void *private, const char > *__host, > } > EXPORT_SYMBOL_GPL(alloc_dax);
[PATCH v5 06/11] mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks
In order to resolve collisions between filesystem operations and DMA to DAX mapped pages we need a callback when DMA completes. With a callback we can hold off filesystem operations while DMA is in-flight and then resume those operations when the last put_page() occurs on a DMA page. Recall that the 'struct page' entries for DAX memory are created with devm_memremap_pages(). That routine arranges for the pages to be allocated, but never onlined, so a DAX page is DMA-idle when its reference count reaches one. Also recall that the HMM sub-system added infrastructure to trap the page-idle (2-to-1 reference count) transition of the pages allocated by devm_memremap_pages() and trigger a callback via the 'struct dev_pagemap' associated with the page range. Whereas the HMM callbacks are going to a device driver to manage bounce pages in device-memory in the filesystem-dax case we will call back to filesystem specified callback. Since the callback is not known at devm_memremap_pages() time we arrange for the filesystem to install it at mount time. No functional changes are expected as this only registers a nop handler for the ->page_free() event for device-mapped pages. 
Cc: Michal Hocko Cc: "Jérôme Glisse" Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/super.c | 79 -- drivers/nvdimm/pmem.c|3 +- fs/ext2/super.c |6 ++- fs/ext4/super.c |6 ++- fs/xfs/xfs_super.c | 20 ++-- include/linux/dax.h | 17 +- include/linux/memremap.h |8 + 7 files changed, 103 insertions(+), 36 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 2b2332b605e4..ecefe9f7eb60 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -29,6 +29,7 @@ static struct vfsmount *dax_mnt; static DEFINE_IDA(dax_minor_ida); static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; +static DEFINE_MUTEX(devmap_lock); #define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) static struct hlist_head dax_host_list[DAX_HASH_SIZE]; @@ -62,16 +63,6 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, } EXPORT_SYMBOL(bdev_dax_pgoff); -#if IS_ENABLED(CONFIG_FS_DAX) -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) -{ - if (!blk_queue_dax(bdev->bd_queue)) - return NULL; - return fs_dax_get_by_host(bdev->bd_disk->disk_name); -} -EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); -#endif - /** * __bdev_dax_supported() - Check if the device supports dax for filesystem * @sb: The superblock of the device @@ -169,9 +160,66 @@ struct dax_device { const char *host; void *private; unsigned long flags; + struct dev_pagemap *pgmap; const struct dax_operations *ops; }; +#if IS_ENABLED(CONFIG_FS_DAX) +static void generic_dax_pagefree(struct page *page, void *data) +{ + /* TODO: wakeup page-idle waiters */ +} + +struct dax_device *fs_dax_claim_bdev(struct block_device *bdev, void *owner) +{ + struct dax_device *dax_dev; + struct dev_pagemap *pgmap; + + if (!blk_queue_dax(bdev->bd_queue)) + return NULL; + dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev->pgmap) + return dax_dev; + pgmap = dax_dev->pgmap; + + mutex_lock(&devmap_lock); + if 
((pgmap->data && pgmap->data != owner) || pgmap->page_free + || pgmap->page_fault + || pgmap->type != MEMORY_DEVICE_HOST) { + put_dax(dax_dev); + mutex_unlock(&devmap_lock); + return NULL; + } + + pgmap->type = MEMORY_DEVICE_FS_DAX; + pgmap->page_free = generic_dax_pagefree; + pgmap->data = owner; + mutex_unlock(&devmap_lock); + + return dax_dev; +} +EXPORT_SYMBOL_GPL(fs_dax_claim_bdev); + +void fs_dax_release(struct dax_device *dax_dev, void *owner) +{ + struct dev_pagemap *pgmap = dax_dev ? dax_dev->pgmap : NULL; + + put_dax(dax_dev); + if (!pgmap) + return; + if (!pgmap->data) + return; + + mutex_lock(&devmap_lock); + WARN_ON(pgmap->data != owner); + pgmap->type = MEMORY_DEVICE_HOST; + pgmap->page_free = NULL; + pgmap->data = NULL; + mutex_unlock(&devmap_lock); +} +EXPORT_SYMBOL_GPL(fs_dax_release); +#endif + static ssize_t write_cache_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -499,6 +547,17 @@ struct dax_device *alloc_dax(void *private, const char *__host, } EXPORT_SYMBOL_GPL(alloc_dax); +struct dax_device *alloc_dax_devmap(void *private, const char *host, + const struct dax_operations *ops, struct dev_pagemap *pgmap) +{ + struct dax_device *dax_dev =
[PATCH v5 06/11] mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks
In order to resolve collisions between filesystem operations and DMA to DAX mapped pages we need a callback when DMA completes. With a callback we can hold off filesystem operations while DMA is in-flight and then resume those operations when the last put_page() occurs on a DMA page. Recall that the 'struct page' entries for DAX memory are created with devm_memremap_pages(). That routine arranges for the pages to be allocated, but never onlined, so a DAX page is DMA-idle when its reference count reaches one. Also recall that the HMM sub-system added infrastructure to trap the page-idle (2-to-1 reference count) transition of the pages allocated by devm_memremap_pages() and trigger a callback via the 'struct dev_pagemap' associated with the page range. Whereas the HMM callbacks are going to a device driver to manage bounce pages in device-memory in the filesystem-dax case we will call back to filesystem specified callback. Since the callback is not known at devm_memremap_pages() time we arrange for the filesystem to install it at mount time. No functional changes are expected as this only registers a nop handler for the ->page_free() event for device-mapped pages. 
Cc: Michal Hocko Cc: "Jérôme Glisse" Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/super.c | 79 -- drivers/nvdimm/pmem.c|3 +- fs/ext2/super.c |6 ++- fs/ext4/super.c |6 ++- fs/xfs/xfs_super.c | 20 ++-- include/linux/dax.h | 17 +- include/linux/memremap.h |8 + 7 files changed, 103 insertions(+), 36 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 2b2332b605e4..ecefe9f7eb60 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -29,6 +29,7 @@ static struct vfsmount *dax_mnt; static DEFINE_IDA(dax_minor_ida); static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; +static DEFINE_MUTEX(devmap_lock); #define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) static struct hlist_head dax_host_list[DAX_HASH_SIZE]; @@ -62,16 +63,6 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, } EXPORT_SYMBOL(bdev_dax_pgoff); -#if IS_ENABLED(CONFIG_FS_DAX) -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) -{ - if (!blk_queue_dax(bdev->bd_queue)) - return NULL; - return fs_dax_get_by_host(bdev->bd_disk->disk_name); -} -EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); -#endif - /** * __bdev_dax_supported() - Check if the device supports dax for filesystem * @sb: The superblock of the device @@ -169,9 +160,66 @@ struct dax_device { const char *host; void *private; unsigned long flags; + struct dev_pagemap *pgmap; const struct dax_operations *ops; }; +#if IS_ENABLED(CONFIG_FS_DAX) +static void generic_dax_pagefree(struct page *page, void *data) +{ + /* TODO: wakeup page-idle waiters */ +} + +struct dax_device *fs_dax_claim_bdev(struct block_device *bdev, void *owner) +{ + struct dax_device *dax_dev; + struct dev_pagemap *pgmap; + + if (!blk_queue_dax(bdev->bd_queue)) + return NULL; + dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev->pgmap) + return dax_dev; + pgmap = dax_dev->pgmap; + + mutex_lock(&devmap_lock); + if 
((pgmap->data && pgmap->data != owner) || pgmap->page_free + || pgmap->page_fault + || pgmap->type != MEMORY_DEVICE_HOST) { + put_dax(dax_dev); + mutex_unlock(&devmap_lock); + return NULL; + } + + pgmap->type = MEMORY_DEVICE_FS_DAX; + pgmap->page_free = generic_dax_pagefree; + pgmap->data = owner; + mutex_unlock(&devmap_lock); + + return dax_dev; +} +EXPORT_SYMBOL_GPL(fs_dax_claim_bdev); + +void fs_dax_release(struct dax_device *dax_dev, void *owner) +{ + struct dev_pagemap *pgmap = dax_dev ? dax_dev->pgmap : NULL; + + put_dax(dax_dev); + if (!pgmap) + return; + if (!pgmap->data) + return; + + mutex_lock(&devmap_lock); + WARN_ON(pgmap->data != owner); + pgmap->type = MEMORY_DEVICE_HOST; + pgmap->page_free = NULL; + pgmap->data = NULL; + mutex_unlock(&devmap_lock); +} +EXPORT_SYMBOL_GPL(fs_dax_release); +#endif + static ssize_t write_cache_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -499,6 +547,17 @@ struct dax_device *alloc_dax(void *private, const char *__host, } EXPORT_SYMBOL_GPL(alloc_dax); +struct dax_device *alloc_dax_devmap(void *private, const char *host, + const struct dax_operations *ops, struct dev_pagemap *pgmap) +{ + struct dax_device *dax_dev = alloc_dax(private, host, ops); + + if (dax_dev) + dax_dev->pgmap = pgmap;