Hi,

This is an RFC patch to provide a dax operation to zero a range of memory.
It will also clear poison in the process. This patch is primarily compile
tested; I don't have real hardware to test the poison logic. I am posting
this to figure out whether this is the right direction.

The motivation for this patch comes from Christoph's feedback that he would
rather have a dax way to zero a range instead of relying on a call to
blkdev_issue_zeroout() in __dax_zero_page_range().

https://lkml.org/lkml/2019/8/26/361

My motivation for this change is virtiofs DAX support. There we use DAX
but we don't have a block device, so any dax code which assumes that
there is always an associated block device is a problem. This patch is
therefore a cleanup of one of the places where dax depends on a block
device: if we add a dax operation for zeroing a range, the dax path no
longer has to call blkdev_issue_zeroout().

I have yet to take care of stacked block drivers (dm/md).

The current poison clearing logic is primarily written with the assumption
that I/O is sector aligned. With this new method that assumption no longer
holds, and one can pass any range of memory to zero. I have fixed a few
places in the existing logic to handle an arbitrary start/end. I am not
sure whether there are other dependencies which might need fixing or which
would prohibit us from providing this method.

Any feedback or comment is welcome.

Thanks
Vivek

---
 drivers/dax/super.c   |   13 +++++++++
 drivers/nvdimm/pmem.c |   67 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/dax.c              |   39 ++++++++---------------------
 include/linux/dax.h   |    3 ++
 4 files changed, 85 insertions(+), 37 deletions(-)

Index: rhvgoyal-linux/drivers/nvdimm/pmem.c
===================================================================
--- rhvgoyal-linux.orig/drivers/nvdimm/pmem.c   2020-01-23 11:32:11.075139183 -0500
+++ rhvgoyal-linux/drivers/nvdimm/pmem.c        2020-01-23 11:32:28.660139183 -0500
@@ -52,8 +52,8 @@ static void hwpoison_clear(struct pmem_d
        if (is_vmalloc_addr(pmem->virt_addr))
                return;
 
-       pfn_start = PHYS_PFN(phys);
-       pfn_end = pfn_start + PHYS_PFN(len);
+       pfn_start = PFN_UP(phys);
+       pfn_end = PFN_DOWN(phys + len);
        for (pfn = pfn_start; pfn < pfn_end; pfn++) {
                struct page *page = pfn_to_page(pfn);
 
@@ -71,22 +71,24 @@ static blk_status_t pmem_clear_poison(st
                phys_addr_t offset, unsigned int len)
 {
        struct device *dev = to_dev(pmem);
-       sector_t sector;
+       sector_t sector_start, sector_end;
        long cleared;
        blk_status_t rc = BLK_STS_OK;
+       int nr_sectors;
 
-       sector = (offset - pmem->data_offset) / 512;
+       sector_start = ALIGN((offset - pmem->data_offset), 512) / 512;
+       sector_end = ALIGN_DOWN((offset - pmem->data_offset + len), 512)/512;
+       nr_sectors =  sector_end - sector_start;
 
        cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
        if (cleared < len)
                rc = BLK_STS_IOERR;
-       if (cleared > 0 && cleared / 512) {
+       if (cleared > 0 && nr_sectors > 0) {
                hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
-               cleared /= 512;
-               dev_dbg(dev, "%#llx clear %ld sector%s\n",
-                               (unsigned long long) sector, cleared,
-                               cleared > 1 ? "s" : "");
-               badblocks_clear(&pmem->bb, sector, cleared);
+               dev_dbg(dev, "%#llx clear %d sector%s\n",
+                               (unsigned long long) sector_start, nr_sectors,
+                               nr_sectors > 1 ? "s" : "");
+               badblocks_clear(&pmem->bb, sector_start, nr_sectors);
                if (pmem->bb_state)
                        sysfs_notify_dirent(pmem->bb_state);
        }
@@ -268,6 +270,50 @@ static const struct block_device_operati
        .revalidate_disk =      nvdimm_revalidate_disk,
 };
 
+/* Zero @len bytes at byte @offset of page @pgoff; returns 0 or -EIO. */
+static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+                                   unsigned int offset, loff_t len)
+{
+       int rc = 0;
+       phys_addr_t phys_pos = pgoff * PAGE_SIZE + offset;
+       struct pmem_device *pmem = dax_get_private(dax_dev);
+       struct page *page = ZERO_PAGE(0);
+
+       do {
+               unsigned bytes, nr_sectors = 0;
+               sector_t sector_start, sector_end;
+               bool bad_pmem = false;
+               phys_addr_t pmem_off = phys_pos + pmem->data_offset;
+               void *pmem_addr = pmem->virt_addr + pmem_off;
+
+               bytes = min_t(loff_t, len,
+                             PAGE_SIZE - offset_in_page(phys_pos));
+               /* Only sectors lying fully inside this chunk are checked. */
+               sector_start = ALIGN(phys_pos, 512)/512;
+               sector_end = ALIGN_DOWN(phys_pos + bytes, 512)/512;
+               if (sector_end > sector_start)
+                       nr_sectors = sector_end - sector_start;
+
+               if (nr_sectors &&
+                   unlikely(is_bad_pmem(&pmem->bb, sector_start,
+                                        nr_sectors * 512)))
+                       bad_pmem = true;
+
+               write_pmem(pmem_addr, page, 0, bytes);
+               if (unlikely(bad_pmem)) {
+                       rc = pmem_clear_poison(pmem, pmem_off, bytes);
+                       write_pmem(pmem_addr, page, 0, bytes);
+               }
+               if (rc > 0)
+                       return -EIO;
+
+               phys_pos += bytes;      /* step past the chunk just zeroed */
+               len -= bytes;
+       } while (len > 0);
+
+       return 0;
+}
+
 static long pmem_dax_direct_access(struct dax_device *dax_dev,
                pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -299,6 +345,7 @@ static const struct dax_operations pmem_
        .dax_supported = generic_fsdax_supported,
        .copy_from_iter = pmem_copy_from_iter,
        .copy_to_iter = pmem_copy_to_iter,
+       .zero_page_range = pmem_dax_zero_page_range,
 };
 
 static const struct attribute_group *pmem_attribute_groups[] = {
Index: rhvgoyal-linux/include/linux/dax.h
===================================================================
--- rhvgoyal-linux.orig/include/linux/dax.h     2020-01-23 11:25:23.814139183 -0500
+++ rhvgoyal-linux/include/linux/dax.h  2020-01-23 11:32:17.799139183 -0500
@@ -34,6 +34,8 @@ struct dax_operations {
        /* copy_to_iter: required operation for fs-dax direct-i/o */
        size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
                        struct iov_iter *);
+       /* zero_page_range: optional operation for fs-dax direct-i/o */
+       int (*zero_page_range)(struct dax_device *, pgoff_t, unsigned, loff_t);
 };
 
 extern struct attribute_group dax_attribute_group;
@@ -209,6 +211,7 @@ size_t dax_copy_from_iter(struct dax_dev
                size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, unsigned offset, loff_t len);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
Index: rhvgoyal-linux/fs/dax.c
===================================================================
--- rhvgoyal-linux.orig/fs/dax.c        2020-01-23 11:25:23.814139183 -0500
+++ rhvgoyal-linux/fs/dax.c     2020-01-23 11:32:17.801139183 -0500
@@ -1044,38 +1044,23 @@ static vm_fault_t dax_load_hole(struct x
        return ret;
 }
 
-static bool dax_range_is_aligned(struct block_device *bdev,
-                                unsigned int offset, unsigned int length)
-{
-       unsigned short sector_size = bdev_logical_block_size(bdev);
-
-       if (!IS_ALIGNED(offset, sector_size))
-               return false;
-       if (!IS_ALIGNED(length, sector_size))
-               return false;
-
-       return true;
-}
-
 int __dax_zero_page_range(struct block_device *bdev,
                struct dax_device *dax_dev, sector_t sector,
                unsigned int offset, unsigned int size)
 {
-       if (dax_range_is_aligned(bdev, offset, size)) {
-               sector_t start_sector = sector + (offset >> 9);
+       pgoff_t pgoff;
+       long rc, id;
 
-               return blkdev_issue_zeroout(bdev, start_sector,
-                               size >> 9, GFP_NOFS, 0);
-       } else {
-               pgoff_t pgoff;
-               long rc, id;
+       rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
+       if (rc)
+               return rc;
+
+       id = dax_read_lock();
+       rc = dax_zero_page_range(dax_dev, pgoff, offset, size);
+       if (rc == -EOPNOTSUPP) {
                void *kaddr;
 
-               rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
-               if (rc)
-                       return rc;
-
-               id = dax_read_lock();
+               /* If driver does not implement zero page range, fallback */
                rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
                if (rc < 0) {
                        dax_read_unlock(id);
@@ -1083,9 +1068,9 @@ int __dax_zero_page_range(struct block_d
                }
                memset(kaddr + offset, 0, size);
                dax_flush(dax_dev, kaddr + offset, size);
-               dax_read_unlock(id);
        }
-       return 0;
+       dax_read_unlock(id);
+       return rc;
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
Index: rhvgoyal-linux/drivers/dax/super.c
===================================================================
--- rhvgoyal-linux.orig/drivers/dax/super.c     2020-01-23 11:25:23.814139183 -0500
+++ rhvgoyal-linux/drivers/dax/super.c  2020-01-23 11:32:17.802139183 -0500
@@ -344,6 +344,19 @@ size_t dax_copy_to_iter(struct dax_devic
 }
 EXPORT_SYMBOL_GPL(dax_copy_to_iter);
 
+/* Returns -ENXIO for a dead device, -EOPNOTSUPP without a driver hook. */
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+                       unsigned offset, loff_t len)
+{
+       if (!dax_alive(dax_dev))
+               return -ENXIO;  /* not 0: nothing was zeroed */
+       if (!dax_dev->ops->zero_page_range)
+               return -EOPNOTSUPP;
+
+       return dax_dev->ops->zero_page_range(dax_dev, pgoff, offset, len);
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
_______________________________________________
Linux-nvdimm mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to