The 'start_pad' accounting bug resulted from the pmem driver inferring
properties of the established pagemap to calculate pmem->phys_addr and
pmem->size, and that nd_pfn->data_offset was identical to
pmem->data_offset. The assumptions in the current implementation are
only true when 'start_pad' is zero.

The confusion resulted in the wrong offset being applied for
sector-to-physical-page translations. In advance of introducing a
corrected implementation make the mapping parameters and translation
explicit via a new 'struct pfn_map_info' to carry the mapping and
device-offset to physical address translation parameters.

No functional change intended, this cleanup preserves the existing bug.

Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
---
 drivers/dax/pmem.c              |    9 ++-
 drivers/nvdimm/nd.h             |   13 ++++-
 drivers/nvdimm/pfn_devs.c       |   21 ++++++-
 drivers/nvdimm/pmem.c           |  111 ++++++++++++++++++---------------------
 drivers/nvdimm/pmem.h           |   12 +---
 tools/testing/nvdimm/pmem-dax.c |   15 +++--
 6 files changed, 96 insertions(+), 85 deletions(-)

diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 2c1f459c0c63..80359df4f662 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -64,6 +64,7 @@ static int dax_pmem_probe(struct device *dev)
        struct nd_pfn_sb *pfn_sb;
        struct dev_dax *dev_dax;
        struct dax_pmem *dax_pmem;
+       struct pfn_map_info mi;
        struct nd_namespace_io *nsio;
        struct dax_region *dax_region;
        struct nd_namespace_common *ndns;
@@ -83,7 +84,7 @@ static int dax_pmem_probe(struct device *dev)
        rc = devm_nsio_enable(dev, nsio);
        if (rc)
                return rc;
-       rc = nvdimm_setup_pfn(nd_pfn, &dax_pmem->pgmap);
+       rc = nvdimm_setup_pfn(nd_pfn, &dax_pmem->pgmap, &mi);
        if (rc)
                return rc;
        devm_nsio_disable(dev, nsio);
@@ -117,8 +118,10 @@ static int dax_pmem_probe(struct device *dev)
                return PTR_ERR(addr);
 
        /* adjust the dax_region resource to the start of data */
-       memcpy(&res, &dax_pmem->pgmap.res, sizeof(res));
-       res.start += le64_to_cpu(pfn_sb->dataoff);
+       res = (struct resource) {
+               .start = mi.map_base + mi.map_pad + mi.map_offset,
+               .end = mi.map_base + mi.map_pad + mi.map_size - 1,
+       };
 
        rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", &region_id, &id);
        if (rc != 2)
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 379bf4305e61..4339d338928b 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -377,13 +377,22 @@ const char *nvdimm_namespace_disk_name(struct 
nd_namespace_common *ndns,
 unsigned int pmem_sector_size(struct nd_namespace_common *ndns);
 void nvdimm_badblocks_populate(struct nd_region *nd_region,
                struct badblocks *bb, const struct resource *res);
+struct pfn_map_info {
+       resource_size_t map_base;
+       unsigned long map_offset;
+       resource_size_t map_size;
+       unsigned long map_pad;
+       u64 pfn_flags;
+       void *map;
+};
 #if IS_ENABLED(CONFIG_ND_CLAIM)
-int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap);
+int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap,
+               struct pfn_map_info *mi);
 int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio);
 void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio);
 #else
 static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
-                                  struct dev_pagemap *pgmap)
+               struct dev_pagemap *pgmap, struct pfn_map_info *mi)
 {
        return -ENXIO;
 }
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 110699f4c3e4..1c2a0e707da3 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -611,7 +611,8 @@ static unsigned long init_altmap_reserve(resource_size_t 
base)
        return reserve;
 }
 
-static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
+static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap,
+               struct pfn_map_info *mi)
 {
        struct resource *res = &pgmap->res;
        struct vmem_altmap *altmap = &pgmap->altmap;
@@ -632,6 +633,19 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, 
struct dev_pagemap *pgmap)
        res->start += start_pad;
        res->end -= end_trunc;
 
+       *mi = (struct pfn_map_info) {
+               /*
+                * TODO fix implementation to properly account for
+                * 'start_pad' in map_base, and map_offset. As is, the
+                * fact that __va(map_base) != __pa(map) leads
+                * mistranslations in pmem_direct_access().
+                */
+               .map_base = nsio->res.start,
+               .map_pad = start_pad,
+               .map_offset = offset,
+               .map_size = resource_size(res),
+       };
+
        if (nd_pfn->mode == PFN_MODE_RAM) {
                if (offset < reserve)
                        return -EINVAL;
@@ -789,7 +803,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
  * Determine the effective resource range and vmem_altmap from an nd_pfn
  * instance.
  */
-int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
+int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap,
+               struct pfn_map_info *mi)
 {
        int rc;
 
@@ -801,6 +816,6 @@ int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct 
dev_pagemap *pgmap)
                return rc;
 
        /* we need a valid pfn_sb before we can init a dev_pagemap */
-       return __nvdimm_setup_pfn(nd_pfn, pgmap);
+       return __nvdimm_setup_pfn(nd_pfn, pgmap, mi);
 }
 EXPORT_SYMBOL_GPL(nvdimm_setup_pfn);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index bc2f700feef8..46b823f3b94d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -34,9 +34,7 @@
 #include <linux/nd.h>
 #include <linux/backing-dev.h>
 #include "pmem.h"
-#include "pfn.h"
 #include "nd.h"
-#include "nd-core.h"
 
 static struct device *to_dev(struct pmem_device *pmem)
 {
@@ -58,7 +56,7 @@ static void hwpoison_clear(struct pmem_device *pmem,
        unsigned long pfn_start, pfn_end, pfn;
 
        /* only pmem in the linear map supports HWPoison */
-       if (is_vmalloc_addr(pmem->virt_addr))
+       if (is_vmalloc_addr(pmem->mi.map))
                return;
 
        pfn_start = PHYS_PFN(phys);
@@ -80,17 +78,18 @@ static blk_status_t pmem_clear_poison(struct pmem_device 
*pmem,
                phys_addr_t offset, unsigned int len)
 {
        struct device *dev = to_dev(pmem);
+       struct pfn_map_info *mi = &pmem->mi;
        sector_t sector;
        long cleared;
        blk_status_t rc = BLK_STS_OK;
 
-       sector = (offset - pmem->data_offset) / 512;
+       sector = (offset - mi->map_offset) / 512;
 
-       cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
+       cleared = nvdimm_clear_poison(dev, mi->map_base + offset, len);
        if (cleared < len)
                rc = BLK_STS_IOERR;
        if (cleared > 0 && cleared / 512) {
-               hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
+               hwpoison_clear(pmem, mi->map_base + offset, cleared);
                cleared /= 512;
                dev_dbg(dev, "%#llx clear %ld sector%s\n",
                                (unsigned long long) sector, cleared,
@@ -100,7 +99,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device 
*pmem,
                        sysfs_notify_dirent(pmem->bb_state);
        }
 
-       arch_invalidate_pmem(pmem->virt_addr + offset, len);
+       arch_invalidate_pmem(mi->map + offset, len);
 
        return rc;
 }
@@ -151,8 +150,9 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, 
struct page *page,
 {
        blk_status_t rc = BLK_STS_OK;
        bool bad_pmem = false;
-       phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
-       void *pmem_addr = pmem->virt_addr + pmem_off;
+       struct pfn_map_info *mi = &pmem->mi;
+       phys_addr_t pmem_off = sector * 512 + mi->map_offset;
+       void *pmem_addr = mi->map + pmem_off;
 
        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                bad_pmem = true;
@@ -247,16 +247,17 @@ static int pmem_rw_page(struct block_device *bdev, 
sector_t sector,
 __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
                long nr_pages, void **kaddr, pfn_t *pfn)
 {
-       resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
+       struct pfn_map_info *mi = &pmem->mi;
+       resource_size_t offset = PFN_PHYS(pgoff) + mi->map_offset;
 
        if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
                                        PFN_PHYS(nr_pages))))
                return -EIO;
 
        if (kaddr)
-               *kaddr = pmem->virt_addr + offset;
+               *kaddr = mi->map + offset;
        if (pfn)
-               *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+               *pfn = phys_to_pfn_t(mi->map_base + offset, mi->pfn_flags);
 
        /*
         * If badblocks are present, limit known good range to the
@@ -264,7 +265,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, 
pgoff_t pgoff,
         */
        if (unlikely(pmem->bb.count))
                return nr_pages;
-       return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
+       return PHYS_PFN(mi->map_size - offset);
 }
 
 static const struct block_device_operations pmem_fops = {
@@ -354,45 +355,49 @@ static int pmem_attach_disk(struct device *dev,
        struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
        struct nd_region *nd_region = to_nd_region(dev->parent);
        int nid = dev_to_node(dev), fua;
-       struct resource *res = &nsio->res;
+       struct pfn_map_info *mi;
        struct resource bb_res;
        struct nd_pfn *nd_pfn = NULL;
        struct dax_device *dax_dev;
-       struct nd_pfn_sb *pfn_sb;
        struct pmem_device *pmem;
        struct request_queue *q;
        struct device *gendev;
        struct gendisk *disk;
-       void *addr;
        int rc;
 
        pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
        if (!pmem)
                return -ENOMEM;
+       mi = &pmem->mi;
 
        /* while nsio_rw_bytes is active, parse a pfn info block if present */
        if (is_nd_pfn(dev)) {
                nd_pfn = to_nd_pfn(dev);
-               rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap);
+               rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap, mi);
                if (rc)
                        return rc;
+       } else {
+               *mi = (struct pfn_map_info) {
+                       .map_offset = 0,
+                       .map_base = nsio->res.start,
+                       .map_size = resource_size(&nsio->res),
+               };
        }
 
        /* we're attaching a block device, disable raw namespace access */
        devm_nsio_disable(dev, nsio);
 
        dev_set_drvdata(dev, pmem);
-       pmem->phys_addr = res->start;
-       pmem->size = resource_size(res);
        fua = nvdimm_has_flush(nd_region);
        if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) {
                dev_warn(dev, "unable to guarantee persistence of writes\n");
                fua = 0;
        }
 
-       if (!devm_request_mem_region(dev, res->start, resource_size(res),
+       if (!devm_request_mem_region(dev, nsio->res.start,
+                               resource_size(&nsio->res),
                                dev_name(&ndns->dev))) {
-               dev_warn(dev, "could not reserve region %pR\n", res);
+               dev_warn(dev, "could not reserve region %pR\n", &nsio->res);
                return -EBUSY;
        }
 
@@ -403,37 +408,27 @@ static int pmem_attach_disk(struct device *dev,
        if (devm_add_action_or_reset(dev, pmem_release_queue, q))
                return -ENOMEM;
 
-       pmem->pfn_flags = PFN_DEV;
+       mi->pfn_flags = PFN_DEV;
        pmem->pgmap.ref = &q->q_usage_counter;
        pmem->pgmap.kill = pmem_freeze_queue;
        if (is_nd_pfn(dev)) {
                if (setup_pagemap_fsdax(dev, &pmem->pgmap))
                        return -ENOMEM;
-               addr = devm_memremap_pages(dev, &pmem->pgmap);
-               pfn_sb = nd_pfn->pfn_sb;
-               pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
-               pmem->pfn_pad = resource_size(res) -
-                       resource_size(&pmem->pgmap.res);
-               pmem->pfn_flags |= PFN_MAP;
-               memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
-               bb_res.start += pmem->data_offset;
+               mi->map = devm_memremap_pages(dev, &pmem->pgmap);
+               mi->pfn_flags |= PFN_MAP;
        } else if (pmem_should_map_pages(dev)) {
                memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
                pmem->pgmap.altmap_valid = false;
                if (setup_pagemap_fsdax(dev, &pmem->pgmap))
                        return -ENOMEM;
-               addr = devm_memremap_pages(dev, &pmem->pgmap);
-               pmem->pfn_flags |= PFN_MAP;
-               memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
-       } else {
-               addr = devm_memremap(dev, pmem->phys_addr,
-                               pmem->size, ARCH_MEMREMAP_PMEM);
-               memcpy(&bb_res, &nsio->res, sizeof(bb_res));
-       }
+               mi->map = devm_memremap_pages(dev, &pmem->pgmap);
+               mi->pfn_flags |= PFN_MAP;
+       } else
+               mi->map = devm_memremap(dev, mi->map_base, mi->map_size,
+                               ARCH_MEMREMAP_PMEM);
 
-       if (IS_ERR(addr))
-               return PTR_ERR(addr);
-       pmem->virt_addr = addr;
+       if (IS_ERR(mi->map))
+               return PTR_ERR(mi->map);
 
        blk_queue_write_cache(q, true, fua);
        blk_queue_make_request(q, pmem_make_request);
@@ -441,7 +436,7 @@ static int pmem_attach_disk(struct device *dev,
        blk_queue_logical_block_size(q, pmem_sector_size(ndns));
        blk_queue_max_hw_sectors(q, UINT_MAX);
        blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
-       if (pmem->pfn_flags & PFN_MAP)
+       if (mi->pfn_flags & PFN_MAP)
                blk_queue_flag_set(QUEUE_FLAG_DAX, q);
        q->queuedata = pmem;
 
@@ -455,10 +450,13 @@ static int pmem_attach_disk(struct device *dev,
        disk->flags             = GENHD_FL_EXT_DEVT;
        disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
        nvdimm_namespace_disk_name(ndns, disk->disk_name);
-       set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
-                       / 512);
+       set_capacity(disk, (mi->map_size - mi->map_offset) / 512);
        if (devm_init_badblocks(dev, &pmem->bb))
                return -ENOMEM;
+       bb_res = (struct resource) {
+               .start = mi->map_base + mi->map_pad + mi->map_offset,
+               .end = mi->map_base + mi->map_pad + mi->map_size - 1,
+       };
        nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
        disk->bb = &pmem->bb;
 
@@ -540,7 +538,6 @@ static void nd_pmem_shutdown(struct device *dev)
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 {
        struct nd_region *nd_region;
-       resource_size_t offset = 0, end_trunc = 0;
        struct nd_namespace_common *ndns;
        struct nd_namespace_io *nsio;
        struct resource res;
@@ -556,32 +553,26 @@ static void nd_pmem_notify(struct device *dev, enum 
nvdimm_event event)
                ndns = nd_btt->ndns;
                nd_region = to_nd_region(ndns->dev.parent);
                nsio = to_nd_namespace_io(&ndns->dev);
+               res = (struct resource) {
+                       .start = nsio->res.start,
+                       .end = nsio->res.end,
+               };
                bb = &nsio->bb;
                bb_state = NULL;
        } else {
                struct pmem_device *pmem = dev_get_drvdata(dev);
+               struct pfn_map_info *mi = &pmem->mi;
 
                nd_region = to_region(pmem);
                bb = &pmem->bb;
                bb_state = pmem->bb_state;
 
-               if (is_nd_pfn(dev)) {
-                       struct nd_pfn *nd_pfn = to_nd_pfn(dev);
-                       struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
-
-                       ndns = nd_pfn->ndns;
-                       offset = pmem->data_offset +
-                                       __le32_to_cpu(pfn_sb->start_pad);
-                       end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
-               } else {
-                       ndns = to_ndns(dev);
-               }
-
-               nsio = to_nd_namespace_io(&ndns->dev);
+               res = (struct resource) {
+                       .start = mi->map_base + mi->map_pad + mi->map_offset,
+                       .end = mi->map_base + mi->map_pad + mi->map_size - 1,
+               };
        }
 
-       res.start = nsio->res.start + offset;
-       res.end = nsio->res.end - end_trunc;
        nvdimm_badblocks_populate(nd_region, bb, &res);
        if (bb_state)
                sysfs_notify_dirent(bb_state);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 59cfe13ea8a8..6c27bbae349f 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -6,19 +6,11 @@
 #include <linux/types.h>
 #include <linux/pfn_t.h>
 #include <linux/fs.h>
+#include "nd.h"
 
 /* this definition is in it's own header for tools/testing/nvdimm to consume */
 struct pmem_device {
-       /* One contiguous memory region per device */
-       phys_addr_t             phys_addr;
-       /* when non-zero this device is hosting a 'pfn' instance */
-       phys_addr_t             data_offset;
-       u64                     pfn_flags;
-       void                    *virt_addr;
-       /* immutable base size of the namespace */
-       size_t                  size;
-       /* trim size when namespace capacity has been section aligned */
-       u32                     pfn_pad;
+       struct pfn_map_info     mi;
        struct kernfs_node      *bb_state;
        struct badblocks        bb;
        struct dax_device       *dax_dev;
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
index 2e7fd8227969..52a8760d2ec5 100644
--- a/tools/testing/nvdimm/pmem-dax.c
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -18,7 +18,8 @@
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
                long nr_pages, void **kaddr, pfn_t *pfn)
 {
-       resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
+       struct pfn_map_info *mi = &pmem->mi;
+       resource_size_t offset = PFN_PHYS(pgoff) + mi->map_offset;
 
        if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
                                        PFN_PHYS(nr_pages))))
@@ -28,12 +29,12 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t 
pgoff,
         * Limit dax to a single page at a time given vmalloc()-backed
         * in the nfit_test case.
         */
-       if (get_nfit_res(pmem->phys_addr + offset)) {
+       if (get_nfit_res(mi->map_base + offset)) {
                struct page *page;
 
                if (kaddr)
-                       *kaddr = pmem->virt_addr + offset;
-               page = vmalloc_to_page(pmem->virt_addr + offset);
+                       *kaddr = mi->map + offset;
+               page = vmalloc_to_page(mi->map + offset);
                if (pfn)
                        *pfn = page_to_pfn_t(page);
                pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n",
@@ -43,9 +44,9 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t 
pgoff,
        }
 
        if (kaddr)
-               *kaddr = pmem->virt_addr + offset;
+               *kaddr = mi->map + offset;
        if (pfn)
-               *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+               *pfn = phys_to_pfn_t(mi->map_base + offset, mi->pfn_flags);
 
        /*
         * If badblocks are present, limit known good range to the
@@ -53,5 +54,5 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t 
pgoff,
         */
        if (unlikely(pmem->bb.count))
                return nr_pages;
-       return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
+       return PHYS_PFN(mi->map_size - offset);
 }

Reply via email to