[PATCH] mm/shmem: fix up gfpmask for shmem hugepage allocation

2020-10-21 Thread Xu Yu
Currently, the gfpmask used in shmem_alloc_hugepage() is fixed, i.e.,
gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, where gfp comes from
the inode mapping and is usually GFP_HIGHUSER_MOVABLE, which contains
__GFP_RECLAIM, i.e. both __GFP_DIRECT_RECLAIM and __GFP_KSWAPD_RECLAIM.
This can trigger direct or kswapd reclaim when the fast path of shmem
hugepage allocation fails, which is sometimes unexpected.

This patch applies the defrag option of anonymous THP to shmem
hugepages as well, so that the defrag behavior of both kinds of THP
can be controlled in the same way.

It also explicitly adds the SHMEM_HUGE_ALWAYS case in
shmem_getpage_gfp(), for better readability.
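
For example, with the THP defrag knob set as below (an illustrative
setting, not part of this patch), a failed shmem hugepage allocation
only kicks kcompactd via __GFP_KSWAPD_RECLAIM and quickly falls back
to small pages, instead of entering direct reclaim:

  # echo defer > /sys/kernel/mm/transparent_hugepage/defrag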

Signed-off-by: Xu Yu 
---
 mm/shmem.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..a0f5d02e479b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1780,6 +1780,47 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
return error;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline gfp_t shmem_hugepage_gfpmask_fixup(gfp_t gfp,
+enum sgp_type sgp_huge)
+{
+   const bool vma_madvised = sgp_huge == SGP_HUGE;
+
+   gfp |= __GFP_NOMEMALLOC;
+   gfp &= ~__GFP_RECLAIM;
+
+   /* Force do synchronous compaction */
+   if (shmem_huge == SHMEM_HUGE_FORCE)
+   return gfp | __GFP_DIRECT_RECLAIM;
+
+   /* Always do synchronous compaction */
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+    &transparent_hugepage_flags))
+   return gfp | __GFP_DIRECT_RECLAIM | (vma_madvised ? 0 : __GFP_NORETRY);
+
+   /* Kick kcompactd and fail quickly */
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+    &transparent_hugepage_flags))
+   return gfp | __GFP_KSWAPD_RECLAIM;
+
+   /* Synchronous compaction if madvised, otherwise kick kcompactd */
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
+    &transparent_hugepage_flags))
+   return gfp |
+   (vma_madvised ? __GFP_DIRECT_RECLAIM :
+   __GFP_KSWAPD_RECLAIM);
+
+   /* Only do synchronous compaction if madvised */
+   if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+    &transparent_hugepage_flags))
+   return gfp | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+   return gfp;
+}
+#else
+static inline gfp_t shmem_hugepage_gfpmask_fixup(gfp_t gfp,
+enum sgp_type sgp_huge)
+{
+   return gfp;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /*
  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  *
@@ -1867,6 +1908,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
switch (sbinfo->huge) {
case SHMEM_HUGE_NEVER:
goto alloc_nohuge;
+   case SHMEM_HUGE_ALWAYS:
+   goto alloc_huge;
case SHMEM_HUGE_WITHIN_SIZE: {
loff_t i_size;
pgoff_t off;
@@ -1887,6 +1930,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
}
 
 alloc_huge:
+   gfp = shmem_hugepage_gfpmask_fixup(gfp, sgp_huge);
page = shmem_alloc_and_acct_page(gfp, inode, index, true);
if (IS_ERR(page)) {
 alloc_nohuge:
-- 
2.20.1.2432.ga663e714



[PATCH v2] bpf: do not restore dst_reg when cur_state is freed

2019-03-21 Thread Xu Yu
Syzkaller hit a 'KASAN: use-after-free Write in sanitize_ptr_alu' bug.
Call trace:
  dump_stack+0xbf/0x12e
  print_address_description+0x6a/0x280
  kasan_report+0x237/0x360
  sanitize_ptr_alu+0x85a/0x8d0
  adjust_ptr_min_max_vals+0x8f2/0x1ca0
  adjust_reg_min_max_vals+0x8ed/0x22e0
  do_check+0x1ca6/0x5d00
  bpf_check+0x9ca/0x2570
  bpf_prog_load+0xc91/0x1030
  __se_sys_bpf+0x61e/0x1f00
  do_syscall_64+0xc8/0x550
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
Fault injection trace:
  kfree+0xea/0x290
  free_func_state+0x4a/0x60
  free_verifier_state+0x61/0xe0
  push_stack+0x216/0x2f0  <- inject failslab
  sanitize_ptr_alu+0x2b1/0x8d0
  adjust_ptr_min_max_vals+0x8f2/0x1ca0
  adjust_reg_min_max_vals+0x8ed/0x22e0
  do_check+0x1ca6/0x5d00
  bpf_check+0x9ca/0x2570
  bpf_prog_load+0xc91/0x1030
  __se_sys_bpf+0x61e/0x1f00
  do_syscall_64+0xc8/0x550
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

When kzalloc() fails in push_stack(), free_verifier_state() frees the
current verifier state. After push_stack() returns, dst_reg is restored
if ptr_is_dst_reg is false. However, dst_reg is a member of cur_state,
so it has been freed as well, and a use-after-free write occurs when
dst_reg is dereferenced.
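
The failing sequence looks roughly like this (a simplified sketch of
sanitize_ptr_alu() with explanatory comments added, not verbatim
kernel code):

	ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
	/*
	 * On kzalloc() failure, push_stack() has freed env->cur_state
	 * and returned NULL.
	 */
	if (!ptr_is_dst_reg)
		*dst_reg = tmp;	/* dst_reg points into the freed
				 * cur_state: use-after-free write
				 */
	return !ret ? -EFAULT : 0;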

Simply fix it by testing the return value of push_stack() before
restoring dst_reg.

Signed-off-by: Xu Yu 
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce166a0..cb1b874 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3368,7 +3368,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
*dst_reg = *ptr_reg;
}
ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
-   if (!ptr_is_dst_reg)
+   if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
return !ret ? -EFAULT : 0;
 }
-- 
1.8.3.1



[PATCH] bpf: do not restore dst_reg when cur_state is freed

2019-03-21 Thread Xu Yu
Syzkaller hit a 'KASAN: use-after-free Write in sanitize_ptr_alu' bug.
Call trace:
  dump_stack+0xbf/0x12e
  print_address_description+0x6a/0x280
  kasan_report+0x237/0x360
  sanitize_ptr_alu+0x85a/0x8d0
  adjust_ptr_min_max_vals+0x8f2/0x1ca0
  adjust_reg_min_max_vals+0x8ed/0x22e0
  do_check+0x1ca6/0x5d00
  bpf_check+0x9ca/0x2570
  bpf_prog_load+0xc91/0x1030
  __se_sys_bpf+0x61e/0x1f00
  do_syscall_64+0xc8/0x550
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
Fault injection trace:
  kfree+0xea/0x290
  free_func_state+0x4a/0x60
  free_verifier_state+0x61/0xe0
  push_stack+0x216/0x2f0  <- inject failslab
  sanitize_ptr_alu+0x2b1/0x8d0
  adjust_ptr_min_max_vals+0x8f2/0x1ca0
  adjust_reg_min_max_vals+0x8ed/0x22e0
  do_check+0x1ca6/0x5d00
  bpf_check+0x9ca/0x2570
  bpf_prog_load+0xc91/0x1030
  __se_sys_bpf+0x61e/0x1f00
  do_syscall_64+0xc8/0x550
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

When kzalloc() fails in push_stack(), free_verifier_state() frees the
current verifier state. After push_stack() returns, dst_reg is restored
if ptr_is_dst_reg is false. However, dst_reg is a member of cur_state,
so it has been freed as well, and a use-after-free write occurs when
dst_reg is dereferenced.

Simply fix it by checking whether cur_state is NULL before restoring
dst_reg.

Signed-off-by: Xu Yu 
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce166a0..018ce4f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3368,7 +3368,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
*dst_reg = *ptr_reg;
}
ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
-   if (!ptr_is_dst_reg)
+   if (!ptr_is_dst_reg && env->cur_state)
*dst_reg = tmp;
return !ret ? -EFAULT : 0;
 }
-- 
1.8.3.1



[PATCH v3] nvme/pci: remap BAR0 to cover admin CQ doorbell for large stride

2017-05-24 Thread Xu Yu
The existing driver initially maps 8192 bytes of BAR0, which is
intended to cover the doorbells of the admin SQ and CQ. However, if a
large stride, e.g. 10, is used, the admin CQ doorbell falls outside
those 8192 bytes. Consequently, a page fault is raised when the admin
CQ doorbell is accessed in nvme_configure_admin_queue().
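
As an illustration (assuming the stride refers to CAP.DSTRD = 10,
i.e. a doorbell stride of 4 << 10 = 4096 bytes), the admin CQ head
doorbell then sits at offset NVME_REG_DBS + (2 * 0 + 1) * (4 << 10) =
4096 + 4096 = 8192, the first byte past the initial 8192-byte mapping.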

This patch fixes the issue by remapping BAR0 before the admin CQ
doorbell is accessed, if the initial mapping is not large enough.

Signed-off-by: Xu Yu <yu.a...@intel.com>
---
Changes since v2:
* Add a sanity check in nvme_remap_bar().
* Keep constant 4096 in nvme_dev_map().
Changes since v1:
* Move the bar (re)mapping logic in nvme_dev_map(), nvme_configure_admin_queue()
and nvme_setup_io_queues() to a new helper nvme_remap_bar().
* Replace several magic numbers by PAGE_SIZE and the new NVME_REG_DBS.
---
 drivers/nvme/host/pci.c | 65 +++++++++++++++++++++++++++++++++++++++++++----------------------
 include/linux/nvme.h    |  1 +
 2 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9d4640a..2c24046 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -91,6 +91,7 @@ struct nvme_dev {
int q_depth;
u32 db_stride;
void __iomem *bar;
+   unsigned long bar_mapped_size;
struct work_struct reset_work;
struct work_struct remove_work;
struct timer_list watchdog_timer;
@@ -1316,6 +1317,32 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
return 0;
 }
 
+static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
+{
+   return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
+}
+
+static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
+{
+   struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+   if (size <= dev->bar_mapped_size)
+   return 0;
+   if (size > pci_resource_len(pdev, 0))
+   return -ENOMEM;
+   if (dev->bar)
+   iounmap(dev->bar);
+   dev->bar = ioremap(pci_resource_start(pdev, 0), size);
+   if (!dev->bar) {
+   dev->bar_mapped_size = 0;
+   return -ENOMEM;
+   }
+   dev->bar_mapped_size = size;
+   dev->dbs = dev->bar + NVME_REG_DBS;
+
+   return 0;
+}
+
 static int nvme_configure_admin_queue(struct nvme_dev *dev)
 {
int result;
@@ -1323,6 +1350,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
struct nvme_queue *nvmeq;
 
+   result = nvme_remap_bar(dev, db_bar_size(dev, 0));
+   if (result < 0)
+   return result;
+
dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
NVME_CAP_NSSRC(cap) : 0;
 
@@ -1514,16 +1545,12 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
}
 }
 
-static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
-{
-   return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
-}
-
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
struct nvme_queue *adminq = dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
-   int result, nr_io_queues, size;
+   int result, nr_io_queues;
+   unsigned long size;
 
nr_io_queues = num_online_cpus();
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
@@ -1542,20 +1569,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
nvme_release_cmb(dev);
}
 
-   size = db_bar_size(dev, nr_io_queues);
-   if (size > 8192) {
-   iounmap(dev->bar);
-   do {
-   dev->bar = ioremap(pci_resource_start(pdev, 0), size);
-   if (dev->bar)
-   break;
-   if (!--nr_io_queues)
-   return -ENOMEM;
-   size = db_bar_size(dev, nr_io_queues);
-   } while (1);
-   dev->dbs = dev->bar + 4096;
-   adminq->q_db = dev->dbs;
-   }
+   do {
+   size = db_bar_size(dev, nr_io_queues);
+   result = nvme_remap_bar(dev, size);
+   if (!result)
+   break;
+   if (!--nr_io_queues)
+   return -ENOMEM;
+   } while (1);
+   adminq->q_db = dev->dbs;
 
/* Deregister the admin queue's interrupt */
free_irq(pci_irq_vector(pdev, 0), adminq);
@@ -2061,8 +2083,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
if (pci_request_mem_regions(pdev, "nvme"))
return -ENODEV;
 
-   dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
-   if (!dev->bar)
+   if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
goto release;
 
return 0;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b625bac..7715be4 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -101,6 +101,7 @@ enum {
	NVME_REG_ACQ	= 0x0030,	/* Admin CQ Base Address */
	NVME_REG_CMBLOC = 0x0038,	/* Controller Memory Buffer Location */
	NVME_REG_CMBSZ	= 0x003c,	/* Controller Memory Buffer Size */
+	NVME_REG_DBS	= 0x1000,	/* SQ 0 Tail Doorbell */
 };

[PATCH v2] nvme/pci: remap BAR0 to cover admin CQ doorbell for large stride

2017-05-21 Thread Xu Yu
The existing driver initially maps 8192 bytes of BAR0, which is
intended to cover the doorbells of the admin SQ and CQ. However, if a
large stride, e.g. 10, is used, the admin CQ doorbell falls outside
those 8192 bytes. Consequently, a page fault is raised when the admin
CQ doorbell is accessed in nvme_configure_admin_queue().

This patch fixes the issue by remapping BAR0 before the admin CQ
doorbell is accessed, if the initial mapping is not large enough.

Signed-off-by: Xu Yu <yu.a...@intel.com>
---
Changes since v1:
* Move the bar (re)mapping logic in nvme_dev_map(), nvme_configure_admin_queue()
and nvme_setup_io_queues() to a new helper nvme_remap_bar().
* Replace several magic numbers by PAGE_SIZE and the new NVME_REG_DBS.
---
 drivers/nvme/host/pci.c | 63 +++++++++++++++++++++++++++++++++++++++++----------------------
 include/linux/nvme.h    |  1 +
 2 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9d4640a..84a254a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -91,6 +91,7 @@ struct nvme_dev {
int q_depth;
u32 db_stride;
void __iomem *bar;
+   unsigned long bar_mapped_size;
struct work_struct reset_work;
struct work_struct remove_work;
struct timer_list watchdog_timer;
@@ -1316,6 +1317,30 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
return 0;
 }
 
+static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
+{
+   return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
+}
+
+static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
+{
+   struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+   if (size <= dev->bar_mapped_size)
+   return 0;
+   if (dev->bar)
+   iounmap(dev->bar);
+   dev->bar = ioremap(pci_resource_start(pdev, 0), size);
+   if (!dev->bar) {
+   dev->bar_mapped_size = 0;
+   return -ENOMEM;
+   }
+   dev->bar_mapped_size = size;
+   dev->dbs = dev->bar + NVME_REG_DBS;
+
+   return 0;
+}
+
 static int nvme_configure_admin_queue(struct nvme_dev *dev)
 {
int result;
@@ -1323,6 +1348,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
struct nvme_queue *nvmeq;
 
+   result = nvme_remap_bar(dev, db_bar_size(dev, 0));
+   if (result < 0)
+   return result;
+
dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
NVME_CAP_NSSRC(cap) : 0;
 
@@ -1514,16 +1543,12 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
}
 }
 
-static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
-{
-   return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
-}
-
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
struct nvme_queue *adminq = dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
-   int result, nr_io_queues, size;
+   int result, nr_io_queues;
+   unsigned long size;
 
nr_io_queues = num_online_cpus();
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
@@ -1542,20 +1567,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
nvme_release_cmb(dev);
}
 
-   size = db_bar_size(dev, nr_io_queues);
-   if (size > 8192) {
-   iounmap(dev->bar);
-   do {
-   dev->bar = ioremap(pci_resource_start(pdev, 0), size);
-   if (dev->bar)
-   break;
-   if (!--nr_io_queues)
-   return -ENOMEM;
-   size = db_bar_size(dev, nr_io_queues);
-   } while (1);
-   dev->dbs = dev->bar + 4096;
-   adminq->q_db = dev->dbs;
-   }
+   do {
+   size = db_bar_size(dev, nr_io_queues);
+   result = nvme_remap_bar(dev, size);
+   if (!result)
+   break;
+   if (!--nr_io_queues)
+   return -ENOMEM;
+   } while (1);
+   adminq->q_db = dev->dbs;
 
/* Deregister the admin queue's interrupt */
free_irq(pci_irq_vector(pdev, 0), adminq);
@@ -2061,8 +2081,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
if (pci_request_mem_regions(pdev, "nvme"))
return -ENODEV;
 
-   dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
-   if (!dev->bar)
+   if (nvme_remap_bar(dev, NVME_REG_DBS + PAGE_SIZE))
goto release;
 
return 0;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b625bac..7715be4 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -101,6 +101,7 @@ enum {
	NVME_REG_ACQ	= 0x0030,	/* Admin CQ Base Address */
	NVME_REG_CMBLOC = 0x0038,	/* Controller Memory Buffer Location */
	NVME_REG_CMBSZ	= 0x003c,	/* Controller Memory Buffer Size */
+	NVME_REG_DBS	= 0x1000,	/* SQ 0 Tail Doorbell */
 };

RE: [PATCH] nvme/pci: remap BAR0 to cover admin CQ doorbell for large stride

2017-05-19 Thread Xu, Yu A
Thanks for your suggestion, we will try to make it better and resend the patch 
soon.

-----Original Message-----
From: Christoph Hellwig [mailto:h...@lst.de] 
Sent: Thursday, May 18, 2017 9:44 PM
To: Xu, Yu A <yu.a...@intel.com>
Cc: linux-n...@lists.infradead.org; linux-kernel@vger.kernel.org; Busch, Keith 
<keith.bu...@intel.com>; ax...@fb.com; h...@lst.de; s...@grimberg.me; Zhang, 
Haozhong <haozhong.zh...@intel.com>
Subject: Re: [PATCH] nvme/pci: remap BAR0 to cover admin CQ doorbell for large 
stride

On Thu, May 18, 2017 at 06:35:47AM +0800, Xu Yu wrote:
> The existing driver initially maps 8192 bytes of BAR0 which is 
> intended to cover doorbells of admin SQ and CQ. However, if a large 
> stride, e.g. 10, is used, the doorbell of admin CQ will be out of 8192 
> bytes. Consequently, a page fault will be raised when the admin CQ 
> doorbell is accessed in nvme_configure_admin_queue().
> 
> This patch fixes this issue by remapping BAR0 before accessing admin 
> CQ doorbell if the initial mapping is not enough.
> 
> Signed-off-by: "Xu, Yu A" <yu.a...@intel.com>
> ---
>  drivers/nvme/host/pci.c | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 
> 9d4640a..7c991eb 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1322,6 +1322,17 @@ static int nvme_configure_admin_queue(struct nvme_dev 
> *dev)
>   u32 aqa;
>   u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
>   struct nvme_queue *nvmeq;
> + struct pci_dev *pdev = to_pci_dev(dev->dev);
> + unsigned long size;
> +
> + size = 4096 + 2 * 4 * dev->db_stride;
> + if (size > 8192) {
> + iounmap(dev->bar);
> + dev->bar = ioremap(pci_resource_start(pdev, 0), size);
> + if (!dev->bar)
> + return -ENOMEM;
> + dev->dbs = dev->bar + 4096;
> + }

This code duplicates logic in db_bar_size / nvme_setup_io_queues.
Please reuse the db_bar_size helper by passing 0 to it, and try to figure out if 
we can factor this whole sequence into a new helper as well.

Bonus points for adding constants to nvme.h for the 4096 offset of the first db 
register, and our magic 8192 threshold.


[PATCH] nvme/pci: remap BAR0 to cover admin CQ doorbell for large stride

2017-05-18 Thread Xu Yu
The existing driver initially maps 8192 bytes of BAR0, which is
intended to cover the doorbells of the admin SQ and CQ. However, if a
large stride, e.g. 10, is used, the admin CQ doorbell falls outside
those 8192 bytes. Consequently, a page fault is raised when the admin
CQ doorbell is accessed in nvme_configure_admin_queue().

This patch fixes the issue by remapping BAR0 before the admin CQ
doorbell is accessed, if the initial mapping is not large enough.

Signed-off-by: "Xu, Yu A" <yu.a...@intel.com>
---
 drivers/nvme/host/pci.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9d4640a..7c991eb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1322,6 +1322,17 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
u32 aqa;
u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
struct nvme_queue *nvmeq;
+   struct pci_dev *pdev = to_pci_dev(dev->dev);
+   unsigned long size;
+
+   size = 4096 + 2 * 4 * dev->db_stride;
+   if (size > 8192) {
+   iounmap(dev->bar);
+   dev->bar = ioremap(pci_resource_start(pdev, 0), size);
+   if (!dev->bar)
+   return -ENOMEM;
+   dev->dbs = dev->bar + 4096;
+   }
 
dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
NVME_CAP_NSSRC(cap) : 0;
-- 
2.10.1


