Re: [PATCH 4/5] lightnvm: pblk: Support for packed metadata in pblk.

2018-06-19 Thread Igor Konopko




On 19.06.2018 05:47, Javier Gonzalez wrote:

On 19 Jun 2018, at 14.42, Matias Bjørling  wrote:

On Tue, Jun 19, 2018 at 1:08 PM, Javier Gonzalez  wrote:

On 16 Jun 2018, at 00.27, Igor Konopko  wrote:

In current pblk implementation, l2p mapping for not closed lines
is always stored only in OOB metadata and recovered from it.

Such a solution does not provide data integrity when drives does
not have such a OOB metadata space.

The goal of this patch is to add support for so called packed
metadata, which store l2p mapping for open lines in last sector
of every write unit.

Signed-off-by: Igor Konopko 
---
drivers/lightnvm/pblk-core.c | 52 
drivers/lightnvm/pblk-init.c | 37 ++--
drivers/lightnvm/pblk-rb.c   |  3 +++
drivers/lightnvm/pblk-recovery.c | 25 +++
drivers/lightnvm/pblk-sysfs.c|  7 ++
drivers/lightnvm/pblk-write.c| 14 +++
drivers/lightnvm/pblk.h  |  5 +++-
7 files changed, 128 insertions(+), 15 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index c092ee93a18d..375c6430612e 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -340,7 +340,7 @@ void pblk_write_should_kick(struct pblk *pblk)
{
  unsigned int secs_avail = pblk_rb_read_count(>rwb);

- if (secs_avail >= pblk->min_write_pgs)
+ if (secs_avail >= pblk->min_write_pgs_data)
  pblk_write_kick(pblk);
}

@@ -371,7 +371,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, 
struct pblk_line *line)
  struct pblk_line_meta *lm = >lm;
  struct pblk_line_mgmt *l_mg = >l_mg;
  struct list_head *move_list = NULL;
- int vsc = le32_to_cpu(*line->vsc);
+ int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data)
+ * (pblk->min_write_pgs - pblk->min_write_pgs_data);
+ int vsc = le32_to_cpu(*line->vsc) + packed_meta;

  lockdep_assert_held(>lock);

@@ -540,12 +542,15 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void 
*data,
}

int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
-unsigned long secs_to_flush)
+unsigned long secs_to_flush, bool skip_meta)
{
  int max = pblk->sec_per_write;
  int min = pblk->min_write_pgs;
  int secs_to_sync = 0;

+ if (skip_meta)
+ min = max = pblk->min_write_pgs_data;
+
  if (secs_avail >= max)
  secs_to_sync = max;
  else if (secs_avail >= min)
@@ -663,7 +668,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
struct pblk_line *line,
next_rq:
  memset(, 0, sizeof(struct nvm_rq));

- rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
  rq_len = rq_ppas * geo->csecs;

  bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
@@ -2091,3 +2096,42 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct 
ppa_addr *ppas,
  }
  spin_unlock(>trans_lock);
}
+
+void pblk_set_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ void *meta_list = rqd->meta_list;
+ void *page;
+ int i = 0;
+
+ if (pblk_is_oob_meta_supported(pblk))
+ return;
+
+ /* We need to zero out metadata corresponding to packed meta page */
+ pblk_get_meta_at(pblk, meta_list, rqd->nr_ppas - 1)->lba = ADDR_EMPTY;
+
+ page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
+ /* We need to fill last page of request (packed metadata)
+  * with data from oob meta buffer.
+  */
+ for (; i < rqd->nr_ppas; i++)
+ memcpy(page + (i * sizeof(struct pblk_sec_meta)),
+ pblk_get_meta_at(pblk, meta_list, i),
+ sizeof(struct pblk_sec_meta));
+}
+
+void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ void *meta_list = rqd->meta_list;
+ void *page;
+ int i = 0;
+
+ if (pblk_is_oob_meta_supported(pblk))
+ return;
+
+ page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
+ /* We need to fill oob meta buffer with data from packed metadata */
+ for (; i < rqd->nr_ppas; i++)
+ memcpy(pblk_get_meta_at(pblk, meta_list, i),
+ page + (i * sizeof(struct pblk_sec_meta)),
+ sizeof(struct pblk_sec_meta));
+}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index f05112230a52..5eb641da46ed 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -372,8 +372,40 @@ static int pblk_core_init(struct pblk *pblk)
  pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
  max_write_ppas = pblk->min_write_pgs * geo->all_luns;
  pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
+ pblk->min_write_pgs_data = pblk->min_write_pgs;
  pblk_set_sec_per_write(pblk, pblk->min_write_pgs);

+ if 

[PATCH 8/9] check: Suppress a shellcheck warning about the DMESG_FILTER initialization

2018-06-19 Thread Bart Van Assche
Avoid that shellcheck reports the following:

check:396:2: warning: Use var=$(command) to assign output (or quote to assign 
string). [SC2209]

Signed-off-by: Bart Van Assche 
---
 check | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/check b/check
index f1feb96b293e..e7439e163884 100755
--- a/check
+++ b/check
@@ -393,7 +393,7 @@ _call_test() {
 _run_test() {
TEST_NAME="$1"
CHECK_DMESG=1
-   DMESG_FILTER=cat
+   DMESG_FILTER="cat"
 
# shellcheck disable=SC1090
. "tests/${TEST_NAME}"
-- 
2.17.1



[PATCH 4/9] Suppress shellcheck complaints about global variables

2018-06-19 Thread Bart Van Assche
Add a new file common/shellcheck and include it in all test scripts
that use global variables. The file common/shellcheck contains a
statement that stops shellcheck to complain about global variables:

echo "$CHECK_DMESG ${CPUS_ONLINE_SAVED[*]} $DESCRIPTION $DMESG_FILTER 
$FIO_PERF_FIELDS $FIO_PERF_PREFIX $RESTORE_CPUS_ONLINE $QUICK $SKIP_REASON 
${TEST_RUN[*]} $TIMED" >/dev/null

Signed-off-by: Bart Van Assche 
---
 common/cpuhotplug | 2 ++
 common/fio| 2 ++
 common/iopoll | 2 ++
 common/loop   | 1 +
 common/nbd| 2 ++
 common/nvme   | 2 ++
 common/scsi   | 2 ++
 common/shellcheck | 4 
 new   | 1 +
 tests/block/001   | 1 +
 tests/block/002   | 1 +
 tests/block/003   | 2 ++
 tests/block/004   | 2 ++
 tests/block/005   | 2 ++
 tests/block/006   | 2 ++
 tests/block/007   | 1 +
 tests/block/009   | 1 +
 tests/block/010   | 2 ++
 tests/block/011   | 2 ++
 tests/block/012   | 2 ++
 tests/block/013   | 2 ++
 tests/block/014   | 2 ++
 tests/block/015   | 2 ++
 tests/block/016   | 2 ++
 tests/block/017   | 2 ++
 tests/block/018   | 2 ++
 tests/block/019   | 2 ++
 tests/block/020   | 2 ++
 tests/block/021   | 2 ++
 tests/loop/001| 2 ++
 tests/loop/003| 2 ++
 tests/loop/005| 2 ++
 tests/meta/001| 2 ++
 tests/meta/002| 2 ++
 tests/meta/003| 2 ++
 tests/meta/004| 2 ++
 tests/meta/005| 2 ++
 tests/meta/006| 2 ++
 tests/meta/007| 2 ++
 tests/meta/008| 2 ++
 tests/meta/009| 2 ++
 tests/meta/010| 2 ++
 tests/meta/011| 2 ++
 tests/meta/012| 2 ++
 tests/nbd/001 | 2 ++
 tests/nbd/002 | 2 ++
 tests/nvme/001| 2 ++
 tests/nvme/002| 2 ++
 tests/nvme/003| 2 ++
 tests/nvme/004| 2 ++
 tests/nvme/005| 2 ++
 tests/nvme/006| 2 ++
 tests/nvme/007| 2 ++
 tests/nvme/008| 2 ++
 tests/nvme/009| 2 ++
 tests/nvme/010| 2 ++
 tests/nvme/011| 2 ++
 tests/nvme/012| 2 ++
 tests/nvme/013| 2 ++
 tests/scsi/001| 2 ++
 tests/scsi/002| 2 ++
 tests/scsi/003| 2 ++
 tests/scsi/004| 1 +
 63 files changed, 121 insertions(+)
 create mode 100644 common/shellcheck

diff --git a/common/cpuhotplug b/common/cpuhotplug
index 70b284f7b7aa..9f43615a8f64 100644
--- a/common/cpuhotplug
+++ b/common/cpuhotplug
@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see .
 
+. common/shellcheck
+
 # Also initializes the ALL_CPUS and HOTPLUGGABLE_CPUS arrays.
 _have_cpu_hotplug() {
ALL_CPUS=()
diff --git a/common/fio b/common/fio
index 0f93632c0382..949796810a0c 100644
--- a/common/fio
+++ b/common/fio
@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see .
 
+. common/shellcheck
+
 _have_fio() {
if ! _have_program fio; then
return 1
diff --git a/common/iopoll b/common/iopoll
index affdd4332dc2..10ebd16923ad 100644
--- a/common/iopoll
+++ b/common/iopoll
@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see .
 
+. common/shellcheck
+
 _have_fio_with_poll() {
if ! _have_fio; then
return 1
diff --git a/common/loop b/common/loop
index 16cc3212a028..28071840ed9b 100644
--- a/common/loop
+++ b/common/loop
@@ -17,6 +17,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see .
 
+. common/shellcheck
 
 _have_loop_set_block_size() {
src/loblksize "$(losetup -f)" 512 &>/dev/null
diff --git a/common/nbd b/common/nbd
index 320f456e0bd8..2e0186482989 100644
--- a/common/nbd
+++ b/common/nbd
@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see .
 
+. common/shellcheck
+
 _have_nbd() {
if ! _have_module nbd; then
return 1
diff --git a/common/nvme b/common/nvme
index 044d7fe94c65..0060f3e29573 100644
--- a/common/nvme
+++ b/common/nvme
@@ -19,6 +19,8 @@
 
 NVMET_CFS="/sys/kernel/config/nvmet/"
 
+. common/shellcheck
+
 _test_dev_is_nvme() {
if ! readlink -f "$TEST_DEV_SYSFS/device" | grep -q nvme; then
SKIP_REASON="$TEST_DEV is not a NVMe device"
diff --git a/common/scsi b/common/scsi
index d9fcc0e15909..eb5a3ddc708f 100644
--- a/common/scsi
+++ b/common/scsi
@@ -17,6 +17,8 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see .
 
+. common/shellcheck
+
 _have_scsi_generic() {
_have_module sg
 }
diff --git a/common/shellcheck b/common/shellcheck
new file mode 100644
index ..1ec759976dd7
--- /dev/null
+++ b/common/shellcheck
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+# Suppress shellcheck complaints about 

[PATCH 1/9] common/rc: Fix _have_tracepoint()

2018-06-19 Thread Bart Van Assche
Make sure that _have_tracepoint() uses the argument passed to that
function instead of using an undefined variable.

Signed-off-by: Bart Van Assche 
---
 common/rc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common/rc b/common/rc
index 7f1728025364..7592400dda82 100644
--- a/common/rc
+++ b/common/rc
@@ -112,6 +112,8 @@ _have_tracefs() {
 }
 
 _have_tracepoint() {
+   local event=$1
+
if [[ ! -d /sys/kernel/debug/tracing/events/${event} ]]; then
SKIP_REASON="tracepoint ${event} does not exist"
return 1
-- 
2.17.1



[PATCH 6/9] Multiple tests: remove unused and undefined variables

2018-06-19 Thread Bart Van Assche
Signed-off-by: Bart Van Assche 
Cc: Chaitanya Kulkarni 
---
 tests/loop/003 | 3 ---
 tests/nvme/010 | 3 +--
 tests/nvme/011 | 3 +--
 tests/nvme/012 | 1 -
 tests/nvme/013 | 1 -
 5 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/tests/loop/003 b/tests/loop/003
index 353d4fcfa33c..6f6fa8d18ebe 100755
--- a/tests/loop/003
+++ b/tests/loop/003
@@ -28,11 +28,8 @@ requires() {
 }
 
 test() {
-   local loop_dev
echo "Running ${TEST_NAME}"
 
-   loop_dev="$(losetup -f)"
-
# In a subshell so TIMEFORMAT doesn't get clobbered.
(
TIMEFORMAT="Test took %0R seconds"
diff --git a/tests/nvme/010 b/tests/nvme/010
index 956c6597a374..b6849dbad375 100755
--- a/tests/nvme/010
+++ b/tests/nvme/010
@@ -38,7 +38,6 @@ test() {
modprobe nvme-loop
 
local port
-   local ret=0
local nvmedev
local loop_dev
local file_path="${TMPDIR}/img"
@@ -70,7 +69,7 @@ test() {
losetup -d "${loop_dev}"
 
rm "${file_path}"
-   rm -f local*verify*state "${fio_out}"
+   rm -f local*verify*state
 
modprobe -r nvme_loop
modprobe -r nvmet
diff --git a/tests/nvme/011 b/tests/nvme/011
index 872a99af4f8c..f26eb155809c 100755
--- a/tests/nvme/011
+++ b/tests/nvme/011
@@ -37,7 +37,6 @@ test() {
modprobe nvme-loop
 
local port
-   local ret=0
local nvmedev
local file_path
local file_path="${TMPDIR}/img"
@@ -65,7 +64,7 @@ test() {
_remove_nvmet_port "${port}"
 
rm "${file_path}"
-   rm -f local-write-and-verify*state "${fio_out}"
+   rm -f local-write-and-verify*state
 
modprobe -r nvme_loop
modprobe -r nvmet
diff --git a/tests/nvme/012 b/tests/nvme/012
index 8f632b30734d..e570793dd5db 100755
--- a/tests/nvme/012
+++ b/tests/nvme/012
@@ -38,7 +38,6 @@ test() {
modprobe nvme-loop
 
local port
-   local ret=0
local nvmedev
local loop_dev
local mount_dir="/mnt/blktests"
diff --git a/tests/nvme/013 b/tests/nvme/013
index 0ccb083b175b..247fe9f252fa 100755
--- a/tests/nvme/013
+++ b/tests/nvme/013
@@ -38,7 +38,6 @@ test() {
modprobe nvme-loop
 
local port
-   local ret=0
local nvmedev
local mount_dir="/mnt/blktests/"
local file_path="${TMPDIR}/img"
-- 
2.17.1



[PATCH 5/9] check: Avoid that shellcheck complains that $FULL appears unused

2018-06-19 Thread Bart Van Assche
$FULL is a global variable. Avoid that shellcheck complains about it.

Signed-off-by: Bart Van Assche 
---
 check | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/check b/check
index 5f53fa105f72..f1feb96b293e 100755
--- a/check
+++ b/check
@@ -301,6 +301,8 @@ _call_test() {
local test_func="$1"
local seqres="${RESULTS_DIR}/${TEST_NAME}"
FULL="${seqres}.full"
+   # Avoid that shellcheck complains that $FULL appears unused.
+   echo "$FULL" >/dev/null
declare -A TEST_DEV_QUEUE_SAVED
 
_read_last_test_run
-- 
2.17.1



[PATCH 9/9] Makefile: Do not suppress useful shellcheck warnings

2018-06-19 Thread Bart Van Assche
All the shellcheck warnings that are currently suppressed are useful.
Additionally, it is easy to avoid false positives for the currently
suppressed categories of shellcheck warnings. Hence stop suppressing
shellcheck warnings. See also commit 17a59e0dc212 ("Fix all shellcheck
warnings").

Signed-off-by: Bart Van Assche 
---
 Makefile | 15 +--
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index 14ba8341dae5..c4bffa2d3f32 100644
--- a/Makefile
+++ b/Makefile
@@ -4,21 +4,8 @@ all:
 clean:
$(MAKE) -C src clean
 
-# SC1090: "Can't follow non-constant source". We use variable sources all over
-# the place.
-# SC2034: "VARIABLE appears unused". All test scripts use this for the test
-# metadata, and many helper functions define global variables.
-# SC2119: "Use foo "$@" if function's $1 should mean script's $1". False
-# positives on helpers like _init_scsi_debug.
-# SC2154: "VARIABLE is referenced but not assigned". False positives on
-# TEST_RUN[foo]=bar.
-# SC2209: "Use var=$(command) to assign output (or quote to assign string)".
-# Warns about DMESG_FILTER=cat, which is not going to confuse anyone who knows
-# how to write shell scripts.
-SHELLCHECK_EXCLUDE := SC1090,SC2034,SC2119,SC2154,SC2209
-
 check:
-   shellcheck -x -e $(SHELLCHECK_EXCLUDE) -f gcc check new common/* 
tests/*/[0-9]*[0-9]
+   shellcheck -x -f gcc check new common/* tests/*/[0-9]*[0-9]
! grep TODO tests/*/[0-9]*[0-9]
 
 .PHONY: all check
-- 
2.17.1



[PATCH 3/9] check, tests/meta/012: Use array["index"] instead of array[index]

2018-06-19 Thread Bart Van Assche
This causes shellcheck to stop complaining about these array element
accesses.

Signed-off-by: Bart Van Assche 
---
 check  | 46 +++---
 tests/meta/012 |  4 ++--
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/check b/check
index 5e1ba7bc8401..5f53fa105f72 100755
--- a/check
+++ b/check
@@ -156,11 +156,11 @@ declare -A TEST_RUN
 _read_last_test_run() {
local seqres="${RESULTS_DIR}/${TEST_NAME}"
 
-   LAST_TEST_RUN[date]=""
-   LAST_TEST_RUN[status]=""
-   LAST_TEST_RUN[reason]=""
-   LAST_TEST_RUN[exit_status]=""
-   LAST_TEST_RUN[runtime]=""
+   LAST_TEST_RUN["date"]=""
+   LAST_TEST_RUN["status"]=""
+   LAST_TEST_RUN["reason"]=""
+   LAST_TEST_RUN["exit_status"]=""
+   LAST_TEST_RUN["runtime"]=""
 
if [[ ! -e $seqres ]]; then
return
@@ -247,9 +247,9 @@ _output_test_run() {
fi
 
if [[ -v TEST_DEV ]]; then
-   _output_status "$TEST_NAME => $(basename "$TEST_DEV")" 
"${TEST_RUN[status]}ed"
+   _output_status "$TEST_NAME => $(basename "$TEST_DEV")" 
"${TEST_RUN["status"]}ed"
else
-   _output_status "$TEST_NAME" "${TEST_RUN[status]}ed"
+   _output_status "$TEST_NAME" "${TEST_RUN["status"]}ed"
fi
 
(
@@ -306,14 +306,14 @@ _call_test() {
_read_last_test_run
_output_last_test_run
 
-   TEST_RUN[date]="$(date "+%F %T")"
+   TEST_RUN["date"]="$(date "+%F %T")"
 
mkdir -p "$(dirname "$seqres")"
# Remove leftovers from last time.
rm -f "${seqres}" "${seqres}."*
 
if [[ -w /dev/kmsg ]]; then
-   local dmesg_marker="run blktests $TEST_NAME at 
${TEST_RUN[date]}"
+   local dmesg_marker="run blktests $TEST_NAME at 
${TEST_RUN["date"]}"
echo "$dmesg_marker" >> /dev/kmsg
else
local dmesg_marker=""
@@ -328,35 +328,35 @@ _call_test() {
TIMEFORMAT="%Rs"
pushd . >/dev/null || return
{ time "$test_func" >"${seqres}.out" 2>&1; } 2>"${seqres}.runtime"
-   TEST_RUN[exit_status]=$?
+   TEST_RUN["exit_status"]=$?
popd >/dev/null || return
-   TEST_RUN[runtime]="$(cat "${seqres}.runtime")"
+   TEST_RUN["runtime"]="$(cat "${seqres}.runtime")"
rm -f "${seqres}.runtime"
 
_cleanup
 
if ! diff "tests/${TEST_NAME}.out" "${seqres}.out" >/dev/null; then
mv "${seqres}.out" "${seqres}.out.bad"
-   TEST_RUN[status]=fail
-   TEST_RUN[reason]=output
-   elif [[ ${TEST_RUN[exit_status]} -ne 0 ]]; then
-   TEST_RUN[status]=fail
-   TEST_RUN[reason]="exit"
+   TEST_RUN["status"]=fail
+   TEST_RUN["reason"]=output
+   elif [[ ${TEST_RUN["exit_status"]} -ne 0 ]]; then
+   TEST_RUN["status"]=fail
+   TEST_RUN["reason"]="exit"
elif ! _check_dmesg "$dmesg_marker"; then
-   TEST_RUN[status]=fail
-   TEST_RUN[reason]=dmesg
+   TEST_RUN["status"]=fail
+   TEST_RUN["reason"]=dmesg
else
-   TEST_RUN[status]=pass
+   TEST_RUN["status"]=pass
fi
rm -f "${seqres}.out"
 
_write_test_run
_output_test_run
 
-   if [[ ${TEST_RUN[status]} = pass ]]; then
+   if [[ ${TEST_RUN["status"]} = pass ]]; then
return 0
else
-   case "${TEST_RUN[reason]}" in
+   case "${TEST_RUN["reason"]}" in
output)
diff -u "tests/${TEST_NAME}.out" "${seqres}.out.bad" | 
awk "
{
@@ -369,7 +369,7 @@ _call_test() {
}"
;;
exit)
-   echo "exited with status ${TEST_RUN[exit_status]}"
+   echo "exited with status ${TEST_RUN["exit_status"]}"
;;
dmesg)
echo "something found in dmesg:"
@@ -432,7 +432,7 @@ _run_test() {
 
 _run_group() {
local tests=("$@")
-   local group="${tests[0]%/*}"
+   local group="${tests["0"]%/*}"
 
# shellcheck disable=SC1090
. "tests/${group}/group"
diff --git a/tests/meta/012 b/tests/meta/012
index fcf8fbdde2aa..ab8c2a73eaa5 100755
--- a/tests/meta/012
+++ b/tests/meta/012
@@ -22,8 +22,8 @@ DESCRIPTION="record pid and random junk"
 test() {
echo "Running ${TEST_NAME}"
sleep 1
-   TEST_RUN[pid]=$$
-   TEST_RUN[random]="$(od -N $((4 * (RANDOM % 4))) -An -vtx1 /dev/urandom 
| tr -d ' \n')"
+   TEST_RUN["pid"]=$$
+   TEST_RUN["random"]="$(od -N $((4 * (RANDOM % 4))) -An -vtx1 
/dev/urandom | tr -d ' \n')"
TEST_RUN[$$]=hello
echo "Test complete"
 }
-- 
2.17.1



[PATCH 7/9] Avoid passing tests/block/002 arguments to _init_scsi_debug

2018-06-19 Thread Bart Van Assche
This patch avoids that shellcheck reports the following:

tests/block/002:34:7: note: Use _init_scsi_debug "$@" if function's $1 should 
mean script's $1. [SC2119]

Signed-off-by: Bart Van Assche 
---
 common/scsi_debug | 3 +++
 tests/block/002   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/common/scsi_debug b/common/scsi_debug
index e26e85637369..d8e6b71899a3 100644
--- a/common/scsi_debug
+++ b/common/scsi_debug
@@ -21,7 +21,10 @@ _have_scsi_debug() {
_have_module scsi_debug
 }
 
+# The _init_scsi_debug() argument list is passed to modprobe scsi_debug. "--"
+# is filtered out from the start of the argument list.
 _init_scsi_debug() {
+   [ "$1" = "--" ] && shift
if ! modprobe -r scsi_debug || ! modprobe scsi_debug "$@"; then
return 1
fi
diff --git a/tests/block/002 b/tests/block/002
index 32a0f818b95b..10e3f157c018 100755
--- a/tests/block/002
+++ b/tests/block/002
@@ -31,7 +31,7 @@ requires() {
 test() {
echo "Running ${TEST_NAME}"
 
-   if ! _init_scsi_debug; then
+   if ! _init_scsi_debug --; then
return 1
fi
 
-- 
2.17.1



[PATCH 0/9] blktests: Re-enable shellcheck warnings

2018-06-19 Thread Bart Van Assche
Hello Omar,

Since I noticed that several useful shellcheck warnings are suppressed in the
blktests project, I came up with this patch series that reenables all
shellcheck warnings and also suppresses false positive shellcheck reports. It
would be appreciated if you could have a look at this patch series.

Thanks,

Bart.

Bart Van Assche (9):
  common/rc: Fix _have_tracepoint()
  Annotate include statements in shell scripts where the source file is
a variable
  check, tests/meta/012: Use array["index"] instead of array[index]
  Suppress shellcheck complaints about global variables
  check: Avoid that shellcheck complains that $FULL appears unused
  Multiple tests: remove unused and undefined variables
  Avoid passing tests/block/002 arguments to _init_scsi_debug
  check: Suppress a shellcheck warning about the DMESG_FILTER
initialization
  Makefile: Do not suppress useful shellcheck warnings

 Makefile  | 15 +-
 check | 53 ++-
 common/cpuhotplug |  2 ++
 common/fio|  2 ++
 common/iopoll |  2 ++
 common/loop   |  1 +
 common/nbd|  2 ++
 common/nvme   |  2 ++
 common/rc |  2 ++
 common/scsi   |  2 ++
 common/scsi_debug |  3 +++
 common/shellcheck |  4 
 new   |  1 +
 tests/block/001   |  1 +
 tests/block/002   |  3 ++-
 tests/block/003   |  2 ++
 tests/block/004   |  2 ++
 tests/block/005   |  2 ++
 tests/block/006   |  2 ++
 tests/block/007   |  1 +
 tests/block/009   |  1 +
 tests/block/010   |  2 ++
 tests/block/011   |  2 ++
 tests/block/012   |  2 ++
 tests/block/013   |  2 ++
 tests/block/014   |  2 ++
 tests/block/015   |  2 ++
 tests/block/016   |  2 ++
 tests/block/017   |  2 ++
 tests/block/018   |  2 ++
 tests/block/019   |  2 ++
 tests/block/020   |  2 ++
 tests/block/021   |  2 ++
 tests/loop/001|  2 ++
 tests/loop/003|  5 ++---
 tests/loop/005|  2 ++
 tests/meta/001|  2 ++
 tests/meta/002|  2 ++
 tests/meta/003|  2 ++
 tests/meta/004|  2 ++
 tests/meta/005|  2 ++
 tests/meta/006|  2 ++
 tests/meta/007|  2 ++
 tests/meta/008|  2 ++
 tests/meta/009|  2 ++
 tests/meta/010|  2 ++
 tests/meta/011|  2 ++
 tests/meta/012|  6 --
 tests/nbd/001 |  2 ++
 tests/nbd/002 |  2 ++
 tests/nvme/001|  2 ++
 tests/nvme/002|  2 ++
 tests/nvme/003|  2 ++
 tests/nvme/004|  2 ++
 tests/nvme/005|  2 ++
 tests/nvme/006|  2 ++
 tests/nvme/007|  2 ++
 tests/nvme/008|  2 ++
 tests/nvme/009|  2 ++
 tests/nvme/010|  5 +++--
 tests/nvme/011|  5 +++--
 tests/nvme/012|  3 ++-
 tests/nvme/013|  3 ++-
 tests/scsi/001|  2 ++
 tests/scsi/002|  2 ++
 tests/scsi/003|  2 ++
 tests/scsi/004|  1 +
 67 files changed, 161 insertions(+), 50 deletions(-)
 create mode 100644 common/shellcheck

-- 
2.17.1



[PATCH 2/9] Annotate include statements in shell scripts where the source file is a variable

2018-06-19 Thread Bart Van Assche
This causes shellcheck to stop complaining about these include statements.

Signed-off-by: Bart Van Assche 
---
 check | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/check b/check
index 4baa8dde2436..5e1ba7bc8401 100755
--- a/check
+++ b/check
@@ -23,6 +23,7 @@ _found_test() {
 
unset DESCRIPTION QUICK TIMED requires device_requires test test_device
 
+   # shellcheck disable=SC1090
if ! . "tests/${test_name}"; then
return 1
fi
@@ -392,6 +393,7 @@ _run_test() {
CHECK_DMESG=1
DMESG_FILTER=cat
 
+   # shellcheck disable=SC1090
. "tests/${TEST_NAME}"
 
if declare -fF test >/dev/null; then
@@ -432,6 +434,7 @@ _run_group() {
local tests=("$@")
local group="${tests[0]%/*}"
 
+   # shellcheck disable=SC1090
. "tests/${group}/group"
 
if declare -fF group_requires >/dev/null && ! group_requires; then
-- 
2.17.1



Re: [PATCH v2] Revert "block: Add warning for bi_next not NULL in bio_endio()"

2018-06-19 Thread Jens Axboe
On 6/19/18 12:17 PM, Bart Van Assche wrote:
> On Tue, 2018-06-19 at 14:16 -0400, Kent Overstreet wrote:
>> I take it if we had a test for request based dm in blktests or somewhere that
>> probably would have caught this much easier :/
> 
> I'm working on porting the srp-test software to the blktests framework.

That's awesome, would be a great addition!

-- 
Jens Axboe



Re: [PATCH v2] Revert "block: Add warning for bi_next not NULL in bio_endio()"

2018-06-19 Thread Bart Van Assche
On Tue, 2018-06-19 at 14:16 -0400, Kent Overstreet wrote:
> I take it if we had a test for request based dm in blktests or somewhere that
> probably would have caught this much easier :/

I'm working on porting the srp-test software to the blktests framework.

Bart.




Re: [PATCH v2] Revert "block: Add warning for bi_next not NULL in bio_endio()"

2018-06-19 Thread Jens Axboe
On 6/19/18 11:26 AM, Bart Van Assche wrote:
> Commit 0ba99ca4838b ("block: Add warning for bi_next not NULL in
> bio_endio()") breaks the dm driver. end_clone_bio() detects whether
> or not a bio is the last bio associated with a request by checking
> the .bi_next field. Commit 0ba99ca4838b clears that field before
> end_clone_bio() has had a chance to inspect that field. Hence revert
> commit 0ba99ca4838b.

Applied, thanks Bart.

-- 
Jens Axboe



Re: [PATCH v2] Revert "block: Add warning for bi_next not NULL in bio_endio()"

2018-06-19 Thread Mike Snitzer
On Tue, Jun 19 2018 at  1:26pm -0400,
Bart Van Assche  wrote:

> Commit 0ba99ca4838b ("block: Add warning for bi_next not NULL in
> bio_endio()") breaks the dm driver. end_clone_bio() detects whether
> or not a bio is the last bio associated with a request by checking
> the .bi_next field. Commit 0ba99ca4838b clears that field before
> end_clone_bio() has had a chance to inspect that field. Hence revert
> commit 0ba99ca4838b.
> 
> This patch avoids that KASAN reports the following complaint when
> running the srp-test software (srp-test/run_tests -c -d -r 10 -t 02-mq):
> 
> ==
> BUG: KASAN: use-after-free in bio_advance+0x11b/0x1d0
> Read of size 4 at addr 8801300e06d0 by task ksoftirqd/0/9
> 
> CPU: 0 PID: 9 Comm: ksoftirqd/0 Not tainted 4.18.0-rc1-dbg+ #1
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> 1.0.0-prebuilt.qemu-project.org 04/01/2014
> Call Trace:
>  dump_stack+0xa4/0xf5
>  print_address_description+0x6f/0x270
>  kasan_report+0x241/0x360
>  __asan_load4+0x78/0x80
>  bio_advance+0x11b/0x1d0
>  blk_update_request+0xa7/0x5b0
>  scsi_end_request+0x56/0x320 [scsi_mod]
>  scsi_io_completion+0x7d6/0xb20 [scsi_mod]
>  scsi_finish_command+0x1c0/0x280 [scsi_mod]
>  scsi_softirq_done+0x19a/0x230 [scsi_mod]
>  blk_mq_complete_request+0x160/0x240
>  scsi_mq_done+0x50/0x1a0 [scsi_mod]
>  srp_recv_done+0x515/0x1330 [ib_srp]
>  __ib_process_cq+0xa0/0xf0 [ib_core]
>  ib_poll_handler+0x38/0xa0 [ib_core]
>  irq_poll_softirq+0xe8/0x1f0
>  __do_softirq+0x128/0x60d
>  run_ksoftirqd+0x3f/0x60
>  smpboot_thread_fn+0x352/0x460
>  kthread+0x1c1/0x1e0
>  ret_from_fork+0x24/0x30
> 
> Allocated by task 1918:
>  save_stack+0x43/0xd0
>  kasan_kmalloc+0xad/0xe0
>  kasan_slab_alloc+0x11/0x20
>  kmem_cache_alloc+0xfe/0x350
>  mempool_alloc_slab+0x15/0x20
>  mempool_alloc+0xfb/0x270
>  bio_alloc_bioset+0x244/0x350
>  submit_bh_wbc+0x9c/0x2f0
>  __block_write_full_page+0x299/0x5a0
>  block_write_full_page+0x16b/0x180
>  blkdev_writepage+0x18/0x20
>  __writepage+0x42/0x80
>  write_cache_pages+0x376/0x8a0
>  generic_writepages+0xbe/0x110
>  blkdev_writepages+0xe/0x10
>  do_writepages+0x9b/0x180
>  __filemap_fdatawrite_range+0x178/0x1c0
>  file_write_and_wait_range+0x59/0xc0
>  blkdev_fsync+0x46/0x80
>  vfs_fsync_range+0x66/0x100
>  do_fsync+0x3d/0x70
>  __x64_sys_fsync+0x21/0x30
>  do_syscall_64+0x77/0x230
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> Freed by task 9:
>  save_stack+0x43/0xd0
>  __kasan_slab_free+0x137/0x190
>  kasan_slab_free+0xe/0x10
>  kmem_cache_free+0xd3/0x380
>  mempool_free_slab+0x17/0x20
>  mempool_free+0x63/0x160
>  bio_free+0x81/0xa0
>  bio_put+0x59/0x60
>  end_bio_bh_io_sync+0x5d/0x70
>  bio_endio+0x1a7/0x360
>  blk_update_request+0xd0/0x5b0
>  end_clone_bio+0xa3/0xd0 [dm_mod]
>  bio_endio+0x1a7/0x360
>  blk_update_request+0xd0/0x5b0
>  scsi_end_request+0x56/0x320 [scsi_mod]
>  scsi_io_completion+0x7d6/0xb20 [scsi_mod]
>  scsi_finish_command+0x1c0/0x280 [scsi_mod]
>  scsi_softirq_done+0x19a/0x230 [scsi_mod]
>  blk_mq_complete_request+0x160/0x240
>  scsi_mq_done+0x50/0x1a0 [scsi_mod]
>  srp_recv_done+0x515/0x1330 [ib_srp]
>  __ib_process_cq+0xa0/0xf0 [ib_core]
>  ib_poll_handler+0x38/0xa0 [ib_core]
>  irq_poll_softirq+0xe8/0x1f0
>  __do_softirq+0x128/0x60d
> 
> The buggy address belongs to the object at 8801300e0640
>  which belongs to the cache bio-0 of size 200
> The buggy address is located 144 bytes inside of
>  200-byte region [8801300e0640, 8801300e0708)
> The buggy address belongs to the page:
> page:ea0004c03800 count:1 mapcount:0 mapping:88015a563a00 index:0x0 
> compound_mapcount: 0
> flags: 0x80008100(slab|head)
> raw: 80008100 dead0100 dead0200 88015a563a00
> raw:  00330033 0001 
> page dumped because: kasan: bad access detected
> 
> Memory state around the buggy address:
>  8801300e0580: fb fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc
>  8801300e0600: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
> >8801300e0680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>  ^
>  8801300e0700: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>  8801300e0780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> ==
> 
> Fixes: 0ba99ca4838b ("block: Add warning for bi_next not NULL in bio_endio()")
> Signed-off-by: Bart Van Assche 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 

Acked-by: Mike Snitzer 

Thanks Bart.


[PATCH v2] Revert "block: Add warning for bi_next not NULL in bio_endio()"

2018-06-19 Thread Bart Van Assche
Commit 0ba99ca4838b ("block: Add warning for bi_next not NULL in
bio_endio()") breaks the dm driver. end_clone_bio() detects whether
or not a bio is the last bio associated with a request by checking
the .bi_next field. Commit 0ba99ca4838b clears that field before
end_clone_bio() has had a chance to inspect that field. Hence revert
commit 0ba99ca4838b.

This patch avoids that KASAN reports the following complaint when
running the srp-test software (srp-test/run_tests -c -d -r 10 -t 02-mq):

==
BUG: KASAN: use-after-free in bio_advance+0x11b/0x1d0
Read of size 4 at addr 8801300e06d0 by task ksoftirqd/0/9

CPU: 0 PID: 9 Comm: ksoftirqd/0 Not tainted 4.18.0-rc1-dbg+ #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
1.0.0-prebuilt.qemu-project.org 04/01/2014
Call Trace:
 dump_stack+0xa4/0xf5
 print_address_description+0x6f/0x270
 kasan_report+0x241/0x360
 __asan_load4+0x78/0x80
 bio_advance+0x11b/0x1d0
 blk_update_request+0xa7/0x5b0
 scsi_end_request+0x56/0x320 [scsi_mod]
 scsi_io_completion+0x7d6/0xb20 [scsi_mod]
 scsi_finish_command+0x1c0/0x280 [scsi_mod]
 scsi_softirq_done+0x19a/0x230 [scsi_mod]
 blk_mq_complete_request+0x160/0x240
 scsi_mq_done+0x50/0x1a0 [scsi_mod]
 srp_recv_done+0x515/0x1330 [ib_srp]
 __ib_process_cq+0xa0/0xf0 [ib_core]
 ib_poll_handler+0x38/0xa0 [ib_core]
 irq_poll_softirq+0xe8/0x1f0
 __do_softirq+0x128/0x60d
 run_ksoftirqd+0x3f/0x60
 smpboot_thread_fn+0x352/0x460
 kthread+0x1c1/0x1e0
 ret_from_fork+0x24/0x30

Allocated by task 1918:
 save_stack+0x43/0xd0
 kasan_kmalloc+0xad/0xe0
 kasan_slab_alloc+0x11/0x20
 kmem_cache_alloc+0xfe/0x350
 mempool_alloc_slab+0x15/0x20
 mempool_alloc+0xfb/0x270
 bio_alloc_bioset+0x244/0x350
 submit_bh_wbc+0x9c/0x2f0
 __block_write_full_page+0x299/0x5a0
 block_write_full_page+0x16b/0x180
 blkdev_writepage+0x18/0x20
 __writepage+0x42/0x80
 write_cache_pages+0x376/0x8a0
 generic_writepages+0xbe/0x110
 blkdev_writepages+0xe/0x10
 do_writepages+0x9b/0x180
 __filemap_fdatawrite_range+0x178/0x1c0
 file_write_and_wait_range+0x59/0xc0
 blkdev_fsync+0x46/0x80
 vfs_fsync_range+0x66/0x100
 do_fsync+0x3d/0x70
 __x64_sys_fsync+0x21/0x30
 do_syscall_64+0x77/0x230
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 9:
 save_stack+0x43/0xd0
 __kasan_slab_free+0x137/0x190
 kasan_slab_free+0xe/0x10
 kmem_cache_free+0xd3/0x380
 mempool_free_slab+0x17/0x20
 mempool_free+0x63/0x160
 bio_free+0x81/0xa0
 bio_put+0x59/0x60
 end_bio_bh_io_sync+0x5d/0x70
 bio_endio+0x1a7/0x360
 blk_update_request+0xd0/0x5b0
 end_clone_bio+0xa3/0xd0 [dm_mod]
 bio_endio+0x1a7/0x360
 blk_update_request+0xd0/0x5b0
 scsi_end_request+0x56/0x320 [scsi_mod]
 scsi_io_completion+0x7d6/0xb20 [scsi_mod]
 scsi_finish_command+0x1c0/0x280 [scsi_mod]
 scsi_softirq_done+0x19a/0x230 [scsi_mod]
 blk_mq_complete_request+0x160/0x240
 scsi_mq_done+0x50/0x1a0 [scsi_mod]
 srp_recv_done+0x515/0x1330 [ib_srp]
 __ib_process_cq+0xa0/0xf0 [ib_core]
 ib_poll_handler+0x38/0xa0 [ib_core]
 irq_poll_softirq+0xe8/0x1f0
 __do_softirq+0x128/0x60d

The buggy address belongs to the object at 8801300e0640
 which belongs to the cache bio-0 of size 200
The buggy address is located 144 bytes inside of
 200-byte region [8801300e0640, 8801300e0708)
The buggy address belongs to the page:
page:ea0004c03800 count:1 mapcount:0 mapping:88015a563a00 index:0x0 
compound_mapcount: 0
flags: 0x80008100(slab|head)
raw: 80008100 dead0100 dead0200 88015a563a00
raw:  00330033 0001 
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 8801300e0580: fb fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc
 8801300e0600: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
>8801300e0680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ^
 8801300e0700: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 8801300e0780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==

Fixes: 0ba99ca4838b ("block: Add warning for bi_next not NULL in bio_endio()")
Signed-off-by: Bart Van Assche 
Cc: Kent Overstreet 
Cc: Mike Snitzer 
---

Changes in v2 compared to v1: improved patch description.

 block/bio.c  | 3 ---
 block/blk-core.c | 8 +---
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 595663e0281a..accf68a5aa1e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1777,9 +1777,6 @@ void bio_endio(struct bio *bio)
if (!bio_integrity_endio(bio))
return;
 
-   if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL"))
-   bio->bi_next = NULL;
-
/*
 * Need to have a real endio function for chained bios, otherwise
 * various corner cases will break (like stacking block devices that
diff --git a/block/blk-core.c 

Re: [PATCH] block: fix timeout changes for legacy request drivers

2018-06-19 Thread Jens Axboe
On 6/19/18 10:40 AM, Christoph Hellwig wrote:
> blk_mq_complete_request can only be called for blk-mq drivers, but when
> removing the BLK_EH_HANDLED return value, two legacy request timeout
> methods incorrectly got switched to call blk_mq_complete_request.
> Call __blk_complete_request instead to reinstance the previous behavior.
> For that __blk_complete_request needs to be exported.

Yikes. Looks like blktests only smoke tests blk-mq timeout handling,
we should add a test to do the same for !mq.

-- 
Jens Axboe



[PATCH] block: fix timeout changes for legacy request drivers

2018-06-19 Thread Christoph Hellwig
blk_mq_complete_request can only be called for blk-mq drivers, but when
removing the BLK_EH_HANDLED return value, two legacy request timeout
methods incorrectly got switched to call blk_mq_complete_request.
Call __blk_complete_request instead to reinstance the previous behavior.
For that __blk_complete_request needs to be exported.

Fixes: 1fc2b62e ("scsi_transport_fc: complete requests from ->timeout")
Fixes: 0df0bb08 ("null_blk: complete requests from ->timeout")
Reported-by: Jianchao Wang 
Signed-off-by: Christoph Hellwig 
---
 block/blk-softirq.c  | 1 +
 drivers/block/null_blk.c | 2 +-
 drivers/scsi/scsi_transport_fc.c | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 01e2b353a2b9..15c1f5e12eb8 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -144,6 +144,7 @@ void __blk_complete_request(struct request *req)
 
local_irq_restore(flags);
 }
+EXPORT_SYMBOL(__blk_complete_request);
 
 /**
  * blk_complete_request - end I/O on a request
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 7948049f6c43..042c778e5a4e 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -1365,7 +1365,7 @@ static blk_qc_t null_queue_bio(struct request_queue *q, 
struct bio *bio)
 static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
 {
pr_info("null: rq %p timed out\n", rq);
-   blk_mq_complete_request(rq);
+   __blk_complete_request(rq);
return BLK_EH_DONE;
 }
 
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 1da3d71e9f61..13948102ca29 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3592,7 +3592,7 @@ fc_bsg_job_timeout(struct request *req)
 
/* the blk_end_sync_io() doesn't check the error */
if (inflight)
-   blk_mq_complete_request(req);
+   __blk_complete_request(req);
return BLK_EH_DONE;
 }
 
-- 
2.17.1



Re: [PATCH 4/5] lightnvm: pblk: Support for packed metadata in pblk.

2018-06-19 Thread Javier Gonzalez
> On 19 Jun 2018, at 14.42, Matias Bjørling  wrote:
> 
> On Tue, Jun 19, 2018 at 1:08 PM, Javier Gonzalez  wrote:
>>> On 16 Jun 2018, at 00.27, Igor Konopko  wrote:
>>> 
>>> In current pblk implementation, l2p mapping for not closed lines
>>> is always stored only in OOB metadata and recovered from it.
>>> 
>>> Such a solution does not provide data integrity when drives does
>>> not have such a OOB metadata space.
>>> 
>>> The goal of this patch is to add support for so called packed
>>> metadata, which store l2p mapping for open lines in last sector
>>> of every write unit.
>>> 
>>> Signed-off-by: Igor Konopko 
>>> ---
>>> drivers/lightnvm/pblk-core.c | 52 
>>> 
>>> drivers/lightnvm/pblk-init.c | 37 ++--
>>> drivers/lightnvm/pblk-rb.c   |  3 +++
>>> drivers/lightnvm/pblk-recovery.c | 25 +++
>>> drivers/lightnvm/pblk-sysfs.c|  7 ++
>>> drivers/lightnvm/pblk-write.c| 14 +++
>>> drivers/lightnvm/pblk.h  |  5 +++-
>>> 7 files changed, 128 insertions(+), 15 deletions(-)
>>> 
>>> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
>>> index c092ee93a18d..375c6430612e 100644
>>> --- a/drivers/lightnvm/pblk-core.c
>>> +++ b/drivers/lightnvm/pblk-core.c
>>> @@ -340,7 +340,7 @@ void pblk_write_should_kick(struct pblk *pblk)
>>> {
>>>  unsigned int secs_avail = pblk_rb_read_count(>rwb);
>>> 
>>> - if (secs_avail >= pblk->min_write_pgs)
>>> + if (secs_avail >= pblk->min_write_pgs_data)
>>>  pblk_write_kick(pblk);
>>> }
>>> 
>>> @@ -371,7 +371,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, 
>>> struct pblk_line *line)
>>>  struct pblk_line_meta *lm = >lm;
>>>  struct pblk_line_mgmt *l_mg = >l_mg;
>>>  struct list_head *move_list = NULL;
>>> - int vsc = le32_to_cpu(*line->vsc);
>>> + int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data)
>>> + * (pblk->min_write_pgs - pblk->min_write_pgs_data);
>>> + int vsc = le32_to_cpu(*line->vsc) + packed_meta;
>>> 
>>>  lockdep_assert_held(>lock);
>>> 
>>> @@ -540,12 +542,15 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void 
>>> *data,
>>> }
>>> 
>>> int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
>>> -unsigned long secs_to_flush)
>>> +unsigned long secs_to_flush, bool skip_meta)
>>> {
>>>  int max = pblk->sec_per_write;
>>>  int min = pblk->min_write_pgs;
>>>  int secs_to_sync = 0;
>>> 
>>> + if (skip_meta)
>>> + min = max = pblk->min_write_pgs_data;
>>> +
>>>  if (secs_avail >= max)
>>>  secs_to_sync = max;
>>>  else if (secs_avail >= min)
>>> @@ -663,7 +668,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
>>> struct pblk_line *line,
>>> next_rq:
>>>  memset(, 0, sizeof(struct nvm_rq));
>>> 
>>> - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
>>> + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
>>>  rq_len = rq_ppas * geo->csecs;
>>> 
>>>  bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
>>> @@ -2091,3 +2096,42 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct 
>>> ppa_addr *ppas,
>>>  }
>>>  spin_unlock(>trans_lock);
>>> }
>>> +
>>> +void pblk_set_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
>>> +{
>>> + void *meta_list = rqd->meta_list;
>>> + void *page;
>>> + int i = 0;
>>> +
>>> + if (pblk_is_oob_meta_supported(pblk))
>>> + return;
>>> +
>>> + /* We need to zero out metadata corresponding to packed meta page */
>>> + pblk_get_meta_at(pblk, meta_list, rqd->nr_ppas - 1)->lba = ADDR_EMPTY;
>>> +
>>> + page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 
>>> 1].bv_page);
>>> + /* We need to fill last page of request (packed metadata)
>>> +  * with data from oob meta buffer.
>>> +  */
>>> + for (; i < rqd->nr_ppas; i++)
>>> + memcpy(page + (i * sizeof(struct pblk_sec_meta)),
>>> + pblk_get_meta_at(pblk, meta_list, i),
>>> + sizeof(struct pblk_sec_meta));
>>> +}
>>> +
>>> +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
>>> +{
>>> + void *meta_list = rqd->meta_list;
>>> + void *page;
>>> + int i = 0;
>>> +
>>> + if (pblk_is_oob_meta_supported(pblk))
>>> + return;
>>> +
>>> + page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 
>>> 1].bv_page);
>>> + /* We need to fill oob meta buffer with data from packed metadata */
>>> + for (; i < rqd->nr_ppas; i++)
>>> + memcpy(pblk_get_meta_at(pblk, meta_list, i),
>>> + page + (i * sizeof(struct pblk_sec_meta)),
>>> + sizeof(struct pblk_sec_meta));
>>> +}
>>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>>> index f05112230a52..5eb641da46ed 100644
>>> --- 

Re: [PATCH 4/5] lightnvm: pblk: Support for packed metadata in pblk.

2018-06-19 Thread Matias Bjørling
On Tue, Jun 19, 2018 at 1:08 PM, Javier Gonzalez  wrote:
>> On 16 Jun 2018, at 00.27, Igor Konopko  wrote:
>>
>> In current pblk implementation, l2p mapping for not closed lines
>> is always stored only in OOB metadata and recovered from it.
>>
>> Such a solution does not provide data integrity when drives does
>> not have such a OOB metadata space.
>>
>> The goal of this patch is to add support for so called packed
>> metadata, which store l2p mapping for open lines in last sector
>> of every write unit.
>>
>> Signed-off-by: Igor Konopko 
>> ---
>> drivers/lightnvm/pblk-core.c | 52 
>> 
>> drivers/lightnvm/pblk-init.c | 37 ++--
>> drivers/lightnvm/pblk-rb.c   |  3 +++
>> drivers/lightnvm/pblk-recovery.c | 25 +++
>> drivers/lightnvm/pblk-sysfs.c|  7 ++
>> drivers/lightnvm/pblk-write.c| 14 +++
>> drivers/lightnvm/pblk.h  |  5 +++-
>> 7 files changed, 128 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
>> index c092ee93a18d..375c6430612e 100644
>> --- a/drivers/lightnvm/pblk-core.c
>> +++ b/drivers/lightnvm/pblk-core.c
>> @@ -340,7 +340,7 @@ void pblk_write_should_kick(struct pblk *pblk)
>> {
>>   unsigned int secs_avail = pblk_rb_read_count(>rwb);
>>
>> - if (secs_avail >= pblk->min_write_pgs)
>> + if (secs_avail >= pblk->min_write_pgs_data)
>>   pblk_write_kick(pblk);
>> }
>>
>> @@ -371,7 +371,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, 
>> struct pblk_line *line)
>>   struct pblk_line_meta *lm = >lm;
>>   struct pblk_line_mgmt *l_mg = >l_mg;
>>   struct list_head *move_list = NULL;
>> - int vsc = le32_to_cpu(*line->vsc);
>> + int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data)
>> + * (pblk->min_write_pgs - pblk->min_write_pgs_data);
>> + int vsc = le32_to_cpu(*line->vsc) + packed_meta;
>>
>>   lockdep_assert_held(>lock);
>>
>> @@ -540,12 +542,15 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void 
>> *data,
>> }
>>
>> int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
>> -unsigned long secs_to_flush)
>> +unsigned long secs_to_flush, bool skip_meta)
>> {
>>   int max = pblk->sec_per_write;
>>   int min = pblk->min_write_pgs;
>>   int secs_to_sync = 0;
>>
>> + if (skip_meta)
>> + min = max = pblk->min_write_pgs_data;
>> +
>>   if (secs_avail >= max)
>>   secs_to_sync = max;
>>   else if (secs_avail >= min)
>> @@ -663,7 +668,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
>> struct pblk_line *line,
>> next_rq:
>>   memset(, 0, sizeof(struct nvm_rq));
>>
>> - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
>> + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
>>   rq_len = rq_ppas * geo->csecs;
>>
>>   bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
>> @@ -2091,3 +2096,42 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct 
>> ppa_addr *ppas,
>>   }
>>   spin_unlock(>trans_lock);
>> }
>> +
>> +void pblk_set_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
>> +{
>> + void *meta_list = rqd->meta_list;
>> + void *page;
>> + int i = 0;
>> +
>> + if (pblk_is_oob_meta_supported(pblk))
>> + return;
>> +
>> + /* We need to zero out metadata corresponding to packed meta page */
>> + pblk_get_meta_at(pblk, meta_list, rqd->nr_ppas - 1)->lba = ADDR_EMPTY;
>> +
>> + page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 
>> 1].bv_page);
>> + /* We need to fill last page of request (packed metadata)
>> +  * with data from oob meta buffer.
>> +  */
>> + for (; i < rqd->nr_ppas; i++)
>> + memcpy(page + (i * sizeof(struct pblk_sec_meta)),
>> + pblk_get_meta_at(pblk, meta_list, i),
>> + sizeof(struct pblk_sec_meta));
>> +}
>> +
>> +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
>> +{
>> + void *meta_list = rqd->meta_list;
>> + void *page;
>> + int i = 0;
>> +
>> + if (pblk_is_oob_meta_supported(pblk))
>> + return;
>> +
>> + page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 
>> 1].bv_page);
>> + /* We need to fill oob meta buffer with data from packed metadata */
>> + for (; i < rqd->nr_ppas; i++)
>> + memcpy(pblk_get_meta_at(pblk, meta_list, i),
>> + page + (i * sizeof(struct pblk_sec_meta)),
>> + sizeof(struct pblk_sec_meta));
>> +}
>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>> index f05112230a52..5eb641da46ed 100644
>> --- a/drivers/lightnvm/pblk-init.c
>> +++ b/drivers/lightnvm/pblk-init.c
>> @@ -372,8 +372,40 @@ static int pblk_core_init(struct pblk *pblk)
>>   pblk->min_write_pgs = geo->ws_opt * (geo->csecs / 

Re: [PATCH 4/5] lightnvm: pblk: Support for packed metadata in pblk.

2018-06-19 Thread Javier Gonzalez
> On 16 Jun 2018, at 00.27, Igor Konopko  wrote:
> 
> In current pblk implementation, l2p mapping for not closed lines
> is always stored only in OOB metadata and recovered from it.
> 
> Such a solution does not provide data integrity when drives does
> not have such a OOB metadata space.
> 
> The goal of this patch is to add support for so called packed
> metadata, which store l2p mapping for open lines in last sector
> of every write unit.
> 
> Signed-off-by: Igor Konopko 
> ---
> drivers/lightnvm/pblk-core.c | 52 
> drivers/lightnvm/pblk-init.c | 37 ++--
> drivers/lightnvm/pblk-rb.c   |  3 +++
> drivers/lightnvm/pblk-recovery.c | 25 +++
> drivers/lightnvm/pblk-sysfs.c|  7 ++
> drivers/lightnvm/pblk-write.c| 14 +++
> drivers/lightnvm/pblk.h  |  5 +++-
> 7 files changed, 128 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
> index c092ee93a18d..375c6430612e 100644
> --- a/drivers/lightnvm/pblk-core.c
> +++ b/drivers/lightnvm/pblk-core.c
> @@ -340,7 +340,7 @@ void pblk_write_should_kick(struct pblk *pblk)
> {
>   unsigned int secs_avail = pblk_rb_read_count(>rwb);
> 
> - if (secs_avail >= pblk->min_write_pgs)
> + if (secs_avail >= pblk->min_write_pgs_data)
>   pblk_write_kick(pblk);
> }
> 
> @@ -371,7 +371,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, 
> struct pblk_line *line)
>   struct pblk_line_meta *lm = >lm;
>   struct pblk_line_mgmt *l_mg = >l_mg;
>   struct list_head *move_list = NULL;
> - int vsc = le32_to_cpu(*line->vsc);
> + int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data)
> + * (pblk->min_write_pgs - pblk->min_write_pgs_data);
> + int vsc = le32_to_cpu(*line->vsc) + packed_meta;
> 
>   lockdep_assert_held(>lock);
> 
> @@ -540,12 +542,15 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void 
> *data,
> }
> 
> int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
> -unsigned long secs_to_flush)
> +unsigned long secs_to_flush, bool skip_meta)
> {
>   int max = pblk->sec_per_write;
>   int min = pblk->min_write_pgs;
>   int secs_to_sync = 0;
> 
> + if (skip_meta)
> + min = max = pblk->min_write_pgs_data;
> +
>   if (secs_avail >= max)
>   secs_to_sync = max;
>   else if (secs_avail >= min)
> @@ -663,7 +668,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
> struct pblk_line *line,
> next_rq:
>   memset(, 0, sizeof(struct nvm_rq));
> 
> - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
> + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
>   rq_len = rq_ppas * geo->csecs;
> 
>   bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
> @@ -2091,3 +2096,42 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct 
> ppa_addr *ppas,
>   }
>   spin_unlock(>trans_lock);
> }
> +
> +void pblk_set_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
> +{
> + void *meta_list = rqd->meta_list;
> + void *page;
> + int i = 0;
> +
> + if (pblk_is_oob_meta_supported(pblk))
> + return;
> +
> + /* We need to zero out metadata corresponding to packed meta page */
> + pblk_get_meta_at(pblk, meta_list, rqd->nr_ppas - 1)->lba = ADDR_EMPTY;
> +
> + page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
> + /* We need to fill last page of request (packed metadata)
> +  * with data from oob meta buffer.
> +  */
> + for (; i < rqd->nr_ppas; i++)
> + memcpy(page + (i * sizeof(struct pblk_sec_meta)),
> + pblk_get_meta_at(pblk, meta_list, i),
> + sizeof(struct pblk_sec_meta));
> +}
> +
> +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
> +{
> + void *meta_list = rqd->meta_list;
> + void *page;
> + int i = 0;
> +
> + if (pblk_is_oob_meta_supported(pblk))
> + return;
> +
> + page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
> + /* We need to fill oob meta buffer with data from packed metadata */
> + for (; i < rqd->nr_ppas; i++)
> + memcpy(pblk_get_meta_at(pblk, meta_list, i),
> + page + (i * sizeof(struct pblk_sec_meta)),
> + sizeof(struct pblk_sec_meta));
> +}
> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> index f05112230a52..5eb641da46ed 100644
> --- a/drivers/lightnvm/pblk-init.c
> +++ b/drivers/lightnvm/pblk-init.c
> @@ -372,8 +372,40 @@ static int pblk_core_init(struct pblk *pblk)
>   pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
>   max_write_ppas = pblk->min_write_pgs * geo->all_luns;
>   pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
> + pblk->min_write_pgs_data = 

Re: [PATCH 0/5] lightnvm: More flexible approach to metadata

2018-06-19 Thread Javier Gonzalez

> On 16 Jun 2018, at 00.27, Igor Konopko  wrote:
> 
> This series of patches introduce some more flexibility in pblk
> related to OOB meta:
> -ability to use different sizes of metadata (previously fixed 16b)
> -ability to use pblk on drives without metadata
> -ensuring that extended (interleaved) metadata is not in use
> 
> I belive that most of this patches, maybe except of number 4 (Support
> for packed metadata) are rather simple, so waiting for comments
> especially about this one.
> 
> Igor Konopko (5):
>  lightnvm: pblk: Helpers for OOB metadata
>  lightnvm: pblk: Remove resv field for sec meta
>  lightnvm: Flexible DMA pool entry size
>  lightnvm: pblk: Support for packed metadata in pblk.
>  lightnvm: pblk: Disable interleaved metadata in pblk
> 
> drivers/lightnvm/core.c  | 33 ++-
> drivers/lightnvm/pblk-core.c | 86 +++-
> drivers/lightnvm/pblk-init.c | 52 +++-
> drivers/lightnvm/pblk-map.c  | 21 ++
> drivers/lightnvm/pblk-rb.c   |  3 ++
> drivers/lightnvm/pblk-read.c | 85 +--
> drivers/lightnvm/pblk-recovery.c | 67 +--
> drivers/lightnvm/pblk-sysfs.c|  7 
> drivers/lightnvm/pblk-write.c| 22 ++
> drivers/lightnvm/pblk.h  | 46 +++--
> drivers/nvme/host/lightnvm.c |  7 +++-
> include/linux/lightnvm.h |  9 +++--
> 12 files changed, 333 insertions(+), 105 deletions(-)
> 
> --
> 2.14.3

I get a number of errors when running the series. A simple bisect points
to the first patch being the one introducing the (first) regression.
Here you have the trace attached. I could easily reproduce it mounting
ext4 and running a RocksDB's db_bench.

[   80.302731] Workqueue: pblk-read-end-wq pblk_line_put_ws
[   80.302733] RIP: 0010:__pblk_line_put+0xc3/0xd0
[   80.302733] Code: 89 55 70 48 89 4b 20 48 89 43 28 48 89 10 83 45 64 01 c6 
07 00 0f 1f 40 00 48 89 de 4c 89 ef 5b 5d 41 5c 41 5d e9 5d a5 00 00 <0f> 0b e9 
60 ff ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 53 48 89
[   80.302755] RSP: 0018:b17102733e40 EFLAGS: 00010293
[   80.302756] RAX:  RBX: 8d34529003e8 RCX: 8d3467068020
[   80.302757] RDX: 0001 RSI: 8d34529003e8 RDI: 8d34529004a8
[   80.302758] RBP: 8d34622ae800 R08:  R09: 8080808080808080
[   80.302758] R10: 0018 R11: fefefefefefefeff R12: 8d34529004a8
[   80.302759] R13:  R14: 8d346532c600 R15: 08d345f59990
[   80.302760] FS:  () GS:8d34778c() 
knlGS:
[   80.302761] CS:  0010 DS:  ES:  CR0: 80050033
[   80.302762] CR2: 7f4d37e0 CR3: 00020780a002 CR4: 003606e0
[   80.302763] DR0:  DR1:  DR2: 
[   80.302764] DR3:  DR6: fffe0ff0 DR7: 0400
[   80.302764] Call Trace:
[   80.302768]  pblk_line_put_ws+0x1a/0x30
[   80.302771]  process_one_work+0x15e/0x3d0
[   80.302773]  worker_thread+0x4c/0x440
[   80.302774]  kthread+0xf8/0x130
[   80.302776]  ? rescuer_thread+0x350/0x350
[   80.302777]  ? kthread_associate_blkcg+0x90/0x90
[   80.302779]  ret_from_fork+0x35/0x40
[   80.302781] ---[ end trace c4ab4ef1527265f6 ]---
[   81.551907] WARNING: CPU: 6 PID: 5045 at drivers/lightnvm/pblk-core.c:162 
__pblk_map_invalidate+0x10b/0x130
[   81.551908] Modules linked in:
[   81.551910] CPU: 6 PID: 5045 Comm: rocksdb:bg0 Tainted: GW 
4.17.0--00884b2fb689 #2569
[   81.551911] Hardware name: Supermicro Super Server/X11SSH-F, BIOS 2.1 
12/11/2017
[   81.551912] RIP: 0010:__pblk_map_invalidate+0x10b/0x130
[   81.551912] Code: 48 89 de 4c 89 e7 e8 f4 fd ff ff 49 89 c5 e9 62 ff ff ff 
48 c7 c7 ec 8e 5e a4 c6 05 50 65 10 01 01 e8 29 eb 88 ff 0f 0b eb c5 <0f> 0b e9 
1b ff ff ff c6 07 00 0f 1f 40 00 4c 89 e7 c6 07 00 0f 1f
[   81.551927] RSP: 0018:b17106d23808 EFLAGS: 00010246
[   81.551928] RAX:  RBX: 8d34529003e8 RCX: 0300
[   81.551929] RDX: 0001 RSI: 8d34529003e8 RDI: 8d34529004a8
[   81.551929] RBP: 8d34529004a8 R08: 0018 R09: 8d345d6e96d0
[   81.551930] R10: b17106d237c8 R11: 0040 R12: 8d34622ae800
[   81.551930] R13: 0b40 R14: 8d34622aec10 R15: b1710615d7c0
[   81.551931] FS:  7f4d6707e700() GS:8d347798() 
knlGS:
[   81.551932] CS:  0010 DS:  ES:  CR0: 80050033
[   81.551932] CR2: 7f4d57e0 CR3: 00040c204006 CR4: 003606e0
[   81.551933] DR0:  DR1:  DR2: 
[   81.551933] DR3:  DR6: fffe0ff0 DR7: 0400
[   81.551934] Call Trace:
[   81.551937]  pblk_update_map_dev+0x69/0x370
[   81.551938]  __pblk_rb_update_l2p+0x52/0x160
[   81.551939]  

Re: [PATCH] bdi: Fix another oops in wb_workfn()

2018-06-19 Thread Jan Kara
On Mon 18-06-18 23:38:12, Tetsuo Handa wrote:
> On 2018/06/18 22:46, Jan Kara wrote:
> > syzbot is reporting NULL pointer dereference at wb_workfn() [1] due to
> 
> [1] 
> https://syzkaller.appspot.com/bug?id=e0818ccb7e46190b3f1038b0c794299208ed4206
> 
> line is missing.
> 
> > wb->bdi->dev being NULL. And Dmitry confirmed that wb->state was
> > WB_shutting_down after wb->bdi->dev became NULL. This indicates that
> > unregister_bdi() failed to call wb_shutdown() on one of wb objects.
> > 
> > The problem is in cgwb_bdi_unregister() which does cgwb_kill() and thus
> > drops bdi's reference to wb structures before going through the list of
> > wbs again and calling wb_shutdown() on each of them. This way the loop
> > iterating through all wbs can easily miss a wb if that wb has already
> > passed through cgwb_remove_from_bdi_list() called from wb_shutdown()
> > from cgwb_release_workfn() and as a result fully shutdown bdi although
> > wb_workfn() for this wb structure is still running. In fact there are
> > also other ways cgwb_bdi_unregister() can race with
> > cgwb_release_workfn() leading e.g. to use-after-free issues:
> > 
> > CPU1CPU2
> > cgwb_bdi_unregister()
> >   cgwb_kill(*slot);
> > 
> > cgwb_release()
> >   queue_work(cgwb_release_wq, >release_work);
> > cgwb_release_workfn()
> >   wb = list_first_entry(>wb_list, ...)
> >   spin_unlock_irq(_lock);
> >   wb_shutdown(wb);
> >   ...
> >   kfree_rcu(wb, rcu);
> >   wb_shutdown(wb); -> oops use-after-free
> > 
> > We solve these issues by synchronizing writeback structure shutdown from
> > cgwb_bdi_unregister() with cgwb_release_workfn() using a new mutex. That
> > way we also no longer need synchronization using WB_shutting_down as the
> > mutex provides it for CONFIG_CGROUP_WRITEBACK case and without
> > CONFIG_CGROUP_WRITEBACK wb_shutdown() can be called only once from
> > bdi_unregister().
> 
> Wow, this patch removes WB_shutting_down.

Yes.

> A bit of worry for me is how long will this mutex_lock() sleep, for
> if there are a lot of wb objects to shutdown, sequentially doing
> wb_shutdown() might block someone's mutex_lock() for longer than
> khungtaskd's timeout period (typically 120 seconds) ?

That's a good question but since the bdi is going away in this case I
don't think the flusher work should take long to complete - the device is
removed from the system at this point so it won't do any IO.

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH 5/5] lightnvm: pblk: Disable interleaved metadata in pblk

2018-06-19 Thread Matias Bjørling
On Mon, Jun 18, 2018 at 4:29 PM, Javier Gonzalez  wrote:
>> On 16 Jun 2018, at 21.38, Matias Bjørling  wrote:
>>
>> On 06/16/2018 12:27 AM, Igor Konopko wrote:
>>> Currently pblk and lightnvm does only check for size
>>> of OOB metadata and does not care wheather this meta
>>> is located in separate buffer or is interleaved with
>>> data in single buffer.
>>> In reality only the first scenario is supported, where
>>> second mode will break pblk functionality during any
>>> IO operation.
>>> The goal of this patch is to block creation of pblk
>>> devices in case of interleaved metadata
>>> Signed-off-by: Igor Konopko 
>>> ---
>>>  drivers/lightnvm/pblk-init.c | 6 ++
>>>  drivers/nvme/host/lightnvm.c | 1 +
>>>  include/linux/lightnvm.h | 1 +
>>>  3 files changed, 8 insertions(+)
>>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>>> index 5eb641da46ed..483a6d479e7d 100644
>>> --- a/drivers/lightnvm/pblk-init.c
>>> +++ b/drivers/lightnvm/pblk-init.c
>>> @@ -1238,6 +1238,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, 
>>> struct gendisk *tdisk,
>>>  return ERR_PTR(-EINVAL);
>>>  }
>>>  +   if (geo->ext) {
>>> +pr_err("pblk: extended (interleaved) metadata in data buffer"
>>> +" not supported\n");
>>> +return ERR_PTR(-EINVAL);
>>> +}
>>> +
>>>  pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
>>>  if (!pblk)
>>>  return ERR_PTR(-ENOMEM);
>>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
>>> index 670478abc754..872ab854ccf5 100644
>>> --- a/drivers/nvme/host/lightnvm.c
>>> +++ b/drivers/nvme/host/lightnvm.c
>>> @@ -979,6 +979,7 @@ void nvme_nvm_update_nvm_info(struct nvme_ns *ns)
>>>  geo->csecs = 1 << ns->lba_shift;
>>>  geo->sos = ns->ms;
>>> +geo->ext = ns->ext;
>>>  }
>>>int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
>>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
>>> index 72a55d71917e..b13e64e2112f 100644
>>> --- a/include/linux/lightnvm.h
>>> +++ b/include/linux/lightnvm.h
>>> @@ -350,6 +350,7 @@ struct nvm_geo {
>>>  u32 clba;   /* sectors per chunk */
>>>  u16 csecs;  /* sector size */
>>>  u16 sos;/* out-of-band area size */
>>> +u16 ext;/* metadata in extended data buffer */
>>>  /* device write constrains */
>>>  u32 ws_min; /* minimum write size */
>>
>> I think bool type would be better here. Can it be placesd a bit down, just 
>> over the 1.2 stuff?
>>
>> Also, feel free to fix up the checkpatch stuff in patch 1 & 3 & 5.
>
> Apart from Matias' comments, it looks good to me.
>
> Traditionally, we have separated subsystem and target patches to make
> sure there is no coupling between pblk and lightnvm, but if Matias is ok
> with starting having patches covering all at once, then good for me too.
>

I often object when a patch can logically be split into two and should
be two distinct parts. In this case, it fits together.


Re: [PATCH 1/5] lightnvm: pblk: Helpers for OOB metadata

2018-06-19 Thread Javier Gonzalez
> 
> On 18 Jun 2018, at 22.53, Igor Konopko  wrote:
> 
> 
> 
> On 18.06.2018 07:23, Javier Gonzalez wrote:
>>> On 16 Jun 2018, at 00.27, Igor Konopko  wrote:
>>> 
>>> Currently pblk assumes that size of OOB metadata on drive is always
>>> equal to size of pblk_sec_meta struct. This commit add helpers which will
>>> allow to handle different sizes of OOB metadata on drive.
>>> 
>>> Signed-off-by: Igor Konopko 
>>> ---
>>> drivers/lightnvm/pblk-core.c | 10 +
>>> drivers/lightnvm/pblk-map.c  | 21 ---
>>> drivers/lightnvm/pblk-read.c | 45 
>>> +---
>>> drivers/lightnvm/pblk-recovery.c | 24 -
>>> drivers/lightnvm/pblk.h  | 29 ++
>>> 5 files changed, 91 insertions(+), 38 deletions(-)
>>> 
>>> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
>>> index 66ab1036f2fb..8a0ac466872f 100644
>>> --- a/drivers/lightnvm/pblk-core.c
>>> +++ b/drivers/lightnvm/pblk-core.c
>>> @@ -685,7 +685,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
>>> struct pblk_line *line,
>>> rqd.nr_ppas = rq_ppas;
>>> 
>>> if (dir == PBLK_WRITE) {
>>> -   struct pblk_sec_meta *meta_list = rqd.meta_list;
>>> +   void *meta_list = rqd.meta_list;
>>> 
>>> rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
>>> for (i = 0; i < rqd.nr_ppas; ) {
>>> @@ -693,7 +693,8 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
>>> struct pblk_line *line,
>>> paddr = __pblk_alloc_page(pblk, line, min);
>>> spin_unlock(>lock);
>>> for (j = 0; j < min; j++, i++, paddr++) {
>>> -   meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
>>> +   pblk_get_meta_at(pblk, meta_list, i)->lba =
>>> +   cpu_to_le64(ADDR_EMPTY);
>>> rqd.ppa_list[i] =
>>> addr_to_gen_ppa(pblk, paddr, id);
>>> }
>>> @@ -825,14 +826,15 @@ static int pblk_line_submit_smeta_io(struct pblk 
>>> *pblk, struct pblk_line *line,
>>> rqd.nr_ppas = lm->smeta_sec;
>>> 
>>> for (i = 0; i < lm->smeta_sec; i++, paddr++) {
>>> -   struct pblk_sec_meta *meta_list = rqd.meta_list;
>>> +   void *meta_list = rqd.meta_list;
>>> 
>>> rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
>>> 
>>> if (dir == PBLK_WRITE) {
>>> __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
>>> 
>>> -   meta_list[i].lba = lba_list[paddr] = addr_empty;
>>> +   pblk_get_meta_at(pblk, meta_list, i)->lba =
>>> +   lba_list[paddr] = addr_empty;
>>> }
>>> }
>>> 
>>> diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
>>> index 953ca31dda68..92c40b546c4e 100644
>>> --- a/drivers/lightnvm/pblk-map.c
>>> +++ b/drivers/lightnvm/pblk-map.c
>>> @@ -21,7 +21,7 @@
>>> static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
>>>   struct ppa_addr *ppa_list,
>>>   unsigned long *lun_bitmap,
>>> - struct pblk_sec_meta *meta_list,
>>> + void *meta_list,
>>>   unsigned int valid_secs)
>>> {
>>> struct pblk_line *line = pblk_line_get_data(pblk);
>>> @@ -67,14 +67,17 @@ static int pblk_map_page_data(struct pblk *pblk, 
>>> unsigned int sentry,
>>> kref_get(>ref);
>>> w_ctx = pblk_rb_w_ctx(>rwb, sentry + i);
>>> w_ctx->ppa = ppa_list[i];
>>> -   meta_list[i].lba = cpu_to_le64(w_ctx->lba);
>>> +   pblk_get_meta_at(pblk, meta_list, i)->lba =
>>> +   cpu_to_le64(w_ctx->lba);
>>> lba_list[paddr] = cpu_to_le64(w_ctx->lba);
>>> if (lba_list[paddr] != addr_empty)
>>> line->nr_valid_lbas++;
>>> else
>>> atomic64_inc(>pad_wa);
>>> } else {
>>> -   lba_list[paddr] = meta_list[i].lba = addr_empty;
>>> +   lba_list[paddr] =
>>> +   pblk_get_meta_at(pblk, meta_list, i)->lba =
>>> +   addr_empty;
>>> __pblk_map_invalidate(pblk, line, paddr);
>>> }
>>> }
>>> @@ -87,7 +90,7 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, 
>>> unsigned int sentry,
>>>  unsigned long *lun_bitmap, unsigned int valid_secs,
>>>  unsigned int off)
>>> {
>>> -   struct pblk_sec_meta *meta_list = rqd->meta_list;
>>> +   void *meta_list = rqd->meta_list;
>>> unsigned int map_secs;
>>> int min = pblk->min_write_pgs;
>>> int i;
>>> @@ -95,7 +98,9 @@ void