[PATCH v2 3/8] block: sed-opal: unify cmd start and finalize

2018-03-13 Thread Jonas Rabenstein
Every step starts by resetting the cmd buffer as well as the comid and
then constructs the appropriate OPAL_CALL command. Consequently, those
actions can be combined into one generic function.

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c | 250 ---
 1 file changed, 70 insertions(+), 180 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index a228a13f0a08..3bf685884fbf 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -659,6 +659,9 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
struct opal_header *hdr;
int err = 0;
 
+   /* close the parameter list opened from start_opal_cmd */
+   add_token_u8(&err, cmd, OPAL_ENDLIST);
+
add_token_u8(&err, cmd, OPAL_ENDOFDATA);
add_token_u8(&err, cmd, OPAL_STARTLIST);
add_token_u8(&err, cmd, 0);
@@ -1001,6 +1004,26 @@ static void clear_opal_cmd(struct opal_dev *dev)
memset(dev->cmd, 0, IO_BUFFER_LENGTH);
 }
 
+static int start_opal_cmd(struct opal_dev *dev, const u8 *uid, const u8 *method)
+{
+   int err = 0;
+
+   clear_opal_cmd(dev);
+   set_comid(dev, dev->comid);
+
+   add_token_u8(&err, dev, OPAL_CALL);
+   add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+   add_token_bytestring(&err, dev, method, OPAL_METHOD_LENGTH);
+
+   /* every method call is followed by its parameters enclosed within
+* OPAL_STARTLIST and OPAL_ENDLIST tokens. We automatically open the
+* parameter list here and close it later in cmd_finalize
+*/
+   add_token_u8(&err, dev, OPAL_STARTLIST);
+
+   return err;
+}
+
 static int start_opal_session_cont(struct opal_dev *dev)
 {
u32 hsn, tsn;
@@ -1063,21 +1086,13 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
 static int gen_key(struct opal_dev *dev, void *data)
 {
u8 uid[OPAL_UID_LENGTH];
-   int err = 0;
-
-   clear_opal_cmd(dev);
-   set_comid(dev, dev->comid);
+   int err;
 
memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len));
kfree(dev->prev_data);
dev->prev_data = NULL;
 
-   add_token_u8(&err, dev, OPAL_CALL);
-   add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
-   add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY],
-OPAL_UID_LENGTH);
-   add_token_u8(&err, dev, OPAL_STARTLIST);
-   add_token_u8(&err, dev, OPAL_ENDLIST);
+   err = start_opal_cmd(dev, uid, opalmethod[OPAL_GENKEY]);
 
if (err) {
pr_debug("Error building gen key command\n");
@@ -1115,21 +1130,14 @@ static int get_active_key_cont(struct opal_dev *dev)
 static int get_active_key(struct opal_dev *dev, void *data)
 {
u8 uid[OPAL_UID_LENGTH];
-   int err = 0;
+   int err;
u8 *lr = data;
 
-   clear_opal_cmd(dev);
-   set_comid(dev, dev->comid);
-
err = build_locking_range(uid, sizeof(uid), *lr);
if (err)
return err;
 
-   err = 0;
-   add_token_u8(&err, dev, OPAL_CALL);
-   add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
-   add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
-   add_token_u8(&err, dev, OPAL_STARTLIST);
+   err = start_opal_cmd(dev, uid, opalmethod[OPAL_GET]);
add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, 3); /* startCloumn */
@@ -1140,7 +1148,6 @@ static int get_active_key(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, 10); /* ActiveKey */
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_ENDLIST);
-   add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
pr_debug("Error building get active key command\n");
return err;
@@ -1153,13 +1160,10 @@ static int generic_lr_enable_disable(struct opal_dev *dev,
 u8 *uid, bool rle, bool wle,
 bool rl, bool wl)
 {
-   int err = 0;
+   int err;
 
-   add_token_u8(&err, dev, OPAL_CALL);
-   add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
-   add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+   err = start_opal_cmd(dev, uid, opalmethod[OPAL_SET]);
 
-   add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_VALUES);
add_token_u8(&err, dev, OPAL_STARTLIST);
@@ -1186,7 +1190,6 @@ static int generic_lr_enable_disable(struct opal_dev *dev,
 
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
-   add_token_u8(&err, dev, OPAL_ENDLIST);
return err;
 }
 
@@ -1207,10 +1210,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
u8 uid[OPAL_UID_LENGTH];
struct opal_user_lr_
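
After this change every command builder follows the same pattern: start_opal_cmd() opens the parameter list and cmd_finalize() closes it, so the builders only emit the parameters themselves. A minimal sketch of that contract, using a hypothetical GET command (helper names as introduced by this series):

static int build_example_cmd(struct opal_dev *dev, const u8 *uid)
{
	int err;

	/* resets the buffer, sets the comid and opens the parameter list */
	err = start_opal_cmd(dev, uid, opalmethod[OPAL_GET]);

	/* command specific parameters only, no outer list tokens needed */
	add_token_u8(&err, dev, OPAL_STARTLIST);
	add_token_u8(&err, dev, OPAL_ENDLIST);
	if (err)
		return err;

	/* cmd_finalize() closes the parameter list before sending */
	return finalize_and_send(dev, parse_and_check_status);
}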

[PATCH v5] blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()

2018-03-13 Thread Joseph Qi
We've triggered a WARNING in blk_throtl_bio() when throttling writeback
io: it complains that blkg->refcnt is already 0 when calling blkg_get(),
and the kernel then crashes with an invalid page request.
After investigating this issue, we found it is caused by a race between
blkcg_bio_issue_check() and cgroup_rmdir(), which is described below:

writeback kworker   cgroup_rmdir
  cgroup_destroy_locked
kill_css
  css_killed_ref_fn
css_killed_work_fn
  offline_css
blkcg_css_offline
  blkcg_bio_issue_check
rcu_read_lock
blkg_lookup
  spin_trylock(q->queue_lock)
  blkg_destroy
  spin_unlock(q->queue_lock)
blk_throtl_bio
spin_lock_irq(q->queue_lock)
...
spin_unlock_irq(q->queue_lock)
  rcu_read_unlock

Since RCU only prevents the blkg from being released while it is in
use, blkg->refcnt can drop to 0 during blkg_destroy(), which schedules
the blkg release.
The subsequent blkg_get() in blk_throtl_bio() then triggers the
WARNING, and the corresponding blkg_put() schedules the blkg release
again, which results in a double free.
This race was introduced by commit ae1188963611 ("blkcg: consolidate
blkg creation in blkcg_bio_issue_check()"). Before that commit, the
code looked up the blkg first and then tried to look it up or create it
again under queue_lock. Since reviving that logic would be a bit
drastic, fix the race by only offlining the pd during
blkcg_css_offline() and moving the rest of the destruction (especially
blkg_put()) into blkcg_css_free(), which should be the right way as
discussed.

Fixes: ae1188963611 ("blkcg: consolidate blkg creation in blkcg_bio_issue_check()")
Reported-by: Jiufei Xue 
Cc: sta...@vger.kernel.org #4.3+
Signed-off-by: Joseph Qi 
---
 block/blk-cgroup.c | 78 --
 include/linux/blk-cgroup.h |  1 +
 2 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c2033a2..92112f4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -307,11 +307,28 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
}
 }
 
+static void blkg_pd_offline(struct blkcg_gq *blkg)
+{
+   int i;
+
+   lockdep_assert_held(blkg->q->queue_lock);
+   lockdep_assert_held(&blkg->blkcg->lock);
+
+   for (i = 0; i < BLKCG_MAX_POLS; i++) {
+   struct blkcg_policy *pol = blkcg_policy[i];
+
+   if (blkg->pd[i] && !blkg->pd[i]->offlined &&
+   pol->pd_offline_fn) {
+   pol->pd_offline_fn(blkg->pd[i]);
+   blkg->pd[i]->offlined = true;
+   }
+   }
+}
+
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
struct blkcg *blkcg = blkg->blkcg;
struct blkcg_gq *parent = blkg->parent;
-   int i;
 
lockdep_assert_held(blkg->q->queue_lock);
lockdep_assert_held(&blkcg->lock);
@@ -320,13 +337,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
WARN_ON_ONCE(list_empty(&blkg->q_node));
WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 
-   for (i = 0; i < BLKCG_MAX_POLS; i++) {
-   struct blkcg_policy *pol = blkcg_policy[i];
-
-   if (blkg->pd[i] && pol->pd_offline_fn)
-   pol->pd_offline_fn(blkg->pd[i]);
-   }
-
if (parent) {
blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
@@ -369,6 +379,7 @@ static void blkg_destroy_all(struct request_queue *q)
struct blkcg *blkcg = blkg->blkcg;
 
spin_lock(&blkcg->lock);
+   blkg_pd_offline(blkg);
blkg_destroy(blkg);
spin_unlock(&blkcg->lock);
}
@@ -995,25 +1006,25 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
  * @css: css of interest
  *
  * This function is called when @css is about to go away and responsible
- * for shooting down all blkgs associated with @css.  blkgs should be
- * removed while holding both q and blkcg locks.  As blkcg lock is nested
- * inside q lock, this function performs reverse double lock dancing.
+ * for offlining the pds of all blkgs and killing all wbs associated with
+ * @css. The blkg pd offline should be done while holding both the q and
+ * blkcg locks. As the blkcg lock is nested inside the q lock, this
+ * function performs reverse double lock dancing.
  *
  * This is the blkcg counterpart of ioc_release_fn().
  */
 static void blkcg_css_offline(struct cgroup_subsys_state *css)
 {
struct blkcg *blkcg = css_to_blkcg(css);
+   struct blkcg_gq *blkg;
 
spin_lock_irq(&blkcg->lock);
 
-   while 
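
A rough sketch, not the posted code, of the destruction loop once it moves into blkcg_css_free(): the same reverse double-lock dance applies, because the blkcg lock nests inside the q lock and has to be dropped whenever the trylock fails:

static void blkcg_css_free_sketch(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);

	spin_lock_irq(&blkcg->lock);
	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq,
						    blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			/* final teardown, including the blkg_put() moved here */
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			/* blkcg lock nests inside q lock: back off and retry */
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}
	spin_unlock_irq(&blkcg->lock);
}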

[PATCH v2 8.1/8.4] block: sed-opal: ioctl for writing to shadow mbr

2018-03-13 Thread Jonas Rabenstein
Allow modification of the shadow mbr. If the shadow mbr is not marked
as done, its data is presented read-only as the device content. Only
after the shadow mbr is marked as done and a locking range is unlocked
does the actual device content become accessible.

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c  | 76 +++
 include/linux/sed-opal.h  |  1 +
 include/uapi/linux/sed-opal.h |  8 +
 3 files changed, 85 insertions(+)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index b201c96d23a3..aa49fcc30462 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1491,6 +1491,54 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
 }
 
+static int write_shadow_mbr(struct opal_dev *dev, void *data)
+{
+   struct opal_shadow_mbr *shadow = data;
+   const u8 __user *src;
+   u8 *dst;
+   size_t off;
+   u64 len;
+   int err = 0;
+
+	/* FIXME: this is the maximum we can use for IO_BUFFER_LENGTH=2048.
+	 * Instead of having a constant, it would be nice to compute the
+	 * actual value depending on IO_BUFFER_LENGTH
+	 */
+   len = 1950;
+
+   /* do the actual transmission(s) */
+   src = (u8 *) shadow->data;
+   for (off = 0 ; off < shadow->size; off += len) {
+   len = min(len, shadow->size - off);
+
+   pr_debug("MBR: write bytes %zu+%llu/%llu\n",
+off, len, shadow->size);
+   err = start_opal_cmd(dev, opaluid[OPAL_MBR],
+opalmethod[OPAL_SET]);
+   add_token_u8(&err, dev, OPAL_STARTNAME);
+   add_token_u8(&err, dev, OPAL_WHERE);
+   add_token_u64(&err, dev, shadow->offset + off);
+   add_token_u8(&err, dev, OPAL_ENDNAME);
+
+   add_token_u8(&err, dev, OPAL_STARTNAME);
+   add_token_u8(&err, dev, OPAL_VALUES);
+   dst = add_bytestring_header(&err, dev, len);
+   if (!dst)
+   break;
+   if (copy_from_user(dst, src + off, len))
+   err = -EFAULT;
+
+   add_token_u8(&err, dev, OPAL_ENDNAME);
+   if (err)
+   break;
+
+   err = finalize_and_send(dev, parse_and_check_status);
+   if (err)
+   break;
+   }
+   return err;
+}
+
 static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
  struct opal_dev *dev)
 {
@@ -2036,6 +2084,31 @@ static int opal_mbr_status(struct opal_dev *dev, struct opal_mbr_data *opal_mbr)
return ret;
 }
 
+static int opal_write_shadow_mbr(struct opal_dev *dev,
+struct opal_shadow_mbr *info)
+{
+   const struct opal_step mbr_steps[] = {
+   { opal_discovery0, },
+   { start_admin1LSP_opal_session, &info->key },
+   { write_shadow_mbr, info },
+   { end_opal_session, },
+   { NULL, }
+   };
+   int ret;
+
+   if (info->size == 0)
+   return 0;
+
+   if (!access_ok(VERIFY_READ, info->data, info->size))
+   return -EINVAL;
+
+   mutex_lock(&dev->dev_lock);
+   setup_opal_dev(dev, mbr_steps);
+   ret = next(dev);
+   mutex_unlock(&dev->dev_lock);
+   return ret;
+}
+
 static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
 {
struct opal_suspend_data *suspend;
@@ -2368,6 +2441,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_MBR_STATUS:
ret = opal_mbr_status(dev, p);
break;
+   case IOC_OPAL_WRITE_SHADOW_MBR:
+   ret = opal_write_shadow_mbr(dev, p);
+   break;
case IOC_OPAL_ERASE_LR:
ret = opal_erase_locking_range(dev, p);
break;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index b38dc602cae3..cf08cdc13cbd 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -47,6 +47,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
case IOC_OPAL_ENABLE_DISABLE_MBR:
case IOC_OPAL_ERASE_LR:
case IOC_OPAL_SECURE_ERASE_LR:
+   case IOC_OPAL_WRITE_SHADOW_MBR:
case IOC_OPAL_MBR_STATUS:
return true;
}
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 0cb9890cdc04..ab4b69778bd0 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -104,6 +104,13 @@ struct opal_mbr_data {
__u8 __align[7];
 };
 
+struct opal_shadow_mbr {
+   struct opal_key key;
+   const __u64 data;
+   __u64 offset;
+   __u64 size;
+};
+
 #define IOC_OPAL_SAVE  _IOW('p', 220, struct opal_lock_unlock)
 #define IOC_OPAL_LOCK_UNLOCK   _IOW('p', 221, s
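
For illustration, a minimal userspace sketch of driving the new ioctl. The helper below is hypothetical (device opening, password length checks and error handling omitted); struct opal_shadow_mbr and IOC_OPAL_WRITE_SHADOW_MBR are the ones added above:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

/* write `size` bytes from `buf` to the shadow mbr at `offset`,
 * authenticating with the admin1 password `pw` (sketch only; `pw`
 * must fit into opal_key.key) */
static int shadow_mbr_write(int fd, const char *pw,
			    const void *buf, uint64_t size, uint64_t offset)
{
	struct opal_shadow_mbr mbr = {
		.data   = (uintptr_t)buf,	/* pointer passed as u64 */
		.offset = offset,
		.size   = size,
	};

	mbr.key.key_len = strlen(pw);
	memcpy(mbr.key.key, pw, mbr.key.key_len);

	return ioctl(fd, IOC_OPAL_WRITE_SHADOW_MBR, &mbr);
}

The fd would come from opening the block device itself (e.g. open("/dev/nvme0n1", O_RDWR)).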

[PATCH v2 8.3/8.4] block: sed-opal: get metadata about opal-sed tables

2018-03-13 Thread Jonas Rabenstein
Every opal-sed table is described in the OPAL_TABLE_TABLE. Provide a
function to get the desired information out of that table.

Signed-off-by: Jonas Rabenstein 
---
 block/opal_proto.h | 16 
 block/sed-opal.c   | 25 +
 2 files changed, 41 insertions(+)

diff --git a/block/opal_proto.h b/block/opal_proto.h
index b6e352cfe982..5e8df3245eb0 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -106,6 +106,7 @@ enum opal_uid {
OPAL_ENTERPRISE_BANDMASTER0_UID,
OPAL_ENTERPRISE_ERASEMASTER_UID,
/* tables */
+   OPAL_TABLE_TABLE,
OPAL_LOCKINGRANGE_GLOBAL,
OPAL_LOCKINGRANGE_ACE_RDLOCKED,
OPAL_LOCKINGRANGE_ACE_WRLOCKED,
@@ -160,6 +161,21 @@ enum opal_token {
OPAL_STARTCOLUMN = 0x03,
OPAL_ENDCOLUMN = 0x04,
OPAL_VALUES = 0x01,
+   /* table table */
+   OPAL_TABLE_UID = 0x00,
+   OPAL_TABLE_NAME = 0x01,
+   OPAL_TABLE_COMMON = 0x02,
+   OPAL_TABLE_TEMPLATE = 0x03,
+   OPAL_TABLE_KIND = 0x04,
+   OPAL_TABLE_COLUMN = 0x05,
+   OPAL_TABLE_COLUMNS = 0x06,
+   OPAL_TABLE_ROWS = 0x07,
+   OPAL_TABLE_ROWS_FREE = 0x08,
+   OPAL_TABLE_ROW_BYTES = 0x09,
+   OPAL_TABLE_LASTID = 0x0A,
+   OPAL_TABLE_MIN = 0x0B,
+   OPAL_TABLE_MAX = 0x0C,
+
/* authority table */
OPAL_PIN = 0x03,
/* locking tokens */
diff --git a/block/sed-opal.c b/block/sed-opal.c
index ebee06aaadd6..4d93a6097ec0 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -136,6 +136,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
 
/* tables */
 
+   [OPAL_TABLE_TABLE] =
+   { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 },
[OPAL_LOCKINGRANGE_GLOBAL] =
{ 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 },
[OPAL_LOCKINGRANGE_ACE_RDLOCKED] =
@@ -1106,6 +1108,29 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table,
return finalize_and_send(dev, parse_and_check_status);
 }
 
+/*
+ * see TCG SAS 5.3.2.3 for a description of the available columns
+ *
+ * the result is provided in dev->resp->tok[4]
+ */
+static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table,
+ u64 column)
+{
+   u8 uid[OPAL_UID_LENGTH];
+   const unsigned int half = OPAL_UID_LENGTH/2;
+
+	/* sed-opal UIDs can be split into two halves:
+	 *  first:  actual table index
+	 *  second: relative index in the table
+	 * so we have to get the first half of the OPAL_TABLE_TABLE and use the
+	 * first half of the target table as the relative index into that table
+	 */
+   memcpy(uid, opaluid[OPAL_TABLE_TABLE], half);
+   memcpy(uid+half, opaluid[table], half);
+
+   return generic_get_column(dev, uid, column);
+}
+
 static int gen_key(struct opal_dev *dev, void *data)
 {
u8 uid[OPAL_UID_LENGTH];
-- 
2.16.1
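
As a concrete illustration with the UID bytes quoted above: to query the Table-Table row that describes the global locking range, the composed uid is the first half of OPAL_TABLE_TABLE followed by the first half of OPAL_LOCKINGRANGE_GLOBAL:

	u8 uid[OPAL_UID_LENGTH] = {
		0x00, 0x00, 0x00, 0x01,	/* OPAL_TABLE_TABLE, bytes 0..3 */
		0x00, 0x00, 0x08, 0x02,	/* OPAL_LOCKINGRANGE_GLOBAL, bytes 0..3 */
	};

generic_get_table_info(dev, OPAL_LOCKINGRANGE_GLOBAL, OPAL_TABLE_ROWS) would then issue a GET against this uid.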



[PATCH v2 8.2/8.4] block: sed-opal: unify retrieval of table columns

2018-03-13 Thread Jonas Rabenstein
Instead of having multiple places define the same argument list to get
a specific column of a sed-opal table, provide a generic version and
call it from those functions.

Signed-off-by: Jonas Rabenstein 
---
 block/opal_proto.h |   2 +
 block/sed-opal.c   | 130 +++--
 2 files changed, 49 insertions(+), 83 deletions(-)

diff --git a/block/opal_proto.h b/block/opal_proto.h
index e20be8258854..b6e352cfe982 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -170,6 +170,8 @@ enum opal_token {
OPAL_READLOCKED = 0x07,
OPAL_WRITELOCKED = 0x08,
OPAL_ACTIVEKEY = 0x0A,
+   /* lockingsp table */
+   OPAL_LIFECYCLE = 0x06,
/* locking info table */
OPAL_MAXRANGES = 0x04,
 /* mbr control */
diff --git a/block/sed-opal.c b/block/sed-opal.c
index aa49fcc30462..ebee06aaadd6 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1075,6 +1075,37 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
return opal_send_recv(dev, cont);
 }
 
+/*
+ * request @column from table @table on device @dev. On success, the column
+ * data will be available in dev->resp->tok[4]
+ */
+static int generic_get_column(struct opal_dev *dev, const u8 *table,
+ u64 column)
+{
+   int err;
+
+   err = start_opal_cmd(dev, table, opalmethod[OPAL_GET]);
+
+   add_token_u8(&err, dev, OPAL_STARTLIST);
+
+   add_token_u8(&err, dev, OPAL_STARTNAME);
+   add_token_u8(&err, dev, OPAL_STARTCOLUMN);
+   add_token_u64(&err, dev, column);
+   add_token_u8(&err, dev, OPAL_ENDNAME);
+
+   add_token_u8(&err, dev, OPAL_STARTNAME);
+   add_token_u8(&err, dev, OPAL_ENDCOLUMN);
+   add_token_u64(&err, dev, column);
+   add_token_u8(&err, dev, OPAL_ENDNAME);
+
+   add_token_u8(&err, dev, OPAL_ENDLIST);
+
+   if (err)
+   return err;
+
+   return finalize_and_send(dev, parse_and_check_status);
+}
+
 static int gen_key(struct opal_dev *dev, void *data)
 {
u8 uid[OPAL_UID_LENGTH];
@@ -1129,23 +1160,11 @@ static int get_active_key(struct opal_dev *dev, void *data)
if (err)
return err;
 
-   err = start_opal_cmd(dev, uid, opalmethod[OPAL_GET]);
-   add_token_u8(&err, dev, OPAL_STARTLIST);
-   add_token_u8(&err, dev, OPAL_STARTNAME);
-   add_token_u8(&err, dev, 3); /* startCloumn */
-   add_token_u8(&err, dev, 10); /* ActiveKey */
-   add_token_u8(&err, dev, OPAL_ENDNAME);
-   add_token_u8(&err, dev, OPAL_STARTNAME);
-   add_token_u8(&err, dev, 4); /* endColumn */
-   add_token_u8(&err, dev, 10); /* ActiveKey */
-   add_token_u8(&err, dev, OPAL_ENDNAME);
-   add_token_u8(&err, dev, OPAL_ENDLIST);
-   if (err) {
-   pr_debug("Error building get active key command\n");
+   err = generic_get_column(dev, uid, OPAL_ACTIVEKEY);
+   if (err)
return err;
-   }
 
-   return finalize_and_send(dev, get_active_key_cont);
+   return get_active_key_cont(dev);
 }
 
 static int generic_lr_enable_disable(struct opal_dev *dev,
@@ -1800,14 +1819,16 @@ static int activate_lsp(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
 }
 
-static int get_lsp_lifecycle_cont(struct opal_dev *dev)
+/* Determine if we're in the Manufactured Inactive or Active state */
+static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
 {
u8 lc_status;
-   int error = 0;
+   int err;
 
-   error = parse_and_check_status(dev);
-   if (error)
-   return error;
+   err = generic_get_column(dev, opaluid[OPAL_LOCKINGSP_UID],
+OPAL_LIFECYCLE);
+   if (err)
+   return err;
 
lc_status = response_get_u64(&dev->parsed, 4);
/* 0x08 is Manufacured Inactive */
@@ -1820,49 +1841,19 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
return 0;
 }
 
-/* Determine if we're in the Manufactured Inactive or Active state */
-static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
-{
-   int err;
-
-   err = start_opal_cmd(dev, opaluid[OPAL_LOCKINGSP_UID],
-opalmethod[OPAL_GET]);
-
-   add_token_u8(&err, dev, OPAL_STARTLIST);
-
-   add_token_u8(&err, dev, OPAL_STARTNAME);
-   add_token_u8(&err, dev, 3); /* Start Column */
-   add_token_u8(&err, dev, 6); /* Lifecycle Column */
-   add_token_u8(&err, dev, OPAL_ENDNAME);
-
-   add_token_u8(&err, dev, OPAL_STARTNAME);
-   add_token_u8(&err, dev, 4); /* End Column */
-   add_token_u8(&err, dev, 6); /* Lifecycle Column */
-   add_token_u8(&err, dev, OPAL_ENDNAME);
-
-   add_token_u8(&err, dev, OPAL_ENDLIST);
-
-   if (err) {
-   pr_debug("Error Building GET Lifecycle Status command\n");
-   return err;
-   }
-
-   return finalize_and_send(dev, get_lsp

[PATCH v2 8.4/8.4] block: sed-opal: check size of shadow mbr

2018-03-13 Thread Jonas Rabenstein
Check whether the shadow mbr fits into the space provided by the
target. While a proper firmware should handle this case itself and
return an error, this check may prevent problems with crappy firmwares.

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 4d93a6097ec0..8a08ae91bc25 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1544,6 +1544,20 @@ static int write_shadow_mbr(struct opal_dev *dev, void *data)
u64 len;
int err = 0;
 
+   /* do we fit in the available shadow mbr space? */
+   err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS);
+   if (err) {
+   pr_debug("MBR: could not get shadow size\n");
+   return err;
+   }
+
+   len = response_get_u64(&dev->parsed, 4);
+   if (shadow->offset + shadow->size > len) {
+   pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n",
+shadow->offset + shadow->size, len);
+   return -ENOSPC;
+   }
+
 	/* FIXME: this is the maximum we can use for IO_BUFFER_LENGTH=2048.
 	 * Instead of having a constant, it would be nice to compute the
 	 * actual value depending on IO_BUFFER_LENGTH
-- 
2.16.1
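
Regarding the FIXME above, a sketch of how the chunk size could be derived from IO_BUFFER_LENGTH instead of being hard-coded. The overhead constant is an assumption (packet headers plus the SET tokens around the bytestring payload) and would have to be computed from the real packet layout:

	/* payload bytes per SET command; OPAL_MBR_CMD_OVERHEAD is a
	 * hypothetical constant covering opal_header and the tokens
	 * around the bytestring (98 bytes would reproduce the current
	 * value of 1950 for a 2048-byte buffer) */
	#define OPAL_MBR_CMD_OVERHEAD	98

	static inline size_t mbr_chunk_len(void)
	{
		return IO_BUFFER_LENGTH - OPAL_MBR_CMD_OVERHEAD;
	}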



[PATCH v2 8.0/8.4] block: sed-opal: check size of shadow mbr

2018-03-13 Thread Jonas Rabenstein
Hi,
I managed to extract the usable shadow mbr size out of my 850 Evo's
OPAL_TABLE_TABLE and added an appropriate check to the write function.
As this involves more than just a few lines, I decided to split the v2
of this subpatch into 4 separate patches. I am unsure what would be the
best practice for such a situation but hope it is okay like this.

Also, the pointer exported to userspace is replaced with a u64 value.

Jonas Rabenstein (4):
  block: sed-opal: ioctl for writing to shadow mbr
  block: sed-opal: unify retrieval of table columns
  block: sed-opal: get metadata about opal-sed tables
  block: sed-opal: check size of shadow mbr

 block/opal_proto.h|  18 
 block/sed-opal.c  | 245 --
 include/linux/sed-opal.h  |   1 +
 include/uapi/linux/sed-opal.h |   8 ++
 4 files changed, 189 insertions(+), 83 deletions(-)

-- 
2.16.1



Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Dou Liyang

Hi Artem,

At 03/14/2018 11:29 AM, Dou Liyang wrote:

Hi All,

At 03/13/2018 05:35 PM, Rafael J. Wysocki wrote:
On Tue, Mar 13, 2018 at 9:39 AM, Artem Bityutskiy 
 wrote:

On Tue, 2018-03-13 at 16:35 +0800, Ming Lei wrote:

Then it looks like this issue needs to be fixed by making the possible
CPU count accurate, because there are other resources allocated
according to num_possible_cpus(), such as percpu variables.


Short term the regression should be fixed. It is already v4.16-rc6, we
have little time left.


Right.


Longer term, yeah, I agree. Kernel's notion of possible CPU count
should be realistic.




I did a patch for that. Artem, could you help me test it?



I didn't consider nr_cpu_ids before. Please ignore the old patch and
try the following RFC patch.

Thanks
dou

--->8---

diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 449d86d39965..96d568408515 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -671,6 +671,23 @@ static acpi_status __init acpi_processor_ids_walk(acpi_handle handle,


 }

+static void __init acpi_refill_possible_map(void)
+{
+   unsigned int cpu, nr = 0;
+
+   if (nr_cpu_ids <= nr_unique_ids)
+   return;
+
+   for_each_possible_cpu(cpu) {
+   if (nr >= nr_unique_ids)
+   set_cpu_possible(cpu, false);
+   nr++;
+   }
+
+   nr_cpu_ids = nr_unique_ids;
+   pr_info("Allowing %d possible CPUs\n", nr_cpu_ids);
+}
+
 static void __init acpi_processor_check_duplicates(void)
 {
/* check the correctness for all processors in ACPI namespace */
@@ -680,6 +697,9 @@ static void __init acpi_processor_check_duplicates(void)
NULL, NULL, NULL);
	acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, acpi_processor_ids_walk,
			 NULL, NULL);
+
+   /* make possible CPU count more realistic */
+   acpi_refill_possible_map();
 }

 bool acpi_duplicate_processor_id(int proc_id)
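
To make the percpu point above concrete: percpu storage is instantiated once per possible CPU, so an inflated cpu_possible_mask wastes memory on CPUs that can never be hot-added. A trivial illustration (hypothetical variable):

	/* one copy of this is allocated for every possible CPU, i.e.
	 * nr_cpu_ids copies, whether or not the CPU is ever present */
	static DEFINE_PER_CPU(u64, my_event_count);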





Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Dou Liyang

Hi Rafael,

Thank you so much for your reply.

At 03/13/2018 05:25 PM, Rafael J. Wysocki wrote:

On Tue, Mar 13, 2018 at 4:11 AM, Dou Liyang  wrote:

Hi Thomas,

At 03/09/2018 11:08 PM, Thomas Gleixner wrote:
[...]



I'm not sure if there is a clear indicator whether physical hotplug is
supported or not, but the ACPI folks (x86) and architecture maintainers


+cc Rafael


should be able to answer that question. I have a machine which says:

 smpboot: Allowing 128 CPUs, 96 hotplug CPUs

There is definitely no way to hotplug anything on that machine and sure
the



AFAIK, in ACPI-based dynamic reconfiguration, there is no clear
indicator. In theory, if the ACPI tables have the hotpluggable
CPU resources, the OS can support physical hotplug.


In order for the ACPI-based CPU hotplug (I mean physical, not just the
software offline/online we do in the kernel) to work, there have to be
objects in the ACPI namespace corresponding to all of the processors
in question.

If they are not present, there is no way to signal insertion and eject
the processors safely.


Yes, I see.

Thanks
dou










Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Dou Liyang

Hi All,

At 03/13/2018 05:35 PM, Rafael J. Wysocki wrote:

On Tue, Mar 13, 2018 at 9:39 AM, Artem Bityutskiy  wrote:

On Tue, 2018-03-13 at 16:35 +0800, Ming Lei wrote:

Then it looks like this issue needs to be fixed by making the possible
CPU count accurate, because there are other resources allocated
according to num_possible_cpus(), such as percpu variables.


Short term the regression should be fixed. It is already v4.16-rc6, we
have little time left.


Right.


Longer term, yeah, I agree. Kernel's notion of possible CPU count
should be realistic.




I did a patch for that. Artem, could you help me test it?

--->8---

diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 449d86d39965..878abfa0ce30 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -671,6 +671,18 @@ static acpi_status __init acpi_processor_ids_walk(acpi_handle handle,


 }

+static void __init acpi_refill_possible_map(void)
+{
+   int i;
+
+   reset_cpu_possible_mask();
+
+   for (i = 0; i < nr_unique_ids; i++)
+   set_cpu_possible(i, true);
+
+   pr_info("Allowing %d possible CPUs\n", nr_unique_ids);
+}
+
 static void __init acpi_processor_check_duplicates(void)
 {
/* check the correctness for all processors in ACPI namespace */
@@ -680,6 +692,9 @@ static void __init acpi_processor_check_duplicates(void)
NULL, NULL, NULL);
	acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, acpi_processor_ids_walk,
			 NULL, NULL);
+
+   /* make possible CPU count more realistic */
+   acpi_refill_possible_map();
 }

 bool acpi_duplicate_processor_id(int proc_id)

--

I agree.

Moreover, there are not too many systems where physical CPU hotplug
actually works in practice AFAICS, so IMO we should default to "no
physical CPU hotplug" and only change that default in special cases
(which may be hard to figure out, but that's a different matter).



Yes, I think so.




What platform firmware tells us may be completely off.


Rafael,

Sorry, I am not sure what you mean :-). Did you mean that no platform
firmware can tell us whether physical CPU hotplug is supported or not?

My colleagues also told me that there is no way for the OS to know
whether it is supported or not.

Thanks
dou








Re: [PATCH v3] blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()

2018-03-13 Thread Joseph Qi
Fine, I will do the corresponding changes and post v5.

Thanks,
Joseph

On 18/3/14 04:19, Tejun Heo wrote:
> Hello, Joseph.
> 
> On Mon, Mar 05, 2018 at 11:03:16PM +0800, Joseph Qi wrote:
>> +static void blkg_pd_offline(struct blkcg_gq *blkg)
>> +{
>> +int i;
>> +
>> +lockdep_assert_held(blkg->q->queue_lock);
>> +lockdep_assert_held(&blkg->blkcg->lock);
>> +
>> +for (i = 0; i < BLKCG_MAX_POLS; i++) {
>> +struct blkcg_policy *pol = blkcg_policy[i];
>> +
>> +if (blkg->pd[i] && !blkg->pd_offline[i] && pol->pd_offline_fn) {
>> +pol->pd_offline_fn(blkg->pd[i]);
>> +blkg->pd_offline[i] = true;
> 
> Can we move this flag into blkg_policy_data?
> 
>> +while (!hlist_empty(&blkcg->blkg_list)) {
>> +struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
>> +struct blkcg_gq,
>> +blkcg_node);
>> +struct request_queue *q = blkg->q;
>> +
>> +if (spin_trylock(q->queue_lock)) {
>> +blkg_destroy(blkg);
>> +spin_unlock(q->queue_lock);
>> +} else {
>> +spin_unlock_irq(&blkcg->lock);
>> +cpu_relax();
>> +spin_lock_irq(&blkcg->lock);
>> +}
> 
> Can we factor out the above loop?  It's something subtle and painful
> and I think it'd be better to have it separated out and documented.
> 
> Other than that, looks great.
> 
> Thanks.
> 


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Bjorn Helgaas
On Tue, Mar 13, 2018 at 05:21:20PM -0600, Logan Gunthorpe wrote:
> On 13/03/18 05:08 PM, Bjorn Helgaas wrote:
> > On Tue, Mar 13, 2018 at 10:31:55PM +, Stephen  Bates wrote:
> > If it *is* necessary because Root Ports and devices below them behave
> > differently than in conventional PCI, I think you should include a
> > reference to the relevant section of the spec and check directly for a
> > Root Port.  I would prefer that over trying to exclude Root Ports by
> > looking for two upstream bridges.
> 
> Well we've established that we don't want to allow root ports.

I assume you want to exclude Root Ports because of multi-function
devices and the "route to self" error.  I was hoping for a reference
to that so I could learn more about it.

While I was looking for it, I found sec 6.12.1.2 (PCIe r4.0), "ACS
Functions in SR-IOV Capable and Multi-Function Devices", which seems
relevant.  It talks about "peer-to-peer Requests (between Functions of
the device)".  Thay says to me that multi-function devices can DMA
between themselves.

> But we need to, at a minimum, do two pci_upstream_bridge() calls...
> 
> Remember, we need to check that a number of devices are behind the same
> switch. So we need to find a _common_ upstream port for several devices.

I agree that peers need to have a common upstream bridge.  I think
you're saying peers need to have *two* common upstream bridges.  If I
understand correctly, requiring two common bridges is a way to ensure
that peers directly below Root Ports don't try to DMA to each other.

So I guess the first order of business is to nail down whether peers
below a Root Port are prohibited from DMAing to each other.  My
assumption, based on 6.12.1.2 and the fact that I haven't yet found
a prohibition, is that they can.

> Excluding the multifunction device case (which I don't think is
> applicable for reasons we've discussed before), this will *never* be the
> first upstream port for a given device.

If you're looking for a common upstream bridge, you don't need to make
assumptions about whether the hierarchy is conventional PCI or PCIe or
how many levels are in the hierarchy.

You already have upstream_bridges_match(), which takes two pci_devs.
I think it should walk up the PCI hierarchy from the first device,
checking whether the bridge at each level is also a parent of the
second device.

Bjorn
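
A sketch of the walk described above, assuming pci_upstream_bridge() is the only primitive needed (hypothetical helper, not part of the posted series):

static struct pci_dev *common_upstream_bridge(struct pci_dev *a,
					      struct pci_dev *b)
{
	struct pci_dev *up_a, *up_b;

	/* for each ancestor of a, nearest first, test whether it is
	 * also an ancestor of b */
	for (up_a = pci_upstream_bridge(a); up_a;
	     up_a = pci_upstream_bridge(up_a))
		for (up_b = pci_upstream_bridge(b); up_b;
		     up_b = pci_upstream_bridge(up_b))
			if (up_a == up_b)
				return up_a;

	return NULL;	/* the devices only meet at the root complex */
}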


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe


On 13/03/18 05:19 PM, Sinan Kaya wrote:
> It is still a switch: it can move packets, but maybe it can only move
> data at 100 kbps.

As Stephen pointed out, it's a requirement of the PCIe spec that a
switch supports P2P. If you want to sell a switch that does P2P with bad
performance then that's on you to deal with.

> What prevents that switch from trying P2P and having a bad user experience?

The fact that no one would buy such a switch as it would have a bad user
experience with regular PCI transfers. A P2P TLP is not special on a
switch as it's routed in just the same way as any others. There'd also
be no cost gain in making such a broken-by-design device.

> If everything is so broken, I was suggesting to at least list the switches
> you have tested.
> 
> What's the problem with this?

More complexity for zero gain.

> Why do you want to assume that all switches are good and all root ports are
> bad?

Because the assumption that all switches are good is more than
reasonable and simplifies the code and maintenance significantly. No one
wants to maintain a white list when they don't have to.

> What if the design is so canned that you can't change anything? 

Based on the feedback we've got so far and the developers that have
participated in getting it to where it is, it is not "canned".

> I have been asking things like getting rid of switch search in ACS
> enablement towards achieving generic P2P. You seem to be pushing back.
> You said yourself P2P and isolation doesn't go together at this point
> but you also care about isolation for other devices that are not doing
> P2P.

P2P and isolation will never be compatible at any point. They are two
opposite concepts. So we could just disable isolation across the whole
system and for our purposes that would be fine. But it's relatively
simple to limit this and only disable it behind switches. So why
wouldn't we? It enables use cases like having an isolated card on the
root complex used in a VM while having P2P on cards behind switches. I
personally have no interest in doing this but I also have no reason to
prevent it with my code.

> It is not a requirement for you but it is a requirement for me (ARM64 guy).
> Linux happens to run on multiple architectures. One exception invalidates your
> point.

It is not a requirement of an architecture or people that use a specific
architecture. It is a requirement of the use-case and you have not said
any use case or how we could do better. If you're running VMs that
require isolation you will need to be *very* careful if you also want to
do P2P between cards which requires no isolation. But, based on my
understanding, most people will want to do one or the other -- not both.
If you want to do P2P you enable the P2P config option, if you want
isolation you don't.

> If you are assuming that your kernel option should not be used by general
> distributions like Ubuntu/redhat etc. and requires a kernel compilation,
> creating a dependency to EXPERT is the right way to do. 

I don't have a problem adding EXPERT as a dependency. We can do that for
v4. I'd rather hope that distros actually read and understand the
kconfig help text before enabling an off-by-default option. But maybe
I'm asking too much.

Logan


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe


On 13/03/18 05:08 PM, Bjorn Helgaas wrote:
> On Tue, Mar 13, 2018 at 10:31:55PM +, Stephen  Bates wrote:
> If it *is* necessary because Root Ports and devices below them behave
> differently than in conventional PCI, I think you should include a
> reference to the relevant section of the spec and check directly for a
> Root Port.  I would prefer that over trying to exclude Root Ports by
> looking for two upstream bridges.

Well we've established that we don't want to allow root ports.

But we need to, at a minimum, do two pci_upstream_bridge() calls...

Remember, we need to check that a number of devices are behind the same
switch. So we need to find a _common_ upstream port for several devices.
Excluding the multifunction device case (which I don't think is
applicable for reasons we've discussed before), this will *never* be the
first upstream port for a given device. We need to find the upstream
port of the switch and ensure all devices share that same port. If a
device is behind a switch, it's pci_upstream_bridge() is the downstream
switch port which is unique to that device. So a peer device would have
a different pci_upstream_bridge() port but share the same
pci_upstream_bridge(pci_upstream_bridge()) port (assuming they are on
the same switch).

The alternative, is to walk the entire tree of upstream ports and check
that all devices have a common port somewhere in the tree that isn't the
root complex. Arguably this is the right way to do it and would support
a network of switches but is a fair bit more complex to code.

Logan
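
The two-level check described above reduces to something like this sketch (hypothetical helper; the posted patch generalizes it to a list of devices):

static bool behind_same_switch(struct pci_dev *a, struct pci_dev *b)
{
	struct pci_dev *up_a = pci_upstream_bridge(a);
	struct pci_dev *up_b = pci_upstream_bridge(b);

	if (!up_a || !up_b)
		return false;

	/* the downstream ports differ per device; the switch's
	 * upstream port, two bridges up, is shared by both peers */
	return pci_upstream_bridge(up_a) &&
	       pci_upstream_bridge(up_a) == pci_upstream_bridge(up_b);
}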


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Sinan Kaya
On 3/13/2018 6:48 PM, Logan Gunthorpe wrote:
> 
> 
> On 13/03/18 04:29 PM, Sinan Kaya wrote:
>> If hardware doesn't support it, blacklisting should have been the right
>> path and I still think that you should remove all switch business from the 
>> code.
>> I did not hear enough justification for having a switch requirement
>> for P2P.
> 
> I disagree.
> 
>> You are also saying that root ports have issues not because of functionality 
>> but
>> because of performance. 
> 
> No... performance can also be an issue but the main justification for
> this is that many root ports do not support P2P at all and can fail in
> different ways (usually just dropping the TLPs).
> 
>> What if I come up with a very cheap/crappy switch (something like used in 
>> data
>> mining)?
> 
> Good luck. That's not how hardware is designed. PCIe switches that have
> any hope to compete with the existing market will need features like
> NTB, non-transparent ports, etc... and that implies a certain symmetry
> (ie there isn't a special host port because there may be more than one
> and it may move around) which implies that packets will always be able
> to forward between each ports which implies P2P will work.
> 

It is still a switch: it can move packets, but maybe it can only move
data at 100 kbps.

What prevents that switch from trying P2P and having a bad user experience?

If everything is so broken, I was suggesting to at least list the switches
you have tested.

What's the problem with this?

Why do you want to assume that all switches are good and all root ports are
bad?

>> I have been doing my best to provide feedback. It feels like you are throwing
>> them over the wall to be honest.
>>
>> You keep implying "not my problem".
> 
> Well, the fact of the matter is that extending this in all the ways
> people like you want face huge problems on all sides. These are not
> trivial issues and holding back work that works for our problem because
> it doesn't solve your problem is IMO just going to grind development in
> this area to a halt. We have to have something we can agree on which is
> safe to start building on. The building can't just emerge fully formed
> in one go.
> 

What if the design is so canned that you can't change anything? 

I have been asking things like getting rid of switch search in ACS
enablement towards achieving generic P2P. You seem to be pushing back.
You said yourself P2P and isolation doesn't go together at this point
but you also care about isolation for other devices that are not doing
P2P.

Confusing...

> P2P proposals go back a long time and have never gotten anywhere because
> there are limitations and people want it to do things that are hard but
> don't want to contribute the work to solving those problems.
> 
>>> Well, if it's a problem for someone they'll have to solve it. We're
>>> targeting JBOFs that have no use for ACS / IOMMU groups at all.
>>
>> IMO, you (not somebody) should address this one way or the other before this
>> series land in upstream.
> 
> The real way to address this (as I've mentioned before) is with some way
> of doing ACS and iomem groups dynamically. But this is a huge project in
> itself and is never going to be part of the P2P patchset.

fair enough.

> 
>> Another assumption: There are other architectures like ARM64 where IOMMU
>> is enabled by default even if you don't use VMs for security reasons.
>> IOMMU blocks stray transactions.
> 
> True, but it doesn't change my point: ACS is not a requirement for Linux
> many many systems do not have it on at all or by default.

I don't think so.

It is not a requirement for you but it is a requirement for me (ARM64 guy).
Linux happens to run on multiple architectures. One exception invalidates your
point.

> 
>> Didn't the ACS behavior change suddenly for no good reason when we enabled
>> your code even though I might not be using the P2P but I happen to have
>> a kernel with P2P config option?
> 

If you are assuming that your kernel option should not be used by general
distributions like Ubuntu/redhat etc. and requires a kernel compilation,
creating a dependency to EXPERT is the right way to do. 

Distributions assume that no damage is done by enabling PCI bus options
under normal circumstances.

> Well no, presumably you made a conscious choice to turn the config
> option on and build a custom kernel for your box. That doesn't seem very
> sudden and the reason is that the two concepts are very much at odds
> with each other: you can't have isolation and still have transactions
> between devices.
> 
> Logan
> 
> 


-- 
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux 
Foundation Collaborative Project.


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Bjorn Helgaas
On Tue, Mar 13, 2018 at 10:31:55PM +, Stephen  Bates wrote:
> >> It sounds like you have very tight hardware expectations for this to work
> >> at this moment. You also don't want to generalize this code for others and
> >> address the shortcomings.
> >  No, that's the way the community has pushed this work
> 
> Hi Sinan
> 
> Thanks for all the input. As Logan has pointed out the switch
> requirement is something that has evolved over time based on input
> from the community. You are more than welcome to have an opinion on
> this (and you have made that opinion clear ;-)). Over time the
> patchset may evolve from its current requirements but right now we
> are aligned with the feedback from the community.

This part of the community hasn't been convinced of the need to have
two bridges, e.g., both an Upstream Port and a Downstream Port, or two
conventional PCI bridges, above the peers.

Every PCI-to-PCI bridge is required to support routing transactions
between devices on its secondary side.  Therefore, I think it is
sufficient to verify that the potential peers share a single common
upstream bridge.  This could be a conventional PCI bridge, a Switch
Downstream Port, or a Root Port.

I've seen the response that peers directly below a Root Port could not
DMA to each other through the Root Port because of the "route to self"
issue, and I'm not disputing that.  But enforcing a requirement for
two upstream bridges introduces a weird restriction on conventional
PCI topologies, makes the code hard to read, and I don't think it's
necessary.

If it *is* necessary because Root Ports and devices below them behave
differently than in conventional PCI, I think you should include a
reference to the relevant section of the spec and check directly for a
Root Port.  I would prefer that over trying to exclude Root Ports by
looking for two upstream bridges.

Bjorn


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe


On 13/03/18 04:29 PM, Sinan Kaya wrote:
> If hardware doesn't support it, blacklisting should have been the right
> path and I still think that you should remove all switch business from the 
> code.
> I did not hear enough justification for having a switch requirement
> for P2P.

I disagree.

> You are also saying that root ports have issues not because of functionality 
> but
> because of performance. 

No... performance can also be an issue but the main justification for
this is that many root ports do not support P2P at all and can fail in
different ways (usually just dropping the TLPs).

> What if I come up with a very cheap/crappy switch (something like used in data
> mining)?

Good luck. That's not how hardware is designed. PCIe switches that have
any hope to compete with the existing market will need features like
NTB, non-transparent ports, etc... and that implies a certain symmetry
(ie there isn't a special host port because there may be more than one
and it may move around) which implies that packets will always be able
to forward between each ports which implies P2P will work.

> I have been doing my best to provide feedback. It feels like you are throwing
> them over the wall to be honest.
> 
> You keep implying "not my problem".

Well, the fact of the matter is that extending this in all the ways
people like you want face huge problems on all sides. These are not
trivial issues and holding back work that works for our problem because
it doesn't solve your problem is IMO just going to grind development in
this area to a halt. We have to have something we can agree on which is
safe to start building on. The building can't just emerge fully formed
in one go.

P2P proposals go back a long time and have never gotten anywhere because
there are limitations and people want it to do things that are hard but
don't want to contribute the work to solving those problems.

>> Well, if it's a problem for someone they'll have to solve it. We're
>> targeting JBOFs that have no use for ACS / IOMMU groups at all.
> 
> IMO, you (not somebody) should address this one way or the other before this
> series land in upstream.

The real way to address this (as I've mentioned before) is with some way
of doing ACS and iomem groups dynamically. But this is a huge project in
itself and is never going to be part of the P2P patchset.

> Another assumption: There are other architectures like ARM64 where IOMMU
> is enabled by default even if you don't use VMs for security reasons.
> IOMMU blocks stray transactions.

True, but it doesn't change my point: ACS is not a requirement for Linux
many many systems do not have it on at all or by default.

> Didn't the ACS behavior change suddenly for no good reason when we enabled
> your code even though I might not be using the P2P but I happen to have
> a kernel with P2P config option?

Well no, presumably you made a conscious choice to turn the config
option on and build a custom kernel for your box. That doesn't seem very
sudden and the reason is that the two concepts are very much at odds
with each other: you can't have isolation and still have transactions
between devices.

Logan



Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Stephen Bates
Hi Sinan

>If hardware doesn't support it, blacklisting should have been the right
>path and I still think that you should remove all switch business from the 
> code.
>I did not hear enough justification for having a switch requirement
>for P2P.
  
We disagree. As does the community based on feedback on previous version of 
this patch set.
  
> You are also saying that root ports have issues not because of functionality 
> but
> because of performance. 

I have seen both.  Some systems fail entirely to route TLPs across root 
ports. Others do route but with poor performance. Having a requirement for a 
switch is therefore a reasonable and sensible minimum criterion for enabling p2p.

> If you know what is bad, create a list and keep adding it. You can't assume
> universally that all root ports are broken/ have bad performance.

I actually wanted to do a blacklist originally but that idea was not accepted 
by the community. 

>What if I come up with a very cheap/crappy switch (something like used in 
> data
>mining)?
> What guarantees that P2P will work with this device? You are making an
> assumption here that all switches have good performance.

A switch must support P2P or it is in violation of the PCIe specification.

>I have been doing my best to provide feedback. It feels like you are 
> throwing
>them over the wall to be honest.

I think one issue, Sinan, is that a lot of your ideas have been discussed in 
previous versions of the series. So you are asking us to make changes which, in 
some cases, we know are not acceptable to others in the community. The input is 
welcome but it flies in the face of other input we have received. 

However if other developers feel strongly about the blacklist/whitelist and 
switch criteria please speak up! ;-)

Stephen





Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Stephen Bates
>> It sounds like you have very tight hardware expectations for this to work
>> at this moment. You also don't want to generalize this code for others and
>> address the shortcomings.
>  No, that's the way the community has pushed this work

Hi Sinan

Thanks for all the input. As Logan has pointed out the switch requirement is 
something that has evolved over time based on input from the community. You are 
more than welcome to have an opinion on this (and you have made that opinion 
clear ;-)). Over time the patchset may evolve from its current requirements but 
right now we are aligned with the feedback from the community.

Cheers

Stephen
 



Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Sinan Kaya
On 3/13/2018 6:00 PM, Logan Gunthorpe wrote:
> 
> 
> On 13/03/18 03:22 PM, Sinan Kaya wrote:
>> It sounds like you have very tight hardware expectations for this to work
>> at this moment. You also don't want to generalize this code for others and
>> address the shortcomings.
> 
> No, that's the way the community has pushed this work. Our original work
> was very general and we were told it was unacceptable to put the onus on
> the user and have things break if the hardware doesn't support it. I
> think that's a reasonable requirement. So the hardware use-cases were
> whittled down to the ones we can be confident about and support with
> reasonable changes to the kernel today.

If hardware doesn't support it, blacklisting should have been the right
path and I still think that you should remove all switch business from the code.
I did not hear enough justification for having a switch requirement
for P2P.

You are also saying that root ports have issues not because of functionality but
because of performance. 

If you know what is bad, create a list and keep adding it. You can't assume
universally that all root ports are broken/ have bad performance.

> 
>> To get you going, you should limit this change to the switch products that 
>> you have
>> validated via white-listing PCI vendor/device ids. Please do not enable this 
>> feature
>> for all other PCI devices or by default.
> 
> This doesn't seem necessary. We know that all PCIe switches available
> today support P2P and we are highly confident that any switch that would
> ever be produced would support P2P. As long as devices are behind a
> switch you don't need any white listing. This is what the current patch
> set implements. If you want to start including root ports then you will
> need a white list (and solve all the other problems I mentioned earlier).

What if I come up with a very cheap/crappy switch (something like used in data
mining)?

What guarantees that P2P will work with this device? You are making an
assumption here that all switches have good performance.

How is that any different from good switch vs. bad switch and good root
port vs. bad root port?

If it is universally broken, why don't you list the things that work?

> 
>> I think your code qualifies as a virus until this issue is resolved (so 
>> NAK). 
> 
> That seems a bit hyperbolic... "a virus"??!... please keep things
> constructive.
> 

Sorry, this was harsh. I'm taking the "virus" word back. I apologize.
But, I hold onto my NAK opinion. 

I have been doing my best to provide feedback. It feels like you are throwing
them over the wall to be honest.

You keep implying "not my problem".

> > I agree disabling globally would be bad. Somebody can always say I have
> > ten switches on my system. I want to do peer-to-peer on one switch only. 
> > Now,
> > this change weakened security for the other switches that I had no intention
> > with doing P2P.
> >
> > Isn't this a problem?
> 
> Well, if it's a problem for someone they'll have to solve it. We're
> targeting JBOFs that have no use for ACS / IOMMU groups at all.

IMO, you (not somebody) should address this one way or the other before this
series land in upstream.

>> You are delivering a general purpose P2P code with a lot of holes in it and
>> expecting people to jump through it.
> No, the code prevents users from screwing it up. It just requires a
> switch in the hardware which is hardly a high bar to jump through
> (provided you are putting some design thought into your hardware). And
> given it requires semi-custom hardware today, it's not something that
> needs to be on by default in any distributor kernel.
> 
>> Turning security off by default is also not acceptable. Linux requires ACS 
>> support
>> even though you don't care about it for your particular application.
> 
> That's not true. Linux does not require ACS support. In fact it's
> already off by default until you specifically turn on the IOMMU. (Which
> is not always the most obvious thing to enable.) And the only thing it
> really supports is enforcing isolation between VMs that are using
> different pieces of hardware in the system.

Another assumption: There are other architectures like ARM64 where IOMMU
is enabled by default even if you don't use VMs for security reasons.
IOMMU blocks stray transactions.

> 
>> I'd hate ACS to be broken due to some operating system enabling your CONFIG 
>> option.
> 
> ACS isn't "broken" by enabling the config option. It just makes the
> IOMMU groups and isolation less granular. (ie. devices behind a switch
> will be in the same group and not isolated from each-other).

Didn't the ACS behavior change suddenly for no good reason when we enabled
your code even though I might not be using the P2P but I happen to have
a kernel with P2P config option?

-- 
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux 
Foundation Collaborative Project.

Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe


On 13/03/18 03:22 PM, Sinan Kaya wrote:
> It sounds like you have very tight hardware expectations for this to work
> at this moment. You also don't want to generalize this code for others and
> address the shortcomings.

No, that's the way the community has pushed this work. Our original work
was very general and we were told it was unacceptable to put the onus on
the user and have things break if the hardware doesn't support it. I
think that's a reasonable requirement. So the hardware use-cases were
whittled down to the ones we can be confident about and support with
reasonable changes to the kernel today.

> To get you going, you should limit this change to the switch products that 
> you have
> validated via white-listing PCI vendor/device ids. Please do not enable this 
> feature
> for all other PCI devices or by default.

This doesn't seem necessary. We know that all PCIe switches available
today support P2P and we are highly confident that any switch that would
ever be produced would support P2P. As long as devices are behind a
switch you don't need any white listing. This is what the current patch
set implements. If you want to start including root ports then you will
need a white list (and solve all the other problems I mentioned earlier).

> I think your code qualifies as a virus until this issue is resolved (so NAK). 

That seems a bit hyperbolic... "a virus"??!... please keep things
constructive.

> You are delivering a general purpose P2P code with a lot of holes in it and
> expecting people to jump through it.
No, the code prevents users from screwing it up. It just requires a
switch in the hardware which is hardly a high bar to jump through
(provided you are putting some design thought into your hardware). And
given it requires semi-custom hardware today, it's not something that
needs to be on by default in any distributor kernel.

> Turning security off by default is also not acceptable. Linux requires ACS 
> support
> even though you don't care about it for your particular application.

That's not true. Linux does not require ACS support. In fact it's
already off by default until you specifically turn on the IOMMU. (Which
is not always the most obvious thing to enable.) And the only thing it
really supports is enforcing isolation between VMs that are using
different pieces of hardware in the system.

> I'd hate ACS to be broken due to some operating system enabling your CONFIG option.

ACS isn't "broken" by enabling the config option. It just makes the
IOMMU groups and isolation less granular. (i.e. devices behind a switch
will be in the same group and not isolated from each other).

Logan


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Sinan Kaya
On 3/13/2018 4:46 PM, Logan Gunthorpe wrote:
> 
> 
> On 13/03/18 01:53 PM, Sinan Kaya wrote:
>> I agree disabling globally would be bad. Somebody can always say I have
>> ten switches on my system. I want to do peer-to-peer on one switch only. Now,
>> this change weakened security for the other switches that I had no intention
>> of using for P2P.
>>
>> Isn't this a problem?
> 
> Well, if it's a problem for someone they'll have to solve it. We're
> targeting JBOFs that have no use for ACS / IOMMU groups at all.
> 
>> Can we specify the BDF of the downstream device we want P2P with during boot 
>> via
>> kernel command line?
> 
> That's a painful configuration burden. And then things might stop
> working if you change your topology at all and now have to change boot
> parameters.
> 

It sounds like you have very tight hardware expectations for this to work
at this moment. You also don't want to generalize this code for others and
address the shortcomings.

To get you going, you should limit this change to the switch products that you
have validated via white-listing PCI vendor/device ids. Please do not enable
this feature for all other PCI devices or by default.

I think your code qualifies as a virus until this issue is resolved (so NAK). 

Another option is for your CONFIG to depend on BROKEN/EXPERT.

You are delivering general-purpose P2P code with a lot of holes in it and
expecting people to jump through it.

Turning security off by default is also not acceptable. Linux requires ACS
support even though you don't care about it for your particular application.

I'd hate ACS to be broken due to some operating system enabling your CONFIG option.


-- 
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux 
Foundation Collaborative Project.


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe


On 13/03/18 01:53 PM, Sinan Kaya wrote:
> I agree disabling globally would be bad. Somebody can always say I have
> ten switches on my system. I want to do peer-to-peer on one switch only. Now,
> this change weakened security for the other switches that I had no intention
> of using for P2P.
> 
> Isn't this a problem?

Well, if it's a problem for someone they'll have to solve it. We're
targeting JBOFs that have no use for ACS / IOMMU groups at all.

> Can we specify the BDF of the downstream device we want P2P with during boot 
> via
> kernel command line?

That's a painful configuration burden. And then things might stop
working if you change your topology at all and now have to change boot
parameters.

Logan


Re: [PATCH v3] blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()

2018-03-13 Thread Tejun Heo
Hello, Joseph.

On Mon, Mar 05, 2018 at 11:03:16PM +0800, Joseph Qi wrote:
> +static void blkg_pd_offline(struct blkcg_gq *blkg)
> +{
> + int i;
> +
> + lockdep_assert_held(blkg->q->queue_lock);
> + lockdep_assert_held(&blkg->blkcg->lock);
> +
> + for (i = 0; i < BLKCG_MAX_POLS; i++) {
> + struct blkcg_policy *pol = blkcg_policy[i];
> +
> + if (blkg->pd[i] && !blkg->pd_offline[i] && pol->pd_offline_fn) {
> + pol->pd_offline_fn(blkg->pd[i]);
> + blkg->pd_offline[i] = true;

Can we move this flag into blkg_policy_data?

> + while (!hlist_empty(&blkcg->blkg_list)) {
> + struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
> + struct blkcg_gq,
> + blkcg_node);
> + struct request_queue *q = blkg->q;
> +
> + if (spin_trylock(q->queue_lock)) {
> + blkg_destroy(blkg);
> + spin_unlock(q->queue_lock);
> + } else {
> + spin_unlock_irq(&blkcg->lock);
> + cpu_relax();
> + spin_lock_irq(&blkcg->lock);
> + }

Can we factor out the above loop?  It's something subtle and painful
and I think it'd be better to have it separated out and documented.
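
For reference, a minimal sketch of what such a factored-out helper could look
like (the name, locking placement and comments are illustrative only, not a
definitive implementation):

static void blkcg_destroy_all_blkgs(struct blkcg *blkcg)
{
	spin_lock_irq(&blkcg->lock);
	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq,
						    blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			/*
			 * blkg_destroy() needs both locks, but the nesting
			 * order elsewhere is queue_lock first, so back off
			 * and retry rather than risk an ABBA deadlock.
			 */
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}
	spin_unlock_irq(&blkcg->lock);
}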

Other than that, looks great.

Thanks.

-- 
tejun


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Sinan Kaya
On 3/13/2018 3:19 PM, Logan Gunthorpe wrote:
> 
> 
> On 13/03/18 01:10 PM, Sinan Kaya wrote:
>> I was thinking of this for the pci_p2pdma_add_client() case for the
>> parent pointer.
>>
>> +struct pci_p2pdma_client {
>> +struct list_head list;
>> +struct pci_dev *client;
>> +struct pci_dev *provider;
>> +};
> 
> Yeah, that structure only exists in a list owned by the client and we
> only check the upstream bridge once per entry so I don't see the point.
> 
>> But then, why bother searching for the switch at all?
> 
> Huh? We have to make sure all the client and provider devices are behind
> the same switch. How can we do that without "searching" for the switch?
> 

Sorry, I was thinking of the ACS case you described below. The only thing the
code cares about at this moment is whether the device is behind a switch or not.

> In the ACS case, we only disable ACS on downstream ports of switches. No
> sense disabling it globally as that's worse from an isolation point of
> view and not worth it given we require all P2P transactions to be behind
> a switch.

I agree disabling globally would be bad. Somebody can always say I have
ten switches on my system. I want to do peer-to-peer on one switch only. Now,
this change weakened security for the other switches that I had no intention
of using for P2P.

Isn't this a problem?

Can we specify the BDF of the downstream device we want P2P with during boot via
kernel command line?

> 
>> Even if the switch is there, there is no guarantee that it is currently
>> being used for P2P.
> 
> IOMMU groups are set at boot time and, at present, there's no way to
> dynamically change ACS bits without messing up the groups. So switches
> not used for P2P will not have ACS enabled when CONFIG_PCI_P2PDMA is set
> and I don't know of any good solution to that. Please see the ACS
> discussions in v1 and v2.

Given the implementation limitations, this might be OK as a short-term
solution. 

It depends on if Alex is comfortable with this.

> 
>> It seems that we are going with the assumption that enabling this config
>> option implies you want P2P. If so, we can simplify this code as well.
> 
> How so?
> 
> Logan
> 
> 
> 


-- 
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux 
Foundation Collaborative Project.


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe


On 13/03/18 01:10 PM, Sinan Kaya wrote:
> I was thinking of this for the pci_p2pdma_add_client() case for the
> parent pointer.
> 
> +struct pci_p2pdma_client {
> + struct list_head list;
> + struct pci_dev *client;
> + struct pci_dev *provider;
> +};

Yeah, that structure only exists in a list owned by the client and we
only check the upstream bridge once per entry so I don't see the point.

> But then, why bother searching for the switch at all?

Huh? We have to make sure all the client and provider devices are behind
the same switch. How can we do that without "searching" for the switch?
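
As a rough sketch of what that search amounts to (the helper name is made up
here; the patch set's actual implementation may differ), walking up the PCI
hierarchy is enough:

/*
 * Return the switch upstream port above pdev, or NULL if pdev does not
 * sit behind a switch. Two devices are behind the same switch iff this
 * returns the same non-NULL port for both of them.
 */
static struct pci_dev *find_upstream_switch_port(struct pci_dev *pdev)
{
	struct pci_dev *up = pci_upstream_bridge(pdev);

	/* a device behind a switch hangs off a downstream port */
	if (!up || pci_pcie_type(up) != PCI_EXP_TYPE_DOWNSTREAM)
		return NULL;

	/* the downstream port's parent is the switch upstream port */
	return pci_upstream_bridge(up);
}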

In the ACS case, we only disable ACS on downstream ports of switches. No
sense disabling it globally as that's worse from an isolation point of
view and not worth it given we require all P2P transactions to be behind
a switch.
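
Disabling ACS on just those ports could look roughly like the sketch below,
using the generic config-space helpers (this is a hedged illustration, not
necessarily the exact code in the patch set):

static void p2pdma_disable_acs_redirect(struct pci_dev *port)
{
	int pos;
	u16 ctrl;

	if (pci_pcie_type(port) != PCI_EXP_TYPE_DOWNSTREAM)
		return;

	pos = pci_find_ext_capability(port, PCI_EXT_CAP_ID_ACS);
	if (!pos)
		return;

	pci_read_config_word(port, pos + PCI_ACS_CTRL, &ctrl);
	/* clear request/completion redirect so P2P TLPs route directly */
	ctrl &= ~(PCI_ACS_RR | PCI_ACS_CR);
	pci_write_config_word(port, pos + PCI_ACS_CTRL, &ctrl);
}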

> Even if the switch is there, there is no guarantee that it is currently
> being used for P2P.

IOMMU groups are set at boot time and, at present, there's no way to
dynamically change ACS bits without messing up the groups. So switches
not used for P2P will not have ACS enabled when CONFIG_PCI_P2PDMA is set
and I don't know of any good solution to that. Please see the ACS
discussions in v1 and v2.

> It seems that we are going with the assumption that enabling this config
> option implies you want P2P. If so, we can simplify this code as well.

How so?

Logan




Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Sinan Kaya
On 3/13/2018 2:44 PM, Logan Gunthorpe wrote:
>> Do you think you can keep a pointer to the parent bridge instead of querying it
>> via get_upstream_bridge_port() here, so that we can reuse your
>> pci_p2pdma_disable_acs() in the future?
> 
> Keep a pointer where? pci_p2pdma_disable_acs() and pci_p2pdma_add_client() 
> are used in completely different cases on completely different devices. There 
> really is no overlap and no obvious place to store the port pointer (except 
> in the struct pci_dev itself, in which case why not just call the function 
> again).

I was thinking of this for the pci_p2pdma_add_client() case for the
parent pointer.

+struct pci_p2pdma_client {
+   struct list_head list;
+   struct pci_dev *client;
+   struct pci_dev *provider;
+};

You are right, the second piece of code seems to be there to disable ACS
altogether when this kernel configuration is selected.

It doesn't have any relationship to the client code.

But then, why bother searching for the switch at all?

Even if the switch is there, there is no guarantee that it is currently
being used for P2P.

It seems that we are going with the assumption that enabling this config
option implies you want P2P. If so, we can simplify this code as well.

-- 
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux 
Foundation Collaborative Project.


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe



On 13/03/18 11:49 AM, Sinan Kaya wrote:

And there's also the ACS problem which means if you want to use P2P on the root 
ports you'll have to disable ACS on the entire system. (Or preferably, the 
IOMMU groups need to get more sophisticated to allow for dynamic changes).



Do you think you can keep a pointer to the parent bridge instead of querying it
via get_upstream_bridge_port() here, so that we can reuse your
pci_p2pdma_disable_acs() in the future?


Keep a pointer where? pci_p2pdma_disable_acs() and 
pci_p2pdma_add_client() are used in completely different cases on 
completely different devices. There really is no overlap and no obvious 
place to store the port pointer (except in the struct pci_dev itself, in 
which case why not just call the function again).



+int pci_p2pdma_disable_acs(struct pci_dev *pdev)
+{
+   struct pci_dev *up;
+   int pos;
+   u16 ctrl;
+
+   up = get_upstream_bridge_port(pdev);
+   if (!up)
+   return 0;

Some future device would only deal with pci_p2pdma_add_client() for
whitelisting instead of changing all of your code.


That's a problem for whoever writes the future code.


We should at least limit the usage of the get_upstream_bridge_port() family
of functions to probe time only.


Why?


We can tell the IOMMU to do one-to-one translation by passing
iommu.passthrough=1 on the kernel command line to get behavior identical to
your switch case.


Well, someone will need to write code for all available IOMMUs to 
support this. That's a very big project.


Logan


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe



On 12/03/18 09:28 PM, Sinan Kaya wrote:

Maybe the dev parameter should also be struct pci_dev so that you can get rid
of all the to_pci_dev() calls in this code, including the
find_parent_pci_dev() function.


No, this was mentioned in v2. find_parent_pci_dev is necessary because 
the calling drivers aren't likely to have a bunch of struct pci_dev's 
for all the devices they are working with lying around. It's much 
nicer from an API standpoint to take struct devs, and not worth it just 
to have a PCI API only taking struct pci_devs.


Logan


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Sinan Kaya
On 3/13/2018 12:43 PM, Logan Gunthorpe wrote:
> 
> 
> On 12/03/18 09:28 PM, Sinan Kaya wrote:
>> On 3/12/2018 3:35 PM, Logan Gunthorpe wrote:
>> Regarding the switch business, it is amazing how much trouble you went to to
>> limit this functionality to very specific hardware.
>>
>> I thought that we had reached an agreement that the code would not impose
>> any limits on what the user wants.
>>
>> What happened to all the emails we exchanged?
> 
> It turns out that root ports that support P2P are far less common than anyone 
> thought. So it will likely have to be a white list. Nobody else seems keen on 
> allowing the user to enable this on hardware that doesn't work. The easiest 
> solution is still limiting it to using a switch. From there, if someone wants 
> to start creating a white-list then that's probably the way forward to 
> support root ports.

I thought only a few root ports had the problem. Thanks for clarifying that.

> 
> And there's also the ACS problem which means if you want to use P2P on the 
> root ports you'll have to disable ACS on the entire system. (Or preferably, 
> the IOMMU groups need to get more sophisticated to allow for dynamic changes).
> 

Do you think you can keep a pointer to the parent bridge instead of querying it
via get_upstream_bridge_port() here, so that we can reuse your
pci_p2pdma_disable_acs() in the future?

+int pci_p2pdma_disable_acs(struct pci_dev *pdev)
+{
+   struct pci_dev *up;
+   int pos;
+   u16 ctrl;
+
+   up = get_upstream_bridge_port(pdev);
+   if (!up)
+   return 0;

Some future device would only deal with pci_p2pdma_add_client() for
whitelisting instead of changing all of your code.

We should at least limit the usage of the get_upstream_bridge_port() family
of functions to probe time only.

> Additionally, once you allow for root ports you may find the IOMMU getting in 
> the way.

We can tell the IOMMU to do one-to-one translation by passing
iommu.passthrough=1 on the kernel command line to get behavior identical to
your switch case.

> 
> So there are a great deal more issues to sort out if you don't restrict to
> devices behind switches.
> 
> Logan
> 


-- 
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux 
Foundation Collaborative Project.


Re: disk-io lockup in 4.14.13 kernel

2018-03-13 Thread Jaco Kroon
Hi Bart,

On 13/03/2018 19:24, Bart Van Assche wrote:
> On Tue, 2018-03-13 at 19:16 +0200, Jaco Kroon wrote:
>> The server in question is the destination of numerous rsync/ssh cases
>> (used primarily for backups) and is not intended as a real-time system.
>> I'm happy to enable the options below that you indicate would be
>> helpful in pinpointing the problem (assuming we're not looking at an 8x
>> more CPU required type of degradation, as I've recently seen with asterisk
>> lock debugging enabled). I've marked in bold below what I assume would
>> be helpful. If you don't mind confirming for me, I'll enable them and
>> schedule a reboot.
> Hello Jaco,
>
> My recommendation is to wait until the mpt3sas maintainers post a fix
> for what I reported yesterday on the linux-scsi mailing list. Enabling
> CONFIG_DEBUG_ATOMIC_SLEEP has a very annoying consequence for the
> mpt3sas driver: the first process that hits the "sleep in atomic context"
> bug gets killed. I don't think that you want this kind of behavior on a
> production setup.
>
Would you mind adding myself as CC to that thread please?

Kind Regards,
Jaco


Re: bsg-lib interface cleanup V2

2018-03-13 Thread Jens Axboe
On 3/13/18 9:28 AM, Christoph Hellwig wrote:
> Hi all,
> 
> this series cleans up various abuses of the bsg interfaces, and then
> splits bsg for SCSI passthrough from bsg for arbitrary transport
> passthrough.  This removes the scsi_request abuse in bsg-lib that is
> very confusing, and also makes sure we can sanity check the requests
> we get.  The current code will happily execute scsi commands against
> bsg-lib queues, and transport pass through against scsi nodes, without
> any indication to the driver that we are doing the wrong thing.

Applied for 4.17, thanks.

-- 
Jens Axboe



Re: [PATCH v2] block: bio_check_eod() needs to consider partition

2018-03-13 Thread Jens Axboe
On 3/13/18 9:25 AM, Bart Van Assche wrote:
> On Tue, 2018-03-13 at 07:49 -0700, Jens Axboe wrote:
>> On 3/13/18 2:18 AM, Christoph Hellwig wrote:
>>> bio_check_eod() should check the partition size, not the whole disk, if
>>> bio->bi_partno is non-zero. Do this by moving the call to bio_check_eod()
>>> into blk_partition_remap().
>>
>> Applied, thanks.
> 
> Hello Christoph and Jens,
> 
> I think that this patch introduces a severe regression: with this patch
> applied a VM that I use for kernel testing doesn't boot anymore. If I revert
> this patch that VM boots fine again. This is what I see on the serial console
> with this patch applied:
> 
> virtio_blk virtio4: [vda] 41943040 512-byte logical blocks (21.5 GB/20.0 GiB)
>  vda: vda1
> attempt to access beyond end of device
> vda: rw=0, want=41940872, limit=41938944
> attempt to access beyond end of device
> vda: rw=0, want=41940872, limit=41938944

I've killed the patch for now, thanks for the quick test, Bart.

-- 
Jens Axboe



Re: disk-io lockup in 4.14.13 kernel

2018-03-13 Thread Bart Van Assche
On Tue, 2018-03-13 at 19:16 +0200, Jaco Kroon wrote:
> The server in question is the destination of numerous rsync/ssh cases
> (used primarily for backups) and is not intended as a real-time system.
> I'm happy to enable the options below that you indicate would be
> helpful in pinpointing the problem (assuming we're not looking at an 8x
> more CPU required type of degradation, as I've recently seen with asterisk
> lock debugging enabled). I've marked in bold below what I assume would
> be helpful. If you don't mind confirming for me, I'll enable them and
> schedule a reboot.

Hello Jaco,

My recommendation is to wait until the mpt3sas maintainers post a fix
for what I reported yesterday on the linux-scsi mailing list. Enabling
CONFIG_DEBUG_ATOMIC_SLEEP has a very annoying consequence for the
mpt3sas driver: the first process that hits the "sleep in atomic context"
bug gets killed. I don't think that you want this kind of behavior on a
production setup.

Bart.






RE: [PATCH V5 2/5] scsi: megaraid_sas: fix selection of reply queue

2018-03-13 Thread Kashyap Desai
> -Original Message-
> From: Ming Lei [mailto:ming@redhat.com]
> Sent: Tuesday, March 13, 2018 3:13 PM
> To: James Bottomley; Jens Axboe; Martin K . Petersen
> Cc: Christoph Hellwig; linux-s...@vger.kernel.org; linux-
> bl...@vger.kernel.org; Meelis Roos; Don Brace; Kashyap Desai; Laurence
> Oberman; Mike Snitzer; Paolo Bonzini; Ming Lei; Hannes Reinecke; James
> Bottomley; Artem Bityutskiy
> Subject: [PATCH V5 2/5] scsi: megaraid_sas: fix selection of reply queue
>
> Since commit 84676c1f21 ("genirq/affinity: assign vectors to all possible
> CPUs"), one MSI-X vector can be created without any online CPU mapped to it;
> a command may then be queued to such a vector and its completion will never
> be notified.
>
> This patch sets up the mapping between CPU and reply queue according to the
> IRQ affinity info retrieved by pci_irq_get_affinity(), and uses this info to
> choose a reply queue when queuing a command.
>
> The chosen reply queue is then guaranteed to be active, which fixes the IO
> hang caused by using an inactive reply queue that has no online CPU mapped.
>
> Cc: Hannes Reinecke 
> Cc: "Martin K. Petersen" ,
> Cc: James Bottomley ,
> Cc: Christoph Hellwig ,
> Cc: Don Brace 
> Cc: Kashyap Desai 
> Cc: Laurence Oberman 
> Cc: Mike Snitzer 
> Cc: Meelis Roos 
> Cc: Artem Bityutskiy 
> Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
> Signed-off-by: Ming Lei 


Side note - for maximum performance, your proposed patch/series below is
required. Without it, performance is going to drop because fewer reply
queues get utilized once this particular patch is included:
genirq/affinity: irq vector spread among online CPUs as far as possible

Acked-by: Kashyap Desai 
Tested-by: Kashyap Desai 
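
The mapping described in the quoted patch description boils down to something
like the following sketch (the function name and the caller-provided cpu_map
array, sized nr_cpu_ids, are illustrative; the real patch keeps this in
per-instance driver state):

static int build_cpu_to_reply_queue_map(struct pci_dev *pdev,
					unsigned int nr_queues,
					unsigned int *cpu_map)
{
	const struct cpumask *mask;
	unsigned int q, cpu;

	for (q = 0; q < nr_queues; q++) {
		mask = pci_irq_get_affinity(pdev, q);
		if (!mask)
			return -EINVAL;
		/* every CPU in this vector's affinity mask uses queue q */
		for_each_cpu(cpu, mask)
			cpu_map[cpu] = q;
	}
	return 0;
}

At submission time the driver then picks cpu_map[raw_smp_processor_id()],
which by construction is a reply queue whose vector covers the submitting CPU.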


Re: [PATCH v3 01/11] PCI/P2PDMA: Support peer-to-peer memory

2018-03-13 Thread Logan Gunthorpe



On 12/03/18 09:28 PM, Sinan Kaya wrote:

On 3/12/2018 3:35 PM, Logan Gunthorpe wrote:
Regarding the switch business, it is amazing how much trouble you went to to
limit this functionality to very specific hardware.

I thought that we had reached an agreement that the code would not impose
any limits on what the user wants.

What happened to all the emails we exchanged?


It turns out that root ports that support P2P are far less common than 
anyone thought. So it will likely have to be a white list. Nobody else 
seems keen on allowing the user to enable this on hardware that doesn't 
work. The easiest solution is still limiting it to using a switch. From 
there, if someone wants to start creating a white-list then that's 
probably the way forward to support root ports.


And there's also the ACS problem which means if you want to use P2P on 
the root ports you'll have to disable ACS on the entire system. (Or 
preferably, the IOMMU groups need to get more sophisticated to allow for 
dynamic changes).


Additionally, once you allow for root ports you may find the IOMMU 
getting in the way.


So there are a great deal more issues to sort out if you don't restrict to 
devices behind switches.


Logan


Re: [PATCH v2] block: bio_check_eod() needs to consider partition

2018-03-13 Thread h...@lst.de
On Tue, Mar 13, 2018 at 04:25:40PM +, Bart Van Assche wrote:
> On Tue, 2018-03-13 at 07:49 -0700, Jens Axboe wrote:
> > On 3/13/18 2:18 AM, Christoph Hellwig wrote:
> > > bio_check_eod() should check the partition size, not the whole disk, if
> > > bio->bi_partno is non-zero. Do this by moving the call to bio_check_eod()
> > > into blk_partition_remap().
> > 
> > Applied, thanks.
> 
> Hello Christoph and Jens,
> 
> I think that this patch introduces a severe regression: with this patch
> applied a VM that I use for kernel testing doesn't boot anymore. If I revert
> this patch that VM boots fine again. This is what I see on the serial console
> with this patch applied:

Ok, please revert this for now.  I'm off for today and will look into
it tomorrow.


[PATCH 3/3] bsg: split handling of SCSI CDBs vs transport requeues

2018-03-13 Thread Christoph Hellwig
The current BSG design tries to shoe-horn the transport-specific
passthrough commands into the overall framework for SCSI passthrough
requests.  This has a couple of problems:

 - each passthrough queue has to set the QUEUE_FLAG_SCSI_PASSTHROUGH flag
   despite not dealing with SCSI commands at all.  Because of that these
   queues could also incorrectly accept SCSI commands from in-kernel
   users or through the legacy SCSI_IOCTL_SEND_COMMAND ioctl.
 - the real SCSI bsg queues also incorrectly accept bsg requests of the
   BSG_SUB_PROTOCOL_SCSI_TRANSPORT type
 - the bsg transport code is almost unreadable because it tries to reuse
   different SCSI concepts for its own purpose.

This patch instead adds a new bsg_ops structure to handle the two cases
differently, and thus solves all of the above problems.  Another side
effect is that the bsg-lib queues also don't need to embed a
struct scsi_request anymore.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Johannes Thumshirn 
---
 block/bsg-lib.c   | 158 +++
 block/bsg.c   | 262 +-
 drivers/scsi/scsi_lib.c   |   4 +-
 drivers/scsi/scsi_sysfs.c |   3 +-
 drivers/scsi/scsi_transport_sas.c |   1 -
 include/linux/bsg-lib.h   |   4 +-
 include/linux/bsg.h   |  35 +++--
 7 files changed, 250 insertions(+), 217 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f2c2d54a61b4..fc2e5ff2c4b9 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -27,6 +27,94 @@
 #include 
 #include 
 #include 
+#include 
+
+#define uptr64(val) ((void __user *)(uintptr_t)(val))
+
+static int bsg_transport_check_proto(struct sg_io_v4 *hdr)
+{
+   if (hdr->protocol != BSG_PROTOCOL_SCSI  ||
+   hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_TRANSPORT)
+   return -EINVAL;
+   if (!capable(CAP_SYS_RAWIO))
+   return -EPERM;
+   return 0;
+}
+
+static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
+   fmode_t mode)
+{
+   struct bsg_job *job = blk_mq_rq_to_pdu(rq);
+
+   job->request_len = hdr->request_len;
+   job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
+   if (IS_ERR(job->request))
+   return PTR_ERR(job->request);
+   return 0;
+}
+
+static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
+{
+   struct bsg_job *job = blk_mq_rq_to_pdu(rq);
+   int ret = 0;
+
+   /*
+* The assignments below don't make much sense, but are kept for
+* bug by bug backwards compatibility:
+*/
+   hdr->device_status = job->result & 0xff;
+   hdr->transport_status = host_byte(job->result);
+   hdr->driver_status = driver_byte(job->result);
+   hdr->info = 0;
+   if (hdr->device_status || hdr->transport_status || hdr->driver_status)
+   hdr->info |= SG_INFO_CHECK;
+   hdr->response_len = 0;
+
+   if (job->result < 0) {
+   /* we're only returning the result field in the reply */
+   job->reply_len = sizeof(u32);
+   ret = job->result;
+   }
+
+   if (job->reply_len && hdr->response) {
+   int len = min(hdr->max_response_len, job->reply_len);
+
+   if (copy_to_user(uptr64(hdr->response), job->reply, len))
+   ret = -EFAULT;
+   else
+   hdr->response_len = len;
+   }
+
+   /* we assume all request payload was transferred, residual == 0 */
+   hdr->dout_resid = 0;
+
+   if (rq->next_rq) {
+   unsigned int rsp_len = job->reply_payload.payload_len;
+
+   if (WARN_ON(job->reply_payload_rcv_len > rsp_len))
+   hdr->din_resid = 0;
+   else
+   hdr->din_resid = rsp_len - job->reply_payload_rcv_len;
+   } else {
+   hdr->din_resid = 0;
+   }
+
+   return ret;
+}
+
+static void bsg_transport_free_rq(struct request *rq)
+{
+   struct bsg_job *job = blk_mq_rq_to_pdu(rq);
+
+   kfree(job->request);
+}
+
+static const struct bsg_ops bsg_transport_ops = {
+   .check_proto= bsg_transport_check_proto,
+   .fill_hdr   = bsg_transport_fill_hdr,
+   .complete_rq= bsg_transport_complete_rq,
+   .free_rq= bsg_transport_free_rq,
+};
 
 /**
  * bsg_teardown_job - routine to teardown a bsg job
@@ -68,27 +156,9 @@ EXPORT_SYMBOL_GPL(bsg_job_get);
 void bsg_job_done(struct bsg_job *job, int result,
  unsigned int reply_payload_rcv_len)
 {
-   struct request *req = blk_mq_rq_from_pdu(job);
-   struct request *rsp = req->next_rq;
-   int err;
-
-   err = job->sreq.result = result;
-   if (err < 0)
-   /* we're only returning the result field in the reply */
-   job->sreq.sense_len = sizeof

[PATCH 2/3] bsg-lib: remove bsg_job.req

2018-03-13 Thread Christoph Hellwig
Users of the bsg-lib interface should only use the bsg_job data structure
and not know about implementation details of it.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Benjamin Block 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Johannes Thumshirn 
---
 block/bsg-lib.c | 14 ++
 include/linux/bsg-lib.h |  1 -
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index fb509779a090..f2c2d54a61b4 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -35,7 +35,7 @@
 static void bsg_teardown_job(struct kref *kref)
 {
struct bsg_job *job = container_of(kref, struct bsg_job, kref);
-   struct request *rq = job->req;
+   struct request *rq = blk_mq_rq_from_pdu(job);
 
put_device(job->dev);   /* release reference for the request */
 
@@ -68,19 +68,18 @@ EXPORT_SYMBOL_GPL(bsg_job_get);
 void bsg_job_done(struct bsg_job *job, int result,
  unsigned int reply_payload_rcv_len)
 {
-   struct request *req = job->req;
+   struct request *req = blk_mq_rq_from_pdu(job);
struct request *rsp = req->next_rq;
-   struct scsi_request *rq = scsi_req(req);
int err;
 
-   err = scsi_req(job->req)->result = result;
+   err = job->sreq.result = result;
if (err < 0)
/* we're only returning the result field in the reply */
-   rq->sense_len = sizeof(u32);
+   job->sreq.sense_len = sizeof(u32);
else
-   rq->sense_len = job->reply_len;
+   job->sreq.sense_len = job->reply_len;
/* we assume all request payload was transferred, residual == 0 */
-   rq->resid_len = 0;
+   job->sreq.resid_len = 0;
 
if (rsp) {
WARN_ON(reply_payload_rcv_len > scsi_req(rsp)->resid_len);
@@ -232,7 +231,6 @@ static void bsg_initialize_rq(struct request *req)
sreq->sense = sense;
sreq->sense_len = SCSI_SENSE_BUFFERSIZE;
 
-   job->req = req;
job->reply = sense;
job->reply_len = sreq->sense_len;
job->dd_data = job + 1;
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 402223c95ce1..08762d297cbd 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -40,7 +40,6 @@ struct bsg_buffer {
 struct bsg_job {
struct scsi_request sreq;
struct device *dev;
-   struct request *req;
 
struct kref kref;
 
-- 
2.14.2



[PATCH 1/3] bsg-lib: introduce a timeout field in struct bsg_job

2018-03-13 Thread Christoph Hellwig
The zfcp driver wants to know the timeout for a bsg job, so add a field
to struct bsg_job for it in preparation of not exposing the request
to the bsg-lib users.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Benjamin Block 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Johannes Thumshirn 
---
 block/bsg-lib.c | 1 +
 drivers/s390/scsi/zfcp_fc.c | 4 ++--
 include/linux/bsg-lib.h | 2 ++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index b4fe1a48f111..fb509779a090 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -132,6 +132,7 @@ static int bsg_prepare_job(struct device *dev, struct request *req)
struct bsg_job *job = blk_mq_rq_to_pdu(req);
int ret;
 
+   job->timeout = req->timeout;
job->request = rq->cmd;
job->request_len = rq->cmd_len;
 
diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c
index ca218c82321f..6162cf57a20a 100644
--- a/drivers/s390/scsi/zfcp_fc.c
+++ b/drivers/s390/scsi/zfcp_fc.c
@@ -961,7 +961,7 @@ static int zfcp_fc_exec_els_job(struct bsg_job *job,
d_id = ntoh24(bsg_request->rqst_data.h_els.port_id);
 
els->handler = zfcp_fc_ct_els_job_handler;
-   return zfcp_fsf_send_els(adapter, d_id, els, job->req->timeout / HZ);
+   return zfcp_fsf_send_els(adapter, d_id, els, job->timeout / HZ);
 }
 
 static int zfcp_fc_exec_ct_job(struct bsg_job *job,
@@ -980,7 +980,7 @@ static int zfcp_fc_exec_ct_job(struct bsg_job *job,
return ret;
 
ct->handler = zfcp_fc_ct_job_handler;
-   ret = zfcp_fsf_send_ct(wka_port, ct, NULL, job->req->timeout / HZ);
+   ret = zfcp_fsf_send_ct(wka_port, ct, NULL, job->timeout / HZ);
if (ret)
zfcp_fc_wka_port_put(wka_port);
 
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index b1be0233ce35..402223c95ce1 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -44,6 +44,8 @@ struct bsg_job {
 
struct kref kref;
 
+   unsigned int timeout;
+
/* Transport/driver specific request/reply structs */
void *request;
void *reply;
-- 
2.14.2



bsg-lib interface cleanup V2

2018-03-13 Thread Christoph Hellwig
Hi all,

this series cleans up various abuses of the bsg interfaces, and then
splits bsg for SCSI passthrough from bsg for arbitrary transport
passthrough.  This removes the scsi_request abuse in bsg-lib that is
very confusing, and also makes sure we can sanity check the requests
we get.  The current code will happily execute scsi commands against
bsg-lib queues, and transport pass through against scsi nodes, without
any indication to the driver that we are doing the wrong thing.

Changes since V1:
 - dropped various patches merged already
 - use job to get the payload length in bsg_transport_complete_rq
 - renamed the ptr64 macro to uptr64
 - make bsg_scsi_complete_rq and bsg_transport_complete_rq look more
   similar
 - trivial cleanup in bsg_map_hdr


Re: [PATCH v2] block: bio_check_eod() needs to consider partition

2018-03-13 Thread Bart Van Assche
On Tue, 2018-03-13 at 07:49 -0700, Jens Axboe wrote:
> On 3/13/18 2:18 AM, Christoph Hellwig wrote:
> > bio_check_eod() should check partiton size not the whole disk if
> > bio->bi_partno is non-zero.  Does this by taking the call to bio_check_eod
> > into blk_partition_remap.
> 
> Applied, thanks.

Hello Christoph and Jens,

I think that this patch introduces a severe regression: with this patch applied
a VM that I use for kernel testing doesn't boot anymore. If I revert this patch
that VM boots fine again. This is what I see on the serial console with this
patch applied:

virtio_blk virtio4: [vda] 41943040 512-byte logical blocks (21.5 GB/20.0 GiB)
 vda: vda1
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical block 5242352, async page read
scsi host2: Virtio SCSI HBA
scsi 2:0:0:0: Direct-Access QEMU QEMU HARDDISK2.5+ PQ: 0 ANSI: 5
sr 0:0:1:0: [sr0] scsi3-mmc drive: 4x/4x cd/rw xa/form2 tray
cdrom: Uniform CD-ROM driver Revision: 3.20
sr 0:0:1:0: Attached scsi CD-ROM sr0
sg_inq (142) used greatest stack depth: 26912 bytes left
systemd-udevd[90]: could not create device: Invalid argument
systemd-udevd[110]: Inserted 'virtio_scsi'
systemd-udevd[90]: Validate module index
systemd-udevd[110]: passed device to netlink monitor 0x55642693eed0
systemd-udevd[90]: Check if link configuration needs reloading.
systemd-udevd[90]: passed 146 byte device to netlink monitor 0x55642692e7d0
systemd-udevd[123]: passed device to netlink monitor 0x556426980e80
systemd-udevd[90]: passed 176 byte device to netlink monitor 0x55642692e7d0
systemd-udevd[123]: passed device to netlink monitor 0x556426980e80
systemd-udevd[90]: passed 179 byte device to netlink monitor 0x55642692e7d0
sd 2:0:0:0: Power-on or device reset occurred
sd 2:0:0:0: [sda] 5242880 512-byte logical blocks: (2.68 GB/2.50 GiB)
sd 2:0:0:0: [sda] Write Protect is off
sd 2:0:0:0: [sda] Mode Sense: 63 00 00 08
sd 2:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support 
DPO or FUA
systemd-udevd (123) used greatest stack depth: 26576 bytes left
systemd-udevd (121) used greatest stack depth: 24928 bytes left
systemd-udevd (110) used greatest stack depth: 24848 bytes left
sd 2:0:0:0: [sda] Attached SCSI disk
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical block 5242352, async page read
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical block 5242352, async page read
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical block 5242352, async page read
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical block 5242352, async page read
systemd-udevd[90]: Validate module index
systemd-udevd[90]: Check if link configuration needs reloading.
systemd-udevd[209]: IMPORT '/usr/bin/sg_inq --export /dev/sda' 
/lib/udev/rules.d/55-scsi-sg3_id.rules:10
systemd-udevd[211]: starting '/usr/bin/sg_inq --export /dev/sda'
systemd-udevd[210]: IMPORT builtin 'path_id' 
/lib/udev/rules.d/60-persistent-storage.rules:66
systemd-udevd[210]: LINK 'disk/by-path/pci-:00:07.0' 
/lib/udev/rules.d/60-persistent-storage.rules:68
systemd-udevd[210]: LINK 'disk/by-path/virtio-pci-:00:07.0' 
/lib/udev/rules.d/60-persistent-storage.rules:72
systemd-udevd[209]: '/usr/bin/sg_inq --export /dev/sda'(out) 'SCSI_TPGS=0'
systemd-udevd[210]: IMPORT builtin 'blkid' 
/lib/udev/rules.d/60-persistent-storage.rules:83
systemd-udevd[209]: '/usr/bin/sg_inq --export /dev/sda'(out) 'SCSI_TYPE=disk'
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical block 5242352, async page read
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical block 5242352, async page read
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical block 5242352, async page read
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
attempt to access beyond end of device
vda: rw=0, want=41940872, limit=41938944
Buffer I/O error on dev vda1, logical b

Re: [PATCH 0/8] block: sed-opal: support write to shadow mbr

2018-03-13 Thread Scott Bauer
On Tue, Mar 13, 2018 at 02:08:53PM +0100, Jonas Rabenstein wrote:
> Hi,
> this patchset adds support to write data into the shadow mbr of sed-opal
> enabled devices. They apply cleanly on today's next tree (next-20180313)
> and require the u64 short atom length fix from [0], as that is still
> missing in that tree. As I can only test on my single sed-opal enabled
> Samsung 850 Evo on amd64, tests on other hardware would be appreciated.

Generally the series looks fine; there are a few nits I've sent out.
I will test and get back to you in the next day or two with any further
comments.


Re: [PATCH 8/8] block: sed-opal: ioctl for writing to shadow mbr

2018-03-13 Thread Scott Bauer
On Tue, Mar 13, 2018 at 02:09:01PM +0100, Jonas Rabenstein wrote:
> Allow modification of the shadow mbr. If the shadow mbr is not marked as
> done, this data will be presented read-only as the device content. Only
> after marking the shadow mbr as done and unlocking a locking range does the
> actual content become accessible.
> 


> Signed-off-by: Jonas Rabenstein 
> +static int opal_write_shadow_mbr(struct opal_dev *dev,
> +  struct opal_shadow_mbr *info)
> +{
> + const struct opal_step mbr_steps[] = {
> + { opal_discovery0, },
> + { start_admin1LSP_opal_session, &info->key },
> + { write_shadow_mbr, info },
> + { end_opal_session, },
> + { NULL, }
> + };
> + int ret;
> +
> + if (info->size == 0)
> + return 0;

We need to bound this to some maximum. I assume we'll at some point come
across a controller with crappy firmware that won't check this against the
MBR Table size, and the user will either brick their drive or overwrite
their data.

We can get the size of the MBR Table, it seems, but I'm not sure how hard it
is to pull that table yet.

TCG SAS 5.7.3.6:
The size of the MBR Table is retrievable from the "Table" table of the SP that 
incorporates the Locking Template.

As always the TCG spec is super helpful /s.

I will see how to do this and if it's worth it.
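
A bound along those lines could be as simple as the sketch below; note that
the constant is a made-up placeholder, since the real limit would have to
come from the MBR Table size reported by the device:

/* placeholder cap until the MBR Table size can be queried from the drive */
#define OPAL_SHADOW_MBR_MAX_SIZE	(128 * 1024 * 1024)

static int opal_check_mbr_write_size(const struct opal_shadow_mbr *info)
{
	if (info->size == 0)
		return 0;		/* nothing to write */
	if (info->size > OPAL_SHADOW_MBR_MAX_SIZE)
		return -EINVAL;
	return 0;
}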


> diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
> index 0cb9890cdc04..c2669ebff681 100644
> --- a/include/uapi/linux/sed-opal.h
> +++ b/include/uapi/linux/sed-opal.h
> @@ -104,6 +104,13 @@ struct opal_mbr_data {
>   __u8 __align[7];
>  };
>  
> +struct opal_shadow_mbr {
> + struct opal_key key;
> + const __u8 *data;

 Please use a u64 here for the data and cast it to a pointer
 in the driver. We do this so we do not have to maintain a compat
 layer for 32 bit userland.
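
The suggested layout would look something like this sketch (field names follow
the quoted patch, other fields elided; the point is only the pointer/u64 swap):

struct opal_shadow_mbr {
	struct opal_key key;
	__u64 data;	/* userspace buffer address, not a kernel pointer */
	__u64 size;
};

The driver side then casts it back before use:

	const u8 __user *buf = (const u8 __user *)(uintptr_t)info->data;

Because __u64 has the same size and alignment for 32-bit and 64-bit userland,
the ioctl structure needs no compat translation.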



Re: [PATCH 3/8] block: sed-opal: unify cmd start and finalize

2018-03-13 Thread Scott Bauer
On Tue, Mar 13, 2018 at 02:08:56PM +0100, Jonas Rabenstein wrote:
> Every step starts with resetting the cmd buffer as well as the comid and
> constructs the appropriate OPAL_CALL command. Consequently, those
> actions may be combined into one generic function.
> 
> Signed-off-by: Jonas Rabenstein 
> ---
>  block/sed-opal.c | 243 +++
>  1 file changed, 63 insertions(+), 180 deletions(-)
> 
> diff --git a/block/sed-opal.c b/block/sed-opal.c
> index a228a13f0a08..22dbea7cf4d1 100644
> --- a/block/sed-opal.c
> +++ b/block/sed-opal.c
> @@ -659,6 +659,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
>   struct opal_header *hdr;
>   int err = 0;
>  
> + add_token_u8(&err, cmd, OPAL_ENDLIST);
>   add_token_u8(&err, cmd, OPAL_ENDOFDATA);
>   add_token_u8(&err, cmd, OPAL_STARTLIST);
>   add_token_u8(&err, cmd, 0);
> @@ -1001,6 +1002,21 @@ static void clear_opal_cmd(struct opal_dev *dev)
>   memset(dev->cmd, 0, IO_BUFFER_LENGTH);
>  }
>  
> +static int start_opal_cmd(struct opal_dev *dev, const u8 *uid, const u8 
> *method)
> +{
> + int err = 0;
> +
> + clear_opal_cmd(dev);
> + set_comid(dev, dev->comid);
> +
> + add_token_u8(&err, dev, OPAL_CALL);
> + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
> + add_token_bytestring(&err, dev, method, OPAL_METHOD_LENGTH);
> + add_token_u8(&err, dev, OPAL_STARTLIST);

Can you resend this patch (in a bit, I'm reviewing the rest) with a comment
here and in cmd_finalize explaining that the start_list here gets terminated
by the new opal_endlist in cmd_finalize.

I'm not a huge fan of having the start/list and end/list in separate functions,
since those are used to specify a parameter list for an opal method. I can get
over it since it cleans the code up, as long as there are comments on both
sides reminding me what they're opening/closing off.


Re: disk-io lockup in 4.14.13 kernel

2018-03-13 Thread Bart Van Assche
On Tue, 2018-03-13 at 16:59 +0200, Jaco Kroon wrote:
> I quickly checked my dmesg logs and I'm not seeing that particular
> message; could it be that newer kernels only started warning about it?

Hello Jaco,

That message only appears if CONFIG_DEBUG_ATOMIC_SLEEP (sleep inside atomic)
is enabled in the kernel config. The kernel configuration options I enable to
test kernel code can be found below. Please note that some of these options
slow down the kernel significantly, so these options should probably not be
enabled on a production system.

CONFIG_BLK_DEBUG_FS
CONFIG_DEBUG_ATOMIC_SLEEP
CONFIG_DEBUG_BOOT_PARAMS
CONFIG_DEBUG_BUGVERBOSE
CONFIG_DEBUG_FS
CONFIG_DEBUG_INFO
CONFIG_DEBUG_INFO_DWARF4
CONFIG_DEBUG_INFO_REDUCED
CONFIG_DEBUG_KERNEL
CONFIG_DEBUG_KMEMLEAK
CONFIG_DEBUG_LIST
CONFIG_DEBUG_LOCK_ALLOC
CONFIG_DEBUG_MUTEXES
CONFIG_DEBUG_OBJECTS
CONFIG_DEBUG_OBJECTS_RCU_HEAD
CONFIG_DEBUG_PAGEALLOC
CONFIG_DEBUG_PER_CPU_MAPS
CONFIG_DEBUG_PI_LIST
CONFIG_DEBUG_PREEMPT
CONFIG_DEBUG_PREEMPT_VOLUNTARY
CONFIG_DEBUG_SG
CONFIG_DEBUG_SPINLOCK
CONFIG_DEBUG_STACKOVERFLOW
CONFIG_DEBUG_STACK_USAGE
CONFIG_DETECT_HUNG_TASK
CONFIG_DYNAMIC_DEBUG
CONFIG_HARDLOCKUP_DETECTOR
CONFIG_KASAN
CONFIG_MAGIC_SYSRQ
CONFIG_PREEMPT
CONFIG_PROVE_LOCKING
CONFIG_PROVE_RCU
CONFIG_SCHED_DEBUG
CONFIG_SLUB
CONFIG_SLUB_DEBUG_ON
CONFIG_WQ_WATCHDOG

Bart.





Re: disk-io lockup in 4.14.13 kernel

2018-03-13 Thread Jaco Kroon
Hi Bart,


On 13/03/2018 16:10, Bart Van Assche wrote:
> On Tue, 2018-03-13 at 11:30 +0200, Jaco Kroon wrote:
>> On 11/03/2018 07:00, Bart Van Assche wrote:
>>> Did I see correctly that /dev/sdm is behind a MPT SAS controller? You may
>>> want to contact the authors of this driver and Cc the linux-scsi mailing
>>> list. Sorry but I'm not familiar with the mpt3sas driver myself.
>> You did see correctly, all drives are behind the MPT SAS.  If there is
>> in fact a problem with the driver (or the controller itself for that
>> matter) it would explain things.  It would also explain why we don't see
>> this problem on other hosts.
> You may want to have a look at the following report, something I ran into
> myself yesterday: https://marc.info/?l=linux-scsi&m=152087904024146.
I quickly checked my dmesg logs and I'm not seeing that particular
message; could it be that newer kernels only started warning about it?

I'm not seeing any references to the
scsih_get_enclosure_logicalid_chassis_slot() function in the code here,
so there is obviously a newer driver in newer kernels.  The scsih_get_*
calls I do see are in mpt3sas_scsih.c and I'm not seeing (direct) calls
to _config_request at all.  At least it seems there is activity on the
driver, which is always a good sign that someone is attending to problems.

I'll queue an upgrade to 4.16 in the meantime, for when it's released in a
couple of days/weeks.

Kind Regards,
Jaco


Re: [Possible REGRESSION, 4.16-rc4] Error updating SMART data during runtime and could not connect to lvmetad at some boot attempts

2018-03-13 Thread Bart Van Assche
On Tue, 2018-03-13 at 22:32 +0800, Ming Lei wrote:
> On Tue, Mar 13, 2018 at 02:08:23PM +0100, Martin Steigerwald wrote:
> > Ming and Bart, I added you to cc because I was in contact with you about
> > another blk-mq report; please feel free to adapt.
> 
Looks like RIP points to scsi_times_out+0x17/0x1d0, maybe a SCSI regression?

I think that it's much more likely that this is a block layer regression. See
e.g. "[PATCH v2] blk-mq: Fix race between resetting the timer and completion
handling" 
(https://www.mail-archive.com/linux-block@vger.kernel.org/msg18338.html).

Bart.

Re: [PATCH v2] block: bio_check_eod() needs to consider partition

2018-03-13 Thread Jens Axboe
On 3/13/18 2:18 AM, Christoph Hellwig wrote:
> bio_check_eod() should check partiton size not the whole disk if
> bio->bi_partno is non-zero.  Does this by taking the call to bio_check_eod
> into blk_partition_remap.

Applied, thanks.

-- 
Jens Axboe



Re: [Possible REGRESSION, 4.16-rc4] Error updating SMART data during runtime and could not connect to lvmetad at some boot attempts

2018-03-13 Thread Ming Lei
On Tue, Mar 13, 2018 at 02:08:23PM +0100, Martin Steigerwald wrote:
> Hans de Goede - 11.03.18, 15:37:
> > Hi Martin,
> > 
> > On 11-03-18 09:20, Martin Steigerwald wrote:
> > > Hello.
> > > 
> > > Since 4.16-rc4 (upgraded from 4.15.2 which worked) I have an issue
> > > with SMART checks occasionally failing like this:
> > > 
> > > smartd[28017]: Device: /dev/sdb [SAT], is in SLEEP mode, suspending checks
> > > udisksd[24408]: Error performing housekeeping for drive
> > > /org/freedesktop/UDisks2/drives/INTEL_SSDSA2CW300G3_[…]: Error updating
> > > SMART data: Error sending ATA command CHECK POWER MODE: Unexpected sense
> > > data returned:#012: 0e 09 0c 00  00 00 ff 00  00 00 00 00  00 00 50
> > > 00..P.#0120010: 00 00 00 00  00 00 00 00  00 00 00 00  00
> > > 00 00 00#012 (g-io-error-quark, 0) merkaba
> > > udisksd[24408]: Error performing housekeeping for drive
> > > /org/freedesktop/UDisks2/drives/Crucial_CT480M500SSD3_[…]: Error updating
> > > SMART dat a: Error sending ATA command CHECK POWER MODE: Unexpected sense
> > > data returned:#012: 01 00 1d 00  00 00 0e 09  0c 00 00 00  ff 00 00
> > > 00#0120010: 00 0 0 00 00  50 00 00 00  00 00 00 00 
> > > 00 00 00 00P...#012 (g-io-error-quark, 0)
> > > 
> > > (Intel SSD is connected via SATA, Crucial via mSATA in a ThinkPad T520)
> > > 
> > > However when I then check manually with smartctl -a | -x | -H the device
> > > reports SMART data just fine.
> > > 
> > > As smartd correctly detects that the device is in sleep mode, this may be a
> > > userspace issue in udisksd.
> > > 
> > > Also, at some boot attempts the boot hangs with a message like "could not
> > > connect to lvmetad, scanning manually for devices". I use BTRFS RAID 1
> > > on top of LVs (each on one of the SSDs), a configuration that requires a
> > > manual adaptation to the InitRAMFS in order to boot (basically vgchange -ay
> > > before btrfs device scan).
> > > 
> > > I wonder whether that has to do with the new SATA LPM policy stuff, but as
> > > I had issues with
> > > 
> > >   3 => Medium power with Device Initiated PM enabled
> > > 
> > > (machine did not boot, which could also have been caused by me
> > > accidentally
> > > removing all TCP/IP network support in the kernel with that setting)
> > > 
> > > I set it back to
> > > 
> > > CONFIG_SATA_MOBILE_LPM_POLICY=0
> > > 
> > > (firmware settings)
> > 
> > Right, so at that settings the LPM policy changes are effectively
> > disabled and cannot explain your SMART issues.
> 
> Yes, I now got a photo of one of those boot failures I mentioned, and it seems
> to be related to blk-mq, as the backtrace contains "blk_mq_terminate_expired".
>
> I added the screenshot to my bug report.
> 
> [Possible REGRESSION, 4.16-rc4] Error updating SMART data during runtime and 
> boot failures with blk_mq_terminate_expired in backtrace
> https://bugzilla.kernel.org/show_bug.cgi?id=199077
> 
> Hans, I will test your LPM policy horkage patch for the Crucial m500 at a
> later time. I first wanted to add the photo of the boot failure to the bug
> report.
> 
> Ming and Bart, I added you to cc because I was in contact with you about
> another blk-mq report; please feel free to adapt.

Looks like RIP points to scsi_times_out+0x17/0x1d0, maybe a SCSI regression?

Thanks,
Ming


Re: disk-io lockup in 4.14.13 kernel

2018-03-13 Thread Bart Van Assche
On Tue, 2018-03-13 at 11:30 +0200, Jaco Kroon wrote:
> On 11/03/2018 07:00, Bart Van Assche wrote:
> > Did I see correctly that /dev/sdm is behind a MPT SAS controller? You may
> > want to contact the authors of this driver and Cc the linux-scsi mailing
> > list. Sorry but I'm not familiar with the mpt3sas driver myself.
> 
> You did see correctly, all drives are behind the MPT SAS.  If there is
> in fact a problem with the driver (or the controller itself for that
> matter) it would explain things.  It would also explain why we don't see
> this problem on other hosts.

You may want to have a look at the following report, something I ran into
myself yesterday: https://marc.info/?l=linux-scsi&m=152087904024146.

Bart.





[PATCH 3/8] block: sed-opal: unify cmd start and finalize

2018-03-13 Thread Jonas Rabenstein
Every step starts with resetting the cmd buffer as well as the comid and
constructs the appropriate OPAL_CALL command. Consequently, those
actions may be combined into one generic function.

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c | 243 +++
 1 file changed, 63 insertions(+), 180 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index a228a13f0a08..22dbea7cf4d1 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -659,6 +659,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
struct opal_header *hdr;
int err = 0;
 
+   add_token_u8(&err, cmd, OPAL_ENDLIST);
add_token_u8(&err, cmd, OPAL_ENDOFDATA);
add_token_u8(&err, cmd, OPAL_STARTLIST);
add_token_u8(&err, cmd, 0);
@@ -1001,6 +1002,21 @@ static void clear_opal_cmd(struct opal_dev *dev)
memset(dev->cmd, 0, IO_BUFFER_LENGTH);
 }
 
+static int start_opal_cmd(struct opal_dev *dev, const u8 *uid, const u8 
*method)
+{
+   int err = 0;
+
+   clear_opal_cmd(dev);
+   set_comid(dev, dev->comid);
+
+   add_token_u8(&err, dev, OPAL_CALL);
+   add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+   add_token_bytestring(&err, dev, method, OPAL_METHOD_LENGTH);
+   add_token_u8(&err, dev, OPAL_STARTLIST);
+
+   return err;
+}
+
 static int start_opal_session_cont(struct opal_dev *dev)
 {
u32 hsn, tsn;
@@ -1063,21 +1079,13 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
 static int gen_key(struct opal_dev *dev, void *data)
 {
u8 uid[OPAL_UID_LENGTH];
-   int err = 0;
-
-   clear_opal_cmd(dev);
-   set_comid(dev, dev->comid);
+   int err;
 
memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len));
kfree(dev->prev_data);
dev->prev_data = NULL;
 
-   add_token_u8(&err, dev, OPAL_CALL);
-   add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
-   add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY],
-OPAL_UID_LENGTH);
-   add_token_u8(&err, dev, OPAL_STARTLIST);
-   add_token_u8(&err, dev, OPAL_ENDLIST);
+   err = start_opal_cmd(dev, uid, opalmethod[OPAL_GENKEY]);
 
if (err) {
pr_debug("Error building gen key command\n");
@@ -1115,21 +1123,14 @@ static int get_active_key_cont(struct opal_dev *dev)
 static int get_active_key(struct opal_dev *dev, void *data)
 {
u8 uid[OPAL_UID_LENGTH];
-   int err = 0;
+   int err;
u8 *lr = data;
 
-   clear_opal_cmd(dev);
-   set_comid(dev, dev->comid);
-
err = build_locking_range(uid, sizeof(uid), *lr);
if (err)
return err;
 
-   err = 0;
-   add_token_u8(&err, dev, OPAL_CALL);
-   add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
-   add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
-   add_token_u8(&err, dev, OPAL_STARTLIST);
+   err = start_opal_cmd(dev, uid, opalmethod[OPAL_GET]);
add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, 3); /* startCloumn */
@@ -1140,7 +1141,6 @@ static int get_active_key(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, 10); /* ActiveKey */
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_ENDLIST);
-   add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
pr_debug("Error building get active key command\n");
return err;
@@ -1153,13 +1153,10 @@ static int generic_lr_enable_disable(struct opal_dev *dev,
 u8 *uid, bool rle, bool wle,
 bool rl, bool wl)
 {
-   int err = 0;
+   int err;
 
-   add_token_u8(&err, dev, OPAL_CALL);
-   add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
-   add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+   err = start_opal_cmd(dev, uid, opalmethod[OPAL_SET]);
 
-   add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_VALUES);
add_token_u8(&err, dev, OPAL_STARTLIST);
@@ -1186,7 +1183,6 @@ static int generic_lr_enable_disable(struct opal_dev *dev,
 
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
-   add_token_u8(&err, dev, OPAL_ENDLIST);
return err;
 }
 
@@ -1207,10 +1203,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
u8 uid[OPAL_UID_LENGTH];
struct opal_user_lr_setup *setup = data;
u8 lr;
-   int err = 0;
-
-   clear_opal_cmd(dev);
-   set_comid(dev, dev->comid);
+   int err;
 
lr = setup->session.opal_key.lr;
err = build_locking_range(uid, sizeof(uid), lr);
@@ -1220,12 +1213,8 @@ static int setup_locking_range(struc

[PATCH 2/8] block: sed-opal: unify space check in add_token_*

2018-03-13 Thread Jonas Rabenstein
All add_token_* functions have a common set of conditions that have to
be checked. Use a common function for those checks in order to avoid
different behaviour.

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c | 25 -
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index f9d94ce64b08..a228a13f0a08 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -510,15 +510,24 @@ static int opal_discovery0(struct opal_dev *dev, void *data)
return opal_discovery0_end(dev);
 }
 
-static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
+static bool can_add(int *err, struct opal_dev *cmd, size_t len)
 {
if (*err)
-   return;
-   if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
-   pr_debug("Error adding u8: end of buffer.\n");
+   return false;
+
+   if (len > IO_BUFFER_LENGTH || cmd->pos >= IO_BUFFER_LENGTH - len) {
+   pr_debug("Error adding %zu bytes: end of buffer.\n", len);
*err = -ERANGE;
-   return;
+   return false;
}
+
+   return true;
+}
+
+static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
+{
+   if (!can_add(err, cmd, 1))
+   return;
cmd->cmd[cmd->pos++] = tok;
 }
 
@@ -564,9 +573,8 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
msb = fls(number);
len = DIV_ROUND_UP(msb, 4);
 
-   if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
+   if (!can_add(err, cmd, len + 1)) {
pr_debug("Error adding u64: end of buffer.\n");
-   *err = -ERANGE;
return;
}
add_short_atom_header(cmd, false, false, len);
@@ -590,9 +598,8 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
is_short_atom = false;
}
 
-   if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) {
+   if (!can_add(err, cmd, header_len + len)) {
pr_debug("Error adding bytestring: end of buffer.\n");
-   *err = -ERANGE;
return;
}
 
-- 
2.16.1



[PATCH 5/8] block: sed-opal: print failed function address

2018-03-13 Thread Jonas Rabenstein
Add the function address (and, if available, its symbol) to the message if a
step function fails.

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 5a448a3ba1df..f7925e6f607c 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -394,8 +394,8 @@ static int next(struct opal_dev *dev)
 
error = step->fn(dev, step->data);
if (error) {
-   pr_debug("Error on step function: %d with error %d: %s\n",
-state, error,
+   pr_info("Error on step function %pS: %d with error %d: %s\n",
+step->fn, state, error,
 opal_error_to_human(error));
 
/* For each OPAL command we do a discovery0 then we
-- 
2.16.1



[PATCH 4/8] block: sed-opal: unify error handling of responses

2018-03-13 Thread Jonas Rabenstein
Although response_get_token was already in place, its functionality was
duplicated within response_get_{u64,bytestring} with the same error
handling. Unify the handling by reusing response_get_token within the
other functions.
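
The unification leans on the kernel's ERR_PTR convention, where an errno
is encoded into a pointer return value (generic sketch of the idiom;
lookup, table and table_size are made-up names):

	struct foo *lookup(unsigned int n)
	{
		if (n >= table_size)
			return ERR_PTR(-EINVAL);	/* errno hidden in the pointer */
		return &table[n];
	}

	p = lookup(n);
	if (IS_ERR(p))
		return PTR_ERR(p);	/* recover the errno */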

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c | 28 ++--
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 22dbea7cf4d1..5a448a3ba1df 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -698,6 +698,11 @@ static const struct opal_resp_tok *response_get_token(
 {
const struct opal_resp_tok *tok;
 
+   if (!resp) {
+   pr_debug("Response is NULL\n");
+   return ERR_PTR(-EINVAL);
+   }
+
if (n >= resp->num) {
pr_debug("Token number doesn't exist: %d, resp: %d\n",
 n, resp->num);
@@ -883,18 +888,10 @@ static size_t response_get_string(const struct 
parsed_resp *resp, int n,
const struct opal_resp_tok *token;
 
*store = NULL;
-   if (!resp) {
-   pr_debug("Response is NULL\n");
+   token = response_get_token(resp, n);
+   if (IS_ERR(token))
return 0;
-   }
 
-   if (n > resp->num) {
-   pr_debug("Response has %d tokens. Can't access %d\n",
-resp->num, n);
-   return 0;
-   }
-
-   token = &resp->toks[n];
if (token->type != OPAL_DTA_TOKENID_BYTESTRING) {
pr_debug("Token is not a byte string!\n");
return 0;
@@ -922,16 +919,11 @@ static size_t response_get_string(const struct 
parsed_resp *resp, int n,
 
 static u64 response_get_u64(const struct parsed_resp *resp, int n)
 {
-   if (!resp) {
-   pr_debug("Response is NULL\n");
-   return 0;
-   }
+   const struct opal_resp_tok *token;
 
-   if (n > resp->num) {
-   pr_debug("Response has %d tokens. Can't access %d\n",
-resp->num, n);
+   token = response_get_token(resp, n);
+   if (IS_ERR(token))
return 0;
-   }
 
if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
pr_debug("Token is not unsigned it: %d\n",
-- 
2.16.1



[PATCH 7/8] block: sed-opal: add ioctl for done-mark of shadow mbr

2018-03-13 Thread Jonas Rabenstein
Enable users to mark the shadow mbr as done without completely
deactivating the shadow mbr feature. This may be useful on reboots where
the power to the disk is not cut in between and the shadow mbr stores
the required boot files. It also saves the (few) commands required to
enable the feature if it is already enabled and one only wants to mark
the shadow mbr as done.
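
In user space the new ioctl could be driven roughly as follows (untested
sketch; error handling is trimmed and the opal_key layout is assumed to
match the existing uapi header):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/sed-opal.h>

	int mark_mbr_done(int fd, const char *passphrase)
	{
		struct opal_mbr_data mbr = { 0 };

		mbr.key.key_len = strlen(passphrase);
		memcpy(mbr.key.key, passphrase, mbr.key.key_len);
		mbr.enable_disable = OPAL_MBR_ENABLE;	/* mapped to the done token */

		return ioctl(fd, IOC_OPAL_MBR_STATUS, &mbr);
	}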

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c  | 33 +++--
 include/linux/sed-opal.h  |  1 +
 include/uapi/linux/sed-opal.h |  1 +
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index f169e621279a..b201c96d23a3 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1987,13 +1987,39 @@ static int opal_erase_locking_range(struct opal_dev 
*dev,
 static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
  struct opal_mbr_data *opal_mbr)
 {
+   u8 token = opal_mbr->enable_disable == OPAL_MBR_ENABLE
+   ? OPAL_TRUE : OPAL_FALSE;
const struct opal_step mbr_steps[] = {
{ opal_discovery0, },
{ start_admin1LSP_opal_session, &opal_mbr->key },
-   { set_mbr_done, &opal_mbr->enable_disable },
+   { set_mbr_done, &token },
{ end_opal_session, },
{ start_admin1LSP_opal_session, &opal_mbr->key },
-   { set_mbr_enable_disable, &opal_mbr->enable_disable },
+   { set_mbr_enable_disable, &token },
+   { end_opal_session, },
+   { NULL, }
+   };
+   int ret;
+
+   if (opal_mbr->enable_disable != OPAL_MBR_ENABLE &&
+   opal_mbr->enable_disable != OPAL_MBR_DISABLE)
+   return -EINVAL;
+
+   mutex_lock(&dev->dev_lock);
+   setup_opal_dev(dev, mbr_steps);
+   ret = next(dev);
+   mutex_unlock(&dev->dev_lock);
+   return ret;
+}
+
+static int opal_mbr_status(struct opal_dev *dev, struct opal_mbr_data 
*opal_mbr)
+{
+   u8 token = opal_mbr->enable_disable == OPAL_MBR_ENABLE
+   ? OPAL_TRUE : OPAL_FALSE;
+   const struct opal_step mbr_steps[] = {
+   { opal_discovery0, },
+   { start_admin1LSP_opal_session, &opal_mbr->key },
+   { set_mbr_done, &token },
{ end_opal_session, },
{ NULL, }
};
@@ -2339,6 +2365,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, 
void __user *arg)
case IOC_OPAL_ENABLE_DISABLE_MBR:
ret = opal_enable_disable_shadow_mbr(dev, p);
break;
+   case IOC_OPAL_MBR_STATUS:
+   ret = opal_mbr_status(dev, p);
+   break;
case IOC_OPAL_ERASE_LR:
ret = opal_erase_locking_range(dev, p);
break;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 04b124fca51e..b38dc602cae3 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -47,6 +47,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
case IOC_OPAL_ENABLE_DISABLE_MBR:
case IOC_OPAL_ERASE_LR:
case IOC_OPAL_SECURE_ERASE_LR:
+   case IOC_OPAL_MBR_STATUS:
return true;
}
return false;
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 627624d35030..0cb9890cdc04 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -116,5 +116,6 @@ struct opal_mbr_data {
 #define IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data)
 #define IOC_OPAL_ERASE_LR   _IOW('p', 230, struct opal_session_info)
 #define IOC_OPAL_SECURE_ERASE_LR_IOW('p', 231, struct opal_session_info)
+#define IOC_OPAL_MBR_STATUS _IOW('p', 232, struct opal_mbr_data)
 
 #endif /* _UAPI_SED_OPAL_H */
-- 
2.16.1



[PATCH 8/8] block: sed-opal: ioctl for writing to shadow mbr

2018-03-13 Thread Jonas Rabenstein
Allow modification of the shadow mbr. If the shadow mbr is not marked
as done, this data will be presented read-only as the device content.
Only after marking the shadow mbr as done and unlocking a locking range
does the actual device content become accessible.
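
In user space, uploading an image could look roughly like this (untested
sketch; error handling is trimmed and the opal_key layout is assumed to
match the uapi header):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/sed-opal.h>

	int upload_shadow_mbr(int fd, const char *passphrase,
			      const unsigned char *image, unsigned long long len)
	{
		struct opal_shadow_mbr shadow = { 0 };

		shadow.key.key_len = strlen(passphrase);
		memcpy(shadow.key.key, passphrase, shadow.key.key_len);
		shadow.data = image;	/* boot image in memory */
		shadow.offset = 0;
		shadow.size = len;

		return ioctl(fd, IOC_OPAL_WRITE_SHADOW_MBR, &shadow);
	}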

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c  | 74 +++
 include/linux/sed-opal.h  |  1 +
 include/uapi/linux/sed-opal.h |  8 +
 3 files changed, 83 insertions(+)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index b201c96d23a3..1ec3734af656 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1491,6 +1491,52 @@ static int set_mbr_enable_disable(struct opal_dev *dev, 
void *data)
return finalize_and_send(dev, parse_and_check_status);
 }
 
+static int write_shadow_mbr(struct opal_dev *dev, void *data)
+{
+   struct opal_shadow_mbr *shadow = data;
+   size_t off;
+   u64 len;
+   int err = 0;
+   u8 *payload;
+
+   /* FIXME: this is the maximum we can use for IO_BUFFER_LENGTH=2048.
+*Instead of having constant, it would be nice to compute the
+*actual value depending on IO_BUFFER_LENGTH
+*/
+   len = 1950;
+
+   /* do the actual transmission(s) */
+   for (off = 0 ; off < shadow->size; off += len) {
+   len = min(len, shadow->size - off);
+
+   pr_debug("MBR: write bytes %zu+%llu/%llu\n",
+off, len, shadow->size);
+   err = start_opal_cmd(dev, opaluid[OPAL_MBR],
+opalmethod[OPAL_SET]);
+   add_token_u8(&err, dev, OPAL_STARTNAME);
+   add_token_u8(&err, dev, OPAL_WHERE);
+   add_token_u64(&err, dev, shadow->offset + off);
+   add_token_u8(&err, dev, OPAL_ENDNAME);
+
+   add_token_u8(&err, dev, OPAL_STARTNAME);
+   add_token_u8(&err, dev, OPAL_VALUES);
+   payload = add_bytestring_header(&err, dev, len);
+   if (!payload)
+   break;
+   if (copy_from_user(payload, shadow->data + off, len))
+   err = -EFAULT;
+
+   add_token_u8(&err, dev, OPAL_ENDNAME);
+   if (err)
+   break;
+
+   err = finalize_and_send(dev, parse_and_check_status);
+   if (err)
+   break;
+   }
+   return err;
+}
+
 static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
  struct opal_dev *dev)
 {
@@ -2036,6 +2082,31 @@ static int opal_mbr_status(struct opal_dev *dev, struct 
opal_mbr_data *opal_mbr)
return ret;
 }
 
+static int opal_write_shadow_mbr(struct opal_dev *dev,
+struct opal_shadow_mbr *info)
+{
+   const struct opal_step mbr_steps[] = {
+   { opal_discovery0, },
+   { start_admin1LSP_opal_session, &info->key },
+   { write_shadow_mbr, info },
+   { end_opal_session, },
+   { NULL, }
+   };
+   int ret;
+
+   if (info->size == 0)
+   return 0;
+
+   if (!access_ok(VERIFY_READ, info->data, info->size))
+   return -EINVAL;
+
+   mutex_lock(&dev->dev_lock);
+   setup_opal_dev(dev, mbr_steps);
+   ret = next(dev);
+   mutex_unlock(&dev->dev_lock);
+   return ret;
+}
+
 static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
 {
struct opal_suspend_data *suspend;
@@ -2368,6 +2439,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, 
void __user *arg)
case IOC_OPAL_MBR_STATUS:
ret = opal_mbr_status(dev, p);
break;
+   case IOC_OPAL_WRITE_SHADOW_MBR:
+   ret = opal_write_shadow_mbr(dev, p);
+   break;
case IOC_OPAL_ERASE_LR:
ret = opal_erase_locking_range(dev, p);
break;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index b38dc602cae3..cf08cdc13cbd 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -47,6 +47,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
case IOC_OPAL_ENABLE_DISABLE_MBR:
case IOC_OPAL_ERASE_LR:
case IOC_OPAL_SECURE_ERASE_LR:
+   case IOC_OPAL_WRITE_SHADOW_MBR:
case IOC_OPAL_MBR_STATUS:
return true;
}
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 0cb9890cdc04..c2669ebff681 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -104,6 +104,13 @@ struct opal_mbr_data {
__u8 __align[7];
 };
 
+struct opal_shadow_mbr {
+   struct opal_key key;
+   const __u8 *data;
+   __u64 offset;
+   __u64 size;
+};
+
 #define IOC_OPAL_SAVE  _IOW('p', 220, struct opal_lock_unlock)
 #define IOC_OPAL_LOCK_UNLOCK   _IOW('p', 221, struct opal_lock_unlock)
 #define IOC_OPA

[PATCH 6/8] block: sed-opal: split generation of bytestring header and content

2018-03-13 Thread Jonas Rabenstein
Split the header generation from the (normal) memcpy part when a
bytestring is copied into the command buffer. This allows in-place
generation of the bytestring content. For example, copy_from_user may be
used without an intermediate buffer.
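
A caller can then fill the reserved region directly, along these lines
(sketch; the shadow-mbr write later in this series is the intended
user):

	payload = add_bytestring_header(&err, cmd, len);
	if (payload && copy_from_user(payload, user_buf, len))
		err = -EFAULT;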

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c | 23 +++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index f7925e6f607c..f169e621279a 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -584,14 +584,11 @@ static void add_token_u64(int *err, struct opal_dev *cmd, 
u64 number)
}
 }
 
-static void add_token_bytestring(int *err, struct opal_dev *cmd,
-const u8 *bytestring, size_t len)
+static u8 *add_bytestring_header(int *err, struct opal_dev *cmd, size_t len)
 {
size_t header_len = 1;
bool is_short_atom = true;
-
-   if (*err)
-   return;
+   char *start;
 
if (len & ~SHORT_ATOM_LEN_MASK) {
header_len = 2;
@@ -600,17 +597,27 @@ static void add_token_bytestring(int *err, struct 
opal_dev *cmd,
 
if (!can_add(err, cmd, header_len + len)) {
pr_debug("Error adding bytestring: end of buffer.\n");
-   return;
+   return NULL;
}
 
if (is_short_atom)
add_short_atom_header(cmd, true, false, len);
else
add_medium_atom_header(cmd, true, false, len);
-
-   memcpy(&cmd->cmd[cmd->pos], bytestring, len);
+   start = &cmd->cmd[cmd->pos];
cmd->pos += len;
+   return start;
+}
 
+static void add_token_bytestring(int *err, struct opal_dev *cmd,
+const u8 *bytestring, size_t len)
+{
+   u8 *start;
+
+   start = add_bytestring_header(err, cmd, len);
+   if (!start)
+   return;
+   memcpy(start, bytestring, len);
 }
 
 static int build_locking_range(u8 *buffer, size_t length, u8 lr)
-- 
2.16.1



[PATCH 1/8] block: sed-opal: use correct macro for method length

2018-03-13 Thread Jonas Rabenstein
Although the values of OPAL_UID_LENGTH and OPAL_METHOD_LENGTH are the
same, it is odd to use OPAL_UID_LENGTH for the definition of the
methods.

Signed-off-by: Jonas Rabenstein 
---
 block/sed-opal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 2158624013b3..f9d94ce64b08 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -181,7 +181,7 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
  * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
  * Section: 6.3 Assigned UIDs
  */
-static const u8 opalmethod[][OPAL_UID_LENGTH] = {
+static const u8 opalmethod[][OPAL_METHOD_LENGTH] = {
[OPAL_PROPERTIES] =
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 },
[OPAL_STARTSESSION] =
-- 
2.16.1



[PATCH 0/8] block: sed-opal: support write to shadow mbr

2018-03-13 Thread Jonas Rabenstein
Hi,
this patchset adds support for writing data into the shadow mbr of
sed-opal enabled devices. The patches apply cleanly on today's next tree
(next-20180313) and require the u64 short atom length fix from [0], as
that is still missing in that tree. As I can only test on my single
sed-opal enabled Samsung 850 Evo on amd64, tests on other hardware would
be appreciated.

The first six patches provide helper functions that are used in the
follow-up patches, and do some refactoring and unification of the
existing code.

With the seventh patch, a new ioctl is added to mark the shadow mbr as
done without having to enable/disable the shadow mbr feature as a whole.

Finally, the last patch adds an ioctl to write data into the shadow mbr.

A modified version of the user space tools with support for those ioctls
can be found at [1].

Looking forward to feedback and suggestions for improvement,
 Jonas

[0] https://lkml.org/lkml/2018/3/7/534
[1] https://github.com/ghostav/sed-opal-temp

Jonas Rabenstein (8):
  block: sed-opal: use correct macro for method length
  block: sed-opal: unify space check in add_token_*
  block: sed-opal: unify cmd start and finalize
  block: sed-opal: unify error handling of responses
  block: sed-opal: print failed function address
  block: sed-opal: split generation of bytestring header and content
  block: sed-opal: add ioctl for done-mark of shadow mbr
  block: sed-opal: ioctl for writing to shadow mbr

 block/sed-opal.c  | 432 +-
 include/linux/sed-opal.h  |   2 +
 include/uapi/linux/sed-opal.h |   9 +
 3 files changed, 223 insertions(+), 220 deletions(-)

-- 
2.16.1



Re: [Possible REGRESSION, 4.16-rc4] Error updating SMART data during runtime and could not connect to lvmetad at some boot attempts

2018-03-13 Thread Martin Steigerwald
Hans de Goede - 11.03.18, 15:37:
> Hi Martin,
> 
> On 11-03-18 09:20, Martin Steigerwald wrote:
> > Hello.
> > 
> > Since 4.16-rc4 (upgraded from 4.15.2 which worked) I have an issue
> > with SMART checks occasionally failing like this:
> > 
> > smartd[28017]: Device: /dev/sdb [SAT], is in SLEEP mode, suspending checks
> > udisksd[24408]: Error performing housekeeping for drive
> > /org/freedesktop/UDisks2/drives/INTEL_SSDSA2CW300G3_[…]: Error updating
> > SMART data: Error sending ATA command CHECK POWER MODE: Unexpected sense
> > data returned:#012: 0e 09 0c 00  00 00 ff 00  00 00 00 00  00 00 50
> > 00..P.#0120010: 00 00 00 00  00 00 00 00  00 00 00 00  00
> > 00 00 00#012 (g-io-error-quark, 0) merkaba
> > udisksd[24408]: Error performing housekeeping for drive
> > /org/freedesktop/UDisks2/drives/Crucial_CT480M500SSD3_[…]: Error updating
> > SMART dat a: Error sending ATA command CHECK POWER MODE: Unexpected sense
> > data returned:#012: 01 00 1d 00  00 00 0e 09  0c 00 00 00  ff 00 00
> > 00#0120010: 00 0 0 00 00  50 00 00 00  00 00 00 00 
> > 00 00 00 00P...#012 (g-io-error-quark, 0)
> > 
> > (Intel SSD is connected via SATA, Crucial via mSATA in a ThinkPad T520)
> > 
> > However when I then check manually with smartctl -a | -x | -H the device
> > reports SMART data just fine.
> > 
> > As smartd correctly detects that device is in sleep mode, this may be an
> > userspace issue in udisksd.
> > 
> > Also at some boot attempts the boot hangs with a message like "could not
> > connect to lvmetad, scanning manually for devices". I use BTRFS RAID 1
> > on to LVs (each on one of the SSDs). A configuration that requires a
> > manual
> > adaption to InitRAMFS in order to boot (basically vgchange -ay before
> > btrfs device scan).
> > 
> > I wonder whether that has to do with the new SATA LPM policy stuff, but as
> > I had issues with
> > 
> >   3 => Medium power with Device Initiated PM enabled
> > 
> > (machine did not boot, which could also have been caused by me
> > accidentally
> > removing all TCP/IP network support in the kernel with that setting)
> > 
> > I set it back to
> > 
> > CONFIG_SATA_MOBILE_LPM_POLICY=0
> > 
> > (firmware settings)
> 
> Right, so at that settings the LPM policy changes are effectively
> disabled and cannot explain your SMART issues.

Yes, I now got a photo of one of those boot failures I mentioned, and it seems 
to be related to blk-mq, as the backtrace contains "blk_mq_terminate_expired".

I add the screenshot to my bug report.

[Possible REGRESSION, 4.16-rc4] Error updating SMART data during runtime and 
boot failures with blk_mq_terminate_expired in backtrace
https://bugzilla.kernel.org/show_bug.cgi?id=199077

Hans, I will test your LPM policy horkage for Crucial m500 patch at a later 
time. I first wanted to add the photo of the boot failure to the bug report.

Ming and Bart, I added you to cc, since I was in contact with you about another 
blk-mq report; please feel free to adapt.

Thanks,
-- 
Martin


RE: [PATCH] scsi: resolve COMMAND_SIZE at compile time

2018-03-13 Thread David Laight
From: Stephen Kitt
> Sent: 09 March 2018 22:34
> 
> COMMAND_SIZE currently uses an array of values in block/scsi_ioctl.c.
> A number of device_handler functions use this to initialise arrays,
> and this is flagged by -Wvla.
> 
> This patch replaces COMMAND_SIZE with a variant using a formula which
> can be resolved at compile time in cases where the opcode is fixed,
> resolving the array size and avoiding the VLA. The downside is that
> the code is less maintainable and that some call sites end up having
> more complex generated code.
> 
> Since scsi_command_size_tbl is dropped, we can remove the dependency
> on BLK_SCSI_REQUEST from drivers/target/Kconfig.
> 
> This was prompted by https://lkml.org/lkml/2018/3/7/621
> 
> Signed-off-by: Stephen Kitt 
> ---
>  block/scsi_ioctl.c |  8 
>  drivers/target/Kconfig |  1 -
>  include/scsi/scsi_common.h | 13 +++--
>  3 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
> index 60b471f8621b..b9e176e537be 100644
> --- a/block/scsi_ioctl.c
> +++ b/block/scsi_ioctl.c
> @@ -41,14 +41,6 @@ struct blk_cmd_filter {
> 
>  static struct blk_cmd_filter blk_default_cmd_filter;
> 
> -/* Command group 3 is reserved and should never be used.  */
> -const unsigned char scsi_command_size_tbl[8] =
> -{
> - 6, 10, 10, 12,
> - 16, 12, 10, 10
> -};
> -EXPORT_SYMBOL(scsi_command_size_tbl);
> -
>  #include 
> 
>  static int sg_get_version(int __user *p)
> diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig
> index 4c44d7bed01a..b5f05b60cf06 100644
> --- a/drivers/target/Kconfig
> +++ b/drivers/target/Kconfig
> @@ -4,7 +4,6 @@ menuconfig TARGET_CORE
>   depends on SCSI && BLOCK
>   select CONFIGFS_FS
>   select CRC_T10DIF
> - select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
>   select SGL_ALLOC
>   default n
>   help
> diff --git a/include/scsi/scsi_common.h b/include/scsi/scsi_common.h
> index 731ac09ed231..48d950666376 100644
> --- a/include/scsi/scsi_common.h
> +++ b/include/scsi/scsi_common.h
> @@ -15,8 +15,17 @@ scsi_varlen_cdb_length(const void *hdr)
>   return ((struct scsi_varlen_cdb_hdr *)hdr)->additional_cdb_length + 8;
>  }
> 
> -extern const unsigned char scsi_command_size_tbl[8];
> -#define COMMAND_SIZE(opcode) scsi_command_size_tbl[((opcode) >> 5) & 7]
> +/*
> + * SCSI command sizes are as follows, in bytes, for fixed size commands, per
> + * group: 6, 10, 10, 12, 16, 12, 10, 10. The top three bits of an opcode
> + * determine its group.
> + * The size table is encoded into a 32-bit value by subtracting each value
> + * from 16, resulting in a value of 1715488362
> + * (6 << 28 + 6 << 24 + 4 << 20 + 0 << 16 + 4 << 12 + 6 << 8 + 6 << 4 + 10).
> + * Command group 3 is reserved and should never be used.
> + */
> +#define COMMAND_SIZE(opcode) \
> + (16 - (15 & (1715488362 >> (4 * (((opcode) >> 5) & 7)))))

Why not:
	(6 + (15 & (0x446a6440u >> (4 * (((opcode) >> 5) & 7)))))

Specifying the constant in hex makes it more obvious.
It is a shame about the 16, but an offset is easier than the negate.
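
Either constant is easy to sanity-check against the original table with
a few lines of host-side C (throwaway self-test, not part of any patch):

	#include <assert.h>

	static const unsigned char tbl[8] = { 6, 10, 10, 12, 16, 12, 10, 10 };

	int main(void)
	{
		for (int g = 0; g < 8; g++) {
			assert(16 - (15 & (1715488362 >> (4 * g))) == tbl[g]);
			assert(6 + (15 & (0x446a6440u >> (4 * g))) == tbl[g]);
		}
		return 0;
	}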

David



[PATCH V5 4/5] scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity

2018-03-13 Thread Ming Lei
Now that 84676c1f21e8ff5 ("genirq/affinity: assign vectors to all
possible CPUs") has been merged into v4.16-rc, it is easy for some irq
vectors to be assigned only offline CPUs; this can't be avoided even
though the allocation is improved.

For example, on an 8-core VM where CPUs 4-7 are not present/offline and
virtio-scsi has 4 queues, the assigned irq affinity can take the
following shape:

irq 36, cpu list 0-7
irq 37, cpu list 0-7
irq 38, cpu list 0-7
irq 39, cpu list 0-1
irq 40, cpu list 4,6
irq 41, cpu list 2-3
irq 42, cpu list 5,7

An IO hang is then triggered in the non-SCSI_MQ case.

Given storage IO always follows a client/server model, there is no such
issue with SCSI_MQ (blk-mq), because no IO can be submitted to a hw
queue if that hw queue isn't mapped to any online CPU.
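
In blk-mq mode the virtqueue simply follows the hw queue that blk-mq
already picked for the submitting CPU, as virtscsi_pick_vq_mq() in this
series does:

	u32 tag = blk_mq_unique_tag(sc->request);
	u16 hwq = blk_mq_unique_tag_to_hwq(tag);

	return &vscsi->req_vqs[hwq];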

Fix this issue by forcing to use blk_mq.

BTW, I have been using virtio-scsi (scsi_mq) for several years, and it
has been quite stable, so this shouldn't add extra risk.

Cc: Omar Sandoval ,
Cc: "Martin K. Petersen" ,
Cc: James Bottomley ,
Cc: Christoph Hellwig ,
Cc: Don Brace 
Cc: Kashyap Desai 
Cc: Paolo Bonzini 
Cc: Mike Snitzer 
Cc: Laurence Oberman 
Reviewed-by: Hannes Reinecke 
Acked-by: Paolo Bonzini 
Reviewed-by: Christoph Hellwig 
Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
Signed-off-by: Ming Lei 
---
 drivers/scsi/virtio_scsi.c | 59 +++---
 1 file changed, 3 insertions(+), 56 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 7c28e8d4955a..54e3a0f6844c 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -91,9 +91,6 @@ struct virtio_scsi_vq {
 struct virtio_scsi_target_state {
seqcount_t tgt_seq;
 
-   /* Count of outstanding requests. */
-   atomic_t reqs;
-
/* Currently active virtqueue for requests sent to this target. */
struct virtio_scsi_vq *req_vq;
 };
@@ -152,8 +149,6 @@ static void virtscsi_complete_cmd(struct virtio_scsi 
*vscsi, void *buf)
struct virtio_scsi_cmd *cmd = buf;
struct scsi_cmnd *sc = cmd->sc;
struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
-   struct virtio_scsi_target_state *tgt =
-   scsi_target(sc->device)->hostdata;
 
dev_dbg(&sc->device->sdev_gendev,
"cmd %p response %u status %#02x sense_len %u\n",
@@ -210,8 +205,6 @@ static void virtscsi_complete_cmd(struct virtio_scsi 
*vscsi, void *buf)
}
 
sc->scsi_done(sc);
-
-   atomic_dec(&tgt->reqs);
 }
 
 static void virtscsi_vq_done(struct virtio_scsi *vscsi,
@@ -580,10 +573,7 @@ static int virtscsi_queuecommand_single(struct Scsi_Host 
*sh,
struct scsi_cmnd *sc)
 {
struct virtio_scsi *vscsi = shost_priv(sh);
-   struct virtio_scsi_target_state *tgt =
-   scsi_target(sc->device)->hostdata;
 
-   atomic_inc(&tgt->reqs);
return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
 }
 
@@ -596,55 +586,11 @@ static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct 
virtio_scsi *vscsi,
return &vscsi->req_vqs[hwq];
 }
 
-static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
-  struct virtio_scsi_target_state 
*tgt)
-{
-   struct virtio_scsi_vq *vq;
-   unsigned long flags;
-   u32 queue_num;
-
-   local_irq_save(flags);
-   if (atomic_inc_return(&tgt->reqs) > 1) {
-   unsigned long seq;
-
-   do {
-   seq = read_seqcount_begin(&tgt->tgt_seq);
-   vq = tgt->req_vq;
-   } while (read_seqcount_retry(&tgt->tgt_seq, seq));
-   } else {
-   /* no writes can be concurrent because of atomic_t */
-   write_seqcount_begin(&tgt->tgt_seq);
-
-   /* keep previous req_vq if a reader just arrived */
-   if (unlikely(atomic_read(&tgt->reqs) > 1)) {
-   vq = tgt->req_vq;
-   goto unlock;
-   }
-
-   queue_num = smp_processor_id();
-   while (unlikely(queue_num >= vscsi->num_queues))
-   queue_num -= vscsi->num_queues;
-   tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
- unlock:
-   write_seqcount_end(&tgt->tgt_seq);
-   }
-   local_irq_restore(flags);
-
-   return vq;
-}
-
 static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
   struct scsi_cmnd *sc)
 {
struct virtio_scsi *vscsi = shost_priv(sh);
-   struct virtio_scsi_target_state *tgt =
-   scsi_target(sc->device)->hostdata;
-   struct virtio_scsi_vq *req_vq;
-
-   if (shost_use_blk_mq(sh))
-   req_vq = virtscsi_pick_vq_mq(vscsi, sc);
-   else
-   req_vq = virtscsi_pick_vq(vscsi, tgt);

[PATCH V5 2/5] scsi: megaraid_sas: fix selection of reply queue

2018-03-13 Thread Ming Lei
Since 84676c1f21e8 ("genirq/affinity: assign vectors to all possible
CPUs"), one msix vector can be created without any online CPU mapped; a
command may then be queued to that vector and its completion never
signaled.

This patch sets up a mapping between CPU and reply queue according to
the irq affinity info retrieved by pci_irq_get_affinity(), and uses this
info to choose the reply queue when queuing a command.

The chosen reply queue is thus guaranteed to be active, which fixes the
IO hang caused by using an inactive reply queue that has no online CPU
mapped.
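
In sketch form, the map is built once from the irq affinity masks and
consulted per command (names as in the patch below; the fallback path is
omitted):

	/* init: record which reply queue serves each CPU */
	for (queue = 0; queue < instance->msix_vectors; queue++) {
		mask = pci_irq_get_affinity(instance->pdev, queue);
		for_each_cpu(cpu, mask)
			instance->reply_map[cpu] = queue;
	}

	/* per command: the submitting CPU indexes the map */
	queue = instance->reply_map[raw_smp_processor_id()];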

Cc: Hannes Reinecke 
Cc: "Martin K. Petersen" ,
Cc: James Bottomley ,
Cc: Christoph Hellwig ,
Cc: Don Brace 
Cc: Kashyap Desai 
Cc: Laurence Oberman 
Cc: Mike Snitzer 
Cc: Meelis Roos 
Cc: Artem Bityutskiy 
Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
Signed-off-by: Ming Lei 
---
 drivers/scsi/megaraid/megaraid_sas.h|  1 +
 drivers/scsi/megaraid/megaraid_sas_base.c   | 39 ++---
 drivers/scsi/megaraid/megaraid_sas_fusion.c | 12 +++--
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/scsi/megaraid/megaraid_sas.h 
b/drivers/scsi/megaraid/megaraid_sas.h
index ba6503f37756..27fab8235ea5 100644
--- a/drivers/scsi/megaraid/megaraid_sas.h
+++ b/drivers/scsi/megaraid/megaraid_sas.h
@@ -2128,6 +2128,7 @@ enum MR_PD_TYPE {
 
 struct megasas_instance {
 
+   unsigned int *reply_map;
__le32 *producer;
dma_addr_t producer_h;
__le32 *consumer;
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c 
b/drivers/scsi/megaraid/megaraid_sas_base.c
index a71ee67df084..dde0798b8a91 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -5165,6 +5165,26 @@ megasas_setup_jbod_map(struct megasas_instance *instance)
instance->use_seqnum_jbod_fp = false;
 }
 
+static void megasas_setup_reply_map(struct megasas_instance *instance)
+{
+   const struct cpumask *mask;
+   unsigned int queue, cpu;
+
+   for (queue = 0; queue < instance->msix_vectors; queue++) {
+   mask = pci_irq_get_affinity(instance->pdev, queue);
+   if (!mask)
+   goto fallback;
+
+   for_each_cpu(cpu, mask)
+   instance->reply_map[cpu] = queue;
+   }
+   return;
+
+fallback:
+   for_each_possible_cpu(cpu)
+   instance->reply_map[cpu] = cpu % instance->msix_vectors;
+}
+
 /**
  * megasas_init_fw -   Initializes the FW
  * @instance:  Adapter soft state
@@ -5343,6 +5363,8 @@ static int megasas_init_fw(struct megasas_instance 
*instance)
goto fail_setup_irqs;
}
 
+   megasas_setup_reply_map(instance);
+
dev_info(&instance->pdev->dev,
"firmware supports msix\t: (%d)", fw_msix_count);
dev_info(&instance->pdev->dev,
@@ -6123,20 +6145,29 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct 
megasas_instance *instance)
  */
 static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
 {
+   instance->reply_map = kzalloc(sizeof(unsigned int) * nr_cpu_ids,
+ GFP_KERNEL);
+   if (!instance->reply_map)
+   return -ENOMEM;
+
switch (instance->adapter_type) {
case MFI_SERIES:
if (megasas_alloc_mfi_ctrl_mem(instance))
-   return -ENOMEM;
+   goto fail;
break;
case VENTURA_SERIES:
case THUNDERBOLT_SERIES:
case INVADER_SERIES:
if (megasas_alloc_fusion_context(instance))
-   return -ENOMEM;
+   goto fail;
break;
}
 
return 0;
+ fail:
+   kfree(instance->reply_map);
+   instance->reply_map = NULL;
+   return -ENOMEM;
 }
 
 /*
@@ -6148,6 +6179,7 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance 
*instance)
  */
 static inline void megasas_free_ctrl_mem(struct megasas_instance *instance)
 {
+   kfree(instance->reply_map);
if (instance->adapter_type == MFI_SERIES) {
if (instance->producer)
pci_free_consistent(instance->pdev, sizeof(u32),
@@ -6540,7 +6572,6 @@ static int megasas_probe_one(struct pci_dev *pdev,
pci_free_irq_vectors(instance->pdev);
 fail_init_mfi:
scsi_host_put(host);
-
 fail_alloc_instance:
pci_disable_device(pdev);
 
@@ -6746,6 +6777,8 @@ megasas_resume(struct pci_dev *pdev)
if (rval < 0)
goto fail_reenable_msix;
 
+   megasas_setup_reply_map(instance);
+
if (instance->adapter_type != MFI_SERIES) {
megasas_reset_reply_desc(instance);
if (megasas_ioc_init_fusion(instance)) {
diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c 
b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index 073ced07e662..2994176a0121 100644
--- a/drivers/scsi/megarai

[PATCH V5 5/5] scsi: virtio_scsi: unify scsi_host_template

2018-03-13 Thread Ming Lei
Now that we always use blk_mq, both the single queue and multi queue
cases can be handled by one .queuecommand callback, so it is no longer
necessary to use two scsi_host_templates.

Suggested-by: Christoph Hellwig ,
Cc: Omar Sandoval ,
Cc: "Martin K. Petersen" ,
Cc: James Bottomley ,
Cc: Christoph Hellwig ,
Cc: Paolo Bonzini 
Cc: Mike Snitzer 
Cc: Laurence Oberman 
Cc: Hannes Reinecke 
Signed-off-by: Ming Lei 
---
 drivers/scsi/virtio_scsi.c | 74 ++
 1 file changed, 15 insertions(+), 59 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 54e3a0f6844c..45d04631888a 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -522,11 +522,20 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device 
*vdev,
 }
 #endif
 
-static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
-struct virtio_scsi_vq *req_vq,
+static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct virtio_scsi *vscsi,
+ struct scsi_cmnd *sc)
+{
+   u32 tag = blk_mq_unique_tag(sc->request);
+   u16 hwq = blk_mq_unique_tag_to_hwq(tag);
+
+   return &vscsi->req_vqs[hwq];
+}
+
+static int virtscsi_queuecommand(struct Scsi_Host *shost,
 struct scsi_cmnd *sc)
 {
-   struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
+   struct virtio_scsi *vscsi = shost_priv(shost);
+   struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc);
struct virtio_scsi_cmd *cmd = scsi_cmd_priv(sc);
unsigned long flags;
int req_size;
@@ -569,32 +578,6 @@ static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
return 0;
 }
 
-static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
-   struct scsi_cmnd *sc)
-{
-   struct virtio_scsi *vscsi = shost_priv(sh);
-
-   return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
-}
-
-static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct virtio_scsi *vscsi,
- struct scsi_cmnd *sc)
-{
-   u32 tag = blk_mq_unique_tag(sc->request);
-   u16 hwq = blk_mq_unique_tag_to_hwq(tag);
-
-   return &vscsi->req_vqs[hwq];
-}
-
-static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
-  struct scsi_cmnd *sc)
-{
-   struct virtio_scsi *vscsi = shost_priv(sh);
-   struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc);
-
-   return virtscsi_queuecommand(vscsi, req_vq, sc);
-}
-
 static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
 {
DECLARE_COMPLETION_ONSTACK(comp);
@@ -750,34 +733,13 @@ static enum blk_eh_timer_return 
virtscsi_eh_timed_out(struct scsi_cmnd *scmnd)
return BLK_EH_RESET_TIMER;
 }
 
-static struct scsi_host_template virtscsi_host_template_single = {
-   .module = THIS_MODULE,
-   .name = "Virtio SCSI HBA",
-   .proc_name = "virtio_scsi",
-   .this_id = -1,
-   .cmd_size = sizeof(struct virtio_scsi_cmd),
-   .queuecommand = virtscsi_queuecommand_single,
-   .change_queue_depth = virtscsi_change_queue_depth,
-   .eh_abort_handler = virtscsi_abort,
-   .eh_device_reset_handler = virtscsi_device_reset,
-   .eh_timed_out = virtscsi_eh_timed_out,
-   .slave_alloc = virtscsi_device_alloc,
-
-   .dma_boundary = UINT_MAX,
-   .use_clustering = ENABLE_CLUSTERING,
-   .target_alloc = virtscsi_target_alloc,
-   .target_destroy = virtscsi_target_destroy,
-   .track_queue_depth = 1,
-   .force_blk_mq = 1,
-};
-
-static struct scsi_host_template virtscsi_host_template_multi = {
+static struct scsi_host_template virtscsi_host_template = {
.module = THIS_MODULE,
.name = "Virtio SCSI HBA",
.proc_name = "virtio_scsi",
.this_id = -1,
.cmd_size = sizeof(struct virtio_scsi_cmd),
-   .queuecommand = virtscsi_queuecommand_multi,
+   .queuecommand = virtscsi_queuecommand,
.change_queue_depth = virtscsi_change_queue_depth,
.eh_abort_handler = virtscsi_abort,
.eh_device_reset_handler = virtscsi_device_reset,
@@ -883,7 +845,6 @@ static int virtscsi_probe(struct virtio_device *vdev)
u32 sg_elems, num_targets;
u32 cmd_per_lun;
u32 num_queues;
-   struct scsi_host_template *hostt;
 
if (!vdev->config->get) {
dev_err(&vdev->dev, "%s failure: config access disabled\n",
@@ -896,12 +857,7 @@ static int virtscsi_probe(struct virtio_device *vdev)
 
num_targets = virtscsi_config_get(vdev, max_target) + 1;
 
-   if (num_queues == 1)
-   hostt = &virtscsi_host_template_single;
-   else
-   hostt = &virtscsi_host_template_multi;
-
-   shost = scsi_host_alloc(hostt,
+   shost = scsi_host_alloc(&virtscsi_host_template,
sizeof(*vscsi) + sizeo

[PATCH V5 3/5] scsi: introduce force_blk_mq

2018-03-13 Thread Ming Lei
From the SCSI driver's view, it is a bit troublesome to support both
blk-mq and non-blk-mq at the same time, especially when drivers need to
support multiple hw queues.

This patch introduces 'force_blk_mq' in scsi_host_template so that
drivers can provide blk-mq-only support and avoid the trouble of
supporting both paths.
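
A driver opts in with a single flag in its host template, e.g. (sketch
with made-up driver names):

	static struct scsi_host_template my_hba_template = {
		.module		= THIS_MODULE,
		.name		= "my-blk-mq-only-hba",
		.queuecommand	= my_queuecommand,
		.force_blk_mq	= 1,	/* legacy request path not supported */
	};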

Cc: Omar Sandoval ,
Cc: "Martin K. Petersen" ,
Cc: James Bottomley ,
Cc: Christoph Hellwig ,
Cc: Don Brace 
Cc: Kashyap Desai 
Cc: Mike Snitzer 
Cc: Laurence Oberman 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 drivers/scsi/hosts.c | 1 +
 include/scsi/scsi_host.h | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 57bf43e34863..cbbc32df7595 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -477,6 +477,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template 
*sht, int privsize)
shost->dma_boundary = 0x;
 
shost->use_blk_mq = scsi_use_blk_mq;
+   shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq;
 
device_initialize(&shost->shost_gendev);
dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 1a1df0d21ee3..6c6366f0bd15 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -452,6 +452,9 @@ struct scsi_host_template {
/* True if the controller does not support WRITE SAME */
unsigned no_write_same:1;
 
+   /* True if the low-level driver supports blk-mq only */
+   unsigned force_blk_mq:1;
+
/*
 * Countdown for host blocking with no commands outstanding.
 */
-- 
2.9.5



[PATCH V5 1/5] scsi: hpsa: fix selection of reply queue

2018-03-13 Thread Ming Lei
Since 84676c1f21e8 ("genirq/affinity: assign vectors to all possible
CPUs"), one msix vector can be created without any online CPU mapped, so
a command's completion may never be notified.

This patch sets up a mapping between CPU and reply queue according to
the irq affinity info retrieved by pci_irq_get_affinity(), and uses this
mapping table to choose the reply queue when queuing a command.

The chosen reply queue is thus guaranteed to be active, which fixes the
IO hang caused by using an inactive reply queue that has no online CPU
mapped.

Cc: Hannes Reinecke 
Cc: "Martin K. Petersen" ,
Cc: James Bottomley ,
Cc: Christoph Hellwig ,
Cc: Don Brace 
Cc: Kashyap Desai 
Cc: Laurence Oberman 
Cc: Meelis Roos 
Cc: Artem Bityutskiy 
Cc: Mike Snitzer 
Tested-by: Laurence Oberman 
Tested-by: Don Brace 
Tested-by: Artem Bityutskiy 
Acked-by: Don Brace 
Fixes: 84676c1f21e8 ("genirq/affinity: assign vectors to all possible CPUs")
Signed-off-by: Ming Lei 
---
 drivers/scsi/hpsa.c | 73 +++--
 drivers/scsi/hpsa.h |  1 +
 2 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 5293e6827ce5..3a9eca163db8 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -1045,11 +1045,7 @@ static void set_performant_mode(struct ctlr_info *h, 
struct CommandList *c,
c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
if (unlikely(!h->msix_vectors))
return;
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   c->Header.ReplyQueue =
-   raw_smp_processor_id() % h->nreply_queues;
-   else
-   c->Header.ReplyQueue = reply_queue % h->nreply_queues;
+   c->Header.ReplyQueue = reply_queue;
}
 }
 
@@ -1063,10 +1059,7 @@ static void set_ioaccel1_performant_mode(struct 
ctlr_info *h,
 * Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->ReplyQueue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->ReplyQueue = reply_queue % h->nreply_queues;
+   cp->ReplyQueue = reply_queue;
/*
 * Set the bits in the address sent down to include:
 *  - performant mode bit (bit 0)
@@ -1087,10 +1080,7 @@ static void set_ioaccel2_tmf_performant_mode(struct 
ctlr_info *h,
/* Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->reply_queue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->reply_queue = reply_queue % h->nreply_queues;
+   cp->reply_queue = reply_queue;
/* Set the bits in the address sent down to include:
 *  - performant mode bit not used in ioaccel mode 2
 *  - pull count (bits 0-3)
@@ -1109,10 +1099,7 @@ static void set_ioaccel2_performant_mode(struct 
ctlr_info *h,
 * Tell the controller to post the reply to the queue for this
 * processor.  This seems to give the best I/O throughput.
 */
-   if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
-   cp->reply_queue = smp_processor_id() % h->nreply_queues;
-   else
-   cp->reply_queue = reply_queue % h->nreply_queues;
+   cp->reply_queue = reply_queue;
/*
 * Set the bits in the address sent down to include:
 *  - performant mode bit not used in ioaccel mode 2
@@ -1157,6 +1144,8 @@ static void __enqueue_cmd_and_start_io(struct ctlr_info 
*h,
 {
dial_down_lockup_detection_during_fw_flash(h, c);
atomic_inc(&h->commands_outstanding);
+
+   reply_queue = h->reply_map[raw_smp_processor_id()];
switch (c->cmd_type) {
case CMD_IOACCEL1:
set_ioaccel1_performant_mode(h, c, reply_queue);
@@ -7376,6 +7365,26 @@ static void hpsa_disable_interrupt_mode(struct ctlr_info 
*h)
h->msix_vectors = 0;
 }
 
+static void hpsa_setup_reply_map(struct ctlr_info *h)
+{
+   const struct cpumask *mask;
+   unsigned int queue, cpu;
+
+   for (queue = 0; queue < h->msix_vectors; queue++) {
+   mask = pci_irq_get_affinity(h->pdev, queue);
+   if (!mask)
+   goto fallback;
+
+   for_each_cpu(cpu, mask)
+   h->reply_map[cpu] = queue;
+   }
+   return;
+
+fallback:
+   for_each_possible_cpu(cpu)
+   h->reply_map[cpu] = 0;
+}
+
 /* If MSI/MSI-X is supported by the kernel we will try to enable it on
  * controllers that are capable. If not, we use legacy INTx mode.
  */
@@ -7771,6 +7780,10 @@ static int hpsa_pci_init(struct ctlr_info *h)
err = hpsa_interrupt_mode(h);
if

[PATCH V5 0/5] SCSI: fix selection of reply(hw) queue

2018-03-13 Thread Ming Lei
Hi All,

These patches fix reply queue (virt-queue on virtio-scsi) selection in
hpsa, megaraid_sas and virtio-scsi; this issue can easily cause IO
hangs.

This issue is triggered by 84676c1f21e8 ("genirq/affinity: assign vectors
to all possible CPUs"). After 84676c1f21e8, it is easy to see one msix
vector mapped only to offline CPUs. If the reply queue is selected from
all allocated msix vectors (reply queues) in a round-robin way, the
selected reply queue may not have any online CPU mapped, and an IO hang
results.

Both hpsa and megaraid_sas use a host-wide tagset, so we can't convert
the reply queues to blk_mq hw queues directly; otherwise IO performance
degrades badly, according to Kashyap's test. This patchset therefore
sets up a mapping table for selecting the reply queue, an approach
already used by mpt3sas.

For virtio-scsi, the virt-queue really is a hw queue from the blk-mq
point of view, so we introduce 'force_blk_mq' to fix this issue because:
1) virtio-blk has been used for years in blk-mq mode; 2) we have
discussed recently that scsi_mq will be enabled by default soon.


gitweb:
https://github.com/ming1/linux/tree/v4.16-rc-select-reply-queue-fix-V5

V5:
- cover legacy vector for megaraid_sas(2/5)
- patch style change (4/5)
- add one virtio-scsi cleanup patch(5/5)

V4:
- splitted from previous patchset
- handle virtio-scsi by force_blk_mq

Ming Lei (5):
  scsi: hpsa: fix selection of reply queue
  scsi: megaraid_sas: fix selection of reply queue
  scsi: introduce force_blk_mq
  scsi: virtio_scsi: fix IO hang caused by irq vector automatic affinity
  scsi: virtio_scsi: unify scsi_host_template

 drivers/scsi/hosts.c|   1 +
 drivers/scsi/hpsa.c |  73 
 drivers/scsi/hpsa.h |   1 +
 drivers/scsi/megaraid/megaraid_sas.h|   1 +
 drivers/scsi/megaraid/megaraid_sas_base.c   |  39 -
 drivers/scsi/megaraid/megaraid_sas_fusion.c |  12 +--
 drivers/scsi/virtio_scsi.c  | 129 
 include/scsi/scsi_host.h|   3 +
 8 files changed, 116 insertions(+), 143 deletions(-)

-- 
2.9.5



Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Rafael J. Wysocki
On Tue, Mar 13, 2018 at 9:39 AM, Artem Bityutskiy  wrote:
> On Tue, 2018-03-13 at 16:35 +0800, Ming Lei wrote:
>> Then looks this issue need to fix by making possible CPU count
>> accurate
>> because there are other resources allocated according to
>> num_possible_cpus(),
>> such as percpu variables.
>
> Short term the regression should be fixed. It is already v4.16-rc6, we
> have little time left.

Right.

> Longer term, yeah, I agree. Kernel's notion of possible CPU count
> should be realistic.

I agree.

Moreover, there are not too many systems where physical CPU hotplug
actually works in practice AFAICS, so IMO we should default to "no
physical CPU hotplug" and only change that default in special cases
(which may be hard to figure out, but that's a different matter).

What platform firmware tells us may be completely off.


Re: disk-io lockup in 4.14.13 kernel

2018-03-13 Thread Jaco Kroon
Hi Bart,

On 11/03/2018 07:00, Bart Van Assche wrote:
> On Sun, 2018-03-11 at 06:33 +0200, Jaco Kroon wrote:
>> crowsnest ~ # find /sys -name sdm
>> /sys/kernel/debug/block/sdm
>> /sys/devices/pci:00/:00:01.0/:01:00.0/host0/port-0:0/expander-0:0/port-0:0:0/expander-0:1/port-0:1:0/end_device-0:1:0/target0:0:13/0:0:13:0/block/sdm
>> /sys/class/block/sdm
>> /sys/block/sdm
>>
>>> lspci
>> crowsnest ~ # lspci
>> [ ... ]
>> 01:00.0 Serial Attached SCSI controller: LSI Logic / Symbios Logic
>> SAS3008 PCI-Express Fusion-MPT SAS-3 (rev 02)
>> [ ... ]
> Hi Jaco,
>
> Recently a bug fix for the mq-deadline scheduler was posted but I don't
> think that that patch will change the behavior on your setup since you are
> not using ZBC disks. See also "mq-deadline: Make sure to always unlock
> zones" (https://marc.info/?l=linux-block&m=151983933714492).
>From that link:

In case of a failed write request (all retries failed) and when using
libata, the SCSI error handler calls scsi_finish_command(). In the
case of blk-mq this means that scsi_mq_done() does not get called,
that blk_mq_complete_request() does not get called and also that the
mq-deadline .completed_request() method is not called. This results in
the target zone of the failed write request being left in a locked
state, preventing that any new write requests are issued to the same
zone.

Why do you say that this won't make a difference? To me it sounds like
it could very well relate? You're talking about "ZBC" disks. I'm going
to assume that the ZBC is Zoned Block ??? and reading up on it I get
really confused.

Either way, the source version onto which the patch applies is not
4.14.13 code (the patch references line 756 and the source in 4.14.13
only has 679 lines of code). I also can't find any kind of locking that
I can imagine causing a problem unless there are problems inside
__dd_dispatch_request, blk_mq_sched_try_merge or dd_insert_request (none
of which contains any loops that I can see at a quick glance, at least
down to elv_merge; from there it gets more complicated).

>
> Did I see correctly that /dev/sdm is behind a MPT SAS controller? You may
> want to contact the authors of this driver and Cc the linux-scsi mailing
> list. Sorry but I'm not familiar with the mpt3sas driver myself.
You did see correctly, all drives are behind the MPT SAS.  If there is
in fact a problem with the driver (or the controller itself for that
matter) it would explain things.  It would also explain why we don't see
this problem on other hosts.

I'll contact them as well.

Kind Regards,
Jaco


Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Rafael J. Wysocki
On Tue, Mar 13, 2018 at 4:11 AM, Dou Liyang  wrote:
> Hi Thomas,
>
> At 03/09/2018 11:08 PM, Thomas Gleixner wrote:
> [...]
>>
>>
>> I'm not sure if there is a clear indicator whether physcial hotplug is
>> supported or not, but the ACPI folks (x86) and architecture maintainers
>
> +cc Rafael
>
>> should be able to answer that question. I have a machine which says:
>>
>> smpboot: Allowing 128 CPUs, 96 hotplug CPUs
>>
>> There is definitely no way to hotplug anything on that machine and sure
>> the
>
>
> AFAIK, in ACPI based dynamic reconfiguration, there is no clear
> indicator. In theory, If the ACPI tables have the hotpluggable
> CPU resources, the OS can support physical hotplug.

In order for the ACPI-based CPU hotplug (I mean physical, not just the
software offline/online we do in the kernel) to work, there have to be
objects in the ACPI namespace corresponding to all of the processors
in question.

If they are not present, there is no way to signal insertion and eject
the processors safely.


[PATCH v2] block: bio_check_eod() needs to consider partition

2018-03-13 Thread Christoph Hellwig
bio_check_eod() should check the partition size, not the whole disk, if
bio->bi_partno is non-zero.  Do this by moving the call to
bio_check_eod() into blk_partition_remap().
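
Note the shape of the bounds test in the new bio_check_eod(): with
nr_sectors and maxsector both non-zero,

	nr_sectors > maxsector ||
	bio->bi_iter.bi_sector > maxsector - nr_sectors

rejects the same bios as the naive "sector + nr_sectors > maxsector"
would, while avoiding sector_t wrap-around when the sum would overflow.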

Based on an earlier patch from Jiufei Xue.

Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and 
partitions index")
Reported-by: Jiufei Xue 
Signed-off-by: Christoph Hellwig 
---
 block/blk-core.c | 99 
 1 file changed, 43 insertions(+), 56 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 6d82c4f7fadd..5a78f5537087 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2023,7 +2023,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
return BLK_QC_T_NONE;
 }
 
-static void handle_bad_sector(struct bio *bio)
+static void handle_bad_sector(struct bio *bio, sector_t maxsector)
 {
char b[BDEVNAME_SIZE];
 
@@ -2031,7 +2031,7 @@ static void handle_bad_sector(struct bio *bio)
printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
bio_devname(bio, b), bio->bi_opf,
(unsigned long long)bio_end_sector(bio),
-   (long long)get_capacity(bio->bi_disk));
+   (long long)maxsector);
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -2092,66 +2092,57 @@ static noinline int should_fail_bio(struct bio *bio)
 }
 ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
 
+/*
+ * Check whether this bio extends beyond the end of the device or partition.
+ * This may well happen - the kernel calls bread() without checking the size of
+ * the device, e.g., when mounting a file system.
+ */
+static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
+{
+   unsigned int nr_sectors = bio_sectors(bio);
+
+   if (nr_sectors && maxsector &&
+   (nr_sectors > maxsector ||
+bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
+   handle_bad_sector(bio, maxsector);
+   return -EIO;
+   }
+   return 0;
+}
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
 static inline int blk_partition_remap(struct bio *bio)
 {
struct hd_struct *p;
-   int ret = 0;
+   sector_t maxsector = 0;
 
rcu_read_lock();
p = __disk_get_part(bio->bi_disk, bio->bi_partno);
-   if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
-bio_check_ro(bio, p))) {
-   ret = -EIO;
-   goto out;
-   }
+   if (unlikely(!p))
+   goto out_unlock;
+   if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
+   goto out_unlock;
+   if (unlikely(bio_check_ro(bio, p)))
+   goto out_unlock;
 
/*
 * Zone reset does not include bi_size so bio_sectors() is always 0.
 * Include a test for the reset op code and perform the remap if needed.
 */
-   if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
-   goto out;
-
-   bio->bi_iter.bi_sector += p->start_sect;
-   bio->bi_partno = 0;
-   trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
- bio->bi_iter.bi_sector - p->start_sect);
-
-out:
-   rcu_read_unlock();
-   return ret;
-}
-
-/*
- * Check whether this bio extends beyond the end of the device.
- */
-static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
-{
-   sector_t maxsector;
-
-   if (!nr_sectors)
-   return 0;
-
-   /* Test device or partition size, when known. */
-   maxsector = get_capacity(bio->bi_disk);
-   if (maxsector) {
-   sector_t sector = bio->bi_iter.bi_sector;
-
-   if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
-   /*
-* This may well happen - the kernel calls bread()
-* without checking the size of the device, e.g., when
-* mounting a device.
-*/
-   handle_bad_sector(bio);
-   return 1;
-   }
+   if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) {
+   bio->bi_iter.bi_sector += p->start_sect;
+   bio->bi_partno = 0;
+   trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+ bio->bi_iter.bi_sector - p->start_sect);
+   maxsector = part_nr_sects_read(p);
}
-
-   return 0;
+   rcu_read_unlock();
+   return bio_check_eod(bio, maxsector);
+out_unlock:
+   rcu_read_unlock();
+   return -EIO;
 }
 
 static noinline_for_stack bool
@@ -2164,9 +2155,6 @@ generic_make_request_checks(struct bio *bio)
 
might_sleep();
 
-   if (bio_check_eod(bio, nr_sectors))
-   goto end_io;
-
q = bio->bi_disk->queue;
if (unlikely(!q)) {
 

Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Artem Bityutskiy
On Tue, 2018-03-13 at 16:35 +0800, Ming Lei wrote:
> Then it looks like this issue needs to be fixed by making the possible
> CPU count accurate, because other resources, such as percpu variables,
> are allocated according to num_possible_cpus().

Short term the regression should be fixed. It is already v4.16-rc6, we
have little time left.

Longer term, yeah, I agree. Kernel's notion of possible CPU count
should be realistic.

Artem.


Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Ming Lei
On Tue, Mar 13, 2018 at 09:38:41AM +0200, Artem Bityutskiy wrote:
> On Tue, 2018-03-13 at 11:11 +0800, Dou Liyang wrote:
> >  I also
> > met the situation that BIOS told to ACPI that it could support
> > physical
> > CPUs hotplug, But actually, there was no hardware slots in the
> > machine.
> > the ACPI tables like user inputs which should be validated when we
> > use.
> 
> This is exactly what happens on Skylake Xeon systems. When I check
> dmesg or this file:
> 
> /sys/devices/system/cpu/possible
> 
> on 2S (two socket) and 4S (four socket) systems, I see the same number
> 432.
> 
> This number comes from ACPI MADT. I will speculate (did not see myself)
> that 8S systems will report the same number as well, because of the
> Skylake-SP (Scalable Platform) architecture.
> 
> Number 432 is good for 8S systems, but it is way too large for 2S and
> 4S systems - 4x or 2x larger than the theoretical maximum.
> 
> I do not know why BIOSes have to report unrealistically high numbers, I
> am just sharing my observation.
> 
> So yes, Linux kernel's possible CPU count knowledge may be too large.
> If we use that number to evenly spread IRQ vectors among the CPUs, we
> end up with wasted vectors, and even bugs, as I observe on a 2S
> Skylake.

Then it looks like this issue needs to be fixed by making the possible
CPU count accurate, because other resources, such as percpu variables,
are allocated according to num_possible_cpus().

Thanks,
Ming


Re: [PATCH] block: bio_check_eod() needs to consider partition

2018-03-13 Thread Christoph Hellwig
On Thu, Mar 08, 2018 at 02:09:23PM -0700, Jens Axboe wrote:
> On Thu, Mar 08 2018, Christoph Hellwig wrote:
> > bio_check_eod() should check partiton size not the whole disk if
> > bio->bi_partno is non-zero.
> > 
> > Based on an earlier patch from Jiufei Xue.
> 
> This doesn't apply, what did you generate it against?

Linux 4.15.  I'll respin it against latests 4.16-rc.


Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-13 Thread Artem Bityutskiy
On Tue, 2018-03-13 at 11:11 +0800, Dou Liyang wrote:
>  I also
> met the situation that BIOS told to ACPI that it could support
> physical
> CPUs hotplug, But actually, there was no hardware slots in the
> machine.
> the ACPI tables like user inputs which should be validated when we
> use.

This is exactly what happens on Skylake Xeon systems. When I check
dmesg or this file:

/sys/devices/system/cpu/possible

on 2S (two socket) and 4S (four socket) systems, I see the same number
432.

This number comes from ACPI MADT. I will speculate (did not see myself)
that 8S systems will report the same number as well, because of the
Skylake-SP (Scalable Platform) architecture.

Number 432 is good for 8S systems, but it is way too large for 2S and
4S systems - 4x or 2x larger than the theoretical maximum.

I do not know why BIOSes have to report unrealistically high numbers, I
am just sharing my observation.

So yes, Linux kernel's possible CPU count knowledge may be too large.
If we use that number to evenly spread IRQ vectors among the CPUs, we
end up with wasted vectors, and even bugs, as I observe on a 2S
Skylake.

Artem.