On Thu, Dec 14, 2017 at 4:26 PM, Vishal Verma <[email protected]> wrote:
>
> Due to a spec misinterpretation, the Linux implementation of the BTT log
> area had different padding scheme from other implementations, such as
> UEFI and NVML.
>
> This fixes the padding scheme, and defaults to it for new BTT layouts.
> We attempt to detect the padding scheme in use when probing for an
> existing BTT. If we detect the older/incompatible scheme, we continue
> using it.
>
> Reported-by: Juston Li <[email protected]>
> Cc: Dan Williams <[email protected]>

Fixes: 5212e11fde4d ("nd_btt: atomic sector updates")
Cc: <[email protected]>

> Signed-off-by: Vishal Verma <[email protected]>
> ---
>  drivers/nvdimm/btt.c | 194 
> ++++++++++++++++++++++++++++++++++++++++++---------
>  drivers/nvdimm/btt.h |  45 +++++++++++-
>  2 files changed, 204 insertions(+), 35 deletions(-)

Looks good, some minor comments below:

>
> diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
> index e949e3302af4..1b8692dd6ca3 100644
> --- a/drivers/nvdimm/btt.c
> +++ b/drivers/nvdimm/btt.c
> @@ -211,12 +211,12 @@ static int btt_map_read(struct arena_info *arena, u32 
> lba, u32 *mapping,
>         return ret;
>  }
>
> -static int btt_log_read_pair(struct arena_info *arena, u32 lane,
> -                       struct log_entry *ent)
> +static int btt_log_group_read(struct arena_info *arena, u32 lane,
> +                       struct log_group *log)
>  {
>         return arena_read_bytes(arena,
> -                       arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
> -                       2 * LOG_ENT_SIZE, 0);
> +                       arena->logoff + (lane * LOG_GRP_SIZE), log,
> +                       LOG_GRP_SIZE, 0);
>  }
>
>  static struct dentry *debugfs_root;
> @@ -256,6 +256,8 @@ static void arena_debugfs_init(struct arena_info *a, 
> struct dentry *parent,
>         debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
>         debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
>         debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
> +       debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
> +       debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
>  }
>
>  static void btt_debugfs_init(struct btt *btt)
> @@ -283,7 +285,7 @@ static void btt_debugfs_init(struct btt *btt)
>   *
>   * TODO The logic feels a bit kludge-y. make it better..
>   */
> -static int btt_log_get_old(struct log_entry *ent)
> +static int btt_log_get_old(struct arena_info *a, struct log_group *log)
>  {
>         int old;
>
> @@ -292,23 +294,27 @@ static int btt_log_get_old(struct log_entry *ent)
>          * the next time, the following logic works out to put this
>          * (next) entry into [1]
>          */
> -       if (ent[0].seq == 0) {
> -               ent[0].seq = cpu_to_le32(1);
> +       if (log->ent[a->log_index[0]].seq == 0) {
> +               log->ent[a->log_index[0]].seq = cpu_to_le32(1);
>                 return 0;
>         }
>
> -       if (ent[0].seq == ent[1].seq)
> +       if (log->ent[a->log_index[0]].seq == log->ent[a->log_index[1]].seq)
>                 return -EINVAL;
> -       if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5)
> +       if (le32_to_cpu(log->ent[a->log_index[0]].seq) +
> +                       le32_to_cpu(log->ent[a->log_index[1]].seq) > 5)

This is getting dense, can we introduce a little helper like:

    u32 log_seq(struct log_group *log, int seq_idx)

...so this becomes a bit more readable:

    if (log_seq(log, seq0) + log_seq(log, seq1) > 5)

...where seq0 and and seq1 are initialized once. That way we don't
need to keep doing the pointer chasing through the arena.

>                 return -EINVAL;
>
> -       if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) {
> -               if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1)
> +       if (le32_to_cpu(log->ent[a->log_index[0]].seq) <
> +                       le32_to_cpu(log->ent[a->log_index[1]].seq)) {
> +               if (le32_to_cpu(log->ent[a->log_index[1]].seq) -
> +                               le32_to_cpu(log->ent[a->log_index[0]].seq) == 
> 1)
>                         old = 0;
>                 else
>                         old = 1;
>         } else {
> -               if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1)
> +               if (le32_to_cpu(log->ent[a->log_index[0]].seq) -
> +                               le32_to_cpu(log->ent[a->log_index[1]].seq) == 
> 1)
>                         old = 1;
>                 else
>                         old = 0;
> @@ -328,17 +334,18 @@ static int btt_log_read(struct arena_info *arena, u32 
> lane,
>  {
>         int ret;
>         int old_ent, ret_ent;
> -       struct log_entry log[2];
> +       struct log_group log;
>
> -       ret = btt_log_read_pair(arena, lane, log);
> +       ret = btt_log_group_read(arena, lane, &log);
>         if (ret)
>                 return -EIO;
>
> -       old_ent = btt_log_get_old(log);
> +       old_ent = btt_log_get_old(arena, &log);
>         if (old_ent < 0 || old_ent > 1) {
>                 dev_err(to_dev(arena),
>                                 "log corruption (%d): lane %d seq [%d, %d]\n",
> -                       old_ent, lane, log[0].seq, log[1].seq);
> +                               old_ent, lane, 
> log.ent[arena->log_index[0]].seq,
> +                               log.ent[arena->log_index[1]].seq);
>                 /* TODO set error state? */
>                 return -EIO;
>         }
> @@ -346,7 +353,7 @@ static int btt_log_read(struct arena_info *arena, u32 
> lane,
>         ret_ent = (old_flag ? old_ent : (1 - old_ent));
>
>         if (ent != NULL)
> -               memcpy(ent, &log[ret_ent], LOG_ENT_SIZE);
> +               memcpy(ent, &log.ent[arena->log_index[ret_ent]], 
> LOG_ENT_SIZE);
>
>         return ret_ent;
>  }
> @@ -360,17 +367,13 @@ static int __btt_log_write(struct arena_info *arena, 
> u32 lane,
>                         u32 sub, struct log_entry *ent, unsigned long flags)
>  {
>         int ret;
> -       /*
> -        * Ignore the padding in log_entry for calculating log_half.
> -        * The entry is 'committed' when we write the sequence number,
> -        * and we want to ensure that that is the last thing written.
> -        * We don't bother writing the padding as that would be extra
> -        * media wear and write amplification
> -        */
> -       unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2;
> -       u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE);
> +       u32 group_slot = arena->log_index[sub];
> +       unsigned int log_half = LOG_ENT_SIZE / 2;
>         void *src = ent;
> +       u64 ns_off;
>
> +       ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
> +               (group_slot * LOG_ENT_SIZE);
>         /* split the 16B write into atomic, durable halves */
>         ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
>         if (ret)
> @@ -453,7 +456,7 @@ static int btt_log_init(struct arena_info *arena)
>  {
>         size_t logsize = arena->info2off - arena->logoff;
>         size_t chunk_size = SZ_4K, offset = 0;
> -       struct log_entry log;
> +       struct log_entry ent;
>         void *zerobuf;
>         int ret;
>         u32 i;
> @@ -485,11 +488,11 @@ static int btt_log_init(struct arena_info *arena)
>         }
>
>         for (i = 0; i < arena->nfree; i++) {
> -               log.lba = cpu_to_le32(i);
> -               log.old_map = cpu_to_le32(arena->external_nlba + i);
> -               log.new_map = cpu_to_le32(arena->external_nlba + i);
> -               log.seq = cpu_to_le32(LOG_SEQ_INIT);
> -               ret = __btt_log_write(arena, i, 0, &log, 0);
> +               ent.lba = cpu_to_le32(i);
> +               ent.old_map = cpu_to_le32(arena->external_nlba + i);
> +               ent.new_map = cpu_to_le32(arena->external_nlba + i);
> +               ent.seq = cpu_to_le32(LOG_SEQ_INIT);
> +               ret = __btt_log_write(arena, i, 0, &ent, 0);
>                 if (ret)
>                         goto free;
>         }
> @@ -594,6 +597,119 @@ static int btt_freelist_init(struct arena_info *arena)
>         return 0;
>  }
>
> +static bool ent_is_padding(struct log_entry *ent)
> +{
> +       return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
> +               && (ent->seq == 0);
> +}
> +
> +/*
> + * Detecting valid log indices: We read a log group (see the comments in 
> btt.h
> + * for a description of a 'log_group' and its 'slots'), and iterate over its
> + * four slots. We expect that a padding slot will be all-zeroes, and use this
> + * to detect a padding slot vs. an actual entry.
> + *
> + * If a log_group is in the initial state, i.e. hasn't been used since the
> + * creation of this BTT layout, it will have three of the four slots with
> + * zeroes. We skip over these log_groups for the detection of log_index. If
> + * all log_groups are in the initial state (i.e. the BTT has never been
> + * written to), it is safe to assume the 'new format' of log entries in slots
> + * (0, 1).
> + */
> +static int log_set_indices(struct arena_info *arena)
> +{
> +       bool idx_set = false, initial_state = true;
> +       int ret, log_index[2] = {-1, -1};
> +       u32 i, j, next_idx = 0;
> +       struct log_group log;
> +       u32 pad_count = 0;
> +
> +       for (i = 0; i < arena->nfree; i++) {
> +               ret = btt_log_group_read(arena, i, &log);
> +               if (ret < 0)
> +                       return ret;
> +
> +               for (j = 0; j < 4; j++) {
> +                       if (!idx_set) {
> +                               if (ent_is_padding(&log.ent[j])) {
> +                                       pad_count++;
> +                                       continue;
> +                               } else {
> +                                       /* Skip if index has been recorded */
> +                                       if ((next_idx == 1) &&
> +                                               (j == log_index[0]))
> +                                               continue;
> +                                       /* valid entry, record index */
> +                                       log_index[next_idx] = j;
> +                                       next_idx++;
> +                               }
> +                               if (next_idx == 2) {
> +                                       /* two valid entries found */
> +                                       idx_set = true;
> +                               } else if (next_idx > 2) {
> +                                       /* too many valid indices */
> +                                       return -ENXIO;
> +                               }
> +                       } else {
> +                               /*
> +                                * once the indices have been set, just verify
> +                                * that all subsequent log groups are either 
> in
> +                                * their initial state or follow the same
> +                                * indices.
> +                                */
> +                               if (j == log_index[0]) {
> +                                       /* entry must be 'valid' */
> +                                       if (ent_is_padding(&log.ent[j]))
> +                                               return -ENXIO;
> +                               } else if (j == log_index[1]) {
> +                                       ;
> +                                       /*
> +                                        * log_index[1] can be padding if the
> +                                        * lane never got used and it is still
> +                                        * in the initial state (three 
> 'padding'
> +                                        * entries)
> +                                        */
> +                               } else {
> +                                       /* entry must be invalid (padding) */
> +                                       if (!ent_is_padding(&log.ent[j]))
> +                                               return -ENXIO;
> +                               }
> +                       }
> +               }
> +               /*
> +                * If any of the log_groups have more than one valid,
> +                * non-padding entry, then the we are no longer in the
> +                * initial_state
> +                */
> +               if (pad_count < 3)
> +                       initial_state = false;
> +               pad_count = 0;
> +       }
> +
> +       if (!initial_state && !idx_set)
> +               return -ENXIO;
> +
> +       /*
> +        * If all the entries in the log were in the initial state,
> +        * assume new padding scheme
> +        */
> +       if (initial_state)
> +               log_index[1] = 1;
> +
> +       /*
> +        * Only allow the known permutations of log/padding indices,
> +        * i.e. (0, 1), and (0, 2)
> +        */
> +       if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 
> 2)))
> +               ; /* known index possibilities */
> +       else
> +               return -ENXIO;

I think this 'else' case deserves a dev_err() print.

> +
> +       arena->log_index[0] = log_index[0];
> +       arena->log_index[1] = log_index[1];

Let's add a dev_dbg() to report what scheme the kernel picked to
supplement what you have in debugfs.

> +       return 0;
> +}
> +
>  static int btt_rtt_init(struct arena_info *arena)
>  {
>         arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
> @@ -650,8 +766,7 @@ static struct arena_info *alloc_arena(struct btt *btt, 
> size_t size,
>         available -= 2 * BTT_PG_SIZE;
>
>         /* The log takes a fixed amount of space based on nfree */
> -       logsize = roundup(2 * arena->nfree * sizeof(struct log_entry),
> -                               BTT_PG_SIZE);
> +       logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE);
>         available -= logsize;
>
>         /* Calculate optimal split between map and data area */
> @@ -668,6 +783,10 @@ static struct arena_info *alloc_arena(struct btt *btt, 
> size_t size,
>         arena->mapoff = arena->dataoff + datasize;
>         arena->logoff = arena->mapoff + mapsize;
>         arena->info2off = arena->logoff + logsize;
> +
> +       /* Default log indices are (0,1) */
> +       arena->log_index[0] = 0;
> +       arena->log_index[1] = 1;
>         return arena;
>  }
>
> @@ -758,6 +877,13 @@ static int discover_arenas(struct btt *btt)
>                 arena->external_lba_start = cur_nlba;
>                 parse_arena_meta(arena, super, cur_off);
>
> +               ret = log_set_indices(arena);
> +               if (ret) {
> +                       dev_err(to_dev(arena),
> +                               "Unable to deduce log/padding indices\n");
> +                       goto out;
> +               }
> +
>                 mutex_init(&arena->err_lock);
>                 ret = btt_freelist_init(arena);
>                 if (ret)
> diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
> index 884fbbbdd18a..be1031c5b1ba 100644
> --- a/drivers/nvdimm/btt.h
> +++ b/drivers/nvdimm/btt.h
> @@ -27,6 +27,7 @@
>  #define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
>  #define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
>  #define MAP_ENT_NORMAL 0xC0000000
> +#define LOG_GRP_SIZE sizeof(struct log_group)
>  #define LOG_ENT_SIZE sizeof(struct log_entry)
>  #define ARENA_MIN_SIZE (1UL << 24)     /* 16 MB */
>  #define ARENA_MAX_SIZE (1ULL << 39)    /* 512 GB */
> @@ -50,12 +51,52 @@ enum btt_init_state {
>         INIT_READY
>  };
>
> +/*
> + * A log group represents one log 'lane', and consists of four log entries.
> + * Two of the four entries are valid entries, and the remaining two are
> + * padding. Due to an old bug in the padding location, we need to perform a
> + * test to determine the padding scheme being used, and use that scheme
> + * thereafter.
> + *
> + * An old format 'log group' would have actual log entries at indices (0, 2)
> + * and padding at indices (1, 3), where as the correct/updated format has
> + * log entries at indices (0, 1) and padding at indices (2, 3).

Let's qualify "old" and say "kernels prior to 4.15".

> + *
> + * Old (incompatible) format:
> + * +-----------------+-----------------+
> + * |      ent[0]     |      ent[1]     |
> + * |       16B       |       16B       |
> + * | lba/old/new/seq |       pad       |
> + * +-----------------------------------+
> + * |      ent[2]     |      ent[3]     |
> + * |       16B       |       16B       |
> + * | lba/old/new/seq |       pad       |
> + * +-----------------+-----------------+
> + *
> + * New format:
> + * +-----------------+-----------------+
> + * |      ent[0]     |      ent[1]     |
> + * |       16B       |       16B       |
> + * | lba/old/new/seq | lba/old/new/seq |
> + * +-----------------------------------+
> + * |      ent[2]     |      ent[3]     |
> + * |       16B       |       16B       |
> + * |       pad       |       pad       |
> + * +-----------------+-----------------+
> + *
> + * We detect during start-up which format is in use, and set
> + * arena->log_index[(0, 1)] with the detected format.
> + */
> +
>  struct log_entry {
>         __le32 lba;
>         __le32 old_map;
>         __le32 new_map;
>         __le32 seq;
> -       __le64 padding[2];
> +};
> +
> +struct log_group {
> +       struct log_entry ent[4];
>  };
>
>  struct btt_sb {
> @@ -126,6 +167,7 @@ struct aligned_lock {
>   * @debugfs_dir:       Debugfs dentry
>   * @flags:             Arena flags - may signify error states.
>   * @err_lock:          Mutex for synchronizing error clearing.
> + * @log_index:         Indices of the valid log entries in a log_group
>   *
>   * arena_info is a per-arena handle. Once an arena is narrowed down for an
>   * IO, this struct is passed around for the duration of the IO.
> @@ -158,6 +200,7 @@ struct arena_info {
>         /* Arena flags */
>         u32 flags;
>         struct mutex err_lock;
> +       int log_index[2];
>  };
>
>  /**
> --
> 2.14.3
>
_______________________________________________
Linux-nvdimm mailing list
[email protected]
https://lists.01.org/mailman/listinfo/linux-nvdimm

Reply via email to