On Tue, Jun 19, 2018 at 1:08 PM, Javier Gonzalez <jav...@cnexlabs.com> wrote:
>> On 16 Jun 2018, at 00.27, Igor Konopko <igor.j.kono...@intel.com> wrote:
>>
>> In the current pblk implementation, the l2p mapping for lines that
>> are not yet closed is stored only in OOB metadata and recovered
>> from it.
>>
>> Such a solution does not provide data integrity when the drive does
>> not have OOB metadata space.
>>
>> The goal of this patch is to add support for so-called packed
>> metadata, which stores the l2p mapping for open lines in the last
>> sector of every write unit.
>>
>> Signed-off-by: Igor Konopko <igor.j.kono...@intel.com>
>> ---
>> drivers/lightnvm/pblk-core.c     | 52 ++++++++++++++++++++++++++++++++++++----
>> drivers/lightnvm/pblk-init.c     | 37 ++++++++++++++++++++++++++--
>> drivers/lightnvm/pblk-rb.c       |  3 +++
>> drivers/lightnvm/pblk-recovery.c | 25 +++++++++++++++----
>> drivers/lightnvm/pblk-sysfs.c    |  7 ++++++
>> drivers/lightnvm/pblk-write.c    | 14 +++++++----
>> drivers/lightnvm/pblk.h          |  5 +++-
>> 7 files changed, 128 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
>> index c092ee93a18d..375c6430612e 100644
>> --- a/drivers/lightnvm/pblk-core.c
>> +++ b/drivers/lightnvm/pblk-core.c
>> @@ -340,7 +340,7 @@ void pblk_write_should_kick(struct pblk *pblk)
>> {
>>       unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb);
>>
>> -     if (secs_avail >= pblk->min_write_pgs)
>> +     if (secs_avail >= pblk->min_write_pgs_data)
>>               pblk_write_kick(pblk);
>> }
>>
>> @@ -371,7 +371,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
>>       struct pblk_line_meta *lm = &pblk->lm;
>>       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
>>       struct list_head *move_list = NULL;
>> -     int vsc = le32_to_cpu(*line->vsc);
>> +     int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data)
>> +                     * (pblk->min_write_pgs - pblk->min_write_pgs_data);
>> +     int vsc = le32_to_cpu(*line->vsc) + packed_meta;
>>
>>       lockdep_assert_held(&line->lock);
>>
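
To make the valid-sector accounting above concrete (numbers mine, not
from the patch): every min_write_pgs_data data sectors on a line imply
(min_write_pgs - min_write_pgs_data) packed-metadata sectors that
occupy space without being counted in vsc. With min_write_pgs = 8 and
min_write_pgs_data = 7, a line with vsc = 700 gets
packed_meta = (700 / 7) * (8 - 7) = 100, so GC list selection sees 800
occupied sectors.
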
>> @@ -540,12 +542,15 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
>> }
>>
>> int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
>> -                unsigned long secs_to_flush)
>> +                unsigned long secs_to_flush, bool skip_meta)
>> {
>>       int max = pblk->sec_per_write;
>>       int min = pblk->min_write_pgs;
>>       int secs_to_sync = 0;
>>
>> +     if (skip_meta)
>> +             min = max = pblk->min_write_pgs_data;
>> +
>>       if (secs_avail >= max)
>>               secs_to_sync = max;
>>       else if (secs_avail >= min)
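
(With skip_meta set, both bounds collapse to min_write_pgs_data, e.g. 7
instead of 8 sectors in the example above, so the writer consumes
exactly one write unit worth of user data and leaves room for the
metadata page.)
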
>> @@ -663,7 +668,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
>> next_rq:
>>       memset(&rqd, 0, sizeof(struct nvm_rq));
>>
>> -     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
>> +     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
>>       rq_len = rq_ppas * geo->csecs;
>>
>>       bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
>> @@ -2091,3 +2096,42 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
>>       }
>>       spin_unlock(&pblk->trans_lock);
>> }
>> +
>> +void pblk_set_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
>> +{
>> +     void *meta_list = rqd->meta_list;
>> +     void *page;
>> +     int i = 0;
>> +
>> +     if (pblk_is_oob_meta_supported(pblk))
>> +             return;
>> +
>> +     /* Mark the lba of the packed meta page itself as empty */
>> +     pblk_get_meta_at(pblk, meta_list, rqd->nr_ppas - 1)->lba = ADDR_EMPTY;
>> +
>> +     page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
>> +     /* We need to fill the last page of the request (packed
>> +      * metadata) with data from the OOB meta buffer.
>> +      */
>> +     for (; i < rqd->nr_ppas; i++)
>> +             memcpy(page + (i * sizeof(struct pblk_sec_meta)),
>> +                     pblk_get_meta_at(pblk, meta_list, i),
>> +                     sizeof(struct pblk_sec_meta));
>> +}
>> +
>> +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
>> +{
>> +     void *meta_list = rqd->meta_list;
>> +     void *page;
>> +     int i = 0;
>> +
>> +     if (pblk_is_oob_meta_supported(pblk))
>> +             return;
>> +
>> +     page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
>> +     /* Fill the OOB meta buffer with data from the packed metadata */
>> +     for (; i < rqd->nr_ppas; i++)
>> +             memcpy(pblk_get_meta_at(pblk, meta_list, i),
>> +                     page + (i * sizeof(struct pblk_sec_meta)),
>> +                     sizeof(struct pblk_sec_meta));
>> +}
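
The two helpers are exact mirrors of each other. Here is a minimal
user-space sketch of the copy pattern, with struct pblk_sec_meta reduced
to just the lba field and an 8-sector write unit assumed (illustrative
only, not the kernel code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NR_PPAS 8			/* assumed write unit: 8 sectors */

struct sec_meta {			/* stand-in for struct pblk_sec_meta */
	uint64_t lba;
};

/* Mirrors pblk_set_packed_meta(): pack the per-sector metadata into the
 * last data page of the request before submission.
 */
static void set_packed_meta(struct sec_meta *meta, char *last_page, int nr)
{
	meta[nr - 1].lba = UINT64_MAX;	/* the meta sector itself: ADDR_EMPTY */
	for (int i = 0; i < nr; i++)
		memcpy(last_page + i * sizeof(*meta), &meta[i], sizeof(*meta));
}

/* Mirrors pblk_get_packed_meta(): rebuild the OOB-style meta buffer
 * from the last page after a read.
 */
static void get_packed_meta(struct sec_meta *meta, const char *last_page,
			    int nr)
{
	for (int i = 0; i < nr; i++)
		memcpy(&meta[i], last_page + i * sizeof(*meta), sizeof(*meta));
}

int main(void)
{
	struct sec_meta in[NR_PPAS] = { {10}, {11}, {12}, {13}, {14}, {15}, {16} };
	struct sec_meta out[NR_PPAS];
	char page[4096] = { 0 };

	set_packed_meta(in, page, NR_PPAS);
	get_packed_meta(out, page, NR_PPAS);
	printf("lba[0]=%llu lba[6]=%llu last=%llu\n",
	       (unsigned long long)out[0].lba,
	       (unsigned long long)out[6].lba,
	       (unsigned long long)out[NR_PPAS - 1].lba);
	return 0;
}
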
>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>> index f05112230a52..5eb641da46ed 100644
>> --- a/drivers/lightnvm/pblk-init.c
>> +++ b/drivers/lightnvm/pblk-init.c
>> @@ -372,8 +372,40 @@ static int pblk_core_init(struct pblk *pblk)
>>       pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
>>       max_write_ppas = pblk->min_write_pgs * geo->all_luns;
>>       pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
>> +     pblk->min_write_pgs_data = pblk->min_write_pgs;
>>       pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
>>
>> +     if (!pblk_is_oob_meta_supported(pblk)) {
>> +             /* For drives which do not have the OOB metadata feature,
>> +              * we need to use so-called packed metadata in order to
>> +              * support recovery. Packed metadata stores the same
>> +              * information as OOB metadata (the l2p table mapping),
>> +              * but in the form of a single page at the end of every
>> +              * write request.
>> +              */
>> +             if (pblk->min_write_pgs
>> +                     * sizeof(struct pblk_sec_meta) > PAGE_SIZE) {
>> +                     /* We want to keep all the packed metadata in a
>> +                      * single page per write request, so we need to
>> +                      * ensure that it will fit.
>> +                      *
>> +                      * This is more of a sanity check, since no
>> +                      * device has such a big minimal write size
>> +                      * (above 1 megabyte).
>> +                      */
>> +                     pr_err("pblk: unsupported min write size\n");
>> +                     return -EINVAL;
>> +             }
>> +             /* For the packed metadata approach we apply a
>> +              * simplification: on the read path we always issue
>> +              * requests whose size equals max_write_pgs, with all
>> +              * pages filled with user payload except the last page,
>> +              * which is filled with packed metadata.
>> +              */
>> +             pblk->max_write_pgs = pblk->min_write_pgs;
>> +             pblk->min_write_pgs_data = pblk->min_write_pgs - 1;
>> +     }
>> +
>>       if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
>>               pr_err("pblk: vector list too big(%u > %u)\n",
>>                               pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
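
For reference on the sanity check above: with the usual
sizeof(struct pblk_sec_meta) == 16 and PAGE_SIZE == 4096 (my
assumption), the limit trips at min_write_pgs > 256, i.e. a minimal
write size above 256 * 4KB = 1MB, which no device comes close to. And
with ws_opt = 8 this gives min_write_pgs = max_write_pgs = 8 and
min_write_pgs_data = 7, so 1/8 of both bandwidth and capacity goes to
metadata.
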
>> @@ -668,7 +700,7 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
>>       struct pblk_line_meta *lm = &pblk->lm;
>>       struct nvm_geo *geo = &dev->geo;
>>       sector_t provisioned;
>> -     int sec_meta, blk_meta;
>> +     int sec_meta, blk_meta, clba;
>>
>>       if (geo->op == NVM_TARGET_DEFAULT_OP)
>>               pblk->op = PBLK_DEFAULT_OP;
>> @@ -691,7 +723,8 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
>>       sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
>>       blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
>>
>> -     pblk->capacity = (provisioned - blk_meta) * geo->clba;
>> +     clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data;
>> +     pblk->capacity = (provisioned - blk_meta) * clba;
>>
>>       atomic_set(&pblk->rl.free_blocks, nr_free_blks);
>>       atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
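
Quick sanity check on the capacity math (geometry assumed, not taken
from the patch): with geo->clba = 4096 and the 8/7 split above,
clba = (4096 / 8) * 7 = 3584, so the exported capacity shrinks by
exactly the 1/8 reserved for packed metadata.
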
>> diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
>> index a81a97e8ea6d..081e73e7978f 100644
>> --- a/drivers/lightnvm/pblk-rb.c
>> +++ b/drivers/lightnvm/pblk-rb.c
>> @@ -528,6 +528,9 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
>>               to_read = count;
>>       }
>>
>> +     /* Add space for packed metadata if in use */
>> +     pad += (pblk->min_write_pgs - pblk->min_write_pgs_data);
>> +
>>       c_ctx->sentry = pos;
>>       c_ctx->nr_valid = to_read;
>>       c_ctx->nr_padded = pad;
>> diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
>> index f5853fc77a0c..0fab18fe30d9 100644
>> --- a/drivers/lightnvm/pblk-recovery.c
>> +++ b/drivers/lightnvm/pblk-recovery.c
>> @@ -138,7 +138,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
>> next_read_rq:
>>       memset(rqd, 0, pblk_g_rq_size);
>>
>> -     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
>> +     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
>>       if (!rq_ppas)
>>               rq_ppas = pblk->min_write_pgs;
>>       rq_len = rq_ppas * geo->csecs;
>> @@ -198,6 +198,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
>>               return -EINTR;
>>       }
>>
>> +     pblk_get_packed_meta(pblk, rqd);
>>       for (i = 0; i < rqd->nr_ppas; i++) {
>>               u64 lba = le64_to_cpu(pblk_get_meta_at(pblk,
>>                                                       meta_list, i)->lba);
>> @@ -272,7 +273,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
>>       kref_init(&pad_rq->ref);
>>
>> next_pad_rq:
>> -     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
>> +     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
>>       if (rq_ppas < pblk->min_write_pgs) {
>>               pr_err("pblk: corrupted pad line %d\n", line->id);
>>               goto fail_free_pad;
>> @@ -418,7 +419,7 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
>> next_rq:
>>       memset(rqd, 0, pblk_g_rq_size);
>>
>> -     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
>> +     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
>>       if (!rq_ppas)
>>               rq_ppas = pblk->min_write_pgs;
>>       rq_len = rq_ppas * geo->csecs;
>> @@ -475,6 +476,7 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
>>        */
>>       if (!rec_round++ && !rqd->error) {
>>               rec_round = 0;
>> +             pblk_get_packed_meta(pblk, rqd);
>>               for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) {
>>                       u64 lba = le64_to_cpu(pblk_get_meta_at(pblk,
>>                                                       meta_list, i)->lba);
>> @@ -492,6 +494,12 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
>>               int ret;
>>
>>               bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
>> +             if (!pblk_is_oob_meta_supported(pblk) && bit > 0) {
>> +                     /* This case should not happen since we always
>> +                      * read in the same unit here as we wrote in the
>> +                      * writer thread.
>> +                      */
>> +                     pr_err("pblk: Inconsistent packed metadata read\n");
>> +             }
>>               nr_error_bits = rqd->nr_ppas - bit;
>>
>>               /* Roll back failed sectors */
>> @@ -550,7 +558,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
>> next_rq:
>>       memset(rqd, 0, pblk_g_rq_size);
>>
>> -     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
>> +     rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
>>       if (!rq_ppas)
>>               rq_ppas = pblk->min_write_pgs;
>>       rq_len = rq_ppas * geo->csecs;
>> @@ -608,6 +616,14 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
>>               int nr_error_bits, bit;
>>
>>               bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
>> +             if (!pblk_is_oob_meta_supported(pblk)) {
>> +                     /* For packed metadata we do not handle partially
>> +                      * written requests here, since the metadata is
>> +                      * always in the last page of the request.
>> +                      */
>> +                     bit = 0;
>> +                     *done = 0;
>> +             }
>>               nr_error_bits = rqd->nr_ppas - bit;
>>
>>               /* Roll back failed sectors */
>> @@ -622,6 +638,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
>>                       *done = 0;
>>       }
>>
>> +     pblk_get_packed_meta(pblk, rqd);
>>       for (i = 0; i < rqd->nr_ppas; i++) {
>>               u64 lba = le64_to_cpu(pblk_get_meta_at(pblk,
>>                                                       meta_list, i)->lba);
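
Worth spelling out the failure semantics here, as I read them: since
pblk_get_packed_meta() sources all the lbas from the last page of the
request, a read error on that single page costs the mapping for the
whole write unit; recovery then rolls back the entire request rather
than individual sectors, which ties into the point below about
unreadable metadata pages.
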
>> diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
>> index b0e5e93a9d5f..aa7b4164ce9e 100644
>> --- a/drivers/lightnvm/pblk-sysfs.c
>> +++ b/drivers/lightnvm/pblk-sysfs.c
>> @@ -473,6 +473,13 @@ static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk,
>>       if (kstrtouint(page, 0, &sec_per_write))
>>               return -EINVAL;
>>
>> +     if (!pblk_is_oob_meta_supported(pblk)) {
>> +             /* In the packed metadata case it is not
>> +              * allowed to change sec_per_write.
>> +              */
>> +             return -EINVAL;
>> +     }
>> +
>>       if (sec_per_write < pblk->min_write_pgs
>>                               || sec_per_write > pblk->max_write_pgs
>>                               || sec_per_write % pblk->min_write_pgs != 0)
>> diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
>> index 6552db35f916..bb45c7e6c375 100644
>> --- a/drivers/lightnvm/pblk-write.c
>> +++ b/drivers/lightnvm/pblk-write.c
>> @@ -354,7 +354,7 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
>> {
>>       int secs_to_sync;
>>
>> -     secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
>> +     secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true);
>>
>> #ifdef CONFIG_NVM_PBLK_DEBUG
>>       if ((!secs_to_sync && secs_to_flush)
>> @@ -522,6 +522,11 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
>>               return NVM_IO_ERR;
>>       }
>>
>> +     /* This is the first place where the write request is mapped,
>> +      * so we can fill the packed metadata with l2p mappings.
>> +      */
>> +     pblk_set_packed_meta(pblk, rqd);
>> +
>>       meta_line = pblk_should_submit_meta_io(pblk, rqd);
>>
>>       /* Submit data write for current data line */
>> @@ -572,7 +577,7 @@ static int pblk_submit_write(struct pblk *pblk)
>>       struct bio *bio;
>>       struct nvm_rq *rqd;
>>       unsigned int secs_avail, secs_to_sync, secs_to_com;
>> -     unsigned int secs_to_flush;
>> +     unsigned int secs_to_flush, packed_meta_pgs;
>>       unsigned long pos;
>>       unsigned int resubmit;
>>
>> @@ -608,7 +613,7 @@ static int pblk_submit_write(struct pblk *pblk)
>>                       return 1;
>>
>>               secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
>> -             if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
>> +             if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data)
>>                       return 1;
>>
>>               secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
>> @@ -623,7 +628,8 @@ static int pblk_submit_write(struct pblk *pblk)
>>               pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
>>       }
>>
>> -     bio = bio_alloc(GFP_KERNEL, secs_to_sync);
>> +     packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data);
>> +     bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs);
>>
>>       bio->bi_iter.bi_sector = 0; /* internal bio */
>>       bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
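
Note the asymmetry this relies on: pblk_calc_secs_to_sync() passes
skip_meta = true, so secs_to_sync counts data sectors only, and the bio
is allocated with packed_meta_pgs (1 in the packed-metadata case, 0
otherwise) extra pages for the metadata page that
pblk_set_packed_meta() fills in later.
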
>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
>> index 4c61ede5b207..c95ecd8bcf79 100644
>> --- a/drivers/lightnvm/pblk.h
>> +++ b/drivers/lightnvm/pblk.h
>> @@ -605,6 +605,7 @@ struct pblk {
>>       int state;                      /* pblk line state */
>>
>>       int min_write_pgs; /* Minimum amount of pages required by controller */
>> +     int min_write_pgs_data; /* Minimum amount of payload pages */
>>       int max_write_pgs; /* Maximum amount of pages supported by controller */
>>
>>       sector_t capacity; /* Device capacity when bad blocks are subtracted */
>> @@ -798,7 +799,7 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
>> u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
>> u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
>> int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
>> -                unsigned long secs_to_flush);
>> +                unsigned long secs_to_flush, bool skip_meta);
>> void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
>> void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
>>                 unsigned long *lun_bitmap);
>> @@ -823,6 +824,8 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
>>                         u64 *lba_list, int nr_secs);
>> void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
>>                        sector_t blba, int nr_secs);
>> +void pblk_set_packed_meta(struct pblk *pblk, struct nvm_rq *rqd);
>> +void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd);
>>
>> /*
>>  * pblk user I/O write path
>> --
>> 2.14.3
>
> The functionality is good. A couple of comments though:
>   - Have you considered the case when the device reports ws_min = ws_opt
>     = 4096? Maybe checks preventing this case would be a good idea.
>   - Have you checked for any conflicts in the write recovery path? You
>     would need to deal with more corner cases (e.g., a write failure in
>     the metadata page would always require re-sending all the other
>     sectors).
>   - There are also corner cases on scan recovery: what if the metadata
>     page cannot be read? In this case, data loss (at a drive level) will
>     happen, but you will need to guarantee that reading garbage will not
>     corrupt other data. In the OOB area case this is not a problem
>     because data and metadata are always good/not good simultaneously.

What I understood from Igor is that, for example, for 256KB writes (as
WS_MIN), the last 4KB would be metadata. In that case, the write unit
either succeeds or fails as a whole. This may also address your second
point.
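
To put numbers on it (assuming 4KB sectors and a 16-byte
struct pblk_sec_meta): WS_MIN = 64 sectors = 256KB gives 63 sectors of
user data plus one metadata sector per write unit, and the
64 * 16 = 1KB of lba entries fit easily in that 4KB page. Since the
unit is written as a whole (and read back the same way during
recovery), the data and its packed metadata either both hit the media
or neither does.
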

>   - I think it would be good to update the WA counters, since there is
>     a significant write and space amplification in using 1/ws_opt of
>     the pages for metadata.
>
> Javier
