Dima,

I agree with general approach of this patch, but there are some (easy-to-fix) issues. See, please, inline comments below...

On 06/20/2016 11:58 AM, Dmitry Monakhov wrote:
barrier code is broken in many ways:
Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly.
But request also can goes though ->dio_submit_alloc()->dio_submit_pad and 
write_page (for indexes)
So in case of grow_dev we have following sequance:

E_RELOC_DATA_READ:
              ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
               ->delta->allocate
                  ->io->submit_allloc: dio_submit_alloc
                    ->dio_submit_pad
E_DATA_WBI : data written, time to update index
               ->delta->allocate_complete:ploop_index_update
                 ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
                 ->write_page
                 ->ploop_map_wb_complete
                   ->ploop_wb_complete_post_process
                     ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
E_RELOC_NULLIFY:

                ->submit()

BUG#2: currecntly kaio write_page silently ignores REQ_FUA

Sorry, I can't agree, it actually does not ignore:

static void
kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
         struct page * page, sector_t sec, int fua)
{
    /* No FUA in kaio, convert it to fsync */
    if (fua)
        set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);


BUG#3: io_direct:dio_submit  if fua_delay is not possible we MUST tag all bios 
via REQ_FUA
        not just latest one.

No need to tag *all*. See inline comments below.

This patch unify barrier handling like follows:
- Get rid of FORCE_{FLUSH,FUA}
- Introduce DELAYED_FLUSH, currecntly it supported only by io_direct
- fix up fua handling for dio_submit

This makes reloc sequence optimal:
io_direct
RELOC_S: R1, W2, WBI:FLUSH|FUA
RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA
io_kaio
RELOC_S: R1, W2:FUA, WBI:FUA
RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA

https://jira.sw.ru/browse/PSBM-47107
Signed-off-by: Dmitry Monakhov <[email protected]>
---
  drivers/block/ploop/dev.c       |  8 +++++---
  drivers/block/ploop/io_direct.c | 29 +++++++++-----------------
  drivers/block/ploop/io_kaio.c   | 17 ++++++++++------
  drivers/block/ploop/map.c       | 45 ++++++++++++++++++++++-------------------
  include/linux/ploop/ploop.h     |  8 ++++----
  5 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 96f7850..fbc5f2f 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct ploop_request * 
preq)
__TRACE("Z %p %u\n", preq, preq->req_cluster); + if (!preq->error) {
+               WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state));
+       }
        while (preq->bl.head) {
                struct bio * bio = preq->bl.head;
                preq->bl.head = bio->bi_next;
@@ -2530,9 +2533,8 @@ restart:
                top_delta = ploop_top_delta(plo);
                sbl.head = sbl.tail = preq->aux_bio;
- /* Relocated data write required sync before BAT updatee */
-               set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
+               /* Relocated data write required sync before BAT updatee
+                * this will happen inside index_update */
                if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
                        preq->eng_state = PLOOP_E_DATA_WBI;
                        plo->st.bio_out++;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index a6d83fe..d7ecd4a 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -90,21 +90,12 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
        trace_submit(preq);
preflush = !!(rw & REQ_FLUSH);
-
-       if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
-               preflush = 1;
-
-       if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
-               postfua = 1;
-
-       if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
-
+       postfua = !!(rw & REQ_FUA);
+       if (ploop_req_delay_fua_possible(rw, preq)) {
                /* Mark req that delayed flush required */
-               set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
-       } else if (rw & REQ_FUA) {
-               postfua = 1;
+               set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+               postfua = 0;
        }

"postfua" is a horrible name, let us see if we can get rid of it completely. Also, the way how ploop_req_delay_fua_possible implemented is prone to errors (see below an issue in kaio_complete_io_state). Let's rework it like this:

static inline bool ploop_req_delay_fua_possible(struct ploop_request *preq)
{
    return preq->eng_state == PLOOP_E_DATA_WBI;
}

Then, that chunk in the dio_submit above might look as:

    /* If we can delay, mark req that delayed flush required */
    if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq))
        set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);



-
        rw &= ~(REQ_FLUSH | REQ_FUA);
@@ -238,14 +229,15 @@ flush_bio:
                        rw2 |= REQ_FLUSH;
                        preflush = 0;
                }
-               if (unlikely(postfua && !bl.head))
-                       rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0));
+               /* Very unlikely, but correct.
+                * TODO: Optimize postfua via DELAY_FLUSH for any req state */
+               if (unlikely(!postfua))
+                       rw2 |= REQ_FUA;

If test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state), do nothing here -- that's obvious. So let's assume !test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state)...

There is a call to bio_add_page in dio_submit before that loop. The moment bio_add_page returned zero (i.e. OK), we know for sure which bio it came from: it is bw.cur. Then it's enough to mark new bio like this:

    bio->bi_rw |= bw.cur & REQ_FUA;

Then, instead of "submit_bio(rw2, b);", use "submit_bio(rw2 | b->bi_rw, b);".

This way we'll mark with FUA only those of new bio-s that has at least some data (pages) from original bio-s with FUA. Makes sense?

Btw, dio_io_page() must be fixed similarly (get rid of postfua, mark with FUA only what we really need).

ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
                submit_bio(rw2, b);
                bio_num++;
        }
-

I liked this empty line, please keep it! ;)

        ploop_complete_io_request(preq);
        return;
@@ -1520,15 +1512,14 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq, static void
  dio_write_page(struct ploop_io * io, struct ploop_request * preq,
-              struct page * page, sector_t sec, int fua)
+              struct page * page, sector_t sec, unsigned long rw)
  {
        if (!(io->files.file->f_mode & FMODE_WRITE)) {
                PLOOP_FAIL_REQUEST(preq, -EBADF);
                return;
        }
- dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC,
-                   preq, page, sec);
+       dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
  }
static int
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index de26319..0b93e13 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -72,13 +72,17 @@ static void kaio_complete_io_state(struct ploop_request * 
preq)
        }
/* Convert requested fua to fsync */
-       if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
+       if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state) ||

io_kaio has nothing to do with DELAYED_FLUSH (only dio sets it). Hence, no need to check it here.

            test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
                post_fsync = 1;
if (!post_fsync &&
-           !ploop_req_delay_fua_possible(preq->req_rw, preq) &&
-           (preq->req_rw & REQ_FUA))
+           !ploop_req_delay_fua_possible(preq->req_rw, preq))
+               post_fsync = 1;
+       /* Reloc requires FLUSH_FUA from ->index_update for delayed_fua
+          which is not supporded by ->kaio_write_page */
+       if (test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
+           test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state))
                post_fsync = 1;

I'm sorry, this looks like a mess. You can't freely clear this bit, and RELOC_A missed. Can we replace all this lengthy code:

    /* Convert requested fua to fsync */
    if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state) ||
        test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
        post_fsync = 1;

    if (!post_fsync &&
        !ploop_req_delay_fua_possible(preq->req_rw, preq))
        post_fsync = 1;
    /* Reloc requires FLUSH_FUA from ->index_update for delayed_fua
       which is not supporded by ->kaio_write_page */
    if (test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
        test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state))
        post_fsync = 1;

with something more brief like this:

    bool fua = preq->req_rw & REQ_FUA;
    unsigned long state = READ_ONCE(preq->state);
    bool reloc = state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL);

    if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state) ||
        (reloc && ploop_req_delay_fua_possible(preq)) ||
        (fua && !ploop_req_delay_fua_possible(preq)))
        post_fsync = 1;

Note, how it fixes a bug: if !(preq->req_rw & REQ_FUA), delay_fua_possible(rw, preq) returned zero anyway. Hence we did post_sync=1 even though the request has nothing to do with FUA!


preq->req_rw &= ~REQ_FUA;
@@ -608,12 +612,13 @@ kaio_read_page(struct ploop_io * io, struct ploop_request 
* preq,
static void
  kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
-                struct page * page, sector_t sec, int fua)
+                struct page * page, sector_t sec, unsigned long rw)
  {
        ploop_prepare_tracker(preq, sec);
-
+       /* This is async call , preflush is not supported */
+       BUG_ON(rw & REQ_FLUSH);
        /* No FUA in kaio, convert it to fsync */
-       if (fua)
+       if (rw & REQ_FUA)
                set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 3a6365d..de9bb32 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq)
        struct ploop_device * plo = preq->plo;
        struct map_node * m = preq->map;
        struct ploop_delta * top_delta = map_top_delta(m->parent);
+       unsigned long rw = preq->req_rw & REQ_FUA;
+       unsigned long state = READ_ONCE(preq->state);
        u32 idx;
        map_index_t blk;
        int old_level;
@@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq)
        __TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
        plo->st.map_single_writes++;
        top_delta->ops->map_index(top_delta, m->mn_start, &sec);
-       /* Relocate requires consistent writes, mark such reqs appropriately */
-       if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-           test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-               set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
-       top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
-                                     !!(preq->req_rw & REQ_FUA));
+       /* Relocate requires consistent index update */
+       if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+               rw |= REQ_FUA;
+       if (state & PLOOP_REQ_DELAYED_FLUSH_FL)
+               rw |= REQ_FLUSH;
+       if (rw)
+               clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);

Can you please move clear_bit upward, like this:

    if (state & PLOOP_REQ_DELAYED_FLUSH_FL) {
        rw |= REQ_FLUSH;
        clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
    }

+       top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | 
WRITE);
        put_page(page);
        return;
@@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct ploop_map *map,
         * (see dio_submit()). So fsync of EXT4 image doesnt help us.
         * We need to force sync of nullified blocks.
         */
-       set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+       preq->req_rw |= REQ_FUA;
        top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
                                  &sbl, preq->iblock, 1<<plo->cluster_log);
  }
@@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int 
err)
        struct ploop_delta * top_delta = map_top_delta(m->parent);
        struct list_head * cursor, * tmp;
        struct ploop_request * main_preq;
+       unsigned long rw =  0;
        struct page * page = NULL;
        int delayed = 0;
        unsigned int idx;
        sector_t sec;
-       int fua, force_fua;
+       int fua;
/* First, complete processing of written back indices,
         * finally instantiate indices in mapping cache.
@@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int 
err)
main_preq = NULL;
        fua = 0;
-       force_fua = 0;
list_for_each_safe(cursor, tmp, &m->io_queue) {
                struct ploop_request * preq;
+               unsigned long state;
preq = list_entry(cursor, struct ploop_request, list); @@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int err)
                                spin_unlock_irq(&plo->lock);
                                break;
                        }
-
-                       if (preq->req_rw & REQ_FUA)
-                               fua = 1;
-
-                       if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-                           test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-                               force_fua = 1;
+                       state = READ_ONCE(preq->state);
+                       /* Relocate requires consistent index update */
+                       rw |= preq->req_rw & REQ_FUA;
+                       if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+                               rw |= REQ_FUA;
+                       if (state & PLOOP_REQ_DELAYED_FLUSH_FL)
+                               rw |= REQ_FLUSH;
+                       if (rw)
+                               clear_bit(PLOOP_REQ_DELAYED_FLUSH, 
&preq->state);

The same as above -- easier to read if clear_bit is inside DELAYED_FLUSH "if".

Thanks,
Maxim

preq->eng_state = PLOOP_E_INDEX_WB;
                        get_page(page);
@@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int err)
        plo->st.map_multi_writes++;
        top_delta->ops->map_index(top_delta, m->mn_start, &sec);
- if (force_fua)
-               set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state);
-
-       top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, 
fua);
+       top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw 
| WRITE);
        put_page(page);
  }
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 0fba25e..2f26824 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -157,7 +157,7 @@ struct ploop_io_ops
        void    (*read_page)(struct ploop_io * io, struct ploop_request * preq,
                             struct page * page, sector_t sec);
        void    (*write_page)(struct ploop_io * io, struct ploop_request * preq,
-                             struct page * page, sector_t sec, int fua);
+                             struct page * page, sector_t sec, unsigned long 
rw);
int (*sync_read)(struct ploop_io * io, struct page * page,
@@ -465,8 +465,7 @@ enum
        PLOOP_REQ_ZERO,
        PLOOP_REQ_DISCARD,
        PLOOP_REQ_RSYNC,
-       PLOOP_REQ_FORCE_FUA,    /*force fua of req write I/O by engine */
-       PLOOP_REQ_FORCE_FLUSH,  /*force flush by engine */
+       PLOOP_REQ_DELAYED_FLUSH,/* REQ_FLUSH is delayed */
        PLOOP_REQ_KAIO_FSYNC,   /*force image fsync by KAIO module */
        PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
        PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
@@ -477,6 +476,7 @@ enum
  #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)
  #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
  #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
+#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH)
  #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
  #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
@@ -614,7 +614,7 @@ static inline int ploop_req_delay_fua_possible(unsigned long rw,
         * fua.
         */
        if (rw & REQ_FUA) {
-               if (preq->eng_state != PLOOP_E_COMPLETE)
+               if (preq->eng_state == PLOOP_E_DATA_WBI)
                        delay_fua = 1;
        }
        return delay_fua;

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to