On 9/16/25 00:18, Andrey Zhadchenko wrote:


On 9/15/25 17:00, Konstantin Khorenko wrote:
On 08.09.2025 19:27, Andrey Zhadchenko wrote:
dm-qcow2 is almost ready to merge intermediate images, thanks
to separated metadata tables and qio being per-qcow2.
So this patch mostly propagates which qcow2 we want to use
instead of tgt->top.

qcow2_merge_backward_complete is a bit harder due to in-middle
replacement and needing to recalculate img_id for a part of
image chain.

https://virtuozzo.atlassian.net/browse/VSTOR-101375
Signed-off-by: Andrey Zhadchenko <andrey.zhadche...@virtuozzo.com>
---
  drivers/md/dm-qcow2-cmd.c | 82 ++++++++++++++++++++++++++++-----------
  drivers/md/dm-qcow2.h     |  1 +
  2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c
index e77052ad61e6b..d59fa82dbffbc 100644
--- a/drivers/md/dm-qcow2-cmd.c
+++ b/drivers/md/dm-qcow2-cmd.c
@@ -116,9 +116,9 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2,
  }
  ALLOW_ERROR_INJECTION(qcow2_service_iter, ERRNO);
-static int qcow2_merge_common(struct qcow2_target *tgt)
+static int qcow2_merge_common(struct qcow2_target *tgt, struct qcow2 *qcow2)
  {
-    struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower;
+    struct qcow2 *lower = qcow2->lower;
      u32 clu_size = qcow2->clu_size;
      loff_t end = lower->hdr.size;
@@ -139,9 +139,8 @@ static int qcow2_merge_forward(struct qcow2_target *tgt)
  }
  ALLOW_ERROR_INJECTION(qcow2_merge_forward, ERRNO);
-static int qcow2_break_l1cow(struct qcow2_target *tgt)
+static int qcow2_break_l1cow(struct qcow2_target *tgt, struct qcow2 *qcow2)
  {
-    struct qcow2 *qcow2 = tgt->top;
      loff_t end = qcow2->hdr.size;
      loff_t step = (u64)qcow2->l2_entries * qcow2->clu_size;
@@ -152,25 +151,35 @@ static int qcow2_break_l1cow(struct qcow2_target *tgt)
  static void set_backward_merge_in_process(struct qcow2_target *tgt,
                       struct qcow2 *qcow2, bool set)
  {
+    struct qcow2 *top = tgt->top;
      LIST_HEAD(list);
+    /*
+     * There are no writes if it is not the top qcow2 image.
+     * so we do not need to stop and flush requests when setting
+     */
+    if (qcow2 != top && set) {
+        qcow2->backward_merge_in_process = set;
+        return;
+    }
+
      /*
       * To avoid race between allocations and COWS
       * we completely stop queueing qios and wait
       * for pending qios. Lock is for visability.
       */
-    spin_lock_irq(&qcow2->deferred_lock);
-    qcow2->pause_submitting_qios = true;
-    spin_unlock_irq(&qcow2->deferred_lock);
+    spin_lock_irq(&top->deferred_lock);
+    top->pause_submitting_qios = true;
+    spin_unlock_irq(&top->deferred_lock);
      qcow2_inflight_ref_switch(tgt);
      /* queue is stopped */
-    spin_lock_irq(&qcow2->deferred_lock);
+    spin_lock_irq(&top->deferred_lock);
      WARN_ON_ONCE(qcow2->backward_merge_in_process == set);
      qcow2->backward_merge_in_process = set;
-    qcow2->pause_submitting_qios = false;
-    list_splice_init(&qcow2->paused_qios, &list);
-    spin_unlock_irq(&qcow2->deferred_lock);
+    top->pause_submitting_qios = false;
+    list_splice_init(&top->paused_qios, &list);
+    spin_unlock_irq(&top->deferred_lock);
      qcow2_submit_embedded_qios(tgt, &list);
  }
@@ -240,11 +249,15 @@ static int qcow2_merge_backward_progress(struct qcow2_target *tgt,
  static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd);
-static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd)
+static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd, int depth)
  {
-    struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower;
+    struct qcow2 *qcow2 = tgt->top, *lower;
      int ret;
+    while (depth-- > 0 && qcow2->lower)
+        qcow2 = qcow2->lower;
+    lower = qcow2->lower;
+
      lockdep_assert_held(&tgt->ctl_mutex);
      if (!lower)
@@ -263,6 +276,7 @@ static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd)
      if (ret)
          return ret;
+    tgt->backward_merge.qcow2 = qcow2;
      tgt->backward_merge.state = BACKWARD_MERGE_START;
      __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_START);
      tgt->backward_merge.error = 0;
@@ -291,7 +305,7 @@ void qcow2_merge_backward_work(struct work_struct *work)
      __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_BREAK_L1COW);
      mutex_unlock(&tgt->ctl_mutex);
-    qcow2 = tgt->top;
+    qcow2 = tgt->backward_merge.qcow2;
      lower = qcow2->lower;
      /*
@@ -300,7 +314,7 @@ void qcow2_merge_backward_work(struct work_struct *work)
       * we'd have to freeze IO going to all data clusters
       * under every L1 entry related to several snapshots.
       */
-    ret = qcow2_break_l1cow(tgt);
+    ret = qcow2_break_l1cow(tgt, qcow2);
      if (ret) {
          QC_ERR(tgt->ti, "Can't break L1 COW");
          goto out_err;
@@ -316,7 +330,7 @@ void qcow2_merge_backward_work(struct work_struct *work)
      /* Start merge */
      backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_RUNNING);
-    ret = qcow2_merge_common(tgt);
+    ret = qcow2_merge_common(tgt, qcow2);
      if (ret) {
          set_backward_merge_in_process(tgt, qcow2, false);
          ret2 = qcow2_set_image_file_features(lower, false);
@@ -357,9 +371,29 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt)
      if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION)
          return -EBUSY;
+
+    if (tgt->top == tgt->backward_merge.qcow2) {
+        tgt->top = lower;
+    } else {
+        struct qcow2 *top = tgt->top;
+
+        while (top && top->lower != tgt->backward_merge.qcow2)
+            top = top->lower;

This algorithm can be simplified: we really only need to search for the place in the delta list once on "start", save it, and use it directly later.

To do that we need to save a double pointer to where qcow2 pointer is to tgt->backward_merge.pqcow2 (e.g.: struct qcow2 **).

on start:

tgt->backward_merge.pqcow2 = &qcow2;

on complete:

*tgt->backward_merge.pqcow2 = (*tgt->backward_merge.pqcow2)->lower;

And we can still get qcow2 and lower like:

qcow2 = *tgt->backward_merge.pqcow2;
lower = qcow2->lower;

There is the img_id change loop though, which will still be needed, but at least we don't have two loops.

Note: By the way we might be able to replace qcow2->img_id on each delta with just one dm_target->depth per target, as we only use img_id from top delta to find the depth AFAICS (or search for (depth-n)-th delta from top).

+        if (!top)
+            return -EFAULT;

We should not get to EFAULT as this list is only updated by us ("complete") and qcow2_ctr->qcow2_alloc_delta. This way we can have no error here and move BACKWARD_MERGE_STAGE_COMPLETING stage setting above this code, indicating that we are in the process of completion.

+
+        qcow2 = tgt->backward_merge.qcow2;
+        lower = qcow2->lower;
+        top->lower = lower;
+
+        top = tgt->top;
+        while (top != lower) {
+            top->img_id--;
+            top = top->lower;
+        }

Maybe check here (in the while loop) if top == NULL and return an error if yes?

Why? We did some checks earlier. And also, if we want to return an error from here, we would need to increment the img_ids back and revert the top->lower change. I would rather not do that.

As we've already checked that we have all the needed deltas at the desired depth as early as on backward merge start, a check here is not needed, agreed.

But we can have BUG_ON here, to be extra careful in case delta list was corrupted or some other unexpected thing happened.



+    }
+
      __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_COMPLETING);
-
-    tgt->top = lower;
      smp_wmb(); /* Pairs with qcow2_ref_inc() */
      qcow2_inflight_ref_switch(tgt); /* Pending qios */
      qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */
@@ -396,7 +430,7 @@ void qcow2_merge_backward_cancel(struct qcow2_target *tgt)
      } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) {
          flush = true;
      } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) {
-        set_backward_merge_in_process(tgt, tgt->top, false);
-        set_backward_merge_in_process(tgt, tgt->top, false);
+        set_backward_merge_in_process(tgt, tgt->backward_merge.qcow2, false);
          tgt->backward_merge.state = BACKWARD_MERGE_STOPPED;
          __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE);
      }
@@ -572,7 +606,7 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv,
      struct qcow2_target *tgt = to_qcow2_target(ti);
      int ret = -EPERM;
      u32 val, val2;
-    int efd;
+    int efd, depth = 0;
      if (!capable(CAP_SYS_ADMIN))
          goto out;
@@ -652,11 +686,15 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv,
      } else if (!strcmp(argv[0], "merge_backward")) {
          /* argc >= 2 */
          if (!strcmp(argv[1], "start")) {
-            if (argc != 3 || kstrtoint(argv[2], 10, &efd) || efd < 0) {
+            if (argc < 3 || argc > 4 || kstrtoint(argv[2], 10, &efd) || efd < 0) {
                  ret = -EINVAL;
                  goto out_unlock;
              }
-            ret = qcow2_merge_backward_start(tgt, efd);
+            if (argc == 4 && kstrtou32(argv[3], 10, &depth)) {
+                ret = -EINVAL;
+                goto out_unlock;
+            }
+            ret = qcow2_merge_backward_start(tgt, efd, depth);
          } else if (!strcmp(argv[1], "complete")) {
              if (argc != 2) {
                  ret = -EINVAL;
diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h
index 5aa00c6a5ebd5..c2b2193561461 100644
--- a/drivers/md/dm-qcow2.h
+++ b/drivers/md/dm-qcow2.h
@@ -172,6 +172,7 @@ enum qcow2_backward_merge_stage {
  struct qcow2_backward_merge {
      struct work_struct work;
+    struct qcow2 *qcow2;
      enum qcow2_backward_merge_state state;
      int error;
      struct eventfd_ctx *eventfd_ctx;



--
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to