On 10/22/25 4:46 PM, Alexey Kuznetsov wrote:

Hello!

If it really does what you described, it is all right.

But it does not look like it does. From what I see it really calls
kvmalloc with GFP_ATOMIC.  No?
GFP_ATOMIC of high order is death to the system, literally.
We want to keep performance and not reschedule in case the node has plenty of ram, so I abstained from placing a concrete limit to allocation in atomic context.

Why not to check order when on interrupt to go straight to workqueue
instead of all the trickery?

Moreover, IMHO block layer does _not_ use GFP_ATOMIC at all, unless it
is backed by mempool,
where it is not actually GFP_ATOMIC, but the thing which I described before.
As I see dm ploop/qcow is the only device which abuses GFP_ATOMIC. Does not
this smell fishy? :-)

Also, the last version added a formal technical error:

if (flags & GFP_ATOMIC)

GFP_ATOMIC is not a flag, it is combo of many flags, which can in theory
intersect wth everything
I replaced the flag check with gfpflags_allow_blocking.

On Wed, Oct 22, 2025 at 9:41 PM Pavel Tikhomirov
<[email protected]> wrote:


On 10/22/25 20:38, Alexey Kuznetsov wrote:
Hello!

Beware, it used GFP_ATOMIC. Does not this mean t his code can be
executed in interrupt context?
If so, then kvmalloc is a strict no.
The idea was if we require high order allocation (when we create bvec
from rq) in interrupt (I guess it is ploop_clone_and_map() path) instead
of just failing, as it was before this patch, we put pio to a "to be
handled later" list (see ploop_prepare_one_embedded_pio). And we handle
allocation for this pio in ploop kernel threads, which already run in
non-atomic context and can use kvmalloc safely.

On Wed, Oct 22, 2025 at 8:11 PM Vasileios Almpanis
<[email protected]> wrote:
When handling multiple concurrent dm-ploop requests, large bio_vec arrays
can be allocated during request processing. These allocations are currently
done with kmalloc_array(GFP_ATOMIC), which can fail under memory pressure
for higher orders (order >= 6, ~256KB). Such failures result in partial or
corrupted I/O, leading to EXT4 directory checksum errors and read-only
remounts under heavy parallel workloads.

This patch adds a fallback mechanism to use kvmalloc_array for
large or failed allocations. If the estimated allocation order is >= 6, or
if the kmalloc_array allocation fails. This avoids high-order GFP_ATOMIC
allocations from interrupt context and ensures more reliable memory allocation
behavior.

https://virtuozzo.atlassian.net/browse/VSTOR-109595
Signed-off-by: Vasileios Almpanis <[email protected]>
Feature: dm-ploop: ploop target driver
---
   drivers/md/dm-ploop-map.c | 46 ++++++++++++++++++++++++++++++---------
   drivers/md/dm-ploop.h     |  1 +
   2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 3fb841f8bcea..899b9bf088b3 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -16,6 +16,7 @@
   #include <linux/error-injection.h>
   #include <linux/uio.h>
   #include <linux/blk-mq.h>
+#include <linux/mm.h>
   #include <uapi/linux/falloc.h>
   #include "dm-ploop.h"
   #include "dm-rq.h"
@@ -89,6 +90,7 @@ void ploop_init_pio(struct ploop *ploop, unsigned int bi_op, 
struct pio *pio)
          pio->ref_index = PLOOP_REF_INDEX_INVALID;
          pio->queue_list_id = PLOOP_LIST_DEFERRED;
          pio->bi_status = BLK_STS_OK;
+       pio->use_kvmalloc = false;
          atomic_set(&pio->remaining, 1);
          pio->piwb = NULL;
          INIT_LIST_HEAD(&pio->list);
@@ -193,8 +195,12 @@ static void ploop_prq_endio(struct pio *pio, void *prq_ptr,
          struct ploop_rq *prq = prq_ptr;
          struct request *rq = prq->rq;

-       if (prq->bvec)
-               kfree(prq->bvec);
+       if (prq->bvec) {
+               if (pio->use_kvmalloc)
+                       kvfree(prq->bvec);
+               else
+                       kfree(prq->bvec);
+       }
          if (prq->css)
                  css_put(prq->css);
          /*
@@ -1963,26 +1969,40 @@ void ploop_index_wb_submit(struct ploop *ploop, struct 
ploop_index_wb *piwb)
          ploop_runners_add_work(ploop, pio);
   }

-static struct bio_vec *ploop_create_bvec_from_rq(struct request *rq)
+static struct bio_vec *ploop_create_bvec_from_rq(struct request *rq, bool 
use_kvmalloc)
   {
          struct bio_vec bv, *bvec, *tmp;
          struct req_iterator rq_iter;
          unsigned int nr_bvec = 0;
+       unsigned int order = 0;

          rq_for_each_bvec(bv, rq, rq_iter)
                  nr_bvec++;

-       bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
-                            GFP_ATOMIC);
-       if (!bvec)
-               goto out;
+       if (use_kvmalloc) {
+               bvec = kvmalloc_array(nr_bvec, sizeof(struct bio_vec),
+                                     GFP_NOIO);
+               if (!bvec)
+                       return ERR_PTR(-ENOMEM);
+       } else {
+               order = get_order(nr_bvec * sizeof(struct bio_vec));
+               /*
+                * order 6 is 262144 bytes. Lets defer such big
+                * allocations to workqueue.
+                */
+               if (order >= 6)
+                       return ERR_PTR(-EAGAIN);
+               bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
+                                    GFP_ATOMIC | __GFP_NOWARN);
+               if (!bvec)
+                       return ERR_PTR(-EAGAIN);
+       }

          tmp = bvec;
          rq_for_each_bvec(bv, rq, rq_iter) {
                  *tmp = bv;
                  tmp++;
          }
-out:
          return bvec;
   }
   ALLOW_ERROR_INJECTION(ploop_create_bvec_from_rq, NULL);
@@ -2003,9 +2023,15 @@ static void ploop_prepare_one_embedded_pio(struct ploop 
*ploop,
                   * Transform a set of bvec arrays related to bios
                   * into a single bvec array (which we can iterate).
                   */
-               bvec = ploop_create_bvec_from_rq(rq);
-               if (!bvec)
+               bvec = ploop_create_bvec_from_rq(rq, pio->use_kvmalloc);
+               if (IS_ERR(bvec)) {
+                       if (PTR_ERR(bvec) == -EAGAIN) {
+                               pio->use_kvmalloc = true;
+                               llist_add((struct llist_node *)(&pio->list), 
&ploop->pios[PLOOP_LIST_PREPARE]);
+                               return;
+                       }
                          goto err_nomem;
+               }
                  prq->bvec = bvec;
   skip_bvec:
                  pio->bi_iter.bi_size = blk_rq_bytes(rq);
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index fc12efeb0cd9..53e8d12064bd 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -316,6 +316,7 @@ struct pio {
          unsigned int ref_index:2;

          u8 queue_list_id; /* id in ploop->pios */
+       bool use_kvmalloc;

          struct ploop_index_wb *piwb;

--
2.43.0

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel
--
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to