The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will 
appear at g...@bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.34
------>
commit fac71eb8970e82ea7d98c78514991bd9682f099b
Author: Alexey Kuznetsov <kuz...@virtuozzo.com>
Date:   Mon May 12 23:14:05 2025 +0800

    fs/fuse/kio: relax congestion avoidance rule
    
    Patch 99c9c1cd19193 ("fs/fuse/kio: severe reordering when cs is congested")
    fixed reordering happening when cs congested, but unexpectedly introduced
    serious degradation in random read performance.
    
    The problem is pretty hard and it requires serious investigation.
    So, for the current release we suggest undoing this patch, which should
    be safe since the problem it fixed was never even observed
    in practice. Yet, a retest of read tests for replicas is still required.
    
    Kernel module "fuse_kio_pcs" option "relax_congavoid" is added.
    
    Values:
     - 1 (default): returns the behaviour as it was before the patch
          "fs/fuse/kio: severe reordering when cs is congested"
     - 0: returns formally correct but badly performing behaviour
     - 2: disables congestion avoidance for local CSes, which might be
          a mostly satisfactory variant, yet it is experimental.
    
    Summary of the problem: when target CS is congested, requests are
    queued in client waiting for completion of some of sent requests.
    
    But at high rates (>~1Miops) our own local cpu gets so busy that,
    to our algorithm, it looks indistinguishable from actual CS congestion.
    
    So, requests start to be queued, which drastically increases processing
    cost and puts even more pressure on the cpu. This unexpected positive
    feedback reduces iops in 64 thread tests by >20%, and I was able to
    construct a configuration where it is worse by 50%, which is unacceptable.
    
    Proper solution is still unknown, positive feedback problems are difficult.
    Either we change queue accounting not to count locally queued requests,
    or we optimize the code to make requeueing cheap, or we disable congestion
    limits for the fast path; none of these are quick fixes.
    
    Affects: #VSTOR-103628
    https://virtuozzo.atlassian.net/browse/VSTOR-103628
    
    Signed-off-by: Alexey Kuznetsov <kuz...@virtuozzo.com>
    Feature: vStorage
---
 fs/fuse/kio/pcs/pcs_cs.c  |  4 +---
 fs/fuse/kio/pcs/pcs_cs.h  | 12 +++++++++++-
 fs/fuse/kio/pcs/pcs_map.c | 23 ++++++++++++++++++-----
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/fs/fuse/kio/pcs/pcs_cs.c b/fs/fuse/kio/pcs/pcs_cs.c
index 299ece862cd3b..6eef58994feff 100644
--- a/fs/fuse/kio/pcs/pcs_cs.c
+++ b/fs/fuse/kio/pcs/pcs_cs.c
@@ -1549,11 +1549,10 @@ void pcs_cs_set_stat_up(struct pcs_cs_set *set)
        pcs_cs_for_each_entry(set, do_update_stat, 0);
 }
 
-int pcs_cs_cong_enqueue_cond(struct pcs_int_request *ireq, struct pcs_cs *cs)
+int __pcs_cs_cong_enqueue_cond(struct pcs_int_request *ireq, struct pcs_cs *cs)
 {
        int queued = 0;
 
-       spin_lock(&cs->lock);
        if (cs->in_flight >= cs->eff_cwnd ||
            (cs->cong_queue_len && !(ireq->flags & IREQ_F_REQUEUED))) {
                queued = 1;
@@ -1571,7 +1570,6 @@ int pcs_cs_cong_enqueue_cond(struct pcs_int_request 
*ireq, struct pcs_cs *cs)
                             queued, ireq, smp_processor_id(), 
cs->cong_queue_len,
                             cs->in_flight, cs->eff_cwnd, 
DENTRY_ARGS(ireq->dentry),
                             ireq->iochunk.chunk + ireq->iochunk.offset, 
ireq->iochunk.size);
-       spin_unlock(&cs->lock);
        return queued;
 }
 
diff --git a/fs/fuse/kio/pcs/pcs_cs.h b/fs/fuse/kio/pcs/pcs_cs.h
index d2e80fed7882b..1fdc502663584 100644
--- a/fs/fuse/kio/pcs/pcs_cs.h
+++ b/fs/fuse/kio/pcs/pcs_cs.h
@@ -128,7 +128,17 @@ static inline void pcs_cs_activate_cong_queue(struct 
pcs_cs *cs)
        list_splice_tail_init(&cs->cong_queue, &cs->active_list);
 }
 
-int pcs_cs_cong_enqueue_cond(struct pcs_int_request *ireq, struct pcs_cs *cs);
+int __pcs_cs_cong_enqueue_cond(struct pcs_int_request *ireq, struct pcs_cs 
*cs);
+
+static inline int pcs_cs_cong_enqueue_cond(struct pcs_int_request *ireq, 
struct pcs_cs *cs)
+{
+       int queued = 0;
+
+       spin_lock(&cs->lock);
+       queued = __pcs_cs_cong_enqueue_cond(ireq, cs);
+       spin_unlock(&cs->lock);
+       return queued;
+}
 
 #define PCS_CS_HASH_SIZE 1024
 #define PCS_HOST_HASH_SIZE 64
diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
index 5d1dd5a13b1cd..b491f38bb053c 100644
--- a/fs/fuse/kio/pcs/pcs_map.c
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -46,6 +46,10 @@ unsigned int cs_enable_fanout = 0;
 module_param(cs_enable_fanout, uint, 0644);
 MODULE_PARM_DESC(cs_enable_fanout, "Enable CS fanout");
 
+int relax_congavoid = 1;
+module_param(relax_congavoid, int, 0644);
+MODULE_PARM_DESC(relax_congavoid, "Weaker congestion avoidance");
+
 static struct pcs_cs_list *cs_link_to_cs_list(struct pcs_cs_link *csl)
 {
        struct pcs_cs_record *cs_rec;
@@ -2063,15 +2067,24 @@ static int pcs_cslist_submit_read(struct 
pcs_int_request *ireq, struct pcs_cs_li
        spin_lock(&cs->lock);
        cs_cwnd_use_or_lose(cs);
        allot = cs->eff_cwnd - cs->in_flight;
-       spin_unlock(&cs->lock);
 
-       if (allot < 0 || cs->cong_queue_len) {
-               if (pcs_cs_cong_enqueue_cond(ireq, cs))
-                       return 0;
+       if ((relax_congavoid & 2) && test_bit(CS_SF_LOCAL, &cs->state))
+               goto skip_cong_avoid;
+
+       if (relax_congavoid && allot >= 0 && list_empty(&cs->cong_queue))
+               goto skip_cong_avoid;
+
+       if (__pcs_cs_cong_enqueue_cond(ireq, cs)) {
+               spin_unlock(&cs->lock);
+               return 0;
        }
 
+skip_cong_avoid:
+
+       spin_unlock(&cs->lock);
+
        iochunk = READ_ONCE(ireq->dentry->cluster->cfg.lmss);
-       if (allot < iochunk)
+       if (allot < (int)iochunk)
                allot = iochunk;
 
        if (test_bit(CS_SF_LOCAL, &cs->state))
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to