This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


The following commit(s) were added to refs/heads/main by this push:
     new f225eeb2f3d FIX: Invalid relcache leak WARNING logged in autovacuum
f225eeb2f3d is described below

commit f225eeb2f3da9018d5a120df4f07a35c3e6e9810
Author: zhoujiaqi <zhouji...@hashdata.cn>
AuthorDate: Mon Aug 11 13:52:15 2025 +0800

    FIX: Invalid relcache leak WARNING logged in autovacuum
    
    The autovacuum launcher process periodically launches workers to vacuum the 
table.
    During this process, the UDF `pg_catalog.gp_acquire_sample_rows` will be 
called.
    Also the vacuum task always be canceled by launcher.
    
    The plan of `pg_catalog.gp_acquire_sample_rows` is:
    ```
                                       QUERY PLAN
    
---------------------------------------------------------------------------------
     Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..45.02 rows=3000 
width=32)
       Output: (gp_acquire_sample_rows('17018'::oid, 1250, false))
       ->  ProjectSet  (cost=0.00..5.02 rows=1000 width=32)
             Output: gp_acquire_sample_rows('17018'::oid, 1250, false)
             ->  Result  (cost=0.00..0.01 rows=1 width=0)
     Optimizer: Postgres query optimizer
    (6 rows)
    ```
    
    In actual examples, we often encounter relcache leaks caused by 
`pg_catalog.gp_acquire_sample_rows`.
    In fact, this warning is not caused by the UDF itself.
    
    The following are the complete steps to reproduce(not stable reproduce)
    
    1. User use the insert/update/delete SQL. Auto-vacuum is enabled.
    2. The auto-vacuum worker process call the 
`pg_catalog.gp_acquire_sample_rows`
        2.1 The vacuum launches in master cancel the vacuum query.
        2.2 The vacuum worker in master process the interrupt in the 
intercontect.
            So the gather motion will be aborted.
        2.3 The segment do the tuple sender in the motion(`doSendTuple`).But it
            found the connection is NOT alive. Also it have not recv the SIGN 
INT
            in this time.
            So segment mark the `StopRequested` to true, and finish the current 
motion,
            and the function `pg_catalog.gp_acquire_sample_rows` in project set 
can't
            call the `table_close` in this time.
        2.4 The segment call the `PortalDrop` to destory the resowner which 
inside the current
            portal, and current portal status won't be FAIL, because current 
segment still
            have not recv the SIGN INT.
            The resowner found the leaked relcache, log the WARNING.
    3. After step2, segments recv the SIGN INT, But nothing to do.
---
 contrib/interconnect/udp/ic_udpifc.c    |  5 ++++-
 src/backend/executor/nodeMotion.c       |  3 ++-
 src/backend/utils/mmgr/portalmem.c      |  3 ++-
 src/include/utils/portal.h              |  3 +++
 src/test/regress/expected/vacuum_gp.out | 28 ++++++++++++++++++++++++++++
 src/test/regress/sql/vacuum_gp.sql      | 17 ++++++++++++++++-
 6 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/contrib/interconnect/udp/ic_udpifc.c 
b/contrib/interconnect/udp/ic_udpifc.c
index 63e8c9301dd..c28511fe828 100644
--- a/contrib/interconnect/udp/ic_udpifc.c
+++ b/contrib/interconnect/udp/ic_udpifc.c
@@ -3965,7 +3965,10 @@ receiveChunksUDPIFCLoop(ChunkTransportState 
*pTransportStates, ChunkTransportSta
                /* check the potential errors in rx thread. */
                checkRxThreadError();
 
-               /* do not check interrupts when holding the lock */
+               FaultInjector_InjectFaultIfSet("interconnect_stop_recv_chunk",
+                                                                               
        DDLNotSpecified,
+                                                                               
        "" /* databaseName */ ,
+                                                                               
        "" /* tableName */ );
                ML_CHECK_FOR_INTERRUPTS(pTransportStates->teardownActive);
 
                /*
diff --git a/src/backend/executor/nodeMotion.c 
b/src/backend/executor/nodeMotion.c
index b6514f2a0ce..1229d460a55 100644
--- a/src/backend/executor/nodeMotion.c
+++ b/src/backend/executor/nodeMotion.c
@@ -31,7 +31,7 @@
 #include "utils/wait_event.h"
 #include "miscadmin.h"
 #include "utils/memutils.h"
-
+#include "tcop/pquery.h" /* ActivePortal */
 
 /* #define MEASURE_MOTION_TIME */
 
@@ -269,6 +269,7 @@ execMotionSender(MotionState *node)
 
                        if (node->stopRequested)
                        {
+                               ActivePortal->stop_requested_in_motion = true;
                                elog(gp_workfile_caching_loglevel, "Motion 
calling Squelch on child node");
                                /* propagate stop notification to our children 
*/
                                ExecSquelchNode(outerNode, true);
diff --git a/src/backend/utils/mmgr/portalmem.c 
b/src/backend/utils/mmgr/portalmem.c
index 45812a608e0..2835fbd10f1 100644
--- a/src/backend/utils/mmgr/portalmem.c
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -224,6 +224,7 @@ CreatePortal(const char *name, bool allowDup, bool 
dupSilent)
        portal->atEnd = true;           /* disallow fetches until query is set 
*/
        portal->visible = true;
        portal->creation_time = GetCurrentStatementStartTimestamp();
+       portal->stop_requested_in_motion = false;
 
        if (IsResQueueEnabled())
        {
@@ -587,7 +588,7 @@ PortalDrop(Portal portal, bool isTopCommit)
        if (portal->resowner &&
                (!isTopCommit || portal->status == PORTAL_FAILED))
        {
-               bool            isCommit = (portal->status != PORTAL_FAILED);
+               bool            isCommit = (portal->status != PORTAL_FAILED) && 
!portal->stop_requested_in_motion;
 
                ResourceOwnerRelease(portal->resowner,
                                                         
RESOURCE_RELEASE_BEFORE_LOCKS,
diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h
index fb940fa1ecf..e592d1e8863 100644
--- a/src/include/utils/portal.h
+++ b/src/include/utils/portal.h
@@ -225,6 +225,9 @@ typedef struct PortalData
 
        /* MPP: is this portal a CURSOR, or protocol level portal? */
        bool            is_extended_query; /* simple or extended query 
protocol? */
+
+       /* current motion stop requested? */
+       bool            stop_requested_in_motion;
 }                      PortalData;
 
 /*
diff --git a/src/test/regress/expected/vacuum_gp.out 
b/src/test/regress/expected/vacuum_gp.out
index 35d1a1e4fed..daeb2504559 100644
--- a/src/test/regress/expected/vacuum_gp.out
+++ b/src/test/regress/expected/vacuum_gp.out
@@ -437,3 +437,31 @@ SELECT reltuples, relname FROM pg_class WHERE 
oid='vac_reltuple_distortion'::reg
      1e+06 | vac_reltuple_distortion
 (1 row)
 
+-- test wrong log relcache leak in pg_catalog.gp_acquire_sample_rows
+-- start_ignore
+drop table if exists relcache_leak_in_motion;
+NOTICE:  table "relcache_leak_in_motion" does not exist, skipping
+-- end_ignore
+create table relcache_leak_in_motion(v1 int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'v1' 
as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into relcache_leak_in_motion values(generate_series(0, 10000));
+SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'interrupt', dbid)
+  FROM gp_segment_configuration WHERE content = -1 and role='p';
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+analyze relcache_leak_in_motion;
+ERROR:  canceling statement due to user request
+SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'reset', dbid)
+  FROM gp_segment_configuration WHERE content = -1 and role='p';
+ gp_inject_fault 
+-----------------
+ Success:
+(1 row)
+
+-- start_ignore
+drop table if exists relcache_leak_in_motion;
+-- end_ignore
diff --git a/src/test/regress/sql/vacuum_gp.sql 
b/src/test/regress/sql/vacuum_gp.sql
index 04f88df9e0c..198a80f4a93 100644
--- a/src/test/regress/sql/vacuum_gp.sql
+++ b/src/test/regress/sql/vacuum_gp.sql
@@ -290,4 +290,19 @@ VACUUM vac_reltuple_distortion;
 VACUUM vac_reltuple_distortion; -- 2nd call to VACUUM after ANALYZE
 SELECT reltuples, relname FROM pg_class WHERE 
oid='vac_reltuple_distortion'::regclass;
 VACUUM vac_reltuple_distortion;
-SELECT reltuples, relname FROM pg_class WHERE 
oid='vac_reltuple_distortion'::regclass;
\ No newline at end of file
+SELECT reltuples, relname FROM pg_class WHERE 
oid='vac_reltuple_distortion'::regclass;
+
+-- test wrong log relcache leak in pg_catalog.gp_acquire_sample_rows
+-- start_ignore
+drop table if exists relcache_leak_in_motion;
+-- end_ignore
+create table relcache_leak_in_motion(v1 int);
+insert into relcache_leak_in_motion values(generate_series(0, 10000));
+SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'interrupt', dbid)
+  FROM gp_segment_configuration WHERE content = -1 and role='p';
+analyze relcache_leak_in_motion;
+SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'reset', dbid)
+  FROM gp_segment_configuration WHERE content = -1 and role='p';
+-- start_ignore
+drop table if exists relcache_leak_in_motion;
+-- end_ignore


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cloudberry.apache.org
For additional commands, e-mail: commits-h...@cloudberry.apache.org

Reply via email to