This is an automated email from the ASF dual-hosted git repository. maxyang pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
The following commit(s) were added to refs/heads/main by this push: new f225eeb2f3d FIX: Invalid relcache leak WARNING logged in autovacuum f225eeb2f3d is described below commit f225eeb2f3da9018d5a120df4f07a35c3e6e9810 Author: zhoujiaqi <zhouji...@hashdata.cn> AuthorDate: Mon Aug 11 13:52:15 2025 +0800 FIX: Invalid relcache leak WARNING logged in autovacuum The autovacuum launcher process periodically launches workers to vacuum the table. During this process, the UDF `pg_catalog.gp_acquire_sample_rows` will be called. Also the vacuum task always be canceled by launcher. The plan of `pg_catalog.gp_acquire_sample_rows` is: ``` QUERY PLAN --------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..45.02 rows=3000 width=32) Output: (gp_acquire_sample_rows('17018'::oid, 1250, false)) -> ProjectSet (cost=0.00..5.02 rows=1000 width=32) Output: gp_acquire_sample_rows('17018'::oid, 1250, false) -> Result (cost=0.00..0.01 rows=1 width=0) Optimizer: Postgres query optimizer (6 rows) ``` In actual examples, we often encounter relcache leaks caused by `pg_catalog.gp_acquire_sample_rows`. In fact, this warning is not caused by the UDF itself. The following are the complete steps to reproduce(not stable reproduce) 1. User use the insert/update/delete SQL. Auto-vacuum is enabled. 2. The auto-vacuum worker process call the `pg_catalog.gp_acquire_sample_rows` 2.1 The vacuum launches in master cancel the vacuum query. 2.2 The vacuum worker in master process the interrupt in the intercontect. So the gather motion will be aborted. 2.3 The segment do the tuple sender in the motion(`doSendTuple`).But it found the connection is NOT alive. Also it have not recv the SIGN INT in this time. So segment mark the `StopRequested` to true, and finish the current motion, and the function `pg_catalog.gp_acquire_sample_rows` in project set can't call the `table_close` in this time. 2.4 The segment call the `PortalDrop` to destory the resowner which inside the current portal, and current portal status won't be FAIL, because current segment still have not recv the SIGN INT. The resowner found the leaked relcache, log the WARNING. 3. After step2, segments recv the SIGN INT, But nothing to do. --- contrib/interconnect/udp/ic_udpifc.c | 5 ++++- src/backend/executor/nodeMotion.c | 3 ++- src/backend/utils/mmgr/portalmem.c | 3 ++- src/include/utils/portal.h | 3 +++ src/test/regress/expected/vacuum_gp.out | 28 ++++++++++++++++++++++++++++ src/test/regress/sql/vacuum_gp.sql | 17 ++++++++++++++++- 6 files changed, 55 insertions(+), 4 deletions(-) diff --git a/contrib/interconnect/udp/ic_udpifc.c b/contrib/interconnect/udp/ic_udpifc.c index 63e8c9301dd..c28511fe828 100644 --- a/contrib/interconnect/udp/ic_udpifc.c +++ b/contrib/interconnect/udp/ic_udpifc.c @@ -3965,7 +3965,10 @@ receiveChunksUDPIFCLoop(ChunkTransportState *pTransportStates, ChunkTransportSta /* check the potential errors in rx thread. */ checkRxThreadError(); - /* do not check interrupts when holding the lock */ + FaultInjector_InjectFaultIfSet("interconnect_stop_recv_chunk", + DDLNotSpecified, + "" /* databaseName */ , + "" /* tableName */ ); ML_CHECK_FOR_INTERRUPTS(pTransportStates->teardownActive); /* diff --git a/src/backend/executor/nodeMotion.c b/src/backend/executor/nodeMotion.c index b6514f2a0ce..1229d460a55 100644 --- a/src/backend/executor/nodeMotion.c +++ b/src/backend/executor/nodeMotion.c @@ -31,7 +31,7 @@ #include "utils/wait_event.h" #include "miscadmin.h" #include "utils/memutils.h" - +#include "tcop/pquery.h" /* ActivePortal */ /* #define MEASURE_MOTION_TIME */ @@ -269,6 +269,7 @@ execMotionSender(MotionState *node) if (node->stopRequested) { + ActivePortal->stop_requested_in_motion = true; elog(gp_workfile_caching_loglevel, "Motion calling Squelch on child node"); /* propagate stop notification to our children */ ExecSquelchNode(outerNode, true); diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 45812a608e0..2835fbd10f1 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -224,6 +224,7 @@ CreatePortal(const char *name, bool allowDup, bool dupSilent) portal->atEnd = true; /* disallow fetches until query is set */ portal->visible = true; portal->creation_time = GetCurrentStatementStartTimestamp(); + portal->stop_requested_in_motion = false; if (IsResQueueEnabled()) { @@ -587,7 +588,7 @@ PortalDrop(Portal portal, bool isTopCommit) if (portal->resowner && (!isTopCommit || portal->status == PORTAL_FAILED)) { - bool isCommit = (portal->status != PORTAL_FAILED); + bool isCommit = (portal->status != PORTAL_FAILED) && !portal->stop_requested_in_motion; ResourceOwnerRelease(portal->resowner, RESOURCE_RELEASE_BEFORE_LOCKS, diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index fb940fa1ecf..e592d1e8863 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -225,6 +225,9 @@ typedef struct PortalData /* MPP: is this portal a CURSOR, or protocol level portal? */ bool is_extended_query; /* simple or extended query protocol? */ + + /* current motion stop requested? */ + bool stop_requested_in_motion; } PortalData; /* diff --git a/src/test/regress/expected/vacuum_gp.out b/src/test/regress/expected/vacuum_gp.out index 35d1a1e4fed..daeb2504559 100644 --- a/src/test/regress/expected/vacuum_gp.out +++ b/src/test/regress/expected/vacuum_gp.out @@ -437,3 +437,31 @@ SELECT reltuples, relname FROM pg_class WHERE oid='vac_reltuple_distortion'::reg 1e+06 | vac_reltuple_distortion (1 row) +-- test wrong log relcache leak in pg_catalog.gp_acquire_sample_rows +-- start_ignore +drop table if exists relcache_leak_in_motion; +NOTICE: table "relcache_leak_in_motion" does not exist, skipping +-- end_ignore +create table relcache_leak_in_motion(v1 int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'v1' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into relcache_leak_in_motion values(generate_series(0, 10000)); +SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'interrupt', dbid) + FROM gp_segment_configuration WHERE content = -1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) + +analyze relcache_leak_in_motion; +ERROR: canceling statement due to user request +SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'reset', dbid) + FROM gp_segment_configuration WHERE content = -1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- start_ignore +drop table if exists relcache_leak_in_motion; +-- end_ignore diff --git a/src/test/regress/sql/vacuum_gp.sql b/src/test/regress/sql/vacuum_gp.sql index 04f88df9e0c..198a80f4a93 100644 --- a/src/test/regress/sql/vacuum_gp.sql +++ b/src/test/regress/sql/vacuum_gp.sql @@ -290,4 +290,19 @@ VACUUM vac_reltuple_distortion; VACUUM vac_reltuple_distortion; -- 2nd call to VACUUM after ANALYZE SELECT reltuples, relname FROM pg_class WHERE oid='vac_reltuple_distortion'::regclass; VACUUM vac_reltuple_distortion; -SELECT reltuples, relname FROM pg_class WHERE oid='vac_reltuple_distortion'::regclass; \ No newline at end of file +SELECT reltuples, relname FROM pg_class WHERE oid='vac_reltuple_distortion'::regclass; + +-- test wrong log relcache leak in pg_catalog.gp_acquire_sample_rows +-- start_ignore +drop table if exists relcache_leak_in_motion; +-- end_ignore +create table relcache_leak_in_motion(v1 int); +insert into relcache_leak_in_motion values(generate_series(0, 10000)); +SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'interrupt', dbid) + FROM gp_segment_configuration WHERE content = -1 and role='p'; +analyze relcache_leak_in_motion; +SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'reset', dbid) + FROM gp_segment_configuration WHERE content = -1 and role='p'; +-- start_ignore +drop table if exists relcache_leak_in_motion; +-- end_ignore --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@cloudberry.apache.org For additional commands, e-mail: commits-h...@cloudberry.apache.org