Hi all,

While testing the v36 patch with GiST indexes, I came across the segmentation fault below.
-- PG Head+ v36_patch
create table tab1(c1 int, c2 text PRIMARY KEY, c3 bool, c4 timestamp without time zone, c5 timestamp with time zone, p point);
create index gist_idx1 on tab1 using gist(p);
create index gist_idx2 on tab1 using gist(p);
create index gist_idx3 on tab1 using gist(p);
create index gist_idx4 on tab1 using gist(p);
create index gist_idx5 on tab1 using gist(p);

-- Cancel the insert statement in middle:
postgres=# insert into tab1 (select x, x||'_c2', 'T', current_date-x/100, current_date-x/100, point (x,x) from generate_series(1,1000000) x);
^CCancel request sent
ERROR: canceling statement due to user request

-- Segmentation fault during VACUUM(PARALLEL):
postgres=# vacuum(parallel 10) tab1 ;
server closed the connection unexpectedly
        This probably means the server terminated abnormally
        before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.

-- Below is the stack trace:
[centos@parallel-vacuum-testing bin]$ gdb -q -c data/core.14650 postgres
Reading symbols from /home/centos/BLP_Vacuum/postgresql/inst/bin/postgres...done.
[New LWP 14650]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `postgres: centos postgres [local] VACUUM '.
Program terminated with signal 11, Segmentation fault.
#0  0x000000000075e713 in intset_num_entries (intset=0x1f62) at integerset.c:353
353             return intset->num_entries;
Missing separate debuginfos, use: debuginfo-install glibc-2.17-292.el7.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-37.el7_7.2.x86_64 libcom_err-1.42.9-16.el7.x86_64 libselinux-2.5-14.1.el7.x86_64 openssl-libs-1.0.2k-19.el7.x86_64 pcre-8.32-17.el7.x86_64 zlib-1.2.7-18.el7.x86_64
(gdb) bt
#0  0x000000000075e713 in intset_num_entries (intset=0x1f62) at integerset.c:353
#1  0x00000000004cbe0f in gistvacuum_delete_empty_pages (info=0x7fff32f8eba0, stats=0x7f2923b3f4d8) at gistvacuum.c:478
#2  0x00000000004cb353 in gistvacuumcleanup (info=0x7fff32f8eba0, stats=0x7f2923b3f4d8) at gistvacuum.c:124
#3  0x000000000050dcca in index_vacuum_cleanup (info=0x7fff32f8eba0, stats=0x7f2923b3f4d8) at indexam.c:711
#4  0x00000000005079ba in lazy_cleanup_index (indrel=0x7f292e149560, stats=0x2db5e40, reltuples=0, estimated_count=false) at vacuumlazy.c:2380
#5  0x00000000005074f0 in vacuum_one_index (indrel=0x7f292e149560, stats=0x2db5e40, lvshared=0x7f2923b3f460, shared_indstats=0x7f2923b3f4d0, dead_tuples=0x7f2922fbe2c0) at vacuumlazy.c:2196
#6  0x0000000000507428 in vacuum_indexes_leader (Irel=0x2db5de0, nindexes=6, stats=0x2db5e38, vacrelstats=0x2db5cb0, lps=0x2db5e90) at vacuumlazy.c:2155
#7  0x0000000000507126 in lazy_parallel_vacuum_indexes (Irel=0x2db5de0, stats=0x2db5e38, vacrelstats=0x2db5cb0, lps=0x2db5e90, nindexes=6) at vacuumlazy.c:2045
#8  0x0000000000507770 in lazy_cleanup_indexes (Irel=0x2db5de0, stats=0x2db5e38, vacrelstats=0x2db5cb0, lps=0x2db5e90, nindexes=6) at vacuumlazy.c:2300
#9  0x0000000000506076 in lazy_scan_heap (onerel=0x7f292e1473b8, params=0x7fff32f8f3e0, vacrelstats=0x2db5cb0, Irel=0x2db5de0, nindexes=6, aggressive=false) at vacuumlazy.c:1675
#10 0x0000000000504228 in heap_vacuum_rel (onerel=0x7f292e1473b8, params=0x7fff32f8f3e0, bstrategy=0x2deb3a0) at vacuumlazy.c:475
#11 0x00000000006ea059 in table_relation_vacuum (rel=0x7f292e1473b8, params=0x7fff32f8f3e0, bstrategy=0x2deb3a0) at ../../../src/include/access/tableam.h:1432
#12 0x00000000006ecb74 in vacuum_rel (relid=16384, relation=0x2cf5cf8, params=0x7fff32f8f3e0) at vacuum.c:1885
#13 0x00000000006eac8d in vacuum (relations=0x2deb548, params=0x7fff32f8f3e0, bstrategy=0x2deb3a0, isTopLevel=true) at vacuum.c:440
#14 0x00000000006ea776 in ExecVacuum (pstate=0x2deaf90, vacstmt=0x2cf5de0, isTopLevel=true) at vacuum.c:241
#15 0x000000000091da3d in standard_ProcessUtility (pstmt=0x2cf5ea8, queryString=0x2cf51a0 "vacuum(parallel 10) tab1 ;", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x2cf6188, completionTag=0x7fff32f8f840 "") at utility.c:665
#16 0x000000000091d270 in ProcessUtility (pstmt=0x2cf5ea8, queryString=0x2cf51a0 "vacuum(parallel 10) tab1 ;", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x2cf6188, completionTag=0x7fff32f8f840 "") at utility.c:359
#17 0x000000000091c187 in PortalRunUtility (portal=0x2d5c530, pstmt=0x2cf5ea8, isTopLevel=true, setHoldSnapshot=false, dest=0x2cf6188, completionTag=0x7fff32f8f840 "") at pquery.c:1175
#18 0x000000000091c39e in PortalRunMulti (portal=0x2d5c530, isTopLevel=true, setHoldSnapshot=false, dest=0x2cf6188, altdest=0x2cf6188, completionTag=0x7fff32f8f840 "") at pquery.c:1321
#19 0x000000000091b8c8 in PortalRun (portal=0x2d5c530, count=9223372036854775807, isTopLevel=true, run_once=true, dest=0x2cf6188, altdest=0x2cf6188, completionTag=0x7fff32f8f840 "") at pquery.c:796
#20 0x00000000009156d4 in exec_simple_query (query_string=0x2cf51a0 "vacuum(parallel 10) tab1 ;") at postgres.c:1227
#21 0x0000000000919a1c in PostgresMain (argc=1, argv=0x2d1f608, dbname=0x2d1f520 "postgres", username=0x2d1f500 "centos") at postgres.c:4288
#22 0x000000000086de39 in BackendRun (port=0x2d174e0) at postmaster.c:4498
#23 0x000000000086d617 in BackendStartup (port=0x2d174e0) at postmaster.c:4189
#24 0x0000000000869992 in ServerLoop () at postmaster.c:1727
#25 0x0000000000869248 in PostmasterMain (argc=3, argv=0x2cefd70) at postmaster.c:1400
#26 0x0000000000778593 in main (argc=3, argv=0x2cefd70) at main.c:210
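Note that the faulting statement is a plain field dereference, so the IntegerSet pointer passed in (intset=0x1f62) is itself a garbage value rather than a NULL that could be tested for. For reference, the function named in frame #0 is essentially just this (paraphrasing from the source line shown in the core dump):

/* src/backend/lib/integerset.c, the line shown in the backtrace */
uint64
intset_num_entries(IntegerSet *intset)
{
    /* faults here when the caller hands over a bogus pointer such as 0x1f62 */
    return intset->num_entries;
}

So the problem presumably lies one frame up, in how gistvacuum_delete_empty_pages() obtains its IntegerSet from the per-index stats that parallel vacuum passes through shared memory, rather than in integerset.c itself.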
On Wed, Dec 18, 2019 at 3:36 PM Amit Kapila <amit.kapil...@gmail.com> wrote:
>
> On Wed, Dec 18, 2019 at 12:04 PM Amit Kapila <amit.kapil...@gmail.com> wrote:
> >
> > On Wed, Dec 18, 2019 at 11:46 AM Masahiko Sawada
> > <masahiko.saw...@2ndquadrant.com> wrote:
> > >
> > > On Wed, 18 Dec 2019 at 15:03, Amit Kapila <amit.kapil...@gmail.com> wrote:
> > > >
> > > > I was analyzing your changes related to ReinitializeParallelDSM() and
> > > > it seems like we might launch more workers for the bulkdelete phase.
> > > > While creating a parallel context, we used the maximum of "workers
> > > > required for bulkdelete phase" and "workers required for cleanup", but
> > > > now if the number of workers required in the bulkdelete phase is less
> > > > than in the cleanup phase (as mentioned by you in one example), then
> > > > we would launch more workers for the bulkdelete phase.
> > >
> > > Good catch. Currently, when creating a parallel context, the number of
> > > workers passed to CreateParallelContext() is set not only to
> > > pcxt->nworkers but also to pcxt->nworkers_to_launch. We would need to
> > > specify the number of workers to actually launch after creating the
> > > parallel context, or when creating it. Or I think we could call
> > > ReinitializeParallelDSM() even the first time we run the index vacuum.
> >
> > How about just having ReinitializeParallelWorkers, which can be called
> > only via vacuum, even for the first time before the launch of workers
> > as of now?
>
> See in the attached what I have in mind.
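To make that concrete, here is a rough sketch of the idea under discussion; it is not the attached patch, and the variable names and the ReinitializeParallelWorkers() signature are only assumptions based on the proposal above:

/*
 * Sketch only: size the parallel context once, for whichever phase needs
 * the most workers, but cap the number of workers actually launched for
 * the current phase -- including the very first launch -- through the
 * proposed ReinitializeParallelWorkers().
 */

/* when entering parallel lazy vacuum */
lps->pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
                                  Max(nworkers_bulkdel, nworkers_cleanup));
InitializeParallelDSM(lps->pcxt);

/* before each bulkdelete or cleanup pass, the first one included */
nworkers = Min(nworkers_for_this_phase, lps->pcxt->nworkers);
ReinitializeParallelWorkers(lps->pcxt, nworkers);   /* proposed API */
LaunchParallelWorkers(lps->pcxt);

With that, the extra workers sized for the cleanup phase are never launched during bulkdelete, and ReinitializeParallelDSM() does not have to be called just for the first pass.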
> Few other comments:
>
> 1.
> + shared->disable_delay = (params->options & VACOPT_FAST);
>
> This should be part of the third patch.
>
> 2.
> +lazy_parallel_vacuum_indexes(Relation *Irel, IndexBulkDeleteResult **stats,
> +                             LVRelStats *vacrelstats, LVParallelState *lps,
> +                             int nindexes)
> {
> ..
> ..
> + /* Cap by the worker we computed at the beginning of parallel lazy vacuum */
> + nworkers = Min(nworkers, lps->pcxt->nworkers);
> ..
> }
>
> This should be an Assert. In no case can the computed workers be more
> than what we have in the context.
>
> 3.
> + if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
> +     ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0))
> +     nindexes_parallel_cleanup++;
>
> I think the second condition should be VACUUM_OPTION_PARALLEL_COND_CLEANUP.
>
> I have fixed the above comments and some given by me earlier [1] in
> the attached patch. The attached patch is a diff on top of
> v36-0002-Add-parallel-option-to-VACUUM-command.
>
> Few other comments which I have not fixed:
>
> 4.
> + if (Irel[i]->rd_indam->amusemaintenanceworkmem)
> +     nindexes_mwm++;
> +
> + /* Skip indexes that don't participate parallel index vacuum */
> + if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
> +     RelationGetNumberOfBlocks(Irel[i]) < min_parallel_index_scan_size)
> +     continue;
>
> Won't we need to worry about the number of indexes that use
> maintenance_work_mem only for indexes that can participate in a
> parallel vacuum? If so, the above checks need to be reversed.
>
> 5.
> /*
> + * Remember indexes that can participate parallel index vacuum and use
> + * it for index statistics initialization on DSM because the index
> + * size can get bigger during vacuum.
> + */
> + can_parallel_vacuum[i] = true;
>
> I am not able to understand the second part of the comment ("because
> the index size can get bigger during vacuum."). What is its relevance?
>
> 6.
> +/*
> + * Vacuum or cleanup indexes that can be processed by only the leader process
> + * because these indexes don't support parallel operation at that phase.
> + * Therefore this function must be called by the leader process.
> + */
> +static void
> +vacuum_indexes_leader(Relation *Irel, int nindexes, IndexBulkDeleteResult **stats,
> +                      LVRelStats *vacrelstats, LVParallelState *lps)
> {
> ..
>
> Why have you changed the order of the nindexes parameter? I think in the
> previous patch it was the last parameter, and that seems to be a better
> place for it. Also, I think after the latest modifications you can
> remove the second sentence in the above comment ("Therefore this
> function must be called by the leader process.").
>
> 7.
> + for (i = 0; i < nindexes; i++)
> + {
> +     bool leader_only = (get_indstats(lps->lvshared, i) == NULL ||
> +                         skip_parallel_vacuum_index(Irel[i], lps->lvshared));
> +
> +     /* Skip the indexes that can be processed by parallel workers */
> +     if (!leader_only)
> +         continue;
>
> It is better to name this variable skip_index or something like that.
>
> [1] -
> https://www.postgresql.org/message-id/CAA4eK1%2BKBAt1JS%2BasDd7K9C10OtBiyuUC75y8LR6QVnD2wrsMw%40mail.gmail.com
>
> --
> With Regards,
> Amit Kapila.
> EnterpriseDB: http://www.enterprisedb.com

--
With Regards,
Prabhat Kumar Sahu
Skype ID: prabhat.sahu1984

EnterpriseDB Software India Pvt. Ltd.
The Postgres Database Company