[ https://issues.apache.org/jira/browse/HAWQ-62?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Lei Chang closed HAWQ-62. ------------------------- > Incorrect calculation of locus describing of hash table introduce core dump > in OLAP query > ----------------------------------------------------------------------------------------- > > Key: HAWQ-62 > URL: https://issues.apache.org/jira/browse/HAWQ-62 > Project: Apache HAWQ > Issue Type: Bug > Components: Core > Reporter: Ruilong Huo > Assignee: Hubert Zhang > Priority: Critical > Fix For: 2.0.0-beta-incubating > > > During execution of OLAP query, core dump was generated. The root cause > analysis shows that it was introduced by incorrect calculation of locus > describing of hash table. > The detailed symptom, reproduction steps, and recommended fix are as below: > 1. Call stack of core dump > {noformat} > $ sudo gdb hawq/bin/postgres -c core.postgres.148248.1444900646.11.501.501 > ... > Core was generated by `postgres: port 31000, gpadmin parquetolap [local] > con123 [local] cmd1 SELECT '. > Program terminated with signal 11, Segmentation fault. > #0 0x0000003c1ec0f5db in raise () from /lib64/libpthread.so.0 > Missing separate debuginfos, use: debuginfo-install hawq-2.0.0.0-17409.x86_64 > (gdb) bt > #0 0x0000003c1ec0f5db in raise () from /lib64/libpthread.so.0 > #1 0x00000000009e76ac in SafeHandlerForSegvBusIll (processName=0xd52b4b > "Master process", postgres_signal_arg=11) at elog.c:4497 > #2 0x00000000009e78d9 in StandardHandlerForSigillSigsegvSigbus_OnMainThread > (processName=0xd52b4b "Master process", postgres_signal_arg=11) at elog.c:4575 > #3 0x00000000008fa17f in CdbProgramErrorHandler (postgres_signal_arg=11) at > postgres.c:3403 > #4 <signal handler called> > #5 0x000000000087dabf in get_sortgroupclause_tle (sortClause=0x0, > targetList=0x7fa2fc880d38) at tlist.c:194 > #6 0x000000000087dce1 in get_sortgroupclause_expr (sortClause=0x0, > targetList=0x7fa2fc880d38) at tlist.c:281 > #7 0x000000000087dd37 in get_sortgrouplist_exprs > (sortClauses=0x7fa2fc8e1608, targetList=0x7fa2fc880d38) at tlist.c:302 > #8 0x0000000000aefd20 in add_motion_to_dqa_child (plan=0x7fa2fc881e28, > root=0x7fa2fc8e20c0, motion_added=0x7fff1edc67df "") at cdbgroup.c:6801 > #9 0x0000000000ae6370 in join_dqa_coplan (root=0x7fa2fc8e20c0, > ctx=0x7fff1edc6ab0, outer=0x7fa2fc881e28, dqa_index=1) at cdbgroup.c:2330 > #10 0x0000000000ae4d19 in make_three_stage_agg_plan (root=0x7fa2fc8e20c0, > ctx=0x7fff1edc6ab0) at cdbgroup.c:1690 > #11 0x0000000000ae3110 in cdb_grouping_planner (root=0x7fa2fc8e20c0, > agg_counts=0x7fff1edc6dd0, group_context=0x7fff1edc6d50) at cdbgroup.c:899 > #12 0x0000000000848bbb in grouping_planner (root=0x7fa2fc8e20c0, > tuple_fraction=0) at planner.c:1800 > #13 0x0000000000847468 in subquery_planner (glob=0x2e6faa8, > parse=0x7fa2fc8da5c0, parent_root=0x0, tuple_fraction=0, > subroot=0x7fff1edc70d0, config=0x7fa2fc8e2048) at planner.c:1002 > #14 0x000000000084660e in standard_planner (parse=0x7fa2fc8da5c0, > cursorOptions=0, boundParams=0x0) at planner.c:591 > #15 0x00000000008462c2 in resource_negotiator (parse=0x2e6df80, > cursorOptions=0, boundParams=0x0, resourceLife=QRL_ONCE, > result=0x7fff1edc7308) at planner.c:468 > #16 0x0000000000845ba7 in planner (parse=0x2e6df80, cursorOptions=0, > boundParams=0x0, resourceLife=QRL_ONCE) at planner.c:303 > #17 0x00000000008f5b11 in pg_plan_query (querytree=0x2e6df80, > boundParams=0x0, resource_life=QRL_ONCE) at postgres.c:816 > #18 0x00000000008f5c1c in pg_plan_queries (querytrees=0x7fa2fc8b19b8, > boundParams=0x0, needSnapshot=0 '\000', resource_life=QRL_ONCE) at > postgres.c:889 > #19 0x00000000008f7220 in exec_simple_query ( > query_string=0x2e17f78 "SELECT > sale.cn,sale.prc,sale.vn,sale.pn,GROUP_ID(), > TO_CHAR(COALESCE(COVAR_POP(floor(sale.vn),floor(sale.cn)),0),'99999999.9999999'),TO_CHAR(COALESCE(COUNT(DISTINCT > floor(sale.pn-sale.vn)),0),'9999999"..., seqServerHost=0x0, > seqServerPort=-1) at postgres.c:1647 > #20 0x00000000008fc0e0 in PostgresMain (argc=4, argv=0x2d9af18, > username=0x2d966b8 "gpadmin") at postgres.c:4691 > #21 0x000000000089ed72 in BackendRun (port=0x2d1beb0) at postmaster.c:5845 > #22 0x000000000089e1fc in BackendStartup (port=0x2d1beb0) at postmaster.c:5438 > #23 0x000000000089891a in ServerLoop () at postmaster.c:2140 > #24 0x00000000008979f0 in PostmasterMain (argc=9, argv=0x2d161b0) at > postmaster.c:1432 > #25 0x00000000007b14b2 in main (argc=9, argv=0x2d161b0) at main.c:226 > {noformat} > 2. Steps for reproduction > (1) Schema and data > {code} > create table sale > ( > cn int not null, > vn int not null, > pn int not null, > dt date not null, > qty int not null, > prc float not null > --primary key (cn, vn, pn) > ) WITH (appendonly=true, orientation = parquet) distributed by (cn,vn,pn); > insert into sale values > ( 2, 40, 100, '1401-1-1', 1100, 2400), > ( 1, 10, 200, '1401-3-1', 1, 0), > ( 3, 40, 200, '1401-4-1', 1, 0), > ( 1, 20, 100, '1401-5-1', 1, 0), > ( 1, 30, 300, '1401-5-2', 1, 0), > ( 1, 50, 400, '1401-6-1', 1, 0), > ( 2, 50, 400, '1401-6-1', 1, 0), > ( 1, 30, 500, '1401-6-1', 12, 5), > ( 3, 30, 500, '1401-6-1', 12, 5), > ( 3, 30, 600, '1401-6-1', 12, 5), > ( 4, 40, 700, '1401-6-1', 1, 1), > ( 4, 40, 800, '1401-6-1', 1, 1); > {code} > (2) SQL statement > {code} > -- Query involving COVAR_POP() function > SELECT sale.cn,sale.prc,sale.vn,sale.pn,GROUP_ID(), > TO_CHAR(COALESCE(COVAR_POP(floor(sale.vn),floor(sale.cn)),0),'99999999.9999999'),TO_CHAR(COALESCE(COUNT(DISTINCT > > floor(sale.pn-sale.vn)),0),'99999999.9999999'),TO_CHAR(COALESCE(COUNT(DISTINCT > > floor(sale.qty)),0),'99999999.9999999'),TO_CHAR(COALESCE(VAR_SAMP(floor(sale.prc)),0),'99999999.9999999'),TO_CHAR(COALESCE(STDDEV(DISTINCT > > floor(sale.qty-sale.pn)),0),'99999999.9999999'),TO_CHAR(COALESCE(MIN(floor(sale.pn)),0),'99999999.9999999'),TO_CHAR(COALESCE(STDDEV_SAMP(floor(sale.cn)),0),'99999999.9999999'),TO_CHAR(COALESCE(VAR_SAMP(floor(sale.prc+sale.prc)),0),'99999999.9999999'),TO_CHAR(COALESCE(STDDEV_SAMP(floor(sale.qty)),0),'99999999.9999999'),TO_CHAR(COALESCE(SUM(floor(sale.cn-sale.pn)),0),'99999999.9999999') > FROM sale > GROUP BY > (sale.cn,sale.pn),(sale.dt,sale.cn),(sale.dt,sale.cn),(sale.vn),(sale.pn,sale.qty,sale.pn),CUBE((sale.vn,sale.cn),(sale.cn,sale.qty),(sale.vn)),(),sale.prc; > {code} > (3) psql output: > {noformat} > psql:query197.sql:5: server closed the connection unexpectedly > This probably means the server terminated abnormally > before or while processing the request. > psql:query197.sql:5: connection to server was lost > {noformat} > 3. Potential fix > {noformat} > rhuo-mbp:apache-hawq rhuo$ git diff src/backend/cdb/cdbpathlocus.c > diff --git a/src/backend/cdb/cdbpathlocus.c b/src/backend/cdb/cdbpathlocus.c > index 2471fd4..9fdd485 100644 > --- a/src/backend/cdb/cdbpathlocus.c > +++ b/src/backend/cdb/cdbpathlocus.c > @@ -303,8 +303,9 @@ cdbpathlocus_from_baserel(struct PlannerInfo *root, > } > } > } > - if (isRelationRuntimeHash && (policy->nattrs > 0) && > allocatedResource && > - (list_length(root->glob->resource->segments) == > (policy->bucketnum))) > + if (isRelationRuntimeHash && (policy->nattrs > 0) && > ((allocatedResource && > + (list_length(root->glob->resource->segments) == > (policy->bucketnum))) || > + (!allocatedResource))) > { > List *partkey = cdb_build_distribution_pathkeys(root, > rel, > {noformat} -- This message was sent by Atlassian JIRA (v6.3.4#6332)