Ruilong Huo created HAWQ-62:
-------------------------------

             Summary: Incorrect calculation of locus describing of hash table 
introduce core dump in OLAP query
                 Key: HAWQ-62
                 URL: https://issues.apache.org/jira/browse/HAWQ-62
             Project: Apache HAWQ
          Issue Type: Bug
            Reporter: Ruilong Huo
            Assignee: Hubert Zhang


During execution of OLAP query, core dump was generated. The root cause 
analysis shows that it was introduced by incorrect calculation of locus 
describing of hash table.

The detailed symptom, reproduction steps, and recommended fix are as below:

1. Call stack of core dump
{noformat}
$ sudo gdb hawq/bin/postgres -c core.postgres.148248.1444900646.11.501.501
...
Core was generated by `postgres: port 31000, gpadmin parquetolap [local] con123 
[local] cmd1 SELECT  '.
Program terminated with signal 11, Segmentation fault.
#0  0x0000003c1ec0f5db in raise () from /lib64/libpthread.so.0
Missing separate debuginfos, use: debuginfo-install hawq-2.0.0.0-17409.x86_64
(gdb) bt
#0  0x0000003c1ec0f5db in raise () from /lib64/libpthread.so.0
#1  0x00000000009e76ac in SafeHandlerForSegvBusIll (processName=0xd52b4b 
"Master process", postgres_signal_arg=11) at elog.c:4497
#2  0x00000000009e78d9 in StandardHandlerForSigillSigsegvSigbus_OnMainThread 
(processName=0xd52b4b "Master process", postgres_signal_arg=11) at elog.c:4575
#3  0x00000000008fa17f in CdbProgramErrorHandler (postgres_signal_arg=11) at 
postgres.c:3403
#4  <signal handler called>
#5  0x000000000087dabf in get_sortgroupclause_tle (sortClause=0x0, 
targetList=0x7fa2fc880d38) at tlist.c:194
#6  0x000000000087dce1 in get_sortgroupclause_expr (sortClause=0x0, 
targetList=0x7fa2fc880d38) at tlist.c:281
#7  0x000000000087dd37 in get_sortgrouplist_exprs (sortClauses=0x7fa2fc8e1608, 
targetList=0x7fa2fc880d38) at tlist.c:302
#8  0x0000000000aefd20 in add_motion_to_dqa_child (plan=0x7fa2fc881e28, 
root=0x7fa2fc8e20c0, motion_added=0x7fff1edc67df "") at cdbgroup.c:6801
#9  0x0000000000ae6370 in join_dqa_coplan (root=0x7fa2fc8e20c0, 
ctx=0x7fff1edc6ab0, outer=0x7fa2fc881e28, dqa_index=1) at cdbgroup.c:2330
#10 0x0000000000ae4d19 in make_three_stage_agg_plan (root=0x7fa2fc8e20c0, 
ctx=0x7fff1edc6ab0) at cdbgroup.c:1690
#11 0x0000000000ae3110 in cdb_grouping_planner (root=0x7fa2fc8e20c0, 
agg_counts=0x7fff1edc6dd0, group_context=0x7fff1edc6d50) at cdbgroup.c:899
#12 0x0000000000848bbb in grouping_planner (root=0x7fa2fc8e20c0, 
tuple_fraction=0) at planner.c:1800
#13 0x0000000000847468 in subquery_planner (glob=0x2e6faa8, 
parse=0x7fa2fc8da5c0, parent_root=0x0, tuple_fraction=0, 
subroot=0x7fff1edc70d0, config=0x7fa2fc8e2048) at planner.c:1002
#14 0x000000000084660e in standard_planner (parse=0x7fa2fc8da5c0, 
cursorOptions=0, boundParams=0x0) at planner.c:591
#15 0x00000000008462c2 in resource_negotiator (parse=0x2e6df80, 
cursorOptions=0, boundParams=0x0, resourceLife=QRL_ONCE, result=0x7fff1edc7308) 
at planner.c:468
#16 0x0000000000845ba7 in planner (parse=0x2e6df80, cursorOptions=0, 
boundParams=0x0, resourceLife=QRL_ONCE) at planner.c:303
#17 0x00000000008f5b11 in pg_plan_query (querytree=0x2e6df80, boundParams=0x0, 
resource_life=QRL_ONCE) at postgres.c:816
#18 0x00000000008f5c1c in pg_plan_queries (querytrees=0x7fa2fc8b19b8, 
boundParams=0x0, needSnapshot=0 '\000', resource_life=QRL_ONCE) at 
postgres.c:889
#19 0x00000000008f7220 in exec_simple_query (
    query_string=0x2e17f78 "SELECT sale.cn,sale.prc,sale.vn,sale.pn,GROUP_ID(), 
TO_CHAR(COALESCE(COVAR_POP(floor(sale.vn),floor(sale.cn)),0),'99999999.9999999'),TO_CHAR(COALESCE(COUNT(DISTINCT
 floor(sale.pn-sale.vn)),0),'9999999"..., seqServerHost=0x0, seqServerPort=-1) 
at postgres.c:1647
#20 0x00000000008fc0e0 in PostgresMain (argc=4, argv=0x2d9af18, 
username=0x2d966b8 "gpadmin") at postgres.c:4691
#21 0x000000000089ed72 in BackendRun (port=0x2d1beb0) at postmaster.c:5845
#22 0x000000000089e1fc in BackendStartup (port=0x2d1beb0) at postmaster.c:5438
#23 0x000000000089891a in ServerLoop () at postmaster.c:2140
#24 0x00000000008979f0 in PostmasterMain (argc=9, argv=0x2d161b0) at 
postmaster.c:1432
#25 0x00000000007b14b2 in main (argc=9, argv=0x2d161b0) at main.c:226
{noformat}

2. Steps for reproduction
(1) Schema and data
{code}
create table sale
(
        cn int not null,
        vn int not null,
        pn int not null,
        dt date not null,
        qty int not null,
        prc float not null

        --primary key (cn, vn, pn)

) WITH (appendonly=true, orientation = parquet) distributed by (cn,vn,pn);

insert into sale values
  ( 2, 40, 100, '1401-1-1', 1100, 2400),
  ( 1, 10, 200, '1401-3-1', 1, 0),
  ( 3, 40, 200, '1401-4-1', 1, 0),
  ( 1, 20, 100, '1401-5-1', 1, 0),
  ( 1, 30, 300, '1401-5-2', 1, 0),
  ( 1, 50, 400, '1401-6-1', 1, 0),
  ( 2, 50, 400, '1401-6-1', 1, 0),
  ( 1, 30, 500, '1401-6-1', 12, 5),
  ( 3, 30, 500, '1401-6-1', 12, 5),
  ( 3, 30, 600, '1401-6-1', 12, 5),
  ( 4, 40, 700, '1401-6-1', 1, 1),
  ( 4, 40, 800, '1401-6-1', 1, 1);
{code}

(2) SQL statement
{code}
-- Query involving COVAR_POP() function
SELECT sale.cn,sale.prc,sale.vn,sale.pn,GROUP_ID(), 
TO_CHAR(COALESCE(COVAR_POP(floor(sale.vn),floor(sale.cn)),0),'99999999.9999999'),TO_CHAR(COALESCE(COUNT(DISTINCT
 floor(sale.pn-sale.vn)),0),'99999999.9999999'),TO_CHAR(COALESCE(COUNT(DISTINCT 
floor(sale.qty)),0),'99999999.9999999'),TO_CHAR(COALESCE(VAR_SAMP(floor(sale.prc)),0),'99999999.9999999'),TO_CHAR(COALESCE(STDDEV(DISTINCT
 
floor(sale.qty-sale.pn)),0),'99999999.9999999'),TO_CHAR(COALESCE(MIN(floor(sale.pn)),0),'99999999.9999999'),TO_CHAR(COALESCE(STDDEV_SAMP(floor(sale.cn)),0),'99999999.9999999'),TO_CHAR(COALESCE(VAR_SAMP(floor(sale.prc+sale.prc)),0),'99999999.9999999'),TO_CHAR(COALESCE(STDDEV_SAMP(floor(sale.qty)),0),'99999999.9999999'),TO_CHAR(COALESCE(SUM(floor(sale.cn-sale.pn)),0),'99999999.9999999')
FROM sale
GROUP BY 
(sale.cn,sale.pn),(sale.dt,sale.cn),(sale.dt,sale.cn),(sale.vn),(sale.pn,sale.qty,sale.pn),CUBE((sale.vn,sale.cn),(sale.cn,sale.qty),(sale.vn)),(),sale.prc;
{code}

(3) psql output:
{noformat}
psql:query197.sql:5: server closed the connection unexpectedly
        This probably means the server terminated abnormally
        before or while processing the request.
psql:query197.sql:5: connection to server was lost
{noformat}

3. Potential fix
{noformat}
rhuo-mbp:apache-hawq rhuo$ git diff src/backend/cdb/cdbpathlocus.c
diff --git a/src/backend/cdb/cdbpathlocus.c b/src/backend/cdb/cdbpathlocus.c
index 2471fd4..9fdd485 100644
--- a/src/backend/cdb/cdbpathlocus.c
+++ b/src/backend/cdb/cdbpathlocus.c
@@ -303,8 +303,9 @@ cdbpathlocus_from_baserel(struct PlannerInfo   *root,
                                        }
                                }
                        }
-        if (isRelationRuntimeHash && (policy->nattrs > 0) && allocatedResource 
&&
-            (list_length(root->glob->resource->segments) == 
(policy->bucketnum)))
+        if (isRelationRuntimeHash && (policy->nattrs > 0) && 
((allocatedResource &&
+            (list_length(root->glob->resource->segments) == 
(policy->bucketnum))) ||
+            (!allocatedResource)))
         {
                List *partkey = cdb_build_distribution_pathkeys(root,
                                                                rel,
{noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to