IMPALA-7528: Fix division by zero when computing cardinalities This patch fixes a case where there can be a division by zero when computing cardinalities of many to many joins on NULL columns.
Testing: Added a planner test case Change-Id: Ieedd51d3ad6131a4ea2a5883f05104e6a0f2cd14 Reviewed-on: http://gerrit.cloudera.org:8080/11901 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/d07bc818 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/d07bc818 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/d07bc818 Branch: refs/heads/branch-3.1.0 Commit: d07bc818d6a8af6b2ebc2d1d2672849abc40f51c Parents: 8dfbe3c Author: Bikramjeet Vig <[email protected]> Authored: Tue Nov 6 23:44:19 2018 -0800 Committer: Zoltan Borok-Nagy <[email protected]> Committed: Tue Nov 13 12:52:35 2018 +0100 ---------------------------------------------------------------------- fe/src/main/java/org/apache/impala/planner/JoinNode.java | 7 ++++++- fe/src/test/java/org/apache/impala/planner/PlannerTest.java | 9 +++++++++ testdata/bin/compute-table-stats.sh | 2 +- .../functional-planner/queries/PlannerTest/joins.test | 4 ++-- 4 files changed, 18 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/d07bc818/fe/src/main/java/org/apache/impala/planner/JoinNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/JoinNode.java b/fe/src/main/java/org/apache/impala/planner/JoinNode.java index cc50318..96f93cd 100644 --- a/fe/src/main/java/org/apache/impala/planner/JoinNode.java +++ b/fe/src/main/java/org/apache/impala/planner/JoinNode.java @@ -365,7 +365,12 @@ public abstract class JoinNode extends PlanNode { if (slots.lhsNumRows() > lhsCard) lhsAdjNdv *= lhsCard / slots.lhsNumRows(); double rhsAdjNdv = slots.rhsNdv(); if (slots.rhsNumRows() > rhsCard) rhsAdjNdv *= rhsCard / slots.rhsNumRows(); - long joinCard = Math.round((lhsCard / Math.max(lhsAdjNdv, rhsAdjNdv)) * rhsCard); + // A lower limit of 1 on the max Adjusted Ndv ensures we don't estimate + // cardinality more than the max possible. This also handles the case of + // null columns on both sides having an Ndv of zero (which would change + // after IMPALA-7310 is fixed). + long joinCard = Math.round((lhsCard / Math.max(1, Math.max(lhsAdjNdv, rhsAdjNdv))) * + rhsCard); if (result == -1) { result = joinCard; } else { http://git-wip-us.apache.org/repos/asf/impala/blob/d07bc818/fe/src/test/java/org/apache/impala/planner/PlannerTest.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java index 2ec4b15..82235ed 100644 --- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java +++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java @@ -703,4 +703,13 @@ public class PlannerTest extends PlannerTestBase { assertEquals(HBaseScanNode.memoryEstimateForFetchingColumns(largeColumnList), 8 * 1024 * 1024); } + + @Test + public void testNullColumnJoinCardinality() throws ImpalaException { + // IMPALA-7565: Make sure there is no division by zero during cardinality calculation + // in a many to many join on null columns (ndv = 0). + String query = "select * from functional.nulltable t1 " + + "inner join [shuffle] functional.nulltable t2 on t1.d = t2.d"; + checkCardinality(query, 1, 1); + } } http://git-wip-us.apache.org/repos/asf/impala/blob/d07bc818/testdata/bin/compute-table-stats.sh ---------------------------------------------------------------------- diff --git a/testdata/bin/compute-table-stats.sh b/testdata/bin/compute-table-stats.sh index 08c7595..d6e6d22 100755 --- a/testdata/bin/compute-table-stats.sh +++ b/testdata/bin/compute-table-stats.sh @@ -33,7 +33,7 @@ COMPUTE_STATS_SCRIPT="${IMPALA_HOME}/tests/util/compute_table_stats.py --impalad # Run compute stats over as many of the tables used in the Planner tests as possible. ${COMPUTE_STATS_SCRIPT} --db_names=functional\ --table_names="alltypes,alltypesagg,alltypesaggmultifilesnopart,alltypesaggnonulls, - alltypessmall,alltypestiny,jointbl,dimtbl,stringpartitionkey" + alltypessmall,alltypestiny,jointbl,dimtbl,stringpartitionkey,nulltable" # We cannot load HBase on s3 and isilon yet. if [ "${TARGET_FILESYSTEM}" = "hdfs" ]; then http://git-wip-us.apache.org/repos/asf/impala/blob/d07bc818/testdata/workloads/functional-planner/queries/PlannerTest/joins.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test index 4c5d7ab..b681acb 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test @@ -2387,10 +2387,10 @@ PLAN-ROOT SINK 03:NESTED LOOP JOIN [INNER JOIN] | predicates: t1.d IS DISTINCT FROM t2.d | -|--01:SCAN HDFS [functional.nulltable t2] +|--00:SCAN HDFS [functional.nulltable t1] | partitions=1/1 files=1 size=18B | -00:SCAN HDFS [functional.nulltable t1] +01:SCAN HDFS [functional.nulltable t2] partitions=1/1 files=1 size=18B ==== # IMPALA-3450: limits on join nodes are reflected in cardinality estimates. The test for
