Repository: incubator-impala
Updated Branches: refs/heads/master bb6b0ce24 -> d03e7d6ce
IMPALA-5648: fix count(*) mem estimate regression

The metadata-only scan doesn't allocate I/O buffers, contrary to an
assumption of the memory estimation code in the planner.

This fix also sets a floor on the memory estimate, to avoid estimating
0 bytes. 1MB seems like a reasonable approximation: I ran metadata-only
scans on a few different data sizes and saw numbers from 128KB to 1MB.

The estimate is now much closer to actual consumption (it was 80MB
before):

[localhost:21000] > select count(*) from tpch_parquet.lineitem; summary;
Query: select count(*) from tpch_parquet.lineitem
Query submitted at: 2017-08-23 11:58:29 (Coordinator: http://tarmstrong-box:25000)
Query progress can be monitored at: http://tarmstrong-box:25000/query_plan?query_id=cb4b8d41fc838c9a:c5496ff300000000
+----------+
| count(*) |
+----------+
| 6001215  |
+----------+
Fetched 1 row(s) in 0.13s
+--------------+--------+----------+----------+-------+------------+-----------+---------------+-----------------------+
| Operator     | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem  | Est. Peak Mem | Detail                |
+--------------+--------+----------+----------+-------+------------+-----------+---------------+-----------------------+
| 03:AGGREGATE | 1      | 168.49us | 168.49us | 1     | 1          | 28.00 KB  | 10.00 MB      | FINALIZE              |
| 02:EXCHANGE  | 1      | 30.11ms  | 30.11ms  | 3     | 1          | 0 B       | 0 B           | UNPARTITIONED         |
| 01:AGGREGATE | 3      | 2.05us   | 6.14us   | 3     | 1          | 20.00 KB  | 10.00 MB      |                       |
| 00:SCAN HDFS | 3      | 4.58ms   | 4.72ms   | 3     | 6.00M      | 128.00 KB | 1.00 MB       | tpch_parquet.lineitem |
+--------------+--------+----------+----------+-------+------------+-----------+---------------+-----------------------+

Testing:
Updated affected planner tests.
Change-Id: Iaf5c2316bef2afae54a94245c715534ed294f286 Reviewed-on: http://gerrit.cloudera.org:8080/7783 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b2ebf3de Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b2ebf3de Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b2ebf3de Branch: refs/heads/master Commit: b2ebf3de369c628b58259cce1ffb031b6b7d0b20 Parents: bb6b0ce Author: Tim Armstrong <[email protected]> Authored: Wed Aug 23 11:35:14 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Thu Aug 24 06:55:08 2017 +0000 ---------------------------------------------------------------------- .../java/org/apache/impala/planner/HdfsScanNode.java | 11 ++++++++++- .../queries/PlannerTest/disable-codegen.test | 2 +- .../queries/PlannerTest/resource-requirements.test | 12 ++++++------ 3 files changed, 17 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2ebf3de/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java index bf183be..ad8501a 100644 --- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java @@ -127,6 +127,12 @@ public class HdfsScanNode extends ScanNode { // scan ranges than would have been estimated assuming a uniform distribution. private final static double SCAN_RANGE_SKEW_FACTOR = 1.2; + // The minimum amount of memory we estimate a scan will use. 
The number is + // derived experimentally: running metadata-only Parquet count(*) scans on TPC-H + // lineitem and TPC-DS store_sales of different sizes resulted in memory consumption + // between 128kb and 1.1mb. + private final static long MIN_MEMORY_ESTIMATE = 1 * 1024 * 1024; + private final HdfsTable tbl_; // List of partitions to be scanned. Partitions have been pruned. @@ -1031,9 +1037,11 @@ public class HdfsScanNode extends ScanNode { int perHostScanRanges; if (table.getMajorityFormat() == HdfsFileFormat.PARQUET) { // For the purpose of this estimation, the number of per-host scan ranges for - // Parquet files are equal to the number of non-partition columns scanned. + // Parquet files are equal to the number of columns read from the file. I.e. + // excluding partition columns and columns that are populated from file metadata. perHostScanRanges = 0; for (SlotDescriptor slot: desc_.getSlots()) { + if (!slot.isMaterialized() || slot == countStarSlot_) continue; if (slot.getColumn() == null || slot.getColumn().getPosition() >= table.getNumClusteringCols()) { ++perHostScanRanges; @@ -1075,6 +1083,7 @@ public class HdfsScanNode extends ScanNode { PrintUtils.printBytes(perHostUpperBound))); perInstanceMemEstimate = perHostUpperBound; } + perInstanceMemEstimate = Math.max(perInstanceMemEstimate, MIN_MEMORY_ESTIMATE); nodeResourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2ebf3de/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test b/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test index 97d662b..7b4afb8 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test +++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test @@ -41,7 +41,7 @@ PLAN-ROOT SINK select count(*) from functional_parquet.alltypes ---- DISTRIBUTEDPLAN Max Per-Host Resource Reservation: Memory=0B -Per-Host Resource Estimates: Memory=36.00MB +Per-Host Resource Estimates: Memory=21.00MB WARNING: The following tables are missing relevant table and/or column statistics. functional_parquet.alltypes http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2ebf3de/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test index bf2bc52..b9afdf9 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test @@ -459,10 +459,10 @@ Per-Host Resources: mem-estimate=212.48MB mem-reservation=0B select count(*) from tpch_parquet.lineitem ---- PLAN Max Per-Host Resource Reservation: Memory=0B -Per-Host Resource Estimates: Memory=90.00MB +Per-Host Resource Estimates: Memory=11.00MB F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 -| Per-Host Resources: mem-estimate=90.00MB mem-reservation=0B +| Per-Host Resources: mem-estimate=11.00MB mem-reservation=0B PLAN-ROOT SINK | mem-estimate=0B mem-reservation=0B | @@ -476,11 +476,11 @@ PLAN-ROOT SINK stats-rows=6001215 extrapolated-rows=disabled table stats: rows=6001215 size=193.92MB column stats: all - mem-estimate=80.00MB mem-reservation=0B + mem-estimate=1.00MB mem-reservation=0B tuple-ids=0 row-size=8B cardinality=6001215 ---- DISTRIBUTEDPLAN Max Per-Host Resource Reservation: Memory=0B -Per-Host Resource Estimates: Memory=100.00MB +Per-Host Resource Estimates: Memory=21.00MB F01:PLAN 
FRAGMENT [UNPARTITIONED] hosts=1 instances=1 | Per-Host Resources: mem-estimate=10.00MB mem-reservation=0B @@ -497,7 +497,7 @@ PLAN-ROOT SINK | tuple-ids=1 row-size=8B cardinality=1 | F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3 -Per-Host Resources: mem-estimate=90.00MB mem-reservation=0B +Per-Host Resources: mem-estimate=11.00MB mem-reservation=0B 01:AGGREGATE | output: sum_init_zero(tpch_parquet.lineitem.parquet-stats: num_rows) | mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB @@ -508,7 +508,7 @@ Per-Host Resources: mem-estimate=90.00MB mem-reservation=0B stats-rows=6001215 extrapolated-rows=disabled table stats: rows=6001215 size=193.92MB column stats: all - mem-estimate=80.00MB mem-reservation=0B + mem-estimate=1.00MB mem-reservation=0B tuple-ids=0 row-size=8B cardinality=6001215 ---- PARALLELPLANS Max Per-Host Resource Reservation: Memory=0B
