Repository: incubator-impala Updated Branches: refs/heads/master 6bbb7fb3d -> 12496c7fb
IMPALA-1657: Rework detection and reporting of corrupt table stats. 1. Minor fixes for cardinality estimation of unpartitioned tables. 2. Reworks handling of corrupt table stats as follows: The stats of a table or partition are reported as corrupt if the numRows < -1, or if numRows == 0 but the table size is positive. 3. Removes the Preconditions check reported in IMPALA-1657 in favor or issuing a corrupt table stats warning. 4. Fixes a few tests to set numRows together with STATS_GENERATED_VIA_STATS_TASK so that the numRows is definitely set in the HMS. Change-Id: I1d3305791d96e1c23a901af7b7c109af9352bb44 Reviewed-on: http://gerrit.cloudera.org:8080/4166 Reviewed-by: Alex Behm <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/12496c7f Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/12496c7f Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/12496c7f Branch: refs/heads/master Commit: 12496c7fbf62e364f7c23d819185237a5b1b9f59 Parents: 6bbb7fb Author: Alex Behm <[email protected]> Authored: Mon Aug 29 14:42:17 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Wed Aug 31 00:58:03 2016 +0000 ---------------------------------------------------------------------- .../cloudera/impala/planner/HdfsScanNode.java | 30 ++++++--- .../com/cloudera/impala/planner/Planner.java | 5 +- .../queries/QueryTest/corrupt-stats.test | 64 +++++++++++++++++--- tests/metadata/test_compute_stats.py | 4 +- 4 files changed, 81 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12496c7f/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java index 31ec600..9b83902 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java @@ -356,26 +356,35 @@ public class HdfsScanNode extends ScanNode { } /** - * Also computes totalBytes_ + * Also computes totalBytes_, totalFiles_, numPartitionsMissingStats_, + * and sets hasCorruptTableStats_. */ @Override public void computeStats(Analyzer analyzer) { super.computeStats(analyzer); LOG.debug("collecting partitions for table " + tbl_.getName()); numPartitionsMissingStats_ = 0; - if (tbl_.getPartitions().isEmpty()) { + totalFiles_ = 0; + totalBytes_ = 0; + if (tbl_.getNumClusteringCols() == 0) { cardinality_ = tbl_.getNumRows(); - if ((cardinality_ < -1 || cardinality_ == 0) && tbl_.getTotalHdfsBytes() > 0) { + if (cardinality_ < -1 || (cardinality_ == 0 && tbl_.getTotalHdfsBytes() > 0)) { hasCorruptTableStats_ = true; } + if (partitions_.isEmpty()) { + // Nothing to scan. Definitely a cardinality of 0 even if we have no stats. + cardinality_ = 0; + } else { + Preconditions.checkState(partitions_.size() == 1); + totalFiles_ += partitions_.get(0).getFileDescriptors().size(); + totalBytes_ += partitions_.get(0).getSize(); + } } else { cardinality_ = 0; - totalFiles_ = 0; - totalBytes_ = 0; boolean hasValidPartitionCardinality = false; for (HdfsPartition p: partitions_) { // Check for corrupt table stats - if ((p.getNumRows() == 0 || p.getNumRows() < -1) && p.getSize() > 0) { + if (p.getNumRows() < -1 || (p.getNumRows() == 0 && p.getSize() > 0)) { hasCorruptTableStats_ = true; } // ignore partitions with missing stats in the hope they don't matter @@ -402,8 +411,13 @@ public class HdfsScanNode extends ScanNode { } } inputCardinality_ = cardinality_; - Preconditions.checkState(cardinality_ >= 0 || cardinality_ == -1, - "Internal error: invalid scan node cardinality: " + cardinality_); + + // Sanity check scan node cardinality. + if (cardinality_ < -1) { + hasCorruptTableStats_ = true; + cardinality_ = -1; + } + if (cardinality_ > 0) { LOG.debug("cardinality_=" + Long.toString(cardinality_) + " sel=" + Double.toString(computeSelectivity())); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12496c7f/fe/src/main/java/com/cloudera/impala/planner/Planner.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/Planner.java b/fe/src/main/java/com/cloudera/impala/planner/Planner.java index 64a6c05..de1c088 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/Planner.java +++ b/fe/src/main/java/com/cloudera/impala/planner/Planner.java @@ -209,8 +209,9 @@ public class Planner { for (TTableName tableName: request.query_ctx.getTables_with_corrupt_stats()) { tableNames.add(tableName.db_name + "." + tableName.table_name); } - str.append("WARNING: The following tables have potentially corrupt table\n" + - "statistics. Drop and re-compute statistics to resolve this problem.\n" + + str.append( + "WARNING: The following tables have potentially corrupt table statistics.\n" + + "Drop and re-compute statistics to resolve this problem.\n" + Joiner.on(", ").join(tableNames) + "\n"); hasHeader = true; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12496c7f/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test b/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test index 7c5b561..da88f3b 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test +++ b/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test @@ -34,7 +34,7 @@ ORG, #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL S STRING, BIGINT, BIGINT, STRING, STRING, STRING, STRING, STRING, STRING ==== ---- QUERY -alter table corrupted partition(org=1) set tblproperties('numRows'='0'); +alter table corrupted partition(org=1) set tblproperties('numRows'='0', 'STATS_GENERATED_VIA_STATS_TASK'='true'); ==== ---- QUERY invalidate metadata corrupted; @@ -53,8 +53,8 @@ STRING, BIGINT, BIGINT, STRING, STRING, STRING, STRING, STRING, STRING ---- QUERY explain select count(*) from corrupted where org = 1; ---- RESULTS: VERIFY_IS_SUBSET -'WARNING: The following tables have potentially corrupt table' -'statistics. Drop and re-compute statistics to resolve this problem.' +'WARNING: The following tables have potentially corrupt table statistics.' +'Drop and re-compute statistics to resolve this problem.' '$DATABASE.corrupted' '' '03:AGGREGATE [FINALIZE]' @@ -71,8 +71,8 @@ explain select count(*) from corrupted where org = 1; STRING ==== ---- QUERY -alter table corrupted partition(org=1) set tblproperties('numRows'='3'); -alter table corrupted set tblproperties('numRows'='0'); +alter table corrupted partition(org=1) set tblproperties('numRows'='3', 'STATS_GENERATED_VIA_STATS_TASK'='true'); +alter table corrupted set tblproperties('numRows'='0', 'STATS_GENERATED_VIA_STATS_TASK'='true'); ==== ---- QUERY show table stats corrupted; @@ -97,7 +97,7 @@ explain select count(*) from corrupted; STRING ==== ---- QUERY -alter table corrupted set tblproperties('numRows'='6'); +alter table corrupted set tblproperties('numRows'='6', 'STATS_GENERATED_VIA_STATS_TASK'='true'); ==== ---- QUERY show table stats corrupted; @@ -122,6 +122,28 @@ explain select count(*) from corrupted; STRING ==== ---- QUERY +# IMPALA-3930: Set numRows of a partition to a negative value and check warning, +alter table corrupted partition(org=2) set tblproperties('numRows'='-1234', 'STATS_GENERATED_VIA_STATS_TASK'='true'); +explain select count(*) from corrupted where org = 2; +---- RESULTS: VERIFY_IS_SUBSET +'WARNING: The following tables have potentially corrupt table statistics.' +'Drop and re-compute statistics to resolve this problem.' +'$DATABASE.corrupted' +'' +'03:AGGREGATE [FINALIZE]' +'| output: count:merge(*)' +'|' +'02:EXCHANGE [UNPARTITIONED]' +'|' +'01:AGGREGATE' +'| output: count(*)' +'|' +'00:SCAN HDFS [$DATABASE.corrupted]' +' partitions=1/2 files=1 size=24B' +---- TYPES +STRING +==== +---- QUERY create table corrupted_no_part (id int); insert into corrupted_no_part values (1),(2),(3); compute stats corrupted_no_part; @@ -148,22 +170,44 @@ explain select count(*) from corrupted_no_part; STRING ==== ---- QUERY -alter table corrupted_no_part set tblproperties('numRows'='0'); +alter table corrupted_no_part set tblproperties('numRows'='0', 'STATS_GENERATED_VIA_STATS_TASK'='true'); ==== ---- QUERY show table stats corrupted_no_part; ---- LABELS #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION ---- RESULTS --1,1,'6B','NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/$DATABASE.db/corrupted_no_part' +0,1,'6B','NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/$DATABASE.db/corrupted_no_part' ---- TYPES BIGINT, BIGINT, STRING, STRING, STRING, STRING, STRING, STRING ==== ---- QUERY --- After setting num rows to 0, the HMS will set it to -1 and avoids bad behavior. explain select count(*) from corrupted_no_part; ---- RESULTS: VERIFY_IS_SUBSET -'WARNING: The following tables are missing relevant table and/or column statistics.' +'WARNING: The following tables have potentially corrupt table statistics.' +'Drop and re-compute statistics to resolve this problem.' +'$DATABASE.corrupted_no_part' +'' +'03:AGGREGATE [FINALIZE]' +'| output: count:merge(*)' +'|' +'02:EXCHANGE [UNPARTITIONED]' +'|' +'01:AGGREGATE' +'| output: count(*)' +'|' +'00:SCAN HDFS [$DATABASE.corrupted_no_part]' +' partitions=1/1 files=1 size=6B' +---- TYPES +STRING +==== +---- QUERY +# IMPALA-3930: Set numRows of the table to a negative value and check warning, +alter table corrupted_no_part set tblproperties('numRows'='-1234', 'STATS_GENERATED_VIA_STATS_TASK'='true'); +explain select count(*) from corrupted_no_part; +---- RESULTS: VERIFY_IS_SUBSET +'WARNING: The following tables have potentially corrupt table statistics.' +'Drop and re-compute statistics to resolve this problem.' '$DATABASE.corrupted_no_part' '' '03:AGGREGATE [FINALIZE]' http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12496c7f/tests/metadata/test_compute_stats.py ---------------------------------------------------------------------- diff --git a/tests/metadata/test_compute_stats.py b/tests/metadata/test_compute_stats.py index 348d7f7..6f6d31a 100644 --- a/tests/metadata/test_compute_stats.py +++ b/tests/metadata/test_compute_stats.py @@ -151,8 +151,8 @@ class TestCorruptTableStats(ImpalaTestSuite): cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload())) def test_corrupt_stats(self, vector, unique_database): - """IMPALA-1983: Test that in the presence of corrupt table statistics a warning is - issued and the small query optimization is disabled.""" + """IMPALA-1983/IMPALA-1657: Test that in the presence of corrupt table statistics a + warning is issued and the small query optimization is disabled.""" if self.exploration_strategy() != 'exhaustive': pytest.skip("Only run in exhaustive") self.run_test_case('QueryTest/corrupt-stats', vector, unique_database)
