This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit c2516d220da8e532b6ebdb6f3a12e7ad97c4f597 Author: Csaba Ringhofer <csringho...@cloudera.com> AuthorDate: Wed Apr 17 15:01:07 2019 +0200 IMPALA-8409: Fix row-size for STRING columns with unknown stats Explain returned row-size=11B for STRING columns without statistics. The issue was caused by adding -1 (meaning unknown) to the 12 byte slot size (sizeof(StringValue)). The code in TupleDescriptor.java tried to handle this by checking if the size is -1, but it was already 11 at this point. There is more potential for cleanup, but I wanted to keep this change minimal. Testing: - revived some tests in CatalogTest.java that were removed in 2013 due to flakiness - added an EE test that checks row size with and without stats - fixed a similar test, test_explain_validate_cardinality_estimates (the format of the line it looks for has changed, which lead to skipping the actual verification and accepting everything) - ran core FE and EE tests Change-Id: I866acf10b2c011a735dee019f4bc29358f2ec4e5 Reviewed-on: http://gerrit.cloudera.org:8080/13190 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- .../apache/impala/analysis/TupleDescriptor.java | 2 +- .../org/apache/impala/catalog/ColumnStats.java | 1 - .../org/apache/impala/planner/HdfsScanNode.java | 3 ++ .../org/apache/impala/catalog/CatalogTest.java | 61 +++++++-------------- tests/metadata/test_explain.py | 63 +++++++++++++++++++--- 5 files changed, 80 insertions(+), 50 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java b/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java index 87b8e5f..557874d 100644 --- a/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java +++ b/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java @@ -268,7 +268,7 @@ public class TupleDescriptor { ColumnStats stats = d.getStats(); int slotSize = d.getType().getSlotSize(); - if (stats.hasAvgSerializedSize()) { + if (stats.hasAvgSize()) { avgSerializedSize_ += d.getStats().getAvgSerializedSize(); } else { // TODO: for computed slots, try to come up with stats estimates diff --git a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java index a3e40cb..65533da 100644 --- a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java +++ b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java @@ -166,7 +166,6 @@ public class ColumnStats { public boolean hasNulls() { return numNulls_ > 0; } public long getNumNulls() { return numNulls_; } public boolean hasAvgSize() { return avgSize_ >= 0; } - public boolean hasAvgSerializedSize() { return avgSerializedSize_ >= 0; } public boolean hasNumDistinctValues() { return numDistinctValues_ >= 0; } public boolean hasStats() { return numNulls_ != -1 || numDistinctValues_ != -1; } diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java index ce5c850..2a2f4e6 100644 --- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java @@ -1694,6 +1694,9 @@ public class HdfsScanNode extends ScanNode { if (stats.hasAvgSize() && maxScanRangeNumRows_ != -1) { // Estimate the column's uncompressed data size based on row count and average // size. + // TODO: Size of strings seems to be underestimated, as avg size returns the + // average length of the strings and does not include the 4 byte length + // field used in Parquet plain encoding. (IMPALA-8431) reservationBytes = (long) Math.min(reservationBytes, stats.getAvgSize() * maxScanRangeNumRows_); if (stats.hasNumDistinctValues()) { diff --git a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java index 80a33fc..36c9919 100644 --- a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java +++ b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java @@ -23,6 +23,7 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.fail; import java.io.IOException; @@ -471,89 +472,71 @@ public class CatalogTest { assertEquals(1, uniqueSds.size()); } - // TODO: All Hive-stats related tests are temporarily disabled because of an unknown, - // sporadic issue causing stats of some columns to be absent in Jenkins runs. - // Investigate this issue further. - //@Test - public void testStats() throws TableLoadingException { + @Test + public void testStats() throws CatalogException { // make sure the stats for functional.alltypesagg look correct - HdfsTable table = - (HdfsTable) catalog_.getDb("functional").getTable("AllTypesAgg"); + HdfsTable table = (HdfsTable) catalog_.getOrLoadTable("functional", "AllTypesAgg"); Column idCol = table.getColumn("id"); - assertEquals(idCol.getStats().getAvgSerializedSize() - - PrimitiveType.INT.getSlotSize(), + assertEquals(idCol.getStats().getAvgSerializedSize(), PrimitiveType.INT.getSlotSize(), 0.0001); assertEquals(idCol.getStats().getMaxSize(), PrimitiveType.INT.getSlotSize()); - assertTrue(!idCol.getStats().hasNulls()); + assertFalse(idCol.getStats().hasNulls()); Column boolCol = table.getColumn("bool_col"); - assertEquals(boolCol.getStats().getAvgSerializedSize() - - PrimitiveType.BOOLEAN.getSlotSize(), + assertEquals(boolCol.getStats().getAvgSerializedSize(), PrimitiveType.BOOLEAN.getSlotSize(), 0.0001); assertEquals(boolCol.getStats().getMaxSize(), PrimitiveType.BOOLEAN.getSlotSize()); - assertTrue(!boolCol.getStats().hasNulls()); + assertFalse(boolCol.getStats().hasNulls()); Column tinyintCol = table.getColumn("tinyint_col"); - assertEquals(tinyintCol.getStats().getAvgSerializedSize() - - PrimitiveType.TINYINT.getSlotSize(), + assertEquals(tinyintCol.getStats().getAvgSerializedSize(), PrimitiveType.TINYINT.getSlotSize(), 0.0001); - assertEquals(tinyintCol.getStats().getMaxSize(), - PrimitiveType.TINYINT.getSlotSize()); + assertEquals(tinyintCol.getStats().getMaxSize(), PrimitiveType.TINYINT.getSlotSize()); assertTrue(tinyintCol.getStats().hasNulls()); Column smallintCol = table.getColumn("smallint_col"); - assertEquals(smallintCol.getStats().getAvgSerializedSize() - - PrimitiveType.SMALLINT.getSlotSize(), + assertEquals(smallintCol.getStats().getAvgSerializedSize(), PrimitiveType.SMALLINT.getSlotSize(), 0.0001); assertEquals(smallintCol.getStats().getMaxSize(), PrimitiveType.SMALLINT.getSlotSize()); assertTrue(smallintCol.getStats().hasNulls()); Column intCol = table.getColumn("int_col"); - assertEquals(intCol.getStats().getAvgSerializedSize() - - PrimitiveType.INT.getSlotSize(), + assertEquals(intCol.getStats().getAvgSerializedSize(), PrimitiveType.INT.getSlotSize(), 0.0001); assertEquals(intCol.getStats().getMaxSize(), PrimitiveType.INT.getSlotSize()); assertTrue(intCol.getStats().hasNulls()); Column bigintCol = table.getColumn("bigint_col"); - assertEquals(bigintCol.getStats().getAvgSerializedSize() - - PrimitiveType.BIGINT.getSlotSize(), + assertEquals(bigintCol.getStats().getAvgSerializedSize(), PrimitiveType.BIGINT.getSlotSize(), 0.0001); assertEquals(bigintCol.getStats().getMaxSize(), PrimitiveType.BIGINT.getSlotSize()); assertTrue(bigintCol.getStats().hasNulls()); Column floatCol = table.getColumn("float_col"); - assertEquals(floatCol.getStats().getAvgSerializedSize() - - PrimitiveType.FLOAT.getSlotSize(), + assertEquals(floatCol.getStats().getAvgSerializedSize(), PrimitiveType.FLOAT.getSlotSize(), 0.0001); assertEquals(floatCol.getStats().getMaxSize(), PrimitiveType.FLOAT.getSlotSize()); assertTrue(floatCol.getStats().hasNulls()); Column doubleCol = table.getColumn("double_col"); - assertEquals(doubleCol.getStats().getAvgSerializedSize() - - PrimitiveType.DOUBLE.getSlotSize(), + assertEquals(doubleCol.getStats().getAvgSerializedSize(), PrimitiveType.DOUBLE.getSlotSize(), 0.0001); assertEquals(doubleCol.getStats().getMaxSize(), PrimitiveType.DOUBLE.getSlotSize()); assertTrue(doubleCol.getStats().hasNulls()); Column timestampCol = table.getColumn("timestamp_col"); - assertEquals(timestampCol.getStats().getAvgSerializedSize() - - PrimitiveType.TIMESTAMP.getSlotSize(), + assertEquals(timestampCol.getStats().getAvgSerializedSize(), PrimitiveType.TIMESTAMP.getSlotSize(), 0.0001); assertEquals(timestampCol.getStats().getMaxSize(), PrimitiveType.TIMESTAMP.getSlotSize()); - // this does not have nulls, it's not clear why this passes - // TODO: investigate and re-enable - //assertTrue(timestampCol.getStats().hasNulls()); + assertFalse(timestampCol.getStats().hasNulls()); Column stringCol = table.getColumn("string_col"); - assertTrue(stringCol.getStats().getAvgSerializedSize() >= - PrimitiveType.STRING.getSlotSize()); assertTrue(stringCol.getStats().getAvgSerializedSize() > 0); assertTrue(stringCol.getStats().getMaxSize() > 0); - assertTrue(!stringCol.getStats().hasNulls()); + assertFalse(stringCol.getStats().hasNulls()); } /** @@ -561,10 +544,7 @@ public class CatalogTest { * the column type results in the stats being set to "unknown". This is a regression * test for IMPALA-588, where this used to result in a Preconditions failure. */ - // TODO: All Hive-stats related tests are temporarily disabled because of an unknown, - // sporadic issue causing stats of some columns to be absent in Jenkins runs. - // Investigate this issue further. - //@Test + @Test public void testColStatsColTypeMismatch() throws Exception { // First load a table that has column stats. //catalog_.refreshTable("functional", "alltypesagg", false); @@ -597,7 +577,7 @@ public class CatalogTest { // Now try to apply a matching column stats data and ensure it succeeds. assertTrue(table.getColumn("string_col").updateStats(stringColStatsData)); - assertEquals(1178, table.getColumn("string_col").getStats().getNumDistinctValues()); + assertEquals(963, table.getColumn("string_col").getStats().getNumDistinctValues()); } } @@ -606,7 +586,6 @@ public class CatalogTest { assertEquals(-1, column.getStats().getNumNulls()); double expectedSize = column.getType().isFixedLengthType() ? column.getType().getSlotSize() : -1; - assertEquals(expectedSize, column.getStats().getAvgSerializedSize(), 0.0001); assertEquals(expectedSize, column.getStats().getMaxSize(), 0.0001); } diff --git a/tests/metadata/test_explain.py b/tests/metadata/test_explain.py index 48a6d69..9f2d61b 100644 --- a/tests/metadata/test_explain.py +++ b/tests/metadata/test_explain.py @@ -70,6 +70,22 @@ class TestExplain(ImpalaTestSuite): vector.get_value('exec_option')['explain_level'] = 3 self.run_test_case('QueryTest/explain-level3', vector) + @staticmethod + def check_row_size_and_cardinality(query_result, expected_row_size=None, + expected_cardinality=None): + regex = re.compile('tuple-ids=.+ row-size=(\d+)B cardinality=(.*)') + found_match = False + for res in query_result: + m = regex.match(res.strip()) + if m: + found_match = True + assert len(m.groups()) == 2 + if expected_row_size: + assert m.groups()[0] == expected_row_size + if expected_cardinality: + assert m.groups()[1] == expected_cardinality + assert found_match, query_result + def test_explain_validate_cardinality_estimates(self, vector, unique_database): # Tests that the cardinality estimates are correct for partitioned tables. # TODO Cardinality estimation tests should eventually be part of the planner tests. @@ -78,13 +94,8 @@ class TestExplain(ImpalaTestSuite): tbl_name = 'alltypes' def check_cardinality(query_result, expected_cardinality): - regex = re.compile(' row-size=\d+B cardinality=(.*)$') - for res in query_result: - m = regex.match(res.strip()) - if m: - assert len(m.groups()) == 1 - # The cardinality should be zero. - assert m.groups()[0] == expected_cardinality + self.check_row_size_and_cardinality( + query_result, expected_cardinality=expected_cardinality) # All partitions are filtered out, cardinality should be 0. result = self.execute_query("explain select * from %s.%s where year = 1900" % ( @@ -130,6 +141,44 @@ class TestExplain(ImpalaTestSuite): query_options={'explain_level':3}) check_cardinality(result.data, '100') + def test_explain_row_size_estimates(self, vector, unique_database): + """ Tests that EXPLAIN returns the expected row sizes with and without stats. + + Planner tests is probably a more logical place for this, but covering string avg_size + handling end-to-end seemed easier here. + + Note that row sizes do not include the null indicator bytes, so actual tuple sizes + are a bit larger. """ + def check_row_size(query_result, expected_row_size): + self.check_row_size_and_cardinality( + query_result, expected_row_size=expected_row_size) + + def execute_explain(query): + return self.execute_query("explain " + query, query_options={'explain_level': 3}) + + FQ_TBL_NAME = unique_database + ".t" + self.execute_query("create table %s (i int, s string)" % FQ_TBL_NAME) + # Fill the table with data that leads to avg_size of 4 for 's'. + self.execute_query("insert into %s values (1, '123'), (2, '12345')" % FQ_TBL_NAME) + + # Always use slot size for fixed sized types. + result = execute_explain("select i from %s" % FQ_TBL_NAME) + check_row_size(result.data, '4') + + # If there are no stats, use slot size for variable length types. + result = execute_explain("select s from %s" % FQ_TBL_NAME) + check_row_size(result.data, "12") + + self.execute_query("compute stats %s" % FQ_TBL_NAME) + + # Always use slot size for fixed sized types. + result = execute_explain("select i from %s" % FQ_TBL_NAME) + check_row_size(result.data, '4') + + # If there are no stats, use slot size + avg_size for variable length types. + result = execute_explain("select s from %s" % FQ_TBL_NAME) + check_row_size(result.data, "16") + class TestExplainEmptyPartition(ImpalaTestSuite): TEST_DB_NAME = "imp_1708"