IMPALA-7140 (part 6): fetch column stats for LocalTable This adds fetching of column statistics for LocalTable. Currently, all column stats are fetched when the table is loaded, even for simple statements like 'DESCRIBE' where they aren't necessary. This is because I couldn't find a convenient spot during analysis at which time the set of necessary columns are known. I left a TODO for this potential improvement.
With this change I can see that 'SHOW COLUMN STATS' shows the expected results for functional.alltypes. A new simple unit test verifies this. Planner tests still don't pass due to some NullPointerExceptions related to loading functions from the builtins DB -- most of the tests seem to rely on simple built-ins like COUNT and CAST. Change-Id: Ib6403c2bedf4ee29c5e6f90e947382cb44f46e0c Reviewed-on: http://gerrit.cloudera.org:8080/10797 Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Reviewed-by: Todd Lipcon <t...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/9df9efc5 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/9df9efc5 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/9df9efc5 Branch: refs/heads/master Commit: 9df9efc5f33d2e31f218b69a6053b575a30696a6 Parents: 9b4e6d8 Author: Todd Lipcon <t...@cloudera.com> Authored: Wed Jun 20 15:42:39 2018 -0700 Committer: Todd Lipcon <t...@apache.org> Committed: Thu Jul 12 02:36:55 2018 +0000 ---------------------------------------------------------------------- .../apache/impala/catalog/FeCatalogUtils.java | 29 ++++++++++++++++++++ .../java/org/apache/impala/catalog/Table.java | 19 ++----------- .../catalog/local/DirectMetaProvider.java | 10 +++++++ .../impala/catalog/local/LocalFsTable.java | 24 ++++++++++++++++ .../apache/impala/catalog/local/LocalTable.java | 29 +++++++++++++++++--- .../impala/catalog/local/MetaProvider.java | 7 +++++ .../impala/catalog/local/LocalCatalogTest.java | 16 +++++++++++ 7 files changed, 114 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java b/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java index dc16629..12b152f 100644 --- a/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java +++ b/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java @@ -24,6 +24,7 @@ import java.util.Map; import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.common.StatsSetupConst; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.impala.analysis.Expr; import org.apache.impala.analysis.LiteralExpr; @@ -129,6 +130,34 @@ public abstract class FeCatalogUtils { } /** + * Given the list of column stats returned from the metastore, inject those + * stats into matching columns in 'table'. + */ + public static void injectColumnStats(List<ColumnStatisticsObj> colStats, + FeTable table) { + for (ColumnStatisticsObj stats: colStats) { + Column col = table.getColumn(stats.getColName()); + Preconditions.checkNotNull(col, "Unable to find column %s in table %s", + stats.getColName(), table.getFullName()); + if (!ColumnStats.isSupportedColType(col.getType())) { + LOG.warn(String.format( + "Statistics for %s, column %s are not supported as column " + + "has type %s", table.getFullName(), col.getName(), col.getType())); + continue; + } + + if (!col.updateStats(stats.getStatsData())) { + LOG.warn(String.format( + "Failed to load column stats for %s, column %s. Stats may be " + + "incompatible with column type %s. Consider regenerating statistics " + + "for %s.", table.getFullName(), col.getName(), col.getType(), + table.getFullName())); + continue; + } + } + } + + /** * Returns the value of the ROW_COUNT constant, or -1 if not found. */ public static long getRowCount(Map<String, String> parameters) { http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/Table.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/catalog/Table.java b/fe/src/main/java/org/apache/impala/catalog/Table.java index 7fdc3c9..06c4500 100644 --- a/fe/src/main/java/org/apache/impala/catalog/Table.java +++ b/fe/src/main/java/org/apache/impala/catalog/Table.java @@ -214,6 +214,8 @@ public abstract class Table extends CatalogObjectImpl implements FeTable { // We need to only query those columns which may have stats; asking HMS for other // columns causes loadAllColumnStats() to return nothing. + // TODO(todd): this no longer seems to be true - asking for a non-existent column + // is just ignored, and the columns that do exist are returned. List<String> colNames = getColumnNamesWithHmsStats(); try { @@ -223,22 +225,7 @@ public abstract class Table extends CatalogObjectImpl implements FeTable { return; } - for (ColumnStatisticsObj stats: colStats) { - Column col = getColumn(stats.getColName()); - Preconditions.checkNotNull(col); - if (!ColumnStats.isSupportedColType(col.getType())) { - LOG.warn(String.format("Statistics for %s, column %s are not supported as " + - "column has type %s", getFullName(), col.getName(), col.getType())); - continue; - } - - if (!col.updateStats(stats.getStatsData())) { - LOG.warn(String.format("Failed to load column stats for %s, column %s. Stats " + - "may be incompatible with column type %s. Consider regenerating statistics " + - "for %s.", getFullName(), col.getName(), col.getType(), getFullName())); - continue; - } - } + FeCatalogUtils.injectColumnStats(colStats, this); } /** http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java b/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java index aadbf23..59acd9d 100644 --- a/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java +++ b/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; @@ -160,6 +161,15 @@ class DirectMetaProvider implements MetaProvider { return ret; } + + @Override + public List<ColumnStatisticsObj> loadTableColumnStatistics(String dbName, + String tblName, List<String> colNames) throws TException { + try (MetaStoreClient c = msClientPool_.getClient()) { + return c.getHiveClient().getTableColumnStatistics(dbName, tblName, colNames); + } + } + @Override public List<LocatedFileStatus> loadFileMetadata(Path dir) throws IOException { Preconditions.checkNotNull(dir); http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java b/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java index 6871f90..42d3b03 100644 --- a/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java @@ -31,6 +31,7 @@ import org.apache.impala.analysis.LiteralExpr; import org.apache.impala.analysis.NullLiteral; import org.apache.impala.catalog.CatalogException; import org.apache.impala.catalog.Column; +import org.apache.impala.catalog.ColumnStats; import org.apache.impala.catalog.FeCatalogUtils; import org.apache.impala.catalog.FeFsPartition; import org.apache.impala.catalog.FeFsTable; @@ -375,6 +376,29 @@ public class LocalFsTable extends LocalTable implements FeFsTable { partitionSpecs_ = b.build(); } + /** + * Override base implementation to populate column stats for + * clustering columns based on the partition map. + */ + @Override + protected void loadColumnStats() { + super.loadColumnStats(); + // TODO(todd): this is called for all tables even if not necessary, + // which means we need to load all partition names, even if not + // necessary. + loadPartitionValueMap(); + for (int i = 0; i < getNumClusteringCols(); i++) { + ColumnStats stats = getColumns().get(i).getStats(); + int nonNullParts = partitionValueMap_.get(i).size(); + int nullParts = nullPartitionIds_.get(i).size(); + stats.setNumDistinctValues(nonNullParts + nullParts); + // TODO(todd): this calculation ends up setting the num_nulls stat + // to the number of partitions with null rows, not the number of rows. + // However, it maintains the existing behavior from HdfsTable. + stats.setNumNulls(nullParts); + } + } + @Override public int parseSkipHeaderLineCount(StringBuilder error) { // TODO Auto-generated method stub http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java b/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java index a0a7bd7..ae697b5 100644 --- a/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java @@ -22,6 +22,7 @@ import java.util.List; import javax.annotation.concurrent.Immutable; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.impala.analysis.TableName; @@ -37,6 +38,7 @@ import org.apache.impala.catalog.StructType; import org.apache.impala.catalog.TableLoadingException; import org.apache.impala.thrift.TCatalogObjectType; import org.apache.impala.thrift.TTableStats; +import org.apache.log4j.Logger; import org.apache.thrift.TException; import com.google.common.base.Preconditions; @@ -52,6 +54,8 @@ import com.google.common.collect.Lists; * each catalog instance. */ abstract class LocalTable implements FeTable { + private static final Logger LOG = Logger.getLogger(LocalTable.class); + protected final LocalDb db_; /** The lower-case name of the table. */ protected final String name_; @@ -61,15 +65,22 @@ abstract class LocalTable implements FeTable { // In order to know which kind of table subclass to instantiate, we need // to eagerly grab and parse the top-level Table object from the HMS. SchemaInfo schemaInfo = SchemaInfo.load(db, tblName); + LocalTable t; if (HdfsFileFormat.isHdfsInputFormatClass( schemaInfo.msTable_.getSd().getInputFormat())) { - return new LocalFsTable(db, tblName, schemaInfo); + t = new LocalFsTable(db, tblName, schemaInfo); + } else { + throw new LocalCatalogException("Unknown table type for table " + + db.getName() + "." + tblName); } - throw new LocalCatalogException("Unknown table type for table " + - db.getName() + "." + tblName); + // TODO(todd): it would be preferable to only load stats for those columns + // referenced in a query, but there doesn't seem to be a convenient spot + // in between slot reference resolution and where the stats are needed. + // So, for now, we'll just load all the column stats up front. + t.loadColumnStats(); + return t; } - public LocalTable(LocalDb db, String tblName, SchemaInfo schemaInfo) { this.db_ = Preconditions.checkNotNull(db); this.name_ = Preconditions.checkNotNull(tblName); @@ -179,6 +190,16 @@ abstract class LocalTable implements FeTable { return schemaInfo_.tableStats_; } + protected void loadColumnStats() { + try { + List<ColumnStatisticsObj> stats = db_.getCatalog().getMetaProvider() + .loadTableColumnStatistics(db_.getName(), getName(), getColumnNames()); + FeCatalogUtils.injectColumnStats(stats, this); + } catch (TException e) { + LOG.warn("Could not load column statistics for: " + getFullName(), e); + } + } + /** * The table schema, loaded from the HMS Table object. This is common * to all Table implementations and includes the column definitions and http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java b/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java index f59a7c9..9f433f8 100644 --- a/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java +++ b/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; @@ -70,6 +71,12 @@ interface MetaProvider { throws MetaException, TException; /** + * Load statistics for the given columns from the given table. + */ + List<ColumnStatisticsObj> loadTableColumnStatistics(String dbName, + String tblName, List<String> colNames) throws TException; + + /** * Load file metadata and block locations for the files in the given * partition directory. */ http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java b/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java index a242b1c..100ba47 100644 --- a/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java +++ b/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Set; import org.apache.impala.catalog.CatalogTest; +import org.apache.impala.catalog.ColumnStats; import org.apache.impala.catalog.FeCatalogUtils; import org.apache.impala.catalog.FeDb; import org.apache.impala.catalog.FeFsPartition; @@ -149,4 +150,19 @@ public class LocalCatalogTest { } assertEquals(24, totalFds); } + + @Test + public void testColumnStats() throws Exception { + FeFsTable t = (FeFsTable) catalog_.getTable("functional", "alltypesagg"); + // Verify expected stats for a partitioning column. + // 'days' has 10 non-NULL plus one NULL partition + ColumnStats stats = t.getColumn("day").getStats(); + assertEquals(11, stats.getNumDistinctValues()); + assertEquals(1, stats.getNumNulls()); + + // Verify expected stats for timestamp. + stats = t.getColumn("timestamp_col").getStats(); + assertEquals(10210, stats.getNumDistinctValues()); + assertEquals(-1, stats.getNumNulls()); + } }