Author: gunther
Date: Mon Jul 14 18:18:31 2014
New Revision: 1610477
URL: http://svn.apache.org/r1610477
Log:
HIVE-7395: Work around non availability of stats for partition columns (Laljo
John Pullokkaran via Gunther Hagleitner)
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
URL:
http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java?rev=1610477&r1=1610476&r2=1610477&view=diff
==============================================================================
---
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
(original)
+++
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
Mon Jul 14 18:18:31 2014
@@ -11,12 +11,11 @@ import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
-import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
-import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.eigenbase.rel.RelNode;
import org.eigenbase.rel.TableAccessRel;
@@ -24,6 +23,10 @@ import org.eigenbase.relopt.RelOptAbstra
import org.eigenbase.relopt.RelOptSchema;
import org.eigenbase.reltype.RelDataType;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableMap.Builder;
+
/*
* Fix Me:
* 1. Column Pruning
@@ -32,106 +35,190 @@ import org.eigenbase.reltype.RelDataType
*/
public class RelOptHiveTable extends RelOptAbstractTable {
- private final Table m_hiveTblMetadata;
- private double m_rowCount = -1;
-
- final Map<String, Double> m_columnIdxToSizeMap = new HashMap<String,
Double>();
-
- Map<String, Integer> m_bucketingColMap;
- Map<String, Integer> m_bucketingSortColMap;
-
- Statistics m_hiveStats;
- List<ColStatistics> m_hiveColStats = new ArrayList<ColStatistics>();
+ private final Table m_hiveTblMetadata;
+ private double m_rowCount = -1;
+ private final ImmutableList<ColumnInfo> m_hiveNonPartitionCols;
+ private final ImmutableMap<Integer, ColumnInfo>
m_hiveNonPartitionColsMap;
+ private final ImmutableMap<Integer, ColumnInfo> m_hivePartitionColsMap;
+ Map<Integer, ColStatistics> m_hiveColStatsMap = new HashMap<Integer,
ColStatistics>();
+ private Integer m_numPartitions;
+ private final int m_noOfProjs;
protected static final Log LOG = LogFactory.getLog(RelOptHiveTable.class
.getName());
- // NOTE: name here is the table alias which may or may not be the real name
in
- // metadata. Use
- // m_hiveTblMetadata.getTableName() for table name and
- // m_hiveTblMetadata.getDbName() for db name.
- public RelOptHiveTable(RelOptSchema schema, String name, RelDataType rowType,
- Table hiveTblMetadata, Statistics stats) {
- super(schema, name, rowType);
- m_hiveTblMetadata = hiveTblMetadata;
- }
-
- public RelOptHiveTable(RelOptSchema optiqSchema, String name, RelDataType
rowType,
- Table hiveTblMetadata, List<ColumnInfo> hiveSchema) {
- super(optiqSchema, name, rowType);
- m_hiveTblMetadata = hiveTblMetadata;
-
- List<String> neededColumns = new ArrayList<String>();
- for (ColumnInfo ci : hiveSchema) {
- neededColumns.add(ci.getInternalName());
- }
-
- //TODO: Fix below two stats
- m_hiveColStats = StatsUtils.getTableColumnStats(m_hiveTblMetadata,
hiveSchema, neededColumns);
- m_rowCount = StatsUtils.getNumRows(m_hiveTblMetadata);
- }
-
- @Override
- public boolean isKey(BitSet arg0) {
- return false;
- }
-
- @Override
- public RelNode toRel(ToRelContext context) {
- return new TableAccessRel(context.getCluster(), this);
- }
-
- @Override
- public <T> T unwrap(Class<T> arg0) {
- return arg0.isInstance(this) ? arg0.cast(this) : null;
- }
-
- @Override
- public double getRowCount() {
- return m_rowCount;
- }
-
- public Table getHiveTableMD() {
- return m_hiveTblMetadata;
- }
-
- public Statistics getHiveStats() {
- return m_hiveStats;
- }
-
- private String getColNameList(Set<Integer> colLst) {
- StringBuffer sb = new StringBuffer();
- List<FieldSchema> schema = m_hiveTblMetadata.getAllCols();
- for (Integer i : colLst) {
- String colName = (i < schema.size()) ?
m_hiveTblMetadata.getAllCols().get(i).getName() : "";
- if (i == 0)
- sb.append(colName);
- else
- sb.append(", " + colName);
- }
- return sb.toString();
- }
-
- public List<ColStatistics> getColStat(List<Integer> projIndxLst) {
- if (projIndxLst != null) {
- Set<Integer> colsWithoutStats = new HashSet<Integer>();
- List<ColStatistics> hiveColStatLst = new LinkedList<ColStatistics>();
- for (Integer i : projIndxLst) {
- if (i >= m_hiveColStats.size())
- colsWithoutStats.add(i);
- else
- hiveColStatLst.add(m_hiveColStats.get(i));
- }
- if (!colsWithoutStats.isEmpty()) {
- String logMsg = "No Stats for DB@Table " +
m_hiveTblMetadata.getCompleteName()
- + ", Columns: " + getColNameList(colsWithoutStats);
- LOG.error(logMsg);
- throw new RuntimeException(logMsg);
- }
-
- return hiveColStatLst;
- } else {
- return m_hiveColStats;
- }
- }
+ public RelOptHiveTable(RelOptSchema optiqSchema, String name,
+ RelDataType rowType, Table hiveTblMetadata,
+ List<ColumnInfo> hiveNonPartitionCols,
+ List<ColumnInfo> hivePartitionCols) {
+ super(optiqSchema, name, rowType);
+ m_hiveTblMetadata = hiveTblMetadata;
+ m_hiveNonPartitionCols =
ImmutableList.copyOf(hiveNonPartitionCols);
+ m_hiveNonPartitionColsMap = getColInfoMap(hiveNonPartitionCols,
0);
+ m_hivePartitionColsMap = getColInfoMap(hivePartitionCols,
+ m_hiveNonPartitionColsMap.size());
+ m_noOfProjs = hiveNonPartitionCols.size() +
hivePartitionCols.size();
+ }
+
+ private static ImmutableMap<Integer, ColumnInfo> getColInfoMap(
+ List<ColumnInfo> hiveCols, int startIndx) {
+ Builder<Integer, ColumnInfo> bldr = ImmutableMap
+ .<Integer, ColumnInfo> builder();
+
+ int indx = startIndx;
+ for (ColumnInfo ci : hiveCols) {
+ bldr.put(indx, ci);
+ indx++;
+ }
+
+ return bldr.build();
+ }
+
+ @Override
+ public boolean isKey(BitSet arg0) {
+ return false;
+ }
+
+ @Override
+ public RelNode toRel(ToRelContext context) {
+ return new TableAccessRel(context.getCluster(), this);
+ }
+
+ @Override
+ public <T> T unwrap(Class<T> arg0) {
+ return arg0.isInstance(this) ? arg0.cast(this) : null;
+ }
+
+ @Override
+ public double getRowCount() {
+ if (m_rowCount == -1)
+ m_rowCount = StatsUtils.getNumRows(m_hiveTblMetadata);
+
+ return m_rowCount;
+ }
+
+ public Table getHiveTableMD() {
+ return m_hiveTblMetadata;
+ }
+
+ private String getColNamesForLogging(Set<String> colLst) {
+ StringBuffer sb = new StringBuffer();
+ boolean firstEntry = true;
+ for (String colName : colLst) {
+ if (firstEntry) {
+ sb.append(colName);
+ firstEntry = false;
+ } else {
+ sb.append(", " + colName);
+ }
+ }
+ return sb.toString();
+ }
+
+ private void updateColStats(Set<Integer> projIndxLst) {
+ List<String> nonPartColNamesThatRqrStats = new
ArrayList<String>();
+ List<Integer> nonPartColIndxsThatRqrStats = new
ArrayList<Integer>();
+ List<String> partColNamesThatRqrStats = new ArrayList<String>();
+ List<Integer> partColIndxsThatRqrStats = new
ArrayList<Integer>();
+ Set<String> colNamesFailedStats = new HashSet<String>();
+
+ // 1. Separate required columns to Non Partition and Partition
Cols
+ ColumnInfo tmp;
+ for (Integer pi : projIndxLst) {
+ if (m_hiveColStatsMap.get(pi) == null) {
+ if ((tmp = m_hiveNonPartitionColsMap.get(pi))
!= null) {
+
nonPartColNamesThatRqrStats.add(tmp.getInternalName());
+ nonPartColIndxsThatRqrStats.add(pi);
+ } else if ((tmp =
m_hivePartitionColsMap.get(pi)) != null) {
+
partColNamesThatRqrStats.add(tmp.getInternalName());
+ partColIndxsThatRqrStats.add(pi);
+ } else {
+ String logMsg = "Unable to find Column
Index: " + pi
+ + ", in " +
m_hiveTblMetadata.getCompleteName();
+ LOG.error(logMsg);
+ throw new RuntimeException(logMsg);
+ }
+ }
+ }
+
+ // 2. Obtain Col Stats for Non Partition Cols
+ if (nonPartColNamesThatRqrStats.size() > 0) {
+ List<ColStatistics> colStats =
StatsUtils.getTableColumnStats(
+ m_hiveTblMetadata,
m_hiveNonPartitionCols,
+ nonPartColNamesThatRqrStats);
+ if (colStats != null
+ && colStats.size() ==
nonPartColNamesThatRqrStats.size()) {
+ for (int i = 0; i < colStats.size(); i++) {
+
m_hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i),
+ colStats.get(i));
+ }
+ } else {
+ // TODO: colNamesFailedStats is designed to be
used for both non
+ // partitioned & partitioned cols; currently
only used for non
+ // partitioned cols.
+
colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
+ }
+ }
+
+ // 3. Obtain Stats for Partition Cols
+ // TODO: Fix this as part of Partition Pruning
+ if (!partColNamesThatRqrStats.isEmpty()) {
+ if (m_numPartitions == null) {
+ try {
+ m_numPartitions = Hive
+ .get()
+
.getPartitionNames(m_hiveTblMetadata.getDbName(),
+
m_hiveTblMetadata.getTableName(),
+ (short)
-1).size();
+ } catch (HiveException e) {
+ String logMsg = "Could not get stats,
number of Partitions for "
+ +
m_hiveTblMetadata.getCompleteName();
+ LOG.error(logMsg);
+ throw new RuntimeException(logMsg);
+ }
+ }
+
+ ColStatistics cStats = null;
+ for (int i = 0; i < partColNamesThatRqrStats.size();
i++) {
+ cStats = new
ColStatistics(m_hiveTblMetadata.getTableName(),
+
partColNamesThatRqrStats.get(i), m_hivePartitionColsMap
+
.get(partColIndxsThatRqrStats.get(i))
+ .getTypeName());
+ cStats.setCountDistint(m_numPartitions);
+
+
m_hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
+ }
+ }
+
+ // 4. Warn user if we could not get stats for required columns
+ if (!colNamesFailedStats.isEmpty()) {
+ String logMsg = "No Stats for "
+ + m_hiveTblMetadata.getCompleteName() +
", Columns: "
+ +
getColNamesForLogging(colNamesFailedStats);
+ LOG.error(logMsg);
+ throw new RuntimeException(logMsg);
+ }
+ }
+
+ public List<ColStatistics> getColStat(List<Integer> projIndxLst) {
+ List<ColStatistics> hiveColStatLst = new
LinkedList<ColStatistics>();
+
+ if (projIndxLst != null) {
+ updateColStats(new HashSet<Integer>(projIndxLst));
+ for (Integer i : projIndxLst) {
+ hiveColStatLst.add(m_hiveColStatsMap.get(i));
+ }
+ } else {
+ List<Integer> pILst = new ArrayList<Integer>();
+ for (Integer i = 0; i < m_noOfProjs; i++) {
+ pILst.add(i);
+ }
+ updateColStats(new HashSet<Integer>(pILst));
+ for (Integer pi : pILst) {
+ hiveColStatLst.add(m_hiveColStatsMap.get(pi));
+ }
+ }
+
+ return hiveColStatLst;
+ }
}
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
URL:
http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java?rev=1610477&r1=1610476&r2=1610477&view=diff
==============================================================================
---
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
(original)
+++
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
Mon Jul 14 18:18:31 2014
@@ -627,7 +627,7 @@ public class RelNodeConverter {
}
RelDataType rowType = TypeConverter.getType(ctx.cluster, rr, neededCols);
RelOptHiveTable optTable = new RelOptHiveTable(ctx.schema,
tableScanOp.getConf().getAlias(),
- rowType, ctx.sA.getTable(tableScanOp), stats);
+ rowType, ctx.sA.getTable(tableScanOp), null, null);
TableAccessRelBase tableRel = new HiveTableScanRel(ctx.cluster,
ctx.cluster.traitSetOf(HiveRel.CONVENTION), optTable, rowType);
ctx.buildColumnMap(tableScanOp, tableRel);
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL:
http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=1610477&r1=1610476&r2=1610477&view=diff
==============================================================================
---
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
(original)
+++
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
Mon Jul 14 18:18:31 2014
@@ -12090,8 +12090,9 @@ public class SemanticAnalyzer extends Ba
cInfoLst.add(colInfo);
}
// TODO: Fix this
- ArrayList<ColumnInfo> columnsThatNeedsStats = new
ArrayList<ColumnInfo>(
+ ArrayList<ColumnInfo> nonPartitionColumns = new ArrayList<ColumnInfo>(
cInfoLst);
+ ArrayList<ColumnInfo> partitionColumns = new ArrayList<ColumnInfo>();
// 3.2 Add column info corresponding to partition columns
for (FieldSchema part_col : tab.getPartCols()) {
@@ -12101,6 +12102,7 @@ public class SemanticAnalyzer extends Ba
tableAlias, true);
rr.put(tableAlias, colName, colInfo);
cInfoLst.add(colInfo);
+ partitionColumns.add(colInfo);
}
// 3.3 Add column info corresponding to virtual columns
@@ -12119,7 +12121,7 @@ public class SemanticAnalyzer extends Ba
// 4. Build RelOptAbstractTable
RelOptHiveTable optTable = new RelOptHiveTable(m_relOptSchema,
- tableAlias, rowType, tab, columnsThatNeedsStats);
+ tableAlias, rowType, tab, nonPartitionColumns, partitionColumns);
// 5. Build Hive Table Scan Rel
tableRel = new HiveTableScanRel(m_cluster,