Author: gunther
Date: Mon Jul 14 18:18:31 2014
New Revision: 1610477
URL: http://svn.apache.org/r1610477
Log:
HIVE-7395: Work around non availability of stats for partition columns (Laljo
John Pullokkaran via Gunther Hagleitner)
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
URL:
http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java?rev=1610477&r1=1610476&r2=1610477&view=diff
==============================================================================
---
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
(original)
+++
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java
Mon Jul 14 18:18:31 2014
@@ -11,12 +11,11 @@ import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
-import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
-import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.eigenbase.rel.RelNode;
import org.eigenbase.rel.TableAccessRel;
@@ -24,6 +23,10 @@ import org.eigenbase.relopt.RelOptAbstra
import org.eigenbase.relopt.RelOptSchema;
import org.eigenbase.reltype.RelDataType;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableMap.Builder;
+
/*
* Fix Me:
* 1. Column Pruning
@@ -32,106 +35,190 @@ import org.eigenbase.reltype.RelDataType
*/
public class RelOptHiveTable extends RelOptAbstractTable {
- private final Table m_hiveTblMetadata;
- private double m_rowCount = -1;
-
- final Map<String, Double> m_columnIdxToSizeMap = new HashMap<String,
Double>();
-
- Map<String, Integer> m_bucketingColMap;
- Map<String, Integer> m_bucketingSortColMap;
-
- Statistics m_hiveStats;
- List<ColStatistics> m_hiveColStats = new ArrayList<ColStatistics>();
+ private final Table m_hiveTblMetadata;
+ private double m_rowCount = -1;
+ private final ImmutableList<ColumnInfo> m_hiveNonPartitionCols;
+ private final ImmutableMap<Integer, ColumnInfo>
m_hiveNonPartitionColsMap;
+ private final ImmutableMap<Integer, ColumnInfo> m_hivePartitionColsMap;
+ Map<Integer, ColStatistics> m_hiveColStatsMap = new HashMap<Integer,
ColStatistics>();
+ private Integer m_numPartitions;
+ private final int m_noOfProjs;
protected static final Log LOG = LogFactory.getLog(RelOptHiveTable.class
.getName());
- // NOTE: name here is the table alias which may or may not be the real name
in
- // metadata. Use
- // m_hiveTblMetadata.getTableName() for table name and
- // m_hiveTblMetadata.getDbName() for db name.
- public RelOptHiveTable(RelOptSchema schema, String name, RelDataType rowType,
- Table hiveTblMetadata, Statistics stats) {
- super(schema, name, rowType);
- m_hiveTblMetadata = hiveTblMetadata;
- }
-
- public RelOptHiveTable(RelOptSchema optiqSchema, String name, RelDataType
rowType,
- Table hiveTblMetadata, List<ColumnInfo> hiveSchema) {
- super(optiqSchema, name, rowType);
- m_hiveTblMetadata = hiveTblMetadata;
-
- List<String> neededColumns = new ArrayList<String>();
- for (ColumnInfo ci : hiveSchema) {
- neededColumns.add(ci.getInternalName());
- }
-
- //TODO: Fix below two stats
- m_hiveColStats = StatsUtils.getTableColumnStats(m_hiveTblMetadata,
hiveSchema, neededColumns);
- m_rowCount = StatsUtils.getNumRows(m_hiveTblMetadata);
- }
-
- @Override
- public boolean isKey(BitSet arg0) {
- return false;
- }
-
- @Override
- public RelNode toRel(ToRelContext context) {
- return new TableAccessRel(context.getCluster(), this);
- }
-
- @Override
- public <T> T unwrap(Class<T> arg0) {
- return arg0.isInstance(this) ? arg0.cast(this) : null;
- }
-
- @Override
- public double getRowCount() {
- return m_rowCount;
- }
-
- public Table getHiveTableMD() {
- return m_hiveTblMetadata;
- }
-
- public Statistics getHiveStats() {
- return m_hiveStats;
- }
-
- private String getColNameList(Set<Integer> colLst) {
- StringBuffer sb = new StringBuffer();
- List<FieldSchema> schema = m_hiveTblMetadata.getAllCols();
- for (Integer i : colLst) {
- String colName = (i < schema.size()) ?
m_hiveTblMetadata.getAllCols().get(i).getName() : "";
- if (i == 0)
- sb.append(colName);
- else
- sb.append(", " + colName);
- }
- return sb.toString();
- }
-
- public List<ColStatistics> getColStat(List<Integer> projIndxLst) {
- if (projIndxLst != null) {
- Set<Integer> colsWithoutStats = new HashSet<Integer>();
- List<ColStatistics> hiveColStatLst = new LinkedList<ColStatistics>();
- for (Integer i : projIndxLst) {
- if (i >= m_hiveColStats.size())
- colsWithoutStats.add(i);
- else
- hiveColStatLst.add(m_hiveColStats.get(i));
- }
- if (!colsWithoutStats.isEmpty()) {
- String logMsg = "No Stats for DB@Table " +
m_hiveTblMetadata.getCompleteName()
- + ", Columns: " + getColNameList(colsWithoutStats);
- LOG.error(logMsg);
- throw new RuntimeException(logMsg);
- }
-
- return hiveColStatLst;
- } else {
- return m_hiveColStats;
- }
- }
+ public RelOptHiveTable(RelOptSchema optiqSchema, String name,
+ RelDataType rowType, Table hiveTblMetadata,
+ List<ColumnInfo> hiveNonPartitionCols,
+ List<ColumnInfo> hivePartitionCols) {
+ super(optiqSchema, name, rowType);
+ m_hiveTblMetadata = hiveTblMetadata;
+ m_hiveNonPartitionCols =
ImmutableList.copyOf(hiveNonPartitionCols);
+ m_hiveNonPartitionColsMap = getColInfoMap(hiveNonPartitionCols,
0);
+ m_hivePartitionColsMap = getColInfoMap(hivePartitionCols,
+ m_hiveNonPartitionColsMap.size());
+ m_noOfProjs = hiveNonPartitionCols.size() +
hivePartitionCols.size();
+ }
+
+ private static ImmutableMap<Integer, ColumnInfo> getColInfoMap(
+ List<ColumnInfo> hiveCols, int startIndx) {
+ Builder<Integer, ColumnInfo> bldr = ImmutableMap
+ .<Integer, ColumnInfo> builder();
+
+ int indx = startIndx;
+ for (ColumnInfo ci : hiveCols) {
+ bldr.put(indx, ci);
+ indx++;
+ }
+
+ return bldr.build();
+ }
+
+ @Override
+ public boolean isKey(BitSet arg0) {
+ return false;
+ }
+
+ @Override
+ public RelNode toRel(ToRelContext context) {
+ return new TableAccessRel(context.getCluster(), this);
+ }
+
+ @Override
+ public <T> T unwrap(Class<T> arg0) {
+ return arg0.isInstance(this) ? arg0.cast(this) : null;
+ }
+
+ @Override
+ public double getRowCount() {
+ if (m_rowCount == -1)
+ m_rowCount = StatsUtils.getNumRows(m_hiveTblMetadata);
+
+ return m_rowCount;
+ }
+
+ public Table getHiveTableMD() {
+ return m_hiveTblMetadata;
+ }
+
+ private String getColNamesForLogging(Set<String> colLst) {
+ StringBuffer sb = new StringBuffer();
+ boolean firstEntry = true;
+ for (String colName : colLst) {
+ if (firstEntry) {
+ sb.append(colName);
+ firstEntry = false;
+ } else {
+ sb.append(", " + colName);
+ }
+ }
+ return sb.toString();
+ }
+
+ private void updateColStats(Set<Integer> projIndxLst) {
+ List<String> nonPartColNamesThatRqrStats = new
ArrayList<String>();
+ List<Integer> nonPartColIndxsThatRqrStats = new
ArrayList<Integer>();
+ List<String> partColNamesThatRqrStats = new ArrayList<String>();
+ List<Integer> partColIndxsThatRqrStats = new
ArrayList<Integer>();
+ Set<String> colNamesFailedStats = new HashSet<String>();
+
+ // 1. Separate required columns to Non Partition and Partition
Cols
+ ColumnInfo tmp;
+ for (Integer pi : projIndxLst) {
+ if (m_hiveColStatsMap.get(pi) == null) {
+ if ((tmp = m_hiveNonPartitionColsMap.get(pi))
!= null) {
+
nonPartColNamesThatRqrStats.add(tmp.getInternalName());
+ nonPartColIndxsThatRqrStats.add(pi);
+ } else if ((tmp =
m_hivePartitionColsMap.get(pi)) != null) {
+
partColNamesThatRqrStats.add(tmp.getInternalName());
+ partColIndxsThatRqrStats.add(pi);
+ } else {
+ String logMsg = "Unable to find Column
Index: " + pi
+ + ", in " +
m_hiveTblMetadata.getCompleteName();
+ LOG.error(logMsg);
+ throw new RuntimeException(logMsg);
+ }
+ }
+ }
+
+ // 2. Obtain Col Stats for Non Partition Cols
+ if (nonPartColNamesThatRqrStats.size() > 0) {
+ List<ColStatistics> colStats =
StatsUtils.getTableColumnStats(
+ m_hiveTblMetadata,
m_hiveNonPartitionCols,
+ nonPartColNamesThatRqrStats);
+ if (colStats != null
+ && colStats.size() ==
nonPartColNamesThatRqrStats.size()) {
+ for (int i = 0; i < colStats.size(); i++) {
+
m_hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i),
+ colStats.get(i));
+ }
+ } else {
+ // TODO: colNamesFailedStats is designed to be
used for both non
+ // partitioned & partitioned cols; currently
only used for non
+ // partitioned cols.
+
colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
+ }
+ }
+
+ // 3. Obtain Stats for Partition Cols
+ // TODO: Fix this as part of Partition Pruning
+ if (!partColNamesThatRqrStats.isEmpty()) {
+ if (m_numPartitions == null) {
+ try {
+ m_numPartitions = Hive
+ .get()
+
.getPartitionNames(m_hiveTblMetadata.getDbName(),
+
m_hiveTblMetadata.getTableName(),
+ (short)
-1).size();
+ } catch (HiveException e) {
+ String logMsg = "Could not get stats,
number of Partitions for "
+ +
m_hiveTblMetadata.getCompleteName();
+ LOG.error(logMsg);
+ throw new RuntimeException(logMsg);
+ }
+ }
+
+ ColStatistics cStats = null;
+ for (int i = 0; i < partColNamesThatRqrStats.size();
i++) {
+ cStats = new
ColStatistics(m_hiveTblMetadata.getTableName(),
+
partColNamesThatRqrStats.get(i), m_hivePartitionColsMap
+
.get(partColIndxsThatRqrStats.get(i))
+ .getTypeName());
+ cStats.setCountDistint(m_numPartitions);
+
+
m_hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
+ }
+ }
+
+ // 4. Warn user if we could not get stats for required columns
+ if (!colNamesFailedStats.isEmpty()) {
+ String logMsg = "No Stats for "
+ + m_hiveTblMetadata.getCompleteName() +
", Columns: "
+ +
getColNamesForLogging(colNamesFailedStats);
+ LOG.error(logMsg);
+ throw new RuntimeException(logMsg);
+ }
+ }
+
+ public List<ColStatistics> getColStat(List<Integer> projIndxLst) {
+ List<ColStatistics> hiveColStatLst = new
LinkedList<ColStatistics>();
+
+ if (projIndxLst != null) {
+ updateColStats(new HashSet<Integer>(projIndxLst));
+ for (Integer i : projIndxLst) {
+ hiveColStatLst.add(m_hiveColStatsMap.get(i));
+ }
+ } else {
+ List<Integer> pILst = new ArrayList<Integer>();
+ for (Integer i = 0; i < m_noOfProjs; i++) {
+ pILst.add(i);
+ }
+ updateColStats(new HashSet<Integer>(pILst));
+ for (Integer pi : pILst) {
+ hiveColStatLst.add(m_hiveColStatsMap.get(pi));
+ }
+ }
+
+ return hiveColStatLst;
+ }
}
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
URL:
http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java?rev=1610477&r1=1610476&r2=1610477&view=diff
==============================================================================
---
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
(original)
+++
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java
Mon Jul 14 18:18:31 2014
@@ -627,7 +627,7 @@ public class RelNodeConverter {
}
RelDataType rowType = TypeConverter.getType(ctx.cluster, rr, neededCols);
RelOptHiveTable optTable = new RelOptHiveTable(ctx.schema,
tableScanOp.getConf().getAlias(),
- rowType, ctx.sA.getTable(tableScanOp), stats);
+ rowType, ctx.sA.getTable(tableScanOp), null, null);
TableAccessRelBase tableRel = new HiveTableScanRel(ctx.cluster,
ctx.cluster.traitSetOf(HiveRel.CONVENTION), optTable, rowType);
ctx.buildColumnMap(tableScanOp, tableRel);
Modified:
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL:
http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=1610477&r1=1610476&r2=1610477&view=diff
==============================================================================
---
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
(original)
+++
hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
Mon Jul 14 18:18:31 2014
@@ -12090,8 +12090,9 @@ public class SemanticAnalyzer extends Ba
cInfoLst.add(colInfo);
}
// TODO: Fix this
- ArrayList<ColumnInfo> columnsThatNeedsStats = new
ArrayList<ColumnInfo>(
+ ArrayList<ColumnInfo> nonPartitionColumns = new ArrayList<ColumnInfo>(
cInfoLst);
+ ArrayList<ColumnInfo> partitionColumns = new ArrayList<ColumnInfo>();
// 3.2 Add column info corresponding to partition columns
for (FieldSchema part_col : tab.getPartCols()) {
@@ -12101,6 +12102,7 @@ public class SemanticAnalyzer extends Ba
tableAlias, true);
rr.put(tableAlias, colName, colInfo);
cInfoLst.add(colInfo);
+ partitionColumns.add(colInfo);
}
// 3.3 Add column info corresponding to virtual columns
@@ -12119,7 +12121,7 @@ public class SemanticAnalyzer extends Ba
// 4. Build RelOptAbstractTable
RelOptHiveTable optTable = new RelOptHiveTable(m_relOptSchema,
- tableAlias, rowType, tab, columnsThatNeedsStats);
+ tableAlias, rowType, tab, nonPartitionColumns, partitionColumns);
// 5. Build Hive Table Scan Rel
tableRel = new HiveTableScanRel(m_cluster,