Author: hashutosh
Date: Mon Mar 3 16:02:15 2014
New Revision: 1573591
URL: http://svn.apache.org/r1573591
Log:
HIVE-6505 : Make stats optimizer more robust in presence of distinct clause
(Ashutosh Chauhan via Thejas Nair)
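
The stats optimizer pattern-matches a TS-SEL-GBY-RS-GBY-SEL-FS plan and answers it directly
from metastore statistics. The stored NDV (number of distinct values) is only an estimate, so
any query carrying a distinct must now bail out to a regular plan. For reference, the query
shapes exercised by the new distinct_stats.q test below (t1 is the table created there):

  select count(distinct b) from t1 group by a;
  select distinct(b) from t1;
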
Added:
hive/trunk/ql/src/test/queries/clientpositive/distinct_stats.q
hive/trunk/ql/src/test/results/clientpositive/distinct_stats.q.out
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java?rev=1573591&r1=1573590&r2=1573591&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java Mon Mar 3 16:02:15 2014
@@ -31,8 +31,6 @@ import org.apache.hadoop.hive.common.Sta
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
-import org.apache.hadoop.hive.metastore.api.MetaException;
-import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.FetchTask;
@@ -67,7 +65,6 @@ import org.apache.hadoop.hive.ql.udf.gen
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum;
import org.apache.hadoop.hive.serde.serdeConstants;
-import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
@@ -76,7 +73,6 @@ import org.apache.hadoop.hive.serde2.obj
import org.apache.thrift.TException;
import com.google.common.collect.Lists;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
/** There is a set of queries which can be answered entirely from statistics stored in metastore.
@@ -184,48 +180,65 @@ public class StatsOptimizer implements T
// 4. Compose rows and add it in FetchWork
// 5. Delete GBY - RS - GBY - SEL from the pipeline.
- TableScanOperator tsOp = (TableScanOperator) stack.get(0);
- if(tsOp.getParentOperators() != null && tsOp.getParentOperators().size() > 0) {
- // looks like a subq plan.
- return null;
- }
- SelectOperator selOp = (SelectOperator)tsOp.getChildren().get(0);
- for(ExprNodeDesc desc : selOp.getConf().getColList()) {
- if (!(desc instanceof ExprNodeColumnDesc)) {
- // Probably an expression, cant handle that
+ try {
+ TableScanOperator tsOp = (TableScanOperator) stack.get(0);
+ if(tsOp.getParentOperators() != null && tsOp.getParentOperators().size() > 0) {
+ // looks like a subq plan.
return null;
}
- }
- // Since we have done an exact match on TS-SEL-GBY-RS-GBY-SEL-FS
- // we need not to do any instanceof checks for following.
- GroupByOperator gbyOp = (GroupByOperator)selOp.getChildren().get(0);
- FileSinkOperator fsOp = (FileSinkOperator)(gbyOp.getChildren().get(0).
- getChildren().get(0).getChildren().get(0).getChildren().get(0));
- if (fsOp.getChildOperators() != null && fsOp.getChildOperators().size() > 0) {
- // looks like a subq plan.
- return null;
- }
- List<AggregationDesc> aggrs = gbyOp.getConf().getAggregators();
+ SelectOperator selOp = (SelectOperator)tsOp.getChildren().get(0);
+ for(ExprNodeDesc desc : selOp.getConf().getColList()) {
+ if (!(desc instanceof ExprNodeColumnDesc)) {
+ // Probably an expression, cant handle that
+ return null;
+ }
+ }
+ // Since we have done an exact match on TS-SEL-GBY-RS-GBY-SEL-FS
+ // we need not to do any instanceof checks for following.
+ GroupByOperator gbyOp = (GroupByOperator)selOp.getChildren().get(0);
+ ReduceSinkOperator rsOp = (ReduceSinkOperator)gbyOp.getChildren().get(0);
+ if (rsOp.getConf().getDistinctColumnIndices().size() > 0) {
+ // we can't handle distinct
+ return null;
+ }
+
+ selOp = (SelectOperator)rsOp.getChildOperators().get(0).getChildOperators().get(0);
+ List<AggregationDesc> aggrs = gbyOp.getConf().getAggregators();
+
+ if (!(selOp.getConf().getColList().size() == aggrs.size())) {
+ // all select columns must be aggregations
+ return null;
+
+ }
+ FileSinkOperator fsOp = (FileSinkOperator)(selOp.getChildren().get(0));
+ if (fsOp.getChildOperators() != null && fsOp.getChildOperators().size() > 0) {
+ // looks like a subq plan.
+ return null;
+ }
+
+ Table tbl = pctx.getTopToTable().get(tsOp);
+ List<Object> oneRow = new ArrayList<Object>();
+ List<ObjectInspector> ois = new ArrayList<ObjectInspector>();
- Table tbl = pctx.getTopToTable().get(tsOp);
- List<Object> oneRow = new ArrayList<Object>();
- List<ObjectInspector> ois = new ArrayList<ObjectInspector>();
- try{
Hive hive = Hive.get(pctx.getConf());
for (AggregationDesc aggr : aggrs) {
+ if (aggr.getDistinct()) {
+ // our stats for NDV is approx, not accurate.
+ return null;
+ }
if (aggr.getGenericUDAFName().equals(GenericUDAFSum.class.getAnnotation(
Description.class).name())) {
- if(!(aggr.getParameters().get(0) instanceof ExprNodeConstantDesc)){
- return null;
- }
- Long rowCnt = getRowCnt(pctx, tsOp, tbl);
- if(rowCnt == null) {
- return null;
- }
- oneRow.add(HiveDecimal.create(((ExprNodeConstantDesc) aggr.getParameters().get(0))
+ if(!(aggr.getParameters().get(0) instanceof ExprNodeConstantDesc)){
+ return null;
+ }
+ Long rowCnt = getRowCnt(pctx, tsOp, tbl);
+ if(rowCnt == null) {
+ return null;
+ }
+ oneRow.add(HiveDecimal.create(((ExprNodeConstantDesc) aggr.getParameters().get(0))
.getValue().toString()).multiply(HiveDecimal.create(rowCnt)));
- ois.add(PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(
+ ois.add(PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(
PrimitiveCategory.DECIMAL));
}
else if (aggr.getGenericUDAFName().equals(GenericUDAFCount.class.getAnnotation(
@@ -274,14 +287,14 @@ public class StatsOptimizer implements T
}
} else {
Set<Partition> parts = pctx.getPrunedPartitions(
- tsOp.getConf().getAlias(), tsOp).getPartitions();
+ tsOp.getConf().getAlias(), tsOp).getPartitions();
for (Partition part : parts) {
if (!StatsSetupConst.areStatsUptoDate(part.getParameters())) {
Log.debug("Stats for part : " + part.getSpec() + " are not upto date.");
return null;
}
Long partRowCnt = Long.parseLong(part.getParameters()
- .get(StatsSetupConst.ROW_COUNT));
+ .get(StatsSetupConst.ROW_COUNT));
if (partRowCnt < 1) {
Log.debug("Partition doesn't have upto date stats " +
part.getSpec());
return null;
@@ -328,66 +341,66 @@ public class StatsOptimizer implements T
}
ColumnStatisticsData statData = stats.get(0).getStatsData();
switch (type) {
- case Integeral:
- oneRow.add(statData.getLongStats().getHighValue());
- ois.add(PrimitiveObjectInspectorFactory.
- getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
- break;
- case Double:
- oneRow.add(statData.getDoubleStats().getHighValue());
- ois.add(PrimitiveObjectInspectorFactory.
- getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
- break;
- default:
- // unsupported type
- Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
- "metadata optimizer for column : " + colName);
- return null;
+ case Integeral:
+ oneRow.add(statData.getLongStats().getHighValue());
+ ois.add(PrimitiveObjectInspectorFactory.
+ getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+ break;
+ case Double:
+ oneRow.add(statData.getDoubleStats().getHighValue());
+ ois.add(PrimitiveObjectInspectorFactory.
+ getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+ break;
+ default:
+ // unsupported type
+ Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+ "metadata optimizer for column : " + colName);
+ return null;
}
} else {
Set<Partition> parts = pctx.getPrunedPartitions(
- tsOp.getConf().getAlias(), tsOp).getPartitions();
+ tsOp.getConf().getAlias(), tsOp).getPartitions();
switch (type) {
- case Integeral: {
- long maxVal = Long.MIN_VALUE;
- Collection<List<ColumnStatisticsObj>> result =
- verifyAndGetPartStats(hive, tbl, colName, parts);
- if (result == null) {
- return null; // logging inside
- }
- for (List<ColumnStatisticsObj> statObj : result) {
- ColumnStatisticsData statData = validateSingleColStat(statObj);
- if (statData == null) return null;
- long curVal = statData.getLongStats().getHighValue();
- maxVal = Math.max(maxVal, curVal);
- }
- oneRow.add(maxVal);
- ois.add(PrimitiveObjectInspectorFactory.
- getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
- break;
- }
- case Double: {
- double maxVal = Double.MIN_VALUE;
- Collection<List<ColumnStatisticsObj>> result =
- verifyAndGetPartStats(hive, tbl, colName, parts);
- if (result == null) {
- return null; // logging inside
- }
- for (List<ColumnStatisticsObj> statObj : result) {
- ColumnStatisticsData statData = validateSingleColStat(statObj);
- if (statData == null) return null;
- double curVal = statData.getDoubleStats().getHighValue();
- maxVal = Math.max(maxVal, curVal);
+ case Integeral: {
+ long maxVal = Long.MIN_VALUE;
+ Collection<List<ColumnStatisticsObj>> result =
+ verifyAndGetPartStats(hive, tbl, colName, parts);
+ if (result == null) {
+ return null; // logging inside
+ }
+ for (List<ColumnStatisticsObj> statObj : result) {
+ ColumnStatisticsData statData = validateSingleColStat(statObj);
+ if (statData == null) return null;
+ long curVal = statData.getLongStats().getHighValue();
+ maxVal = Math.max(maxVal, curVal);
+ }
+ oneRow.add(maxVal);
+ ois.add(PrimitiveObjectInspectorFactory.
+ getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+ break;
+ }
+ case Double: {
+ double maxVal = Double.MIN_VALUE;
+ Collection<List<ColumnStatisticsObj>> result =
+ verifyAndGetPartStats(hive, tbl, colName, parts);
+ if (result == null) {
+ return null; // logging inside
+ }
+ for (List<ColumnStatisticsObj> statObj : result) {
+ ColumnStatisticsData statData = validateSingleColStat(statObj);
+ if (statData == null) return null;
+ double curVal = statData.getDoubleStats().getHighValue();
+ maxVal = Math.max(maxVal, curVal);
+ }
+ oneRow.add(maxVal);
+ ois.add(PrimitiveObjectInspectorFactory.
+ getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+ break;
}
- oneRow.add(maxVal);
- ois.add(PrimitiveObjectInspectorFactory.
- getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
- break;
- }
- default:
- Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
- "metadata optimizer for column : " + colName);
- return null;
+ default:
+ Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+ "metadata optimizer for column : " + colName);
+ return null;
}
}
} else if (aggr.getGenericUDAFName().equals(GenericUDAFMin.class.getAnnotation(
@@ -401,67 +414,67 @@ public class StatsOptimizer implements T
return null;
}
ColumnStatisticsData statData = hive.getMSC().getTableColumnStatistics(
- tbl.getDbName(), tbl.getTableName(), Lists.newArrayList(colName))
- .get(0).getStatsData();
+ tbl.getDbName(), tbl.getTableName(), Lists.newArrayList(colName))
+ .get(0).getStatsData();
switch (type) {
- case Integeral:
- oneRow.add(statData.getLongStats().getLowValue());
- ois.add(PrimitiveObjectInspectorFactory.
- getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
- break;
- case Double:
- oneRow.add(statData.getDoubleStats().getLowValue());
- ois.add(PrimitiveObjectInspectorFactory.
- getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
- break;
- default: // unsupported type
- Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
- "metadata optimizer for column : " + colName);
- return null;
+ case Integeral:
+ oneRow.add(statData.getLongStats().getLowValue());
+ ois.add(PrimitiveObjectInspectorFactory.
+ getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+ break;
+ case Double:
+ oneRow.add(statData.getDoubleStats().getLowValue());
+ ois.add(PrimitiveObjectInspectorFactory.
+ getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+ break;
+ default: // unsupported type
+ Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+ "metadata optimizer for column : " + colName);
+ return null;
}
} else {
Set<Partition> parts =
pctx.getPrunedPartitions(tsOp.getConf().getAlias(), tsOp).getPartitions();
switch(type) {
- case Integeral: {
- long minVal = Long.MAX_VALUE;
- Collection<List<ColumnStatisticsObj>> result =
- verifyAndGetPartStats(hive, tbl, colName, parts);
- if (result == null) {
- return null; // logging inside
- }
- for (List<ColumnStatisticsObj> statObj : result) {
- ColumnStatisticsData statData = validateSingleColStat(statObj);
- if (statData == null) return null;
- long curVal = statData.getLongStats().getLowValue();
- minVal = Math.min(minVal, curVal);
- }
- oneRow.add(minVal);
- ois.add(PrimitiveObjectInspectorFactory.
- getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
- break;
- }
- case Double: {
- double minVal = Double.MAX_VALUE;
- Collection<List<ColumnStatisticsObj>> result =
- verifyAndGetPartStats(hive, tbl, colName, parts);
- if (result == null) {
- return null; // logging inside
- }
- for (List<ColumnStatisticsObj> statObj : result) {
- ColumnStatisticsData statData = validateSingleColStat(statObj);
- if (statData == null) return null;
- double curVal = statData.getDoubleStats().getLowValue();
- minVal = Math.min(minVal, curVal);
+ case Integeral: {
+ long minVal = Long.MAX_VALUE;
+ Collection<List<ColumnStatisticsObj>> result =
+ verifyAndGetPartStats(hive, tbl, colName, parts);
+ if (result == null) {
+ return null; // logging inside
+ }
+ for (List<ColumnStatisticsObj> statObj : result) {
+ ColumnStatisticsData statData = validateSingleColStat(statObj);
+ if (statData == null) return null;
+ long curVal = statData.getLongStats().getLowValue();
+ minVal = Math.min(minVal, curVal);
+ }
+ oneRow.add(minVal);
+ ois.add(PrimitiveObjectInspectorFactory.
+ getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+ break;
+ }
+ case Double: {
+ double minVal = Double.MAX_VALUE;
+ Collection<List<ColumnStatisticsObj>> result =
+ verifyAndGetPartStats(hive, tbl, colName, parts);
+ if (result == null) {
+ return null; // logging inside
+ }
+ for (List<ColumnStatisticsObj> statObj : result) {
+ ColumnStatisticsData statData = validateSingleColStat(statObj);
+ if (statData == null) return null;
+ double curVal = statData.getDoubleStats().getLowValue();
+ minVal = Math.min(minVal, curVal);
+ }
+ oneRow.add(minVal);
+ ois.add(PrimitiveObjectInspectorFactory.
+ getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+ break;
}
- oneRow.add(minVal);
- ois.add(PrimitiveObjectInspectorFactory.
- getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
- break;
- }
- default: // unsupported type
- Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
- "metadata optimizer for column : " + colName);
- return null;
+ default: // unsupported type
+ Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+ "metadata optimizer for column : " + colName);
+ return null;
}
}
@@ -471,28 +484,29 @@ public class StatsOptimizer implements T
return null;
}
}
+
+
+ List<List<Object>> allRows = new ArrayList<List<Object>>();
+ allRows.add(oneRow);
+
+ List<String> colNames = new ArrayList<String>();
+ for (ColumnInfo colInfo: gbyOp.getSchema().getSignature()) {
+ colNames.add(colInfo.getInternalName());
+ }
+ StandardStructObjectInspector sOI = ObjectInspectorFactory.
+ getStandardStructObjectInspector(colNames, ois);
+ FetchWork fWork = new FetchWork(allRows, sOI);
+ FetchTask fTask = (FetchTask)TaskFactory.get(fWork, pctx.getConf());
+ fWork.setLimit(allRows.size());
+ pctx.setFetchTask(fTask);
+
+ return null;
} catch (Exception e) {
// this is best effort optimization, bail out in error conditions and
// try generate and execute slower plan
Log.debug("Failed to optimize using metadata optimizer", e);
return null;
}
-
- List<List<Object>> allRows = new ArrayList<List<Object>>();
- allRows.add(oneRow);
-
- List<String> colNames = new ArrayList<String>();
- for (ColumnInfo colInfo: gbyOp.getSchema().getSignature()) {
- colNames.add(colInfo.getInternalName());
- }
- StandardStructObjectInspector sOI = ObjectInspectorFactory.
- getStandardStructObjectInspector(colNames, ois);
- FetchWork fWork = new FetchWork(allRows, sOI);
- FetchTask fTask = (FetchTask)TaskFactory.get(fWork, pctx.getConf());
- fWork.setLimit(allRows.size());
- pctx.setFetchTask(fTask);
-
- return null;
}
private ColumnStatisticsData validateSingleColStat(List<ColumnStatisticsObj> statObj) {
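
For contrast, a minimal sketch of a query that remains eligible for the metadata-only path
(illustrative, not part of this patch; assumes hive.compute.query.using.stats=true and
up-to-date basic stats):

  -- no distinct involved; the metastore row count can be
  -- returned directly through a FetchTask
  select count(1) from t1;
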
Added: hive/trunk/ql/src/test/queries/clientpositive/distinct_stats.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/distinct_stats.q?rev=1573591&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/distinct_stats.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/distinct_stats.q Mon Mar 3 16:02:15 2014
@@ -0,0 +1,20 @@
+set hive.stats.autogather=true;
+
+set hive.compute.query.using.stats=true;
+create table t1 (a string, b string);
+
+insert into table t1 select * from src;
+
+analyze table t1 compute statistics for columns a,b;
+
+explain
+select count(distinct b) from t1 group by a;
+
+explain
+select distinct(b) from t1;
+
+explain
+select a, count(*) from t1 group by a;
+
+drop table t1;
+set hive.compute.query.using.stats = false;
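
In the expected output below, each of the three EXPLAINs shows a full Map Reduce stage
(Stage-1) rather than a Fetch-only plan, i.e. the stats optimizer declined these queries.
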
Added: hive/trunk/ql/src/test/results/clientpositive/distinct_stats.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/distinct_stats.q.out?rev=1573591&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/distinct_stats.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/distinct_stats.q.out Mon Mar 3 16:02:15 2014
@@ -0,0 +1,208 @@
+PREHOOK: query: create table t1 (a string, b string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: create table t1 (a string, b string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+PREHOOK: query: insert into table t1 select * from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@t1
+POSTHOOK: query: insert into table t1 select * from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.a SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1.b SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: analyze table t1 compute statistics for columns a,b
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table t1 compute statistics for columns a,b
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.a SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1.b SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: explain
+select count(distinct b) from t1 group by a
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select count(distinct b) from t1 group by a
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.a SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1.b SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: a (type: string), b (type: string)
+ outputColumnNames: a, b
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count(DISTINCT b)
+ keys: a (type: string), b (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string), _col1 (type: string)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col2 (type: bigint)
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(DISTINCT KEY._col1:0._col0)
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col1 (type: bigint)
+ outputColumnNames: _col0
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+PREHOOK: query: explain
+select distinct(b) from t1
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select distinct(b) from t1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.a SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1.b SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: b (type: string)
+ outputColumnNames: b
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: b (type: string)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+PREHOOK: query: explain
+select a, count(*) from t1 group by a
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select a, count(*) from t1 group by a
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.a SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1.b SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: a (type: string)
+ outputColumnNames: a
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ keys: a (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: bigint)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+PREHOOK: query: drop table t1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: drop table t1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.a SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1.b SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]