HIVE-14806: Support UDTF in CBO (AST return path) (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/efe9c84e Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/efe9c84e Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/efe9c84e Branch: refs/heads/repl2 Commit: efe9c84e7bd30de537b0b2c0a224cb47b16b7618 Parents: 21a0142 Author: Pengcheng Xiong <pxi...@apache.org> Authored: Tue Oct 4 11:00:00 2016 -0700 Committer: Pengcheng Xiong <pxi...@apache.org> Committed: Tue Oct 4 11:00:00 2016 -0700 ---------------------------------------------------------------------- .../reloperators/HiveTableFunctionScan.java | 77 +++++++ .../calcite/translator/ASTConverter.java | 83 ++++++-- .../translator/SqlFunctionConverter.java | 9 + .../hadoop/hive/ql/parse/CalcitePlanner.java | 211 +++++++++++++++++-- .../clientpositive/allcolref_in_udf.q.out | 80 ++++--- .../clientpositive/lateral_view_noalias.q.out | 12 +- .../test/results/clientpositive/ppd_udtf.q.out | 4 +- .../results/clientpositive/udf_inline.q.out | 12 +- .../results/clientpositive/udtf_explode.q.out | 76 ++++--- .../clientpositive/udtf_json_tuple.q.out | 12 +- .../clientpositive/udtf_parse_url_tuple.q.out | 12 +- 11 files changed, 478 insertions(+), 110 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveTableFunctionScan.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveTableFunctionScan.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveTableFunctionScan.java new file mode 100644 index 0000000..bf4896d --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveTableFunctionScan.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; + +import java.lang.reflect.Type; +import java.util.List; +import java.util.Set; + +import org.apache.calcite.plan.RelOptCluster; +import org.apache.calcite.plan.RelTraitSet; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.TableFunctionScan; +import org.apache.calcite.rel.metadata.RelColumnMapping; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rex.RexNode; +import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; + +public class HiveTableFunctionScan extends TableFunctionScan implements HiveRelNode { + + /** + * @param cluster + * cluster - Cluster that this relational expression belongs to + * @param traitSet + * @param inputs + * inputs - 0 or more relational inputs + * @param rexCall + * rexCall - Function invocation expression + * @param elementType + * elementType - Element type of the collection that will implement + * this table + * @param rowType + * rowType - Row type produced by function + * @param columnMappings + * columnMappings - Column mappings associated with this function + */ + public HiveTableFunctionScan(RelOptCluster cluster, RelTraitSet traitSet, List<RelNode> inputs, + RexNode rexCall, Type elementType, RelDataType rowType, Set<RelColumnMapping> columnMappings) { + super(cluster, traitSet, inputs, rexCall, elementType, rowType, columnMappings); + } + + public static HiveTableFunctionScan create(RelOptCluster cluster, RelTraitSet traitSet, + List<RelNode> inputs, RexNode rexCall, Type elementType, RelDataType rowType, + Set<RelColumnMapping> columnMappings) throws CalciteSemanticException { + HiveTableFunctionScan hiveTableFunctionScan = new HiveTableFunctionScan(cluster, traitSet, + inputs, rexCall, elementType, rowType, columnMappings); + return hiveTableFunctionScan; + } + + @Override + public TableFunctionScan copy(RelTraitSet traitSet, List<RelNode> inputs, RexNode rexCall, + Type elementType, RelDataType rowType, Set<RelColumnMapping> columnMappings) { + HiveTableFunctionScan htfs = new HiveTableFunctionScan(getCluster(), traitSet, inputs, rexCall, + elementType, rowType, columnMappings); + return htfs; + } + + @Override + public void implement(Implementor implementor) { + + } + +} http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java index 9f5e733..8d738aa 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java @@ -35,6 +35,7 @@ import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.SemiJoin; import org.apache.calcite.rel.core.Sort; +import org.apache.calcite.rel.core.TableFunctionScan; import org.apache.calcite.rel.core.TableScan; import org.apache.calcite.rel.core.Union; import org.apache.calcite.rel.type.RelDataTypeField; @@ -60,6 +61,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.druid.DruidQuery; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableFunctionScan; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.SqlFunctionConverter.HiveToken; import org.apache.hadoop.hive.ql.parse.ASTNode; @@ -79,7 +81,7 @@ public class ASTConverter { private Filter where; private Aggregate groupBy; private Filter having; - private Project select; + private RelNode select; private Sort orderLimit; private Schema schema; @@ -192,25 +194,50 @@ public class ASTConverter { */ ASTBuilder b = ASTBuilder.construct(HiveParser.TOK_SELECT, "TOK_SELECT"); - if (select.getChildExps().isEmpty()) { - RexLiteral r = select.getCluster().getRexBuilder().makeExactLiteral(new BigDecimal(1)); - ASTNode selectExpr = ASTBuilder.selectExpr(ASTBuilder.literal(r), "1"); - b.add(selectExpr); - } else { - int i = 0; + if (select instanceof Project) { + if (select.getChildExps().isEmpty()) { + RexLiteral r = select.getCluster().getRexBuilder().makeExactLiteral(new BigDecimal(1)); + ASTNode selectExpr = ASTBuilder.selectExpr(ASTBuilder.literal(r), "1"); + b.add(selectExpr); + } else { + int i = 0; - for (RexNode r : select.getChildExps()) { + for (RexNode r : select.getChildExps()) { + if (RexUtil.isNull(r) && r.getType().getSqlTypeName() != SqlTypeName.NULL) { + // It is NULL value with different type, we need to introduce a CAST + // to keep it + r = select.getCluster().getRexBuilder().makeAbstractCast(r.getType(), r); + } + ASTNode expr = r.accept(new RexVisitor(schema, r instanceof RexLiteral)); + String alias = select.getRowType().getFieldNames().get(i++); + ASTNode selectExpr = ASTBuilder.selectExpr(expr, alias); + b.add(selectExpr); + } + } + hiveAST.select = b.node(); + } else { + // select is UDTF + HiveTableFunctionScan udtf = (HiveTableFunctionScan) select; + List<ASTNode> children = new ArrayList<>(); + RexCall call = (RexCall) udtf.getCall(); + for (RexNode r : call.getOperands()) { if (RexUtil.isNull(r) && r.getType().getSqlTypeName() != SqlTypeName.NULL) { - // It is NULL value with different type, we need to introduce a CAST to keep it + // It is NULL value with different type, we need to introduce a CAST + // to keep it r = select.getCluster().getRexBuilder().makeAbstractCast(r.getType(), r); } ASTNode expr = r.accept(new RexVisitor(schema, r instanceof RexLiteral)); - String alias = select.getRowType().getFieldNames().get(i++); - ASTNode selectExpr = ASTBuilder.selectExpr(expr, alias); - b.add(selectExpr); + children.add(expr); + } + ASTBuilder sel = ASTBuilder.construct(HiveParser.TOK_SELEXPR, "TOK_SELEXPR"); + ASTNode function = buildUDTFAST(call.getOperator().getName(), children); + sel.add(function); + for (String alias : udtf.getRowType().getFieldNames()) { + sel.add(HiveParser.Identifier, alias); } + b.add(sel); + hiveAST.select = b.node(); } - hiveAST.select = b.node(); /* * 7. Order Use in Order By from the block above. RelNode has no pointer to @@ -224,6 +251,14 @@ public class ASTConverter { return hiveAST.getAST(); } + private ASTNode buildUDTFAST(String functionName, List<ASTNode> children) { + ASTNode node = (ASTNode) ParseDriver.adaptor.create(HiveParser.TOK_FUNCTION, "TOK_FUNCTION"); + node.addChild((ASTNode) ParseDriver.adaptor.create(HiveParser.Identifier, functionName)); + for (ASTNode c : children) { + ParseDriver.adaptor.addChild(node, c); + } + return node; + } private void convertOrderLimitToASTNode(HiveSortLimit order) { if (order != null) { HiveSortLimit hiveSortLimit = order; @@ -296,7 +331,11 @@ public class ASTConverter { } private Schema getRowSchema(String tblAlias) { - return new Schema(select, tblAlias); + if (select instanceof Project) { + return new Schema((Project) select, tblAlias); + } else { + return new Schema((TableFunctionScan) select, tblAlias); + } } private QueryBlockInfo convertSource(RelNode r) throws CalciteSemanticException { @@ -375,6 +414,14 @@ public class ASTConverter { } } + public void handle(TableFunctionScan tableFunctionScan) { + if (ASTConverter.this.select == null) { + ASTConverter.this.select = tableFunctionScan; + } else { + ASTConverter.this.from = tableFunctionScan; + } + } + @Override public void visit(RelNode node, int ordinal, RelNode parent) { @@ -384,6 +431,8 @@ public class ASTConverter { handle((Filter) node); } else if (node instanceof Project) { handle((Project) node); + } else if (node instanceof TableFunctionScan) { + handle((TableFunctionScan) node); } else if (node instanceof Join) { ASTConverter.this.from = node; } else if (node instanceof Union) { @@ -644,6 +693,12 @@ public class ASTConverter { } } + Schema(TableFunctionScan select, String alias) { + for (RelDataTypeField field : select.getRowType().getFieldList()) { + add(new ColumnInfo(alias, field.getName())); + } + } + Schema(Union unionRel, String alias) { for (RelDataTypeField field : unionRel.getRowType().getFieldList()) { add(new ColumnInfo(alias, field.getName())); http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java index 53e4a2a..f150132 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java @@ -59,6 +59,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNegative; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPPositive; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; @@ -108,6 +109,14 @@ public class SqlFunctionConverter { return getCalciteFn(name, calciteArgTypes, retType, FunctionRegistry.isDeterministic(hiveUDF)); } + public static SqlOperator getCalciteOperator(String funcTextName, GenericUDTF hiveUDTF, + ImmutableList<RelDataType> calciteArgTypes, RelDataType retType) throws SemanticException { + // We could just do toLowerCase here and let SA qualify it, but + // let's be proper... + String name = FunctionRegistry.getNormalizedFunctionName(funcTextName); + return getCalciteFn(name, calciteArgTypes, retType, false); + } + public static GenericUDF getHiveUDF(SqlOperator op, RelDataType dt, int argsLength) { String name = reverseOperatorMap.get(op); if (name == null) { http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index e2ddb14..e6ab947 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -91,6 +91,7 @@ import org.apache.calcite.sql.SqlExplainLevel; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.SqlLiteral; import org.apache.calcite.sql.SqlNode; +import org.apache.calcite.sql.SqlOperator; import org.apache.calcite.sql.SqlWindow; import org.apache.calcite.sql.parser.SqlParserPos; import org.apache.calcite.sql.type.SqlTypeName; @@ -144,6 +145,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableFunctionScan; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateJoinTransposeRule; @@ -208,8 +210,11 @@ import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; @@ -3017,24 +3022,70 @@ public class CalcitePlanner extends SemanticAnalyzer { throw new CalciteSemanticException(msg, UnsupportedFeature.Select_transform); } - // 5. Bailout if select involves UDTF + // 5. Check if select involves UDTF + String udtfTableAlias = null; + GenericUDTF genericUDTF = null; + String genericUDTFName = null; + ArrayList<String> udtfColAliases = new ArrayList<String>(); ASTNode expr = (ASTNode) selExprList.getChild(posn).getChild(0); int exprType = expr.getType(); if (exprType == HiveParser.TOK_FUNCTION || exprType == HiveParser.TOK_FUNCTIONSTAR) { String funcName = TypeCheckProcFactory.DefaultExprProcessor.getFunctionText(expr, true); FunctionInfo fi = FunctionRegistry.getFunctionInfo(funcName); if (fi != null && fi.getGenericUDTF() != null) { - String msg = String.format("UDTF " + funcName + " is currently not supported in CBO," - + " turn off cbo to use UDTF " + funcName); - LOG.debug(msg); - throw new CalciteSemanticException(msg, UnsupportedFeature.UDTF); + LOG.debug("Find UDTF " + funcName); + genericUDTF = fi.getGenericUDTF(); + genericUDTFName = funcName; + if (genericUDTF != null && (selectStar = exprType == HiveParser.TOK_FUNCTIONSTAR)) { + genColListRegex(".*", null, (ASTNode) expr.getChild(0), + col_list, null, inputRR, starRR, pos, out_rwsch, qb.getAliases(), false); + } + } + } + + if (genericUDTF != null) { + // Only support a single expression when it's a UDTF + if (selExprList.getChildCount() > 1) { + throw new SemanticException(generateErrorMessage( + (ASTNode) selExprList.getChild(1), + ErrorMsg.UDTF_MULTIPLE_EXPR.getMsg())); + } + + ASTNode selExpr = (ASTNode) selExprList.getChild(posn); + + // Get the column / table aliases from the expression. Start from 1 as + // 0 is the TOK_FUNCTION + // column names also can be inferred from result of UDTF + for (int i = 1; i < selExpr.getChildCount(); i++) { + ASTNode selExprChild = (ASTNode) selExpr.getChild(i); + switch (selExprChild.getType()) { + case HiveParser.Identifier: + udtfColAliases.add(unescapeIdentifier(selExprChild.getText().toLowerCase())); + break; + case HiveParser.TOK_TABALIAS: + assert (selExprChild.getChildCount() == 1); + udtfTableAlias = unescapeIdentifier(selExprChild.getChild(0) + .getText()); + qb.addAlias(udtfTableAlias); + break; + default: + throw new SemanticException("Find invalid token type " + selExprChild.getType() + + " in UDTF."); + } } + LOG.debug("UDTF table alias is " + udtfTableAlias); + LOG.debug("UDTF col aliases are " + udtfColAliases); } // 6. Iterate over all expression (after SELECT) - ASTNode exprList = selExprList; - int startPosn = posn; - List<String> tabAliasesForAllProjs = getTabAliases(starRR); + ASTNode exprList; + if (genericUDTF != null) { + exprList = expr; + } else { + exprList = selExprList; + } + // For UDTF's, skip the function name to get the expressions + int startPosn = genericUDTF != null ? posn + 1 : posn; for (int i = startPosn; i < exprList.getChildCount(); ++i) { // 6.1 child can be EXPR AS ALIAS, or EXPR. @@ -3045,7 +3096,7 @@ public class CalcitePlanner extends SemanticAnalyzer { // This check is not needed and invalid when there is a transform b/c // the // AST's are slightly different. - if (child.getChildCount() > 2) { + if (genericUDTF == null && child.getChildCount() > 2) { throw new SemanticException(SemanticAnalyzer.generateErrorMessage( (ASTNode) child.getChild(2), ErrorMsg.INVALID_AS.getMsg())); } @@ -3053,12 +3104,18 @@ public class CalcitePlanner extends SemanticAnalyzer { String tabAlias; String colAlias; - // 6.3 Get rid of TOK_SELEXPR - expr = (ASTNode) child.getChild(0); - String[] colRef = SemanticAnalyzer.getColAlias(child, getAutogenColAliasPrfxLbl(), inputRR, - autogenColAliasPrfxIncludeFuncName(), i); - tabAlias = colRef[0]; - colAlias = colRef[1]; + if (genericUDTF != null) { + tabAlias = null; + colAlias = getAutogenColAliasPrfxLbl() + i; + expr = child; + } else { + // 6.3 Get rid of TOK_SELEXPR + expr = (ASTNode) child.getChild(0); + String[] colRef = SemanticAnalyzer.getColAlias(child, getAutogenColAliasPrfxLbl(), + inputRR, autogenColAliasPrfxIncludeFuncName(), i); + tabAlias = colRef[0]; + colAlias = colRef[1]; + } // 6.4 Build ExprNode corresponding to colums if (expr.getType() == HiveParser.TOK_ALLCOLREF) { @@ -3143,7 +3200,16 @@ public class CalcitePlanner extends SemanticAnalyzer { } // 8. Build Calcite Rel - RelNode outputRel = genSelectRelNode(calciteColLst, out_rwsch, srcRel); + RelNode outputRel = null; + if (genericUDTF != null) { + // The basic idea for CBO support of UDTF is to treat UDTF as a special project. + // In AST return path, as we just need to generate a SEL_EXPR, we just need to remember the expressions and the alias. + // In OP return path, we need to generate a SEL and then a UDTF following old semantic analyzer. + outputRel = genUDTFPlan(genericUDTF, genericUDTFName, udtfTableAlias, udtfColAliases, qb, calciteColLst, out_rwsch, srcRel); + } + else{ + outputRel = genSelectRelNode(calciteColLst, out_rwsch, srcRel); + } // 9. Handle select distinct as GBY if there exist windowing functions if (selForWindow != null && selExprList.getToken().getType() == HiveParser.TOK_SELECTDI) { @@ -3165,6 +3231,119 @@ public class CalcitePlanner extends SemanticAnalyzer { return outputRel; } + private RelNode genUDTFPlan(GenericUDTF genericUDTF, String genericUDTFName, String outputTableAlias, + ArrayList<String> colAliases, QB qb, List<RexNode> selectColLst, RowResolver selectRR, RelNode input) throws SemanticException { + + // No GROUP BY / DISTRIBUTE BY / SORT BY / CLUSTER BY + QBParseInfo qbp = qb.getParseInfo(); + if (!qbp.getDestToGroupBy().isEmpty()) { + throw new SemanticException(ErrorMsg.UDTF_NO_GROUP_BY.getMsg()); + } + if (!qbp.getDestToDistributeBy().isEmpty()) { + throw new SemanticException(ErrorMsg.UDTF_NO_DISTRIBUTE_BY.getMsg()); + } + if (!qbp.getDestToSortBy().isEmpty()) { + throw new SemanticException(ErrorMsg.UDTF_NO_SORT_BY.getMsg()); + } + if (!qbp.getDestToClusterBy().isEmpty()) { + throw new SemanticException(ErrorMsg.UDTF_NO_CLUSTER_BY.getMsg()); + } + if (!qbp.getAliasToLateralViews().isEmpty()) { + throw new SemanticException(ErrorMsg.UDTF_LATERAL_VIEW.getMsg()); + } + + LOG.debug("Table alias: " + outputTableAlias + " Col aliases: " + colAliases); + + // Use the RowResolver from the input operator to generate a input + // ObjectInspector that can be used to initialize the UDTF. Then, the + // resulting output object inspector can be used to make the RowResolver + // for the UDTF operator + ArrayList<ColumnInfo> inputCols = selectRR.getColumnInfos(); + + // Create the object inspector for the input columns and initialize the + // UDTF + ArrayList<String> colNames = new ArrayList<String>(); + ObjectInspector[] colOIs = new ObjectInspector[inputCols.size()]; + for (int i = 0; i < inputCols.size(); i++) { + colNames.add(inputCols.get(i).getInternalName()); + colOIs[i] = inputCols.get(i).getObjectInspector(); + } + StandardStructObjectInspector rowOI = ObjectInspectorFactory + .getStandardStructObjectInspector(colNames, Arrays.asList(colOIs)); + StructObjectInspector outputOI = genericUDTF.initialize(rowOI); + + int numUdtfCols = outputOI.getAllStructFieldRefs().size(); + if (colAliases.isEmpty()) { + // user did not specfied alias names, infer names from outputOI + for (StructField field : outputOI.getAllStructFieldRefs()) { + colAliases.add(field.getFieldName()); + } + } + // Make sure that the number of column aliases in the AS clause matches + // the number of columns output by the UDTF + int numSuppliedAliases = colAliases.size(); + if (numUdtfCols != numSuppliedAliases) { + throw new SemanticException(ErrorMsg.UDTF_ALIAS_MISMATCH.getMsg("expected " + numUdtfCols + + " aliases " + "but got " + numSuppliedAliases)); + } + + // Generate the output column info's / row resolver using internal names. + ArrayList<ColumnInfo> udtfCols = new ArrayList<ColumnInfo>(); + + Iterator<String> colAliasesIter = colAliases.iterator(); + for (StructField sf : outputOI.getAllStructFieldRefs()) { + + String colAlias = colAliasesIter.next(); + assert (colAlias != null); + + // Since the UDTF operator feeds into a LVJ operator that will rename + // all the internal names, we can just use field name from the UDTF's OI + // as the internal name + ColumnInfo col = new ColumnInfo(sf.getFieldName(), + TypeInfoUtils.getTypeInfoFromObjectInspector(sf.getFieldObjectInspector()), + outputTableAlias, false); + udtfCols.add(col); + } + + // Create the row resolver for this operator from the output columns + RowResolver out_rwsch = new RowResolver(); + for (int i = 0; i < udtfCols.size(); i++) { + out_rwsch.put(outputTableAlias, colAliases.get(i), udtfCols.get(i)); + } + + // Add the UDTFOperator to the operator DAG + RelTraitSet traitSet = TraitsUtil.getDefaultTraitSet(cluster); + + // Build row type from field <type, name> + RelDataType retType = TypeConverter.getType(cluster, out_rwsch, null); + + Builder<RelDataType> argTypeBldr = ImmutableList.<RelDataType> builder(); + + RexBuilder rexBuilder = cluster.getRexBuilder(); + RelDataTypeFactory dtFactory = rexBuilder.getTypeFactory(); + RowSchema rs = selectRR.getRowSchema(); + for (ColumnInfo ci : rs.getSignature()) { + argTypeBldr.add(TypeConverter.convert(ci.getType(), dtFactory)); + } + + SqlOperator calciteOp = SqlFunctionConverter.getCalciteOperator(genericUDTFName, genericUDTF, + argTypeBldr.build(), retType); + + // Hive UDTF only has a single input + List<RelNode> list = new ArrayList<>(); + list.add(input); + + RexNode rexNode = cluster.getRexBuilder().makeCall(calciteOp, selectColLst); + + RelNode udtf = HiveTableFunctionScan.create(cluster, traitSet, list, rexNode, null, retType, + null); + // Add new rel & its RR to the maps + relToHiveColNameCalcitePosMap.put(udtf, this.buildHiveToCalciteColumnMap(out_rwsch, udtf)); + relToHiveRR.put(udtf, out_rwsch); + + return udtf; + } + private RelNode genLogicalPlan(QBExpr qbexpr) throws SemanticException { if (qbexpr.getOpcode() == QBExpr.Opcode.NULLOP) { return genLogicalPlan(qbexpr.getQB(), false); http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/test/results/clientpositive/allcolref_in_udf.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/allcolref_in_udf.q.out b/ql/src/test/results/clientpositive/allcolref_in_udf.q.out index eda49ed..ecd784e 100644 --- a/ql/src/test/results/clientpositive/allcolref_in_udf.q.out +++ b/ql/src/test/results/clientpositive/allcolref_in_udf.q.out @@ -85,33 +85,41 @@ STAGE PLANS: alias: a Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((key + 1) is not null and (key < 100)) (type: boolean) + predicate: (UDFToDouble(key) < 100.0) (type: boolean) Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: (key + 1) (type: double) - sort order: + - Map-reduce partition columns: (key + 1) (type: double) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) + Reduce Output Operator + key expressions: (UDFToDouble(_col0) + 1.0) (type: double) + sort order: + + Map-reduce partition columns: (UDFToDouble(_col0) + 1.0) (type: double) + Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) TableScan alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: UDFToDouble(key) is not null (type: boolean) + predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: UDFToDouble(key) (type: double) - sort order: + - Map-reduce partition columns: UDFToDouble(key) (type: double) + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: key (type: string), value (type: string) + Reduce Output Operator + key expressions: UDFToDouble(_col0) (type: double) + sort order: + + Map-reduce partition columns: UDFToDouble(_col0) (type: double) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) Reduce Operator Tree: Join Operator condition map: Inner Join 0 to 1 keys: - 0 (key + 1) (type: double) - 1 UDFToDouble(key) (type: double) + 0 (UDFToDouble(_col0) + 1.0) (type: double) + 1 UDFToDouble(_col0) (type: double) outputColumnNames: _col0, _col1, _col5, _col6 Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE Select Operator @@ -121,16 +129,20 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE function name: stack - Limit - Number of rows: 10 - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: col0 (type: string), col1 (type: string), col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator @@ -195,16 +207,20 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE function name: explode - Limit - Number of rows: 10 - Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: col (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/test/results/clientpositive/lateral_view_noalias.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/lateral_view_noalias.q.out b/ql/src/test/results/clientpositive/lateral_view_noalias.q.out index 7c08b86..e8d23d4 100644 --- a/ql/src/test/results/clientpositive/lateral_view_noalias.q.out +++ b/ql/src/test/results/clientpositive/lateral_view_noalias.q.out @@ -81,10 +81,14 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 500 Data size: 192000 Basic stats: COMPLETE Column stats: COMPLETE function name: explode - Limit - Number of rows: 2 - Statistics: Num rows: 2 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE - ListSink + Select Operator + expressions: key (type: string), value (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 2 + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + ListSink PREHOOK: query: SELECT explode(map('key1', 100, 'key2', 200)) from src limit 2 PREHOOK: type: QUERY http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/test/results/clientpositive/ppd_udtf.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/ppd_udtf.q.out b/ql/src/test/results/clientpositive/ppd_udtf.q.out index 544b80e..d008526 100644 --- a/ql/src/test/results/clientpositive/ppd_udtf.q.out +++ b/ql/src/test/results/clientpositive/ppd_udtf.q.out @@ -24,7 +24,7 @@ STAGE PLANS: alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (key > 400) (type: boolean) + predicate: (UDFToDouble(key) > 400.0) (type: boolean) Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: array(key,value) (type: array<string>) @@ -34,7 +34,7 @@ STAGE PLANS: Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE function name: explode Filter Operator - predicate: (col < 450) (type: boolean) + predicate: (UDFToDouble(col) < 450.0) (type: boolean) Statistics: Num rows: 55 Data size: 584 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: col (type: string) http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/test/results/clientpositive/udf_inline.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/udf_inline.q.out b/ql/src/test/results/clientpositive/udf_inline.q.out index f986abf..dca41d9 100644 --- a/ql/src/test/results/clientpositive/udf_inline.q.out +++ b/ql/src/test/results/clientpositive/udf_inline.q.out @@ -37,10 +37,14 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 500 Data size: 32000 Basic stats: COMPLETE Column stats: COMPLETE function name: inline - Limit - Number of rows: 2 - Statistics: Num rows: 2 Data size: 128 Basic stats: COMPLETE Column stats: COMPLETE - ListSink + Select Operator + expressions: col1 (type: int), col2 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 2 + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + ListSink PREHOOK: query: SELECT inline( ARRAY( http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/test/results/clientpositive/udtf_explode.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/udtf_explode.q.out b/ql/src/test/results/clientpositive/udtf_explode.q.out index de7a2f7..f00c9f4 100644 --- a/ql/src/test/results/clientpositive/udtf_explode.q.out +++ b/ql/src/test/results/clientpositive/udtf_explode.q.out @@ -31,10 +31,14 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 500 Data size: 28000 Basic stats: COMPLETE Column stats: COMPLETE function name: explode - Limit - Number of rows: 3 - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE - ListSink + Select Operator + expressions: col (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 3 + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + ListSink PREHOOK: query: EXPLAIN EXTENDED SELECT a.myCol, count(1) FROM (SELECT explode(array(1,2,3)) AS myCol FROM src LIMIT 3) a GROUP BY a.myCol PREHOOK: type: QUERY @@ -60,16 +64,20 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 500 Data size: 28000 Basic stats: COMPLETE Column stats: COMPLETE function name: explode - Limit - Number of rows: 3 - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - null sort order: - sort order: - Statistics: Num rows: 3 Data size: 168 Basic stats: COMPLETE Column stats: COMPLETE - tag: -1 - value expressions: col (type: int) - auto parallelism: false + Select Operator + expressions: col (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 3 + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + value expressions: _col0 (type: int) + auto parallelism: false Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -118,7 +126,7 @@ STAGE PLANS: name: default.src name: default.src Truncated Path -> Alias: - /src [a:src] + /src [$hdt$_0:$hdt$_0:$hdt$_0:src] Needs Tagging: false Reduce Operator Tree: Select Operator @@ -283,10 +291,14 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 500 Data size: 259500 Basic stats: COMPLETE Column stats: COMPLETE function name: explode - Limit - Number of rows: 3 - Statistics: Num rows: 3 Data size: 1557 Basic stats: COMPLETE Column stats: COMPLETE - ListSink + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 3 + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + ListSink PREHOOK: query: EXPLAIN EXTENDED SELECT a.myKey, a.myVal, count(1) FROM (SELECT explode(map(1,'one',2,'two',3,'three')) as (myKey,myVal) FROM src LIMIT 3) a GROUP BY a.myKey, a.myVal PREHOOK: type: QUERY @@ -312,16 +324,20 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 500 Data size: 259500 Basic stats: COMPLETE Column stats: COMPLETE function name: explode - Limit - Number of rows: 3 - Statistics: Num rows: 3 Data size: 1557 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - null sort order: - sort order: - Statistics: Num rows: 3 Data size: 1557 Basic stats: COMPLETE Column stats: COMPLETE - tag: -1 - value expressions: key (type: int), value (type: string) - auto parallelism: false + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 3 + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + value expressions: _col0 (type: int), _col1 (type: string) + auto parallelism: false Path -> Alias: #### A masked pattern was here #### Path -> Partition: @@ -370,7 +386,7 @@ STAGE PLANS: name: default.src name: default.src Truncated Path -> Alias: - /src [a:src] + /src [$hdt$_0:$hdt$_0:$hdt$_0:src] Needs Tagging: false Reduce Operator Tree: Select Operator http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/test/results/clientpositive/udtf_json_tuple.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/udtf_json_tuple.q.out b/ql/src/test/results/clientpositive/udtf_json_tuple.q.out index e0d4f00..5be6eb0 100644 --- a/ql/src/test/results/clientpositive/udtf_json_tuple.q.out +++ b/ql/src/test/results/clientpositive/udtf_json_tuple.q.out @@ -153,11 +153,15 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 6 Data size: 236 Basic stats: COMPLETE Column stats: NONE function name: json_tuple - Reduce Output Operator - key expressions: c0 (type: string), c1 (type: string), c2 (type: string) - sort order: +++ + Select Operator + expressions: c0 (type: string), c1 (type: string), c2 (type: string), c3 (type: string), c4 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 6 Data size: 236 Basic stats: COMPLETE Column stats: NONE - value expressions: c3 (type: string), c4 (type: string) + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ + Statistics: Num rows: 6 Data size: 236 Basic stats: COMPLETE Column stats: NONE + value expressions: _col3 (type: string), _col4 (type: string) Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), VALUE._col0 (type: string), VALUE._col1 (type: string) http://git-wip-us.apache.org/repos/asf/hive/blob/efe9c84e/ql/src/test/results/clientpositive/udtf_parse_url_tuple.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/udtf_parse_url_tuple.q.out b/ql/src/test/results/clientpositive/udtf_parse_url_tuple.q.out index ad5ea21..ab6cee7 100644 --- a/ql/src/test/results/clientpositive/udtf_parse_url_tuple.q.out +++ b/ql/src/test/results/clientpositive/udtf_parse_url_tuple.q.out @@ -170,11 +170,15 @@ STAGE PLANS: UDTF Operator Statistics: Num rows: 6 Data size: 213 Basic stats: COMPLETE Column stats: NONE function name: parse_url_tuple - Reduce Output Operator - key expressions: c0 (type: string), c1 (type: string), c2 (type: string) - sort order: +++ + Select Operator + expressions: c0 (type: string), c1 (type: string), c2 (type: string), c3 (type: string), c4 (type: string), c5 (type: string), c6 (type: string), c7 (type: string), c8 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 Statistics: Num rows: 6 Data size: 213 Basic stats: COMPLETE Column stats: NONE - value expressions: c3 (type: string), c4 (type: string), c5 (type: string), c6 (type: string), c7 (type: string), c8 (type: string) + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ + Statistics: Num rows: 6 Data size: 213 Basic stats: COMPLETE Column stats: NONE + value expressions: _col3 (type: string), _col4 (type: string), _col5 (type: string), _col6 (type: string), _col7 (type: string), _col8 (type: string) Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: string), VALUE._col4 (type: string), VALUE._col5 (type: string)