kasakrisz commented on code in PR #6043:
URL: https://github.com/apache/hive/pull/6043#discussion_r2351546961
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java:
##########
@@ -676,6 +689,249 @@ public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx,
}
}
+ /**
+ * PTF processor
+ */
+ public static class PTFLineage implements SemanticNodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx
procCtx, Object... nodeOutputs) throws SemanticException {
+ // LineageCTx
+ LineageCtx lCtx = (LineageCtx) procCtx;
+
+ // The operators
+ @SuppressWarnings("unchecked")
+ PTFOperator op = (PTFOperator)nd;
+ Operator<? extends OperatorDesc> inpOp = getParent(stack);
+ lCtx.getIndex().copyPredicates(inpOp, op);
+
+ Dependency dep = new Dependency();
+ DependencyType new_type = DependencyType.EXPRESSION;
Review Comment:
`new_type` -> `newType`
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java:
##########
@@ -676,6 +689,249 @@ public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx,
}
}
+ /**
+ * PTF processor
+ */
+ public static class PTFLineage implements SemanticNodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx
procCtx, Object... nodeOutputs) throws SemanticException {
+ // LineageCTx
+ LineageCtx lCtx = (LineageCtx) procCtx;
+
+ // The operators
+ @SuppressWarnings("unchecked")
+ PTFOperator op = (PTFOperator)nd;
+ Operator<? extends OperatorDesc> inpOp = getParent(stack);
+ lCtx.getIndex().copyPredicates(inpOp, op);
+
+ Dependency dep = new Dependency();
+ DependencyType new_type = DependencyType.EXPRESSION;
+ dep.setType(new_type);
+
+ Set<String> columns = new HashSet<>();
+ PartitionedTableFunctionDef funcDef = op.getConf().getFuncDef();
+ StringBuilder sb = new StringBuilder();
+ WindowFrameDef windowFrameDef = null;
+
+ if (!(funcDef.getTFunction() instanceof Noop)) {
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ // function name
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ sb.append(windowFunctionDef.getName()).append("(");
+
+ addArgs(sb, columns, lCtx, inpOp, op.getSchema(),
windowFunctionDef.getArgs());
+
+ } else /* PartitionedTableFunctionDef */ {
+ // function name
+ sb.append(funcDef.getName()).append("(");
+ addArgs(sb, columns, lCtx, inpOp,
funcDef.getRawInputShape().getRr().getRowSchema(), funcDef.getArgs());
+ }
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ windowFrameDef = windowFunctionDef.getWindowFrame();
+
+ if (sb.charAt(sb.length() - 2) == ',') {
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ sb.append(")");
+ sb.append(" over (");
+ } else {
+ // matchpath has argument pattern like matchpath(<input expression>,
<argument methods: arg1(), arg2()...>)
+ if (funcDef.getInput() != null) {
+ sb.append("on ").append(funcDef.getInput().getAlias()).append(" ");
+
+ int counter = 1;
+ for (PTFExpressionDef arg : funcDef.getArgs()) {
+ ExprNodeDesc exprNode = arg.getExprNode();
+
+ addIfNotNull(columns, exprNode.getCols());
+
+ sb.append("arg").append(counter++).append("(");
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
arg.getExprNode(), lCtx, inpOp, null));
+ sb.append("), ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ }
+ }
+
+ /*
+ Collect partition by and distribute by information.
+ Please note, at the expression node level, there is no difference
between those.
+ That means distribute by gets a string partition by in the expression
string.
+ */
+ if (funcDef.getPartition() != null ) {
+ List<PTFExpressionDef> partitionExpressions =
funcDef.getPartition().getExpressions();
+
+ boolean isPartitionByAdded = false;
+ for (PTFExpressionDef partitionExpr : partitionExpressions) {
+ ExprNodeDesc partitionExprNode = partitionExpr.getExprNode();
+
+ if (partitionExprNode.getCols() != null &&
!partitionExprNode.getCols().isEmpty()) {
+ if (!isPartitionByAdded) {
+ sb.append("partition by ");
+ isPartitionByAdded = true;
+ }
+
+ addIfNotNull(columns, partitionExprNode.getCols());
+
+ if (partitionExprNode instanceof ExprNodeColumnDesc) {
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
partitionExprNode, lCtx, inpOp, null));
+ sb.append(", ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ }
+
+ }
+
+ /*
+ Collects the order by and sort by information.
+ Please note, at the expression node level, there is no difference
between those.
+ That means sort by gets a string partition by in the expression string.
+ */
+ if (funcDef.getOrder() != null) {
+ /*
+ Order by is sometimes added by the compiler to make the PTF call
deterministic.
+ At this point of the code execution, we don't know if it is added by
the compiler or
+ it was originally part of the query string.
+ */
+ List<OrderExpressionDef> orderExpressions =
funcDef.getOrder().getExpressions();
+
+ if (!sb.isEmpty() && sb.charAt(sb.length() - 1) != '(') {
+ sb.append(" ");
+ }
+ sb.append("order by ");
+
+ for (OrderExpressionDef orderExpr : orderExpressions) {
+ ExprNodeDesc orderExprNode = orderExpr.getExprNode();
+ addIfNotNull(columns, orderExprNode.getCols());
+
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
orderExprNode, lCtx, inpOp, null));
+ if (PTFInvocationSpec.Order.DESC.equals(orderExpr.getOrder())) {
+ sb.append(" desc");
+ }
+ sb.append(", ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ /*
+ Window frame is sometimes added by the compiler to make the PTF call
deterministic.
+ At this point of the code execution, we don't know if it is added by the
compiler or
+ it was originally part of the query string.
+ */
+ if (windowFrameDef != null) {
+ sb.append(" ").append(windowFrameDef.getWindowType()).append(" between
");
+
+ if (windowFrameDef.getStart().isCurrentRow()) {
+ sb.append("current_row");
+ } else {
+ sb.append(windowFrameDef.getStart().isUnbounded() ? "unbounded" :
windowFrameDef.getStart().getAmt() + " preceding");
+ }
+
+ sb.append(" and ");
+
+ if (windowFrameDef.getStart().isCurrentRow()) {
Review Comment:
Should this second call to `getStart()` be `getEnd()`? This branch prints the frame's end boundary.
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java:
##########
@@ -676,6 +689,249 @@ public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx,
}
}
+ /**
+ * PTF processor
+ */
+ public static class PTFLineage implements SemanticNodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx
procCtx, Object... nodeOutputs) throws SemanticException {
+ // LineageCTx
+ LineageCtx lCtx = (LineageCtx) procCtx;
+
+ // The operators
+ @SuppressWarnings("unchecked")
+ PTFOperator op = (PTFOperator)nd;
+ Operator<? extends OperatorDesc> inpOp = getParent(stack);
+ lCtx.getIndex().copyPredicates(inpOp, op);
+
+ Dependency dep = new Dependency();
+ DependencyType new_type = DependencyType.EXPRESSION;
+ dep.setType(new_type);
+
+ Set<String> columns = new HashSet<>();
+ PartitionedTableFunctionDef funcDef = op.getConf().getFuncDef();
+ StringBuilder sb = new StringBuilder();
+ WindowFrameDef windowFrameDef = null;
+
+ if (!(funcDef.getTFunction() instanceof Noop)) {
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ // function name
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ sb.append(windowFunctionDef.getName()).append("(");
+
+ addArgs(sb, columns, lCtx, inpOp, op.getSchema(),
windowFunctionDef.getArgs());
+
+ } else /* PartitionedTableFunctionDef */ {
+ // function name
+ sb.append(funcDef.getName()).append("(");
+ addArgs(sb, columns, lCtx, inpOp,
funcDef.getRawInputShape().getRr().getRowSchema(), funcDef.getArgs());
+ }
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ windowFrameDef = windowFunctionDef.getWindowFrame();
+
+ if (sb.charAt(sb.length() - 2) == ',') {
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ sb.append(")");
+ sb.append(" over (");
+ } else {
+ // matchpath has argument pattern like matchpath(<input expression>,
<argument methods: arg1(), arg2()...>)
+ if (funcDef.getInput() != null) {
+ sb.append("on ").append(funcDef.getInput().getAlias()).append(" ");
+
+ int counter = 1;
+ for (PTFExpressionDef arg : funcDef.getArgs()) {
+ ExprNodeDesc exprNode = arg.getExprNode();
+
+ addIfNotNull(columns, exprNode.getCols());
+
+ sb.append("arg").append(counter++).append("(");
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
arg.getExprNode(), lCtx, inpOp, null));
+ sb.append("), ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ }
+ }
+
+ /*
+ Collect partition by and distribute by information.
+ Please note, at the expression node level, there is no difference
between those.
+ That means distribute by gets a string partition by in the expression
string.
+ */
+ if (funcDef.getPartition() != null ) {
+ List<PTFExpressionDef> partitionExpressions =
funcDef.getPartition().getExpressions();
+
+ boolean isPartitionByAdded = false;
+ for (PTFExpressionDef partitionExpr : partitionExpressions) {
+ ExprNodeDesc partitionExprNode = partitionExpr.getExprNode();
+
+ if (partitionExprNode.getCols() != null &&
!partitionExprNode.getCols().isEmpty()) {
+ if (!isPartitionByAdded) {
+ sb.append("partition by ");
+ isPartitionByAdded = true;
+ }
+
+ addIfNotNull(columns, partitionExprNode.getCols());
+
+ if (partitionExprNode instanceof ExprNodeColumnDesc) {
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
partitionExprNode, lCtx, inpOp, null));
+ sb.append(", ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ }
+
+ }
+
+ /*
+ Collects the order by and sort by information.
+ Please note, at the expression node level, there is no difference
between those.
+ That means sort by gets a string partition by in the expression string.
+ */
+ if (funcDef.getOrder() != null) {
+ /*
+ Order by is sometimes added by the compiler to make the PTF call
deterministic.
+ At this point of the code execution, we don't know if it is added by
the compiler or
+ it was originally part of the query string.
+ */
+ List<OrderExpressionDef> orderExpressions =
funcDef.getOrder().getExpressions();
+
+ if (!sb.isEmpty() && sb.charAt(sb.length() - 1) != '(') {
+ sb.append(" ");
+ }
+ sb.append("order by ");
+
+ for (OrderExpressionDef orderExpr : orderExpressions) {
+ ExprNodeDesc orderExprNode = orderExpr.getExprNode();
+ addIfNotNull(columns, orderExprNode.getCols());
+
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
orderExprNode, lCtx, inpOp, null));
+ if (PTFInvocationSpec.Order.DESC.equals(orderExpr.getOrder())) {
+ sb.append(" desc");
+ }
+ sb.append(", ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ /*
+ Window frame is sometimes added by the compiler to make the PTF call
deterministic.
+ At this point of the code execution, we don't know if it is added by the
compiler or
+ it was originally part of the query string.
+ */
+ if (windowFrameDef != null) {
+ sb.append(" ").append(windowFrameDef.getWindowType()).append(" between
");
+
+ if (windowFrameDef.getStart().isCurrentRow()) {
+ sb.append("current_row");
+ } else {
+ sb.append(windowFrameDef.getStart().isUnbounded() ? "unbounded" :
windowFrameDef.getStart().getAmt() + " preceding");
+ }
+
+ sb.append(" and ");
+
+ if (windowFrameDef.getStart().isCurrentRow()) {
+ sb.append("current_row");
+ } else {
+ sb.append(windowFrameDef.getStart().isUnbounded() ? "unbounded" :
windowFrameDef.getStart().getAmt() + " following");
+ }
+ }
+
+ sb.append(")");
+ dep.setExpr(sb.toString());
+
+ LinkedHashSet<BaseColumnInfo> col_set = new LinkedHashSet<>();
Review Comment:
`col_set` -> `colSet`
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java:
##########
@@ -676,6 +689,249 @@ public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx,
}
}
+ /**
+ * PTF processor
+ */
+ public static class PTFLineage implements SemanticNodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx
procCtx, Object... nodeOutputs) throws SemanticException {
+ // LineageCTx
+ LineageCtx lCtx = (LineageCtx) procCtx;
+
+ // The operators
+ @SuppressWarnings("unchecked")
+ PTFOperator op = (PTFOperator)nd;
+ Operator<? extends OperatorDesc> inpOp = getParent(stack);
+ lCtx.getIndex().copyPredicates(inpOp, op);
+
+ Dependency dep = new Dependency();
+ DependencyType new_type = DependencyType.EXPRESSION;
+ dep.setType(new_type);
+
+ Set<String> columns = new HashSet<>();
+ PartitionedTableFunctionDef funcDef = op.getConf().getFuncDef();
+ StringBuilder sb = new StringBuilder();
+ WindowFrameDef windowFrameDef = null;
+
+ if (!(funcDef.getTFunction() instanceof Noop)) {
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ // function name
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ sb.append(windowFunctionDef.getName()).append("(");
+
+ addArgs(sb, columns, lCtx, inpOp, op.getSchema(),
windowFunctionDef.getArgs());
+
+ } else /* PartitionedTableFunctionDef */ {
+ // function name
+ sb.append(funcDef.getName()).append("(");
+ addArgs(sb, columns, lCtx, inpOp,
funcDef.getRawInputShape().getRr().getRowSchema(), funcDef.getArgs());
+ }
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ windowFrameDef = windowFunctionDef.getWindowFrame();
+
+ if (sb.charAt(sb.length() - 2) == ',') {
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ sb.append(")");
+ sb.append(" over (");
+ } else {
+ // matchpath has argument pattern like matchpath(<input expression>,
<argument methods: arg1(), arg2()...>)
+ if (funcDef.getInput() != null) {
+ sb.append("on ").append(funcDef.getInput().getAlias()).append(" ");
+
+ int counter = 1;
+ for (PTFExpressionDef arg : funcDef.getArgs()) {
+ ExprNodeDesc exprNode = arg.getExprNode();
+
+ addIfNotNull(columns, exprNode.getCols());
+
+ sb.append("arg").append(counter++).append("(");
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
arg.getExprNode(), lCtx, inpOp, null));
+ sb.append("), ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ }
Review Comment:
Could you please merge the `if` blocks and the `else` blocks of these two `if` statements?
Both have the same condition:
```
(funcDef instanceof WindowTableFunctionDef)
```
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java:
##########
@@ -676,6 +689,249 @@ public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx,
}
}
+ /**
+ * PTF processor
+ */
+ public static class PTFLineage implements SemanticNodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx
procCtx, Object... nodeOutputs) throws SemanticException {
+ // LineageCTx
+ LineageCtx lCtx = (LineageCtx) procCtx;
+
+ // The operators
+ @SuppressWarnings("unchecked")
+ PTFOperator op = (PTFOperator)nd;
+ Operator<? extends OperatorDesc> inpOp = getParent(stack);
+ lCtx.getIndex().copyPredicates(inpOp, op);
+
+ Dependency dep = new Dependency();
+ DependencyType new_type = DependencyType.EXPRESSION;
+ dep.setType(new_type);
+
+ Set<String> columns = new HashSet<>();
+ PartitionedTableFunctionDef funcDef = op.getConf().getFuncDef();
+ StringBuilder sb = new StringBuilder();
+ WindowFrameDef windowFrameDef = null;
+
+ if (!(funcDef.getTFunction() instanceof Noop)) {
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ // function name
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ sb.append(windowFunctionDef.getName()).append("(");
+
+ addArgs(sb, columns, lCtx, inpOp, op.getSchema(),
windowFunctionDef.getArgs());
+
+ } else /* PartitionedTableFunctionDef */ {
+ // function name
+ sb.append(funcDef.getName()).append("(");
+ addArgs(sb, columns, lCtx, inpOp,
funcDef.getRawInputShape().getRr().getRowSchema(), funcDef.getArgs());
+ }
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ windowFrameDef = windowFunctionDef.getWindowFrame();
+
+ if (sb.charAt(sb.length() - 2) == ',') {
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ sb.append(")");
+ sb.append(" over (");
+ } else {
+ // matchpath has argument pattern like matchpath(<input expression>,
<argument methods: arg1(), arg2()...>)
+ if (funcDef.getInput() != null) {
+ sb.append("on ").append(funcDef.getInput().getAlias()).append(" ");
+
+ int counter = 1;
+ for (PTFExpressionDef arg : funcDef.getArgs()) {
+ ExprNodeDesc exprNode = arg.getExprNode();
+
+ addIfNotNull(columns, exprNode.getCols());
+
+ sb.append("arg").append(counter++).append("(");
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
arg.getExprNode(), lCtx, inpOp, null));
+ sb.append("), ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ }
+ }
+
+ /*
+ Collect partition by and distribute by information.
+ Please note, at the expression node level, there is no difference
between those.
+ That means distribute by gets a string partition by in the expression
string.
+ */
+ if (funcDef.getPartition() != null ) {
+ List<PTFExpressionDef> partitionExpressions =
funcDef.getPartition().getExpressions();
+
+ boolean isPartitionByAdded = false;
+ for (PTFExpressionDef partitionExpr : partitionExpressions) {
+ ExprNodeDesc partitionExprNode = partitionExpr.getExprNode();
+
+ if (partitionExprNode.getCols() != null &&
!partitionExprNode.getCols().isEmpty()) {
+ if (!isPartitionByAdded) {
+ sb.append("partition by ");
+ isPartitionByAdded = true;
+ }
+
+ addIfNotNull(columns, partitionExprNode.getCols());
+
+ if (partitionExprNode instanceof ExprNodeColumnDesc) {
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
partitionExprNode, lCtx, inpOp, null));
+ sb.append(", ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ }
+
+ }
+
+ /*
+ Collects the order by and sort by information.
+ Please note, at the expression node level, there is no difference
between those.
+ That means sort by gets a string partition by in the expression string.
+ */
+ if (funcDef.getOrder() != null) {
+ /*
+ Order by is sometimes added by the compiler to make the PTF call
deterministic.
+ At this point of the code execution, we don't know if it is added by
the compiler or
+ it was originally part of the query string.
+ */
+ List<OrderExpressionDef> orderExpressions =
funcDef.getOrder().getExpressions();
+
+ if (!sb.isEmpty() && sb.charAt(sb.length() - 1) != '(') {
+ sb.append(" ");
+ }
+ sb.append("order by ");
+
+ for (OrderExpressionDef orderExpr : orderExpressions) {
+ ExprNodeDesc orderExprNode = orderExpr.getExprNode();
+ addIfNotNull(columns, orderExprNode.getCols());
+
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
orderExprNode, lCtx, inpOp, null));
+ if (PTFInvocationSpec.Order.DESC.equals(orderExpr.getOrder())) {
+ sb.append(" desc");
+ }
+ sb.append(", ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ /*
+ Window frame is sometimes added by the compiler to make the PTF call
deterministic.
+ At this point of the code execution, we don't know if it is added by the
compiler or
+ it was originally part of the query string.
+ */
+ if (windowFrameDef != null) {
+ sb.append(" ").append(windowFrameDef.getWindowType()).append(" between
");
+
+ if (windowFrameDef.getStart().isCurrentRow()) {
+ sb.append("current_row");
+ } else {
+ sb.append(windowFrameDef.getStart().isUnbounded() ? "unbounded" :
windowFrameDef.getStart().getAmt() + " preceding");
+ }
+
+ sb.append(" and ");
+
+ if (windowFrameDef.getStart().isCurrentRow()) {
+ sb.append("current_row");
+ } else {
+ sb.append(windowFrameDef.getStart().isUnbounded() ? "unbounded" :
windowFrameDef.getStart().getAmt() + " following");
+ }
Review Comment:
This code is duplicated. How about extracting it into a method like
```
appendBoundary(BoundaryDef boundaryDef, String precedingOrFollowing)
```
or just
```
appendBoundary(BoundaryDef boundaryDef)
```
because `BoundaryDef` seems to have all the data we need to print.
##########
ql/src/test/queries/clientpositive/lineage_ptf.q:
##########
@@ -0,0 +1,142 @@
+set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.LineageLogger;
+
Review Comment:
Is there a test case where a column is referenced more than once in a window
expression? For example:
```
sum(src.col_001) over (partition by src.col_001 order by src.col_001)
```
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/lineage/OpProcFactory.java:
##########
@@ -676,6 +689,249 @@ public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx,
}
}
+ /**
+ * PTF processor
+ */
+ public static class PTFLineage implements SemanticNodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx
procCtx, Object... nodeOutputs) throws SemanticException {
+ // LineageCTx
+ LineageCtx lCtx = (LineageCtx) procCtx;
+
+ // The operators
+ @SuppressWarnings("unchecked")
+ PTFOperator op = (PTFOperator)nd;
+ Operator<? extends OperatorDesc> inpOp = getParent(stack);
+ lCtx.getIndex().copyPredicates(inpOp, op);
+
+ Dependency dep = new Dependency();
+ DependencyType new_type = DependencyType.EXPRESSION;
+ dep.setType(new_type);
+
+ Set<String> columns = new HashSet<>();
+ PartitionedTableFunctionDef funcDef = op.getConf().getFuncDef();
+ StringBuilder sb = new StringBuilder();
+ WindowFrameDef windowFrameDef = null;
+
+ if (!(funcDef.getTFunction() instanceof Noop)) {
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ // function name
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ sb.append(windowFunctionDef.getName()).append("(");
+
+ addArgs(sb, columns, lCtx, inpOp, op.getSchema(),
windowFunctionDef.getArgs());
+
+ } else /* PartitionedTableFunctionDef */ {
+ // function name
+ sb.append(funcDef.getName()).append("(");
+ addArgs(sb, columns, lCtx, inpOp,
funcDef.getRawInputShape().getRr().getRowSchema(), funcDef.getArgs());
+ }
+
+ if (funcDef instanceof WindowTableFunctionDef) {
+ WindowFunctionDef windowFunctionDef = ((WindowTableFunctionDef)
funcDef).getWindowFunctions().getFirst();
+ windowFrameDef = windowFunctionDef.getWindowFrame();
+
+ if (sb.charAt(sb.length() - 2) == ',') {
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ sb.append(")");
+ sb.append(" over (");
+ } else {
+ // matchpath has argument pattern like matchpath(<input expression>,
<argument methods: arg1(), arg2()...>)
+ if (funcDef.getInput() != null) {
+ sb.append("on ").append(funcDef.getInput().getAlias()).append(" ");
+
+ int counter = 1;
+ for (PTFExpressionDef arg : funcDef.getArgs()) {
+ ExprNodeDesc exprNode = arg.getExprNode();
+
+ addIfNotNull(columns, exprNode.getCols());
+
+ sb.append("arg").append(counter++).append("(");
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
arg.getExprNode(), lCtx, inpOp, null));
+ sb.append("), ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ }
+ }
+
+ /*
+ Collect partition by and distribute by information.
+ Please note, at the expression node level, there is no difference
between those.
+ That means distribute by gets a string partition by in the expression
string.
+ */
+ if (funcDef.getPartition() != null ) {
+ List<PTFExpressionDef> partitionExpressions =
funcDef.getPartition().getExpressions();
+
+ boolean isPartitionByAdded = false;
+ for (PTFExpressionDef partitionExpr : partitionExpressions) {
+ ExprNodeDesc partitionExprNode = partitionExpr.getExprNode();
+
+ if (partitionExprNode.getCols() != null &&
!partitionExprNode.getCols().isEmpty()) {
+ if (!isPartitionByAdded) {
+ sb.append("partition by ");
+ isPartitionByAdded = true;
+ }
+
+ addIfNotNull(columns, partitionExprNode.getCols());
+
+ if (partitionExprNode instanceof ExprNodeColumnDesc) {
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
partitionExprNode, lCtx, inpOp, null));
+ sb.append(", ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+ }
+
+ }
+
+ /*
+ Collects the order by and sort by information.
+ Please note, at the expression node level, there is no difference
between those.
+ That means sort by gets a string partition by in the expression string.
+ */
+ if (funcDef.getOrder() != null) {
+ /*
+ Order by is sometimes added by the compiler to make the PTF call
deterministic.
+ At this point of the code execution, we don't know if it is added by
the compiler or
+ it was originally part of the query string.
+ */
+ List<OrderExpressionDef> orderExpressions =
funcDef.getOrder().getExpressions();
+
+ if (!sb.isEmpty() && sb.charAt(sb.length() - 1) != '(') {
+ sb.append(" ");
+ }
+ sb.append("order by ");
+
+ for (OrderExpressionDef orderExpr : orderExpressions) {
+ ExprNodeDesc orderExprNode = orderExpr.getExprNode();
+ addIfNotNull(columns, orderExprNode.getCols());
+
+
sb.append(ExprProcFactory.getExprString(funcDef.getRawInputShape().getRr().getRowSchema(),
orderExprNode, lCtx, inpOp, null));
+ if (PTFInvocationSpec.Order.DESC.equals(orderExpr.getOrder())) {
+ sb.append(" desc");
+ }
+ sb.append(", ");
+ }
+
+ sb.delete(sb.length() - 2, sb.length());
+ }
+
+ /*
+ Window frame is sometimes added by the compiler to make the PTF call
deterministic.
+ At this point of the code execution, we don't know if it is added by the
compiler or
+ it was originally part of the query string.
+ */
+ if (windowFrameDef != null) {
+ sb.append(" ").append(windowFrameDef.getWindowType()).append(" between
");
+
+ if (windowFrameDef.getStart().isCurrentRow()) {
+ sb.append("current_row");
+ } else {
+ sb.append(windowFrameDef.getStart().isUnbounded() ? "unbounded" :
windowFrameDef.getStart().getAmt() + " preceding");
+ }
+
+ sb.append(" and ");
+
+ if (windowFrameDef.getStart().isCurrentRow()) {
+ sb.append("current_row");
+ } else {
+ sb.append(windowFrameDef.getStart().isUnbounded() ? "unbounded" :
windowFrameDef.getStart().getAmt() + " following");
+ }
+ }
+
+ sb.append(")");
+ dep.setExpr(sb.toString());
+
+ LinkedHashSet<BaseColumnInfo> col_set = new LinkedHashSet<>();
+ for(ColumnInfo ci : inpOp.getSchema().getSignature()) {
+ Dependency d = lCtx.getIndex().getDependency(inpOp, ci);
+ if (d != null) {
+ new_type = LineageCtx.getNewDependencyType(d.getType(), new_type);
+ if (!ci.isHiddenVirtualCol() &&
columns.contains(ci.getInternalName())) {
+ col_set.addAll(d.getBaseCols());
+ }
+ }
+ }
+
+ dep.setType(new_type);
+ dep.setBaseCols(col_set);
+
+ // This dependency is then set for all the colinfos of the script
operator
+ for(ColumnInfo ci : op.getSchema().getSignature()) {
+ Dependency d = dep;
+ Dependency dep_ci = lCtx.getIndex().getDependency(inpOp, ci);
+ if (dep_ci != null) {
Review Comment:
`dep_ci` -> `depCI`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]