This is an automated email from the ASF dual-hosted git repository. timothyfarkas pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
commit efd6d29910d155cd84692ee8aafff3eb06c6e391 Author: jbimbert <jean-blas.imb...@amadeus.com> AuthorDate: Tue Jun 12 19:22:20 2018 +0200 DRILL-5796 : implement ROWS_MATCH enum to keep inside rowgroup the filter result information, used to prune the filter if all rows match. closes #1298 --- .../exec/expr/stat/ParquetBooleanPredicate.java | 48 ++- .../exec/expr/stat/ParquetComparisonPredicate.java | 78 ++--- .../exec/expr/stat/ParquetFilterPredicate.java | 13 +- .../drill/exec/expr/stat/ParquetIsPredicate.java | 125 +++++--- .../drill/exec/expr/stat/RangeExprEvaluator.java | 33 +- .../store/parquet/AbstractParquetGroupScan.java | 8 +- .../exec/store/parquet/ParquetPushDownFilter.java | 18 +- .../store/parquet/ParquetRGFilterEvaluator.java | 52 +++- .../drill/exec/store/parquet/RowGroupInfo.java | 5 + .../parquet/stat/ParquetFooterStatCollector.java | 2 +- .../parquet/stat/ParquetMetaStatCollector.java | 2 +- .../store/parquet/TestParquetFilterPushDown.java | 335 +++++++++++++++------ .../test/resources/parquet/multirowgroup2.parquet | Bin 0 -> 598 bytes .../parquet/multirowgroupwithNulls.parquet | Bin 0 -> 2063 bytes .../resources/parquetFilterPush/tfTbl/ff1.parquet | Bin 0 -> 251 bytes .../resources/parquetFilterPush/tfTbl/ft0.parquet | Bin 0 -> 251 bytes .../resources/parquetFilterPush/tfTbl/tt1.parquet | Bin 0 -> 251 bytes 17 files changed, 510 insertions(+), 209 deletions(-) diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java index fa5c467..f427dc6 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java @@ -46,15 +46,29 @@ public abstract class ParquetBooleanPredicate<C extends Comparable<C>> extends B ExpressionPosition pos ) { return new ParquetBooleanPredicate<C>(name, args, 
pos) { + /** + * Evaluates a compound "AND" filter on the statistics of a RowGroup (the filter reads "filterA and filterB"). + * Return value :<ul> + * <li>ALL : only if all filters return ALL + * <li>NONE : if one filter at least returns NONE + * <li>SOME : all other cases + * </ul> + */ @Override - public boolean canDrop(RangeExprEvaluator<C> evaluator) { - // "and" : as long as one branch is OK to drop, we can drop it. + public RowsMatch matches(RangeExprEvaluator<C> evaluator) { + RowsMatch resultMatch = RowsMatch.ALL; for (LogicalExpression child : this) { - if (child instanceof ParquetFilterPredicate && ((ParquetFilterPredicate)child).canDrop(evaluator)) { - return true; + if (child instanceof ParquetFilterPredicate) { + switch (((ParquetFilterPredicate) child).matches(evaluator)) { + case NONE: + return RowsMatch.NONE; // No row comply to 1 filter part => can drop RG + case SOME: + resultMatch = RowsMatch.SOME; + default: // Do nothing + } } } - return false; + return resultMatch; } }; } @@ -66,15 +80,29 @@ public abstract class ParquetBooleanPredicate<C extends Comparable<C>> extends B ExpressionPosition pos ) { return new ParquetBooleanPredicate<C>(name, args, pos) { + /** + * Evaluates a compound "OR" filter on the statistics of a RowGroup (the filter reads "filterA or filterB"). + * Return value :<ul> + * <li>NONE : only if all filters return NONE + * <li>ALL : if one filter at least returns ALL + * <li>SOME : all other cases + * </ul> + */ @Override - public boolean canDrop(RangeExprEvaluator<C> evaluator) { + public RowsMatch matches(RangeExprEvaluator<C> evaluator) { + RowsMatch resultMatch = RowsMatch.NONE; for (LogicalExpression child : this) { - // "or" : as long as one branch is NOT ok to drop, we can NOT drop it. 
- if (!(child instanceof ParquetFilterPredicate) || !((ParquetFilterPredicate)child).canDrop(evaluator)) { - return false; + if (child instanceof ParquetFilterPredicate) { + switch (((ParquetFilterPredicate) child).matches(evaluator)) { + case ALL: + return RowsMatch.ALL; // One at least is ALL => can drop filter but not RG + case SOME: + resultMatch = RowsMatch.SOME; + default: // Do nothing + } } } - return true; + return resultMatch; } }; } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetComparisonPredicate.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetComparisonPredicate.java index ebceefb..531cbab 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetComparisonPredicate.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetComparisonPredicate.java @@ -26,8 +26,9 @@ import org.apache.parquet.column.statistics.Statistics; import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import java.util.function.BiPredicate; +import java.util.function.BiFunction; +import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.hasNoNulls; import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isNullOrEmpty; import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isAllNulls; @@ -38,12 +39,13 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical implements ParquetFilterPredicate<C> { private final LogicalExpression left; private final LogicalExpression right; - private final BiPredicate<Statistics<C>, Statistics<C>> predicate; + + private final BiFunction<Statistics<C>, Statistics<C>, RowsMatch> predicate; private ParquetComparisonPredicate( LogicalExpression left, LogicalExpression right, - BiPredicate<Statistics<C>, Statistics<C>> predicate + BiFunction<Statistics<C>, Statistics<C>, RowsMatch> predicate ) { super(left.getPosition()); this.left = left; @@ -65,7 +67,7 @@ 
public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical } /** - * Semantics of canDrop() is very similar to what is implemented in Parquet library's + * Semantics of matches() is very similar to what is implemented in Parquet library's * {@link org.apache.parquet.filter2.statisticslevel.StatisticsFilter} and * {@link org.apache.parquet.filter2.predicate.FilterPredicate} * @@ -83,23 +85,29 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical * where Column1 and Column2 are from same parquet table. */ @Override - public boolean canDrop(RangeExprEvaluator<C> evaluator) { + public RowsMatch matches(RangeExprEvaluator<C> evaluator) { Statistics<C> leftStat = left.accept(evaluator, null); if (isNullOrEmpty(leftStat)) { - return false; + return RowsMatch.SOME; } - Statistics<C> rightStat = right.accept(evaluator, null); if (isNullOrEmpty(rightStat)) { - return false; + return RowsMatch.SOME; } - - // if either side is ALL null, = is evaluated to UNKNOWN -> canDrop if (isAllNulls(leftStat, evaluator.getRowCount()) || isAllNulls(rightStat, evaluator.getRowCount())) { - return true; + return RowsMatch.NONE; + } + if (!leftStat.hasNonNullValue() || !rightStat.hasNonNullValue()) { + return RowsMatch.SOME; } + return predicate.apply(leftStat, rightStat); + } - return (leftStat.hasNonNullValue() && rightStat.hasNonNullValue()) && predicate.test(leftStat, rightStat); + /** + * If one rowgroup contains some null values, change the RowsMatch.ALL into RowsMatch.SOME (null values should be discarded by filter) + */ + private static RowsMatch checkNull(Statistics leftStat, Statistics rightStat) { + return !hasNoNulls(leftStat) || !hasNoNulls(rightStat) ? 
RowsMatch.SOME : RowsMatch.ALL; } /** @@ -109,12 +117,9 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical LogicalExpression left, LogicalExpression right ) { - return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> { - // can drop when left's max < right's min, or right's max < left's min - final C leftMin = leftStat.genericGetMin(); - final C rightMin = rightStat.genericGetMin(); - return (leftStat.compareMaxToValue(rightMin) < 0) || (rightStat.compareMaxToValue(leftMin) < 0); - }) { + return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> + leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 || rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0 ? RowsMatch.NONE : RowsMatch.SOME + ) { @Override public String toString() { return left + " = " + right; @@ -130,9 +135,10 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical LogicalExpression right ) { return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> { - // can drop when left's max <= right's min. - final C rightMin = rightStat.genericGetMin(); - return leftStat.compareMaxToValue(rightMin) <= 0; + if (leftStat.compareMaxToValue(rightStat.genericGetMin()) <= 0) { + return RowsMatch.NONE; + } + return leftStat.compareMinToValue(rightStat.genericGetMax()) > 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME; }); } @@ -144,9 +150,10 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical LogicalExpression right ) { return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> { - // can drop when left's max < right's min. - final C rightMin = rightStat.genericGetMin(); - return leftStat.compareMaxToValue(rightMin) < 0; + if (leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0) { + return RowsMatch.NONE; + } + return leftStat.compareMinToValue(rightStat.genericGetMax()) >= 0 ? 
checkNull(leftStat, rightStat) : RowsMatch.SOME; }); } @@ -158,9 +165,10 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical LogicalExpression right ) { return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> { - // can drop when right's max <= left's min. - final C leftMin = leftStat.genericGetMin(); - return rightStat.compareMaxToValue(leftMin) <= 0; + if (rightStat.compareMaxToValue(leftStat.genericGetMin()) <= 0) { + return RowsMatch.NONE; + } + return leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME; }); } @@ -171,9 +179,10 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical LogicalExpression left, LogicalExpression right ) { return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> { - // can drop when right's max < left's min. - final C leftMin = leftStat.genericGetMin(); - return rightStat.compareMaxToValue(leftMin) < 0; + if (rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0) { + return RowsMatch.NONE; + } + return leftStat.compareMaxToValue(rightStat.genericGetMin()) <= 0 ? checkNull(leftStat, rightStat) : RowsMatch.SOME; }); } @@ -185,11 +194,10 @@ public class ParquetComparisonPredicate<C extends Comparable<C>> extends Logical LogicalExpression right ) { return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> { - // can drop when there is only one unique value. 
- final C leftMax = leftStat.genericGetMax(); - final C rightMax = rightStat.genericGetMax(); - return leftStat.compareMinToValue(leftMax) == 0 && rightStat.compareMinToValue(rightMax) == 0 && - leftStat.compareMaxToValue(rightMax) == 0; + if (leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0 || rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0) { + return checkNull(leftStat, rightStat); + } + return leftStat.compareMaxToValue(rightStat.genericGetMax()) == 0 && leftStat.compareMinToValue(rightStat.genericGetMin()) == 0 ? RowsMatch.NONE : RowsMatch.SOME; }); } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetFilterPredicate.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetFilterPredicate.java index 1b7e9e5..c472d48 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetFilterPredicate.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetFilterPredicate.java @@ -18,5 +18,16 @@ package org.apache.drill.exec.expr.stat; public interface ParquetFilterPredicate<T extends Comparable<T>> { - boolean canDrop(RangeExprEvaluator<T> evaluator); + + /** + * Define the validity of a row group against a filter + * <ul> + * <li>ALL : all rows match the filter (can not drop the row group and can prune the filter) + * <li>NONE : no row matches the filter (can drop the row group) + * <li>SOME : some rows only match the filter or the filter can not be applied (can not drop the row group nor the filter) + * </ul> + */ + enum RowsMatch {ALL, NONE, SOME} + + RowsMatch matches(RangeExprEvaluator<T> evaluator); } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java index 42e6e0b..e69dd8b 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java +++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java @@ -19,7 +19,6 @@ package org.apache.drill.exec.expr.stat; import org.apache.drill.common.expression.LogicalExpression; import org.apache.drill.common.expression.LogicalExpressionBase; -import org.apache.drill.common.expression.SchemaPath; import org.apache.drill.common.expression.TypedFieldExpr; import org.apache.drill.common.expression.visitors.ExprVisitor; import org.apache.drill.exec.expr.fn.FunctionGenerationHelper; @@ -29,7 +28,7 @@ import org.apache.parquet.column.statistics.Statistics; import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import java.util.function.BiPredicate; +import java.util.function.BiFunction; import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.hasNoNulls; import static org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isAllNulls; @@ -42,9 +41,10 @@ public class ParquetIsPredicate<C extends Comparable<C>> extends LogicalExpressi implements ParquetFilterPredicate<C> { private final LogicalExpression expr; - private final BiPredicate<Statistics<C>, RangeExprEvaluator<C>> predicate; - private ParquetIsPredicate(LogicalExpression expr, BiPredicate<Statistics<C>, RangeExprEvaluator<C>> predicate) { + private final BiFunction<Statistics<C>, RangeExprEvaluator<C>, RowsMatch> predicate; + + private ParquetIsPredicate(LogicalExpression expr, BiFunction<Statistics<C>, RangeExprEvaluator<C>, RowsMatch> predicate) { super(expr.getPosition()); this.expr = expr; this.predicate = predicate; @@ -62,14 +62,22 @@ public class ParquetIsPredicate<C extends Comparable<C>> extends LogicalExpressi return visitor.visitUnknown(this, value); } - @Override - public boolean canDrop(RangeExprEvaluator<C> evaluator) { + /** + * Apply the filter condition against the meta of the rowgroup. 
+ */ + public RowsMatch matches(RangeExprEvaluator<C> evaluator) { Statistics<C> exprStat = expr.accept(evaluator, null); - if (isNullOrEmpty(exprStat)) { - return false; - } + return isNullOrEmpty(exprStat) ? RowsMatch.SOME : predicate.apply(exprStat, evaluator); + } - return predicate.test(exprStat, evaluator); + /** + * After the applying of the filter against the statistics of the rowgroup, if the result is RowsMatch.ALL, + * then we still must know if the rowgroup contains some null values, because it can change the filter result. + * If it contains some null values, then we change the RowsMatch.ALL into RowsMatch.SOME, which say that maybe + * some values (the null ones) should be discarded. + */ + private static RowsMatch checkNull(Statistics exprStat) { + return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME; } /** @@ -77,26 +85,20 @@ public class ParquetIsPredicate<C extends Comparable<C>> extends LogicalExpressi */ private static <C extends Comparable<C>> LogicalExpression createIsNullPredicate(LogicalExpression expr) { return new ParquetIsPredicate<C>(expr, - //if there are no nulls -> canDrop - (exprStat, evaluator) -> hasNoNulls(exprStat)) { - private final boolean isArray = isArray(expr); - - private boolean isArray(LogicalExpression expression) { - if (expression instanceof TypedFieldExpr) { - TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression; - SchemaPath schemaPath = typedFieldExpr.getPath(); - return schemaPath.isArray(); - } - return false; - } - - @Override - public boolean canDrop(RangeExprEvaluator<C> evaluator) { + (exprStat, evaluator) -> { // for arrays we are not able to define exact number of nulls // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null but statistics shows no nulls - return !isArray && super.canDrop(evaluator); - } - }; + if (expr instanceof TypedFieldExpr) { + TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr; + if (typedFieldExpr.getPath().isArray()) { + return RowsMatch.SOME; + } + } 
+ if (hasNoNulls(exprStat)) { + return RowsMatch.NONE; + } + return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : RowsMatch.SOME; + }); } /** @@ -104,8 +106,7 @@ public class ParquetIsPredicate<C extends Comparable<C>> extends LogicalExpressi */ private static <C extends Comparable<C>> LogicalExpression createIsNotNullPredicate(LogicalExpression expr) { return new ParquetIsPredicate<C>(expr, - //if there are all nulls -> canDrop - (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) + (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : checkNull(exprStat) ); } @@ -113,40 +114,72 @@ public class ParquetIsPredicate<C extends Comparable<C>> extends LogicalExpressi * IS TRUE predicate. */ private static LogicalExpression createIsTruePredicate(LogicalExpression expr) { - return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> - //if max value is not true or if there are all nulls -> canDrop - isAllNulls(exprStat, evaluator.getRowCount()) || exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax() - ); + return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> { + if (isAllNulls(exprStat, evaluator.getRowCount())) { + return RowsMatch.NONE; + } + if (!exprStat.hasNonNullValue()) { + return RowsMatch.SOME; + } + if (!((BooleanStatistics) exprStat).getMax()) { + return RowsMatch.NONE; + } + return ((BooleanStatistics) exprStat).getMin() ? checkNull(exprStat) : RowsMatch.SOME; + }); } /** * IS FALSE predicate. 
*/ private static LogicalExpression createIsFalsePredicate(LogicalExpression expr) { - return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> - //if min value is not false or if there are all nulls -> canDrop - isAllNulls(exprStat, evaluator.getRowCount()) || exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() - ); + return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> { + if (isAllNulls(exprStat, evaluator.getRowCount())) { + return RowsMatch.NONE; + } + if (!exprStat.hasNonNullValue()) { + return RowsMatch.SOME; + } + if (((BooleanStatistics) exprStat).getMin()) { + return RowsMatch.NONE; + } + return ((BooleanStatistics) exprStat).getMax() ? RowsMatch.SOME : checkNull(exprStat); + }); } /** * IS NOT TRUE predicate. */ private static LogicalExpression createIsNotTruePredicate(LogicalExpression expr) { - return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> - //if min value is not false or if there are no nulls -> canDrop - hasNoNulls(exprStat) && exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() - ); + return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> { + if (isAllNulls(exprStat, evaluator.getRowCount())) { + return RowsMatch.ALL; + } + if (!exprStat.hasNonNullValue()) { + return RowsMatch.SOME; + } + if (((BooleanStatistics) exprStat).getMin()) { + return hasNoNulls(exprStat) ? RowsMatch.NONE : RowsMatch.SOME; + } + return ((BooleanStatistics) exprStat).getMax() ? RowsMatch.SOME : RowsMatch.ALL; + }); } /** * IS NOT FALSE predicate. 
*/ private static LogicalExpression createIsNotFalsePredicate(LogicalExpression expr) { - return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> - //if max value is not true or if there are no nulls -> canDrop - hasNoNulls(exprStat) && exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax() - ); + return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> { + if (isAllNulls(exprStat, evaluator.getRowCount())) { + return RowsMatch.ALL; + } + if (!exprStat.hasNonNullValue()) { + return RowsMatch.SOME; + } + if (!((BooleanStatistics) exprStat).getMax()) { + return hasNoNulls(exprStat) ? RowsMatch.NONE : RowsMatch.SOME; + } + return ((BooleanStatistics) exprStat).getMin() ? RowsMatch.ALL : RowsMatch.SOME; + }); } public static <C extends Comparable<C>> LogicalExpression createIsPredicate(String function, LogicalExpression expr) { diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/RangeExprEvaluator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/RangeExprEvaluator.java index f127f0b..2b55e3d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/RangeExprEvaluator.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/RangeExprEvaluator.java @@ -258,21 +258,28 @@ public class RangeExprEvaluator<T extends Comparable<T>> extends AbstractExprVis final ValueHolder minFuncHolder = InterpreterEvaluator.evaluateFunction(interpreter, args1, holderExpr.getName()); final ValueHolder maxFuncHolder = InterpreterEvaluator.evaluateFunction(interpreter, args2, holderExpr.getName()); + Statistics<T> statistics; switch (destType) { - //TODO : need handle # of nulls. 
- case INT: - return getStatistics( ((IntHolder)minFuncHolder).value, ((IntHolder)maxFuncHolder).value); - case BIGINT: - return getStatistics( ((BigIntHolder)minFuncHolder).value, ((BigIntHolder)maxFuncHolder).value); - case FLOAT4: - return getStatistics( ((Float4Holder)minFuncHolder).value, ((Float4Holder)maxFuncHolder).value); - case FLOAT8: - return getStatistics( ((Float8Holder)minFuncHolder).value, ((Float8Holder)maxFuncHolder).value); - case TIMESTAMP: - return getStatistics(((TimeStampHolder) minFuncHolder).value, ((TimeStampHolder) maxFuncHolder).value); - default: - return null; + case INT: + statistics = getStatistics(((IntHolder) minFuncHolder).value, ((IntHolder) maxFuncHolder).value); + break; + case BIGINT: + statistics = getStatistics(((BigIntHolder) minFuncHolder).value, ((BigIntHolder) maxFuncHolder).value); + break; + case FLOAT4: + statistics = getStatistics(((Float4Holder) minFuncHolder).value, ((Float4Holder) maxFuncHolder).value); + break; + case FLOAT8: + statistics = getStatistics(((Float8Holder) minFuncHolder).value, ((Float8Holder) maxFuncHolder).value); + break; + case TIMESTAMP: + statistics = getStatistics(((TimeStampHolder) minFuncHolder).value, ((TimeStampHolder) maxFuncHolder).value); + break; + default: + return null; } + statistics.setNumNulls(input.getNumNulls()); + return statistics; } catch (Exception e) { throw new DrillRuntimeException("Error in evaluating function of " + holderExpr.getName() ); } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java index 33472bb..bf292be 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java @@ -262,12 +262,14 @@ public abstract class AbstractParquetGroupScan extends AbstractFileGroupScan { 
} } - if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) { - continue; + ParquetFilterPredicate.RowsMatch match = ParquetRGFilterEvaluator.matches(filterPredicate, columnStatisticsMap, rowGroup.getRowCount(), parquetTableMetadata, rowGroup.getColumns(), schemaPathsInExpr); + if (match == ParquetFilterPredicate.RowsMatch.NONE) { + continue; // No row comply to the filter => drop the row group } + rowGroup.setRowsMatch(match); qualifiedRGs.add(rowGroup); - qualifiedFilePath.add(rowGroup.getPath()); // TODO : optimize when 1 file contains m row groups. + qualifiedFilePath.add(rowGroup.getPath()); } if (qualifiedRGs.size() == rowGroupInfos.size() ) { diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetPushDownFilter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetPushDownFilter.java index 83ce4d2..b5f0ca4 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetPushDownFilter.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetPushDownFilter.java @@ -29,6 +29,8 @@ import org.apache.calcite.rex.RexNode; import org.apache.calcite.rex.RexUtil; import org.apache.drill.common.expression.LogicalExpression; import org.apache.drill.common.expression.ValueExpressions; +import org.apache.drill.exec.expr.stat.ParquetFilterPredicate; +import org.apache.drill.exec.expr.stat.ParquetFilterPredicate.RowsMatch; import org.apache.drill.exec.ops.OptimizerRulesContext; import org.apache.drill.exec.physical.base.GroupScan; import org.apache.drill.exec.planner.common.DrillRelOptUtil; @@ -165,12 +167,26 @@ public abstract class ParquetPushDownFilter extends StoragePluginOptimizerRule { return; } - RelNode newScan = ScanPrel.create(scan, scan.getTraitSet(), newGroupScan, scan.getRowType());; if (project != null) { newScan = project.copy(project.getTraitSet(), ImmutableList.of(newScan)); } + + if (newGroupScan instanceof 
AbstractParquetGroupScan) { + RowsMatch matchAll = RowsMatch.ALL; + List<RowGroupInfo> rowGroupInfos = ((AbstractParquetGroupScan) newGroupScan).rowGroupInfos; + for (RowGroupInfo rowGroup : rowGroupInfos) { + if (rowGroup.getRowsMatch() != RowsMatch.ALL) { + matchAll = RowsMatch.SOME; + break; + } + } + if (matchAll == ParquetFilterPredicate.RowsMatch.ALL) { + call.transformTo(newScan); + } + } + final RelNode newFilter = filter.copy(filter.getTraitSet(), ImmutableList.<RelNode>of(newScan)); call.transformTo(newFilter); } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRGFilterEvaluator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRGFilterEvaluator.java index 370988b..3e7bc65 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRGFilterEvaluator.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRGFilterEvaluator.java @@ -27,6 +27,7 @@ import org.apache.drill.exec.compile.sig.ConstantExpressionIdentifier; import org.apache.drill.exec.expr.ExpressionTreeMaterializer; import org.apache.drill.exec.expr.fn.FunctionLookupContext; import org.apache.drill.exec.expr.stat.ParquetFilterPredicate; +import org.apache.drill.exec.expr.stat.ParquetFilterPredicate.RowsMatch; import org.apache.drill.exec.expr.stat.RangeExprEvaluator; import org.apache.drill.exec.ops.FragmentContext; import org.apache.drill.exec.ops.UdfUtilities; @@ -37,19 +38,23 @@ import org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Set; +import static org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata; +import static org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetTableMetadataBase; + public class ParquetRGFilterEvaluator { static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(ParquetRGFilterEvaluator.class); - public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, + public static RowsMatch evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext) { final HashMap<String, String> emptyMap = new HashMap<String, String>(); return evalFilter(expr, footer, rowGroupIndex, options, fragmentContext, emptyMap); } - public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, + public static RowsMatch evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) { // figure out the set of columns referenced in expression. final Set<SchemaPath> schemaPathsInExpr = expr.accept(new FieldReferenceFinder(), null); @@ -57,23 +62,19 @@ public class ParquetRGFilterEvaluator { Map<SchemaPath, ColumnStatistics> columnStatisticsMap = columnStatCollector.collectColStat(schemaPathsInExpr); - boolean canDrop = canDrop(expr, columnStatisticsMap, footer.getBlocks().get(rowGroupIndex).getRowCount(), fragmentContext, fragmentContext.getFunctionRegistry()); - return canDrop; + return matches(expr, columnStatisticsMap, footer.getBlocks().get(rowGroupIndex).getRowCount(), fragmentContext, fragmentContext.getFunctionRegistry()); } - - public static boolean canDrop(ParquetFilterPredicate parquetPredicate, Map<SchemaPath, + public static RowsMatch matches(ParquetFilterPredicate parquetPredicate, Map<SchemaPath, ColumnStatistics> columnStatisticsMap, long rowCount) { - boolean canDrop = false; if (parquetPredicate != null) { RangeExprEvaluator rangeExprEvaluator = new RangeExprEvaluator(columnStatisticsMap, rowCount); - canDrop = parquetPredicate.canDrop(rangeExprEvaluator); + return parquetPredicate.matches(rangeExprEvaluator); } - return canDrop; + return RowsMatch.SOME; 
} - - public static boolean canDrop(LogicalExpression expr, Map<SchemaPath, ColumnStatistics> columnStatisticsMap, + public static RowsMatch matches(LogicalExpression expr, Map<SchemaPath, ColumnStatistics> columnStatisticsMap, long rowCount, UdfUtilities udfUtilities, FunctionLookupContext functionImplementationRegistry) { ErrorCollector errorCollector = new ErrorCollectorImpl(); LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr( @@ -82,14 +83,39 @@ public class ParquetRGFilterEvaluator { if (errorCollector.hasErrors()) { logger.error("{} error(s) encountered when materialize filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString()); - return false; + return RowsMatch.SOME; } Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter); ParquetFilterPredicate parquetPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate( materializedFilter, constantBoundaries, udfUtilities); - return canDrop(parquetPredicate, columnStatisticsMap, rowCount); + return matches(parquetPredicate, columnStatisticsMap, rowCount); + } + + public static RowsMatch matches(ParquetFilterPredicate parquetPredicate, Map<SchemaPath, ColumnStatistics> columnStatisticsMap, long rowCount, ParquetTableMetadataBase parquetTableMetadata, List<? extends ColumnMetadata> columnMetadataList, Set<SchemaPath> schemaPathsInExpr) { + RowsMatch temp = matches(parquetPredicate, columnStatisticsMap, rowCount); + return temp == RowsMatch.ALL && isRepeated(schemaPathsInExpr, parquetTableMetadata, columnMetadataList) ? RowsMatch.SOME : temp; + } + + /** + * Check if one of the fields involved in the filter is an array (used in DRILL_6259_test_data). + * + * @return true if one at least is an array, false otherwise. + */ + private static boolean isRepeated(Set<SchemaPath> fields, ParquetTableMetadataBase parquetTableMetadata, List<? 
extends ColumnMetadata> columnMetadataList) { + final Map<SchemaPath, ColumnMetadata> columnMetadataMap = new HashMap<>(); + for (final ColumnMetadata columnMetadata : columnMetadataList) { + SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName()); + columnMetadataMap.put(schemaPath, columnMetadata); + } + for (final SchemaPath field : fields) { + ColumnMetadata columnMetadata = columnMetadataMap.get(field.getUnIndexed()); + if (columnMetadata != null && parquetTableMetadata.getRepetitionLevel(columnMetadata.getName()) >= 1) { + return true; + } + } + return false; } /** diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/RowGroupInfo.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/RowGroupInfo.java index af436d8..7d2143c 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/RowGroupInfo.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/RowGroupInfo.java @@ -19,6 +19,7 @@ package org.apache.drill.exec.store.parquet; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.drill.exec.expr.stat.ParquetFilterPredicate.RowsMatch; import org.apache.drill.exec.store.dfs.ReadEntryFromHDFS; import org.apache.drill.exec.store.dfs.easy.FileWork; import org.apache.drill.exec.store.schedule.CompleteWork; @@ -35,6 +36,7 @@ public class RowGroupInfo extends ReadEntryFromHDFS implements CompleteWork, Fil private List<? extends ColumnMetadata> columns; private long rowCount; // rowCount = -1 indicates to include all rows. 
private long numRecordsToRead; + private RowsMatch rowsMatch = RowsMatch.SOME; @JsonCreator public RowGroupInfo(@JsonProperty("path") String path, @@ -95,4 +97,7 @@ public class RowGroupInfo extends ReadEntryFromHDFS implements CompleteWork, Fil this.columns = columns; } + public RowsMatch getRowsMatch() { return rowsMatch; } + + public void setRowsMatch(RowsMatch rowsMatch) { this.rowsMatch = rowsMatch; } } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetFooterStatCollector.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetFooterStatCollector.java index ac63bda..4e73d6b 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetFooterStatCollector.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetFooterStatCollector.java @@ -59,7 +59,7 @@ public class ParquetFooterStatCollector implements ColumnStatCollector { // Reasons to pass implicit columns and their values: // 1. Differentiate implicit columns from regular non-exist columns. Implicit columns do not // exist in parquet metadata. Without such knowledge, implicit columns is treated as non-exist - // column. A condition on non-exist column would lead to canDrop = true, which is not the + // column. A condition on non-exist column would lead to matches = ALL, which is not the // right behavior for condition on implicit columns. // 2. 
Pass in the implicit column name with corresponding values, and wrap them in Statistics with diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetMetaStatCollector.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetMetaStatCollector.java index 437074e..a46191b 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetMetaStatCollector.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetMetaStatCollector.java @@ -59,7 +59,7 @@ public class ParquetMetaStatCollector implements ColumnStatCollector { // Reasons to pass implicit columns and their values: // 1. Differentiate implicit columns from regular non-exist columns. Implicit columns do not // exist in parquet metadata. Without such knowledge, implicit columns is treated as non-exist - // column. A condition on non-exist column would lead to canDrop = true, which is not the + // column. A condition on non-exist column would lead to matches = ALL, which is not the // right behavior for condition on implicit columns. // 2. 
Pass in the implicit column name with corresponding values, and wrap them in Statistics with diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetFilterPushDown.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetFilterPushDown.java index c871ccc..ea12f40 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetFilterPushDown.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetFilterPushDown.java @@ -20,12 +20,17 @@ package org.apache.drill.exec.store.parquet; import org.apache.commons.io.FileUtils; import org.apache.drill.PlanTestBase; import org.apache.drill.common.expression.LogicalExpression; +import org.apache.drill.exec.expr.fn.FunctionGenerationHelper; +import org.apache.drill.exec.expr.stat.ParquetFilterPredicate.RowsMatch; +import org.apache.drill.exec.expr.stat.ParquetIsPredicate; +import org.apache.drill.exec.expr.stat.RangeExprEvaluator; import org.apache.drill.exec.ops.FragmentContextImpl; import org.apache.drill.exec.planner.physical.PlannerSettings; import org.apache.drill.exec.proto.BitControl; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.metadata.ParquetMetadata; @@ -36,6 +41,8 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestWatcher; import org.junit.runner.Description; +import org.mockito.ArgumentMatchers; +import org.mockito.Mockito; import java.io.File; import java.io.IOException; @@ -57,6 +64,8 @@ public class TestParquetFilterPushDown extends PlanTestBase { dirTestWatcher.copyResourceToRoot(Paths.get("parquetFilterPush")); dirTestWatcher.copyResourceToRoot(Paths.get("parquet", 
"multirowgroup.parquet")); + dirTestWatcher.copyResourceToRoot(Paths.get("parquet", "multirowgroup2.parquet")); + dirTestWatcher.copyResourceToRoot(Paths.get("parquet", "multirowgroupwithNulls.parquet")); } @AfterClass @@ -97,73 +106,75 @@ public class TestParquetFilterPushDown extends PlanTestBase { .toFile(); ParquetMetadata footer = getParquetMetaData(file); - testParquetRowGroupFilterEval(footer, "intCol = 100", false); - testParquetRowGroupFilterEval(footer, "intCol = 0", false); - testParquetRowGroupFilterEval(footer, "intCol = 50", false); + testParquetRowGroupFilterEval(footer, "intCol = 100", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol = 0", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol = 50", RowsMatch.SOME); - testParquetRowGroupFilterEval(footer, "intCol = -1", true); - testParquetRowGroupFilterEval(footer, "intCol = 101", true); + testParquetRowGroupFilterEval(footer, "intCol = -1", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "intCol = 101", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "intCol > 100", true); - testParquetRowGroupFilterEval(footer, "intCol > 99", false); + testParquetRowGroupFilterEval(footer, "intCol > 100", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "intCol > 99", RowsMatch.SOME); - testParquetRowGroupFilterEval(footer, "intCol >= 100", false); - testParquetRowGroupFilterEval(footer, "intCol >= 101", true); + testParquetRowGroupFilterEval(footer, "intCol >= 100", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol >= 101", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "intCol < 100", false); - testParquetRowGroupFilterEval(footer, "intCol < 1", false); - testParquetRowGroupFilterEval(footer, "intCol < 0", true); + testParquetRowGroupFilterEval(footer, "intCol < 100", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol < 1", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol < 0", RowsMatch.NONE); - 
testParquetRowGroupFilterEval(footer, "intCol <= 100", false); - testParquetRowGroupFilterEval(footer, "intCol <= 1", false); - testParquetRowGroupFilterEval(footer, "intCol <= 0", false); - testParquetRowGroupFilterEval(footer, "intCol <= -1", true); + testParquetRowGroupFilterEval(footer, "intCol <= 100", RowsMatch.ALL); + testParquetRowGroupFilterEval(footer, "intCol <= 1", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol <= 0", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol <= -1", RowsMatch.NONE); // "and" - testParquetRowGroupFilterEval(footer, "intCol > 100 and intCol < 200", true); - testParquetRowGroupFilterEval(footer, "intCol > 50 and intCol < 200", false); - testParquetRowGroupFilterEval(footer, "intCol > 50 and intCol > 200", true); // essentially, intCol > 200 + testParquetRowGroupFilterEval(footer, "intCol > 100 and intCol < 200", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "intCol > 50 and intCol < 200", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol > 50 and intCol > 200", RowsMatch.NONE); // essentially, intCol > 200 // "or" - testParquetRowGroupFilterEval(footer, "intCol = 150 or intCol = 160", true); - testParquetRowGroupFilterEval(footer, "intCol = 50 or intCol = 160", false); + testParquetRowGroupFilterEval(footer, "intCol = 150 or intCol = 160", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "intCol = 50 or intCol = 160", RowsMatch.SOME); //"nonExistCol" does not exist in the table. "AND" with a filter on exist column - testParquetRowGroupFilterEval(footer, "intCol > 100 and nonExistCol = 100", true); - testParquetRowGroupFilterEval(footer, "intCol > 50 and nonExistCol = 100", true); // since nonExistCol = 100 -> Unknown -> could drop. - testParquetRowGroupFilterEval(footer, "nonExistCol = 100 and intCol > 50", true); // since nonExistCol = 100 -> Unknown -> could drop. 
- testParquetRowGroupFilterEval(footer, "intCol > 100 and nonExistCol < 'abc'", true); - testParquetRowGroupFilterEval(footer, "nonExistCol < 'abc' and intCol > 100", true); // nonExistCol < 'abc' hit NumberException and is ignored, but intCol >100 will say "drop". - testParquetRowGroupFilterEval(footer, "intCol > 50 and nonExistCol < 'abc'", false); // because nonExistCol < 'abc' hit NumberException and is ignored. + testParquetRowGroupFilterEval(footer, "intCol > 100 and nonExistCol = 100", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "intCol > 50 and nonExistCol = 100", RowsMatch.NONE); // since nonExistCol = 100 -> Unknown -> could drop. + testParquetRowGroupFilterEval(footer, "nonExistCol = 100 and intCol > 50", RowsMatch.NONE); // since nonExistCol = 100 -> Unknown -> could drop. + testParquetRowGroupFilterEval(footer, "intCol > 100 and nonExistCol < 'abc'", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "nonExistCol < 'abc' and intCol > 100", RowsMatch.NONE); // nonExistCol < 'abc' hit NumberException and is ignored, but intCol >100 will + // say "drop". + testParquetRowGroupFilterEval(footer, "intCol > 50 and nonExistCol < 'abc'", RowsMatch.SOME); // because nonExistCol < 'abc' hit NumberException and + // is ignored. //"nonExistCol" does not exist in the table. "OR" with a filter on exist column - testParquetRowGroupFilterEval(footer, "intCol > 100 or nonExistCol = 100", true); // nonExistCol = 100 -> could drop. - testParquetRowGroupFilterEval(footer, "nonExistCol = 100 or intCol > 100", true); // nonExistCol = 100 -> could drop. - testParquetRowGroupFilterEval(footer, "intCol > 50 or nonExistCol < 100", false); - testParquetRowGroupFilterEval(footer, "nonExistCol < 100 or intCol > 50", false); + testParquetRowGroupFilterEval(footer, "intCol > 100 or nonExistCol = 100", RowsMatch.NONE); // nonExistCol = 100 -> could drop. 
+ testParquetRowGroupFilterEval(footer, "nonExistCol = 100 or intCol > 100", RowsMatch.NONE); // nonExistCol = 100 -> could drop. + testParquetRowGroupFilterEval(footer, "intCol > 50 or nonExistCol < 100", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "nonExistCol < 100 or intCol > 50", RowsMatch.SOME); // cast function on column side (LHS) - testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 100", false); - testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 0", false); - testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 50", false); - testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 101", true); - testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = -1", true); + testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 100", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 0", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 50", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 101", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = -1", RowsMatch.NONE); // cast function on constant side (RHS) - testParquetRowGroupFilterEval(footer, "intCol = cast(100 as bigint)", false); - testParquetRowGroupFilterEval(footer, "intCol = cast(0 as bigint)", false); - testParquetRowGroupFilterEval(footer, "intCol = cast(50 as bigint)", false); - testParquetRowGroupFilterEval(footer, "intCol = cast(101 as bigint)", true); - testParquetRowGroupFilterEval(footer, "intCol = cast(-1 as bigint)", true); + testParquetRowGroupFilterEval(footer, "intCol = cast(100 as bigint)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol = cast(0 as bigint)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol = cast(50 as bigint)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "intCol = cast(101 as bigint)", RowsMatch.NONE); + 
testParquetRowGroupFilterEval(footer, "intCol = cast(-1 as bigint)", RowsMatch.NONE); // cast into float4/float8 - testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(101.0 as float4)", true); - testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(-1.0 as float4)", true); - testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(1.0 as float4)", false); + testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(101.0 as float4)", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(-1.0 as float4)", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(1.0 as float4)", RowsMatch.SOME); - testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = 101.0", true); - testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = -1.0", true); - testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = 1.0", false); + testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = 101.0", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = -1.0", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = 1.0", RowsMatch.SOME); } @Test @@ -176,15 +187,15 @@ public class TestParquetFilterPushDown extends PlanTestBase { .toFile(); ParquetMetadata footer = getParquetMetaData(file); - testParquetRowGroupFilterEval(footer, "intCol = 100", true); - testParquetRowGroupFilterEval(footer, "intCol = 0", true); - testParquetRowGroupFilterEval(footer, "intCol = -100", true); + testParquetRowGroupFilterEval(footer, "intCol = 100", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "intCol = 0", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "intCol = -100", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "intCol > 10", true); - testParquetRowGroupFilterEval(footer, "intCol >= 10", true); + testParquetRowGroupFilterEval(footer, "intCol > 10", RowsMatch.NONE); + 
testParquetRowGroupFilterEval(footer, "intCol >= 10", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "intCol < 10", true); - testParquetRowGroupFilterEval(footer, "intCol <= 10", true); + testParquetRowGroupFilterEval(footer, "intCol < 10", RowsMatch.NONE); + testParquetRowGroupFilterEval(footer, "intCol <= 10", RowsMatch.NONE); } @Test @@ -216,21 +227,21 @@ public class TestParquetFilterPushDown extends PlanTestBase { } private void testDatePredicateAgainstDrillCTASHelper(ParquetMetadata footer) throws Exception{ - testParquetRowGroupFilterEval(footer, "o_orderdate = cast('1992-01-01' as date)", false); - testParquetRowGroupFilterEval(footer, "o_orderdate = cast('1991-12-31' as date)", true); + testParquetRowGroupFilterEval(footer, "o_orderdate = cast('1992-01-01' as date)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_orderdate = cast('1991-12-31' as date)", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "o_orderdate >= cast('1991-12-31' as date)", false); - testParquetRowGroupFilterEval(footer, "o_orderdate >= cast('1992-01-03' as date)", false); - testParquetRowGroupFilterEval(footer, "o_orderdate >= cast('1992-01-04' as date)", true); + testParquetRowGroupFilterEval(footer, "o_orderdate >= cast('1991-12-31' as date)", RowsMatch.ALL); + testParquetRowGroupFilterEval(footer, "o_orderdate >= cast('1992-01-03' as date)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_orderdate >= cast('1992-01-04' as date)", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "o_orderdate > cast('1992-01-01' as date)", false); - testParquetRowGroupFilterEval(footer, "o_orderdate > cast('1992-01-03' as date)", true); + testParquetRowGroupFilterEval(footer, "o_orderdate > cast('1992-01-01' as date)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_orderdate > cast('1992-01-03' as date)", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "o_orderdate <= cast('1992-01-01' as date)", false); - 
testParquetRowGroupFilterEval(footer, "o_orderdate <= cast('1991-12-31' as date)", true); + testParquetRowGroupFilterEval(footer, "o_orderdate <= cast('1992-01-01' as date)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_orderdate <= cast('1991-12-31' as date)", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "o_orderdate < cast('1992-01-02' as date)", false); - testParquetRowGroupFilterEval(footer, "o_orderdate < cast('1992-01-01' as date)", true); + testParquetRowGroupFilterEval(footer, "o_orderdate < cast('1992-01-02' as date)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_orderdate < cast('1992-01-01' as date)", RowsMatch.NONE); } @Test @@ -243,25 +254,99 @@ public class TestParquetFilterPushDown extends PlanTestBase { .toFile(); ParquetMetadata footer = getParquetMetaData(file); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 10:20:30' as timestamp)", false); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 10:20:29' as timestamp)", true); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 10:20:30' as timestamp)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 10:20:29' as timestamp)", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-01 10:20:29' as timestamp)", false); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-03 10:20:30' as timestamp)", false); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-03 10:20:31' as timestamp)", true); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-01 10:20:29' as timestamp)", RowsMatch.ALL); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-03 10:20:30' as timestamp)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-03 10:20:31' as timestamp)", RowsMatch.NONE); - 
testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 10:20:29' as timestamp)", false); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 10:20:30' as timestamp)", true); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 10:20:29' as timestamp)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 10:20:30' as timestamp)", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= cast('1992-01-01 10:20:30' as timestamp)", false); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= cast('1992-01-01 10:20:29' as timestamp)", true); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= cast('1992-01-01 10:20:30' as timestamp)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= cast('1992-01-01 10:20:29' as timestamp)", RowsMatch.NONE); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 10:20:31' as timestamp)", false); - testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 10:20:30' as timestamp)", true); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 10:20:31' as timestamp)", RowsMatch.SOME); + testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 10:20:30' as timestamp)", RowsMatch.NONE); } @Test + public void testFilterPruning() throws Exception { + // multirowgroup2 is a parquet file with 3 rowgroups inside. One with a=0, another with a=1 and a=2, and the last with a=3 and a=4; + // FilterPushDown should be able to prune the filter from the scan operator according to the rowgroup statistics. 
+ final String sql = "select * from dfs.`parquet/multirowgroup2.parquet` where "; + PlanTestBase.testPlanMatchingPatterns(sql + "a > 1", new String[]{"numRowGroups=2"}); //No filter pruning + PlanTestBase.testPlanMatchingPatterns(sql + "a > 2", new String[]{"numRowGroups=1"}, new String[]{"Filter\\("}); // Filter pruning + + PlanTestBase.testPlanMatchingPatterns(sql + "a < 2", new String[]{"numRowGroups=2"}); // No filter pruning + PlanTestBase.testPlanMatchingPatterns(sql + "a < 1", new String[]{"numRowGroups=1"}, new String[]{"Filter\\("}); // Filter pruning + + PlanTestBase.testPlanMatchingPatterns(sql + "a >= 2", new String[]{"numRowGroups=2"}); // No filter pruning + PlanTestBase.testPlanMatchingPatterns(sql + "a >= 1", new String[]{"numRowGroups=2"}, new String[]{"Filter\\("}); // Filter pruning + + PlanTestBase.testPlanMatchingPatterns(sql + "a <= 1", new String[]{"numRowGroups=2"}); // No filter pruning + PlanTestBase.testPlanMatchingPatterns(sql + "a <= 2", new String[]{"numRowGroups=2"}, new String[]{"Filter\\("}); // Filter pruning + + PlanTestBase.testPlanMatchingPatterns(sql + "a > 0 and a < 2", new String[]{"numRowGroups=1"}); // No filter pruning + PlanTestBase.testPlanMatchingPatterns(sql + "a > 0 and a < 3", new String[]{"numRowGroups=1"}, new String[]{"Filter\\("}); //Filter pruning + + PlanTestBase.testPlanMatchingPatterns(sql + "a < 1 or a > 1", new String[]{"numRowGroups=3"}); // No filter pruning + PlanTestBase.testPlanMatchingPatterns(sql + "a < 1 or a > 2", new String[]{"numRowGroups=2"}, new String[]{"Filter\\("}); //Filter pruning + } + + @Test + public void testFilterPruningWithNulls() throws Exception { + // multirowgroupwithNulls is a parquet file with 4 rowgroups inside and some groups contain null values. 
+ // RG1 : [min: 20, max: 29, num_nulls: 0] + // RG2 : [min: 31, max: 39, num_nulls: 1] + // RG3 : [min: 40, max: 49, num_nulls: 1] + // RG4 : [min: 50, max: 59, num_nulls: 0] + final String sql = "select a from dfs.`parquet/multirowgroupwithNulls.parquet` where "; + // "<" "and" ">" with filter + testParquetFilterPruning(sql + "30 < a and 40 > a", 9, 1, null); + testParquetFilterPruning(sql + "30 < a and a < 40", 9, 1, null); + testParquetFilterPruning(sql + "a > 30 and 40 > a", 9, 1, null); + testParquetFilterPruning(sql + "a > 30 and a < 40", 9, 1, null); + // "<" "and" ">" with no filter + testParquetFilterPruning(sql + "19 < a and 30 > a", 10, 1, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "19 < a and a < 30", 10, 1, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "a > 19 and 30 > a", 10, 1, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "a > 19 and a < 30", 10, 1, new String[]{"Filter\\("}); + // "<=" "and" ">=" with filter + testParquetFilterPruning(sql + "a >= 30 and 39 >= a", 9, 1, null); + testParquetFilterPruning(sql + "a >= 30 and a <= 39", 9, 1, null); + testParquetFilterPruning(sql + "30 <= a and 39 >= a", 9, 1, null); + testParquetFilterPruning(sql + "30 <= a and a <= 39", 9, 1, null); + // "<=" "and" ">=" with no filter + testParquetFilterPruning(sql + "a >= 20 and a <= 29", 10, 1, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "a >= 20 and 29 >= a", 10, 1, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "20 <= a and a <= 29", 10, 1, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "20 <= a and 29 >= a", 10, 1, new String[]{"Filter\\("}); + // "<" "or" ">" with filter + testParquetFilterPruning(sql + "a < 40 or a > 49", 29, 3, null); + testParquetFilterPruning(sql + "a < 40 or 49 < a", 29, 3, null); + testParquetFilterPruning(sql + "40 > a or a > 49", 29, 3, null); + testParquetFilterPruning(sql + "40 > a or 49 < a", 29, 3, null); + // "<" "or" ">" with no filter + 
testParquetFilterPruning(sql + "a < 30 or a > 49", 20, 2, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "a < 30 or 49 < a", 20, 2, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "30 > a or a > 49", 20, 2, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "30 > a or 49 < a", 20, 2, new String[]{"Filter\\("}); + // "<=" "or" ">=" with filter + testParquetFilterPruning(sql + "a <= 39 or a >= 50", 29, 3, null); + testParquetFilterPruning(sql + "a <= 39 or 50 <= a", 29, 3, null); + testParquetFilterPruning(sql + "39 >= a or a >= 50", 29, 3, null); + testParquetFilterPruning(sql + "39 >= a or 50 <= a", 29, 3, null); + // "<=" "or" ">=" with no filter + testParquetFilterPruning(sql + "a <= 29 or a >= 50", 20, 2, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "a <= 29 or 50 <= a", 20, 2, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "29 >= a or a >= 50", 20, 2, new String[]{"Filter\\("}); + testParquetFilterPruning(sql + "29 >= a or 50 <= a", 20, 2, new String[]{"Filter\\("}); + } + + @Test // Test against parquet files from Drill CTAS post 1.8.0 release. public void testDatePredicateAgaistDrillCTASPost1_8() throws Exception { test("use dfs.tmp"); @@ -428,6 +513,15 @@ public class TestParquetFilterPushDown extends PlanTestBase { final String queryEqualTrueWithAnd = "select col_bln from dfs.`parquetFilterPush/blnTbl` where col_bln = true and unk_col = 'a'"; testParquetFilterPD(queryEqualTrueWithAnd, 0, 2, false); + + // File ff1.parquet has column with the values: false, null, false. + // File tt1.parquet has column with the values: true, null, true. + // File ft0.parquet has column with the values: false, true. 
+ final String query = "select a from dfs.`parquetFilterPush/tfTbl` where "; + testParquetFilterPD(query + "a is true", 3, 2, false); + testParquetFilterPD(query + "a is false", 3, 2, false); + testParquetFilterPD(query + "a is not true", 5, 1, false); + testParquetFilterPD(query + "a is not false", 5, 1, false); } @Test // DRILL-5359 @@ -478,18 +572,89 @@ public class TestParquetFilterPushDown extends PlanTestBase { String[] expectedPlan = {"numRowGroups=2"}; PlanTestBase.testPlanMatchingPatterns(query, expectedPlan); - testBuilder() - .sqlQuery(query) - .unOrdered() - .baselineColumns("cnt") - .baselineValues(2L) - .go(); + testBuilder().sqlQuery(query).unOrdered().baselineColumns("cnt").baselineValues(2L).go(); + } + + @Test // testing min=false, max=true, min/max set, no nulls + public void testMinFalseMaxTrue() throws Exception { + LogicalExpression le = Mockito.mock(LogicalExpression.class); + BooleanStatistics booleanStatistics = Mockito.mock(BooleanStatistics.class); + Mockito.doReturn(booleanStatistics).when(le).accept(ArgumentMatchers.any(), ArgumentMatchers.any()); + RangeExprEvaluator<Boolean> re = Mockito.mock(RangeExprEvaluator.class); + Mockito.when(re.getRowCount()).thenReturn(Long.valueOf(2)); // 2 rows + Mockito.when(booleanStatistics.isEmpty()).thenReturn(false); // stat is not empty + Mockito.when(booleanStatistics.isNumNullsSet()).thenReturn(true); // num_nulls set + Mockito. when(booleanStatistics.getNumNulls()).thenReturn(Long.valueOf(0)); // no nulls + Mockito. 
when(booleanStatistics.hasNonNullValue()).thenReturn(true); // min/max set + Mockito.when(booleanStatistics.getMin()).thenReturn(false); // min false + Mockito.when(booleanStatistics.getMax()).thenReturn(true); // max true + ParquetIsPredicate isTrue = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_TRUE, le); + assertEquals(RowsMatch.SOME, isTrue.matches(re)); + ParquetIsPredicate isFalse = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_FALSE, le); + assertEquals(RowsMatch.SOME, isFalse.matches(re)); + ParquetIsPredicate isNotTrue = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_NOT_TRUE, le); + assertEquals(RowsMatch.SOME, isNotTrue.matches(re)); + ParquetIsPredicate isNotFalse = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_NOT_FALSE, le); + assertEquals(RowsMatch.SOME, isNotFalse.matches(re)); + } + + @Test // testing min=false, max=false, min/max set, no nulls + public void testMinFalseMaxFalse() throws Exception { + LogicalExpression le = Mockito.mock(LogicalExpression.class); + BooleanStatistics booleanStatistics = Mockito.mock(BooleanStatistics.class); + Mockito.doReturn(booleanStatistics).when(le).accept(ArgumentMatchers.any(), ArgumentMatchers.any()); + RangeExprEvaluator<Boolean> re = Mockito.mock(RangeExprEvaluator.class); + Mockito.when(re.getRowCount()).thenReturn(Long.valueOf(2)); // 2 rows + Mockito.when(booleanStatistics.isEmpty()).thenReturn(false); // stat is not empty + Mockito.when(booleanStatistics.isNumNullsSet()).thenReturn(true); // num_nulls set + Mockito. when(booleanStatistics.getNumNulls()).thenReturn(Long.valueOf(0)); // no nulls + Mockito. 
when(booleanStatistics.hasNonNullValue()).thenReturn(true); // min/max set + Mockito.when(booleanStatistics.getMin()).thenReturn(false); // min false + Mockito.when(booleanStatistics.getMax()).thenReturn(false); // max false + ParquetIsPredicate isTrue = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_TRUE, le); + assertEquals(RowsMatch.NONE, isTrue.matches(re)); + ParquetIsPredicate isFalse = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_FALSE, le); + assertEquals(RowsMatch.ALL, isFalse.matches(re)); + ParquetIsPredicate isNotTrue = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_NOT_TRUE, le); + assertEquals(RowsMatch.ALL, isNotTrue.matches(re)); + ParquetIsPredicate isNotFalse = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_NOT_FALSE, le); + assertEquals(RowsMatch.NONE, isNotFalse.matches(re)); + } + + @Test // testing min=true, max=true, min/max set, no nulls + public void testMinTrueMaxTrue() throws Exception { + LogicalExpression le = Mockito.mock(LogicalExpression.class); + BooleanStatistics booleanStatistics = Mockito.mock(BooleanStatistics.class); + Mockito.doReturn(booleanStatistics).when(le).accept(ArgumentMatchers.any(), ArgumentMatchers.any()); + RangeExprEvaluator<Boolean> re = Mockito.mock(RangeExprEvaluator.class); + Mockito.when(re.getRowCount()).thenReturn(Long.valueOf(2)); // 2 rows + Mockito.when(booleanStatistics.isEmpty()).thenReturn(false); // stat is not empty + Mockito.when(booleanStatistics.isNumNullsSet()).thenReturn(true); // num_nulls set + Mockito. when(booleanStatistics.getNumNulls()).thenReturn(Long.valueOf(0)); // no nulls + Mockito. 
when(booleanStatistics.hasNonNullValue()).thenReturn(true); // min/max set + Mockito.when(booleanStatistics.getMin()).thenReturn(true); // min true + Mockito.when(booleanStatistics.getMax()).thenReturn(true); // max true + ParquetIsPredicate isTrue = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_TRUE, le); + assertEquals(RowsMatch.ALL, isTrue.matches(re)); + ParquetIsPredicate isFalse = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_FALSE, le); + assertEquals(RowsMatch.NONE, isFalse.matches(re)); + ParquetIsPredicate isNotTrue = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_NOT_TRUE, le); + assertEquals(RowsMatch.NONE, isNotTrue.matches(re)); + ParquetIsPredicate isNotFalse = (ParquetIsPredicate) ParquetIsPredicate.createIsPredicate(FunctionGenerationHelper.IS_NOT_FALSE, le); + assertEquals(RowsMatch.ALL, isNotFalse.matches(re)); } ////////////////////////////////////////////////////////////////////////////////////////////////// // Some test helper functions. 
////////////////////////////////////////////////////////////////////////////////////////////////// + private void testParquetFilterPruning(final String query, int expectedRowCount, int expectedRowgroups, String[] excludedPattern) throws Exception{ + int actualRowCount = testSql(query); + assertEquals(expectedRowCount, actualRowCount); + String numRowGroupPattern = "numRowGroups=" + expectedRowgroups; + testPlanMatchingPatterns(query, new String[]{numRowGroupPattern}, excludedPattern); + } + private void testParquetFilterPD(final String query, int expectedRowCount, int expectedNumFiles, boolean usedMetadataFile) throws Exception{ int actualRowCount = testSql(query); assertEquals(expectedRowCount, actualRowCount); @@ -499,13 +664,13 @@ public class TestParquetFilterPushDown extends PlanTestBase { testPlanMatchingPatterns(query, new String[]{numFilesPattern, usedMetaPattern}); } - private void testParquetRowGroupFilterEval(final ParquetMetadata footer, final String exprStr, boolean canDropExpected) throws Exception{ + private void testParquetRowGroupFilterEval(final ParquetMetadata footer, final String exprStr, RowsMatch canDropExpected) throws Exception{ final LogicalExpression filterExpr = parseExpr(exprStr); testParquetRowGroupFilterEval(footer, 0, filterExpr, canDropExpected); } - private void testParquetRowGroupFilterEval(final ParquetMetadata footer, final int rowGroupIndex, final LogicalExpression filterExpr, boolean canDropExpected) { - boolean canDrop = ParquetRGFilterEvaluator.evalFilter(filterExpr, footer, rowGroupIndex, fragContext.getOptions(), fragContext); + private void testParquetRowGroupFilterEval(final ParquetMetadata footer, final int rowGroupIndex, final LogicalExpression filterExpr, RowsMatch canDropExpected) { + RowsMatch canDrop = ParquetRGFilterEvaluator.evalFilter(filterExpr, footer, rowGroupIndex, fragContext.getOptions(), fragContext); Assert.assertEquals(canDropExpected, canDrop); } diff --git 
a/exec/java-exec/src/test/resources/parquet/multirowgroup2.parquet b/exec/java-exec/src/test/resources/parquet/multirowgroup2.parquet new file mode 100644 index 0000000..5139802 Binary files /dev/null and b/exec/java-exec/src/test/resources/parquet/multirowgroup2.parquet differ diff --git a/exec/java-exec/src/test/resources/parquet/multirowgroupwithNulls.parquet b/exec/java-exec/src/test/resources/parquet/multirowgroupwithNulls.parquet new file mode 100644 index 0000000..084b315 Binary files /dev/null and b/exec/java-exec/src/test/resources/parquet/multirowgroupwithNulls.parquet differ diff --git a/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/ff1.parquet b/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/ff1.parquet new file mode 100644 index 0000000..79c2362 Binary files /dev/null and b/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/ff1.parquet differ diff --git a/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/ft0.parquet b/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/ft0.parquet new file mode 100644 index 0000000..c0c51c4 Binary files /dev/null and b/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/ft0.parquet differ diff --git a/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/tt1.parquet b/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/tt1.parquet new file mode 100644 index 0000000..35ca274 Binary files /dev/null and b/exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/tt1.parquet differ