[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-24 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r204839799
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,91 +62,126 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
+  /**
+   * Return true if exprStat.getMin is defined and true
+   */
+  private static Boolean minIsTrue(Statistics exprStat) { return 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin(); }
+
+  /**
+   * Return true if exprStat.getMin is defined and false
+   */
+  private static Boolean minIsFalse(Statistics exprStat) { return 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMin(); }
+
+  /**
+   * Return true if exprStat.getMax is defined and true
+   */
+  private static Boolean maxIsTrue(Statistics exprStat) { return 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMax(); }
+
+  /**
+   * Return true if exprStat.getMax is defined and false
+   */
+  private static Boolean maxIsFalse(Statistics exprStat) { return 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax(); }
+
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
maxIsFalse(exprStat)) {
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above 

[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-24 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r204839867
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetPushDownFilter.java
 ##
 @@ -165,12 +167,29 @@ protected void doOnMatch(RelOptRuleCall call, FilterPrel 
filter, ProjectPrel pro
   return;
 }
 
-
 RelNode newScan = ScanPrel.create(scan, scan.getTraitSet(), newGroupScan, 
scan.getRowType());;
 
 if (project != null) {
   newScan = project.copy(project.getTraitSet(), ImmutableList.of(newScan));
 }
+
+if (newGroupScan instanceof AbstractParquetGroupScan) {
+  RowsMatch matchAll = RowsMatch.ALL;
+  List rowGroupInfos = ((AbstractParquetGroupScan) 
newGroupScan).rowGroupInfos;
+  for (RowGroupInfo rowGroup : rowGroupInfos) {
+if (rowGroup.getRowsMatch() != RowsMatch.ALL) {
+  matchAll = RowsMatch.SOME;
+  break;
+}
+  }
+  if (matchAll == ParquetFilterPredicate.RowsMatch.ALL) {
+call.transformTo(newScan);
+  }
+} else {
+  final RelNode newFilter = filter.copy(filter.getTraitSet(), 
ImmutableList.of(newScan));
+  call.transformTo(newFilter);
+}
+
 final RelNode newFilter = filter.copy(filter.getTraitSet(), 
ImmutableList.of(newScan));
 
 Review comment:
   Exact. Suppressed duplicate lines


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-24 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r204839620
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -124,8 +124,7 @@ private static LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() ? 
RowsMatch.NONE : checkNull(exprStat)
 
 Review comment:
   Done added 12 unit tests for cases 
   a. ST:[min: true, max: true, num_nulls: 0]
   b. ST:[min: false, max: false, num_nulls: 0]
   c. ST:[min: false, max: true, num_nulls: 0]


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-24 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r204839765
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,91 +62,126 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
+  /**
+   * Return true if exprStat.getMin is defined and true
+   */
+  private static Boolean minIsTrue(Statistics exprStat) { return 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin(); }
 
 Review comment:
   Functions suppressed because of next comment


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-23 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r204325743
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -124,8 +124,7 @@ private static LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() ? 
RowsMatch.NONE : checkNull(exprStat)
 
 Review comment:
   OK I found the reason why the tests pass : 
   1. we need several parquet files ,else the process is squeezed in 
AbstractParquetGroupScan.applyFilter.
   2. We need that some parquet files are dropped again in 
AbstractParquetGroupScan.applyFilter
   if (qualifiedRGs.size() == rowGroupInfos.size() ) { return null } ...
   3. If one at least of the row groups is SOME, then the filter is applied to 
all, in ParquetPushDownFilter.doOnMatch L 179
   These 3 conditions together make that the tests pass.
   One way to check that they fail, is to put ft0.parquet, ft0.parquet and 
tt1.parquet in the same folder and run a IS TRUE predicate. the result then 
reads F, T, F, T (wrong) instead of F, F (expected) !
   
   I have then written the IS TRUE, IS FALSE, IS NOT TRUE and IS NOT FALSE 
predicates based on the cases:
   a. ST:[min: true, max: true, num_nulls: ?]
   b. ST:[min: false, max: false, num_nulls: ?]
   c. ST:[min: false, max: true, num_nulls: ?]
   d. and num_nulls = RC ( row count)
   And check all cases.
   
   I also introduced 4 helper functions for code readability: minIsTrue, 
minIsFalse, maxIsTrue and maxIsFalse


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-20 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r203966305
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -114,8 +115,7 @@ public boolean canDrop(RangeExprEvaluator evaluator) {
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
+  exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax() 
|| isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   OK. Reversed.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-20 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r203966145
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -124,8 +124,7 @@ private static LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() ? 
RowsMatch.NONE : checkNull(exprStat)
 
 Review comment:
   blnTbl/0_0_1.parquet => ST:[min: false, max: false, num_nulls: 0] : 8 tests 
in  testBooleanPredicate()
tfTbl/ft0.parquet => ST:[min: false, max: true, num_nulls: 0] : 4 tests in 
testBooleanPredicate
   
   example1:
   select * from 
`ava-exec/src/test/resources/parquetFilterPush/blnTbl/0_0_1.parquet` where 
col_bln is false returns (false, false, false)
   
   example2:
   select * from 
`java-exec/src/test/resources/parquetFilterPush/tfTbl/ft0.parquet` where a is 
true[resp. false]  return true[resp. false]
   
   Finally, when running the query  
   select * from dfs.tmp.`blnTbl` where col_bln is false
   with blnTbl contains only 0_0_0.parquet (T,T,T) and 0_0_1.parquet (F,F,F)
   the physical plan reads:
   00-00Screen : rowType = RecordType(DYNAMIC_STAR **): rowcount = 3.0, 
cumulative cost = {9.3 rows, 12.3 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 
523
   00-01  Project(**=[$0]) : rowType = RecordType(DYNAMIC_STAR **): 
rowcount = 3.0, cumulative cost = {9.0 rows, 12.0 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 522
   00-02Project(**=[$0]) : rowType = RecordType(DYNAMIC_STAR **): 
rowcount = 3.0, cumulative cost = {6.0 rows, 9.0 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 521
   00-03  Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath 
[path=/tmp/blnTbl/0_0_1.parquet]], selectionRoot=file:/tmp/blnTbl, numFiles=1, 
numRowGroups=1, usedMetadataFile=false, columns=[`**`]]]) : rowType = 
RecordType(DYNAMIC_STAR **, ANY col_bln): rowcount = 3.0, cumulative cost = 
{3.0 rows, 6.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 520
   No more filter since it returns NONE for 0_0_0.parquet and ALL for 
0_0_1.parquet.
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-17 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r203022582
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,50 +62,51 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
 
 Review comment:
   Done. hasNonNullValue(0 removed in both checkNull() and 
createIsNullPredicate.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-17 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r203003949
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,50 +62,51 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
 
 Review comment:
   @arina-ielchiieva If I remove hasNonNullValue from createIsNullPredicate, 
the test mentioned in https://issues.apache.org/jira/browse/DRILL-6603 fails.
   I can remove it in ParquetComparisonPredicate.checkNull(), but this function 
is used a lot... 
   I can also remove it from ParquetIsPredicate.checkNull(), with the same 
comment.
   For safety, until the bug is fixed, we maybe better keep it as is ?
   (In the other places, it is used to ensure getMin/getMax is valid)
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-17 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r203003949
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,50 +62,51 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
 
 Review comment:
   @arina-ielchiieva If I remove hasNonNullValue from createIsNullPredicate, 
the test mentioned inhttps://issues.apache.org/jira/browse/DRILL-6603 fails.
   I can remove it in ParquetComparisonPredicate.checkNull(), but this function 
is used a lot... 
   I can also remove it from ParquetIsPredicate.checkNull(), with the same 
comment.
   For safety, until the bug is fixed, we maybe better keep it as is ?
   (In the other places, it is used to ensure getMin/getMax is valid)
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202755614
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetComparisonPredicate.java
 ##
 @@ -83,23 +85,29 @@ private ParquetComparisonPredicate(
* where Column1 and Column2 are from same parquet table.
*/
   @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics leftStat = left.accept(evaluator, null);
 if (isNullOrEmpty(leftStat)) {
-  return false;
+  return RowsMatch.SOME;
 }
-
 Statistics rightStat = right.accept(evaluator, null);
 if (isNullOrEmpty(rightStat)) {
-  return false;
+  return RowsMatch.SOME;
 }
-
-// if either side is ALL null, = is evaluated to UNKNOWN -> canDrop
 if (isAllNulls(leftStat, evaluator.getRowCount()) || isAllNulls(rightStat, 
evaluator.getRowCount())) {
-  return true;
+  return RowsMatch.NONE;
+}
+if (leftStat.hasNonNullValue() && !leftStat.hasNonNullValue() || 
rightStat.hasNonNullValue() && !rightStat.hasNonNullValue()) {
 
 Review comment:
   Yes I noticed but the commit was already pushed and I had to wait for the 
tests (45 mn).
   Corrected in the new push.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202736342
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,50 +62,51 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
 
 Review comment:
   @arina-ielchiieva checked all code and added hasNonNullValue() before  
hasNoNulls.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202719524
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -124,8 +124,7 @@ private static LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   @vrozov Yes, to have the same code in all lines. Easier to maintain.
   Maybe you want 
java-exec/src/test/resources/parquetFilterPush/blnTbl/0_0_2.parquet with 
ST:[min: false, max: false, num_nulls: 0]
   and java-exec/src/test/resources/parquetFilterPush/tfTbl/tt1.parquet with 
ST:[min: false, max: true, num_nulls: 0]
   which are tested in testBooleanPredicate()


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202719524
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -124,8 +124,7 @@ private static LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   @vrozov Yes, to have the same code in all lines. Easier to maintain.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202719524
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -124,8 +124,7 @@ private static LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   @vrozov Yes, to have the same code in all lines. Easier to see maintain.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202718440
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,50 +62,51 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
 
 Review comment:
   Yes. I have added hasNonNullValue() in createIsNullPredicate(). It solves 
the issue. All other tests OK. 


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202714385
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,50 +62,51 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
 
 Review comment:
   @arina-ielchiieva thanks for the file. Running it, it happens that the 
parquet files have ST:[no stats for this column] and this is interpreted as 
"num_nulls=0, min/max undefined" in RangeExprEvaluator.visitTypedFieldExpr: 
   if (columnStatistics != null) {
 return columnStatistics.getStatistics();
   } 
   ...
   Which leads to the issue, since num_nulls is not defined in this case from 
the parquet rowgroup stats, but certainly not 0 (in this case, num_nulls > 0).


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202666136
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,50 +62,51 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
 
 Review comment:
   In testBooleanPredicate, file 
java-exec/src/test/resources/parquetFilterPush/blnTbl/0_0_3.parquet which has 
ST:[num_nulls: 3, min/max not defined] and "row group 1: RC:3 TS:27 OFFSET:4", 
we get the correct value isAllNulls=true, because we compare getNumNulls with 
rowCount, which are both 3 in this case.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-16 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202664099
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -124,8 +124,7 @@ private static LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   @vrozov java-exec/src/test/resources/parquetFilterPush/blnTbl/0_0_3.parquet 
gives ST:[num_nulls: 3, min/max not defined] whihc means hasNonNullValue=false 
but isAllNulls=true, which is what we are looking for. This happens each time 
we run 
   testBooleanPredicate()
   @arina-ielchiieva isallNulls would produce incorrect values if getNumNulls 
is undefined, which may happen whatever the return value of hasNonNullValue 
(see stats of parquet file 0_0_3.parquet). Would it be possible that all the 
stats (num_nulls, min and max) are not defined for ParquetIsPredicate entering 
file?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-13 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202408237
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -124,8 +124,7 @@ private static LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   hasNonNullValue = true if min and max exist
   isAllNulls = true if all rows are null values
   testBooleanPredicate with File 0_0_3.parquet (contains only 3 null values)


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-13 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202296895
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,70 +62,73 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()) {
+return RowsMatch.NONE;
+  }
+  return ((BooleanStatistics) exprStat).getMin() ? checkNull(exprStat) : 
RowsMatch.SOME;
+});
   }
 
   /**
* IS FALSE predicate.
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() && 
((BooleanStatistics) exprStat).getMax() || isAllNulls(exprStat, 
evaluator.getRowCount()) ?
 
 Review comment:
   Added 4 new tests exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/
   ff1 (false, false, 1 null)
   ft0 (false, true, 0 null)
   tt1 (true, true, 1 null)
   Other cases are already tested in 
exec/java-exec/src/test/resources/parquetFilterPush/blnTbl
   
   Updated "is true" 

[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-13 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202296895
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,70 +62,73 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if (expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()) {
+return RowsMatch.NONE;
+  }
+  return ((BooleanStatistics) exprStat).getMin() ? checkNull(exprStat) : 
RowsMatch.SOME;
+});
   }
 
   /**
* IS FALSE predicate.
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() && 
((BooleanStatistics) exprStat).getMax() || isAllNulls(exprStat, 
evaluator.getRowCount()) ?
 
 Review comment:
   Added 4 new tests exec/java-exec/src/test/resources/parquetFilterPush/tfTbl/
   ff1 (false, false, 1 null)
   ft0 (false, true, 0 null)
   tt1 (true, true, 1 null)
   Other cases are already tested in 
exec/java-exec/src/test/resources/parquetFilterPush/blnTbl


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202068577
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,70 +62,72 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics)exprStat).getMax()) {
+return RowsMatch.NONE;
+  }
+  return ((BooleanStatistics)exprStat).getMin() && 
((BooleanStatistics)exprStat).getMax() ? checkNull(exprStat) : RowsMatch.SOME;
+});
   }
 
   /**
* IS FALSE predicate.
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics)exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   Exact. Corrected.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries 

[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062943
 
 

 ##
 File path: 
exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetFilterPushDown.java
 ##
 @@ -243,24 +248,98 @@ public void testTimeStampPredicateWithEval() throws 
Exception {
   .toFile();
 ParquetMetadata footer = getParquetMetaData(file);
 
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 
10:20:30' as timestamp)", false);
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 
10:20:29' as timestamp)", true);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 
10:20:30' as timestamp)", RowsMatch.SOME);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 
10:20:29' as timestamp)", RowsMatch.NONE);
 
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= 
cast('1992-01-01 10:20:29' as timestamp)", false);
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= 
cast('1992-01-03 10:20:30' as timestamp)", false);
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= 
cast('1992-01-03 10:20:31' as timestamp)", true);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= 
cast('1992-01-01 10:20:29' as timestamp)", RowsMatch.ALL);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= 
cast('1992-01-03 10:20:30' as timestamp)", RowsMatch.SOME);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= 
cast('1992-01-03 10:20:31' as timestamp)", RowsMatch.NONE);
 
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 
10:20:29' as timestamp)", false);
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 
10:20:30' as timestamp)", true);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 
10:20:29' as timestamp)", RowsMatch.SOME);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 
10:20:30' as timestamp)", RowsMatch.NONE);
 
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= 
cast('1992-01-01 10:20:30' as timestamp)", false);
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= 
cast('1992-01-01 10:20:29' as timestamp)", true);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= 
cast('1992-01-01 10:20:30' as timestamp)", RowsMatch.SOME);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= 
cast('1992-01-01 10:20:29' as timestamp)", RowsMatch.NONE);
 
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 
10:20:31' as timestamp)", false);
-testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 
10:20:30' as timestamp)", true);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 
10:20:31' as timestamp)", RowsMatch.SOME);
+testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 
10:20:30' as timestamp)", RowsMatch.NONE);
 
   }
 
+  @Test
+  public void testFilterPruning() throws Exception {
+// multirowgroup2 is a parquet file with 3 rowgroups inside. One with a=0, 
another with a=1 and a=2, and the last with a=3 and a=4;
+// FilterPushDown should be able to prune the filter from the scan 
operator according to the rowgroup statistics.
+final String sql = "select * from dfs.`parquet/multirowgroup2.parquet` 
where ";
+PlanTestBase.testPlanMatchingPatterns(sql + "a > 1", new 
String[]{"numRowGroups=2"}, null); //No filter pruning
 
 Review comment:
   Replaced everywhere


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062739
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/RangeExprEvaluator.java
 ##
 @@ -258,21 +258,28 @@ public long getRowCount() {
   final ValueHolder minFuncHolder = 
InterpreterEvaluator.evaluateFunction(interpreter, args1, holderExpr.getName());
   final ValueHolder maxFuncHolder = 
InterpreterEvaluator.evaluateFunction(interpreter, args2, holderExpr.getName());
 
+  Statistics statistics;
   switch (destType) {
-  //TODO : need handle # of nulls.
   case INT:
-return getStatistics( ((IntHolder)minFuncHolder).value, 
((IntHolder)maxFuncHolder).value);
+statistics = getStatistics( ((IntHolder)minFuncHolder).value, 
((IntHolder)maxFuncHolder).value);
 
 Review comment:
   Done. Checked other occurrences.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062839
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetPushDownFilter.java
 ##
 @@ -165,12 +167,29 @@ protected void doOnMatch(RelOptRuleCall call, FilterPrel 
filter, ProjectPrel pro
   return;
 }
 
-
 RelNode newScan = ScanPrel.create(scan, scan.getTraitSet(), newGroupScan, 
scan.getRowType());;
 
 if (project != null) {
   newScan = project.copy(project.getTraitSet(), ImmutableList.of(newScan));
 }
+
+if (newGroupScan instanceof AbstractParquetGroupScan) {
+  RowsMatch matchAll = RowsMatch.ALL;
+  List rowGroupInfos = ((AbstractParquetGroupScan) 
newGroupScan).rowGroupInfos;
+  for (RowGroupInfo rowGroup : rowGroupInfos) {
+if (rowGroup.getRowsMatch() !=  RowsMatch.ALL) {
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062690
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,70 +62,72 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics)exprStat).getMax()) {
+return RowsMatch.NONE;
+  }
+  return ((BooleanStatistics)exprStat).getMin() ? checkNull(exprStat) : 
RowsMatch.SOME;
 
 Review comment:
   Done. Checked other occurrences.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062611
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,70 +62,72 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
 
 Review comment:
   Done. Checked other occurrences.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062367
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java
 ##
 @@ -46,15 +46,29 @@ private ParquetBooleanPredicate(String name, 
List args, Expre
   ExpressionPosition pos
   ) {
 return new ParquetBooleanPredicate(name, args, pos) {
+  /**
+   * Evaluates a compound "AND" filter on the statistics of a RowGroup 
(the filter reads "filterA and filterB").
+   * Return value :
+   *   ALL : only if all filters return ALL
+   *   NONE : if one filter at least returns NONE
+   *   SOME : all other cases
+   * 
+   */
   @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
-// "and" : as long as one branch is OK to drop, we can drop it.
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
+RowsMatch resultMatch = RowsMatch.ALL;
 for (LogicalExpression child : this) {
-  if (child instanceof ParquetFilterPredicate && 
((ParquetFilterPredicate)child).canDrop(evaluator)) {
-return true;
+  if (child instanceof ParquetFilterPredicate) {
+switch(((ParquetFilterPredicate) child).matches(evaluator)) {
 
 Review comment:
   Done. Checked other occurrences.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062305
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java
 ##
 @@ -46,15 +46,29 @@ private ParquetBooleanPredicate(String name, 
List args, Expre
   ExpressionPosition pos
   ) {
 return new ParquetBooleanPredicate(name, args, pos) {
+  /**
+   * Evaluates a compound "AND" filter on the statistics of a RowGroup 
(the filter reads "filterA and filterB").
+   * Return value :
+   *   ALL : only if all filters return ALL
+   *   NONE : if one filter at least returns NONE
+   *   SOME : all other cases
+   * 
+   */
   @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
-// "and" : as long as one branch is OK to drop, we can drop it.
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
+RowsMatch resultMatch = RowsMatch.ALL;
 for (LogicalExpression child : this) {
-  if (child instanceof ParquetFilterPredicate && 
((ParquetFilterPredicate)child).canDrop(evaluator)) {
-return true;
+  if (child instanceof ParquetFilterPredicate) {
+switch(((ParquetFilterPredicate) child).matches(evaluator)) {
+  case NONE :
 
 Review comment:
   Done. Checked other occurrences.


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062367
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java
 ##
 @@ -46,15 +46,29 @@ private ParquetBooleanPredicate(String name, 
List args, Expre
   ExpressionPosition pos
   ) {
 return new ParquetBooleanPredicate(name, args, pos) {
+  /**
+   * Evaluates a compound "AND" filter on the statistics of a RowGroup 
(the filter reads "filterA and filterB").
+   * Return value :
+   *   ALL : only if all filters return ALL
+   *   NONE : if one filter at least returns NONE
+   *   SOME : all other cases
+   * 
+   */
   @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
-// "and" : as long as one branch is OK to drop, we can drop it.
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
+RowsMatch resultMatch = RowsMatch.ALL;
 for (LogicalExpression child : this) {
-  if (child instanceof ParquetFilterPredicate && 
((ParquetFilterPredicate)child).canDrop(evaluator)) {
-return true;
+  if (child instanceof ParquetFilterPredicate) {
+switch(((ParquetFilterPredicate) child).matches(evaluator)) {
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r202062305
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetBooleanPredicate.java
 ##
 @@ -46,15 +46,29 @@ private ParquetBooleanPredicate(String name, 
List args, Expre
   ExpressionPosition pos
   ) {
 return new ParquetBooleanPredicate(name, args, pos) {
+  /**
+   * Evaluates a compound "AND" filter on the statistics of a RowGroup 
(the filter reads "filterA and filterB").
+   * Return value :
+   *   ALL : only if all filters return ALL
+   *   NONE : if one filter at least returns NONE
+   *   SOME : all other cases
+   * 
+   */
   @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
-// "and" : as long as one branch is OK to drop, we can drop it.
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
+RowsMatch resultMatch = RowsMatch.ALL;
 for (LogicalExpression child : this) {
-  if (child instanceof ParquetFilterPredicate && 
((ParquetFilterPredicate)child).canDrop(evaluator)) {
-return true;
+  if (child instanceof ParquetFilterPredicate) {
+switch(((ParquetFilterPredicate) child).matches(evaluator)) {
+  case NONE :
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201930585
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,70 +62,72 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics)exprStat).getMax()) {
+return RowsMatch.NONE;
+  }
+  return ((BooleanStatistics)exprStat).getMin() && 
((BooleanStatistics)exprStat).getMax() ? checkNull(exprStat) : RowsMatch.SOME;
+});
   }
 
   /**
* IS FALSE predicate.
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics)exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   Yes. Necessary for test file blnTbl/0_0_2.parquet where we have ST:[min: 
false, max: true, num_nulls: 1]. In this case, the file is discarded, which is 
wrong since it contains 1 false.
   Same issue for is true/false (for example 

[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201930585
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,70 +62,72 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics)exprStat).getMax()) {
+return RowsMatch.NONE;
+  }
+  return ((BooleanStatistics)exprStat).getMin() && 
((BooleanStatistics)exprStat).getMax() ? checkNull(exprStat) : RowsMatch.SOME;
+});
   }
 
   /**
* IS FALSE predicate.
*/
   private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
 return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  exprStat.hasNonNullValue() && ((BooleanStatistics)exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 
 Review comment:
   Yes. Necessary for test file blnTbl/0_0_2.parquet where we have ST:[min: 
false, max: true, num_nulls: 1]. In this case, the file is discarded, which is 
wrong since it contains 1 false.



[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-12 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201924556
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,70 +62,72 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
   private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics)exprStat).getMax()) {
+return RowsMatch.NONE;
+  }
+  return ((BooleanStatistics)exprStat).getMin() && 
((BooleanStatistics)exprStat).getMax() ? checkNull(exprStat) : RowsMatch.SOME;
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-10 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201412121
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +62,90 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
-  private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+  private static > LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && (!((BooleanStatistics)exprStat).getMin() && 
!((BooleanStatistics)exprStat).getMax())) {
 
 Review comment:
   OK. Suppressed getMin. Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-10 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201410502
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +62,90 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
-  private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+  private static > LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
 
 Review comment:
   C dropped. Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-10 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201400269
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetMetaStatCollector.java
 ##
 @@ -59,7 +59,7 @@ public ParquetMetaStatCollector(ParquetTableMetadataBase 
parquetTableMetadata,
 // Reasons to pass implicit columns and their values:
 // 1. Differentiate implicit columns from regular non-exist columns. 
Implicit columns do not
 //exist in parquet metadata. Without such knowledge, implicit columns 
is treated as non-exist
-//column.  A condition on non-exist column would lead to canDrop = 
true, which is not the
+//column.  A condition on non-exist column would lead to matches = 
true, which is not the
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-10 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201399967
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetFooterStatCollector.java
 ##
 @@ -59,7 +59,7 @@ public ParquetFooterStatCollector(ParquetMetadata footer, 
int rowGroupIndex, Map
 // Reasons to pass implicit columns and their values:
 // 1. Differentiate implicit columns from regular non-exist columns. 
Implicit columns do not
 //exist in parquet metadata. Without such knowledge, implicit columns 
is treated as non-exist
-//column.  A condition on non-exist column would lead to canDrop = 
true, which is not the
+//column.  A condition on non-exist column would lead to matches = 
true, which is not the
 
 Review comment:
   ALL. Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-10 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201395530
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +62,90 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
-  private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+  private static > LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && (!((BooleanStatistics)exprStat).getMin() && 
!((BooleanStatistics)exprStat).getMax())) {
+return RowsMatch.NONE;
+  }
+  return ((BooleanStatistics)exprStat).getMin() && 
((BooleanStatistics)exprStat).getMax() ? checkNull(exprStat) : RowsMatch.SOME;
+});
   }
 
   /**
* IS FALSE predicate.
*/
-  private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if min value is not false or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
+  private static > LogicalExpression 
createIsFalsePredicate(LogicalExpression expr) {
 
 Review comment:
   See previous comment


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use 

[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-10 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201393681
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +62,90 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
-  private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+  private static > LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
 
 Review comment:
   We can get rid of C parameter in createIsTruePredicate, 
createIsFalsePredicate, createIsNotTruePredicate, createIsNotFalsePredicate.
   example: private static LogicalExpression 
createIsFalsePredicate(LogicalExpression expr) {
   return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
 exprStat.hasNonNullValue() && ((BooleanStatistics)exprStat).getMin() 
|| isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
   );
 }
   
   Do you prefer this version?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-10 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r201390049
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +62,90 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  private static RowsMatch checkNull(Statistics exprStat) {
+return hasNoNulls(exprStat) ? RowsMatch.ALL : RowsMatch.SOME;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+if(expr instanceof TypedFieldExpr) {
+  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+  if (typedFieldExpr.getPath().isArray()) {
+return RowsMatch.SOME;
+  }
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
-  private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr, (exprStat, evaluator) ->
-//if max value is not true or if there are all nulls  -> canDrop
-isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
-);
+  private static > LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
+return new ParquetIsPredicate(expr, (exprStat, evaluator) -> {
+  if (isAllNulls(exprStat, evaluator.getRowCount()) || 
exprStat.hasNonNullValue() && (!((BooleanStatistics)exprStat).getMin() && 
!((BooleanStatistics)exprStat).getMax())) {
 
 Review comment:
   I have found min=False and max=True, but also min=True and max=False. Hence 
I can not pre-suppose any order...


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-09 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r200935585
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +60,89 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return ParquetPredicatesHelper.isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  static RowsMatch checkNull(Statistics exprStat) {
+return exprStat.getNumNulls() > 0 ? RowsMatch.SOME : RowsMatch.ALL;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+if (typedFieldExpr.getPath().isArray()) {
+  return RowsMatch.SOME;
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
-  private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr,
-//if max value is not true or if there are all nulls  -> canDrop
-(exprStat, evaluator) -> !((BooleanStatistics)exprStat).getMax() || 
isAllNulls(exprStat, evaluator.getRowCount())
-);
+  private static > LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
+return new ParquetIsPredicate(expr,
+  (exprStat, evaluator) -> {
+if (isAllNulls(exprStat, evaluator.getRowCount()) || 
(exprStat.genericGetMin().equals(Boolean.FALSE) && 
exprStat.genericGetMax().equals(Boolean.FALSE))) {
+  return RowsMatch.NONE;
+}
+return exprStat.genericGetMin().equals(Boolean.TRUE) && 
exprStat.genericGetMax().equals(Boolean.TRUE) ? checkNull(exprStat) : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS FALSE predicate.
*/
-  private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr,
-//if min value is not false or if there are all nulls  -> canDrop
-(exprStat, evaluator) -> ((BooleanStatistics)exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount())
+  private static > LogicalExpression 
createIsFalsePredicate(LogicalExpression expr) {
+return new ParquetIsPredicate(expr,
+  (exprStat, evaluator) -> !exprStat.genericGetMin().equals(Boolean.FALSE) 
|| isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 );
   }
 
   /**
* IS NOT TRUE predicate.
*/
-  private static LogicalExpression 

[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-02 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r199489637
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +60,89 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return ParquetPredicatesHelper.isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  static RowsMatch checkNull(Statistics exprStat) {
+return exprStat.getNumNulls() > 0 ? RowsMatch.SOME : RowsMatch.ALL;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
 
 Review comment:
   I can't swear of it. Surrounded with instanceof. 
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-02 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r199486245
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +60,89 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return ParquetPredicatesHelper.isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  static RowsMatch checkNull(Statistics exprStat) {
+return exprStat.getNumNulls() > 0 ? RowsMatch.SOME : RowsMatch.ALL;
   }
 
   /**
* IS NULL predicate.
*/
   private static > LogicalExpression 
createIsNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are no nulls  -> canDrop
-(exprStat, evaluator) -> hasNoNulls(exprStat)) {
-  private final boolean isArray = isArray(expr);
-
-  private boolean isArray(LogicalExpression expression) {
-if (expression instanceof TypedFieldExpr) {
-  TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expression;
-  SchemaPath schemaPath = typedFieldExpr.getPath();
-  return schemaPath.isArray();
-}
-return false;
-  }
-
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  (exprStat, evaluator) -> {
 // for arrays we are not able to define exact number of nulls
 // [1,2,3] vs [1,2] -> in second case 3 is absent and thus it's null 
but statistics shows no nulls
-return !isArray && super.canDrop(evaluator);
-  }
-};
+TypedFieldExpr typedFieldExpr = (TypedFieldExpr) expr;
+if (typedFieldExpr.getPath().isArray()) {
+  return RowsMatch.SOME;
+}
+if (hasNoNulls(exprStat)) {
+  return RowsMatch.NONE;
+}
+return isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.ALL : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS NOT NULL predicate.
*/
   private static > LogicalExpression 
createIsNotNullPredicate(LogicalExpression expr) {
 return new ParquetIsPredicate(expr,
-//if there are all nulls  -> canDrop
-(exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount())
+  (exprStat, evaluator) -> isAllNulls(exprStat, evaluator.getRowCount()) ? 
RowsMatch.NONE : checkNull(exprStat)
 );
   }
 
   /**
* IS TRUE predicate.
*/
-  private static LogicalExpression createIsTruePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr,
-//if max value is not true or if there are all nulls  -> canDrop
-(exprStat, evaluator) -> !((BooleanStatistics)exprStat).getMax() || 
isAllNulls(exprStat, evaluator.getRowCount())
-);
+  private static > LogicalExpression 
createIsTruePredicate(LogicalExpression expr) {
+return new ParquetIsPredicate(expr,
+  (exprStat, evaluator) -> {
+if (isAllNulls(exprStat, evaluator.getRowCount()) || 
(exprStat.genericGetMin().equals(Boolean.FALSE) && 
exprStat.genericGetMax().equals(Boolean.FALSE))) {
+  return RowsMatch.NONE;
+}
+return exprStat.genericGetMin().equals(Boolean.TRUE) && 
exprStat.genericGetMax().equals(Boolean.TRUE) ? checkNull(exprStat) : 
RowsMatch.SOME;
+  });
   }
 
   /**
* IS FALSE predicate.
*/
-  private static LogicalExpression createIsFalsePredicate(LogicalExpression 
expr) {
-return new ParquetIsPredicate(expr,
-//if min value is not false or if there are all nulls  -> canDrop
-(exprStat, evaluator) -> ((BooleanStatistics)exprStat).getMin() || 
isAllNulls(exprStat, evaluator.getRowCount())
+  private static > LogicalExpression 
createIsFalsePredicate(LogicalExpression expr) {
+return new ParquetIsPredicate(expr,
+  (exprStat, evaluator) -> !exprStat.genericGetMin().equals(Boolean.FALSE) 
|| isAllNulls(exprStat, evaluator.getRowCount()) ? RowsMatch.NONE : 
checkNull(exprStat)
 );
   }
 
   /**
* IS NOT TRUE predicate.
*/
-  private static LogicalExpression 

[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-02 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r199483001
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetComparisonPredicate.java
 ##
 @@ -83,23 +84,26 @@ private ParquetComparisonPredicate(
* where Column1 and Column2 are from same parquet table.
*/
   @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics leftStat = left.accept(evaluator, null);
-if (isNullOrEmpty(leftStat)) {
-  return false;
+if (isNullOrEmpty(leftStat) || !leftStat.hasNonNullValue()) {
+  return RowsMatch.SOME;
 }
-
 Statistics rightStat = right.accept(evaluator, null);
-if (isNullOrEmpty(rightStat)) {
-  return false;
+if (isNullOrEmpty(rightStat) || !rightStat.hasNonNullValue()) {
+  return RowsMatch.SOME;
 }
-
-// if either side is ALL null, = is evaluated to UNKNOWN -> canDrop
 if (isAllNulls(leftStat, evaluator.getRowCount()) || isAllNulls(rightStat, 
evaluator.getRowCount())) {
-  return true;
+return RowsMatch.NONE;
 }
+return predicate.apply(leftStat, rightStat);
+  }
 
-return (leftStat.hasNonNullValue() && rightStat.hasNonNullValue()) && 
predicate.test(leftStat, rightStat);
+  /**
+   * If one rowgroup contains some null values, change the RowsMatch.ALL into 
RowsMatch.SOME (null values should be discarded by filter)
+   */
+  static RowsMatch checkNull(Statistics leftStat, Statistics rightStat) {
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-02 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r199482330
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetPushDownFilter.java
 ##
 @@ -165,12 +167,32 @@ protected void doOnMatch(RelOptRuleCall call, FilterPrel 
filter, ProjectPrel pro
   return;
 }
 
-
 RelNode newScan = ScanPrel.create(scan, scan.getTraitSet(), newGroupScan, 
scan.getRowType());;
 
 if (project != null) {
   newScan = project.copy(project.getTraitSet(), ImmutableList.of(newScan));
 }
+
+RowsMatch matchAll = RowsMatch.ALL;
+if (newGroupScan instanceof AbstractParquetGroupScan) {
+  List rowGroupInfos = ((AbstractParquetGroupScan) 
newGroupScan).rowGroupInfos;
+  for (RowGroupInfo rowGroup : rowGroupInfos) {
+if (rowGroup.getRowsMatch() !=  RowsMatch.ALL) {
+  matchAll = RowsMatch.SOME;
+  break;
+}
+  }
+} else {
+  matchAll = RowsMatch.SOME;
+}
+// If all groups are in RowsMatch.ALL, no need to apply the filter again 
to their rows => prune the filter
+if (matchAll == ParquetFilterPredicate.RowsMatch.ALL) {
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-02 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r199480607
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -62,90 +60,89 @@ private ParquetIsPredicate(LogicalExpression expr, 
BiPredicate, Ra
 return visitor.visitUnknown(this, value);
   }
 
-  @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  /**
+   * Apply the filter condition against the meta of the rowgroup.
+   */
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics exprStat = expr.accept(evaluator, null);
-if (isNullOrEmpty(exprStat)) {
-  return false;
-}
+return ParquetPredicatesHelper.isNullOrEmpty(exprStat) ? RowsMatch.SOME : 
predicate.apply(exprStat, evaluator);
+  }
 
-return predicate.test(exprStat, evaluator);
+  /**
+   * After the applying of the filter against the statistics of the rowgroup, 
if the result is RowsMatch.ALL,
+   * then we still must know if the rowgroup contains some null values, 
because it can change the filter result.
+   * If it contains some null values, then we change the RowsMatch.ALL into 
RowsMatch.SOME, which sya that maybe
+   * some values (the null ones) should be disgarded.
+   */
+  static RowsMatch checkNull(Statistics exprStat) {
+return exprStat.getNumNulls() > 0 ? RowsMatch.SOME : RowsMatch.ALL;
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-02 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r199479547
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
 ##
 @@ -19,21 +19,18 @@
 
 import org.apache.drill.common.expression.LogicalExpression;
 import org.apache.drill.common.expression.LogicalExpressionBase;
-import org.apache.drill.common.expression.SchemaPath;
 import org.apache.drill.common.expression.TypedFieldExpr;
 import org.apache.drill.common.expression.visitors.ExprVisitor;
 import org.apache.drill.exec.expr.fn.FunctionGenerationHelper;
-import org.apache.parquet.column.statistics.BooleanStatistics;
 import org.apache.parquet.column.statistics.Statistics;
 
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
-import java.util.function.BiPredicate;
+import java.util.function.BiFunction;
 
 import static 
org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.hasNoNulls;
 import static 
org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isAllNulls;
-import static 
org.apache.drill.exec.expr.stat.ParquetPredicatesHelper.isNullOrEmpty;
 
 Review comment:
   Back. Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] jbimbert commented on a change in pull request #1298: DRILL-5796: Filter pruning for multi rowgroup parquet file

2018-07-02 Thread GitBox
jbimbert commented on a change in pull request #1298: DRILL-5796: Filter 
pruning for multi rowgroup parquet file
URL: https://github.com/apache/drill/pull/1298#discussion_r199478733
 
 

 ##
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetComparisonPredicate.java
 ##
 @@ -83,23 +84,26 @@ private ParquetComparisonPredicate(
* where Column1 and Column2 are from same parquet table.
*/
   @Override
-  public boolean canDrop(RangeExprEvaluator evaluator) {
+  public RowsMatch matches(RangeExprEvaluator evaluator) {
 Statistics leftStat = left.accept(evaluator, null);
-if (isNullOrEmpty(leftStat)) {
-  return false;
+if (isNullOrEmpty(leftStat) || !leftStat.hasNonNullValue()) {
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services