asfgit closed pull request #1349: DRILL-6554: Minor code improvements in parquet statistics handling URL: https://github.com/apache/drill/pull/1349
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java index 547dc06704..42e6e0b6a4 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java @@ -113,9 +113,9 @@ public boolean canDrop(RangeExprEvaluator<C> evaluator) { * IS TRUE predicate. */ private static LogicalExpression createIsTruePredicate(LogicalExpression expr) { - return new ParquetIsPredicate<Boolean>(expr, + return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> //if max value is not true or if there are all nulls -> canDrop - (exprStat, evaluator) -> !((BooleanStatistics)exprStat).getMax() || isAllNulls(exprStat, evaluator.getRowCount()) + isAllNulls(exprStat, evaluator.getRowCount()) || exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax() ); } @@ -123,9 +123,9 @@ private static LogicalExpression createIsTruePredicate(LogicalExpression expr) { * IS FALSE predicate. */ private static LogicalExpression createIsFalsePredicate(LogicalExpression expr) { - return new ParquetIsPredicate<Boolean>(expr, + return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> //if min value is not false or if there are all nulls -> canDrop - (exprStat, evaluator) -> ((BooleanStatistics)exprStat).getMin() || isAllNulls(exprStat, evaluator.getRowCount()) + isAllNulls(exprStat, evaluator.getRowCount()) || exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() ); } @@ -133,9 +133,9 @@ private static LogicalExpression createIsFalsePredicate(LogicalExpression expr) * IS NOT TRUE predicate. */ private static LogicalExpression createIsNotTruePredicate(LogicalExpression expr) { - return new ParquetIsPredicate<Boolean>(expr, + return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> //if min value is not false or if there are no nulls -> canDrop - (exprStat, evaluator) -> ((BooleanStatistics)exprStat).getMin() && hasNoNulls(exprStat) + hasNoNulls(exprStat) && exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin() ); } @@ -143,9 +143,9 @@ private static LogicalExpression createIsNotTruePredicate(LogicalExpression expr * IS NOT FALSE predicate. */ private static LogicalExpression createIsNotFalsePredicate(LogicalExpression expr) { - return new ParquetIsPredicate<Boolean>(expr, + return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> //if max value is not true or if there are no nulls -> canDrop - (exprStat, evaluator) -> !((BooleanStatistics)exprStat).getMax() && hasNoNulls(exprStat) + hasNoNulls(exprStat) && exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax() ); } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetPredicatesHelper.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetPredicatesHelper.java index f804a7b06f..de4df1f5b9 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetPredicatesHelper.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetPredicatesHelper.java @@ -17,6 +17,7 @@ */ package org.apache.drill.exec.expr.stat; +import org.apache.parquet.Preconditions; import org.apache.parquet.column.statistics.Statistics; /** @@ -28,7 +29,7 @@ private ParquetPredicatesHelper() { /** * @param stat statistics object - * @return true if the input stat object has valid statistics; false otherwise + * @return <tt>true</tt> if the input stat object has valid statistics; false otherwise */ static boolean isNullOrEmpty(Statistics stat) { return stat == null || stat.isEmpty(); @@ -39,22 +40,21 @@ static boolean isNullOrEmpty(Statistics stat) { * * @param stat parquet column statistics * @param rowCount number of rows in the parquet file - * @return True if all rows are null in the parquet file - * False if at least one row is not null. + * @return <tt>true</tt> if all rows are null in the parquet file and <tt>false</tt> otherwise */ static boolean isAllNulls(Statistics stat, long rowCount) { - return stat.isNumNullsSet() && stat.getNumNulls() == rowCount; + Preconditions.checkArgument(rowCount >= 0, String.format("negative rowCount %d is not valid", rowCount)); + return stat.getNumNulls() == rowCount; } /** - * Checks that column chunk's statistics has at least one null + * Checks that column chunk's statistics does not have nulls * * @param stat parquet column statistics - * @return True if the parquet file has nulls - * False if the parquet file hasn't nulls. + * @return <tt>true</tt> if the parquet file does not have nulls and <tt>false</tt> otherwise */ static boolean hasNoNulls(Statistics stat) { - return !stat.isNumNullsSet() || stat.getNumNulls() == 0; + return stat.getNumNulls() <= 0; } } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java index 40203f5989..a7f78fb40c 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java @@ -32,7 +32,7 @@ import org.apache.parquet.SemanticVersion; import org.apache.parquet.VersionParser; import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.IntStatistics; import org.apache.parquet.format.ConvertedType; import org.apache.parquet.format.FileMetaData; import org.apache.parquet.format.SchemaElement; @@ -417,16 +417,9 @@ public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(Parquet // column does not appear in this file, skip it continue; } - Statistics statistics = footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics(); - Integer max = (Integer) statistics.genericGetMax(); - if (statistics.hasNonNullValue()) { - if (max > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) { - return DateCorruptionStatus.META_SHOWS_CORRUPTION; - } - } else { - // no statistics, go check the first page - return DateCorruptionStatus.META_UNCLEAR_TEST_VALUES; - } + IntStatistics statistics = (IntStatistics) footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics(); + return (statistics.hasNonNullValue() && statistics.compareMaxToValue(ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) > 0) ? + DateCorruptionStatus.META_SHOWS_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES; } } } diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java index a61cc18328..225916999d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java @@ -473,7 +473,7 @@ private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 // Write stats when they are not null Object minValue = null; Object maxValue = null; - if (stats.genericGetMax() != null && stats.genericGetMin() != null ) { + if (stats.hasNonNullValue()) { minValue = stats.genericGetMin(); maxValue = stats.genericGetMax(); if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services