Repository: parquet-mr Updated Branches: refs/heads/master 8bbc6cb95 -> b82d96218
PARQUET-1217: Incorrect handling of missing values in Statistics In parquet-format every value in Statistics is optional while parquet-mr does not properly handle these scenarios: - null_count is set but min/max or min_value/max_value are not: filtering may fail with NPE or incorrect filtering occurs fix: check if min/max is set before comparing to the related values - null_count is not set: filtering handles null_count as if it would be 0 -> incorrect filtering may occur fix: introduce new method in Statistics object to check if num_nulls is set; check if num_nulls is set by the new method before using its value for filtering Author: Gabor Szadovszky <gabor.szadovs...@cloudera.com> Closes #458 from gszadovszky/PARQUET-1217 and squashes the following commits: 9d14090 [Gabor Szadovszky] Updates according to rdblue's comments 116d1d3 [Gabor Szadovszky] PARQUET-1217: Updates according to zi's comments c264b50 [Gabor Szadovszky] PARQUET-1217: fix handling of unset nullCount 2ec2fb1 [Gabor Szadovszky] PARQUET-1217: Incorrect handling of missing values in Statistics Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/b82d9621 Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/b82d9621 Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/b82d9621 Branch: refs/heads/master Commit: b82d96218bfd37f6df95a2e8d7675d091ab61970 Parents: 8bbc6cb Author: Gabor Szadovszky <gabor.szadovs...@cloudera.com> Authored: Tue Feb 27 14:19:14 2018 +0100 Committer: Zoltan Ivanfi <z...@cloudera.com> Committed: Tue Feb 27 14:19:14 2018 +0100 ---------------------------------------------------------------------- .../cli/commands/ParquetMetadataCommand.java | 4 +- .../parquet/cli/commands/ShowPagesCommand.java | 2 +- .../parquet/column/statistics/Statistics.java | 78 ++++++++++++++++++-- .../column/statistics/TestStatistics.java | 1 + .../statisticslevel/StatisticsFilter.java | 42 ++++++++++- .../converter/ParquetMetadataConverter.java | 19 +++-- .../statisticslevel/TestStatisticsFilter.java | 64 +++++++++++++++- .../converter/TestParquetMetadataConverter.java | 33 +++++++++ .../hadoop/TestColumnChunkPageWriteStore.java | 4 +- .../parquet/hadoop/TestParquetFileWriter.java | 48 ++++++------ 10 files changed, 249 insertions(+), 46 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java index 0bd77a3..54fe657 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java @@ -168,12 +168,12 @@ public class ParquetMetadataCommand extends BaseCommand { if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s", name, type.getTypeLength(), shortCodec(codec), encodingSummary, count, - humanReadable(perValue), stats == null ? "" : String.valueOf(stats.getNumNulls()), + humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()), minMaxAsString(stats, type.getOriginalType()))); } else { console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s", name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue), - stats == null ? "" : String.valueOf(stats.getNumNulls()), + stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()), minMaxAsString(stats, type.getOriginalType()))); } } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java ---------------------------------------------------------------------- diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java index beda452..4d0e2c9 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java @@ -191,7 +191,7 @@ public class ShowPagesCommand extends BaseCommand { String enc = encodingAsString(page.getValueEncoding(), false); long totalSize = page.getCompressedSize(); int count = page.getValueCount(); - long numNulls = page.getStatistics().getNumNulls(); + String numNulls = page.getStatistics().isNumNullsSet() ? Long.toString(page.getStatistics().getNumNulls()) : ""; float perValue = ((float) totalSize) / count; String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType()); return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8s %-7s %s", http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 00d0bbf..a087c5f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -35,6 +35,44 @@ import org.apache.parquet.schema.Type; */ public abstract class Statistics<T extends Comparable<T>> { + /** + * Builder class to build Statistics objects. Used to read the statistics from the Parquet file. + */ + public static class Builder { + private final PrimitiveType type; + private byte[] min; + private byte[] max; + private long numNulls = -1; + + private Builder(PrimitiveType type) { + this.type = type; + } + + public Builder withMin(byte[] min) { + this.min = min; + return this; + } + + public Builder withMax(byte[] max) { + this.max = max; + return this; + } + + public Builder withNumNulls(long numNulls) { + this.numNulls = numNulls; + return this; + } + + public Statistics<?> build() { + Statistics<?> stats = createStats(type); + if (min != null && max != null) { + stats.setMinMaxFromBytes(min, max); + } + stats.num_nulls = this.numNulls; + return stats; + } + } + private final PrimitiveType type; private final PrimitiveComparator<T> comparator; private boolean hasNonNullValue; @@ -110,6 +148,17 @@ public abstract class Statistics<T extends Comparable<T>> { } /** + * Returns a builder to create new statistics object. Used to read the statistics from the parquet file. + * + * @param type + * type of the column + * @return builder to create new statistics object + */ + public static Builder getBuilder(PrimitiveType type) { + return new Builder(type); + } + + /** * updates statistics min and max using the passed value * @param value value to use to update min and max */ @@ -217,7 +266,9 @@ public abstract class Statistics<T extends Comparable<T>> { * Abstract method to set min and max values from byte arrays. * @param minBytes byte array to set the min value to * @param maxBytes byte array to set the max value to + * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead. */ + @Deprecated abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes); /** @@ -310,9 +361,13 @@ public abstract class Statistics<T extends Comparable<T>> { @Override public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls()); - else if (!this.isEmpty()) + if (this.hasNonNullValue()) { + if (isNumNullsSet()) { + return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls()); + } else { + return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString()); + } + } else if (!this.isEmpty()) return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); else return "no stats for this column"; @@ -335,7 +390,7 @@ public abstract class Statistics<T extends Comparable<T>> { /** * Returns the null count - * @return null count + * @return null count or {@code -1} if the null count is not set */ public long getNumNulls() { return num_nulls; @@ -343,8 +398,12 @@ public abstract class Statistics<T extends Comparable<T>> { /** * Sets the number of nulls to the parameter value - * @param nulls null count to set the count to + * + * @param nulls + * null count to set the count to + * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead. */ + @Deprecated public void setNumNulls(long nulls) { num_nulls = nulls; } @@ -355,7 +414,7 @@ public abstract class Statistics<T extends Comparable<T>> { * @return true if object is empty, false otherwise */ public boolean isEmpty() { - return !hasNonNullValue && num_nulls == 0; + return !hasNonNullValue && !isNumNullsSet(); } /** @@ -366,6 +425,13 @@ public abstract class Statistics<T extends Comparable<T>> { } /** + * @return whether numNulls is set and can be used + */ + public boolean isNumNullsSet() { + return num_nulls >= 0; + } + + /** * Sets the page/column as having a valid non-null value * kind of misnomer here */ http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java index 8ca1ca6..5e5d5fd 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java @@ -42,6 +42,7 @@ public class TestStatistics { @Test public void testNumNulls() { IntStatistics stats = new IntStatistics(); + assertTrue(stats.isNumNullsSet()); assertEquals(stats.getNumNulls(), 0); stats.incrementNumNulls(); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index f168a60..446c8a3 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -40,7 +40,6 @@ import org.apache.parquet.filter2.predicate.Operators.UserDefined; import org.apache.parquet.filter2.predicate.UserDefinedPredicate; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import static org.apache.parquet.Preconditions.checkArgument; import static org.apache.parquet.Preconditions.checkNotNull; /** @@ -122,6 +121,10 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> { } if (value == null) { + // We don't know anything about the nulls in this chunk + if (!stats.isNumNullsSet()) { + return BLOCK_MIGHT_MATCH; + } // we are looking for records where v eq(null) // so drop if there are no nulls in this chunk return !hasNulls(meta); @@ -133,6 +136,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + // drop if value < min || value > max return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0; } @@ -166,12 +174,17 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> { return isAllNulls(meta); } - if (hasNulls(meta)) { + if (stats.isNumNullsSet() && hasNulls(meta)) { // we are looking for records where v notEq(someNonNull) // but this chunk contains nulls, we cannot drop it return BLOCK_MIGHT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + // drop if this is a column where min = max = value return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0; } @@ -201,6 +214,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + T value = lt.getValue(); // drop if value <= min @@ -232,6 +250,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + T value = ltEq.getValue(); // drop if value < min @@ -263,6 +286,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + T value = gt.getValue(); // drop if value >= max @@ -294,6 +322,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> { return BLOCK_CANNOT_MATCH; } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + T value = gtEq.getValue(); // drop if value > max @@ -355,6 +388,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> { } } + if (!stats.hasNonNullValue()) { + // stats does not contain min/max values, we cannot drop any chunks + return BLOCK_MIGHT_MATCH; + } + org.apache.parquet.filter2.predicate.Statistics<T> udpStats = new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax(), stats.comparator()); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index c4e5da3..0daabb6 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -401,7 +401,8 @@ public class ParquetMetadataConverter { static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal (String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { // create stats object based on the column type - org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type); + org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = + org.apache.parquet.column.statistics.Statistics.getBuilder(type); if (formatStats != null) { // Use the new V2 min-max statistics over the former one if it is filled @@ -409,9 +410,12 @@ public class ParquetMetadataConverter { byte[] min = formatStats.min_value.array(); byte[] max = formatStats.max_value.array(); if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { - stats.setMinMaxFromBytes(min, max); + statsBuilder.withMin(min); + statsBuilder.withMax(max); + } + if (formatStats.isSetNull_count()) { + statsBuilder.withNumNulls(formatStats.null_count); } - stats.setNumNulls(formatStats.null_count); } else { boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; @@ -424,13 +428,16 @@ public class ParquetMetadataConverter { if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && (sortOrdersMatch || maxEqualsMin)) { if (isSet) { - stats.setMinMaxFromBytes(formatStats.min.array(), formatStats.max.array()); + statsBuilder.withMin(formatStats.min.array()); + statsBuilder.withMax(formatStats.max.array()); + } + if (formatStats.isSetNull_count()) { + statsBuilder.withNumNulls(formatStats.null_count); } - stats.setNumNulls(formatStats.null_count); } } } - return stats; + return statsBuilder.build(); } public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java index d8b4407..6fdec2a 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java @@ -22,7 +22,6 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; -import org.apache.parquet.io.api.Binary; import org.junit.Test; import org.apache.parquet.column.Encoding; @@ -39,6 +38,7 @@ import org.apache.parquet.filter2.predicate.UserDefinedPredicate; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn; import static org.apache.parquet.io.api.Binary.fromString; @@ -62,7 +62,8 @@ import static org.apache.parquet.filter2.statisticslevel.StatisticsFilter.canDro public class TestStatisticsFilter { - private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long valueCount) { + private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats, + long valueCount) { return ColumnChunkMetaData.get(ColumnPath.get("int", "column"), PrimitiveTypeName.INT32, CompressionCodecName.GZIP, @@ -71,7 +72,8 @@ public class TestStatisticsFilter { 0L, 0L, valueCount, 0L, 0L); } - private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, long valueCount) { + private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats, + long valueCount) { return ColumnChunkMetaData.get(ColumnPath.get("double", "column"), PrimitiveTypeName.DOUBLE, CompressionCodecName.GZIP, @@ -87,13 +89,16 @@ public class TestStatisticsFilter { private static final IntStatistics intStats = new IntStatistics(); private static final IntStatistics nullIntStats = new IntStatistics(); + private static final org.apache.parquet.column.statistics.Statistics<?> emptyIntStats = org.apache.parquet.column.statistics.Statistics + .getBuilder(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build(); private static final DoubleStatistics doubleStats = new DoubleStatistics(); + private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics + .getBuilder(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build(); static { intStats.setMinMax(10, 100); doubleStats.setMinMax(10, 100); - nullIntStats.setMinMax(0, 0); nullIntStats.setNumNulls(177); } @@ -105,6 +110,9 @@ public class TestStatisticsFilter { getIntColumnMeta(nullIntStats, 177L), // column of all nulls getDoubleColumnMeta(doubleStats, 177L)); + private static final List<ColumnChunkMetaData> missingMinMaxColumnMetas = Arrays.asList( + getIntColumnMeta(emptyIntStats, 177L), // missing min/max values and numNulls => stats is empty + getDoubleColumnMeta(missingMinMaxDoubleStats, 177L)); // missing min/max, some null values @Test public void testEqNonNull() { @@ -116,6 +124,9 @@ public class TestStatisticsFilter { // drop columns of all nulls when looking for non-null value assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas)); assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(eq(intColumn, 50), missingMinMaxColumnMetas)); + assertFalse(canDrop(eq(doubleColumn, 50.0), missingMinMaxColumnMetas)); } @Test @@ -137,6 +148,9 @@ public class TestStatisticsFilter { getDoubleColumnMeta(doubleStats, 177L)))); assertFalse(canDrop(eq(missingColumn, null), columnMetas)); + + assertFalse(canDrop(eq(intColumn, null), missingMinMaxColumnMetas)); + assertFalse(canDrop(eq(doubleColumn, null), missingMinMaxColumnMetas)); } @Test @@ -163,6 +177,9 @@ public class TestStatisticsFilter { getDoubleColumnMeta(doubleStats, 177L)))); assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(notEq(intColumn, 50), missingMinMaxColumnMetas)); + assertFalse(canDrop(notEq(doubleColumn, 50.0), missingMinMaxColumnMetas)); } @Test @@ -192,6 +209,9 @@ public class TestStatisticsFilter { getDoubleColumnMeta(doubleStats, 177L)))); assertTrue(canDrop(notEq(missingColumn, null), columnMetas)); + + assertFalse(canDrop(notEq(intColumn, null), missingMinMaxColumnMetas)); + assertFalse(canDrop(notEq(doubleColumn, null), missingMinMaxColumnMetas)); } @Test @@ -205,6 +225,9 @@ public class TestStatisticsFilter { assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(lt(intColumn, 0), missingMinMaxColumnMetas)); + assertFalse(canDrop(lt(doubleColumn, 0.0), missingMinMaxColumnMetas)); } @Test @@ -218,6 +241,9 @@ public class TestStatisticsFilter { assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(ltEq(intColumn, -1), missingMinMaxColumnMetas)); + assertFalse(canDrop(ltEq(doubleColumn, -0.1), missingMinMaxColumnMetas)); } @Test @@ -231,6 +257,9 @@ public class TestStatisticsFilter { assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(gt(intColumn, 0), missingMinMaxColumnMetas)); + assertFalse(canDrop(gt(doubleColumn, 0.0), missingMinMaxColumnMetas)); } @Test @@ -244,6 +273,9 @@ public class TestStatisticsFilter { assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas)); + + assertFalse(canDrop(gtEq(intColumn, 1), missingMinMaxColumnMetas)); + assertFalse(canDrop(gtEq(doubleColumn, 0.1), missingMinMaxColumnMetas)); } @Test @@ -297,6 +329,26 @@ public class TestStatisticsFilter { } } + public static class AllPositiveUdp extends UserDefinedPredicate<Double> { + @Override + public boolean keep(Double value) { + if (value == null) { + return true; + } + throw new RuntimeException("this method should not be called with value != null"); + } + + @Override + public boolean canDrop(Statistics<Double> statistics) { + return statistics.getMin() <= 0.0; + } + + @Override + public boolean inverseCanDrop(Statistics<Double> statistics) { + return statistics.getMin() > 0.0; + } + } + @Test public void testUdp() { FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class); @@ -308,6 +360,8 @@ public class TestStatisticsFilter { FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class); FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class))); + FilterPredicate allPositivePred = userDefined(doubleColumn, AllPositiveUdp.class); + IntStatistics seven = new IntStatistics(); seven.setMinMax(7, 7); @@ -392,6 +446,8 @@ public class TestStatisticsFilter { assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList( getIntColumnMeta(neither, 177L), getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(allPositivePred, missingMinMaxColumnMetas)); } @Test http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index b83da5d..6cce32f 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -658,6 +658,7 @@ public class TestParquetMetadataConverter { binaryType); Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty()); + Assert.assertTrue(convertedStats.isNumNullsSet()); Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls()); if (helper == StatsHelper.V1) { assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue()); @@ -670,6 +671,38 @@ public class TestParquetMetadataConverter { } @Test + public void testMissingValuesFromStats() { + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + PrimitiveType type = Types.required(PrimitiveTypeName.INT32).named("test_int32"); + + org.apache.parquet.format.Statistics formatStats = new org.apache.parquet.format.Statistics(); + Statistics<?> stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type); + assertFalse(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertTrue(stats.isEmpty()); + assertEquals(-1, stats.getNumNulls()); + + formatStats.clear(); + formatStats.setMin(BytesUtils.intToBytes(-100)); + formatStats.setMax(BytesUtils.intToBytes(100)); + stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type); + assertFalse(stats.isNumNullsSet()); + assertTrue(stats.hasNonNullValue()); + assertFalse(stats.isEmpty()); + assertEquals(-1, stats.getNumNulls()); + assertEquals(-100, stats.genericGetMin()); + assertEquals(100, stats.genericGetMax()); + + formatStats.clear(); + formatStats.setNull_count(2000); + stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type); + assertTrue(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertFalse(stats.isEmpty()); + assertEquals(2000, stats.getNumNulls()); + } + + @Test public void testSkippedV2Stats() { testSkippedV2Stats( Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named(""), http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java index 87574cd..0b7b951 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java @@ -60,6 +60,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Types; import org.apache.parquet.bytes.HeapByteBufferAllocator; @@ -92,7 +93,8 @@ public class TestColumnChunkPageWriteStore { int v = 3; BytesInput definitionLevels = BytesInput.fromInt(d); BytesInput repetitionLevels = BytesInput.fromInt(r); - Statistics<?> statistics = new BinaryStatistics(); + Statistics<?> statistics = Statistics.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")) + .build(); BytesInput data = BytesInput.fromInt(v); int rowCount = 5; int nullCount = 1; http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 4243e9b..c73e569 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -27,7 +27,6 @@ import org.apache.hadoop.fs.Path; import org.apache.parquet.Version; import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel; -import org.apache.parquet.hadoop.util.HadoopOutputFile; import org.junit.Assume; import org.junit.Rule; import org.junit.Test; @@ -48,6 +47,7 @@ import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; import java.io.File; import java.io.IOException; @@ -95,8 +95,8 @@ public class TestParquetFileWriter { private static final byte[] BYTES4 = { 3, 4, 5, 6 }; private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED; - private static final BinaryStatistics STATS1 = new BinaryStatistics(); - private static final BinaryStatistics STATS2 = new BinaryStatistics(); + private static final org.apache.parquet.column.statistics.Statistics<?> EMPTY_STATS = org.apache.parquet.column.statistics.Statistics + .getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build(); private String writeSchema; @@ -145,24 +145,24 @@ public class TestParquetFileWriter { w.startBlock(3); w.startColumn(C1, 5, CODEC); long c1Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c1Ends = w.getPos(); w.startColumn(C2, 6, CODEC); long c2Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c2Ends = w.getPos(); w.endBlock(); w.startBlock(4); w.startColumn(C1, 7, CODEC); - w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.startColumn(C2, 8, CODEC); - w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.endBlock(); w.end(new HashMap<String, String>()); @@ -225,15 +225,15 @@ public class TestParquetFileWriter { w.startBlock(3); w.startColumn(C1, 5, CODEC); long c1Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c1Ends = w.getPos(); w.startColumn(C2, 6, CODEC); long c2Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c2Ends = w.getPos(); w.endBlock(); @@ -242,10 +242,10 @@ public class TestParquetFileWriter { w.startBlock(4); w.startColumn(C1, 7, CODEC); - w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.startColumn(C2, 8, CODEC); - w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.endBlock(); @@ -330,15 +330,15 @@ public class TestParquetFileWriter { w.startBlock(3); w.startColumn(C1, 5, CODEC); long c1Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c1Ends = w.getPos(); w.startColumn(C2, 6, CODEC); long c2Starts = w.getPos(); - w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); - w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); long c2Ends = w.getPos(); w.endBlock(); @@ -347,10 +347,10 @@ public class TestParquetFileWriter { w.startBlock(4); w.startColumn(C1, 7, CODEC); - w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.startColumn(C2, 8, CODEC); - w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN); + w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.endBlock();