This is an automated email from the ASF dual-hosted git repository.
zivanfi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new a69f2b3 PARQUET-1365: Don't write page level statistics (#549)
a69f2b3 is described below
commit a69f2b30cd3c581588977ea4c93a53989e9c031c
Author: Gabor Szadovszky <[email protected]>
AuthorDate: Mon Nov 19 13:15:39 2018 +0100
PARQUET-1365: Don't write page level statistics (#549)
Page level statistics were never used in production and became pointless
after adding column indexes.
---
.../format/converter/ParquetMetadataConverter.java | 47 ++++++++++++++++------
.../parquet/hadoop/ColumnChunkPageWriteStore.java | 4 +-
.../apache/parquet/hadoop/ParquetFileWriter.java | 5 +--
.../hadoop/TestColumnChunkPageWriteStore.java | 1 -
4 files changed, 38 insertions(+), 19 deletions(-)
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 58ae503..b9c8996 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -1334,12 +1334,13 @@ public class ParquetMetadataConverter {
writePageHeader(newDataPageHeader(uncompressedSize,
compressedSize,
valueCount,
- new org.apache.parquet.column.statistics.BooleanStatistics(),
rlEncoding,
dlEncoding,
valuesEncoding), to);
}
+ // Statistics are no longer saved in page headers
+ @Deprecated
public void writeDataPageHeader(
int uncompressedSize,
int compressedSize,
@@ -1350,7 +1351,7 @@ public class ParquetMetadataConverter {
org.apache.parquet.column.Encoding valuesEncoding,
OutputStream to) throws IOException {
writePageHeader(
- newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics,
+ newDataPageHeader(uncompressedSize, compressedSize, valueCount,
rlEncoding, dlEncoding, valuesEncoding),
to);
}
@@ -1358,7 +1359,6 @@ public class ParquetMetadataConverter {
private PageHeader newDataPageHeader(
int uncompressedSize, int compressedSize,
int valueCount,
- org.apache.parquet.column.statistics.Statistics statistics,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding) {
@@ -1369,12 +1369,11 @@ public class ParquetMetadataConverter {
getEncoding(valuesEncoding),
getEncoding(dlEncoding),
getEncoding(rlEncoding)));
- if (!statistics.isEmpty()) {
- pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics));
- }
return pageHeader;
}
+ // Statistics are no longer saved in page headers
+ @Deprecated
public void writeDataPageV2Header(
int uncompressedSize, int compressedSize,
int valueCount, int nullCount, int rowCount,
@@ -1386,7 +1385,36 @@ public class ParquetMetadataConverter {
newDataPageV2Header(
uncompressedSize, compressedSize,
valueCount, nullCount, rowCount,
- statistics,
+ dataEncoding,
+ rlByteLength, dlByteLength), to);
+ }
+
+ public void writeDataPageV1Header(
+ int uncompressedSize,
+ int compressedSize,
+ int valueCount,
+ org.apache.parquet.column.Encoding rlEncoding,
+ org.apache.parquet.column.Encoding dlEncoding,
+ org.apache.parquet.column.Encoding valuesEncoding,
+ OutputStream to) throws IOException {
+ writePageHeader(newDataPageHeader(uncompressedSize,
+ compressedSize,
+ valueCount,
+ rlEncoding,
+ dlEncoding,
+ valuesEncoding), to);
+ }
+
+ public void writeDataPageV2Header(
+ int uncompressedSize, int compressedSize,
+ int valueCount, int nullCount, int rowCount,
+ org.apache.parquet.column.Encoding dataEncoding,
+ int rlByteLength, int dlByteLength,
+ OutputStream to) throws IOException {
+ writePageHeader(
+ newDataPageV2Header(
+ uncompressedSize, compressedSize,
+ valueCount, nullCount, rowCount,
dataEncoding,
rlByteLength, dlByteLength), to);
}
@@ -1394,7 +1422,6 @@ public class ParquetMetadataConverter {
private PageHeader newDataPageV2Header(
int uncompressedSize, int compressedSize,
int valueCount, int nullCount, int rowCount,
- org.apache.parquet.column.statistics.Statistics<?> statistics,
org.apache.parquet.column.Encoding dataEncoding,
int rlByteLength, int dlByteLength) {
// TODO: pageHeader.crc = ...;
@@ -1402,10 +1429,6 @@ public class ParquetMetadataConverter {
valueCount, nullCount, rowCount,
getEncoding(dataEncoding),
dlByteLength, rlByteLength);
- if (!statistics.isEmpty()) {
- dataPageHeaderV2.setStatistics(
- toParquetStatistics(statistics));
- }
PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2,
uncompressedSize, compressedSize);
pageHeader.setData_page_header_v2(dataPageHeaderV2);
return pageHeader;
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
index 85bdbdb..f87630b 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
@@ -119,11 +119,10 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
+ compressedSize);
}
tempOutputStream.reset();
- parquetMetadataConverter.writeDataPageHeader(
+ parquetMetadataConverter.writeDataPageV1Header(
(int)uncompressedSize,
(int)compressedSize,
valueCount,
- statistics,
rlEncoding,
dlEncoding,
valuesEncoding,
@@ -171,7 +170,6 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
parquetMetadataConverter.writeDataPageV2Header(
uncompressedSize, compressedSize,
valueCount, nullCount, rowCount,
- statistics,
dataEncoding,
rlByteLength,
dlByteLength,
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
index a8cd686..20efe47 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
@@ -433,7 +433,7 @@ public class ParquetFileWriter {
long beforeHeader = out.getPos();
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int)bytes.size();
- metadataConverter.writeDataPageHeader(
+ metadataConverter.writeDataPageV1Header(
uncompressedPageSize, compressedPageSize,
valueCount,
rlEncoding,
@@ -518,10 +518,9 @@ public class ParquetFileWriter {
}
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int) bytes.size();
- metadataConverter.writeDataPageHeader(
+ metadataConverter.writeDataPageV1Header(
uncompressedPageSize, compressedPageSize,
valueCount,
- statistics,
rlEncoding,
dlEncoding,
valuesEncoding,
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
index 9a27def..c353ee3 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
@@ -189,7 +189,6 @@ public class TestColumnChunkPageWriteStore {
assertEquals(r, intValue(page.getRepetitionLevels()));
assertEquals(dataEncoding, page.getDataEncoding());
assertEquals(v, intValue(page.getData()));
- assertEquals(statistics.toString(), page.getStatistics().toString());
// Checking column/offset indexes for the one page
ColumnChunkMetaData column =
footer.getBlocks().get(0).getColumns().get(0);