This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 085034f PARQUET-1494: [C++] Recognize statistics built with UNSIGNED
sort order by parquet-mr 1.10.0 onwards
085034f is described below
commit 085034f5ced88a8696c01327d3614e77b7fdfe07
Author: Ildar Musin <[email protected]>
AuthorDate: Wed Jan 23 13:49:44 2019 -0600
PARQUET-1494: [C++] Recognize statistics built with UNSIGNED sort order by
parquet-mr 1.10.0 onwards
Fixes the issue with min-max statistics built by parquet-mr 1.10+:
https://issues.apache.org/jira/browse/PARQUET-1494
Author: Ildar Musin <[email protected]>
Closes #3441 from zilder/fix_parquet_1494 and squashes the following
commits:
c5ba5fc07 <Ildar Musin> Test case for min-max statistics on binary column
built with parquet-mr
029f73047 <Ildar Musin> PARQUET-1494: Recognize statistics built with
UNSIGNED sort order by parquet-mr 1.10.0 onwards
---
cpp/src/parquet/metadata.cc | 11 +++++++++--
cpp/src/parquet/metadata.h | 1 +
cpp/src/parquet/statistics-test.cc | 28 ++++++++++++++++++++++++++++
cpp/submodules/parquet-testing | 2 +-
4 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index cc0bfec..1641afb 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -47,6 +47,11 @@ const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION()
return version;
}
+const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 10, 0);
+ return version;
+}
+
std::string ParquetVersionToString(ParquetVersion::type ver) {
switch (ver) {
case ParquetVersion::PARQUET_1_0:
@@ -547,8 +552,10 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons
bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
EncodedStatistics& statistics,
SortOrder::type sort_order) const {
- // Parquet cpp version 1.3.0 onwards stats are computed correctly for all types
- if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION()))) {
+ // parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed
+ // correctly for all types
+ if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) ||
+ (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
// Only SIGNED are valid unless max and min are the same
// (in which case the sort order does not matter)
bool max_equals_min = statistics.has_min && statistics.has_max
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 209c75a..3171458 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -43,6 +43,7 @@ class PARQUET_EXPORT ApplicationVersion {
static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
+ static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
// Regular expression for the version format
// major . minor . patch unknown - prerelease.x + build info
// Eg: 1.5.0ab-cdh5.5.0+cd
diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc
index e1926a3..ecdbaeb 100644
--- a/cpp/src/parquet/statistics-test.cc
+++ b/cpp/src/parquet/statistics-test.cc
@@ -772,5 +772,33 @@ TEST(TestStatisticsDoubleNaN, NaNValues) {
ASSERT_EQ(min, -3.0);
ASSERT_EQ(max, 4.0);
}
+
+// Test statistics for binary column with UNSIGNED sort order
+TEST(TestStatisticsMinMax, Unsigned) {
+ std::string dir_string(test::get_data_dir());
+ std::stringstream ss;
+ ss << dir_string << "/binary.parquet";
+ auto path = ss.str();
+
+ // The file is generated by parquet-mr 1.10.0, the first version that
+ // supports correct statistics for binary data (see PARQUET-1025). It
+ // contains a single column of binary type. Data is just single byte values
+ // from 0x00 to 0x0B.
+ auto file_reader = ParquetFileReader::OpenFile(path);
+ auto rg_reader = file_reader->RowGroup(0);
+ auto metadata = rg_reader->metadata();
+ auto column_schema = metadata->schema()->Column(0);
+ ASSERT_EQ(SortOrder::UNSIGNED, column_schema->sort_order());
+
+ auto column_chunk = metadata->ColumnChunk(0);
+ ASSERT_TRUE(column_chunk->is_stats_set());
+
+ std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
+ ASSERT_TRUE(stats != NULL);
+ ASSERT_EQ(0, stats->null_count());
+ ASSERT_EQ(12, stats->num_values());
+ ASSERT_EQ(0x00, stats->EncodeMin()[0]);
+ ASSERT_EQ(0x0b, stats->EncodeMax()[0]);
+}
} // namespace test
} // namespace parquet
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index 8eb0213..bb7b6ab 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit 8eb0213c491752c9bbb1b884fcbb21deb548e464
+Subproject commit bb7b6abbb3fbeff845646364a4286142127be04c