This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 085034f  PARQUET-1494: [C++] Recognize statistics built with UNSIGNED sort order by parquet-mr 1.10.0 onwards
085034f is described below

commit 085034f5ced88a8696c01327d3614e77b7fdfe07
Author: Ildar Musin <[email protected]>
AuthorDate: Wed Jan 23 13:49:44 2019 -0600

    PARQUET-1494: [C++] Recognize statistics built with UNSIGNED sort order by parquet-mr 1.10.0 onwards
    
    Fixes the issue with min-max statistics built by parquet-mr 1.10+:
    https://issues.apache.org/jira/browse/PARQUET-1494
    
    Author: Ildar Musin <[email protected]>
    
    Closes #3441 from zilder/fix_parquet_1494 and squashes the following commits:
    
    c5ba5fc07 <Ildar Musin> Test case for min-max statistics on binary column built with parquet-mr
    029f73047 <Ildar Musin> PARQUET-1494: Recognize statistics built with UNSIGNED sort order by parquet-mr 1.10.0 onwards
---
 cpp/src/parquet/metadata.cc        | 11 +++++++++--
 cpp/src/parquet/metadata.h         |  1 +
 cpp/src/parquet/statistics-test.cc | 28 ++++++++++++++++++++++++++++
 cpp/submodules/parquet-testing     |  2 +-
 4 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index cc0bfec..1641afb 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -47,6 +47,11 @@ const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION()
   return version;
 }
 
+const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() {
+  static ApplicationVersion version("parquet-mr", 1, 10, 0);
+  return version;
+}
+
 std::string ParquetVersionToString(ParquetVersion::type ver) {
   switch (ver) {
     case ParquetVersion::PARQUET_1_0:
@@ -547,8 +552,10 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons
 bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
                                               EncodedStatistics& statistics,
                                               SortOrder::type sort_order) const {
-  // Parquet cpp version 1.3.0 onwards stats are computed correctly for all types
-  if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION()))) {
+  // parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed
+  // correctly for all types
+  if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) ||
+      (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
     // Only SIGNED are valid unless max and min are the same
     // (in which case the sort order does not matter)
     bool max_equals_min = statistics.has_min && statistics.has_max
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 209c75a..3171458 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -43,6 +43,7 @@ class PARQUET_EXPORT ApplicationVersion {
   static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
   static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
   static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
+  static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
   // Regular expression for the version format
   // major . minor . patch unknown - prerelease.x + build info
   // Eg: 1.5.0ab-cdh5.5.0+cd
diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc
index e1926a3..ecdbaeb 100644
--- a/cpp/src/parquet/statistics-test.cc
+++ b/cpp/src/parquet/statistics-test.cc
@@ -772,5 +772,33 @@ TEST(TestStatisticsDoubleNaN, NaNValues) {
   ASSERT_EQ(min, -3.0);
   ASSERT_EQ(max, 4.0);
 }
+
+// Test statistics for binary column with UNSIGNED sort order
+TEST(TestStatisticsMinMax, Unsigned) {
+  std::string dir_string(test::get_data_dir());
+  std::stringstream ss;
+  ss << dir_string << "/binary.parquet";
+  auto path = ss.str();
+
+  // The file is generated by parquet-mr 1.10.0, the first version that
+  // supports correct statistics for binary data (see PARQUET-1025). It
+  // contains a single column of binary type. Data is just single byte values
+  // from 0x00 to 0x0B.
+  auto file_reader = ParquetFileReader::OpenFile(path);
+  auto rg_reader = file_reader->RowGroup(0);
+  auto metadata = rg_reader->metadata();
+  auto column_schema = metadata->schema()->Column(0);
+  ASSERT_EQ(SortOrder::UNSIGNED, column_schema->sort_order());
+
+  auto column_chunk = metadata->ColumnChunk(0);
+  ASSERT_TRUE(column_chunk->is_stats_set());
+
+  std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
+  ASSERT_TRUE(stats != NULL);
+  ASSERT_EQ(0, stats->null_count());
+  ASSERT_EQ(12, stats->num_values());
+  ASSERT_EQ(0x00, stats->EncodeMin()[0]);
+  ASSERT_EQ(0x0b, stats->EncodeMax()[0]);
+}
 }  // namespace test
 }  // namespace parquet
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index 8eb0213..bb7b6ab 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit 8eb0213c491752c9bbb1b884fcbb21deb548e464
+Subproject commit bb7b6abbb3fbeff845646364a4286142127be04c

Reply via email to