This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 66d948dc84 GH-36882: [C++][Parquet] Default RLE for bool values in the 
parquet version 2.x (#36955)
66d948dc84 is described below

commit 66d948dc84b1ee7c403a11ebf354ef006e2f636b
Author: Gang Wu <[email protected]>
AuthorDate: Fri Sep 1 00:55:37 2023 +0800

    GH-36882: [C++][Parquet] Default RLE for bool values in the parquet version 
2.x (#36955)
    
    ### Rationale for this change
    
    RLE is usually more efficient than PLAIN encoding for boolean columns, and 
it is already enabled by default in parquet-mr and arrow-rs.
    
    ### What changes are included in this PR?
    
    * Slight breaking change in ColumnProperties to set default encoding to 
UNKNOWN (used to be PLAIN).
    * If UNKNOWN is given, let the column writer decide the column encoding 
according to the selected Parquet format version and the column type.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Yes, default encoding of boolean type has been switched to RLE when the 
selected Parquet format version is at least 2.0 (the current default version is 
2.6). It used to always be PLAIN.
    
    * Closes: #36882
    
    Authored-by: Gang Wu <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/column_writer.cc              |  6 +++
 cpp/src/parquet/column_writer_test.cc         | 53 +++++++++++++++++++--------
 cpp/src/parquet/properties.h                  |  2 +-
 python/pyarrow/tests/parquet/test_metadata.py |  2 +-
 4 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 3fca5542a0..ae9216ba7c 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -2332,6 +2332,12 @@ std::shared_ptr<ColumnWriter> 
ColumnWriter::Make(ColumnChunkMetaDataBuilder* met
   const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
                               descr->physical_type() != Type::BOOLEAN;
   Encoding::type encoding = properties->encoding(descr->path());
+  if (encoding == Encoding::UNKNOWN) {
+    encoding = (descr->physical_type() == Type::BOOLEAN &&
+                properties->version() != ParquetVersion::PARQUET_1_0)
+                   ? Encoding::RLE
+                   : Encoding::PLAIN;
+  }
   if (use_dictionary) {
     encoding = properties->dictionary_index_encoding();
   }
diff --git a/cpp/src/parquet/column_writer_test.cc 
b/cpp/src/parquet/column_writer_test.cc
index 58199c402b..b2fd9cf930 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -203,8 +203,15 @@ class TestPrimitiveWriter : public 
PrimitiveTypedTest<TestType> {
 
     if (this->type_num() == Type::BOOLEAN) {
       // Dictionary encoding is not allowed for boolean type
-      // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case
-      std::set<Encoding::type> expected({Encoding::PLAIN, Encoding::RLE});
+      std::set<Encoding::type> expected;
+      if (version == ParquetVersion::PARQUET_1_0) {
+        // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding 
case for
+        // version 1.0. Note that RLE is used for DL/RL.
+        expected = {Encoding::PLAIN, Encoding::RLE};
+      } else {
+        // There is only 1 encoding (RLE) in a fallback case for version 2.0
+        expected = {Encoding::RLE};
+      }
       ASSERT_EQ(encodings, expected);
     } else if (version == ParquetVersion::PARQUET_1_0) {
       // There are 3 encodings (PLAIN_DICTIONARY, PLAIN, RLE) in a fallback 
case
@@ -223,7 +230,8 @@ class TestPrimitiveWriter : public 
PrimitiveTypedTest<TestType> {
     std::vector<parquet::PageEncodingStats> encoding_stats =
         this->metadata_encoding_stats();
     if (this->type_num() == Type::BOOLEAN) {
-      ASSERT_EQ(encoding_stats[0].encoding, Encoding::PLAIN);
+      ASSERT_EQ(encoding_stats[0].encoding,
+                version == ParquetVersion::PARQUET_1_0 ? Encoding::PLAIN : 
Encoding::RLE);
       ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE);
     } else if (version == ParquetVersion::PARQUET_1_0) {
       std::vector<Encoding::type> expected(
@@ -757,21 +765,36 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) 
{
   ASSERT_EQ(0, this->values_read_);
 }
 
+class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
+ public:
+  void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding) 
{
+    this->SetUpSchema(Repetition::REQUIRED);
+    auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version,
+                                    /*enable_checksum*/ false);
+    for (int i = 0; i < SMALL_SIZE; i++) {
+      bool value = (i % 2 == 0) ? true : false;
+      writer->WriteBatch(1, nullptr, nullptr, &value);
+    }
+    writer->Close();
+    this->ReadColumn();
+    for (int i = 0; i < SMALL_SIZE; i++) {
+      ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i;
+    }
+    const auto& encodings = this->metadata_encodings();
+    auto iter = std::find(encodings.begin(), encodings.end(), encoding);
+    ASSERT_TRUE(iter != encodings.end());
+  }
+};
+
 // PARQUET-764
 // Correct bitpacking for boolean write at non-byte boundaries
-using TestBooleanValuesWriter = TestPrimitiveWriter<BooleanType>;
 TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) {
-  this->SetUpSchema(Repetition::REQUIRED);
-  auto writer = this->BuildWriter();
-  for (int i = 0; i < SMALL_SIZE; i++) {
-    bool value = (i % 2 == 0) ? true : false;
-    writer->WriteBatch(1, nullptr, nullptr, &value);
-  }
-  writer->Close();
-  this->ReadColumn();
-  for (int i = 0; i < SMALL_SIZE; i++) {
-    ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i;
-  }
+  TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN);
+}
+
+// Default encoding for boolean is RLE when using V2 pages
+TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) {
+  TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE);
 }
 
 // PARQUET-979
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index bd7eb9dc7a..bdc5b15332 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -135,7 +135,7 @@ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
 static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
 static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
 static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
-static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
+static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
 static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
 static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = 
Compression::UNCOMPRESSED;
 static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
diff --git a/python/pyarrow/tests/parquet/test_metadata.py 
b/python/pyarrow/tests/parquet/test_metadata.py
index 3efaf1dbf5..82182b0449 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -128,7 +128,7 @@ def test_parquet_metadata_api():
     assert col_meta.is_stats_set is True
     assert isinstance(col_meta.statistics, pq.Statistics)
     assert col_meta.compression == 'SNAPPY'
-    assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
+    assert set(col_meta.encodings) == {'RLE'}
     assert col_meta.has_dictionary_page is False
     assert col_meta.dictionary_page_offset is None
     assert col_meta.data_page_offset > 0

Reply via email to