This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 66d948dc84 GH-36882: [C++][Parquet] Default RLE for bool values in the
parquet version 2.x (#36955)
66d948dc84 is described below
commit 66d948dc84b1ee7c403a11ebf354ef006e2f636b
Author: Gang Wu <[email protected]>
AuthorDate: Fri Sep 1 00:55:37 2023 +0800
GH-36882: [C++][Parquet] Default RLE for bool values in the parquet version
2.x (#36955)
### Rationale for this change
RLE is usually more efficient than PLAIN encoding for boolean columns, and
it is already enabled by default in parquet-mr and arrow-rs.
### What changes are included in this PR?
* Slight breaking change in ColumnProperties to set default encoding to
UNKNOWN (used to be PLAIN).
* If UNKNOWN is given, let the column writer decide the column encoding
according to the selected Parquet format version and the column type.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes, default encoding of boolean type has been switched to RLE when the
selected Parquet format version is at least 2.0 (the current default version is
2.6). It used to always be PLAIN.
* Closes: #36882
Authored-by: Gang Wu <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/column_writer.cc | 6 +++
cpp/src/parquet/column_writer_test.cc | 53 +++++++++++++++++++--------
cpp/src/parquet/properties.h | 2 +-
python/pyarrow/tests/parquet/test_metadata.py | 2 +-
4 files changed, 46 insertions(+), 17 deletions(-)
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 3fca5542a0..ae9216ba7c 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -2332,6 +2332,12 @@ std::shared_ptr<ColumnWriter>
ColumnWriter::Make(ColumnChunkMetaDataBuilder* met
const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
descr->physical_type() != Type::BOOLEAN;
Encoding::type encoding = properties->encoding(descr->path());
+ if (encoding == Encoding::UNKNOWN) {
+ encoding = (descr->physical_type() == Type::BOOLEAN &&
+ properties->version() != ParquetVersion::PARQUET_1_0)
+ ? Encoding::RLE
+ : Encoding::PLAIN;
+ }
if (use_dictionary) {
encoding = properties->dictionary_index_encoding();
}
diff --git a/cpp/src/parquet/column_writer_test.cc
b/cpp/src/parquet/column_writer_test.cc
index 58199c402b..b2fd9cf930 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -203,8 +203,15 @@ class TestPrimitiveWriter : public
PrimitiveTypedTest<TestType> {
if (this->type_num() == Type::BOOLEAN) {
// Dictionary encoding is not allowed for boolean type
- // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case
- std::set<Encoding::type> expected({Encoding::PLAIN, Encoding::RLE});
+ std::set<Encoding::type> expected;
+ if (version == ParquetVersion::PARQUET_1_0) {
+ // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding
case for
+ // version 1.0. Note that RLE is used for DL/RL.
+ expected = {Encoding::PLAIN, Encoding::RLE};
+ } else {
+ // There is only 1 encoding (RLE) in a fallback case for version 2.0
+ expected = {Encoding::RLE};
+ }
ASSERT_EQ(encodings, expected);
} else if (version == ParquetVersion::PARQUET_1_0) {
// There are 3 encodings (PLAIN_DICTIONARY, PLAIN, RLE) in a fallback
case
@@ -223,7 +230,8 @@ class TestPrimitiveWriter : public
PrimitiveTypedTest<TestType> {
std::vector<parquet::PageEncodingStats> encoding_stats =
this->metadata_encoding_stats();
if (this->type_num() == Type::BOOLEAN) {
- ASSERT_EQ(encoding_stats[0].encoding, Encoding::PLAIN);
+ ASSERT_EQ(encoding_stats[0].encoding,
+ version == ParquetVersion::PARQUET_1_0 ? Encoding::PLAIN :
Encoding::RLE);
ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE);
} else if (version == ParquetVersion::PARQUET_1_0) {
std::vector<Encoding::type> expected(
@@ -757,21 +765,36 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk)
{
ASSERT_EQ(0, this->values_read_);
}
+class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
+ public:
+ void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding)
{
+ this->SetUpSchema(Repetition::REQUIRED);
+ auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version,
+ /*enable_checksum*/ false);
+ for (int i = 0; i < SMALL_SIZE; i++) {
+ bool value = (i % 2 == 0) ? true : false;
+ writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+ writer->Close();
+ this->ReadColumn();
+ for (int i = 0; i < SMALL_SIZE; i++) {
+ ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i;
+ }
+ const auto& encodings = this->metadata_encodings();
+ auto iter = std::find(encodings.begin(), encodings.end(), encoding);
+ ASSERT_TRUE(iter != encodings.end());
+ }
+};
+
// PARQUET-764
// Correct bitpacking for boolean write at non-byte boundaries
-using TestBooleanValuesWriter = TestPrimitiveWriter<BooleanType>;
TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) {
- this->SetUpSchema(Repetition::REQUIRED);
- auto writer = this->BuildWriter();
- for (int i = 0; i < SMALL_SIZE; i++) {
- bool value = (i % 2 == 0) ? true : false;
- writer->WriteBatch(1, nullptr, nullptr, &value);
- }
- writer->Close();
- this->ReadColumn();
- for (int i = 0; i < SMALL_SIZE; i++) {
- ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i;
- }
+ TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN);
+}
+
+// Default encoding for boolean is RLE when using V2 pages
+TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) {
+ TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE);
}
// PARQUET-979
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index bd7eb9dc7a..bdc5b15332 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -135,7 +135,7 @@ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
-static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
+static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE =
Compression::UNCOMPRESSED;
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
diff --git a/python/pyarrow/tests/parquet/test_metadata.py
b/python/pyarrow/tests/parquet/test_metadata.py
index 3efaf1dbf5..82182b0449 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -128,7 +128,7 @@ def test_parquet_metadata_api():
assert col_meta.is_stats_set is True
assert isinstance(col_meta.statistics, pq.Statistics)
assert col_meta.compression == 'SNAPPY'
- assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
+ assert set(col_meta.encodings) == {'RLE'}
assert col_meta.has_dictionary_page is False
assert col_meta.dictionary_page_offset is None
assert col_meta.data_page_offset > 0