Repository: parquet-cpp
Updated Branches:
  refs/heads/master 261072ca9 -> 5e524d146
PARQUET-700: Disable dictionary encoding for boolean columns Author: Uwe L. Korn <[email protected]> Closes #148 from xhochy/parquet-700 and squashes the following commits: d33a670 [Uwe L. Korn] Format fixes e8530ba [Uwe L. Korn] Also test writing booleans with Dictionary encoding 328b430 [Uwe L. Korn] Format fixes ab33f9b [Uwe L. Korn] PARQUET-700: Disable dictionary encoding for boolean columns Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/5e524d14 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/5e524d14 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/5e524d14 Branch: refs/heads/master Commit: 5e524d146c556b1f2ef6da6f8d9a6dbb6b8cea73 Parents: 261072c Author: Uwe L. Korn <[email protected]> Authored: Fri Sep 2 11:34:31 2016 -0400 Committer: Wes McKinney <[email protected]> Committed: Fri Sep 2 11:34:31 2016 -0400 ---------------------------------------------------------------------- src/parquet/column/column-writer-test.cc | 29 +++++++++++++-------------- src/parquet/column/writer.cc | 3 ++- 2 files changed, 16 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/5e524d14/src/parquet/column/column-writer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc index ab232ea..3806bd0 100644 --- a/src/parquet/column/column-writer-test.cc +++ b/src/parquet/column/column-writer-test.cc @@ -84,14 +84,22 @@ class TestPrimitiveWriter : public ::testing::Test { reader_.reset(new TypedColumnReader<TestType>(schema_.get(), std::move(page_reader))); } - std::unique_ptr<TypedColumnWriter<TestType>> BuildWriter( + std::shared_ptr<TypedColumnWriter<TestType>> BuildWriter( int64_t output_size = SMALL_SIZE, Encoding::type encoding = Encoding::PLAIN) 
{ sink_.reset(new InMemoryOutputStream()); std::unique_ptr<SerializedPageWriter> pager( new SerializedPageWriter(sink_.get(), Compression::UNCOMPRESSED, &metadata_)); - return std::unique_ptr<TypedColumnWriter<TestType>>( - new TypedColumnWriter<TestType>(schema_.get(), std::move(pager), output_size, - encoding, writer_properties_.get())); + WriterProperties::Builder wp_builder; + if (encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY) { + wp_builder.enable_dictionary(); + } else { + wp_builder.disable_dictionary(); + wp_builder.encoding(encoding); + } + writer_properties_ = wp_builder.build(); + std::shared_ptr<ColumnWriter> writer = ColumnWriter::Make( + schema_.get(), std::move(pager), output_size, writer_properties_.get()); + return std::static_pointer_cast<TypedColumnWriter<TestType>>(writer); } void SyncValuesOut(); @@ -106,7 +114,7 @@ class TestPrimitiveWriter : public ::testing::Test { this->GenerateData(SMALL_SIZE); // Test case 1: required and non-repeated, so no definition or repetition levels - std::unique_ptr<TypedColumnWriter<TestType>> writer = + std::shared_ptr<TypedColumnWriter<TestType>> writer = this->BuildWriter(SMALL_SIZE, encoding); writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); // The behaviour should be independent from the number of Close() calls @@ -191,20 +199,11 @@ typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType, TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes); -// Dictionary encoding for booleans is not supported. 
-typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType, - ByteArrayType, FLBAType> TestDictionaryTypes; - -template <typename T> -class TestPrimitiveDictionaryWriter : public TestPrimitiveWriter<T> {}; - -TYPED_TEST_CASE(TestPrimitiveDictionaryWriter, TestDictionaryTypes); - TYPED_TEST(TestPrimitiveWriter, RequiredPlain) { this->TestRequiredWithEncoding(Encoding::PLAIN); } -TYPED_TEST(TestPrimitiveDictionaryWriter, RequiredDictionary) { +TYPED_TEST(TestPrimitiveWriter, RequiredDictionary) { this->TestRequiredWithEncoding(Encoding::PLAIN_DICTIONARY); } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/5e524d14/src/parquet/column/writer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc index 1c376ad..da4b17c 100644 --- a/src/parquet/column/writer.cc +++ b/src/parquet/column/writer.cc @@ -200,7 +200,8 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(const ColumnDescriptor* descr, std::unique_ptr<PageWriter> pager, int64_t expected_rows, const WriterProperties* properties) { Encoding::type encoding = properties->encoding(descr->path()); - if (properties->dictionary_enabled(descr->path())) { + if (properties->dictionary_enabled(descr->path()) && + descr->physical_type() != Type::BOOLEAN) { encoding = properties->dictionary_page_encoding(); } switch (descr->physical_type()) {
