Repository: parquet-cpp Updated Branches: refs/heads/master ee83fad67 -> c0eec9a59
PARQUET-454: Fix inconsistencies with boolean PLAIN encoding Requires PARQUET-485 (#32) The boolean Encoding::PLAIN code path was using RleDecoder, inconsistent with other implementations of Parquet. This patch adds an implementation of plain encoding and uses BitReader instead of RleDecoder to decode plain-encoded boolean data. Unit tests to verify. Also closes PR #12. Thanks to @edani for reporting. Author: Wes McKinney <[email protected]> Closes #34 from wesm/PARQUET-454 and squashes the following commits: 01cb5a7 [Wes McKinney] Use a seed in the data generation 0bf5d8a [Wes McKinney] Fix inconsistencies with boolean PLAIN encoding. Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c0eec9a5 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c0eec9a5 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c0eec9a5 Branch: refs/heads/master Commit: c0eec9a593420a2f40786271fa813d44b85e7198 Parents: ee83fad Author: Wes McKinney <[email protected]> Authored: Tue Feb 2 15:00:45 2016 -0800 Committer: Julien Le Dem <[email protected]> Committed: Tue Feb 2 15:00:45 2016 -0800 ---------------------------------------------------------------------- src/parquet/encodings/CMakeLists.txt | 2 + src/parquet/encodings/encodings.h | 3 +- src/parquet/encodings/plain-encoding-test.cc | 65 +++++++++++++++++++++++ src/parquet/encodings/plain-encoding.h | 51 ++++++++++++++---- src/parquet/util/bit-util.h | 8 +++ src/parquet/util/test-common.h | 27 ++++++++++ 6 files changed, 144 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/encodings/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/CMakeLists.txt b/src/parquet/encodings/CMakeLists.txt index 69ef1f3..638fba0 100644 --- a/src/parquet/encodings/CMakeLists.txt +++ b/src/parquet/encodings/CMakeLists.txt @@ -24,3 +24,5 @@ install(FILES dictionary-encoding.h plain-encoding.h DESTINATION include/parquet/encodings) + +ADD_PARQUET_TEST(plain-encoding-test) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/encodings/encodings.h ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/encodings.h b/src/parquet/encodings/encodings.h index 0d9202e..c427ff4 100644 --- a/src/parquet/encodings/encodings.h +++ b/src/parquet/encodings/encodings.h @@ -20,10 +20,9 @@ #include <cstdint> +#include "parquet/exception.h" #include "parquet/types.h" -#include "parquet/thrift/parquet_constants.h" -#include "parquet/thrift/parquet_types.h" #include "parquet/util/rle-encoding.h" #include "parquet/util/bit-stream-utils.inline.h" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/encodings/plain-encoding-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/plain-encoding-test.cc b/src/parquet/encodings/plain-encoding-test.cc new file mode 100644 index 0000000..ca425dd --- /dev/null +++ b/src/parquet/encodings/plain-encoding-test.cc @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <cstdint> +#include <string> +#include <vector> + +#include <gtest/gtest.h> +#include "parquet/util/test-common.h" + +#include "parquet/encodings/encodings.h" + +using std::string; +using std::vector; + +namespace parquet_cpp { + +namespace test { + +TEST(BooleanTest, TestEncodeDecode) { + // PARQUET-454 + + size_t nvalues = 100; + size_t nbytes = BitUtil::RoundUp(nvalues, 8) / 8; + + // seed the prng so failure is deterministic + vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0); + + PlainEncoder<Type::BOOLEAN> encoder(nullptr); + PlainDecoder<Type::BOOLEAN> decoder(nullptr); + + std::vector<uint8_t> encode_buffer(nbytes); + + size_t encoded_bytes = encoder.Encode(draws, nvalues, &encode_buffer[0]); + ASSERT_EQ(nbytes, encoded_bytes); + + std::vector<uint8_t> decode_buffer(nbytes); + const uint8_t* decode_data = &decode_buffer[0]; + + decoder.SetData(nvalues, &encode_buffer[0], encoded_bytes); + size_t values_decoded = decoder.Decode(&decode_buffer[0], nvalues); + ASSERT_EQ(nvalues, values_decoded); + + for (size_t i = 0; i < nvalues; ++i) { + ASSERT_EQ(BitUtil::GetArrayBit(decode_data, i), draws[i]) << i; + } +} + +} // namespace test + +} // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/encodings/plain-encoding.h ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/plain-encoding.h b/src/parquet/encodings/plain-encoding.h index 11e70c7..cc37776 100644 --- a/src/parquet/encodings/plain-encoding.h +++ b/src/parquet/encodings/plain-encoding.h @@ -21,6 +21,7 @@ #include "parquet/encodings/encodings.h" #include <algorithm> +#include <vector> using parquet::Type; @@ -103,19 +104,38 @@ class PlainDecoder<Type::BOOLEAN> : public Decoder<Type::BOOLEAN> { virtual void SetData(int num_values, const uint8_t* data, int len) { num_values_ = num_values; - decoder_ = RleDecoder(data, len, 1); + bit_reader_ = BitReader(data, len); + } + + // Two flavors of bool decoding + int Decode(uint8_t* buffer, int max_values) { + max_values = std::min(max_values, num_values_); + bool val; + for (int i = 0; i < max_values; ++i) { + if (!bit_reader_.GetValue(1, &val)) { + ParquetException::EofException(); + } + BitUtil::SetArrayBit(buffer, i, val); + } + num_values_ -= max_values; + return max_values; } virtual int Decode(bool* buffer, int max_values) { max_values = std::min(max_values, num_values_); + bool val; for (int i = 0; i < max_values; ++i) { - if (!decoder_.Get(&buffer[i])) ParquetException::EofException(); + if (!bit_reader_.GetValue(1, &val)) { + ParquetException::EofException(); + } + buffer[i] = val; } num_values_ -= max_values; return max_values; } + private: - RleDecoder decoder_; + BitReader bit_reader_; }; // ---------------------------------------------------------------------- @@ -132,6 +152,24 @@ class PlainEncoder : public Encoder<TYPE> { virtual size_t Encode(const T* src, int num_values, uint8_t* dst); }; +template <> +class PlainEncoder<Type::BOOLEAN> : public Encoder<Type::BOOLEAN> { + public: + explicit PlainEncoder(const parquet::SchemaElement* schema) : + Encoder<Type::BOOLEAN>(schema, parquet::Encoding::PLAIN) {} + + virtual size_t Encode(const std::vector<bool>& src, int num_values, + uint8_t* dst) { + size_t bytes_required = BitUtil::RoundUp(num_values, 8) / 8; + BitWriter bit_writer(dst, bytes_required); + for (size_t i = 0; i < num_values; ++i) { + bit_writer.PutValue(src[i], 1); + } + bit_writer.Flush(); + return bit_writer.bytes_written(); + } +}; + template <int TYPE> inline size_t PlainEncoder<TYPE>::Encode(const T* buffer, int num_values, uint8_t* dst) { @@ -141,13 +179,6 @@ inline size_t PlainEncoder<TYPE>::Encode(const T* buffer, int num_values, } template <> -inline size_t PlainEncoder<Type::BOOLEAN>::Encode( - const bool* src, int num_values, uint8_t* dst) { - ParquetException::NYI("bool encoding"); - return 0; -} - -template <> inline size_t PlainEncoder<Type::BYTE_ARRAY>::Encode(const ByteArray* src, int num_values, uint8_t* dst) { ParquetException::NYI("byte array encoding"); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/util/bit-util.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/bit-util.h b/src/parquet/util/bit-util.h index 7a2e921..eac5346 100644 --- a/src/parquet/util/bit-util.h +++ b/src/parquet/util/bit-util.h @@ -270,6 +270,14 @@ class BitUtil { return v | (static_cast<T>(0x1) << bitpos); } + static inline bool GetArrayBit(const uint8_t* bits, size_t i) { + return bits[i / 8] & (1 << (i % 8)); + } + + static inline void SetArrayBit(uint8_t* bits, size_t i, bool is_set) { + bits[i / 8] |= (1 << (i % 8)) * is_set; + } + // Set a specific bit to 0 // Behavior when bitpos is negative is undefined template<typename T> http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/util/test-common.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/test-common.h b/src/parquet/util/test-common.h index 38bc32c..3cf82f5 100644 --- a/src/parquet/util/test-common.h +++ b/src/parquet/util/test-common.h @@ -19,6 +19,7 @@ #define PARQUET_UTIL_TEST_COMMON_H #include <iostream> +#include <random> #include <vector> using std::vector; @@ -46,6 +47,32 @@ static inline bool vector_equal(const vector<T>& left, const vector<T>& right) { return true; } +static inline vector<bool> flip_coins_seed(size_t n, double p, uint32_t seed) { + std::mt19937 gen(seed); + std::bernoulli_distribution d(p); + + vector<bool> draws; + for (size_t i = 0; i < n; ++i) { + draws.push_back(d(gen)); + } + return draws; +} + + +static inline vector<bool> flip_coins(size_t n, double p) { + std::random_device rd; + std::mt19937 gen(rd()); + + std::bernoulli_distribution d(p); + + vector<bool> draws; + for (size_t i = 0; i < n; ++i) { + draws.push_back(d(gen)); + } + return draws; +} + + } // namespace test } // namespace parquet_cpp
