Repository: parquet-cpp Updated Branches: refs/heads/master 08088af76 -> ee83fad67
PARQUET-503: Reenable parquet 2.0 encoding implementations. Author: Nong Li <[email protected]> Closes #35 from nongli/parquet-503 and squashes the following commits: cb2a4e1 [Nong Li] PARQUET-503: Reenable parquet 2.0 encoding implementations. Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/ee83fad6 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/ee83fad6 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/ee83fad6 Branch: refs/heads/master Commit: ee83fad67d07977b6493dc1e7b0dde63d58b9bf8 Parents: 08088af Author: Nong Li <[email protected]> Authored: Tue Feb 2 14:50:00 2016 -0800 Committer: Julien Le Dem <[email protected]> Committed: Tue Feb 2 14:50:00 2016 -0800 ---------------------------------------------------------------------- .gitignore | 1 + example/CMakeLists.txt | 5 ++-- src/parquet/encodings/delta-bit-pack-encoding.h | 10 ++++---- src/parquet/encodings/encodings.h | 8 +++--- src/parquet/util/bit-stream-utils.h | 12 +++++---- src/parquet/util/bit-stream-utils.inline.h | 26 +++++++++----------- src/parquet/util/bit-util-test.cc | 19 ++++++++++++++ 7 files changed, 49 insertions(+), 32 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 172a03a..f90103a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ bin build generated +Testing/ CMakeCache.txt CMakeFiles http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/example/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 730b408..bd9e66c 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -20,9 +20,8 @@ SET(LINK_LIBS snappystatic thriftstatic) -# Disabled because decoding code has changed -# add_executable(decode_benchmark decode_benchmark.cc) -# target_link_libraries(decode_benchmark ${LINK_LIBS}) +add_executable(decode_benchmark decode_benchmark.cc) +target_link_libraries(decode_benchmark ${LINK_LIBS}) add_executable(parquet_reader parquet_reader.cc) target_link_libraries(parquet_reader ${LINK_LIBS}) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/encodings/delta-bit-pack-encoding.h ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/delta-bit-pack-encoding.h b/src/parquet/encodings/delta-bit-pack-encoding.h index a0833b5..858fcec 100644 --- a/src/parquet/encodings/delta-bit-pack-encoding.h +++ b/src/parquet/encodings/delta-bit-pack-encoding.h @@ -54,7 +54,7 @@ class DeltaBitPackDecoder : public Decoder<TYPE> { using Decoder<TYPE>::num_values_; void InitBlock() { - uint64_t block_size; + int32_t block_size; if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException(); if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException(); if (!decoder_.GetVlqInt(&values_current_block_)) { @@ -104,17 +104,17 @@ class DeltaBitPackDecoder : public Decoder<TYPE> { } BitReader decoder_; - uint64_t values_current_block_; - uint64_t num_mini_blocks_; + int32_t values_current_block_; + int32_t num_mini_blocks_; uint64_t values_per_mini_block_; uint64_t values_current_mini_block_; - int64_t min_delta_; + int32_t min_delta_; int mini_block_idx_; std::vector<uint8_t> delta_bit_widths_; int delta_bit_width_; - int64_t last_value_; + int32_t last_value_; }; } // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/encodings/encodings.h ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/encodings.h b/src/parquet/encodings/encodings.h index 4fb3d9a..0d9202e 100644 --- a/src/parquet/encodings/encodings.h +++ b/src/parquet/encodings/encodings.h @@ -105,10 +105,8 @@ class Encoder { #include "parquet/encodings/plain-encoding.h" #include "parquet/encodings/dictionary-encoding.h" - -// The encoding tools changed and these are missing the ZigZag functions -// #include "parquet/encodings/delta-bit-pack-encoding.h" -// #include "parquet/encodings/delta-length-byte-array-encoding.h" -// #include "parquet/encodings/delta-byte-array-encoding.h" +#include "parquet/encodings/delta-bit-pack-encoding.h" +#include "parquet/encodings/delta-length-byte-array-encoding.h" +#include "parquet/encodings/delta-byte-array-encoding.h" #endif // PARQUET_ENCODINGS_ENCODINGS_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-stream-utils.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/bit-stream-utils.h b/src/parquet/util/bit-stream-utils.h index a02839d..3e8f95c 100644 --- a/src/parquet/util/bit-stream-utils.h +++ b/src/parquet/util/bit-stream-utils.h @@ -69,7 +69,10 @@ class BitWriter { /// room. The value is written byte aligned. /// For more details on vlq: /// en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(int32_t v); + bool PutVlqInt(uint32_t v); + + // Writes an int zigzag encoded. + bool PutZigZagVlqInt(int32_t v); /// Get a pointer to the next aligned byte and advance the underlying buffer /// by num_bytes. @@ -135,6 +138,9 @@ class BitReader { /// the buffer. bool GetVlqInt(int32_t* v); + // Reads a zigzag encoded int `into` v. + bool GetZigZagVlqInt(int32_t* v); + /// Returns the number of bytes left in the stream, not including the current /// byte (i.e., there may be an additional fraction of a byte). int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } @@ -142,10 +148,6 @@ class BitReader { /// Maximum byte length of a vlq encoded int static const int MAX_VLQ_BYTE_LEN = 5; - // TODO(nongli): implementations to be fixed given changes in Impala - // bool GetZigZagVlqInt(int64_t* v); - // bool PutZigZagVlqInt(int32_t v); - private: const uint8_t* buffer_; int max_bytes_; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-stream-utils.inline.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/bit-stream-utils.inline.h b/src/parquet/util/bit-stream-utils.inline.h index 77e2d48..e0dcab8 100644 --- a/src/parquet/util/bit-stream-utils.inline.h +++ b/src/parquet/util/bit-stream-utils.inline.h @@ -75,7 +75,7 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) { return true; } -inline bool BitWriter::PutVlqInt(int32_t v) { +inline bool BitWriter::PutVlqInt(uint32_t v) { bool result = true; while ((v & 0xFFFFFF80) != 0L) { result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1); @@ -152,20 +152,18 @@ inline bool BitReader::GetVlqInt(int32_t* v) { return true; } -// TODO(nongli): review/test these implementations given divergence in Impala -// functions - -// inline bool BitWriter::PutZigZagVlqInt(int32_t v) { -// uint32_t u = (v << 1) ^ (v >> 31); -// return PutVlqInt(u); -// } +inline bool BitWriter::PutZigZagVlqInt(int32_t v) { + uint32_t u = (v << 1) ^ (v >> 31); + return PutVlqInt(u); +} -// inline bool BitReader::GetZigZagVlqInt(int64_t* v) { -// uint64_t u; -// if (!GetVlqInt(&u)) return false; -// *reinterpret_cast<uint64_t*>(v) = (u >> 1) ^ -(u & 1); -// return true; -// } +inline bool BitReader::GetZigZagVlqInt(int32_t* v) { + int32_t u_signed; + if (!GetVlqInt(&u_signed)) return false; + uint32_t u = static_cast<uint32_t>(u_signed); + *reinterpret_cast<uint32_t*>(v) = (u >> 1) ^ -(u & 1); + return true; +} } // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ee83fad6/src/parquet/util/bit-util-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/util/bit-util-test.cc b/src/parquet/util/bit-util-test.cc index 78efe1a..a8b6be0 100644 --- a/src/parquet/util/bit-util-test.cc +++ b/src/parquet/util/bit-util-test.cc @@ -26,6 +26,7 @@ #include <gtest/gtest.h> #include "parquet/util/bit-util.h" +#include "parquet/util/bit-stream-utils.inline.h" #include "parquet/util/cpu-info.h" namespace parquet_cpp { @@ -161,4 +162,22 @@ TEST(BitUtil, RoundUpDown) { EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1); } +void TestZigZag(int32_t v) { + uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN]; + BitWriter writer(buffer, sizeof(buffer)); + BitReader reader(buffer, sizeof(buffer)); + writer.PutZigZagVlqInt(v); + int32_t result; + EXPECT_TRUE(reader.GetZigZagVlqInt(&result)); + EXPECT_EQ(v, result); +} + +TEST(BitStreamUtil, ZigZag) { + TestZigZag(0); + TestZigZag(1); + TestZigZag(-1); + TestZigZag(std::numeric_limits<int32_t>::max()); + TestZigZag(-std::numeric_limits<int32_t>::max()); +} + } // namespace parquet_cpp
