Repository: parquet-cpp Updated Branches: refs/heads/master 05409546f -> 708507f3e
PARQUET-708: account for "worst case scenario" in MaxBufferSize for bit_width > 1 Author: Uwe L. Korn <[email protected]> Closes #155 from xhochy/parquet-708 and squashes the following commits: a537717 [Uwe L. Korn] Test for more bit widths f346435 [Uwe L. Korn] PARQUET-708: account for "worst case scenario" in MaxBufferSize for bit_width > 1 Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/708507f3 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/708507f3 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/708507f3 Branch: refs/heads/master Commit: 708507f3e1dd69d097d674c31b0105e716c199e1 Parents: 0540954 Author: Uwe L. Korn <[email protected]> Authored: Tue Sep 6 16:20:54 2016 -0400 Committer: Wes McKinney <[email protected]> Committed: Tue Sep 6 16:20:54 2016 -0400 ---------------------------------------------------------------------- src/parquet/column/levels-test.cc | 32 ++++++++++++++++++++++++++++++++ src/parquet/util/rle-encoding.h | 8 ++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/708507f3/src/parquet/column/levels-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/levels-test.cc b/src/parquet/column/levels-test.cc index cb7cc38..126873c 100644 --- a/src/parquet/column/levels-test.cc +++ b/src/parquet/column/levels-test.cc @@ -206,4 +206,36 @@ TEST(TestLevelEncoder, MinimumBufferSize) { ASSERT_EQ(kNumToEncode, encode_count); } +TEST(TestLevelEncoder, MinimumBufferSize2) { + // PARQUET-708 + // Test the worst case for bit_width=2 consisting of + // LiteralRun(size=8) + // RepeatedRun(size=8) + // LiteralRun(size=8) + // ... + const int kNumToEncode = 1024; + + std::vector<int16_t> levels; + for (int i = 0; i < kNumToEncode; ++i) { + // This forces a literal run of 00000001 + // followed by eight 1s + if ((i % 16) < 7) { + levels.push_back(0); + } else { + levels.push_back(1); + } + } + + for (int bit_width = 1; bit_width <= 8; bit_width++) { + std::vector<uint8_t> output( + LevelEncoder::MaxBufferSize(Encoding::RLE, bit_width, kNumToEncode)); + + LevelEncoder encoder; + encoder.Init(Encoding::RLE, bit_width, kNumToEncode, output.data(), output.size()); + int encode_count = encoder.Encode(kNumToEncode, levels.data()); + + ASSERT_EQ(kNumToEncode, encode_count); + } +} + } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/708507f3/src/parquet/util/rle-encoding.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h index 15fd550..8fa1c41 100644 --- a/src/parquet/util/rle-encoding.h +++ b/src/parquet/util/rle-encoding.h @@ -171,8 +171,12 @@ class RleEncoder { /// Returns the maximum byte size it could take to encode 'num_values'. static int MaxBufferSize(int bit_width, int num_values) { - int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0); - int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN); + // For a bit_width > 1, the worst case is the repetition of "literal run of length 8 + // and then a repeated run of length 8". + // 8 values per smallest run, 8 bits per byte + // int bytes_per_run = BitUtil::Ceil(bit_width * 8, 8); + int bytes_per_run = bit_width; + int num_runs = BitUtil::Ceil(num_values, 8); int literal_max_size = num_runs + num_runs * bytes_per_run; // In the very worst case scenario, the data is a concatenation of repeated
