This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch maint-0.14.x in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 6be1f3da30063a439c7b87e08f7a56febf3e1e70 Author: Hatem Helal <[email protected]> AuthorDate: Thu Jul 11 21:23:18 2019 -0400 PARQUET-1623: [C++] Fix invalid memory access encountered when reading some parquet files There are a number of conditions that are needed to observe this invalid memory access when reading from a parquet file: * Must have a number of records equal to a power of two * The column must be serialized across multiple data pages Author: Hatem Helal <[email protected]> Closes #4857 from hatemhelal/parquet-1623 and squashes the following commits: 5cf75b5b4 <Hatem Helal> Add simplified unittest d3c80073e <Hatem Helal> Initialize BitmapWriter with num_def_levels to prevent invalid memory access --- cpp/src/parquet/column_reader-test.cc | 22 ++++++++++++++++++++++ cpp/src/parquet/column_reader.h | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_reader-test.cc b/cpp/src/parquet/column_reader-test.cc index 46d099e..b6b928c 100644 --- a/cpp/src/parquet/column_reader-test.cc +++ b/cpp/src/parquet/column_reader-test.cc @@ -410,5 +410,27 @@ TEST(TestColumnReader, DefinitionLevelsToBitmap) { ASSERT_EQ(current_byte, valid_bits[1]); } +TEST(TestColumnReader, DefinitionLevelsToBitmapPowerOfTwo) { + // PARQUET-1623: Invalid memory access when decoding a valid bits vector that has a + // length equal to a power of two and also using a non-zero valid_bits_offset. This + // should not fail when run with ASAN or valgrind. + std::vector<int16_t> def_levels = {3, 3, 3, 2, 3, 3, 3, 3}; + std::vector<int16_t> rep_levels = {0, 1, 1, 1, 1, 1, 1, 1}; + std::vector<uint8_t> valid_bits(1, 0); + + const int max_def_level = 3; + const int max_rep_level = 1; + + int64_t values_read = -1; + int64_t null_count = 0; + + // Read the latter half of the validity bitmap + internal::DefinitionLevelsToBitmap(def_levels.data() + 4, 4, max_def_level, + max_rep_level, &values_read, &null_count, + valid_bits.data(), 4 /* valid_bits_offset */); + ASSERT_EQ(4, values_read); + ASSERT_EQ(0, null_count); +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index e7d6afb..15987c5 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -186,7 +186,7 @@ static inline void DefinitionLevelsToBitmap( // We assume here that valid_bits is large enough to accommodate the // additional definition levels and the ones that have already been written ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, - valid_bits_offset + num_def_levels); + num_def_levels); // TODO(itaiin): As an interim solution we are splitting the code path here // between repeated+flat column reads, and non-repeated+nested reads.
