[ https://issues.apache.org/jira/browse/PARQUET-1366?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16565841#comment-16565841 ]
ASF GitHub Bot commented on PARQUET-1366: ----------------------------------------- wesm closed pull request #483: PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs URL: https://github.com/apache/parquet-cpp/pull/483 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h index bfc78c87..2babacb8 100644 --- a/src/parquet/arrow/test-util.h +++ b/src/parquet/arrow/test-util.h @@ -368,7 +368,7 @@ Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size, int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data()); auto null_bitmap = AllocateBuffer(); - int64_t bitmap_size = ::arrow::BitUtil::CeilByte(size) / 8; + int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size); RETURN_NOT_OK(null_bitmap->Resize(bitmap_size)); uint8_t* null_bitmap_ptr = null_bitmap->mutable_data(); memset(null_bitmap_ptr, 0, bitmap_size); diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index bc3ee8aa..28d0dcb6 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -60,7 +60,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level, } case Encoding::BIT_PACKED: { num_bytes = - static_cast<int32_t>(BitUtil::Ceil(num_buffered_values * bit_width_, 8)); + static_cast<int32_t>(BitUtil::BytesForBits(num_buffered_values * bit_width_)); if (!bit_packed_decoder_) { bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes)); } else { diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index aac582a2..6c0794a1 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -137,7 +137,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> { 
bool enable_dictionary, bool enable_statistics, int64_t num_rows) { std::vector<uint8_t> valid_bits( - BitUtil::RoundUpNumBytes(static_cast<uint32_t>(this->values_.size())) + 1, 255); + BitUtil::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 1, 255); ColumnProperties column_properties(encoding, compression, enable_dictionary, enable_statistics); std::shared_ptr<TypedColumnWriter<TestType>> writer = diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index 7d47d3f6..48fba555 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -50,7 +50,7 @@ void LevelEncoder::Init(Encoding::type encoding, int16_t max_level, } case Encoding::BIT_PACKED: { int num_bytes = - static_cast<int>(BitUtil::Ceil(num_buffered_values * bit_width_, 8)); + static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width_)); bit_packed_encoder_.reset(new BitWriter(data, num_bytes)); break; } @@ -72,7 +72,8 @@ int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level, break; } case Encoding::BIT_PACKED: { - num_bytes = static_cast<int>(BitUtil::Ceil(num_buffered_values * bit_width, 8)); + num_bytes = + static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width)); break; } default: diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h index 98f9e4a8..2dfb9ff3 100644 --- a/src/parquet/encoding-internal.h +++ b/src/parquet/encoding-internal.h @@ -151,12 +151,17 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> { int Decode(uint8_t* buffer, int max_values) { max_values = std::min(max_values, num_values_); bool val; + ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); for (int i = 0; i < max_values; ++i) { if (!bit_reader_.GetValue(1, &val)) { ParquetException::EofException(); } - BitUtil::SetArrayBit(buffer, i, val); + if (val) { + bit_writer.Set(); + } + bit_writer.Next(); } + bit_writer.Finish(); num_values_ -= max_values; return max_values; } diff 
--git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc index 60285ab2..50e1394c 100644 --- a/src/parquet/encoding-test.cc +++ b/src/parquet/encoding-test.cc @@ -43,7 +43,7 @@ namespace test { TEST(VectorBooleanTest, TestEncodeDecode) { // PARQUET-454 int nvalues = 10000; - int nbytes = static_cast<int>(BitUtil::Ceil(nvalues, 8)); + int nbytes = static_cast<int>(BitUtil::BytesForBits(nvalues)); // seed the prng so failure is deterministic vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0); @@ -252,7 +252,7 @@ class TestDictionaryEncoding : public TestEncodingBase<Type> { static constexpr int TYPE = Type::type_num; void CheckRoundtrip() { - std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(num_values_) + 1, 255); + std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(num_values_) + 1, 255); DictEncoder<Type> encoder(descr_.get(), &pool_); ASSERT_NO_THROW(encoder.Put(draws_, num_values_)); diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc index 943d5ccf..d2ecede8 100644 --- a/src/parquet/statistics-test.cc +++ b/src/parquet/statistics-test.cc @@ -72,7 +72,7 @@ class TestRowGroupStatistics : public PrimitiveTypedTest<TestType> { TypedStats statistics3(this->schema_.Column(0)); std::vector<uint8_t> valid_bits( - BitUtil::RoundUpNumBytes(static_cast<uint32_t>(this->values_.size())) + 1, 255); + BitUtil::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 1, 255); statistics3.UpdateSpaced(this->values_ptr_, valid_bits.data(), 0, this->values_.size(), 0); std::string encoded_min_spaced = statistics3.EncodeMin(); @@ -722,7 +722,7 @@ TEST(TestStatisticsFloatNaN, NaNValuesSpaced) { for (int i = 0; i < NUM_VALUES; i++) { nan_values[i] = std::nanf(""); } - std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(NUM_VALUES) + 1, 255); + std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(NUM_VALUES) + 1, 255); // Test values TypedRowGroupStatistics<FloatType> nan_stats(&descr); 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

> [C++] Streamline use of Arrow bit-util.h
> ----------------------------------------
>
> Key: PARQUET-1366
> URL: https://issues.apache.org/jira/browse/PARQUET-1366
> Project: Parquet
> Issue Type: Task
> Components: parquet-cpp
> Reporter: Antoine Pitrou
> Assignee: Antoine Pitrou
> Priority: Minor
> Labels: pull-request-available
> Fix For: cpp-1.5.0
>
>
> Required for ARROW-2950: stop using certain bit-util APIs that will be
> removed.

--
This message was sent by Atlassian JIRA (v7.6.3#76005)