[jira] [Commented] (PARQUET-1366) [C++] Streamline use of Arrow bit-util.h

ASF GitHub Bot (JIRA) Wed, 01 Aug 2018 12:15:31 -0700


    [ 
https://issues.apache.org/jira/browse/PARQUET-1366?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16565841#comment-16565841
 ]


ASF GitHub Bot commented on PARQUET-1366:
-----------------------------------------

wesm closed pull request #483: PARQUET-1366: [C++] Streamline use of Arrow's 
bit-util.h APIs
URL: https://github.com/apache/parquet-cpp/pull/483
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff of a pull request from a fork once it
has been merged, the diff is reproduced below for the sake of provenance:

diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index bfc78c87..2babacb8 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -368,7 +368,7 @@ Status MakeListArray(const std::shared_ptr<Array>& values, 
int64_t size,
   int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
 
   auto null_bitmap = AllocateBuffer();
-  int64_t bitmap_size = ::arrow::BitUtil::CeilByte(size) / 8;
+  int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size);
   RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
   uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
   memset(null_bitmap_ptr, 0, bitmap_size);
diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc
index bc3ee8aa..28d0dcb6 100644
--- a/src/parquet/column_reader.cc
+++ b/src/parquet/column_reader.cc
@@ -60,7 +60,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t 
max_level,
     }
     case Encoding::BIT_PACKED: {
       num_bytes =
-          static_cast<int32_t>(BitUtil::Ceil(num_buffered_values * bit_width_, 
8));
+          static_cast<int32_t>(BitUtil::BytesForBits(num_buffered_values * 
bit_width_));
       if (!bit_packed_decoder_) {
         bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes));
       } else {
diff --git a/src/parquet/column_writer-test.cc 
b/src/parquet/column_writer-test.cc
index aac582a2..6c0794a1 100644
--- a/src/parquet/column_writer-test.cc
+++ b/src/parquet/column_writer-test.cc
@@ -137,7 +137,7 @@ class TestPrimitiveWriter : public 
PrimitiveTypedTest<TestType> {
                                        bool enable_dictionary, bool 
enable_statistics,
                                        int64_t num_rows) {
     std::vector<uint8_t> valid_bits(
-        BitUtil::RoundUpNumBytes(static_cast<uint32_t>(this->values_.size())) 
+ 1, 255);
+        BitUtil::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 
1, 255);
     ColumnProperties column_properties(encoding, compression, 
enable_dictionary,
                                        enable_statistics);
     std::shared_ptr<TypedColumnWriter<TestType>> writer =
diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc
index 7d47d3f6..48fba555 100644
--- a/src/parquet/column_writer.cc
+++ b/src/parquet/column_writer.cc
@@ -50,7 +50,7 @@ void LevelEncoder::Init(Encoding::type encoding, int16_t 
max_level,
     }
     case Encoding::BIT_PACKED: {
       int num_bytes =
-          static_cast<int>(BitUtil::Ceil(num_buffered_values * bit_width_, 8));
+          static_cast<int>(BitUtil::BytesForBits(num_buffered_values * 
bit_width_));
       bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
       break;
     }
@@ -72,7 +72,8 @@ int LevelEncoder::MaxBufferSize(Encoding::type encoding, 
int16_t max_level,
       break;
     }
     case Encoding::BIT_PACKED: {
-      num_bytes = static_cast<int>(BitUtil::Ceil(num_buffered_values * 
bit_width, 8));
+      num_bytes =
+          static_cast<int>(BitUtil::BytesForBits(num_buffered_values * 
bit_width));
       break;
     }
     default:
diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h
index 98f9e4a8..2dfb9ff3 100644
--- a/src/parquet/encoding-internal.h
+++ b/src/parquet/encoding-internal.h
@@ -151,12 +151,17 @@ class PlainDecoder<BooleanType> : public 
Decoder<BooleanType> {
   int Decode(uint8_t* buffer, int max_values) {
     max_values = std::min(max_values, num_values_);
     bool val;
+    ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
     for (int i = 0; i < max_values; ++i) {
       if (!bit_reader_.GetValue(1, &val)) {
         ParquetException::EofException();
       }
-      BitUtil::SetArrayBit(buffer, i, val);
+      if (val) {
+        bit_writer.Set();
+      }
+      bit_writer.Next();
     }
+    bit_writer.Finish();
     num_values_ -= max_values;
     return max_values;
   }
diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc
index 60285ab2..50e1394c 100644
--- a/src/parquet/encoding-test.cc
+++ b/src/parquet/encoding-test.cc
@@ -43,7 +43,7 @@ namespace test {
 TEST(VectorBooleanTest, TestEncodeDecode) {
   // PARQUET-454
   int nvalues = 10000;
-  int nbytes = static_cast<int>(BitUtil::Ceil(nvalues, 8));
+  int nbytes = static_cast<int>(BitUtil::BytesForBits(nvalues));
 
   // seed the prng so failure is deterministic
   vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0);
@@ -252,7 +252,7 @@ class TestDictionaryEncoding : public 
TestEncodingBase<Type> {
   static constexpr int TYPE = Type::type_num;
 
   void CheckRoundtrip() {
-    std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(num_values_) + 1, 
255);
+    std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(num_values_) + 1, 
255);
     DictEncoder<Type> encoder(descr_.get(), &pool_);
 
     ASSERT_NO_THROW(encoder.Put(draws_, num_values_));
diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc
index 943d5ccf..d2ecede8 100644
--- a/src/parquet/statistics-test.cc
+++ b/src/parquet/statistics-test.cc
@@ -72,7 +72,7 @@ class TestRowGroupStatistics : public 
PrimitiveTypedTest<TestType> {
 
     TypedStats statistics3(this->schema_.Column(0));
     std::vector<uint8_t> valid_bits(
-        BitUtil::RoundUpNumBytes(static_cast<uint32_t>(this->values_.size())) 
+ 1, 255);
+        BitUtil::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 
1, 255);
     statistics3.UpdateSpaced(this->values_ptr_, valid_bits.data(), 0,
                              this->values_.size(), 0);
     std::string encoded_min_spaced = statistics3.EncodeMin();
@@ -722,7 +722,7 @@ TEST(TestStatisticsFloatNaN, NaNValuesSpaced) {
   for (int i = 0; i < NUM_VALUES; i++) {
     nan_values[i] = std::nanf("");
   }
-  std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(NUM_VALUES) + 1, 
255);
+  std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(NUM_VALUES) + 1, 255);
 
   // Test values
   TypedRowGroupStatistics<FloatType> nan_stats(&descr);


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Streamline use of Arrow bit-util.h
> ----------------------------------------
>
>                 Key: PARQUET-1366
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1366
>             Project: Parquet
>          Issue Type: Task
>          Components: parquet-cpp
>            Reporter: Antoine Pitrou
>            Assignee: Antoine Pitrou
>            Priority: Minor
>              Labels: pull-request-available
>             Fix For: cpp-1.5.0
>
>
> Required for ARROW-2950: stop using certain bit-util APIs that will be 
> removed.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (PARQUET-1366) [C++] Streamline use of Arrow bit-util.h

Reply via email to