pitrou commented on a change in pull request #6985:
URL: https://github.com/apache/arrow/pull/6985#discussion_r419602697
##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -610,6 +618,101 @@ class FirstTimeBitmapWriter {
}
}
+ /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
+ ///
+ /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits
are assumed
+ /// to be zero.
+ /// \param[in] number_of_bits The number of bits to append from word.
+ void AppendWord(uint64_t word, int64_t number_of_bits) {
+#if defined(ARROW_LITTLE_ENDIAN)
+ // Selection masks to retrieve all low order bits for each bytes.
+ constexpr uint64_t kLsbSelectionMasks[] = {
+ 0, // unused.
+ 0x0101010101010101,
+ 0x0303030303030303,
+ 0x0707070707070707,
+ 0x0F0F0F0F0F0F0F0F,
+ 0x1F1F1F1F1F1F1F1F,
+ 0x3F3F3F3F3F3F3F3F,
+ 0x7F7F7F7F7F7F7F7F,
+ };
+ if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
+ return;
+ }
+
+ // Location that the first byte needs to be written to.
+ uint8_t* append_position = bitmap_ + byte_offset_;
+
+ // Update state variables except for current_byte_ here.
+ position_ += number_of_bits;
+ int64_t bit_offset =
BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
+ bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8];
+ byte_offset_ += (bit_offset + number_of_bits) / 8;
+
+ if (bit_offset != 0) {
+ // We are in the middle of the byte. This code updates the byte and
shifts the word
+ // so trailing bits can be appended below.
+ int64_t bits_to_carry = 8 - bit_offset;
+ // Get the mask the will select the lower order bits (the ones to carry
+ // over to the existing byte and shift up.
+ const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry];
+ // Mask to select non-carried bits.
+ const uint64_t non_carry_mask = ~carry_mask;
+
+ // valid bits should be a valid bitmask so all trailing bytes hsould be
unset
+ // so no mask is need to start.
+ current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset);
+ if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
+ return;
+ }
+ *append_position = current_byte_;
+ append_position++;
+
+ // We illustrate logic with a 3-byte example in little endian/LSB order
+ // (N indicates not set bit Y indicates a set bit).
+ // Note this ordering is the reversed from HEX masks above with are
expressed
+ // big-endian/MSB and shifts right move the bits to the left (division).
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
22 23
+ // Assuming a bit-offset of 6 the non_carry_mask looks like:
+ // N N Y Y Y Y Y Y N N Y Y Y Y Y Y N N Y Y Y Y
Y Y
+ // So shifted_word should look like;
+ // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23
N N
+ // clang-format on
+ uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry;
+ // captured_carry:
+ // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N
N N
+ uint64_t captured_carry = carry_mask & word;
+ // mask_cary_bits:
+ // N N N N N N 8 9 N N N N N N 16 17 N N N N N N
N N
+ uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset;
+
+ word = shifted_word | mask_carry_bits;
+ number_of_bits -= bits_to_carry;
+ }
+
+ int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits);
+ std::memcpy(append_position, &word, bytes_for_word);
+ // At this point, we are guaranteed to have flushed the previous
current_byte_ state.
+ // So the new state is either the last relevant byte in 'word'
+ // or zero if we happen to be at a fresh byte.
+ current_byte_ =
+ bit_mask_ != 0x1 ? *(reinterpret_cast<uint8_t*>(&word) +
bytes_for_word - 1) : 0;
Review comment:
This is cryptic. Can you rewrite it in a more understandable way? For
example (untested):
```cpp
if (bit_mask_ == 0x1) {
current_byte_ = 0;
} else {
current_byte_ = *(append_position + bytes_for_word - 1);
}
```
##########
File path: cpp/src/arrow/util/bit_util_test.cc
##########
@@ -315,6 +318,115 @@ TEST(FirstTimeBitmapWriter, NormalOperation) {
}
}
+std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) {
+ return arrow::internal::Bitmap(bitmap, /*offset*/ 0,
/*length=*/bit_count).ToString();
+}
+
+std::string BitmapToString(const std::vector<uint8_t>& bitmap, int64_t
bit_count) {
+ return BitmapToString(bitmap.data(), bit_count);
+}
+
+TEST(FirstTimeBitmapWriter,
AppendWordOffsetOverwritesCorrectBitsOnExistingByte) {
+ auto check_append = [](const std::string& expected_bits, int64_t offset) {
+ std::vector<uint8_t> valid_bits = {0x00};
Review comment:
For each of the tests here it would be nice to have two variants or
iterations: one that starts with a bitmap full of 0s, and one with a bitmap
full of 1s.
##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -610,6 +618,103 @@ class FirstTimeBitmapWriter {
}
}
+ /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
+ ///
+ /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits
are assumed
+ /// to be unset (i.e. 0).
+ /// \param[in] number_of_bits The number of bits to append from word.
+ void AppendWord(uint64_t word, int64_t number_of_bits) {
+#if defined(ARROW_LITTLE_ENDIAN)
+ // Selection masks to retrieve all low order bits for each byte in word.
+ constexpr uint64_t kLsbSelectionMasks[] = {
+ 0, // unused.
+ 0x0101010101010101,
+ 0x0303030303030303,
+ 0x0707070707070707,
+ 0x0F0F0F0F0F0F0F0F,
+ 0x1F1F1F1F1F1F1F1F,
+ 0x3F3F3F3F3F3F3F3F,
+ 0x7F7F7F7F7F7F7F7F,
+ };
+ if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
+ return;
+ }
+
+ // Location that the first byte needs to be written to.
+ uint8_t* append_position = bitmap_ + byte_offset_;
+
+ // Update state variables except for current_byte_ here.
+ position_ += number_of_bits;
+ int64_t bit_offset =
BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
+ bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8];
+ byte_offset_ += (bit_offset + number_of_bits) / 8;
+
+ if (bit_offset != 0) {
+ // We are in the middle of the byte. This code updates the byte and
shifts
+ // bits appropriately within word so it can be memcpy'd below.
+ int64_t bits_to_carry = 8 - bit_offset;
+ // Get the mask that will select the least significant bit (the ones to
carry
+ // over to the current_byte_ and shift up).
+ const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry];
+ // Mask to select non-carried bits.
+ const uint64_t non_carry_mask = ~carry_mask;
+
+ // Carry over bits from word to current_byte_. We assume any extra bits
in word
+ // unset so no additional accounting is needed for when number_of_bits <
+ // bits_to_carry.
+ current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset);
+ // Check if everything is transfered into current_byte_.
+ if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
+ return;
+ }
+ *append_position = current_byte_;
+ append_position++;
+
+ // We illustrate the logic with a 3-byte example in little-endian/LSB
order
+ // ('N' indicates a not set bit, 'Y' indicates a set bit).
+ // Note this ordering is reversed from HEX masks above with are expressed
+ // big-endian/MSB and shifts right move the bits to the left.
+ // The original bit positions are:
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
22 23
+ // Assuming a bit-offset of 6 non_carry_mask is:
+ // N N Y Y Y Y Y Y N N Y Y Y Y Y Y N N Y Y Y Y
Y Y
+ // shifted_word is:
+ // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23
N N
+ uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry;
+ // captured_carry is:
+ // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N
N N
+ uint64_t captured_carry = carry_mask & word;
+ // mask_carry_bits is:
+ // N N N N N N 8 9 N N N N N N 16 17 N N N N N N
N N
+ uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset;
Review comment:
This is exactly `(captured_carry >> bits_to_carry)`, right? So in the
end we could just write `word = word >> bits_to_carry`?
(which means most of the above could be simplified away)
##########
File path: cpp/src/arrow/util/bit_util_test.cc
##########
@@ -315,6 +318,115 @@ TEST(FirstTimeBitmapWriter, NormalOperation) {
}
}
+std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) {
+ return arrow::internal::Bitmap(bitmap, /*offset*/ 0,
/*length=*/bit_count).ToString();
+}
+
+std::string BitmapToString(const std::vector<uint8_t>& bitmap, int64_t
bit_count) {
+ return BitmapToString(bitmap.data(), bit_count);
+}
+
+TEST(FirstTimeBitmapWriter,
AppendWordOffsetOverwritesCorrectBitsOnExistingByte) {
+ auto check_append = [](const std::string& expected_bits, int64_t offset) {
+ std::vector<uint8_t> valid_bits = {0x00};
+ constexpr int64_t kBitsAfterAppend = 8;
+ internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset,
+ /*length=*/(8 * valid_bits.size())
- offset);
+ writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/kBitsAfterAppend -
offset);
+ writer.Finish();
+ EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits);
+ };
+ check_append("11111111", 0);
+ check_append("01111111", 1);
+ check_append("00111111", 2);
+ check_append("00011111", 3);
+ check_append("00001111", 4);
+ check_append("00000111", 5);
+ check_append("00000011", 6);
+ check_append("00000001", 7);
+}
+
+TEST(FirstTimeBitmapWriter, AppendZeroBitsHasNoImpact) {
+ std::vector<uint8_t> valid_bits(/*count=*/1, 0);
+ internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1,
+ /*length=*/valid_bits.size() * 8);
+ writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/0);
+ writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/0);
+ writer.AppendWord(/*word=*/0x01, /*number_of_bits=*/1);
+ writer.Finish();
+ EXPECT_EQ(valid_bits[0], 0x2);
+}
+
+TEST(FirstTimeBitmapWriter, AppendLessThenByte) {
Review comment:
Typo: "LessThan"
##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -610,6 +618,103 @@ class FirstTimeBitmapWriter {
}
}
+ /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
+ ///
+ /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits
are assumed
+ /// to be unset (i.e. 0).
+ /// \param[in] number_of_bits The number of bits to append from word.
+ void AppendWord(uint64_t word, int64_t number_of_bits) {
+#if defined(ARROW_LITTLE_ENDIAN)
+ // Selection masks to retrieve all low order bits for each byte in word.
+ constexpr uint64_t kLsbSelectionMasks[] = {
+ 0, // unused.
+ 0x0101010101010101,
+ 0x0303030303030303,
+ 0x0707070707070707,
+ 0x0F0F0F0F0F0F0F0F,
+ 0x1F1F1F1F1F1F1F1F,
+ 0x3F3F3F3F3F3F3F3F,
+ 0x7F7F7F7F7F7F7F7F,
+ };
+ if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
+ return;
+ }
+
+ // Location that the first byte needs to be written to.
+ uint8_t* append_position = bitmap_ + byte_offset_;
+
+ // Update state variables except for current_byte_ here.
+ position_ += number_of_bits;
+ int64_t bit_offset =
BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
+ bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8];
+ byte_offset_ += (bit_offset + number_of_bits) / 8;
+
+ if (bit_offset != 0) {
+ // We are in the middle of the byte. This code updates the byte and
shifts
+ // bits appropriately within word so it can be memcpy'd below.
+ int64_t bits_to_carry = 8 - bit_offset;
+ // Get the mask that will select the least significant bit (the ones to
carry
+ // over to the current_byte_ and shift up).
+ const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry];
+ // Mask to select non-carried bits.
+ const uint64_t non_carry_mask = ~carry_mask;
+
+ // Carry over bits from word to current_byte_. We assume any extra bits
in word
+ // unset so no additional accounting is needed for when number_of_bits <
+ // bits_to_carry.
+ current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset);
+ // Check if everything is transfered into current_byte_.
+ if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
+ return;
+ }
+ *append_position = current_byte_;
+ append_position++;
+
+ // We illustrate the logic with a 3-byte example in little-endian/LSB
order
+ // ('N' indicates a not set bit, 'Y' indicates a set bit).
+ // Note this ordering is reversed from HEX masks above with are expressed
+ // big-endian/MSB and shifts right move the bits to the left.
+ // The original bit positions are:
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
22 23
+ // Assuming a bit-offset of 6 non_carry_mask is:
+ // N N Y Y Y Y Y Y N N Y Y Y Y Y Y N N Y Y Y Y
Y Y
+ // shifted_word is:
+ // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23
N N
+ uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry;
+ // captured_carry is:
+ // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N
N N
+ uint64_t captured_carry = carry_mask & word;
+ // mask_carry_bits is:
+ // N N N N N N 8 9 N N N N N N 16 17 N N N N N N
N N
+ uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset;
+
+ word = shifted_word | mask_carry_bits;
+ number_of_bits -= bits_to_carry;
+ }
+
+ int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits);
+ std::memcpy(append_position, &word, bytes_for_word);
+ // At this point, the previous current_byte_ has been written to bitmap_.
+ // The new current_byte_ is either the last relevant byte in 'word'
+ // or cleared if the new position is byte aligned (i.e. a fresh byte).
+ current_byte_ =
+ bit_mask_ != 0x1 ? *(reinterpret_cast<uint8_t*>(&word) +
bytes_for_word - 1) : 0;
+
+#else // big-endian
+ BitmapReader reader(reinterpret_cast<uint8_t*>(&word), 0,
/*length=*/number_of_bits);
Review comment:
Uh, will this actually work in big-endian? This is going to read the
most significant byte first... I think you need to shift the word logically,
like this:
```cpp
for (int x = 0; x < number_of_bits; x++) {
if (word & 0x1) {
Set();
} else {
Clear();
}
Next();
word >>= 1;
}
```
##########
File path: cpp/src/parquet/level_conversion.cc
##########
@@ -0,0 +1,170 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <limits>
+#if defined(ARROW_HAVE_BMI2)
+#include <x86intrin.h>
+#endif
+
+#include "arrow/util/bit_util.h"
+
+#include "parquet/exception.h"
+
+namespace parquet {
+namespace internal {
+namespace {
+inline void CheckLevelRange(const int16_t* levels, int64_t num_levels,
+ const int16_t max_expected_level) {
+ int16_t min_level = std::numeric_limits<int16_t>::max();
+ int16_t max_level = std::numeric_limits<int16_t>::min();
+ for (int x = 0; x < num_levels; x++) {
+ min_level = std::min(levels[x], min_level);
+ max_level = std::max(levels[x], max_level);
+ }
+ if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level >
max_level))) {
+ throw ParquetException("definition level exceeds maximum");
+ }
+}
+
+#if !defined(ARROW_HAVE_BMI2)
+
+inline void DefinitionLevelsToBitmapScalar(
+ const int16_t* def_levels, int64_t num_def_levels, const int16_t
max_definition_level,
+ const int16_t max_repetition_level, int64_t* values_read, int64_t*
null_count,
+ uint8_t* valid_bits, int64_t valid_bits_offset) {
+ // We assume here that valid_bits is large enough to accommodate the
+ // additional definition levels and the ones that have already been written
+ ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits,
valid_bits_offset,
+ num_def_levels);
+
+ // TODO(itaiin): As an interim solution we are splitting the code path here
+ // between repeated+flat column reads, and non-repeated+nested reads.
+ // Those paths need to be merged in the future
Review comment:
Is this TODO still relevant? Is it a performance issue or a behaviour
issue?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]