fsaintjacques commented on a change in pull request #6985:
URL: https://github.com/apache/arrow/pull/6985#discussion_r420091781
##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -610,6 +618,71 @@ class FirstTimeBitmapWriter {
}
}
+ /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
+ ///
+ /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits
are assumed
+ /// to be unset (i.e. 0).
+ /// \param[in] number_of_bits The number of bits to append from word.
+ void AppendWord(uint64_t word, int64_t number_of_bits) {
+#if defined(ARROW_LITTLE_ENDIAN)
+ if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
+ return;
+ }
+
+ // Location that the first byte needs to be written to.
+ uint8_t* append_position = bitmap_ + byte_offset_;
+
+ // Update state variables except for current_byte_ here.
+ position_ += number_of_bits;
+ int64_t bit_offset =
BitUtil::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
+ bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8];
+ byte_offset_ += (bit_offset + number_of_bits) / 8;
+
+ if (bit_offset != 0) {
+ // We are in the middle of the byte. This code updates the byte and
shifts
+ // bits appropriately within word so it can be memcpy'd below.
+ int64_t bits_to_carry = 8 - bit_offset;
+ // Carry over bits from word to current_byte_. We assume any extra bits
in word
+ // unset so no additional accounting is needed for when number_of_bits <
+ // bits_to_carry.
+ current_byte_ |= (word & BitUtil::kPrecedingBitmask[bits_to_carry]) <<
bit_offset;
+ // Check if everything is transfered into current_byte_.
+ if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
+ return;
+ }
+ *append_position = current_byte_;
+ append_position++;
+ // Move the carry bits off of word.
+ word = word >> bits_to_carry;
+ number_of_bits -= bits_to_carry;
+ }
+
+ int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits);
+ std::memcpy(append_position, &word, bytes_for_word);
+ // At this point, the previous current_byte_ has been written to bitmap_.
+ // The new current_byte_ is either the last relevant byte in 'word'
+ // or cleared if the new position is byte aligned (i.e. a fresh byte).
+ if (bit_mask_ == 0x1) {
+ current_byte_ = 0;
+ } else {
+ // current_byte_ = *(reinterpret_cast<uint8_t*>(&word) + bytes_for_word
- 1) : 0;
+ current_byte_ = *(append_position + bytes_for_word - 1);
+ }
+
+#else // big-endian
+ for (int x = 0; x < number_of_bits; x++) {
Review comment:
An aborting static_assert such that one does not get a surprise at
runtime?
##########
File path: cpp/src/parquet/column_reader.cc
##########
@@ -50,6 +51,141 @@ using arrow::internal::checked_cast;
namespace parquet {
+namespace {
+
+inline void CheckLevelRange(const int16_t* levels, int64_t num_levels,
+ const int16_t max_expected_level) {
+ int16_t min_level = std::numeric_limits<int16_t>::max();
+ int16_t max_level = std::numeric_limits<int16_t>::min();
+ for (int x = 0; x < num_levels; x++) {
+ min_level = std::min(levels[x], min_level);
+ max_level = std::max(levels[x], max_level);
+ }
+ if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level >
max_level))) {
Review comment:
`max_level > max_level`?
##########
File path: cpp/src/parquet/level_conversion.cc
##########
@@ -0,0 +1,170 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <limits>
+#if defined(ARROW_HAVE_BMI2)
+#include <x86intrin.h>
+#endif
+
+#include "arrow/util/bit_util.h"
+
+#include "parquet/exception.h"
+
+namespace parquet {
+namespace internal {
+namespace {
+inline void CheckLevelRange(const int16_t* levels, int64_t num_levels,
+ const int16_t max_expected_level) {
+ int16_t min_level = std::numeric_limits<int16_t>::max();
+ int16_t max_level = std::numeric_limits<int16_t>::min();
+ for (int x = 0; x < num_levels; x++) {
+ min_level = std::min(levels[x], min_level);
+ max_level = std::max(levels[x], max_level);
+ }
+ if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level >
max_level))) {
+ throw ParquetException("definition level exceeds maximum");
+ }
+}
+
+#if !defined(ARROW_HAVE_BMI2)
+
+inline void DefinitionLevelsToBitmapScalar(
+ const int16_t* def_levels, int64_t num_def_levels, const int16_t
max_definition_level,
+ const int16_t max_repetition_level, int64_t* values_read, int64_t*
null_count,
+ uint8_t* valid_bits, int64_t valid_bits_offset) {
+ // We assume here that valid_bits is large enough to accommodate the
+ // additional definition levels and the ones that have already been written
+ ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits,
valid_bits_offset,
+ num_def_levels);
+
+ // TODO(itaiin): As an interim solution we are splitting the code path here
+ // between repeated+flat column reads, and non-repeated+nested reads.
+ // Those paths need to be merged in the future
+ for (int i = 0; i < num_def_levels; ++i) {
+ if (def_levels[i] == max_definition_level) {
+ valid_bits_writer.Set();
+ } else if (max_repetition_level > 0) {
+ // repetition+flat case
+ if (def_levels[i] == (max_definition_level - 1)) {
+ valid_bits_writer.Clear();
+ *null_count += 1;
+ } else {
+ continue;
+ }
+ } else {
+ // non-repeated+nested case
+ if (def_levels[i] < max_definition_level) {
+ valid_bits_writer.Clear();
+ *null_count += 1;
+ } else {
+ throw ParquetException("definition level exceeds maximum");
+ }
+ }
+
+ valid_bits_writer.Next();
+ }
+ valid_bits_writer.Finish();
+ *values_read = valid_bits_writer.position();
+}
+#endif
+
+template <bool has_repeated_parent>
+void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t
num_def_levels,
+ const int16_t required_definition_level,
+ int64_t* values_read, int64_t* null_count,
+ uint8_t* valid_bits, int64_t
valid_bits_offset) {
+ constexpr int64_t kBitMaskSize = 64;
+ ::arrow::internal::FirstTimeBitmapWriter writer(valid_bits,
+
/*start_offset=*/valid_bits_offset,
+ /*length=*/num_def_levels);
+ int64_t set_count = 0;
+ *values_read = 0;
+ while (num_def_levels > 0) {
+ int64_t batch_size = std::min(num_def_levels, kBitMaskSize);
+ CheckLevelRange(def_levels, batch_size, required_definition_level);
+ uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels,
batch_size,
+
required_definition_level - 1);
+ if (has_repeated_parent) {
+#if defined(ARROW_HAVE_BMI2)
+ // This is currently a specialized code path assuming only (nested) lists
+ // present through the leaf (i.e. no structs).
+ // Upper level code only calls this method
+ // when the leaf-values are nullable (otherwise no spacing is needed),
+ // Because only nested lists exists it is sufficient to know that the
field
+ // was either null or included it (i.e. definition level >
max_definitation_level
+ // -2) If there where structs mixed in, we need to know the def_level of
the
+ // repeated parent so we can check for def_level > "def level of
repeated parent".
+ uint64_t present_bitmap = internal::GreaterThanBitmap(
+ def_levels, batch_size, required_definition_level - 2);
+ uint64_t selected_bits = _pext_u64(defined_bitmap, present_bitmap);
+ writer.AppendWord(selected_bits,
::arrow::BitUtil::PopCount(present_bitmap));
+ set_count += ::arrow::BitUtil::PopCount(selected_bits);
+#else
+ assert(false && "must not execute this without BMI2");
+#endif
+ } else {
+ writer.AppendWord(defined_bitmap, batch_size);
+ set_count += ::arrow::BitUtil::PopCount(defined_bitmap);
+ }
+ def_levels += batch_size;
+ num_def_levels -= batch_size;
+ }
+
+ *values_read = writer.position();
+ *null_count += *values_read - set_count;
+ writer.Finish();
+}
+
+} // namespace
+
+void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t
num_def_levels,
+ const int16_t max_definition_level,
+ const int16_t max_repetition_level, int64_t*
values_read,
+ int64_t* null_count, uint8_t* valid_bits,
+ int64_t valid_bits_offset) {
+ if (max_repetition_level > 0) {
Review comment:
The interlacing of 1 runtime branch and 2 static branches is hard to
read. It seems the big-endian version doesn't even close the block with a
corresponding parenthesis.
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+namespace internal {
+
+void PARQUET_EXPORT DefinitionLevelsToBitmap(
+ const int16_t* def_levels, int64_t num_def_levels, const int16_t
max_definition_level,
+ const int16_t max_repetition_level, int64_t* values_read, int64_t*
null_count,
+ uint8_t* valid_bits, int64_t valid_bits_offset);
+
+// These APIs are likely to be revised as part of ARROW-8494 to reduce
duplicate code.
+// They currently represent minimal functionality for vectorized computation
of definition
+// levels.
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// Builds a bitmap by applying predicate to the level vector provided.
+///
+/// \param[in] levels Rep or def level array.
+/// \param[in] num_levels The number of levels to process (must be [0, 64])
+/// \param[in] predicate The predicate to apply (must have the signature `bool
+/// predicate(int16_t)`.
+/// \returns The bitmap using least significant "bit" ordering.
+///
+/// N.B. Correct byte ordering is dependent on little-endian architectures.
+///
+template <typename Predicate>
+uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels, Predicate
predicate) {
+ // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
+ uint64_t mask = 0;
+ for (int x = 0; x < num_levels; x++) {
+ mask |= static_cast<int64_t>(predicate(levels[x]) ? 1 : 0) << x;
Review comment:
- Mixing `uint64_t` and `int64_t` in the mask update.
- Add a DCHECK_LE(num_levels, 64).
##########
File path: cpp/src/parquet/level_conversion.cc
##########
@@ -0,0 +1,170 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <limits>
+#if defined(ARROW_HAVE_BMI2)
+#include <x86intrin.h>
+#endif
+
+#include "arrow/util/bit_util.h"
+
+#include "parquet/exception.h"
+
+namespace parquet {
+namespace internal {
+namespace {
+inline void CheckLevelRange(const int16_t* levels, int64_t num_levels,
+ const int16_t max_expected_level) {
+ int16_t min_level = std::numeric_limits<int16_t>::max();
+ int16_t max_level = std::numeric_limits<int16_t>::min();
+ for (int x = 0; x < num_levels; x++) {
+ min_level = std::min(levels[x], min_level);
+ max_level = std::max(levels[x], max_level);
+ }
+ if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level >
max_level))) {
+ throw ParquetException("definition level exceeds maximum");
+ }
+}
+
+#if !defined(ARROW_HAVE_BMI2)
+
+inline void DefinitionLevelsToBitmapScalar(
+ const int16_t* def_levels, int64_t num_def_levels, const int16_t
max_definition_level,
+ const int16_t max_repetition_level, int64_t* values_read, int64_t*
null_count,
+ uint8_t* valid_bits, int64_t valid_bits_offset) {
+ // We assume here that valid_bits is large enough to accommodate the
+ // additional definition levels and the ones that have already been written
+ ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits,
valid_bits_offset,
+ num_def_levels);
+
+ // TODO(itaiin): As an interim solution we are splitting the code path here
+ // between repeated+flat column reads, and non-repeated+nested reads.
+ // Those paths need to be merged in the future
+ for (int i = 0; i < num_def_levels; ++i) {
+ if (def_levels[i] == max_definition_level) {
+ valid_bits_writer.Set();
+ } else if (max_repetition_level > 0) {
+ // repetition+flat case
+ if (def_levels[i] == (max_definition_level - 1)) {
+ valid_bits_writer.Clear();
+ *null_count += 1;
+ } else {
+ continue;
+ }
+ } else {
+ // non-repeated+nested case
+ if (def_levels[i] < max_definition_level) {
+ valid_bits_writer.Clear();
+ *null_count += 1;
+ } else {
+ throw ParquetException("definition level exceeds maximum");
+ }
+ }
+
+ valid_bits_writer.Next();
+ }
+ valid_bits_writer.Finish();
+ *values_read = valid_bits_writer.position();
+}
+#endif
+
+template <bool has_repeated_parent>
+void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t
num_def_levels,
+ const int16_t required_definition_level,
+ int64_t* values_read, int64_t* null_count,
+ uint8_t* valid_bits, int64_t
valid_bits_offset) {
+ constexpr int64_t kBitMaskSize = 64;
+ ::arrow::internal::FirstTimeBitmapWriter writer(valid_bits,
+
/*start_offset=*/valid_bits_offset,
+ /*length=*/num_def_levels);
+ int64_t set_count = 0;
+ *values_read = 0;
+ while (num_def_levels > 0) {
Review comment:
The number of iterations is only dependent on num_def_levels. Break the
loop in increments of kBitMaskSize and process the leftover if needed. This
will ensure that `batch_size` is statically known to be `kBitMaskSize` inside
the loop, and thus the compiler will be able to apply constant propagation and
forward to `GreaterThanBitmap`.
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+namespace internal {
+
+void PARQUET_EXPORT DefinitionLevelsToBitmap(
+ const int16_t* def_levels, int64_t num_def_levels, const int16_t
max_definition_level,
+ const int16_t max_repetition_level, int64_t* values_read, int64_t*
null_count,
+ uint8_t* valid_bits, int64_t valid_bits_offset);
+
+// These APIs are likely to be revised as part of ARROW-8494 to reduce
duplicate code.
+// They currently represent minimal functionality for vectorized computation
of definition
+// levels.
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// Builds a bitmap by applying predicate to the level vector provided.
+///
+/// \param[in] levels Rep or def level array.
+/// \param[in] num_levels The number of levels to process (must be [0, 64])
+/// \param[in] predicate The predicate to apply (must have the signature `bool
+/// predicate(int16_t)`.
+/// \returns The bitmap using least significant "bit" ordering.
+///
+/// N.B. Correct byte ordering is dependent on little-endian architectures.
+///
+template <typename Predicate>
+uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels, Predicate
predicate) {
+ // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
+ uint64_t mask = 0;
+ for (int x = 0; x < num_levels; x++) {
+ mask |= static_cast<int64_t>(predicate(levels[x]) ? 1 : 0) << x;
+ }
+ return mask;
+}
+
+/// Builds a bitmap where each set bit indicates the corresponding level is
greater
+/// than rhs.
+static inline uint64_t GreaterThanBitmap(const int16_t* levels, int64_t
num_levels,
+ int16_t rhs) {
+ return LevelsToBitmap(levels, num_levels, [&](int16_t value) { return value
> rhs; });
Review comment:
You should just capture rhs by value.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]