sfc-gh-nthimmegowda commented on code in PR #14147: URL: https://github.com/apache/arrow/pull/14147#discussion_r977099469
########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,81 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + uint32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length : " + std::to_string(len) + + " (corrupt data page?)"); + } + // Load the first 4 bytes in little-endian, which indicates the length + num_bytes = + ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data)); + if (num_bytes < 0 || num_bytes > static_cast<uint32_t>(len - 4)) { + throw ParquetException("Received invalid number of bytes : " + + std::to_string(num_bytes) + " (corrupt data page?)"); + } + + auto decoder_data = data + 4; + decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes, + /*bit_width=*/1); + } + + int Decode(bool* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + int val = 0; + + for (int i = 0; i < max_values; ++i) { + if (!decoder_->Get(&val)) { + throw ParquetException("Unable to parse bits for position (0 based) : " + + std::to_string(i) + " (corrupt data page?)"); + } + if (val) { + buffer[i] = true; + } else { + buffer[i] = false; + } + } + num_values_ -= max_values; + return max_values; + } + + int Decode(uint8_t* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + if (decoder_->GetBatch(buffer, max_values) != max_values) { Review Comment: My bad again, I assumed `col->ReadBatch` , will internally call `decoder_->GetBatch`. This method is covered in test `VectorBooleanTest` I went looking into the history why a boolean decoder has bits written to `uint8_t` datatype. This was introduced in this [commit](https://github.com/apache/arrow/commit/3d435e4f8d5fb7a54a4a9d285e1a42d60186d8dc#diff-61373111f1c45b61c1232091a83f44e66895cf9d27b264d9cbfceac45ff87941). This was introduced for testing purposes. The data is encoded with data from `vector<bool>` however it not possible to get reference for `bool` from `vector<bool>` which is needed to pass to Decode function `int Decode(bool* buffer, int max_values) `. [Link](https://en.wikipedia.org/w/index.php?title=Sequence_container_(C%2B%2B)&oldid=767869909#Specialization_for_bool) . To bypass we introduced a reader with `uint_8` datatype. I think a refactor on this is due, which is my next plan, that we replace `vector<bool>` with `bool[]` in test case and remove the class `BooleanDecoder` Proposed fix is here. Not including in this PR for compartmentalization. ```diff diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a862743d5..4a4646724 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1147,8 +1147,7 @@ int PlainDecoder<DType>::Decode(T* buffer, int max_values) { } class PlainBooleanDecoder : public DecoderImpl, - virtual public TypedDecoder<BooleanType>, - virtual public BooleanDecoder { + virtual public TypedDecoder<BooleanType> { public: explicit PlainBooleanDecoder(const ColumnDescriptor* descr); void SetData(int num_values, const uint8_t* data, int len) override; @@ -2358,7 +2357,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, // ---------------------------------------------------------------------- // RLE_BOOLEAN_DECODER -class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { +class RleBooleanDecoder : public DecoderImpl, virtual public TypedDecoder<BooleanType> { public: explicit RleBooleanDecoder(const ColumnDescriptor* descr) : DecoderImpl(descr, Encoding::RLE) {} diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index b9ca7a7ee..c32da9979 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -65,7 +65,7 @@ using FLBAEncoder = TypedEncoder<FLBAType>; template <typename DType> class TypedDecoder; -class BooleanDecoder; +using BooleanDecoder = TypedDecoder<BooleanType>; using Int32Decoder = TypedDecoder<Int32Type>; using Int64Decoder = TypedDecoder<Int64Type>; using Int96Decoder = TypedDecoder<Int96Type>; @@ -394,12 +394,6 @@ class DictDecoder : virtual public TypedDecoder<DType> { // ---------------------------------------------------------------------- // TypedEncoder specializations, traits, and factory functions -class BooleanDecoder : virtual public TypedDecoder<BooleanType> { - public: - using TypedDecoder<BooleanType>::Decode; - virtual int Decode(uint8_t* buffer, int max_values) = 0; -}; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org