kou commented on code in PR #14147: URL: https://github.com/apache/arrow/pull/14147#discussion_r972757477
########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,83 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + :DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + int32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length (corrupt data page?)"); Review Comment: How about showing the invalid length too? ########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,83 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + :DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + int32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length (corrupt data page?)"); + } + // Load the first 4 bytes, which indicates the legnth + num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data); + if (num_bytes < 0 || num_bytes > len - 4) { + throw ParquetException("Received invalid number of bytes (corrupt data page?)"); Review Comment: Could you also show the invalid value? 
########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,83 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + :DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + int32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length (corrupt data page?)"); + } + // Load the first 4 bytes, which indicates the legnth + num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data); + if (num_bytes < 0 || num_bytes > len - 4) { + throw ParquetException("Received invalid number of bytes (corrupt data page?)"); + } + + const uint8_t* decoder_data = data + 4; + if (len == 0) { Review Comment: It seems that `len` is always `>= 4` here. ########## cpp/src/parquet/encoding.cc: ########## @@ -2762,6 +2839,11 @@ std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encodin return std::unique_ptr<Decoder>(new DeltaLengthByteArrayDecoder(descr)); } throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY"); + } else if (encoding == Encoding::RLE) { + if (type_num == Type::BOOLEAN) { + return std::unique_ptr<Decoder>(new RleBooleanDecoder(descr)); + } + throw ParquetException("RLE encoding only supports BINARY"); Review Comment: Should this message say `BOOLEAN` instead of `BINARY`?
########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,83 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + :DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + int32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length (corrupt data page?)"); + } + // Load the first 4 bytes, which indicates the legnth + num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data); Review Comment: Should this be `uint32_t`? https://parquet.apache.org/docs/file-format/data-pages/encodings/#a-namerlearun-length-encoding--bit-packing-hybrid-rle--3 > length := length of the <encoded-data> in bytes stored as 4 bytes little endian (unsigned int32) Could you parse this as a little-endian value on both little-endian and big-endian environments? We may be able to use `arrow/util/endian.h` for it... -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org