kou commented on code in PR #14147: URL: https://github.com/apache/arrow/pull/14147#discussion_r973778431
########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + uint32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length : " + std::to_string(len) + + " (corrupt data page?)"); + } + // Load the first 4 bytes in little-endian, which indicates the length + num_bytes = + ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data)); + if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) { + throw ParquetException("Received invalid number of bytes : " + + std::to_string(num_bytes) + " (corrupt data page?)"); + } + + const uint8_t* decoder_data = data + 4; + decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes, + /*bit_width=*/1); + } + + int Decode(bool* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + int val = 0; Review Comment: `bool`? Or can we use `decoder_->GetBatch()` in this method too? ########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + uint32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length : " + std::to_string(len) + + " (corrupt data page?)"); + } + // Load the first 4 bytes in little-endian, which indicates the length + num_bytes = + ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data)); + if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) { + throw ParquetException("Received invalid number of bytes : " + + std::to_string(num_bytes) + " (corrupt data page?)"); + } + + const uint8_t* decoder_data = data + 4; + decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes, + /*bit_width=*/1); + } + + int Decode(bool* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + int val = 0; + + for (int i = 0; i < max_values; ++i) { + if (!decoder_->Get(&val)) { + throw ParquetException("Unable to parse bits (corrupt data page?)"); Review Comment: Can we also show invalid position? ########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + uint32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length : " + std::to_string(len) + + " (corrupt data page?)"); + } + // Load the first 4 bytes in little-endian, which indicates the length + num_bytes = + ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data)); + if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) { + throw ParquetException("Received invalid number of bytes : " + + std::to_string(num_bytes) + " (corrupt data page?)"); + } + + const uint8_t* decoder_data = data + 4; Review Comment: ```suggestion auto decoder_data = data + 4; ``` ########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + uint32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length : " + std::to_string(len) + + " (corrupt data page?)"); + } + // Load the first 4 bytes in little-endian, which indicates the length + num_bytes = + ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data)); + if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) { Review Comment: ```suggestion if (num_bytes < 0 || num_bytes > static_cast<uint32_t>(len - 4)) { ``` ########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + uint32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length : " + std::to_string(len) + + " (corrupt data page?)"); + } + // Load the first 4 bytes in little-endian, which indicates the length + num_bytes = + ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data)); + if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) { + throw ParquetException("Received invalid number of bytes : " + + std::to_string(num_bytes) + " (corrupt data page?)"); + } + + const uint8_t* decoder_data = data + 4; + decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes, + /*bit_width=*/1); + } + + int Decode(bool* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + int val = 0; + + for (int i = 0; i < max_values; ++i) { + if (!decoder_->Get(&val)) { + throw ParquetException("Unable to parse bits (corrupt data page?)"); + } + if (val) { + buffer[i] = true; + } else { + buffer[i] = false; + } + } + num_values_ -= max_values; + return max_values; + } + + int Decode(uint8_t* buffer, int max_values) { + max_values = std::min(max_values, num_values_); + if (!decoder_->GetBatch(buffer, max_values)) { + ParquetException::EofException(); + } + + num_values_ -= max_values; Review Comment: It seems that `decoder_->GetBatch()` returns the number of read values. It may be smaller than `max_values`. So this may cause invalid `num_values_`. ########## cpp/src/parquet/encoding.cc: ########## @@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<ResizableBuffer> buffered_data_; }; +// ---------------------------------------------------------------------- +// RLE_BOOLEAN_DECODER + +class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { + public: + explicit RleBooleanDecoder(const ColumnDescriptor* descr) + : DecoderImpl(descr, Encoding::RLE) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + num_values_ = num_values; + uint32_t num_bytes = 0; + + if (len < 4) { + throw ParquetException("Received invalid length : " + std::to_string(len) + + " (corrupt data page?)"); + } + // Load the first 4 bytes in little-endian, which indicates the length + num_bytes = + ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data)); + if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) { + throw ParquetException("Received invalid number of bytes : " + + std::to_string(num_bytes) + " (corrupt data page?)"); + } + + const uint8_t* decoder_data = data + 4; + decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes, + /*bit_width=*/1); + } + + int Decode(bool* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + int val = 0; + + for (int i = 0; i < max_values; ++i) { + if (!decoder_->Get(&val)) { + throw ParquetException("Unable to parse bits (corrupt data page?)"); + } + if (val) { + buffer[i] = true; + } else { + buffer[i] = false; + } + } + num_values_ -= max_values; + return max_values; + } + + int Decode(uint8_t* buffer, int max_values) { Review Comment: `override` is missing? https://github.com/apache/arrow/actions/runs/3076096805/jobs/4970024700#step:6:1903 ```text FAILED: src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o /usr/bin/ccache /usr/lib/ccache/clang++-12 -DARROW_HAVE_RUNTIME_AVX2 -DARROW_HAVE_RUNTIME_AVX512 -DARROW_HAVE_RUNTIME_BMI2 -DARROW_HAVE_RUNTIME_SSE4_2 -DARROW_HAVE_SSE4_2 -DARROW_NO_DEPRECATED_API -DARROW_UBSAN -DARROW_WITH_BROTLI -DARROW_WITH_BZ2 -DARROW_WITH_LZ4 -DARROW_WITH_RE2 -DARROW_WITH_SNAPPY -DARROW_WITH_UTF8PROC -DARROW_WITH_ZLIB -DARROW_WITH_ZSTD -DBOOST_ALL_NO_LIB -DHAVE_INTTYPES_H -DHAVE_NETDB_H -DHAVE_NETINET_IN_H -DPARQUET_EXPORTING -DPARQUET_THRIFT_VERSION_MAJOR=0 -DPARQUET_THRIFT_VERSION_MINOR=13 -DUTF8PROC_STATIC -Isrc -I/arrow/cpp/src -I/arrow/cpp/src/generated -isystem /arrow/cpp/thirdparty/flatbuffers/include -isystem /arrow/cpp/thirdparty/hadoop/include -isystem google_cloud_cpp_ep-install/include -isystem absl_ep-install/include -isystem crc32c_ep-install/include -isystem opentelemetry_ep-install/include -isystem protobuf_ep-install/include -isystem utf8proc_ep-install/include -isystem xsimd_ep/src/xsimd_ep-install/include -Qunused-arguments -fcolor-diagnos tics -ggdb -O0 -Wall -Wextra -Wdocumentation -Wshorten-64-to-32 -Wno-missing-braces -Wno-unused-parameter -Wno-constant-logical-operand -Wno-return-stack-address -Werror -Wno-unknown-warning-option -Wno-pass-failed -msse4.2 -fsanitize=address -DADDRESS_SANITIZER -fsanitize=undefined -fno-sanitize=alignment,vptr,function,float-divide-by-zero -fno-sanitize-recover=all -fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep -fsanitize-blacklist=/arrow/cpp/build-support/sanitizer-disallowed-entries.txt -g -fPIC -fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep -pthread -std=c++17 -MD -MT src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o -MF src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o.d -o src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o -c /arrow/cpp/src/parquet/encoding.cc /arrow/cpp/src/parquet/encoding.cc:2405:7: error: 'Decode' overrides a member function but is not marked 'override' [-Werror,-Winconsistent-missing-override] int Decode(uint8_t* buffer, int max_values) { ^ /arrow/cpp/src/parquet/encoding.h:400:15: note: overridden virtual function is here virtual int Decode(uint8_t* buffer, int max_values) = 0; ^ 1 error generated. ``` ########## cpp/src/parquet/level_comparison.cc: ########## @@ -44,10 +44,10 @@ struct GreaterThanDynamicFunction { using FunctionType = decltype(&GreaterThanBitmap); static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() { - return { - { DispatchLevel::NONE, standard::GreaterThanBitmapImpl } + return {{DispatchLevel::NONE, standard::GreaterThanBitmapImpl} #if defined(ARROW_HAVE_RUNTIME_AVX2) - , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 } + , + {DispatchLevel::AVX2, GreaterThanBitmapAvx2} Review Comment: Hmm, you might not use `clang-format-12`: https://github.com/apache/arrow/actions/runs/3076096806/jobs/4970024641#step:5:869 ```text FAILED: CMakeFiles/check-format cd /tmp/arrow-lint-mbv5v4zf/cpp-build && /usr/local/bin/python /arrow/cpp/build-support/run_clang_format.py --clang_format_binary /usr/bin/clang-format-12 --exclude_globs /arrow/cpp/build-support/lint_exclusions.txt --source_dir /arrow/cpp/src --source_dir /arrow/cpp/examples --source_dir /arrow/cpp/tools --quiet --- /arrow/cpp/src/arrow/util/bpacking.cc +++ /arrow/cpp/src/arrow/util/bpacking.cc (after clang format) @@ -153,14 +153,13 @@ using FunctionType = decltype(&unpack32_default); static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() { - return {{DispatchLevel::NONE, unpack32_default} + return { + { DispatchLevel::NONE, unpack32_default } #if defined(ARROW_HAVE_RUNTIME_AVX2) - , - {DispatchLevel::AVX2, unpack32_avx2} + , { DispatchLevel::AVX2, unpack32_avx2 } #endif #if defined(ARROW_HAVE_RUNTIME_AVX512) - , - {DispatchLevel::AVX512, unpack32_avx512} + , { DispatchLevel::AVX512, unpack32_avx512 } #endif }; } --- /arrow/cpp/src/parquet/level_comparison.cc +++ /arrow/cpp/src/parquet/level_comparison.cc (after clang format) @@ -44,10 +44,10 @@ using FunctionType = decltype(&GreaterThanBitmap); static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() { - return {{DispatchLevel::NONE, standard::GreaterThanBitmapImpl} + return { + { DispatchLevel::NONE, standard::GreaterThanBitmapImpl } #if defined(ARROW_HAVE_RUNTIME_AVX2) - , - {DispatchLevel::AVX2, GreaterThanBitmapAvx2} + , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 } #endif }; } @@ -57,10 +57,10 @@ using FunctionType = decltype(&FindMinMax); static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() { - return {{DispatchLevel::NONE, standard::FindMinMaxImpl} + return { + { DispatchLevel::NONE, standard::FindMinMaxImpl } #if defined(ARROW_HAVE_RUNTIME_AVX2) - , - {DispatchLevel::AVX2, FindMinMaxAvx2} + , { DispatchLevel::AVX2, FindMinMaxAvx2 } #endif }; } ``` Could you try `archery lint --clang-format --fix` instead? https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org