kou commented on code in PR #14147:
URL: https://github.com/apache/arrow/pull/14147#discussion_r973778431
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+ throw ParquetException("Received invalid number of bytes : " +
+ std::to_string(num_bytes) + " (corrupt data
page?)");
+ }
+
+ const uint8_t* decoder_data = data + 4;
+ decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data,
num_bytes,
+ /*bit_width=*/1);
+ }
+
+ int Decode(bool* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ int val = 0;
Review Comment:
`bool`?
Or can we use `decoder_->GetBatch()` in this method too?
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+ throw ParquetException("Received invalid number of bytes : " +
+ std::to_string(num_bytes) + " (corrupt data
page?)");
+ }
+
+ const uint8_t* decoder_data = data + 4;
+ decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data,
num_bytes,
+ /*bit_width=*/1);
+ }
+
+ int Decode(bool* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ int val = 0;
+
+ for (int i = 0; i < max_values; ++i) {
+ if (!decoder_->Get(&val)) {
+ throw ParquetException("Unable to parse bits (corrupt data page?)");
Review Comment:
Can we also show invalid position?
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+ throw ParquetException("Received invalid number of bytes : " +
+ std::to_string(num_bytes) + " (corrupt data
page?)");
+ }
+
+ const uint8_t* decoder_data = data + 4;
Review Comment:
```suggestion
auto decoder_data = data + 4;
```
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
Review Comment:
```suggestion
if (num_bytes < 0 || num_bytes > static_cast<uint32_t>(len - 4)) {
```
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+ throw ParquetException("Received invalid number of bytes : " +
+ std::to_string(num_bytes) + " (corrupt data
page?)");
+ }
+
+ const uint8_t* decoder_data = data + 4;
+ decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data,
num_bytes,
+ /*bit_width=*/1);
+ }
+
+ int Decode(bool* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ int val = 0;
+
+ for (int i = 0; i < max_values; ++i) {
+ if (!decoder_->Get(&val)) {
+ throw ParquetException("Unable to parse bits (corrupt data page?)");
+ }
+ if (val) {
+ buffer[i] = true;
+ } else {
+ buffer[i] = false;
+ }
+ }
+ num_values_ -= max_values;
+ return max_values;
+ }
+
+ int Decode(uint8_t* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ if (!decoder_->GetBatch(buffer, max_values)) {
+ ParquetException::EofException();
+ }
+
+ num_values_ -= max_values;
Review Comment:
It seems that `decoder_->GetBatch()` returns the number of read values. It
may be smaller than `max_values`. So this may cause invalid `num_values_`.
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+ throw ParquetException("Received invalid number of bytes : " +
+ std::to_string(num_bytes) + " (corrupt data
page?)");
+ }
+
+ const uint8_t* decoder_data = data + 4;
+ decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data,
num_bytes,
+ /*bit_width=*/1);
+ }
+
+ int Decode(bool* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ int val = 0;
+
+ for (int i = 0; i < max_values; ++i) {
+ if (!decoder_->Get(&val)) {
+ throw ParquetException("Unable to parse bits (corrupt data page?)");
+ }
+ if (val) {
+ buffer[i] = true;
+ } else {
+ buffer[i] = false;
+ }
+ }
+ num_values_ -= max_values;
+ return max_values;
+ }
+
+ int Decode(uint8_t* buffer, int max_values) {
Review Comment:
`override` is missing?
https://github.com/apache/arrow/actions/runs/3076096805/jobs/4970024700#step:6:1903
```text
FAILED: src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o
/usr/bin/ccache /usr/lib/ccache/clang++-12 -DARROW_HAVE_RUNTIME_AVX2
-DARROW_HAVE_RUNTIME_AVX512 -DARROW_HAVE_RUNTIME_BMI2
-DARROW_HAVE_RUNTIME_SSE4_2 -DARROW_HAVE_SSE4_2 -DARROW_NO_DEPRECATED_API
-DARROW_UBSAN -DARROW_WITH_BROTLI -DARROW_WITH_BZ2 -DARROW_WITH_LZ4
-DARROW_WITH_RE2 -DARROW_WITH_SNAPPY -DARROW_WITH_UTF8PROC -DARROW_WITH_ZLIB
-DARROW_WITH_ZSTD -DBOOST_ALL_NO_LIB -DHAVE_INTTYPES_H -DHAVE_NETDB_H
-DHAVE_NETINET_IN_H -DPARQUET_EXPORTING -DPARQUET_THRIFT_VERSION_MAJOR=0
-DPARQUET_THRIFT_VERSION_MINOR=13 -DUTF8PROC_STATIC -Isrc -I/arrow/cpp/src
-I/arrow/cpp/src/generated -isystem /arrow/cpp/thirdparty/flatbuffers/include
-isystem /arrow/cpp/thirdparty/hadoop/include -isystem
google_cloud_cpp_ep-install/include -isystem absl_ep-install/include -isystem
crc32c_ep-install/include -isystem opentelemetry_ep-install/include -isystem
protobuf_ep-install/include -isystem utf8proc_ep-install/include -isystem
xsimd_ep/src/xsimd_ep-install/include -Qunused-arguments -fcolor-diagnos
tics -ggdb -O0 -Wall -Wextra -Wdocumentation -Wshorten-64-to-32
-Wno-missing-braces -Wno-unused-parameter -Wno-constant-logical-operand
-Wno-return-stack-address -Werror -Wno-unknown-warning-option -Wno-pass-failed
-msse4.2 -fsanitize=address -DADDRESS_SANITIZER -fsanitize=undefined
-fno-sanitize=alignment,vptr,function,float-divide-by-zero
-fno-sanitize-recover=all
-fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep
-fsanitize-blacklist=/arrow/cpp/build-support/sanitizer-disallowed-entries.txt
-g -fPIC
-fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep
-pthread -std=c++17 -MD -MT
src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o -MF
src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o.d -o
src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o -c
/arrow/cpp/src/parquet/encoding.cc
/arrow/cpp/src/parquet/encoding.cc:2405:7: error: 'Decode' overrides a
member function but is not marked 'override'
[-Werror,-Winconsistent-missing-override]
int Decode(uint8_t* buffer, int max_values) {
^
/arrow/cpp/src/parquet/encoding.h:400:15: note: overridden virtual function
is here
virtual int Decode(uint8_t* buffer, int max_values) = 0;
^
1 error generated.
```
##########
cpp/src/parquet/level_comparison.cc:
##########
@@ -44,10 +44,10 @@ struct GreaterThanDynamicFunction {
using FunctionType = decltype(&GreaterThanBitmap);
static std::vector<std::pair<DispatchLevel, FunctionType>> implementations()
{
- return {
- { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
+ return {{DispatchLevel::NONE, standard::GreaterThanBitmapImpl}
#if defined(ARROW_HAVE_RUNTIME_AVX2)
- , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
+ ,
+ {DispatchLevel::AVX2, GreaterThanBitmapAvx2}
Review Comment:
Hmm, you might not use `clang-format-12`:
https://github.com/apache/arrow/actions/runs/3076096806/jobs/4970024641#step:5:869
```text
FAILED: CMakeFiles/check-format
cd /tmp/arrow-lint-mbv5v4zf/cpp-build && /usr/local/bin/python
/arrow/cpp/build-support/run_clang_format.py --clang_format_binary
/usr/bin/clang-format-12 --exclude_globs
/arrow/cpp/build-support/lint_exclusions.txt --source_dir /arrow/cpp/src
--source_dir /arrow/cpp/examples --source_dir /arrow/cpp/tools --quiet
--- /arrow/cpp/src/arrow/util/bpacking.cc
+++ /arrow/cpp/src/arrow/util/bpacking.cc (after clang format)
@@ -153,14 +153,13 @@
using FunctionType = decltype(&unpack32_default);
static std::vector<std::pair<DispatchLevel, FunctionType>>
implementations() {
- return {{DispatchLevel::NONE, unpack32_default}
+ return {
+ { DispatchLevel::NONE, unpack32_default }
#if defined(ARROW_HAVE_RUNTIME_AVX2)
- ,
- {DispatchLevel::AVX2, unpack32_avx2}
+ , { DispatchLevel::AVX2, unpack32_avx2 }
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
- ,
- {DispatchLevel::AVX512, unpack32_avx512}
+ , { DispatchLevel::AVX512, unpack32_avx512 }
#endif
};
}
--- /arrow/cpp/src/parquet/level_comparison.cc
+++ /arrow/cpp/src/parquet/level_comparison.cc (after clang format)
@@ -44,10 +44,10 @@
using FunctionType = decltype(&GreaterThanBitmap);
static std::vector<std::pair<DispatchLevel, FunctionType>>
implementations() {
- return {{DispatchLevel::NONE, standard::GreaterThanBitmapImpl}
+ return {
+ { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
#if defined(ARROW_HAVE_RUNTIME_AVX2)
- ,
- {DispatchLevel::AVX2, GreaterThanBitmapAvx2}
+ , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
#endif
};
}
@@ -57,10 +57,10 @@
using FunctionType = decltype(&FindMinMax);
static std::vector<std::pair<DispatchLevel, FunctionType>>
implementations() {
- return {{DispatchLevel::NONE, standard::FindMinMaxImpl}
+ return {
+ { DispatchLevel::NONE, standard::FindMinMaxImpl }
#if defined(ARROW_HAVE_RUNTIME_AVX2)
- ,
- {DispatchLevel::AVX2, FindMinMaxAvx2}
+ , { DispatchLevel::AVX2, FindMinMaxAvx2 }
#endif
};
}
```
Could you try `archery lint --clang-format --fix` instead?
https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]