kou commented on code in PR #14147:
URL: https://github.com/apache/arrow/pull/14147#discussion_r973778431


##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+  explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+      : DecoderImpl(descr, Encoding::RLE) {}
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    uint32_t num_bytes = 0;
+
+    if (len < 4) {
+      throw ParquetException("Received invalid length : " + 
std::to_string(len) +
+                             " (corrupt data page?)");
+    }
+    // Load the first 4 bytes in little-endian, which indicates the length
+    num_bytes =
+        
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+    if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+      throw ParquetException("Received invalid number of bytes : " +
+                             std::to_string(num_bytes) + " (corrupt data 
page?)");
+    }
+
+    const uint8_t* decoder_data = data + 4;
+    decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, 
num_bytes,
+                                                           /*bit_width=*/1);
+  }
+
+  int Decode(bool* buffer, int max_values) override {
+    max_values = std::min(max_values, num_values_);
+    int val = 0;

Review Comment:
   `bool`?
   
   Or can we use `decoder_->GetBatch()` in this method too?



##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+  explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+      : DecoderImpl(descr, Encoding::RLE) {}
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    uint32_t num_bytes = 0;
+
+    if (len < 4) {
+      throw ParquetException("Received invalid length : " + 
std::to_string(len) +
+                             " (corrupt data page?)");
+    }
+    // Load the first 4 bytes in little-endian, which indicates the length
+    num_bytes =
+        
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+    if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+      throw ParquetException("Received invalid number of bytes : " +
+                             std::to_string(num_bytes) + " (corrupt data 
page?)");
+    }
+
+    const uint8_t* decoder_data = data + 4;
+    decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, 
num_bytes,
+                                                           /*bit_width=*/1);
+  }
+
+  int Decode(bool* buffer, int max_values) override {
+    max_values = std::min(max_values, num_values_);
+    int val = 0;
+
+    for (int i = 0; i < max_values; ++i) {
+      if (!decoder_->Get(&val)) {
+        throw ParquetException("Unable to parse bits (corrupt data page?)");

Review Comment:
   Can we also show invalid position?



##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+  explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+      : DecoderImpl(descr, Encoding::RLE) {}
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    uint32_t num_bytes = 0;
+
+    if (len < 4) {
+      throw ParquetException("Received invalid length : " + 
std::to_string(len) +
+                             " (corrupt data page?)");
+    }
+    // Load the first 4 bytes in little-endian, which indicates the length
+    num_bytes =
+        
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+    if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+      throw ParquetException("Received invalid number of bytes : " +
+                             std::to_string(num_bytes) + " (corrupt data 
page?)");
+    }
+
+    const uint8_t* decoder_data = data + 4;

Review Comment:
   ```suggestion
       auto decoder_data = data + 4;
   ```



##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+  explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+      : DecoderImpl(descr, Encoding::RLE) {}
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    uint32_t num_bytes = 0;
+
+    if (len < 4) {
+      throw ParquetException("Received invalid length : " + 
std::to_string(len) +
+                             " (corrupt data page?)");
+    }
+    // Load the first 4 bytes in little-endian, which indicates the length
+    num_bytes =
+        
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+    if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {

Review Comment:
   ```suggestion
       if (num_bytes < 0 || num_bytes > static_cast<uint32_t>(len - 4)) {
   ```



##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+  explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+      : DecoderImpl(descr, Encoding::RLE) {}
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    uint32_t num_bytes = 0;
+
+    if (len < 4) {
+      throw ParquetException("Received invalid length : " + 
std::to_string(len) +
+                             " (corrupt data page?)");
+    }
+    // Load the first 4 bytes in little-endian, which indicates the length
+    num_bytes =
+        
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+    if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+      throw ParquetException("Received invalid number of bytes : " +
+                             std::to_string(num_bytes) + " (corrupt data 
page?)");
+    }
+
+    const uint8_t* decoder_data = data + 4;
+    decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, 
num_bytes,
+                                                           /*bit_width=*/1);
+  }
+
+  int Decode(bool* buffer, int max_values) override {
+    max_values = std::min(max_values, num_values_);
+    int val = 0;
+
+    for (int i = 0; i < max_values; ++i) {
+      if (!decoder_->Get(&val)) {
+        throw ParquetException("Unable to parse bits (corrupt data page?)");
+      }
+      if (val) {
+        buffer[i] = true;
+      } else {
+        buffer[i] = false;
+      }
+    }
+    num_values_ -= max_values;
+    return max_values;
+  }
+
+  int Decode(uint8_t* buffer, int max_values) {
+    max_values = std::min(max_values, num_values_);
+    if (!decoder_->GetBatch(buffer, max_values)) {
+      ParquetException::EofException();
+    }
+
+    num_values_ -= max_values;

Review Comment:
   It seems that `decoder_->GetBatch()` returns the number of read values. It 
may be smaller than `max_values`. So this may cause invalid `num_values_`.



##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+  explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+      : DecoderImpl(descr, Encoding::RLE) {}
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    uint32_t num_bytes = 0;
+
+    if (len < 4) {
+      throw ParquetException("Received invalid length : " + 
std::to_string(len) +
+                             " (corrupt data page?)");
+    }
+    // Load the first 4 bytes in little-endian, which indicates the length
+    num_bytes =
+        
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+    if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+      throw ParquetException("Received invalid number of bytes : " +
+                             std::to_string(num_bytes) + " (corrupt data 
page?)");
+    }
+
+    const uint8_t* decoder_data = data + 4;
+    decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, 
num_bytes,
+                                                           /*bit_width=*/1);
+  }
+
+  int Decode(bool* buffer, int max_values) override {
+    max_values = std::min(max_values, num_values_);
+    int val = 0;
+
+    for (int i = 0; i < max_values; ++i) {
+      if (!decoder_->Get(&val)) {
+        throw ParquetException("Unable to parse bits (corrupt data page?)");
+      }
+      if (val) {
+        buffer[i] = true;
+      } else {
+        buffer[i] = false;
+      }
+    }
+    num_values_ -= max_values;
+    return max_values;
+  }
+
+  int Decode(uint8_t* buffer, int max_values) {

Review Comment:
   `override` is missing?
   
   
https://github.com/apache/arrow/actions/runs/3076096805/jobs/4970024700#step:6:1903
   
   ```text
   FAILED: src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o 
   /usr/bin/ccache /usr/lib/ccache/clang++-12  -DARROW_HAVE_RUNTIME_AVX2 
-DARROW_HAVE_RUNTIME_AVX512 -DARROW_HAVE_RUNTIME_BMI2 
-DARROW_HAVE_RUNTIME_SSE4_2 -DARROW_HAVE_SSE4_2 -DARROW_NO_DEPRECATED_API 
-DARROW_UBSAN -DARROW_WITH_BROTLI -DARROW_WITH_BZ2 -DARROW_WITH_LZ4 
-DARROW_WITH_RE2 -DARROW_WITH_SNAPPY -DARROW_WITH_UTF8PROC -DARROW_WITH_ZLIB 
-DARROW_WITH_ZSTD -DBOOST_ALL_NO_LIB -DHAVE_INTTYPES_H -DHAVE_NETDB_H 
-DHAVE_NETINET_IN_H -DPARQUET_EXPORTING -DPARQUET_THRIFT_VERSION_MAJOR=0 
-DPARQUET_THRIFT_VERSION_MINOR=13 -DUTF8PROC_STATIC -Isrc -I/arrow/cpp/src 
-I/arrow/cpp/src/generated -isystem /arrow/cpp/thirdparty/flatbuffers/include 
-isystem /arrow/cpp/thirdparty/hadoop/include -isystem 
google_cloud_cpp_ep-install/include -isystem absl_ep-install/include -isystem 
crc32c_ep-install/include -isystem opentelemetry_ep-install/include -isystem 
protobuf_ep-install/include -isystem utf8proc_ep-install/include -isystem 
xsimd_ep/src/xsimd_ep-install/include -Qunused-arguments -fcolor-diagnos
 tics -ggdb -O0  -Wall -Wextra -Wdocumentation -Wshorten-64-to-32 
-Wno-missing-braces -Wno-unused-parameter -Wno-constant-logical-operand 
-Wno-return-stack-address -Werror -Wno-unknown-warning-option -Wno-pass-failed 
-msse4.2  -fsanitize=address -DADDRESS_SANITIZER -fsanitize=undefined 
-fno-sanitize=alignment,vptr,function,float-divide-by-zero 
-fno-sanitize-recover=all 
-fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep
 -fsanitize-blacklist=/arrow/cpp/build-support/sanitizer-disallowed-entries.txt 
-g -fPIC   
-fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep
 -pthread -std=c++17 -MD -MT 
src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o -MF 
src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o.d -o 
src/parquet/CMakeFiles/parquet_objlib.dir/encoding.cc.o -c 
/arrow/cpp/src/parquet/encoding.cc
   /arrow/cpp/src/parquet/encoding.cc:2405:7: error: 'Decode' overrides a 
member function but is not marked 'override' 
[-Werror,-Winconsistent-missing-override]
     int Decode(uint8_t* buffer, int max_values) {
         ^
   /arrow/cpp/src/parquet/encoding.h:400:15: note: overridden virtual function 
is here
     virtual int Decode(uint8_t* buffer, int max_values) = 0;
                 ^
   1 error generated.
   ```



##########
cpp/src/parquet/level_comparison.cc:
##########
@@ -44,10 +44,10 @@ struct GreaterThanDynamicFunction {
   using FunctionType = decltype(&GreaterThanBitmap);
 
   static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() 
{
-    return {
-      { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
+    return {{DispatchLevel::NONE, standard::GreaterThanBitmapImpl}
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
-      , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
+            ,
+            {DispatchLevel::AVX2, GreaterThanBitmapAvx2}

Review Comment:
   Hmm, you might not use `clang-format-12`:
   
   
https://github.com/apache/arrow/actions/runs/3076096806/jobs/4970024641#step:5:869
   
   ```text
   FAILED: CMakeFiles/check-format 
   cd /tmp/arrow-lint-mbv5v4zf/cpp-build && /usr/local/bin/python 
/arrow/cpp/build-support/run_clang_format.py --clang_format_binary 
/usr/bin/clang-format-12 --exclude_globs 
/arrow/cpp/build-support/lint_exclusions.txt --source_dir /arrow/cpp/src 
--source_dir /arrow/cpp/examples --source_dir /arrow/cpp/tools --quiet
   
   --- /arrow/cpp/src/arrow/util/bpacking.cc
   +++ /arrow/cpp/src/arrow/util/bpacking.cc (after clang format)
   @@ -153,14 +153,13 @@
      using FunctionType = decltype(&unpack32_default);
    
      static std::vector<std::pair<DispatchLevel, FunctionType>> 
implementations() {
   -    return {{DispatchLevel::NONE, unpack32_default}
   +    return {
   +      { DispatchLevel::NONE, unpack32_default }
    #if defined(ARROW_HAVE_RUNTIME_AVX2)
   -            ,
   -            {DispatchLevel::AVX2, unpack32_avx2}
   +      , { DispatchLevel::AVX2, unpack32_avx2 }
    #endif
    #if defined(ARROW_HAVE_RUNTIME_AVX512)
   -            ,
   -            {DispatchLevel::AVX512, unpack32_avx512}
   +      , { DispatchLevel::AVX512, unpack32_avx512 }
    #endif
        };
      }
   
   --- /arrow/cpp/src/parquet/level_comparison.cc
   +++ /arrow/cpp/src/parquet/level_comparison.cc (after clang format)
   @@ -44,10 +44,10 @@
      using FunctionType = decltype(&GreaterThanBitmap);
    
      static std::vector<std::pair<DispatchLevel, FunctionType>> 
implementations() {
   -    return {{DispatchLevel::NONE, standard::GreaterThanBitmapImpl}
   +    return {
   +      { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
    #if defined(ARROW_HAVE_RUNTIME_AVX2)
   -            ,
   -            {DispatchLevel::AVX2, GreaterThanBitmapAvx2}
   +      , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
    #endif
        };
      }
   @@ -57,10 +57,10 @@
      using FunctionType = decltype(&FindMinMax);
    
      static std::vector<std::pair<DispatchLevel, FunctionType>> 
implementations() {
   -    return {{DispatchLevel::NONE, standard::FindMinMaxImpl}
   +    return {
   +      { DispatchLevel::NONE, standard::FindMinMaxImpl }
    #if defined(ARROW_HAVE_RUNTIME_AVX2)
   -            ,
   -            {DispatchLevel::AVX2, FindMinMaxAvx2}
   +      , { DispatchLevel::AVX2, FindMinMaxAvx2 }
    #endif
        };
      }
   ```
   
   Could you try `archery lint --clang-format --fix` instead?
   
https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to