kou commented on code in PR #14147:
URL: https://github.com/apache/arrow/pull/14147#discussion_r974887734


##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +127,89 @@ void CheckRowGroupMetadata(const RowGroupMetaData* rg_metadata,
   }
 }
 
+class TestBooleanRLE : public ::testing::Test {
+ public:
+  void SetUp() {
+    reader_ = ParquetFileReader::OpenFile(data_file("rle_boolean_encoding.parquet"));
+  }
+
+  void TearDown() {}
+
+ protected:
+  std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestBooleanRLE, TestBooleanScanner) {
+  auto group = reader_->RowGroup(0);
+
+  // column 0, id
+  auto scanner = std::make_shared<BoolScanner>(group->Column(0));
+
+  bool val = false;
+  bool is_null = false;
+  for (int i = 0; i < 8; i++) {
+    ASSERT_TRUE(scanner->HasNext());
+    ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+
+    // For this file, 3rd index value is null
+    if (i == 2) {
+      ASSERT_TRUE(is_null);
+    } else {
+      ASSERT_FALSE(is_null);
+    }
+  }
+
+  ASSERT_FALSE(scanner->HasNext());
+  ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestBooleanRLE, TestBatchRead) {
+  auto group = reader_->RowGroup(0);
+
+  // column 0, id
+  auto col = std::dynamic_pointer_cast<BoolReader>(group->Column(0));
+
+  // This file only has 8 rows
+  ASSERT_EQ(8, reader_->metadata()->num_rows());
+  // This file only has 1 row group
+  ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+  // Size of the metadata is 106 bytes
+  ASSERT_EQ(106, reader_->metadata()->size());
+  // This row group must have 8 rows
+  ASSERT_EQ(8, group->metadata()->num_rows());
+
+  // Check if the column is encoded with RLE
+  auto col_chunk = group->metadata()->ColumnChunk(0);
+  ASSERT_TRUE(std::find(col_chunk->encodings().begin(), col_chunk->encodings().end(),
+                        Encoding::RLE) != col_chunk->encodings().end());
+
+  // Assert column has values to be read
+  ASSERT_TRUE(col->HasNext());
+  int64_t curr_batch_read = 0;
+
+  const int16_t batch_size = 8;
+  int16_t def_levels[batch_size];
+  int16_t rep_levels[batch_size];
+  bool values[batch_size - 1];
+
+  auto levels_read =
+      col->ReadBatch(batch_size, def_levels, rep_levels, values, &curr_batch_read);
+  ASSERT_EQ(batch_size, levels_read);
+
+  // Since one value is a null value, expect batches read to be one less than indicated
+  // batch_size
+  ASSERT_EQ(batch_size - 1, curr_batch_read);
+
+  // 3rd index is null value
+  ASSERT_THAT(def_levels, testing::ElementsAre(1, 1, 0, 1, 1, 1, 1, 1));
+
+  // Validate inserted data is as expected
+  ASSERT_THAT(values, testing::ElementsAre(1, 0, 1, 1, 0, 0, 1));

Review Comment:
   Does the input have both `<bit-packed-run>` and `<rle-run>` cases in https://parquet.apache.org/docs/file-format/data-pages/encodings/#a-namerlearun-length-encoding--bit-packing-hybrid-rle--3 ?
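   
   For reference, a minimal hand-assembled sketch of the two run types for `bit_width = 1`, decoded with the same `::arrow::util::RleDecoder` this PR uses (the byte values follow the linked spec, but treat them as illustrative only):
   
   ```cpp
   #include <cassert>
   #include <cstdint>
   
   #include "arrow/util/rle_encoding.h"
   
   int main() {
     // <rle-run>: header = run_length << 1 (LSB = 0), then the repeated value
     // padded to a whole byte. 0x10 = 8 << 1, so this is eight `true` values.
     const uint8_t rle_run[] = {0x10, 0x01};
     ::arrow::util::RleDecoder rle(rle_run, sizeof(rle_run), /*bit_width=*/1);
   
     // <bit-packed-run>: header = (num_groups << 1) | 1 (LSB = 1), then
     // num_groups * 8 values packed LSB first. 0x03 = one group of eight values,
     // 0xCD = 0b11001101 -> 1, 0, 1, 1, 0, 0, 1, 1.
     const uint8_t packed_run[] = {0x03, 0xCD};
     ::arrow::util::RleDecoder packed(packed_run, sizeof(packed_run),
                                      /*bit_width=*/1);
   
     int val = 0;
     for (int i = 0; i < 8; ++i) {
       assert(rle.Get(&val) && val == 1);
     }
     const int expected[] = {1, 0, 1, 1, 0, 0, 1, 1};
     for (int i = 0; i < 8; ++i) {
       assert(packed.Get(&val) && val == expected[i]);
     }
     return 0;
   }
   ```
   If `rle_boolean_encoding.parquet` only exercises one of the two run types, a second test file or a round-trip test through the encoder might be worth considering.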



##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +127,89 @@ void CheckRowGroupMetadata(const RowGroupMetaData* rg_metadata,
   }
 }
 
+class TestBooleanRLE : public ::testing::Test {
+ public:
+  void SetUp() {
+    reader_ = ParquetFileReader::OpenFile(data_file("rle_boolean_encoding.parquet"));
+  }
+
+  void TearDown() {}
+
+ protected:
+  std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestBooleanRLE, TestBooleanScanner) {
+  auto group = reader_->RowGroup(0);
+
+  // column 0, id
+  auto scanner = std::make_shared<BoolScanner>(group->Column(0));
+
+  bool val = false;
+  bool is_null = false;
+  for (int i = 0; i < 8; i++) {
+    ASSERT_TRUE(scanner->HasNext());
+    ASSERT_TRUE(scanner->NextValue(&val, &is_null));

Review Comment:
   Could you also validate `val`?
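   
   For example, something like the following; the expected values are only inferred from the `ElementsAre(1, 0, 1, 1, 0, 0, 1)` assertion in `TestBatchRead` below, so please double-check them against the actual file:
   
   ```cpp
   // Expected values inferred from TestBatchRead; index 2 is null, so that
   // slot is just a don't-care placeholder.
   const bool expected_values[] = {true, false, /*null*/ false,
                                   true, true,  false, false, true};
   
   bool val = false;
   bool is_null = false;
   for (int i = 0; i < 8; i++) {
     ASSERT_TRUE(scanner->HasNext());
     ASSERT_TRUE(scanner->NextValue(&val, &is_null));
     if (i == 2) {
       ASSERT_TRUE(is_null);
     } else {
       ASSERT_FALSE(is_null);
       ASSERT_EQ(expected_values[i], val);
     }
   }
   ```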



##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,81 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+  explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+      : DecoderImpl(descr, Encoding::RLE) {}
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    uint32_t num_bytes = 0;
+
+    if (len < 4) {
+      throw ParquetException("Received invalid length : " + std::to_string(len) +
+                             " (corrupt data page?)");
+    }
+    // Load the first 4 bytes in little-endian, which indicates the length
+    num_bytes =
+        ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+    if (num_bytes < 0 || num_bytes > static_cast<uint32_t>(len - 4)) {
+      throw ParquetException("Received invalid number of bytes : " +
+                             std::to_string(num_bytes) + " (corrupt data page?)");
+    }
+
+    auto decoder_data = data + 4;
+    decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes,
+                                                           /*bit_width=*/1);
+  }
+
+  int Decode(bool* buffer, int max_values) override {
+    max_values = std::min(max_values, num_values_);
+    int val = 0;
+
+    for (int i = 0; i < max_values; ++i) {
+      if (!decoder_->Get(&val)) {
+        throw ParquetException("Unable to parse bits for position (0 based) : " +
+                               std::to_string(i) + " (corrupt data page?)");
+      }
+      if (val) {
+        buffer[i] = true;
+      } else {
+        buffer[i] = false;
+      }
+    }
+    num_values_ -= max_values;
+    return max_values;
+  }
+
+  int Decode(uint8_t* buffer, int max_values) override {
+    max_values = std::min(max_values, num_values_);
+    if (decoder_->GetBatch(buffer, max_values) != max_values) {

Review Comment:
   It seems that this writes each value to `buffer` as a 1-byte value.
   But `PlainBooleanDecoder::Decode(uint8_t* buffer)` writes each value to `buffer` as a 1-bit value.
   
   Do the added tests use this method?
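   
   If the goal is to match `PlainBooleanDecoder`'s layout, a bit-packed variant could look roughly like the sketch below. It assumes `::arrow::bit_util::SetBitTo()` from `arrow/util/bit_util.h` is acceptable here; it is only an illustration, not a request for this exact implementation:
   
   ```cpp
   int Decode(uint8_t* buffer, int max_values) override {
     // Write each decoded value as a single bit into `buffer`, which is the
     // layout PlainBooleanDecoder::Decode(uint8_t*) produces.
     max_values = std::min(max_values, num_values_);
     int val = 0;
     for (int i = 0; i < max_values; ++i) {
       if (!decoder_->Get(&val)) {
         throw ParquetException("Unable to decode boolean value (corrupt data page?)");
       }
       ::arrow::bit_util::SetBitTo(buffer, i, val != 0);
     }
     num_values_ -= max_values;
     return max_values;
   }
   ```
   Either way, a test that goes through this overload would make the intended output layout explicit.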



##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+  explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+      : DecoderImpl(descr, Encoding::RLE) {}
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    uint32_t num_bytes = 0;
+
+    if (len < 4) {
+      throw ParquetException("Received invalid length : " + std::to_string(len) +
+                             " (corrupt data page?)");
+    }
+    // Load the first 4 bytes in little-endian, which indicates the length
+    num_bytes =
+        ::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+    if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+      throw ParquetException("Received invalid number of bytes : " +
+                             std::to_string(num_bytes) + " (corrupt data page?)");
+    }
+
+    const uint8_t* decoder_data = data + 4;
+    decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes,
+                                                           /*bit_width=*/1);
+  }
+
+  int Decode(bool* buffer, int max_values) override {
+    max_values = std::min(max_values, num_values_);
+    int val = 0;

Review Comment:
   Hmm. Do you know what value is used for a boolean in Parquet? (`0` and `1`?) And do you also know the size of a boolean value in Parquet? (1 bit? 8 bits?)
   I can't find them in https://parquet.apache.org/docs/file-format/data-pages/encodings/ .
   
   I think that we need to use the same type for this. If Parquet uses 8 bits, we can use `int8_t` or `uint8_t` here.
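   
   FWIW, `SetData()` above constructs the `RleDecoder` with `/*bit_width=*/1`, so each decoded value should only ever be `0` or `1`. If a fixed-width type is preferred for `val`, a sketch (assuming `RleDecoder::Get()` accepts a `uint8_t*`, the same way `GetBatch()` is called with a `uint8_t*` buffer above):
   
   ```cpp
   int Decode(bool* buffer, int max_values) override {
     max_values = std::min(max_values, num_values_);
     // bit_width is 1, so the decoded value can only be 0 or 1; a uint8_t
     // intermediate makes that explicit instead of using int.
     uint8_t val = 0;
     for (int i = 0; i < max_values; ++i) {
       if (!decoder_->Get(&val)) {
         throw ParquetException("Unable to decode boolean value (corrupt data page?)");
       }
       buffer[i] = (val != 0);
     }
     num_values_ -= max_values;
     return max_values;
   }
   ```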


