pitrou commented on code in PR #41346:
URL: https://github.com/apache/arrow/pull/41346#discussion_r1603642158


##########
cpp/src/parquet/column_reader_test.cc:
##########
@@ -431,6 +431,58 @@ TEST_F(TestPrimitiveReader, TestReadValuesMissing) {
                ParquetException);
 }
 
+// GH-41321: When max_def_level > 0, and Page has more or less
+// def-levels than the `num_values` in PageHeader. We should
+// detect and throw exception.
+TEST_F(TestPrimitiveReader, DefLevelNotExpected) {
+  max_def_level_ = 1;
+  max_rep_level_ = 0;
+
+  auto do_check = [&](const std::vector<int16_t>& input_def_levels, int 
num_values) {
+    std::vector<bool> values(num_values, false);
+    NodePtr type = schema::Boolean("a", Repetition::OPTIONAL);
+    const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);
+
+    // The data page falls back to plain encoding
+    std::shared_ptr<ResizableBuffer> dummy = AllocateBuffer();
+    std::shared_ptr<DataPageV1> data_page = MakeDataPage<BooleanType>(
+        &descr, values, /*num_values=*/num_values, Encoding::PLAIN, 
/*indices=*/{},
+        /*indices_size=*/0, /*def_levels=*/input_def_levels, max_def_level_,
+        /*rep_levels=*/{},
+        /*max_rep_level=*/max_rep_level_);
+    pages_.push_back(data_page);
+    InitReader(&descr);
+    auto reader = static_cast<BoolReader*>(reader_.get());
+    ASSERT_TRUE(reader->HasNext());
+
+    constexpr int batch_size = 10;
+    std::vector<int16_t> def_levels(batch_size, 0);
+    std::vector<int16_t> rep_levels(batch_size, 0);
+    bool values_out[batch_size];
+    int64_t values_read;
+    EXPECT_THROW_THAT(
+        [&]() {
+          reader->ReadBatch(batch_size, def_levels.data(), rep_levels.data(), 
values_out,
+                            &values_read);
+        },
+        ParquetException,
+        ::testing::Property(
+            &ParquetException::what,
+            ::testing::HasSubstr("Number of decoded rep / def levels did "
+                                 "more or less than num_values in 
page_header")));
+  };
+  // storing def-levels less than value in page-header
+  {
+    std::vector<int16_t> input_def_levels(1, 1);
+    do_check(input_def_levels, /*num_values=*/3);
+  }
+  // storing def-levels more than value in page-header
+  {
+    std::vector<int16_t> input_def_levels(2, 1);
+    do_check(input_def_levels, /*num_values=*/1);
+  }

Review Comment:
   Do you also plan to add a test with rep levels?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to