pitrou commented on code in PR #41346:
URL: https://github.com/apache/arrow/pull/41346#discussion_r1603742338


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1577,7 +1595,10 @@ class TypedRecordReader : public 
TypedColumnReaderImpl<DType>,
       int64_t levels_read = 0;
       levels_read = this->ReadDefinitionLevels(batch_size, def_levels);

Review Comment:
   Here as well, we could perhaps make the code easier to read by checking the 
result immediately?
   ```c++
         if (this->ReadDefinitionLevels(batch_size, def_levels) != batch_size) {
           throw ParquetException(kErrorRepDefLevelNotMatchesNumValues);
         }
   ```



##########
cpp/src/parquet/column_reader_test.cc:
##########
@@ -431,6 +431,58 @@ TEST_F(TestPrimitiveReader, TestReadValuesMissing) {
                ParquetException);
 }
 
+// GH-41321: When max_def_level > 0, and Page has more or less
+// def-levels than the `num_values` in PageHeader. We should
+// detect and throw exception.
+TEST_F(TestPrimitiveReader, DefLevelNotExpected) {
+  max_def_level_ = 1;
+  max_rep_level_ = 0;
+
+  auto do_check = [&](const std::vector<int16_t>& input_def_levels, int 
num_values) {
+    std::vector<bool> values(num_values, false);
+    NodePtr type = schema::Boolean("a", Repetition::OPTIONAL);
+    const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);
+
+    // The data page falls back to plain encoding
+    std::shared_ptr<ResizableBuffer> dummy = AllocateBuffer();
+    std::shared_ptr<DataPageV1> data_page = MakeDataPage<BooleanType>(
+        &descr, values, /*num_values=*/num_values, Encoding::PLAIN, 
/*indices=*/{},
+        /*indices_size=*/0, /*def_levels=*/input_def_levels, max_def_level_,
+        /*rep_levels=*/{},
+        /*max_rep_level=*/max_rep_level_);
+    pages_.push_back(data_page);
+    InitReader(&descr);
+    auto reader = static_cast<BoolReader*>(reader_.get());
+    ASSERT_TRUE(reader->HasNext());
+
+    constexpr int batch_size = 10;
+    std::vector<int16_t> def_levels(batch_size, 0);
+    std::vector<int16_t> rep_levels(batch_size, 0);
+    bool values_out[batch_size];
+    int64_t values_read;
+    EXPECT_THROW_THAT(
+        [&]() {
+          reader->ReadBatch(batch_size, def_levels.data(), rep_levels.data(), 
values_out,
+                            &values_read);
+        },
+        ParquetException,
+        ::testing::Property(
+            &ParquetException::what,
+            ::testing::HasSubstr("Number of decoded rep / def levels did "
+                                 "more or less than num_values in 
page_header")));
+  };
+  // storing def-levels less than value in page-header
+  {
+    std::vector<int16_t> input_def_levels(1, 1);
+    do_check(input_def_levels, /*num_values=*/3);
+  }
+  // storing def-levels more than value in page-header
+  {
+    std::vector<int16_t> input_def_levels(2, 1);
+    do_check(input_def_levels, /*num_values=*/1);
+  }

Review Comment:
   Thank you!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to