fatemehp commented on code in PR #14603:
URL: https://github.com/apache/arrow/pull/14603#discussion_r1035207103


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -386,6 +386,57 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
       throw ParquetException("Invalid page header");
     }
 
+    // Do some checks before trying to decrypt and/or decompress the page.
+    // Also skip the page if skip_page_callback_ is set and returns true.
+    const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);
+    EncodedStatistics page_statistics;
+    if (page_type == PageType::DATA_PAGE) {
+      const format::DataPageHeader& header = current_page_header_.data_page_header;
+      if (header.num_values < 0) {
+        throw ParquetException("Invalid page header (negative number of values)");
+      }
+      page_statistics = ExtractStatsFromHeader(header);
+      seen_num_values_ += header.num_values;
+      if (skip_page_callback_) {
+        DataPageStats data_page_stats(page_statistics, header.num_values,
+                                      /*num_rows=*/std::nullopt);
+        if (skip_page_callback_(data_page_stats)) {
+          PARQUET_THROW_NOT_OK(stream_->Advance(compressed_len));
+          continue;
+        }
+      }
+    } else if (page_type == PageType::DATA_PAGE_V2) {
+      const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
+      if (header.num_values < 0) {
+        throw ParquetException("Invalid page header (negative number of values)");
+      }
+      if (header.definition_levels_byte_length < 0 ||
+          header.repetition_levels_byte_length < 0) {
+        throw ParquetException("Invalid page header (negative levels byte length)");
+      }
+      page_statistics = ExtractStatsFromHeader(header);
+      seen_num_values_ += header.num_values;
+      if (skip_page_callback_) {
+        DataPageStats data_page_stats(page_statistics, header.num_values,
+                                      header.num_rows);
+        if (skip_page_callback_(data_page_stats)) {
+          PARQUET_THROW_NOT_OK(stream_->Advance(compressed_len));
+          continue;
+        }
+      }
+    } else if (page_type == PageType::DICTIONARY_PAGE) {
+      const format::DictionaryPageHeader& dict_header =
+          current_page_header_.dictionary_page_header;
+      if (dict_header.num_values < 0) {

Review Comment:
   Done



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -386,6 +386,57 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
       throw ParquetException("Invalid page header");
     }
 
+    // Do some checks before trying to decrypt and/or decompress the page.
+    // Also skip the page if skip_page_callback_ is set and returns true.
+    const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);
+    EncodedStatistics page_statistics;
+    if (page_type == PageType::DATA_PAGE) {
+      const format::DataPageHeader& header = current_page_header_.data_page_header;
+      if (header.num_values < 0) {
+        throw ParquetException("Invalid page header (negative number of values)");
+      }
+      page_statistics = ExtractStatsFromHeader(header);
+      seen_num_values_ += header.num_values;
+      if (skip_page_callback_) {
+        DataPageStats data_page_stats(page_statistics, header.num_values,
+                                      /*num_rows=*/std::nullopt);
+        if (skip_page_callback_(data_page_stats)) {
+          PARQUET_THROW_NOT_OK(stream_->Advance(compressed_len));
+          continue;
+        }
+      }
+    } else if (page_type == PageType::DATA_PAGE_V2) {
+      const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
+      if (header.num_values < 0) {

Review Comment:
   Done



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -386,6 +386,57 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
       throw ParquetException("Invalid page header");
     }
 
+    // Do some checks before trying to decrypt and/or decompress the page.
+    // Also skip the page if skip_page_callback_ is set and returns true.
+    const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);
+    EncodedStatistics page_statistics;
+    if (page_type == PageType::DATA_PAGE) {
+      const format::DataPageHeader& header = current_page_header_.data_page_header;
+      if (header.num_values < 0) {
+        throw ParquetException("Invalid page header (negative number of values)");
+      }
+      page_statistics = ExtractStatsFromHeader(header);
+      seen_num_values_ += header.num_values;
+      if (skip_page_callback_) {
+        DataPageStats data_page_stats(page_statistics, header.num_values,
+                                      /*num_rows=*/std::nullopt);
+        if (skip_page_callback_(data_page_stats)) {
+          PARQUET_THROW_NOT_OK(stream_->Advance(compressed_len));
+          continue;
+        }
+      }
+    } else if (page_type == PageType::DATA_PAGE_V2) {
+      const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
+      if (header.num_values < 0) {
+        throw ParquetException("Invalid page header (negative number of values)");
+      }
+      if (header.definition_levels_byte_length < 0 ||
+          header.repetition_levels_byte_length < 0) {
+        throw ParquetException("Invalid page header (negative levels byte length)");
+      }
+      page_statistics = ExtractStatsFromHeader(header);
+      seen_num_values_ += header.num_values;
+      if (skip_page_callback_) {
+        DataPageStats data_page_stats(page_statistics, header.num_values,
+                                      header.num_rows);

Review Comment:
   Done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to