pitrou commented on code in PR #14603:
URL: https://github.com/apache/arrow/pull/14603#discussion_r1035982919


##########
cpp/src/parquet/metadata.cc:
##########
@@ -19,10 +19,13 @@
 
 #include <algorithm>
 #include <cinttypes>
+#include <csignal>
+#include <iostream>
 #include <ostream>
 #include <string>
 #include <string_view>
 #include <utility>
+#include <variant>

Review Comment:
   Why are these new inclusions suddenly required?



##########
cpp/src/parquet/column_reader.h:
##########
@@ -115,11 +116,30 @@ class PARQUET_EXPORT PageReader {
                                           bool always_compressed = false,
                                           const CryptoContext* ctx = NULLPTR);
 
+  // If skip_page_callback_ is present (not null), NextPage() will call the
+  // callback function exactly once per page in the order the pages appear in
+  // the column. If the callback function returns true the page will be
+  // skipped. The callback will be called only if the page type is DATA_PAGE or

Review Comment:
   I think this would read better if it were advertised as a page filter 
(returning true to include a page and false to exclude it).
   Calling it a "skip page callback" may give the impression that it will only 
be called by the `Skip` APIs.



##########
cpp/src/parquet/metadata.h:
##########
@@ -20,13 +20,15 @@
 #include <cstdint>
 #include <map>
 #include <memory>
+#include <optional>
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "parquet/platform.h"
 #include "parquet/properties.h"
 #include "parquet/schema.h"
+#include "parquet/statistics.h"

Review Comment:
   We should try to be parsimonious when adding inclusions in header files, as 
they increase compile times. This one doesn't seem necessary as `class 
EncodedStatistics` is forward-declared below.



##########
cpp/src/parquet/metadata.h:
##########
@@ -182,6 +184,28 @@ class PARQUET_EXPORT ColumnChunkMetaData {
   std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
 };
 
+// \brief DataPageStats is a proxy around format::DataPageHeader and
+// format::DataPageHeaderV2.
+class PARQUET_EXPORT DataPageStats {

Review Comment:
   I'm not sure why this needs to manually define accessors and constructor 
instead of being a plain struct?
   ```c++
   struct DataPageStats {
     EncodedStatistics* encoded_statistics;
     int32_t num_values;
     std::optional<int32_t> num_rows;
   };
   ```



##########
cpp/src/parquet/column_reader.h:
##########
@@ -115,11 +116,30 @@ class PARQUET_EXPORT PageReader {
                                           bool always_compressed = false,
                                           const CryptoContext* ctx = NULLPTR);
 
+  // If skip_page_callback_ is present (not null), NextPage() will call the
+  // callback function exactly once per page in the order the pages appear in
+  // the column. If the callback function returns true the page will be
+  // skipped. The callback will be called only if the page type is DATA_PAGE or
+  // DATA_PAGE_V2. Dictionary pages will not be skipped.
+  // This setter must be called at most once to set the callback.
+  // \note API EXPERIMENTAL
+  void set_skip_page_callback(
+      std::function<bool(const DataPageStats&)> skip_page_callback) {
+    if (skip_page_callback_) {
+      throw ParquetException("set_skip_page_callback was called more than once");

Review Comment:
   Why would this be forbidden?



##########
cpp/src/parquet/column_reader.h:
##########
@@ -115,11 +116,30 @@ class PARQUET_EXPORT PageReader {
                                           bool always_compressed = false,
                                           const CryptoContext* ctx = NULLPTR);
 
+  // If skip_page_callback_ is present (not null), NextPage() will call the
+  // callback function exactly once per page in the order the pages appear in
+  // the column. If the callback function returns true the page will be
+  // skipped. The callback will be called only if the page type is DATA_PAGE or
+  // DATA_PAGE_V2. Dictionary pages will not be skipped.
+  // This setter must be called at most once to set the callback.
+  // \note API EXPERIMENTAL
+  void set_skip_page_callback(
+      std::function<bool(const DataPageStats&)> skip_page_callback) {

Review Comment:
   Can we define a type name for this? For example:
   ```c++
     using DataPageFilter = std::function<bool(const DataPageStats&)>;
   ```



##########
cpp/src/parquet/metadata.h:
##########
@@ -182,6 +184,28 @@ class PARQUET_EXPORT ColumnChunkMetaData {
   std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
 };
 
+// \brief DataPageStats is a proxy around format::DataPageHeader and

Review Comment:
   Why define this class here if it's only referenced in `column_reader.h`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to