[GitHub] [arrow] bkietz commented on a change in pull request #9790: ARROW-11972: [C++][R][Python][Dataset] Extract IPC/Parquet fragment scan options

GitBox Thu, 25 Mar 2021 11:10:38 -0700


bkietz commented on a change in pull request #9790:
URL: https://github.com/apache/arrow/pull/9790#discussion_r601727188




##########
File path: cpp/src/arrow/dataset/file_parquet.h
##########
@@ -76,35 +78,20 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat 
{
     /// \defgroup parquet-file-format-reader-properties properties which 
correspond to
     /// members of parquet::ReaderProperties.
     ///
-    /// We don't embed parquet::ReaderProperties directly because we get 
memory_pool from
-    /// ScanOptions at scan time and provide differing defaults.
-    ///
     /// @{
-    bool use_buffered_stream = false;
-    int64_t buffer_size = 1 << 13;
     std::shared_ptr<parquet::FileDecryptionProperties> 
file_decryption_properties;
     /// @}
 
     /// \defgroup parquet-file-format-arrow-reader-properties properties which 
correspond
     /// to members of parquet::ArrowReaderProperties.
     ///
-    /// We don't embed parquet::ReaderProperties directly because we get 
batch_size from
-    /// ScanOptions at scan time, and we will never pass use_threads == true 
(since we
-    /// defer parallelization of the scan). Additionally column names (rather 
than
-    /// indices) are used to indicate dictionary columns.
+    /// We don't embed parquet::ReaderProperties directly because column names 
(rather
+    /// than indices) are used to indicate dictionary columns, and other 
options are
+    /// deferred to scan time.
     ///
     /// @{
     std::unordered_set<std::string> dict_columns;
-    bool pre_buffer = false;
-    arrow::io::CacheOptions cache_options = 
arrow::io::CacheOptions::Defaults();
-    arrow::io::IOContext io_context;
     /// @}
-
-    /// EXPERIMENTAL: Parallelize conversion across columns. This option is 
ignored if a
-    /// scan is already parallelized across input files to avoid thread 
contention. This
-    /// option will be removed after support is added for simultaneous 
parallelization
-    /// across files and columns.
-    bool enable_parallel_column_conversion = false;
   } reader_options;

Review comment:
       I think now that `FileFormat::default_fragment_scan_options` is 
available and both IPC and CSV file formats embed options structs whole it'd be 
best to just follow that pattern with parquet: let ParquetFragmentScanOptions 
hold a ReaderProperties whole.
   
   `dict_columns` will need to remain here for now since it affects the schema 
of the scanned dataset, but once we can (for example) project a string column 
to dictionary<int32, string> I'd expect this option to be deprecated in favor 
of automatically pushing the projection-to-dictionary down to the parquet 
reader with `ArrowReaderOptions::set_read_dictionary`.
   
   `file_decryption_properties` may also need to be replicated, though I'm not 
sure how/if anyone is using encryption + datasets

##########
File path: cpp/src/arrow/dataset/file_ipc.cc
##########
@@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask {
 
   Result<RecordBatchIterator> Execute() override {
     struct Impl {
-      static Result<RecordBatchIterator> Make(
-          const FileSource& source, std::vector<std::string> 
materialized_fields,
-          MemoryPool* pool) {
+      static Result<RecordBatchIterator> Make(const FileSource& source,
+                                              FileFormat* format,
+                                              ScanOptions* scan_options) {

Review comment:
       Non const pointer suggests mutation, should be changed here and in 
GetFragmentScanOptions
   ```suggestion
                                                 const ScanOptions* 
scan_options,
                                                 FileFormat* format) {
   ```

##########
File path: cpp/src/arrow/dataset/file_ipc.cc
##########
@@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask {
 
   Result<RecordBatchIterator> Execute() override {
     struct Impl {
-      static Result<RecordBatchIterator> Make(
-          const FileSource& source, std::vector<std::string> 
materialized_fields,
-          MemoryPool* pool) {
+      static Result<RecordBatchIterator> Make(const FileSource& source,
+                                              FileFormat* format,
+                                              ScanOptions* scan_options) {
         ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source));
 
-        auto options = default_read_options();
-        options.memory_pool = pool;
-        ARROW_ASSIGN_OR_RAISE(options.included_fields,
-                              GetIncludedFields(*reader->schema(), 
materialized_fields));
+        ARROW_ASSIGN_OR_RAISE(
+            auto ipc_scan_options,
+            GetFragmentScanOptions<IpcFragmentScanOptions>(
+                kIpcTypeName, scan_options, 
format->default_fragment_scan_options));
+        auto options = ipc_scan_options->options ? *ipc_scan_options->options
+                                                 : default_read_options();
+        options.memory_pool = scan_options->pool;
+        options.use_threads = false;
+        if (!options.included_fields.empty()) {
+          // Cannot set them ehre
+          return Status::Invalid(
+              "Cannot set included_fields in scan options for IPC fragments");

Review comment:
       We don't emit an error for other ignored options, so unless we want to 
emit an error for any non-default ignored option I think the most we should 
emit here is a warning:
   ```suggestion
             ARROW_LOG(WARNING) << 
"IpcFragmentScanOptions.options.included_fields was set but "
                 "will be ignored; included_fields are derived from fields 
referenced by the scan."
   ```

##########
File path: cpp/src/arrow/dataset/file_ipc.cc
##########
@@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask {
 
   Result<RecordBatchIterator> Execute() override {
     struct Impl {
-      static Result<RecordBatchIterator> Make(
-          const FileSource& source, std::vector<std::string> 
materialized_fields,
-          MemoryPool* pool) {
+      static Result<RecordBatchIterator> Make(const FileSource& source,
+                                              FileFormat* format,
+                                              ScanOptions* scan_options) {
         ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source));
 
-        auto options = default_read_options();
-        options.memory_pool = pool;
-        ARROW_ASSIGN_OR_RAISE(options.included_fields,
-                              GetIncludedFields(*reader->schema(), 
materialized_fields));
+        ARROW_ASSIGN_OR_RAISE(
+            auto ipc_scan_options,
+            GetFragmentScanOptions<IpcFragmentScanOptions>(
+                kIpcTypeName, scan_options, 
format->default_fragment_scan_options));
+        auto options = ipc_scan_options->options ? *ipc_scan_options->options
+                                                 : default_read_options();
+        options.memory_pool = scan_options->pool;
+        options.use_threads = false;
+        if (!options.included_fields.empty()) {
+          // Cannot set them ehre

Review comment:
       ```suggestion
             // Cannot set them here
   ```

##########
File path: cpp/src/arrow/dataset/type_fwd.h
##########
@@ -68,6 +68,7 @@ class IpcFileWriteOptions;
 

Review comment:
       ```suggestion
   class IpcFragmentScanOptions;
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] bkietz commented on a change in pull request #9790: ARROW-11972: [C++][R][Python][Dataset] Extract IPC/Parquet fragment scan options

Reply via email to