bkietz commented on a change in pull request #9790: URL: https://github.com/apache/arrow/pull/9790#discussion_r601727188
########## File path: cpp/src/arrow/dataset/file_parquet.h ########## @@ -76,35 +78,20 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { /// \defgroup parquet-file-format-reader-properties properties which correspond to /// members of parquet::ReaderProperties. /// - /// We don't embed parquet::ReaderProperties directly because we get memory_pool from - /// ScanOptions at scan time and provide differing defaults. - /// /// @{ - bool use_buffered_stream = false; - int64_t buffer_size = 1 << 13; std::shared_ptr<parquet::FileDecryptionProperties> file_decryption_properties; /// @} /// \defgroup parquet-file-format-arrow-reader-properties properties which correspond /// to members of parquet::ArrowReaderProperties. /// - /// We don't embed parquet::ReaderProperties directly because we get batch_size from - /// ScanOptions at scan time, and we will never pass use_threads == true (since we - /// defer parallelization of the scan). Additionally column names (rather than - /// indices) are used to indicate dictionary columns. + /// We don't embed parquet::ReaderProperties directly because column names (rather + /// than indices) are used to indicate dictionary columns, and other options are + /// deferred to scan time. /// /// @{ std::unordered_set<std::string> dict_columns; - bool pre_buffer = false; - arrow::io::CacheOptions cache_options = arrow::io::CacheOptions::Defaults(); - arrow::io::IOContext io_context; /// @} - - /// EXPERIMENTAL: Parallelize conversion across columns. This option is ignored if a - /// scan is already parallelized across input files to avoid thread contention. This - /// option will be removed after support is added for simultaneous parallelization - /// across files and columns. - bool enable_parallel_column_conversion = false; } reader_options; Review comment: I think now that `FileFormat::default_fragment_scan_options` is available and both IPC and CSV file formats embed options structs whole it'd be best to just follow that pattern with parquet: let ParquetFragmentScanOptions hold a ReaderProperties whole. `dict_columns` will need to remain here for now since it affects the schema of the scanned dataset, but once we can (for example) project a string column to dictionary<int32, string> I'd expect this option to be deprecated in favor of automatically pushing the projection-to-dictionary down to the parquet reader with `ArrowReaderOptions::set_read_dictionary`. `file_decryption_properties` may also need to be replicated, though I'm not sure how/if anyone is using encryption + datasets ########## File path: cpp/src/arrow/dataset/file_ipc.cc ########## @@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask { Result<RecordBatchIterator> Execute() override { struct Impl { - static Result<RecordBatchIterator> Make( - const FileSource& source, std::vector<std::string> materialized_fields, - MemoryPool* pool) { + static Result<RecordBatchIterator> Make(const FileSource& source, + FileFormat* format, + ScanOptions* scan_options) { Review comment: Non const pointer suggests mutation, should be changed here and in GetFragmentScanOptions ```suggestion const ScanOptions* scan_options, FileFormat* format) { ``` ########## File path: cpp/src/arrow/dataset/file_ipc.cc ########## @@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask { Result<RecordBatchIterator> Execute() override { struct Impl { - static Result<RecordBatchIterator> Make( - const FileSource& source, std::vector<std::string> materialized_fields, - MemoryPool* pool) { + static Result<RecordBatchIterator> Make(const FileSource& source, + FileFormat* format, + ScanOptions* scan_options) { ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source)); - auto options = default_read_options(); - options.memory_pool = pool; - ARROW_ASSIGN_OR_RAISE(options.included_fields, - GetIncludedFields(*reader->schema(), materialized_fields)); + ARROW_ASSIGN_OR_RAISE( + auto ipc_scan_options, + GetFragmentScanOptions<IpcFragmentScanOptions>( + kIpcTypeName, scan_options, format->default_fragment_scan_options)); + auto options = ipc_scan_options->options ? *ipc_scan_options->options + : default_read_options(); + options.memory_pool = scan_options->pool; + options.use_threads = false; + if (!options.included_fields.empty()) { + // Cannot set them ehre + return Status::Invalid( + "Cannot set included_fields in scan options for IPC fragments"); Review comment: We don't emit an error for other ignored options, so unless we want to emit an error for any non-default ignored option I think the most we should emit here is a warning: ```suggestion ARROW_LOG(WARNING) << "IpcFragmentScanOptions.options.included_fields was set but " "will be ignored; included_fields are derived from fields referenced by the scan." ``` ########## File path: cpp/src/arrow/dataset/file_ipc.cc ########## @@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask { Result<RecordBatchIterator> Execute() override { struct Impl { - static Result<RecordBatchIterator> Make( - const FileSource& source, std::vector<std::string> materialized_fields, - MemoryPool* pool) { + static Result<RecordBatchIterator> Make(const FileSource& source, + FileFormat* format, + ScanOptions* scan_options) { ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source)); - auto options = default_read_options(); - options.memory_pool = pool; - ARROW_ASSIGN_OR_RAISE(options.included_fields, - GetIncludedFields(*reader->schema(), materialized_fields)); + ARROW_ASSIGN_OR_RAISE( + auto ipc_scan_options, + GetFragmentScanOptions<IpcFragmentScanOptions>( + kIpcTypeName, scan_options, format->default_fragment_scan_options)); + auto options = ipc_scan_options->options ? *ipc_scan_options->options + : default_read_options(); + options.memory_pool = scan_options->pool; + options.use_threads = false; + if (!options.included_fields.empty()) { + // Cannot set them ehre Review comment: ```suggestion // Cannot set them here ``` ########## File path: cpp/src/arrow/dataset/type_fwd.h ########## @@ -68,6 +68,7 @@ class IpcFileWriteOptions; Review comment: ```suggestion class IpcFragmentScanOptions; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org