bkietz commented on a change in pull request #9790:
URL: https://github.com/apache/arrow/pull/9790#discussion_r601727188
##########
File path: cpp/src/arrow/dataset/file_parquet.h
##########
@@ -76,35 +78,20 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat
{
/// \defgroup parquet-file-format-reader-properties properties which
correspond to
/// members of parquet::ReaderProperties.
///
- /// We don't embed parquet::ReaderProperties directly because we get
memory_pool from
- /// ScanOptions at scan time and provide differing defaults.
- ///
/// @{
- bool use_buffered_stream = false;
- int64_t buffer_size = 1 << 13;
std::shared_ptr<parquet::FileDecryptionProperties>
file_decryption_properties;
/// @}
/// \defgroup parquet-file-format-arrow-reader-properties properties which
correspond
/// to members of parquet::ArrowReaderProperties.
///
- /// We don't embed parquet::ReaderProperties directly because we get
batch_size from
- /// ScanOptions at scan time, and we will never pass use_threads == true
(since we
- /// defer parallelization of the scan). Additionally column names (rather
than
- /// indices) are used to indicate dictionary columns.
+ /// We don't embed parquet::ReaderProperties directly because column names
(rather
+ /// than indices) are used to indicate dictionary columns, and other
options are
+ /// deferred to scan time.
///
/// @{
std::unordered_set<std::string> dict_columns;
- bool pre_buffer = false;
- arrow::io::CacheOptions cache_options =
arrow::io::CacheOptions::Defaults();
- arrow::io::IOContext io_context;
/// @}
-
- /// EXPERIMENTAL: Parallelize conversion across columns. This option is
ignored if a
- /// scan is already parallelized across input files to avoid thread
contention. This
- /// option will be removed after support is added for simultaneous
parallelization
- /// across files and columns.
- bool enable_parallel_column_conversion = false;
} reader_options;
Review comment:
I think now that `FileFormat::default_fragment_scan_options` is
available and both IPC and CSV file formats embed options structs whole it'd be
best to just follow that pattern with parquet: let ParquetFragmentScanOptions
hold a ReaderProperties whole.
`dict_columns` will need to remain here for now since it affects the schema
of the scanned dataset, but once we can (for example) project a string column
to dictionary<int32, string> I'd expect this option to be deprecated in favor
of automatically pushing the projection-to-dictionary down to the parquet
reader with `ArrowReaderOptions::set_read_dictionary`.
`file_decryption_properties` may also need to be replicated, though I'm not
sure how/if anyone is using encryption + datasets
##########
File path: cpp/src/arrow/dataset/file_ipc.cc
##########
@@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask {
Result<RecordBatchIterator> Execute() override {
struct Impl {
- static Result<RecordBatchIterator> Make(
- const FileSource& source, std::vector<std::string>
materialized_fields,
- MemoryPool* pool) {
+ static Result<RecordBatchIterator> Make(const FileSource& source,
+ FileFormat* format,
+ ScanOptions* scan_options) {
Review comment:
Non const pointer suggests mutation, should be changed here and in
GetFragmentScanOptions
```suggestion
const ScanOptions*
scan_options,
FileFormat* format) {
```
##########
File path: cpp/src/arrow/dataset/file_ipc.cc
##########
@@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask {
Result<RecordBatchIterator> Execute() override {
struct Impl {
- static Result<RecordBatchIterator> Make(
- const FileSource& source, std::vector<std::string>
materialized_fields,
- MemoryPool* pool) {
+ static Result<RecordBatchIterator> Make(const FileSource& source,
+ FileFormat* format,
+ ScanOptions* scan_options) {
ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source));
- auto options = default_read_options();
- options.memory_pool = pool;
- ARROW_ASSIGN_OR_RAISE(options.included_fields,
- GetIncludedFields(*reader->schema(),
materialized_fields));
+ ARROW_ASSIGN_OR_RAISE(
+ auto ipc_scan_options,
+ GetFragmentScanOptions<IpcFragmentScanOptions>(
+ kIpcTypeName, scan_options,
format->default_fragment_scan_options));
+ auto options = ipc_scan_options->options ? *ipc_scan_options->options
+ : default_read_options();
+ options.memory_pool = scan_options->pool;
+ options.use_threads = false;
+ if (!options.included_fields.empty()) {
+ // Cannot set them ehre
+ return Status::Invalid(
+ "Cannot set included_fields in scan options for IPC fragments");
Review comment:
We don't emit an error for other ignored options, so unless we want to
emit an error for any non-default ignored option I think the most we should
emit here is a warning:
```suggestion
ARROW_LOG(WARNING) <<
"IpcFragmentScanOptions.options.included_fields was set but "
"will be ignored; included_fields are derived from fields
referenced by the scan."
```
##########
File path: cpp/src/arrow/dataset/file_ipc.cc
##########
@@ -81,15 +81,27 @@ class IpcScanTask : public ScanTask {
Result<RecordBatchIterator> Execute() override {
struct Impl {
- static Result<RecordBatchIterator> Make(
- const FileSource& source, std::vector<std::string>
materialized_fields,
- MemoryPool* pool) {
+ static Result<RecordBatchIterator> Make(const FileSource& source,
+ FileFormat* format,
+ ScanOptions* scan_options) {
ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source));
- auto options = default_read_options();
- options.memory_pool = pool;
- ARROW_ASSIGN_OR_RAISE(options.included_fields,
- GetIncludedFields(*reader->schema(),
materialized_fields));
+ ARROW_ASSIGN_OR_RAISE(
+ auto ipc_scan_options,
+ GetFragmentScanOptions<IpcFragmentScanOptions>(
+ kIpcTypeName, scan_options,
format->default_fragment_scan_options));
+ auto options = ipc_scan_options->options ? *ipc_scan_options->options
+ : default_read_options();
+ options.memory_pool = scan_options->pool;
+ options.use_threads = false;
+ if (!options.included_fields.empty()) {
+ // Cannot set them ehre
Review comment:
```suggestion
// Cannot set them here
```
##########
File path: cpp/src/arrow/dataset/type_fwd.h
##########
@@ -68,6 +68,7 @@ class IpcFileWriteOptions;
Review comment:
```suggestion
class IpcFragmentScanOptions;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]