lidavidm commented on a change in pull request #9725: URL: https://github.com/apache/arrow/pull/9725#discussion_r598888784
########## File path: cpp/src/arrow/dataset/file_csv.h ########## @@ -33,11 +33,35 @@ namespace dataset { constexpr char kCsvTypeName[] = "csv"; +/// \brief Per-scan options for CSV fragments +struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions { + std::string type_name() const override { return kCsvTypeName; } + + /// CSV conversion options + csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults(); + + /// Block size for reading (see arrow::csv::ReadOptions::block_size) + int32_t block_size = 1 << 20; // 1 MB +}; + /// \brief A FileFormat implementation that reads from and writes to Csv files class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { public: - /// Options affecting the parsing of CSV files - csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); + /// \brief Dataset-wide options for CSV. For convenience (both for users, and + /// for authors of language bindings), these include fields for per-scan + /// options, however, if per-scan options are provided, then those fields will + /// be overridden. + struct ReaderOptions : CsvFragmentScanOptions { Review comment: Ok. I'll switch this to embed the structs again, especially now that we want to let the user specify defaults in the file format and so there's no need to split the options. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org