alamb commented on code in PR #16166: URL: https://github.com/apache/datafusion/pull/16166#discussion_r2112474913
########## datafusion/datasource/src/file_format.rs: ########## @@ -120,7 +121,26 @@ pub trait FileFormatFactory: Sync + Send + GetExt + fmt::Debug { &self, state: &dyn Session, format_options: &HashMap<String, String>, - ) -> Result<Arc<dyn FileFormat>>; + ) -> Result<Arc<dyn FileFormat>> { + let options = match self.options() { + (None, file_type) => { + let mut table_options = state.file_table_options(file_type); + table_options.alter_with_string_hash_map(format_options)?; + table_options.output_format() + } + (Some(options), _) => { + let mut table_options = TableFormatOptions::from_output_format(options); + table_options.alter_with_string_hash_map(format_options)?; + table_options.output_format() + } + }; + + Ok(self.default_from_output_format(options)) + } + + fn default_from_output_format(&self, options: OutputFormat) -> Arc<dyn FileFormat>; Review Comment: Can we please document these new public APIs? ########## datafusion/common/src/config.rs: ########## @@ -1612,42 +1623,241 @@ impl TableOptions { }; e.0.set(key, value) } +} - /// Initializes a new `TableOptions` from a hash map of string settings. +#[derive(Debug, Clone)] +#[allow(clippy::large_enum_variant)] +pub enum TableFormatOptions { Review Comment: Since this is a new pub struct, can we please add documentation explaining 1. what it is used for 2. How it is different than `TableOptions`? ########## datafusion/datasource/src/file_format.rs: ########## @@ -120,7 +121,26 @@ pub trait FileFormatFactory: Sync + Send + GetExt + fmt::Debug { &self, state: &dyn Session, format_options: &HashMap<String, String>, - ) -> Result<Arc<dyn FileFormat>>; + ) -> Result<Arc<dyn FileFormat>> { + let options = match self.options() { Review Comment: since this method is part of a trait, I think it can be overridden by other implementations. Is that the intention? In other words, do you expect anyone implementing FileFormatFactory to implement `create()` or should they now implement `default_from_output_format` and `options`? ########## datafusion/common/src/config.rs: ########## @@ -1612,42 +1623,241 @@ impl TableOptions { }; e.0.set(key, value) } +} - /// Initializes a new `TableOptions` from a hash map of string settings. +#[derive(Debug, Clone)] +#[allow(clippy::large_enum_variant)] +pub enum TableFormatOptions { + Csv { + options: CsvOptions, + extensions: Extensions, + }, + #[cfg(feature = "parquet")] + Parquet { + options: TableParquetOptions, + extensions: Extensions, + }, + Json { + options: JsonOptions, + extensions: Extensions, + }, +} + +impl ConfigField for TableFormatOptions { + /// Visits configuration settings for the current file format, or all formats if none is selected. + /// + /// This method adapts the behavior based on whether a file format is currently selected in `current_format`. + /// If a format is selected, it visits only the settings relevant to that format. Otherwise, + /// it visits all available format settings. + fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) { + match self { + #[cfg(feature = "parquet")] + TableFormatOptions::Parquet { options, .. } => { + options.visit(v, "format", ""); + } + TableFormatOptions::Csv { options, .. } => { + options.visit(v, "format", ""); + } + TableFormatOptions::Json { options, .. } => { + options.visit(v, "format", ""); + } + } + } + + /// Sets a configuration value for a specific key within `TableOptions`. Review Comment: this is in `TableFormatOptions` I think ########## datafusion/common/src/config.rs: ########## @@ -1612,42 +1623,241 @@ impl TableOptions { }; e.0.set(key, value) } +} - /// Initializes a new `TableOptions` from a hash map of string settings. +#[derive(Debug, Clone)] +#[allow(clippy::large_enum_variant)] +pub enum TableFormatOptions { + Csv { + options: CsvOptions, + extensions: Extensions, + }, + #[cfg(feature = "parquet")] + Parquet { + options: TableParquetOptions, + extensions: Extensions, + }, + Json { + options: JsonOptions, + extensions: Extensions, + }, +} + +impl ConfigField for TableFormatOptions { + /// Visits configuration settings for the current file format, or all formats if none is selected. + /// + /// This method adapts the behavior based on whether a file format is currently selected in `current_format`. + /// If a format is selected, it visits only the settings relevant to that format. Otherwise, + /// it visits all available format settings. + fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) { + match self { + #[cfg(feature = "parquet")] + TableFormatOptions::Parquet { options, .. } => { + options.visit(v, "format", ""); + } + TableFormatOptions::Csv { options, .. } => { + options.visit(v, "format", ""); + } + TableFormatOptions::Json { options, .. } => { + options.visit(v, "format", ""); + } + } + } + + /// Sets a configuration value for a specific key within `TableOptions`. + /// + /// This method delegates setting configuration values to the specific file format configurations, + /// based on the current format selected. If no format is selected, it returns an error. /// /// # Parameters /// - /// * `settings`: A hash map where each key-value pair represents a configuration setting. + /// * `key`: The configuration key specifying which setting to adjust, prefixed with the format (e.g., "format.delimiter") + /// for CSV format. + /// * `value`: The value to set for the specified configuration key. /// /// # Returns /// - /// A result containing the new `TableOptions` instance or an error if any setting could not be applied. - pub fn from_string_hash_map(settings: &HashMap<String, String>) -> Result<Self> { - let mut ret = Self::default(); - for (k, v) in settings { - ret.set(k, v)?; + /// A result indicating success or an error if the key is not recognized, if a format is not specified, + /// or if setting the configuration value fails for the specific format. + fn set(&mut self, key: &str, value: &str) -> Result<()> { + // Extensions are handled in the public `ConfigOptions::set` + let (key, rem) = key.split_once('.').unwrap_or((key, "")); + match key { + "format" => match self { + #[cfg(feature = "parquet")] + Self::Parquet { options, .. } => options.set(rem, value), + Self::Csv { options, .. } => options.set(rem, value), + Self::Json { options, .. } => options.set(rem, value), + }, + _ => _config_err!("Config value \"{key}\" not found on TableOptions"), } + } +} - Ok(ret) +impl TableFormatOptions { + /// Constructs a new instance of `TableOptions` with JSON file type and default settings. + /// + /// # Returns + /// + /// A new `TableOptions` instance with default configuration values. + pub fn new_json() -> Self { + Self::Json { + options: Default::default(), + extensions: Default::default(), + } } - /// Modifies the current `TableOptions` instance with settings from a hash map. + /// Constructs a new instance of `TableOptions` with CSV file type and default settings. + /// + /// # Returns + /// + /// A new `TableOptions` instance with default configuration values. + pub fn new_csv() -> Self { + Self::Csv { + options: Default::default(), + extensions: Default::default(), + } + } + + /// Constructs a new instance of `TableOptions` with Parquet file type and default settings. + /// + /// # Returns + /// + /// A new `TableOptions` instance with default configuration values. + #[cfg(feature = "parquet")] + pub fn new_parquet() -> Self { + Self::Parquet { + options: Default::default(), + extensions: Default::default(), + } + } + + pub fn csv_options_or_default(&self) -> CsvOptions { Review Comment: Likewise, these new public methods should have documentation ########## datafusion/datasource/src/file_format.rs: ########## @@ -120,7 +121,26 @@ pub trait FileFormatFactory: Sync + Send + GetExt + fmt::Debug { &self, state: &dyn Session, format_options: &HashMap<String, String>, - ) -> Result<Arc<dyn FileFormat>>; + ) -> Result<Arc<dyn FileFormat>> { + let options = match self.options() { + (None, file_type) => { + let mut table_options = state.file_table_options(file_type); + table_options.alter_with_string_hash_map(format_options)?; + table_options.output_format() + } + (Some(options), _) => { + let mut table_options = TableFormatOptions::from_output_format(options); + table_options.alter_with_string_hash_map(format_options)?; + table_options.output_format() + } + }; + + Ok(self.default_from_output_format(options)) + } + + fn default_from_output_format(&self, options: OutputFormat) -> Arc<dyn FileFormat>; + + fn options(&self) -> (Option<OutputFormat>, ConfigFileType); Review Comment: I find it strange that a FileFormatFactory returns an `Option<OutputFormat>` and then another method takes the output format as input -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org