devinjdangelo commented on code in PR #7390:
URL: https://github.com/apache/arrow-datafusion/pull/7390#discussion_r1303534238
##########
datafusion/core/src/datasource/listing/table.rs:
##########
@@ -1828,7 +1862,7 @@ mod tests {
)
.await
.expect_err("Example should fail!");
- assert_eq!("Error during planning: zstd compression requires
specifying a level such as zstd(4)", format!("{e}"));
+ assert_eq!("Invalid Option: zstd compression requires specifying a
level such as zstd(4)", format!("{e}"));
Review Comment:
This example I think shows why I wanted to create a new error type. "Error
during planning" I think would be more confusing to an end user than just a
straightforward "Invalid Option" error.
##########
datafusion/common/src/file_options/mod.rs:
##########
@@ -0,0 +1,269 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Options related to how files should be written
+
+pub mod arrow_writer;
+pub mod avro_writer;
+pub mod csv_writer;
+pub mod file_type;
+pub mod json_writer;
+pub mod parquet_writer;
+pub(crate) mod parse_utils;
+
+use std::{collections::HashMap, path::Path, str::FromStr};
+
+use crate::{
+ config::ConfigOptions, file_options::parse_utils::parse_boolean_string,
+ DataFusionError, FileType, Result,
+};
+
+use self::{
+ arrow_writer::ArrowWriterOptions, avro_writer::AvroWriterOptions,
+ csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions,
+ parquet_writer::ParquetWriterOptions,
+};
+
+/// Represents a single arbitrary setting in a
+/// [StatementOptions] where OptionTuple.0 determines
+/// the specific setting to be modified and OptionTuple.1
+/// determines the value which should be applied
+pub type OptionTuple = (String, String);
+
+/// Represents arbitrary tuples of options passed as String
+/// tuples from SQL statements. As in the following statement:
+/// COPY ... TO ... (setting1 value1, setting2 value2, ...)
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub struct StatementOptions {
+ options: Vec<OptionTuple>,
+}
+
+/// Useful for conversion from external tables which use Hashmap<String,
String>
+impl From<&HashMap<String, String>> for StatementOptions {
+ fn from(value: &HashMap<String, String>) -> Self {
+ Self {
+ options: value
+ .iter()
+ .map(|(k, v)| (k.to_owned(), v.to_owned()))
+ .collect::<Vec<OptionTuple>>(),
+ }
+ }
+}
+
+impl StatementOptions {
+ pub fn new(options: Vec<OptionTuple>) -> Self {
+ Self { options }
+ }
+
+ pub fn into_inner(self) -> Vec<OptionTuple> {
+ self.options
+ }
+
+ /// Scans for option and if it exists removes it and attempts to parse as
a boolean
+ /// Returns none if it does not exist.
+ pub fn get_bool_option(&mut self, find: &str) -> Result<Option<bool>> {
+ let maybe_option = self.scan_and_remove_option(find);
+ maybe_option
+ .map(|(_, v)| parse_boolean_string(find, v))
+ .transpose()
+ }
+
+ /// Scans for option and if it exists removes it and returns it
+ /// Returns none if it does not exist
+ pub fn get_str_option(&mut self, find: &str) -> Option<String> {
+ let maybe_option = self.scan_and_remove_option(find);
+ maybe_option.map(|(_, v)| v)
+ }
+
+ /// Infers the file_type given a target and arbitrary options.
+ /// If the options contain an explicit "format" option, that will be used.
+ /// Otherwise, attempt to infer file_type from the extension of target.
+ /// Finally, return an error if unable to determine the file_type
+ /// If found, format is removed from the options list.
+ pub fn try_infer_file_type(&mut self, target: &str) -> Result<FileType> {
+ let explicit_format = self.scan_and_remove_option("format");
+ let format = match explicit_format {
+ Some(s) => FileType::from_str(s.1.as_str()),
+ None => {
+ // try to infer file format from file extension
+ let extension: &str = &Path::new(target)
+ .extension()
+ .ok_or(DataFusionError::InvalidOption(
+ "Format not explicitly set and unable to get file
extension!"
+ .to_string(),
+ ))?
+ .to_str()
+ .ok_or(DataFusionError::InvalidOption(
+ "Format not explicitly set and failed to parse file
extension!"
+ .to_string(),
+ ))?
+ .to_lowercase();
+
+ FileType::from_str(extension)
+ }
+ }?;
+
+ Ok(format)
+ }
+
+ /// Finds an option in StatementOptions if exists, removes and returns it
+ /// along with the vec of remaining options.
+ fn scan_and_remove_option(&mut self, find: &str) -> Option<OptionTuple> {
+ let idx = self
+ .options
+ .iter()
+ .position(|(k, _)| k.to_lowercase() == find.to_lowercase());
+ match idx {
+ Some(i) => Some(self.options.swap_remove(i)),
+ None => None,
+ }
+ }
+}
+
+/// This type contains all options needed to initialize a particular
+/// RecordBatchWriter type. Each element in the enum contains a thin wrapper
+/// around a "writer builder" type (e.g. arrow::csv::WriterBuilder)
+/// plus any DataFusion specific writing options (e.g. CSV compression)
+#[derive(Clone, Debug)]
+pub enum FileTypeWriterOptions {
Review Comment:
This enum was proposed by @alamb in #6569 and is the key new abstraction in
this PR.
##########
datafusion/common/src/error.rs:
##########
@@ -73,6 +73,9 @@ pub enum DataFusionError {
/// This error happens whenever a plan is not valid. Examples include
/// impossible casts.
Plan(String),
+ /// This error happens when an invalid or unsupported option is passed
+ /// in a SQL statement
+ InvalidOption(String),
Review Comment:
If this causes any issues with the ongoing work around DataFusionError, I'm
happy to roll it back. I created this because I didn't feel any existing error
type seemed to make sense for an invalid option.
##########
datafusion/sqllogictest/test_files/copy.slt:
##########
@@ -139,20 +160,11 @@ select * from validate_single_json;
1 Foo
2 Bar
+# Error cases:
+
# Copy from table with options
-query IT
+query error DataFusion error: Invalid Option: Found unsupported option
row_group_size with value 55 for JSON format!
Review Comment:
As discussed in #6569, this query now throws an error since row_group_size
does not make sense in the context of writing JSON files.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]