Yaohua628 commented on code in PR #38113:
URL: https://github.com/apache/spark/pull/38113#discussion_r989527909


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala:
##########
@@ -327,3 +329,45 @@ class CSVOptions(
     settings
   }
 }
+
+object CSVOptions extends FileSourceOptionsSet {
+  val HEADER = newOption("header")
+  val INFER_SCHEMA = newOption("inferSchema")
+  val IGNORE_LEADING_WHITESPACE = newOption("ignoreLeadingWhiteSpace")
+  val IGNORE_TRAILING_WHITESPACE = newOption("ignoreTrailingWhiteSpace")
+  val PREFERS_DATE = newOption("prefersDate")
+  val ESCAPE_QUOTES = newOption("escapeQuotes")
+  val QUOTE_ALL = newOption("quoteAll")
+  val ENFORCE_SCHEMA = newOption("enforceSchema")
+  val QUOTE = newOption("quote")
+  val ESCAPE = newOption("escape")
+  val COMMENT = newOption("comment")
+  val MAX_COLUMNS = newOption("maxColumns")
+  val MAX_CHARS_PER_COLUMN = newOption("maxCharsPerColumn")
+  val MODE = newOption("mode")
+  val CHAR_TO_ESCAPE_QUOTE_ESCAPING = newOption("charToEscapeQuoteEscaping")
+  val LOCALE = newOption("locale")
+  val DATE_FORMAT = newOption("dateFormat")
+  val TIMESTAMP_FORMAT = newOption("timestampFormat")
+  val TIMESTAMP_NTZ_FORMAT = newOption("timestampNTZFormat")
+  val ENABLE_DATETIME_PARSING_FALLBACK = 
newOption("enableDateTimeParsingFallback")
+  val MULTI_LINE = newOption("multiLine")
+  val SAMPLING_RATIO = newOption("samplingRatio")
+  val EMPTY_VALUE = newOption("emptynewOption")
+  val LINE_SEP = newOption("lineSep")
+  val INPUT_BUFFER_SIZE = newOption("inputBufferSize")
+  val COLUMN_NAME_OF_CORRUPT_RECORD = newOption("columnNameOfCorruptRecord")
+  val NULL_VALUE = newOption("nullnewOption")
+  val NAN_VALUE = newOption("nannewOption")
+  val POSITIVE_INF = newOption("positiveInf")
+  val NEGATIVE_INF = newOption("negativeInf")
+  val TIME_ZONE = newOption("timeZone")
+  val UNESCAPED_QUOTE_HANDLING = newOption("unescapedQuoteHandling")
+  // Options with alternative
+  val ENCODING = newOption("encoding", Some("charset"))
+  val CHARSET = newOption("charset", Some("encoding"))
+  val CODEC = newOption("codec", Some("compression"))
+  val COMPRESSION = newOption("compression", Some("codec"))
+  val SEP = newOption("sep", Some("delimiter"))
+  val DELIMITER = newOption("delimiter", Some("sep"))

Review Comment:
   I feel like this will be very error-prone — e.g., forgetting to register the
alternative name, etc.
   We should probably just register each option once, and handle the
alternatives properly in the trait.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala:
##########
@@ -327,3 +329,45 @@ class CSVOptions(
     settings
   }
 }
+
+object CSVOptions extends FileSourceOptionsSet {
+  val HEADER = newOption("header")
+  val INFER_SCHEMA = newOption("inferSchema")
+  val IGNORE_LEADING_WHITESPACE = newOption("ignoreLeadingWhiteSpace")
+  val IGNORE_TRAILING_WHITESPACE = newOption("ignoreTrailingWhiteSpace")
+  val PREFERS_DATE = newOption("prefersDate")
+  val ESCAPE_QUOTES = newOption("escapeQuotes")
+  val QUOTE_ALL = newOption("quoteAll")
+  val ENFORCE_SCHEMA = newOption("enforceSchema")
+  val QUOTE = newOption("quote")
+  val ESCAPE = newOption("escape")
+  val COMMENT = newOption("comment")
+  val MAX_COLUMNS = newOption("maxColumns")
+  val MAX_CHARS_PER_COLUMN = newOption("maxCharsPerColumn")
+  val MODE = newOption("mode")
+  val CHAR_TO_ESCAPE_QUOTE_ESCAPING = newOption("charToEscapeQuoteEscaping")
+  val LOCALE = newOption("locale")
+  val DATE_FORMAT = newOption("dateFormat")
+  val TIMESTAMP_FORMAT = newOption("timestampFormat")
+  val TIMESTAMP_NTZ_FORMAT = newOption("timestampNTZFormat")
+  val ENABLE_DATETIME_PARSING_FALLBACK = 
newOption("enableDateTimeParsingFallback")
+  val MULTI_LINE = newOption("multiLine")
+  val SAMPLING_RATIO = newOption("samplingRatio")
+  val EMPTY_VALUE = newOption("emptynewOption")
+  val LINE_SEP = newOption("lineSep")
+  val INPUT_BUFFER_SIZE = newOption("inputBufferSize")
+  val COLUMN_NAME_OF_CORRUPT_RECORD = newOption("columnNameOfCorruptRecord")
+  val NULL_VALUE = newOption("nullnewOption")

Review Comment:
   nit: `nullValue`



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala:
##########
@@ -40,3 +40,42 @@ object FileSourceOptions {
   val IGNORE_CORRUPT_FILES = "ignoreCorruptFiles"
   val IGNORE_MISSING_FILES = "ignoreMissingFiles"
 }
+
+/**
+ * Interface defines for a file-based data source, how to
+ *  - register a new option name
+ *  - retrieve all registered option names
+ *  - valid a given option name
+ *  - get alternative option name if any
+ */
+trait FileSourceOptionsSet {
+  private val validOptions = collection.mutable.Map[String, Option[String]]()
+
+  /**
+   * Register a new Option. If two options are alternative to each other, each 
of them needs to be
+   * registered individually
+   * @param name The primary option name
+   * @param alternative Alternative option name if any
+   */
+  protected def newOption(name: String, alternative: Option[String] = None): 
String = {
+    validOptions += (name -> alternative)
+    name
+  }
+
+  /**
+   * @return All valid file source options
+   */
+  def getAllValidOptionNames: scala.collection.Set[String] = 
validOptions.keySet
+
+  /**
+   * @param name Option name to be validated
+   * @return if the given Option name is valid
+   */
+  def isValidOptionName(name: String): Boolean = validOptions.contains(name)

Review Comment:
   What about the `alternative` name? That's also a valid option, right? The
same applies to `getAllValidOptionNames` above.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala:
##########
@@ -40,3 +40,42 @@ object FileSourceOptions {
   val IGNORE_CORRUPT_FILES = "ignoreCorruptFiles"
   val IGNORE_MISSING_FILES = "ignoreMissingFiles"
 }
+
+/**
+ * Interface defines for a file-based data source, how to
+ *  - register a new option name
+ *  - retrieve all registered option names
+ *  - valid a given option name
+ *  - get alternative option name if any
+ */
+trait FileSourceOptionsSet {
+  private val validOptions = collection.mutable.Map[String, Option[String]]()
+
+  /**
+   * Register a new Option. If two options are alternative to each other, each 
of them needs to be
+   * registered individually
+   * @param name The primary option name
+   * @param alternative Alternative option name if any
+   */
+  protected def newOption(name: String, alternative: Option[String] = None): 
String = {
+    validOptions += (name -> alternative)
+    name
+  }
+
+  /**
+   * @return All valid file source options
+   */
+  def getAllValidOptionNames: scala.collection.Set[String] = 
validOptions.keySet
+
+  /**
+   * @param name Option name to be validated
+   * @return if the given Option name is valid
+   */
+  def isValidOptionName(name: String): Boolean = validOptions.contains(name)
+
+  /**
+   * @param name Option name
+   * @return Alternative option name if any
+   */
+  def getAlternativeOptionName(name: String): Option[String] = 
validOptions.getOrElse(name, None)

Review Comment:
   I am wondering what the intended usage of this public method would be.



##########
connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala:
##########
@@ -139,11 +141,17 @@ private[sql] object AvroOptions {
     new AvroOptions(CaseInsensitiveMap(parameters), hadoopConf)
   }
 
-  val ignoreExtensionKey = "ignoreExtension"
-
+  val IGNORE_EXTENSION_KEY = newOption("ignoreExtension")

Review Comment:
   nit: should we change it to `IGNORE_EXTENSION`, just to be consistent with
the other options?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala:
##########
@@ -327,3 +329,45 @@ class CSVOptions(
     settings
   }
 }
+
+object CSVOptions extends FileSourceOptionsSet {
+  val HEADER = newOption("header")
+  val INFER_SCHEMA = newOption("inferSchema")
+  val IGNORE_LEADING_WHITESPACE = newOption("ignoreLeadingWhiteSpace")
+  val IGNORE_TRAILING_WHITESPACE = newOption("ignoreTrailingWhiteSpace")
+  val PREFERS_DATE = newOption("prefersDate")
+  val ESCAPE_QUOTES = newOption("escapeQuotes")
+  val QUOTE_ALL = newOption("quoteAll")
+  val ENFORCE_SCHEMA = newOption("enforceSchema")
+  val QUOTE = newOption("quote")
+  val ESCAPE = newOption("escape")
+  val COMMENT = newOption("comment")
+  val MAX_COLUMNS = newOption("maxColumns")
+  val MAX_CHARS_PER_COLUMN = newOption("maxCharsPerColumn")
+  val MODE = newOption("mode")
+  val CHAR_TO_ESCAPE_QUOTE_ESCAPING = newOption("charToEscapeQuoteEscaping")
+  val LOCALE = newOption("locale")
+  val DATE_FORMAT = newOption("dateFormat")
+  val TIMESTAMP_FORMAT = newOption("timestampFormat")
+  val TIMESTAMP_NTZ_FORMAT = newOption("timestampNTZFormat")
+  val ENABLE_DATETIME_PARSING_FALLBACK = 
newOption("enableDateTimeParsingFallback")
+  val MULTI_LINE = newOption("multiLine")
+  val SAMPLING_RATIO = newOption("samplingRatio")
+  val EMPTY_VALUE = newOption("emptynewOption")
+  val LINE_SEP = newOption("lineSep")
+  val INPUT_BUFFER_SIZE = newOption("inputBufferSize")
+  val COLUMN_NAME_OF_CORRUPT_RECORD = newOption("columnNameOfCorruptRecord")
+  val NULL_VALUE = newOption("nullnewOption")
+  val NAN_VALUE = newOption("nannewOption")

Review Comment:
   nit: `nanValue`



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala:
##########
@@ -3080,6 +3081,23 @@ abstract class CSVSuite
       }
     }
   }
+
+  test("SPARK-40667: check the number of valid CSV option names") {
+    assert(CSVOptions.getAllValidOptionNames.size == 38)
+  }
+
+  test("SPARK-40667: validate a given option name") {
+    assert(CSVOptions.isValidOptionName("inferSchema"))
+    assert(CSVOptions.isValidOptionName("prefersDate"))
+    assert(!CSVOptions.isValidOptionName("inferSchemas"))
+    assert(!CSVOptions.isValidOptionName("randomName"))

Review Comment:
   maybe add tests for alternatives:
   ```
   assert(CSVOptions.isValidOptionName("sep"))
   assert(CSVOptions.isValidOptionName("delimiter"))
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to